[mlpack-git] master: Add DatasetInfo class for categorical features. (3ee916e)

gitdub at big.cc.gt.atl.ga.us gitdub at big.cc.gt.atl.ga.us
Wed Dec 23 11:41:34 EST 2015


Repository : https://github.com/mlpack/mlpack

On branch  : master
Link       : https://github.com/mlpack/mlpack/compare/de9cc4b05069e1fa4793d9355f2f595af5ff45d2...6070527af14296cd99739de6c62666cc5d2a2125

>---------------------------------------------------------------

commit 3ee916ed8018b850ebb3ccf07f83b0dceb12933f
Author: ryan <ryan at ratml.org>
Date:   Wed Sep 9 10:19:06 2015 -0400

    Add DatasetInfo class for categorical features.


>---------------------------------------------------------------

3ee916ed8018b850ebb3ccf07f83b0dceb12933f
 src/mlpack/core/data/dataset_info.hpp      | 65 +++++++++++++++++++++++++
 src/mlpack/core/data/dataset_info_impl.hpp | 76 ++++++++++++++++++++++++++++++
 2 files changed, 141 insertions(+)

diff --git a/src/mlpack/core/data/dataset_info.hpp b/src/mlpack/core/data/dataset_info.hpp
new file mode 100644
index 0000000..77aeb36
--- /dev/null
+++ b/src/mlpack/core/data/dataset_info.hpp
@@ -0,0 +1,65 @@
+/**
+ * @file dataset_info.hpp
+ * @author Ryan Curtin
+ *
+ * Defines the DatasetInfo class, which holds information about a dataset.  This
+ * is useful when the dataset contains categorical non-numeric features that
+ * need to be mapped to categorical numeric features.
+ */
+#ifndef __MLPACK_CORE_DATA_DATASET_INFO_HPP
+#define __MLPACK_CORE_DATA_DATASET_INFO_HPP
+
+#include <mlpack/core.hpp>
+#include <unordered_map>
+#include <boost/bimap.hpp>
+
+namespace mlpack {
+namespace data {
+
+/**
+ * Auxiliary information describing the dimensions of a dataset.  For every
+ * dimension that holds categorical data, a DatasetInfo keeps a two-way mapping
+ * between the original strings and the unsigned integers used to encode them;
+ * a dimension for which no mapping has been stored is reported as
+ * Datatype::NUMERIC, and one with a mapping as Datatype::CATEGORICAL.
+ * DatasetInfo objects are optionally produced by data::Load().
+ */
+class DatasetInfo
+{
+ public:
+  //! The kind of data held by a single dimension of the dataset.
+  enum Datatype
+  {
+    NUMERIC = 0,
+    CATEGORICAL = 1
+  };
+
+  //! Create a DatasetInfo object that holds no mappings.
+  DatasetInfo();
+
+  /**
+   * Look up the numeric value that the given string is mapped to in the given
+   * dimension.  When the string has not been seen before in that dimension, a
+   * new mapping is created for it and the newly assigned value is returned.
+   * The dimension parameter is the index of the dimension the string belongs
+   * to (i.e. the row in the dataset).
+   *
+   * @param string String to find/create mapping for.
+   * @param dimension Index of the dimension of the string.
+   * @return Numeric value associated with the string in that dimension.
+   */
+  size_t MapString(const std::string& string, const size_t dimension);
+
+  /**
+   * Look up the string that is mapped to the given numeric value in the given
+   * dimension.
+   *
+   * @param value Numeric value to find the string for.
+   * @param dimension Index of the dimension to search.
+   * @return The string mapped to the given value.
+   * @throws std::invalid_argument if the value has no mapping in the given
+   *     dimension.
+   */
+  const std::string& UnmapString(const size_t value, const size_t dimension);
+
+  /**
+   * Get the type of the given dimension.
+   *
+   * @param dimension Index of the dimension to query.
+   * @return Datatype::CATEGORICAL if mappings are stored for the dimension,
+   *     Datatype::NUMERIC otherwise.
+   */
+  Datatype Type(const size_t dimension) const;
+
+  /**
+   * Get the number of mappings stored for the given dimension.
+   *
+   * @param dimension Index of the dimension to query.
+   * @return Number of mappings, or 0 if the dimension has no mappings.
+   */
+  size_t NumMappings(const size_t dimension);
+
+ private:
+  //! Two-way mapping between strings and their numeric values.
+  typedef boost::bimap<std::string, size_t> BiMapType;
+  //! The mappings of one dimension, paired with the count of those mappings.
+  typedef std::pair<BiMapType, size_t> MapType;
+
+  //! Mappings for each dimension, keyed by dimension index.  Entries will only
+  //! exist for dimensions that are categorical.
+  std::unordered_map<size_t, MapType> maps;
+};
+
+} // namespace data
+} // namespace mlpack
+
+#include "dataset_info_impl.hpp"
+
+#endif
diff --git a/src/mlpack/core/data/dataset_info_impl.hpp b/src/mlpack/core/data/dataset_info_impl.hpp
new file mode 100644
index 0000000..bd389cc
--- /dev/null
+++ b/src/mlpack/core/data/dataset_info_impl.hpp
@@ -0,0 +1,76 @@
+/**
+ * @file dataset_info_impl.hpp
+ * @author Ryan Curtin
+ *
+ * An implementation of the DatasetInfo class.
+ */
+#ifndef __MLPACK_CORE_DATA_DATASET_INFO_IMPL_HPP
+#define __MLPACK_CORE_DATA_DATASET_INFO_IMPL_HPP
+
+// In case it hasn't already been included.
+#include "dataset_info.hpp"
+
+namespace mlpack {
+namespace data {
+
+// Construct a DatasetInfo that holds no mappings.  There is nothing to
+// initialize by hand, so the compiler-generated body is sufficient; the
+// members are simply default-constructed.
+inline DatasetInfo::DatasetInfo() = default;
+
+// Map the string to a numeric id, creating a new id for the string if it has
+// not yet been seen in the given dimension.
+inline size_t DatasetInfo::MapString(const std::string& string,
+                                     const size_t dimension)
+{
+  // Fetch the entry for this dimension once.  If the dimension has no entry
+  // yet, operator[] creates one holding an empty bimap and a counter of 0,
+  // which is exactly the state needed before inserting its first string.
+  auto& entry = maps[dimension];
+
+  // If the string is already known in this dimension, return its existing id.
+  if (entry.first.left.count(string) != 0)
+    return entry.first.left.at(string);
+
+  // Otherwise the string is new: give it the next unused id, which is the
+  // current number of mappings, and then bump that count.
+  typedef boost::bimap<std::string, size_t>::value_type PairType;
+  entry.first.insert(PairType(string, entry.second));
+  return entry.second++;
+}
+
+// Return the string corresponding to a value in a given dimension.  Throws
+// std::invalid_argument if the dimension has no mappings at all, or if the
+// value is not mapped in that dimension.
+inline const std::string& DatasetInfo::UnmapString(
+    const size_t value,
+    const size_t dimension)
+{
+  // Look the dimension up with find() and not operator[]: operator[] would
+  // insert an empty entry for a dimension that has no mappings, and from then
+  // on Type() would report that dimension as categorical even though nothing
+  // was ever mapped in it.
+  const auto dimIt = maps.find(dimension);
+
+  // Throw an exception if the value doesn't exist.
+  if (dimIt == maps.end() || dimIt->second.first.right.count(value) == 0)
+  {
+    std::ostringstream oss;
+    oss << "DatasetInfo::UnmapString(): value '" << value << "' unknown for "
+        << "dimension " << dimension;
+    throw std::invalid_argument(oss.str());
+  }
+
+  return dimIt->second.first.right.at(value);
+}
+
+// Report whether a particular dimension is numeric or categorical.
+inline DatasetInfo::Datatype DatasetInfo::Type(const size_t dimension) const
+{
+  // A dimension counts as categorical exactly when an entry for it exists.
+  if (maps.find(dimension) != maps.end())
+    return Datatype::CATEGORICAL;
+
+  return Datatype::NUMERIC;
+}
+
+// Get the number of mappings stored for a particular dimension; a dimension
+// without any entry has zero mappings.
+inline size_t DatasetInfo::NumMappings(const size_t dimension)
+{
+  const auto entry = maps.find(dimension);
+  if (entry == maps.end())
+    return 0;
+
+  // The second member of the entry is the count of mappings.
+  return entry->second.second;
+}
+
+} // namespace data
+} // namespace mlpack
+
+#endif



More information about the mlpack-git mailing list