[mlpack-git] master: Add DatasetInfo class for categorical features. (3ee916e)
gitdub at big.cc.gt.atl.ga.us
gitdub at big.cc.gt.atl.ga.us
Wed Dec 23 11:41:34 EST 2015
Repository : https://github.com/mlpack/mlpack
On branch : master
Link : https://github.com/mlpack/mlpack/compare/de9cc4b05069e1fa4793d9355f2f595af5ff45d2...6070527af14296cd99739de6c62666cc5d2a2125
>---------------------------------------------------------------
commit 3ee916ed8018b850ebb3ccf07f83b0dceb12933f
Author: ryan <ryan at ratml.org>
Date: Wed Sep 9 10:19:06 2015 -0400
Add DatasetInfo class for categorical features.
>---------------------------------------------------------------
3ee916ed8018b850ebb3ccf07f83b0dceb12933f
src/mlpack/core/data/dataset_info.hpp | 65 +++++++++++++++++++++++++
src/mlpack/core/data/dataset_info_impl.hpp | 76 ++++++++++++++++++++++++++++++
2 files changed, 141 insertions(+)
diff --git a/src/mlpack/core/data/dataset_info.hpp b/src/mlpack/core/data/dataset_info.hpp
new file mode 100644
index 0000000..77aeb36
--- /dev/null
+++ b/src/mlpack/core/data/dataset_info.hpp
@@ -0,0 +1,65 @@
+/**
+ * @file dataset_info.hpp
+ * @author Ryan Curtin
+ *
+ * Defines the DatasetInfo class, which holds information about a dataset. This
+ * is useful when the dataset contains categorical non-numeric features that
+ * need to be mapped to categorical numeric features.
+ */
+#ifndef __MLPACK_CORE_DATA_DATASET_INFO_HPP
+#define __MLPACK_CORE_DATA_DATASET_INFO_HPP
+
+#include <mlpack/core.hpp>
+#include <unordered_map>
+#include <boost/bimap.hpp>
+
+namespace mlpack {
+namespace data {
+
+/**
+ * Auxiliary information for a dataset, including mappings to/from strings and
+ * the datatype of each dimension. DatasetInfo objects are optionally produced
+ * by data::Load(), and store the type of each dimension (Datatype::NUMERIC or
+ * Datatype::CATEGORICAL) as well as mappings from strings to unsigned integers
+ * and vice versa.
+ */
+class DatasetInfo
+{
+ public:
+ /**
+ * The type of a dimension: NUMERIC, or CATEGORICAL for dimensions whose
+ * string values are mapped to unsigned integers.
+ */
+ enum Datatype
+ {
+ NUMERIC = 0,
+ CATEGORICAL = 1
+ };
+
+ /**
+ * Create a DatasetInfo object that holds no mappings.
+ */
+ DatasetInfo();
+
+ /**
+ * Given the string and the dimension to which it belongs, return its numeric
+ * mapping. If no mapping yet exists, the string is added to the list of
+ * mappings for the given dimension. The dimension parameter refers to the
+ * index of the dimension of the string (i.e. the row in the dataset).
+ *
+ * @param string String to find/create mapping for.
+ * @param dimension Index of the dimension of the string.
+ * @return The numeric value mapped to the string; new strings in a dimension
+ * receive consecutive values starting at 0.
+ */
+ size_t MapString(const std::string& string, const size_t dimension);
+
+ /**
+ * Return the string that the given numeric value maps to in the given
+ * dimension. A std::invalid_argument exception is thrown if the value has no
+ * mapping in that dimension.
+ *
+ * @param value Numeric value to look up.
+ * @param dimension Index of the dimension of the value.
+ * @return The string associated with the value.
+ */
+ const std::string& UnmapString(const size_t value, const size_t dimension);
+
+ /**
+ * Return the type of the given dimension: Datatype::CATEGORICAL if a map
+ * entry exists for the dimension, and Datatype::NUMERIC otherwise.
+ *
+ * @param dimension Index of the dimension to query.
+ */
+ Datatype Type(const size_t dimension) const;
+
+ /**
+ * Return the number of mappings stored for the given dimension, or 0 if the
+ * dimension has no map entry.
+ *
+ * @param dimension Index of the dimension to query.
+ */
+ size_t NumMappings(const size_t dimension);
+
+ private:
+ // Map entries will only exist for dimensions that are categorical. Each
+ // entry is keyed by dimension index and holds the two-way string <-> value
+ // mapping for that dimension, paired with a counter of the mappings created
+ // so far (which is also the next value to be assigned).
+ std::unordered_map<size_t, std::pair<boost::bimap<std::string, size_t>,
+ size_t>> maps;
+};
+
+} // namespace data
+} // namespace mlpack
+
+#include "dataset_info_impl.hpp"
+
+#endif
diff --git a/src/mlpack/core/data/dataset_info_impl.hpp b/src/mlpack/core/data/dataset_info_impl.hpp
new file mode 100644
index 0000000..bd389cc
--- /dev/null
+++ b/src/mlpack/core/data/dataset_info_impl.hpp
@@ -0,0 +1,76 @@
+/**
+ * @file dataset_info_impl.hpp
+ * @author Ryan Curtin
+ *
+ * An implementation of the DatasetInfo class.
+ */
+#ifndef __MLPACK_CORE_DATA_DATASET_INFO_IMPL_HPP
+#define __MLPACK_CORE_DATA_DATASET_INFO_IMPL_HPP
+
+// In case it hasn't already been included.
+#include "dataset_info.hpp"
+
+namespace mlpack {
+namespace data {
+
+// A freshly constructed DatasetInfo holds no mappings for any dimension.
+inline DatasetInfo::DatasetInfo() : maps()
+{
+  // The (empty) map is all the state there is.
+}
+
+// Look up the numeric id of a string in a dimension, creating the id if the
+// string has not been seen in that dimension before.
+inline size_t DatasetInfo::MapString(const std::string& string,
+                                     const size_t dimension)
+{
+  // operator[] creates an empty mapping with a zeroed counter the first time a
+  // dimension is used, which is exactly the state the insertion path expects.
+  auto& entry = maps[dimension];
+
+  // Known string: hand back the id it was assigned earlier.
+  if (entry.first.left.count(string) != 0)
+    return entry.first.left.at(string);
+
+  // New string: it takes the next unused id for this dimension, and the
+  // counter is advanced only once the insertion has gone through.
+  typedef boost::bimap<std::string, size_t>::value_type PairType;
+  const size_t id = entry.second;
+  entry.first.insert(PairType(string, id));
+  ++entry.second;
+  return id;
+}
+
+// Return the string corresponding to a value in a given dimension.
+//
+// @param value Numeric value to look up.
+// @param dimension Index of the dimension the value belongs to.
+// @return Reference to the string stored in the mapping for that value.
+// @throws std::invalid_argument if the dimension has no mappings at all, or if
+//     the value is not mapped in that dimension.
+inline const std::string& DatasetInfo::UnmapString(
+    const size_t value,
+    const size_t dimension)
+{
+  // Use find() rather than operator[]: operator[] would silently create an
+  // empty entry for a dimension that has no mappings, which would make a
+  // numeric dimension look categorical after a failed lookup.
+  const auto dimIt = maps.find(dimension);
+
+  // Throw an exception if the value doesn't exist.
+  if (dimIt == maps.end() || dimIt->second.first.right.count(value) == 0)
+  {
+    std::ostringstream oss;
+    oss << "DatasetInfo::UnmapString(): value '" << value << "' unknown for "
+        << "dimension " << dimension;
+    throw std::invalid_argument(oss.str());
+  }
+
+  return dimIt->second.first.right.at(value);
+}
+
+// Report whether a dimension is numeric or categorical.
+inline DatasetInfo::Datatype DatasetInfo::Type(const size_t dimension) const
+{
+  // A dimension counts as categorical exactly when it has an entry in maps.
+  if (maps.find(dimension) != maps.end())
+    return Datatype::CATEGORICAL;
+
+  return Datatype::NUMERIC;
+}
+
+// Return the mapping count stored for a dimension, or 0 if the dimension has
+// no entry.
+inline size_t DatasetInfo::NumMappings(const size_t dimension)
+{
+  const auto entry = maps.find(dimension);
+  if (entry == maps.end())
+    return 0;
+
+  // The second half of the stored pair is the per-dimension counter.
+  return entry->second.second;
+}
+
+} // namespace data
+} // namespace mlpack
+
+#endif
More information about the mlpack-git
mailing list