[mlpack-git] master: add comments and use more proper names (63268a3)

gitdub at mlpack.org gitdub at mlpack.org
Mon Jul 25 12:19:16 EDT 2016


Repository : https://github.com/mlpack/mlpack
On branch  : master
Link       : https://github.com/mlpack/mlpack/compare/ecbfd24defe31d9f39708c0b4c6ad352cd46ed5c...7eec0609aa21cb12aeed3cbcaa1e411dad0359f2

>---------------------------------------------------------------

commit 63268a3f1cc1ace8143ae0e6f9e8d9aa81822fd2
Author: Keon Kim <kwk236 at gmail.com>
Date:   Fri Jul 8 03:52:03 2016 +0900

    add comments and use more proper names


>---------------------------------------------------------------

63268a3f1cc1ace8143ae0e6f9e8d9aa81822fd2
 src/mlpack/core/data/CMakeLists.txt                |  4 +-
 .../data/{dataset_info.hpp => dataset_mapper.hpp}  | 15 ++--
 ...taset_info_impl.hpp => dataset_mapper_impl.hpp} | 29 +++----
 .../data/imputation_methods/custom_imputation.hpp  | 20 ++++-
 .../data/imputation_methods/listwise_deletion.hpp  | 19 +++--
 .../data/imputation_methods/mean_imputation.hpp    | 18 +++--
 .../data/imputation_methods/median_imputation.hpp  | 19 +++--
 src/mlpack/core/data/imputer.hpp                   | 29 ++++---
 src/mlpack/core/data/load.hpp                      |  2 +-
 src/mlpack/core/data/map_policies/datatype.hpp     |  2 -
 .../core/data/map_policies/increment_policy.hpp    | 53 +++++++++---
 .../core/data/map_policies/missing_policy.hpp      | 77 ++++++++++++++----
 .../methods/preprocess/preprocess_imputer_main.cpp | 94 +++++++++++++---------
 src/mlpack/tests/imputation_test.cpp               |  2 +-
 14 files changed, 258 insertions(+), 125 deletions(-)

diff --git a/src/mlpack/core/data/CMakeLists.txt b/src/mlpack/core/data/CMakeLists.txt
index 65e4fc3..2fbd5d3 100644
--- a/src/mlpack/core/data/CMakeLists.txt
+++ b/src/mlpack/core/data/CMakeLists.txt
@@ -1,8 +1,8 @@
 # Define the files that we need to compile.
 # Anything not in this list will not be compiled into mlpack.
 set(SOURCES
-  dataset_info.hpp
-  dataset_info_impl.hpp
+  dataset_mapper.hpp
+  dataset_mapper_impl.hpp
   extension.hpp
   format.hpp
   load.hpp
diff --git a/src/mlpack/core/data/dataset_info.hpp b/src/mlpack/core/data/dataset_mapper.hpp
similarity index 94%
rename from src/mlpack/core/data/dataset_info.hpp
rename to src/mlpack/core/data/dataset_mapper.hpp
index c504540..ab9340c 100644
--- a/src/mlpack/core/data/dataset_info.hpp
+++ b/src/mlpack/core/data/dataset_mapper.hpp
@@ -1,6 +1,7 @@
 /**
- * @file dataset_info.hpp
+ * @file dataset_mapper.hpp
  * @author Ryan Curtin
+ * @author Keon Kim
  *
  * Defines the DatasetMapper class, which holds information about a dataset.
  * This is useful when the dataset contains categorical non-numeric features
@@ -53,7 +54,7 @@ class DatasetMapper
    * @param string String to find/create mapping for.
    * @param dimension Index of the dimension of the string.
    */
-  typename PolicyType::mapped_type MapString(const std::string& string,
+  typename PolicyType::MappedType MapString(const std::string& string,
                                             const size_t dimension);
 
   /**
@@ -75,13 +76,13 @@ class DatasetMapper
    * @param string Mapped string for value.
    * @param dimension Dimension to unmap string from.
    */
-  typename PolicyType::mapped_type UnmapValue(const std::string& string,
+  typename PolicyType::MappedType UnmapValue(const std::string& string,
                                             const size_t dimension);
 
   template <typename eT>
   void MapTokens(const std::vector<std::string>& tokens,
-                 size_t& row,
-                 arma::Mat<eT>& matrix);
+      size_t& row,
+      arma::Mat<eT>& matrix);
 
   //! Return the type of a given dimension (numeric or categorical).
   Datatype Type(const size_t dimension) const;
@@ -126,7 +127,7 @@ class DatasetMapper
   std::vector<Datatype> types;
 
   // BiMapType definition
-  using BiMapType = boost::bimap<std::string, typename PolicyType::mapped_type>;
+  using BiMapType = boost::bimap<std::string, typename PolicyType::MappedType>;
 
   // Mappings from strings to integers.
   // Map entries will only exist for dimensions that are categorical.
@@ -143,6 +144,6 @@ using DatasetInfo = DatasetMapper<data::IncrementPolicy>;
 } // namespace data
 } // namespace mlpack
 
-#include "dataset_info_impl.hpp"
+#include "dataset_mapper_impl.hpp"
 
 #endif
diff --git a/src/mlpack/core/data/dataset_info_impl.hpp b/src/mlpack/core/data/dataset_mapper_impl.hpp
similarity index 78%
rename from src/mlpack/core/data/dataset_info_impl.hpp
rename to src/mlpack/core/data/dataset_mapper_impl.hpp
index 6bb7d75..6b291e2 100644
--- a/src/mlpack/core/data/dataset_info_impl.hpp
+++ b/src/mlpack/core/data/dataset_mapper_impl.hpp
@@ -1,6 +1,7 @@
 /**
- * @file dataset_info_impl.hpp
+ * @file dataset_mapper_impl.hpp
  * @author Ryan Curtin
+ * @author Keon Kim
  *
  * An implementation of the DatasetMapper<PolicyType> class.
  */
@@ -8,7 +9,7 @@
 #define MLPACK_CORE_DATA_DATASET_INFO_IMPL_HPP
 
 // In case it hasn't already been included.
-#include "dataset_info.hpp"
+#include "dataset_mapper.hpp"
 
 namespace mlpack {
 namespace data {
@@ -23,7 +24,7 @@ inline DatasetMapper<PolicyType>::DatasetMapper(const size_t dimensionality) :
 
 template<typename PolicyType>
 inline DatasetMapper<PolicyType>::DatasetMapper(PolicyType& policy,
-                                               const size_t dimensionality) :
+    const size_t dimensionality) :
     types(dimensionality, Datatype::numeric),
     policy(std::move(policy))
 {
@@ -33,9 +34,9 @@ inline DatasetMapper<PolicyType>::DatasetMapper(PolicyType& policy,
 // When we want to insert value into the map,
 // we could use the policy to map the string
 template<typename PolicyType>
-inline typename PolicyType::mapped_type DatasetMapper<PolicyType>::MapString(
-                                                    const std::string& string,
-                                                    const size_t dimension)
+inline typename PolicyType::MappedType DatasetMapper<PolicyType>::MapString(
+    const std::string& string,
+    const size_t dimension)
 {
   return policy.template MapString<MapType>(string, dimension, maps, types);
 }
@@ -43,8 +44,8 @@ inline typename PolicyType::mapped_type DatasetMapper<PolicyType>::MapString(
 // Return the string corresponding to a value in a given dimension.
 template<typename PolicyType>
 inline const std::string& DatasetMapper<PolicyType>::UnmapString(
-                                                    const size_t value,
-                                                    const size_t dimension)
+    const size_t value,
+    const size_t dimension)
 {
   // Throw an exception if the value doesn't exist.
   if (maps[dimension].first.right.count(value) == 0)
@@ -60,9 +61,9 @@ inline const std::string& DatasetMapper<PolicyType>::UnmapString(
 
 // Return the value corresponding to a string in a given dimension.
 template<typename PolicyType>
-inline typename PolicyType::mapped_type DatasetMapper<PolicyType>::UnmapValue(
-                                                    const std::string& string,
-                                                    const size_t dimension)
+inline typename PolicyType::MappedType DatasetMapper<PolicyType>::UnmapValue(
+    const std::string& string,
+    const size_t dimension)
 {
   // Throw an exception if the value doesn't exist.
   if (maps[dimension].first.left.count(string) == 0)
@@ -79,9 +80,9 @@ inline typename PolicyType::mapped_type DatasetMapper<PolicyType>::UnmapValue(
 template<typename PolicyType>
 template<typename eT>
 inline void DatasetMapper<PolicyType>::MapTokens(
-                                        const std::vector<std::string>& tokens,
-                                        size_t& row,
-                                        arma::Mat<eT>& matrix)
+    const std::vector<std::string>& tokens,
+    size_t& row,
+    arma::Mat<eT>& matrix)
 {
   return policy.template MapTokens<eT, MapType>(tokens, row, matrix, maps,
                                                 types);
diff --git a/src/mlpack/core/data/imputation_methods/custom_imputation.hpp b/src/mlpack/core/data/imputation_methods/custom_imputation.hpp
index 83bde9d..1698ba9 100644
--- a/src/mlpack/core/data/imputation_methods/custom_imputation.hpp
+++ b/src/mlpack/core/data/imputation_methods/custom_imputation.hpp
@@ -8,17 +8,29 @@
 #define MLPACK_CORE_DATA_IMPUTE_STRATEGIES_CUSTOM_IMPUTATION_HPP
 
 #include <mlpack/core.hpp>
-#include <cmath>
-
-using namespace std;
 
 namespace mlpack {
 namespace data {
-
+/**
+ * A simple custom imputation class
+ * @tparam T Type of armadillo matrix
+ */
 template <typename T>
 class CustomImputation
 {
  public:
+  /**
+   * Impute function searches through the input looking for mappedValue and
+   * replaces it with the user-defined custom value of the given dimension.
+   * The result is saved to the output.
+   *
+   * @param input Matrix that contains mappedValue.
+   * @param output Matrix that the result will be saved into.
+   * @param mappedValue Value that the user wants to get rid of.
+   * @param customValue Value that the user wants to replace mappedValue with.
+   * @param dimension Index of the dimension of the mappedValue.
+   * @param transpose State of whether the input matrix is transposed or not.
+   */
   void Impute(const arma::Mat<T>& input,
               arma::Mat<T>& output,
               const T& mappedValue,
diff --git a/src/mlpack/core/data/imputation_methods/listwise_deletion.hpp b/src/mlpack/core/data/imputation_methods/listwise_deletion.hpp
index 9f3a7d2..06db83a 100644
--- a/src/mlpack/core/data/imputation_methods/listwise_deletion.hpp
+++ b/src/mlpack/core/data/imputation_methods/listwise_deletion.hpp
@@ -8,21 +8,28 @@
 #define MLPACK_CORE_DATA_IMPUTE_STRATEGIES_LISTWISE_DELETION_HPP
 
 #include <mlpack/core.hpp>
-#include <cmath>
-
-using namespace std;
 
 namespace mlpack {
 namespace data {
-
 /**
- * complete-case analysis.
+ * A complete-case analysis to remove the values containing mappedValue.
  * Removes all data for a case that has one or more missing values.
+ * @tparam T Type of armadillo matrix
  */
 template <typename T>
 class ListwiseDeletion
 {
  public:
+  /**
+   * Impute function searches through the input looking for mappedValue and
+   * remove the whole row or column. The result is saved to the output.
+   *
+   * @param input Matrix that contains mappedValue.
+   * @param output Matrix that the result will be saved into.
+   * @param mappedValue Value that the user wants to get rid of.
+   * @param dimension Index of the dimension of the mappedValue.
+   * @param transpose State of whether the input matrix is transposed or not.
+   */
   void Impute(const arma::Mat<T>& input,
               arma::Mat<T>& output,
               const T& mappedValue,
@@ -47,7 +54,7 @@ class ListwiseDeletion
     }
     else
     {
-      for (size_t i = 0; i < input.n_rows; ++i)\
+      for (size_t i = 0; i < input.n_rows; ++i)
       {
         if (input(i, dimension) == mappedValue ||
              std::isnan(input(i, dimension)))
diff --git a/src/mlpack/core/data/imputation_methods/mean_imputation.hpp b/src/mlpack/core/data/imputation_methods/mean_imputation.hpp
index 157ba53..05134e5 100644
--- a/src/mlpack/core/data/imputation_methods/mean_imputation.hpp
+++ b/src/mlpack/core/data/imputation_methods/mean_imputation.hpp
@@ -8,20 +8,28 @@
 #define MLPACK_CORE_DATA_IMPUTE_STRATEGIES_MEAN_IMPUTATION_HPP
 
 #include <mlpack/core.hpp>
-#include <cmath>
-
-using namespace std;
 
 namespace mlpack {
 namespace data {
-
 /**
- * A simple mean imputation
+ * A simple mean imputation class
+ * @tparam T Type of armadillo matrix
  */
 template <typename T>
 class MeanImputation
 {
  public:
+  /**
+   * Impute function searches through the input looking for mappedValue and
+   * replaces it with the mean of the given dimension. The result is saved
+   * to the output.
+   *
+   * @param input Matrix that contains mappedValue.
+   * @param output Matrix that the result will be saved into.
+   * @param mappedValue Value that the user wants to get rid of.
+   * @param dimension Index of the dimension of the mappedValue.
+   * @param transpose State of whether the input matrix is transposed or not.
+   */
   void Impute(const arma::Mat<T>& input,
               arma::Mat<T>& output,
               const T& mappedValue,
diff --git a/src/mlpack/core/data/imputation_methods/median_imputation.hpp b/src/mlpack/core/data/imputation_methods/median_imputation.hpp
index 0035be9..8a111d4 100644
--- a/src/mlpack/core/data/imputation_methods/median_imputation.hpp
+++ b/src/mlpack/core/data/imputation_methods/median_imputation.hpp
@@ -9,19 +9,28 @@
 
 #include <mlpack/core.hpp>
 
-using namespace std;
-
 namespace mlpack {
 namespace data {
-
 /**
- * A simple median imputation
+ * This is a class implementation of simple median imputation.
  * replace missing value with middle or average of middle values
+ * @tparam T Type of armadillo matrix
  */
 template <typename T>
 class MedianImputation
 {
  public:
+  /**
+   * Impute function searches through the input looking for mappedValue and
+   * replaces it with the median of the given dimension. The result is saved
+   * to the output.
+   *
+   * @param input Matrix that contains mappedValue.
+   * @param output Matrix that the result will be saved into.
+   * @param mappedValue Value that the user wants to get rid of.
+   * @param dimension Index of the dimension of the mappedValue.
+   * @param transpose State of whether the input matrix is transposed or not.
+   */
   void Impute(const arma::Mat<T>& input,
               arma::Mat<T>& output,
               const T& mappedValue,
@@ -56,7 +65,7 @@ class MedianImputation
       }
     }
   }
-}; // class MeanImputation
+}; // class MedianImputation
 
 } // namespace data
 } // namespace mlpack
diff --git a/src/mlpack/core/data/imputer.hpp b/src/mlpack/core/data/imputer.hpp
index 3bc9cb5..b719ba2 100644
--- a/src/mlpack/core/data/imputer.hpp
+++ b/src/mlpack/core/data/imputer.hpp
@@ -9,7 +9,9 @@
 #define MLPACK_CORE_DATA_IMPUTER_HPP
 
 #include <mlpack/core.hpp>
-#include "dataset_info.hpp"
+#include "dataset_mapper.hpp"
+#include "map_policies/missing_policy.hpp"
+#include "map_policies/increment_policy.hpp"
 
 namespace mlpack {
 namespace data {
@@ -27,22 +29,18 @@ class Imputer
 {
  public:
   Imputer(MapperType mapper, bool transpose = true):
-    mapper(std::move(mapper)),
-    transpose(transpose)
+      mapper(std::move(mapper)),
+      transpose(transpose)
   {
-    //static_assert(std::is_same<typename std::decay<MapperType>::type,
-        //data::IncrementPolicy>::value, "The type of MapperType must be "
-        //"IncrementPolicy");
+    // Nothing to initialize here.
   }
 
   Imputer(MapperType mapper, StrategyType strategy, bool transpose = true):
-    strategy(std::move(strategy)),
-    mapper(std::move(mapper)),
-    transpose(transpose)
+      strategy(std::move(strategy)),
+      mapper(std::move(mapper)),
+      transpose(transpose)
   {
-    //static_assert(std::is_same<typename std::decay<MapperType>::type,
-        //data::IncrementPolicy>::value, "The type of MapperType must be "
-        //"IncrementPolicy");
+    // Nothing to initialize here.
   }
 
   /**
@@ -66,6 +64,13 @@ class Imputer
   /**
   * This overload of Impute() lets users to define custom value that can be
   * replaced with the target value.
+  *
+  * @param input Input dataset to apply imputation.
+  * @param output Armadillo matrix to save the results
+  * @oaran missingValue User defined missing value; it can be anything.
+  * @param customValue The numeric value that a user wants to replace
+  *        missingValue with.
+  * @param dimension Dimension to apply the imputation.
   */
   void Impute(const arma::Mat<T>& input,
               arma::Mat<T>& output,
diff --git a/src/mlpack/core/data/load.hpp b/src/mlpack/core/data/load.hpp
index 4b5debe..40d3834 100644
--- a/src/mlpack/core/data/load.hpp
+++ b/src/mlpack/core/data/load.hpp
@@ -14,7 +14,7 @@
 #include <string>
 
 #include "format.hpp"
-#include "dataset_info.hpp"
+#include "dataset_mapper.hpp"
 
 namespace mlpack {
 namespace data /** Functions to load and save matrices and models. */ {
diff --git a/src/mlpack/core/data/map_policies/datatype.hpp b/src/mlpack/core/data/map_policies/datatype.hpp
index 0cafba8..3a3b1ac 100644
--- a/src/mlpack/core/data/map_policies/datatype.hpp
+++ b/src/mlpack/core/data/map_policies/datatype.hpp
@@ -10,7 +10,6 @@
 
 namespace mlpack {
 namespace data {
-
 /**
  * The Datatype enum specifies the types of data mlpack algorithms can use.
  * The vast majority of mlpack algorithms can only use numeric data (i.e.
@@ -23,7 +22,6 @@ enum Datatype : bool /* [> bool is all the precision we need for two types <] */
   categorical = 1
 };
 
-
 } // namespace data
 } // namespace mlpack
 
diff --git a/src/mlpack/core/data/map_policies/increment_policy.hpp b/src/mlpack/core/data/map_policies/increment_policy.hpp
index 4a971d2..4ff7341 100644
--- a/src/mlpack/core/data/map_policies/increment_policy.hpp
+++ b/src/mlpack/core/data/map_policies/increment_policy.hpp
@@ -12,26 +12,39 @@
 #include <boost/bimap.hpp>
 #include <mlpack/core/data/map_policies/datatype.hpp>
 
-using namespace std;
-
 namespace mlpack {
 namespace data {
-
 /**
- * This class is used to map strings to incrementing unsigned integers (size_t).
- * First string to be mapped will be mapped to 0, next to 1 and so on.
+ * IncrementPolicy is used as a helper class for DatasetMapper. It tells how the
+ * strings should be mapped. Purpose of this policy is to map all dimension if
+ * one if the variables in a dimension turns out to be a categorical variable.
+ * IncrementPolicy maps strings to incrementing unsigned integers (size_t).
+ * The first string to be mapped will be mapped to 0, the next to 1 and so on.
  */
 class IncrementPolicy
 {
  public:
-  // typedef of mapped_type
-  using mapped_type = size_t;
+  // typedef of MappedType
+  using MappedType = size_t;
 
+  /**
+   * Given the string and the dimension to which the it belongs, and the maps
+   * and types given by the DatasetMapper class, returns its numeric mapping.
+   * If no mapping yet exists, the string is added to the list of mappings for
+   * the given dimension. This function is used as a helper function for
+   * DatasetMapper class.
+   *
+   * @tparam MapType Type of unordered_map that contains mapped value pairs
+   * @param string String to find/create mapping for.
+   * @param dimension Index of the dimension of the string.
+   * @param maps Unordered map given by the DatasetMapper.
+   * @param types Vector containing the type information about each dimensions.
+   */
   template <typename MapType>
-  mapped_type MapString(const std::string& string,
-                        const size_t dimension,
-                        MapType& maps,
-                        std::vector<Datatype>& types)
+  MappedType MapString(const std::string& string,
+                       const size_t dimension,
+                       MapType& maps,
+                       std::vector<Datatype>& types)
   {
     // If this condition is true, either we have no mapping for the given string
     // or we have no mappings for the given dimension at all.  In either case,
@@ -46,7 +59,7 @@ class IncrementPolicy
       if (numMappings == 0)
         types[dimension] = Datatype::categorical;
 
-      typedef boost::bimap<std::string, mapped_type>::value_type PairType;
+      typedef boost::bimap<std::string, MappedType>::value_type PairType;
       maps[dimension].first.insert(PairType(string, numMappings));
       return numMappings++;
     }
@@ -57,6 +70,21 @@ class IncrementPolicy
     }
   }
 
+  /**
+   * MapTokens turns vector of strings into numeric variables and puts them
+   * into a given matrix. It is used as a helper function when trying to load
+   * files. Each dimension's tokens are given in to this function. If one of the
+   * tokens turns out to be a string, all the tokens should be mapped using the
+   * MapString() funciton.
+   *
+   * @tparam eT Type of armadillo matrix.
+   * @tparam MapType Type of unordered_map that contains mapped value pairs.
+   * @param tokens Vector of variables inside a dimension.
+   * @param row Position of the given tokens.
+   * @param matrix Matrix to save the data into.
+   * @param maps Maps given by the DatasetMapper class.
+   * @param types Types of each dimensions given by the DatasetMapper class.
+   */
   template <typename eT, typename MapType>
   void MapTokens(const std::vector<std::string>& tokens,
                  size_t& row,
@@ -81,7 +109,6 @@ class IncrementPolicy
        {
          const eT val = static_cast<eT>(this->MapString(tokens[i], row, maps,
                                                         types));
-         double temp = (double) val;
          matrix.at(row, i) = val;
        }
     }
diff --git a/src/mlpack/core/data/map_policies/missing_policy.hpp b/src/mlpack/core/data/map_policies/missing_policy.hpp
index c68fc27..ead543a 100644
--- a/src/mlpack/core/data/map_policies/missing_policy.hpp
+++ b/src/mlpack/core/data/map_policies/missing_policy.hpp
@@ -13,38 +13,58 @@
 #include <mlpack/core/data/map_policies/datatype.hpp>
 #include <limits>
 
-
-using namespace std;
-
 namespace mlpack {
 namespace data {
-
 /**
- * Same as increment map policy, but does not change type of features.
+ * MissingPolicy is used as a helper class for DatasetMapper. It tells how the
+ * strings should be mapped. Purpose of this policy is to map all user-defined
+ * missing variables into maps so that users can decide what to do with the
+ * corrupted data. User-defined missing variables are given by the missingSet.
+ * Note that MissingPolicy does not change type of features.
  */
 class MissingPolicy
 {
  public:
-  // typedef of mapped_type
-  using mapped_type = double;
+  // typedef of MappedType
+  using MappedType = double;
 
   MissingPolicy()
   {
     // Nothing to initialize here.
   }
 
+  /**
+   * Create the MissingPolicy object with the given missingSet. Note that the
+   * missingSet cannot be changed later; you will have to create a new
+   * MissingPolicy object.
+   *
+   * @param missingSet Set of strings that should be mapped.
+   */
   explicit MissingPolicy(std::set<std::string> missingSet) :
-    missingSet(std::move(missingSet))
+      missingSet(std::move(missingSet))
   {
     // Nothing to initialize here.
   }
 
-
+  /**
+   * Given the string and the dimension to which it belongs by the user, and
+   * the maps and types given by the DatasetMapper class, returns its numeric
+   * mapping. If no mapping yet exists and the string is included in the
+   * missingSet, the string is added to the list of mappings for the given
+   * dimension. This function is used as a helper function for DatasetMapper
+   * class.
+   *
+   * @tparam MapType Type of unordered_map that contains mapped value pairs
+   * @param string String to find/create mapping for.
+   * @param dimension Index of the dimension of the string.
+   * @param maps Unordered map given by the DatasetMapper.
+   * @param types Vector containing the type information about each dimensions.
+   */
   template <typename MapType>
-  mapped_type MapString(const std::string& string,
-                        const size_t dimension,
-                        MapType maps,
-                        std::vector<Datatype>& types)
+  MappedType MapString(const std::string& string,
+                       const size_t dimension,
+                       MapType maps,
+                       std::vector<Datatype>& types)
   {
     // If this condition is true, either we have no mapping for the given string
     // or we have no mappings for the given dimension at all.  In either case,
@@ -57,7 +77,7 @@ class MissingPolicy
       // This string does not exist yet.
       size_t& numMappings = maps[dimension].second;
 
-      typedef boost::bimap<std::string, mapped_type>::value_type PairType;
+      typedef boost::bimap<std::string, MappedType>::value_type PairType;
       maps[dimension].first.insert(PairType(string, NaN));
 
       ++numMappings;
@@ -65,11 +85,28 @@ class MissingPolicy
     }
     else
     {
-      // This string already exists in the mapping or .
+      // This string already exists in the mapping
+      // or not included in missingSet.
       return NaN;
     }
   }
 
+  /**
+   * MapTokens turns vector of strings into numeric variables and puts them
+   * into a given matrix. It is used as a helper function when trying to load
+   * files. Each dimension's tokens are given in to this function. If one of the
+   * tokens turns out to be a string or one of the missingSet's variables, only
+   * the token responsible for it should be mapped using the MapString()
+   * funciton.
+   *
+   * @tparam eT Type of armadillo matrix.
+   * @tparam MapType Type of unordered_map that contains mapped value pairs.
+   * @param tokens Vector of variables inside a dimension.
+   * @param row Position of the given tokens.
+   * @param matrix Matrix to save the data into.
+   * @param maps Maps given by the DatasetMapper class.
+   * @param types Types of each dimensions given by the DatasetMapper class.
+   */
   template <typename eT, typename MapType>
   void MapTokens(const std::vector<std::string>& tokens,
                  size_t& row,
@@ -77,9 +114,14 @@ class MissingPolicy
                  MapType& maps,
                  std::vector<Datatype>& types)
   {
+    // MissingPolicy allows double type matrix only, because it uses NaN.
+    static_assert(std::is_same<eT, double>::value, "You must use double type "
+        " matrix in order to apply MissingPolicy");
+
     std::stringstream token;
     for (size_t i = 0; i != tokens.size(); ++i)
     {
+      // if token is a number, but is included in the missingSet, map it.
       if (missingSet.find(tokens[i]) != std::end(missingSet))
       {
          const eT val = static_cast<eT>(this->MapString(tokens[i], row, maps,
@@ -88,7 +130,8 @@ class MissingPolicy
       }
       token.str(tokens[i]);
       token>>matrix.at(row, i);
-      if (token.fail()) // if not number, map it to datasetmapper
+      // if the token is not number, map it.
+      if (token.fail())
       {
         const eT val = static_cast<eT>(this->MapString(tokens[i], row, maps,
                                                        types));
@@ -99,6 +142,8 @@ class MissingPolicy
   }
 
  private:
+  // Note that missingSet and maps are different.
+  // missingSet specifies which value/string should be mapped.
   std::set<std::string> missingSet;
 }; // class MissingPolicy
 
diff --git a/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp b/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp
index a0b0a13..603cdcc 100644
--- a/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp
+++ b/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp
@@ -7,7 +7,7 @@
  */
 #include <mlpack/core.hpp>
 #include <mlpack/core/data/imputer.hpp>
-#include <mlpack/core/data/dataset_info.hpp>
+#include <mlpack/core/data/dataset_mapper.hpp>
 #include <mlpack/core/data/map_policies/increment_policy.hpp>
 #include <mlpack/core/data/map_policies/missing_policy.hpp>
 #include <mlpack/core/data/imputation_methods/mean_imputation.hpp>
@@ -19,8 +19,8 @@ PROGRAM_INFO("Impute Data", "This utility takes a dataset and converts user "
     "defined missing variable to another to provide more meaningful analysis "
     "\n\n"
     "The program does not modify the original file, but instead makes a "
-    "separate file to save the output data; The program requires you to "
-    "specify the file name with --output_file (-o)."
+    "separate file to save the output data; You can save the output by "
+    "specifying the file name with --output_file (-o)."
     "\n\n"
     "For example, if we consider 'NULL' in dimension 0 to be a missing "
     "variable and want to delete whole row containing the NULL in the "
@@ -43,7 +43,6 @@ using namespace data;
 
 int main(int argc, char** argv)
 {
-  // Parse command line options.
   CLI::ParseCommandLine(argc, argv);
 
   const string inputFile = CLI::GetParam<string>("input_file");
@@ -53,21 +52,25 @@ int main(int argc, char** argv)
   const size_t dimension = (size_t) CLI::GetParam<int>("dimension");
   string strategy = CLI::GetParam<string>("strategy");
 
-  // missing value should be specified
+  // The program needs user-defined missing values.
+  // Missing values can be any list of strings such as "1", "a", "NULL".
   if (!CLI::HasParam("missing_value"))
     Log::Fatal << "--missing_value must be specified in order to perform "
         << "any imputation strategies." << endl;
 
-  // warn if user did not specify output_file
+  if (!CLI::HasParam("strategy"))
+    Log::Fatal << "--strategy must be specified in order to perform "
+        << "imputation."<< endl;
+
   if (!CLI::HasParam("output_file"))
     Log::Warn << "--output_file is not specified, no "
         << "results from this program will be saved!" << endl;
 
-  // warn if user did not specify dimension
   if (!CLI::HasParam("dimension"))
-    Log::Warn << "--dimension is required to be specified!" << endl;
+    Log::Warn << "--dimension is not specified, the imputation will be "
+        << "applied to all dimensions."<< endl;
 
-  // if custom value is specified, and imputation strategy is not,
+  // If custom value is specified, and imputation strategy is not,
   // set imputation strategy to "custom"
   if (CLI::HasParam("custom_value") && !CLI::HasParam("impute_strategy"))
   {
@@ -76,7 +79,7 @@ int main(int argc, char** argv)
         << "--impute_strategy is automatically set to 'custom'." << endl;
   }
 
-  // custom value and any other impute strategies cannot be specified at
+  // Custom value and any other impute strategies cannot be specified at
   // the same time.
   if (CLI::HasParam("custom_value") && CLI::HasParam("impute_strategy") &&
       strategy != "custom")
@@ -89,6 +92,7 @@ int main(int argc, char** argv)
         << "'custom' strategy" << endl;
 
   arma::mat input;
+  arma::mat output;
   // Policy tells how the DatasetMapper should map the values.
   std::set<std::string> missingSet;
   missingSet.insert(missingValue);
@@ -98,9 +102,6 @@ int main(int argc, char** argv)
 
   Load(inputFile, input, info, true, true);
 
-  // for testing purpose
-  Log::Info << input << endl;
-
   // print how many mapping exist in each dimensions
   for (size_t i = 0; i < input.n_rows; ++i)
   {
@@ -108,49 +109,68 @@ int main(int argc, char** argv)
         << endl;
   }
 
-  arma::Mat<double> output(input);
-
-  Log::Info << "Performing '" << strategy << "' imputation strategy "
-      << "on dimension '" << dimension << "'." << endl;
-
-  // custom strategy only
-  if (strategy == "custom")
+  // default imputer is mean imputation (to provide scope)
+  Imputer<double, MapperType, MeanImputation<double>> impu(info);
+  if (strategy == "median")
+  {
+    Imputer<double, MapperType, MedianImputation<double>> impu(info);
+  }
+  else if (strategy == "listwise_deletion")
   {
-    Log::Info << "Replacing all '" << missingValue << "' with '" << customValue
-        << "'." << endl;
-    Imputer<double, MapperType, CustomImputation<double>> impu(info);
-    impu.Impute(input, output, missingValue, customValue, dimension);
+    Imputer<double, MapperType, ListwiseDeletion<double>> impu(info);
   }
   else
   {
-    Log::Info << "Replacing all '" << missingValue << "' with '"
-        << strategy << "' strategy." << endl;
+    Log::Fatal << "'" <<  strategy << "' imputation strategy does not exist"
+        << endl;
+  }
 
-    if (strategy == "mean")
+  // Initialize imputer class
+
+  if (CLI::HasParam("dimension"))
+  {
+    // when --dimension is specified,
+    // the program will apply the changes to only the given dimension.
+    Log::Info << "Performing '" << strategy << "' imputation strategy "
+        << "to replace '" << missingValue << "' on all dimensions." << endl;
+
+    if (strategy == "custom")
     {
-      Imputer<double, MapperType, MeanImputation<double>> impu(info);
-      impu.Impute(input, output, missingValue, dimension);
+      Imputer<double, MapperType, CustomImputation<double>> impu(info);
+      impu.Impute(input, output, missingValue, customValue, dimension);
     }
-    else if (strategy == "median")
+    else
     {
-      Imputer<double, MapperType, MedianImputation<double>> impu(info);
       impu.Impute(input, output, missingValue, dimension);
     }
-    else if (strategy == "listwise_deletion")
+  }
+  else
+  {
+    // when --dimension is not specified,
+    // the program will apply the changes to all dimensions.
+    Log::Info << "Performing '" << strategy << "' imputation strategy "
+        << "to replace '" << missingValue << "' on all dimensions." << endl;
+
+    if (strategy == "custom")
     {
-      Imputer<double, MapperType, ListwiseDeletion<double>> impu(info);
-      impu.Impute(input, output, missingValue, dimension);
+      Imputer<double, MapperType, CustomImputation<double>> impu(info);
+      for (size_t i = 0; i < input.n_rows; ++i)
+      {
+        impu.Impute(input, output, missingValue, customValue, i);
+      }
     }
     else
     {
-      Log::Warn << "You did not choose any imputation strategy" << endl;
+      for (size_t i = 0; i < input.n_rows; ++i)
+      {
+        impu.Impute(input, output, missingValue, i);
+      }
     }
   }
 
-
   if (!outputFile.empty())
   {
-    Log::Info << "Saving model to '" << outputFile << "'." << endl;
+    Log::Info << "Saving results to '" << outputFile << "'." << endl;
     Save(outputFile, output, false);
   }
 }
diff --git a/src/mlpack/tests/imputation_test.cpp b/src/mlpack/tests/imputation_test.cpp
index f02e97e..e118bfb 100644
--- a/src/mlpack/tests/imputation_test.cpp
+++ b/src/mlpack/tests/imputation_test.cpp
@@ -7,7 +7,7 @@
 #include <sstream>
 
 #include <mlpack/core.hpp>
-#include <mlpack/core/data/dataset_info.hpp>
+#include <mlpack/core/data/dataset_mapper.hpp>
 #include <mlpack/core/data/map_policies/increment_policy.hpp>
 #include <mlpack/core/data/map_policies/missing_policy.hpp>
 #include <mlpack/core/data/imputer.hpp>




More information about the mlpack-git mailing list