[mlpack-git] master: modify custom impute interface and rename variables (2eb6754)

gitdub at mlpack.org gitdub at mlpack.org
Sun Jul 10 17:26:48 EDT 2016


Repository : https://github.com/mlpack/mlpack
On branch  : master
Link       : https://github.com/mlpack/mlpack/compare/ecbfd24defe31d9f39708c0b4c6ad352cd46ed5c...7eec0609aa21cb12aeed3cbcaa1e411dad0359f2

>---------------------------------------------------------------

commit 2eb675467528574b65fa23d78e4e8d3e6e6ea6c5
Author: Keon Kim <kwk236 at gmail.com>
Date:   Mon Jul 11 06:26:48 2016 +0900

    modify custom impute interface and rename variables


>---------------------------------------------------------------

2eb675467528574b65fa23d78e4e8d3e6e6ea6c5
 src/mlpack/core/data/dataset_mapper.hpp            |  1 +
 .../data/imputation_methods/custom_imputation.hpp  | 20 +++++---
 .../data/imputation_methods/listwise_deletion.hpp  |  6 +--
 .../data/imputation_methods/mean_imputation.hpp    |  6 +--
 .../data/imputation_methods/median_imputation.hpp  |  6 +--
 src/mlpack/core/data/imputer.hpp                   | 36 +++----------
 .../core/data/map_policies/missing_policy.hpp      | 20 ++++----
 .../methods/preprocess/preprocess_imputer_main.cpp | 59 +++++++++-------------
 src/mlpack/tests/data/impute_test.csv              |  3 ++
 src/mlpack/tests/imputation_test.cpp               | 16 +++---
 10 files changed, 76 insertions(+), 97 deletions(-)

diff --git a/src/mlpack/core/data/dataset_mapper.hpp b/src/mlpack/core/data/dataset_mapper.hpp
index ab9340c..0001438 100644
--- a/src/mlpack/core/data/dataset_mapper.hpp
+++ b/src/mlpack/core/data/dataset_mapper.hpp
@@ -131,6 +131,7 @@ class DatasetMapper
 
   // Mappings from strings to integers.
   // Map entries will only exist for dimensions that are categorical.
+  // MapType = map<dimension, pair<bimap<string, MappedType>, numMappings>>
   using MapType = std::unordered_map<size_t, std::pair<BiMapType, size_t>>;
 
   MapType maps;
diff --git a/src/mlpack/core/data/imputation_methods/custom_imputation.hpp b/src/mlpack/core/data/imputation_methods/custom_imputation.hpp
index 1698ba9..a34658b 100644
--- a/src/mlpack/core/data/imputation_methods/custom_imputation.hpp
+++ b/src/mlpack/core/data/imputation_methods/custom_imputation.hpp
@@ -19,30 +19,35 @@ template <typename T>
 class CustomImputation
 {
  public:
+  CustomImputation(T customValue):
+      customValue(std::move(customValue))
+  {
+    // nothing to initialize here
+  }
+
   /**
    * Impute function searches through the input looking for mappedValue and
    * replaces it with the user-defined custom value of the given dimension.
-   * The result is saved to the output.
+   * The result is saved to the output. Custom value must be set when
+   * initializing the CustomImputation object.
    *
    * @param input Matrix that contains mappedValue.
    * @param output Matrix that the result will be saved into.
    * @param mappedValue Value that the user wants to get rid of.
-   * @param customValue Value that the user wants to replace mappedValue with.
    * @param dimension Index of the dimension of the mappedValue.
-   * @param transpose State of whether the input matrix is transposed or not.
+   * @param columnMajor State of whether the input matrix is columnMajor or not.
    */
   void Impute(const arma::Mat<T>& input,
               arma::Mat<T>& output,
               const T& mappedValue,
-              const T& customValue,
               const size_t dimension,
-              const bool transpose = true)
+              const bool columnMajor = true)
   {
     // initiate output
     output = input;
 
     // replace the target value to custom value
-    if (transpose)
+    if (columnMajor)
     {
       for (size_t i = 0; i < input.n_cols; ++i)
       {
@@ -65,6 +70,9 @@ class CustomImputation
       }
     }
   }
+
+ private:
+  T customValue;
 }; // class CustomImputation
 
 } // namespace data
diff --git a/src/mlpack/core/data/imputation_methods/listwise_deletion.hpp b/src/mlpack/core/data/imputation_methods/listwise_deletion.hpp
index 06db83a..9a695a6 100644
--- a/src/mlpack/core/data/imputation_methods/listwise_deletion.hpp
+++ b/src/mlpack/core/data/imputation_methods/listwise_deletion.hpp
@@ -28,19 +28,19 @@ class ListwiseDeletion
    * @param output Matrix that the result will be saved into.
    * @param mappedValue Value that the user wants to get rid of.
    * @param dimension Index of the dimension of the mappedValue.
-   * @param transpose State of whether the input matrix is transposed or not.
+   * @param columnMajor State of whether the input matrix is columnMajor or not.
    */
   void Impute(const arma::Mat<T>& input,
               arma::Mat<T>& output,
               const T& mappedValue,
               const size_t dimension,
-              const bool transpose = true)
+              const bool columnMajor = true)
   {
     // initiate output
     output = input;
     size_t count = 0;
 
-    if (transpose)
+    if (columnMajor)
     {
       for (size_t i = 0; i < input.n_cols; ++i)
       {
diff --git a/src/mlpack/core/data/imputation_methods/mean_imputation.hpp b/src/mlpack/core/data/imputation_methods/mean_imputation.hpp
index 05134e5..c4085c6 100644
--- a/src/mlpack/core/data/imputation_methods/mean_imputation.hpp
+++ b/src/mlpack/core/data/imputation_methods/mean_imputation.hpp
@@ -28,13 +28,13 @@ class MeanImputation
    * @param output Matrix that the result will be saved into.
    * @param mappedValue Value that the user wants to get rid of.
    * @param dimension Index of the dimension of the mappedValue.
-   * @param transpose State of whether the input matrix is transposed or not.
+   * @param columnMajor State of whether the input matrix is columnMajor or not.
    */
   void Impute(const arma::Mat<T>& input,
               arma::Mat<T>& output,
               const T& mappedValue,
               const size_t dimension,
-              const bool transpose = true)
+              const bool columnMajor = true)
   {
     // initiate output
     output = input;
@@ -49,7 +49,7 @@ class MeanImputation
 
     // calculate number of elements and sum of them excluding mapped value or
     // nan. while doing that, remember where mappedValue or NaN exists.
-    if (transpose)
+    if (columnMajor)
     {
       for (size_t i = 0; i < input.n_cols; ++i)
       {
diff --git a/src/mlpack/core/data/imputation_methods/median_imputation.hpp b/src/mlpack/core/data/imputation_methods/median_imputation.hpp
index 8a111d4..0022366 100644
--- a/src/mlpack/core/data/imputation_methods/median_imputation.hpp
+++ b/src/mlpack/core/data/imputation_methods/median_imputation.hpp
@@ -29,18 +29,18 @@ class MedianImputation
    * @param output Matrix that the result will be saved into.
    * @param mappedValue Value that the user wants to get rid of.
    * @param dimension Index of the dimension of the mappedValue.
-   * @param transpose State of whether the input matrix is transposed or not.
+   * @param columnMajor State of whether the input matrix is columnMajor or not.
    */
   void Impute(const arma::Mat<T>& input,
               arma::Mat<T>& output,
               const T& mappedValue,
               const size_t dimension,
-              const bool transpose = true)
+              const bool columnMajor = true)
   {
     //initiate output
     output = input;
 
-    if (transpose)
+    if (columnMajor)
     {
       arma::Mat<T> medianMat = arma::median(input, 1);
       for (size_t i = 0; i < input.n_cols; ++i)
diff --git a/src/mlpack/core/data/imputer.hpp b/src/mlpack/core/data/imputer.hpp
index b719ba2..a30508b 100644
--- a/src/mlpack/core/data/imputer.hpp
+++ b/src/mlpack/core/data/imputer.hpp
@@ -28,17 +28,17 @@ template<typename T, typename MapperType, typename StrategyType>
 class Imputer
 {
  public:
-  Imputer(MapperType mapper, bool transpose = true):
+  Imputer(MapperType mapper, bool columnMajor = true):
       mapper(std::move(mapper)),
-      transpose(transpose)
+      columnMajor(columnMajor)
   {
     // Nothing to initialize here.
   }
 
-  Imputer(MapperType mapper, StrategyType strategy, bool transpose = true):
+  Imputer(MapperType mapper, StrategyType strategy, bool columnMajor = true):
       strategy(std::move(strategy)),
       mapper(std::move(mapper)),
-      transpose(transpose)
+      columnMajor(columnMajor)
   {
     // Nothing to initialize here.
   }
@@ -58,29 +58,7 @@ class Imputer
               const size_t dimension)
   {
     T mappedValue = static_cast<T>(mapper.UnmapValue(missingValue, dimension));
-    strategy.Impute(input, output, mappedValue, dimension, transpose);
-  }
-
-  /**
-  * This overload of Impute() lets users to define custom value that can be
-  * replaced with the target value.
-  *
-  * @param input Input dataset to apply imputation.
-  * @param output Armadillo matrix to save the results
-  * @oaran missingValue User defined missing value; it can be anything.
-  * @param customValue The numeric value that a user wants to replace
-  *        missingValue with.
-  * @param dimension Dimension to apply the imputation.
-  */
-  void Impute(const arma::Mat<T>& input,
-              arma::Mat<T>& output,
-              const std::string& missingValue,
-              const T& customValue,
-              const size_t dimension)
-  {
-    T mappedValue = static_cast<T>(mapper.UnmapValue(missingValue, dimension));
-    strategy.Impute(input, output, mappedValue, customValue, dimension,
-                    transpose);
+    strategy.Impute(input, output, mappedValue, dimension, columnMajor);
   }
 
   //! Get the strategy
@@ -102,8 +80,8 @@ class Imputer
   // DatasetMapperType<MapPolicy>
   MapperType mapper;
 
-  // save transpose as a member variable since it is rarely changed.
-  bool transpose;
+  // save columnMajor as a member variable since it is rarely changed.
+  bool columnMajor;
 
 }; // class Imputer
 
diff --git a/src/mlpack/core/data/map_policies/missing_policy.hpp b/src/mlpack/core/data/map_policies/missing_policy.hpp
index ead543a..ff60a5a 100644
--- a/src/mlpack/core/data/map_policies/missing_policy.hpp
+++ b/src/mlpack/core/data/map_policies/missing_policy.hpp
@@ -63,9 +63,11 @@ class MissingPolicy
   template <typename MapType>
   MappedType MapString(const std::string& string,
                        const size_t dimension,
-                       MapType maps,
+                       MapType& maps,
                        std::vector<Datatype>& types)
   {
+    // mute the unused parameter warning (does nothing here.)
+    (void)types;
     // If this condition is true, either we have no mapping for the given string
     // or we have no mappings for the given dimension at all.  In either case,
     // we create a mapping.
@@ -75,11 +77,10 @@ class MissingPolicy
          maps[dimension].first.left.count(string) == 0))
     {
       // This string does not exist yet.
-      size_t& numMappings = maps[dimension].second;
-
       typedef boost::bimap<std::string, MappedType>::value_type PairType;
       maps[dimension].first.insert(PairType(string, NaN));
 
+      size_t& numMappings = maps[dimension].second;
       ++numMappings;
       return NaN;
     }
@@ -87,6 +88,9 @@ class MissingPolicy
     {
       // This string already exists in the mapping
       // or not included in missingSet.
+      // Unlike IncrementPolicy, MissingPolicy counts all mapped values.
+      size_t& numMappings = maps[dimension].second;
+      ++numMappings;
       return NaN;
     }
   }
@@ -121,17 +125,11 @@ class MissingPolicy
     std::stringstream token;
     for (size_t i = 0; i != tokens.size(); ++i)
     {
-      // if token is a number, but is included in the missingSet, map it.
-      if (missingSet.find(tokens[i]) != std::end(missingSet))
-      {
-         const eT val = static_cast<eT>(this->MapString(tokens[i], row, maps,
-                                                        types));
-         matrix.at(row, i) = val;
-      }
       token.str(tokens[i]);
       token>>matrix.at(row, i);
       // if the token is not number, map it.
-      if (token.fail())
+      // or if token is a number, but is included in the missingSet, map it.
+      if (token.fail() || missingSet.find(tokens[i]) != std::end(missingSet))
       {
         const eT val = static_cast<eT>(this->MapString(tokens[i], row, maps,
                                                        types));
diff --git a/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp b/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp
index 603cdcc..e367b6a 100644
--- a/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp
+++ b/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp
@@ -72,16 +72,16 @@ int main(int argc, char** argv)
 
   // If custom value is specified, and imputation strategy is not,
   // set imputation strategy to "custom"
-  if (CLI::HasParam("custom_value") && !CLI::HasParam("impute_strategy"))
+  if (CLI::HasParam("custom_value") && !CLI::HasParam("strategy"))
   {
     strategy = "custom";
-    Log::Warn << "--custom_value is specified without --impute_strategy, "
-        << "--impute_strategy is automatically set to 'custom'." << endl;
+    Log::Warn << "--custom_value is specified without --strategy, "
+        << "--strategy is automatically set to 'custom'." << endl;
   }
 
   // Custom value and any other impute strategies cannot be specified at
   // the same time.
-  if (CLI::HasParam("custom_value") && CLI::HasParam("impute_strategy") &&
+  if (CLI::HasParam("custom_value") && CLI::HasParam("strategy") &&
       strategy != "custom")
     Log::Fatal << "--custom_value cannot be specified with "
         << "impute strategies excluding 'custom' strategy" << endl;
@@ -109,15 +109,26 @@ int main(int argc, char** argv)
         << endl;
   }
 
-  // default imputer is mean imputation (to provide scope)
-  Imputer<double, MapperType, MeanImputation<double>> impu(info);
-  if (strategy == "median")
+  Log::Info << input << endl;
+
+  // Initialize imputer class
+  Imputer<double, MapperType, MeanImputation<double>> imputer(info);
+  if (strategy == "mean")
+  {
+    Imputer<double, MapperType, MeanImputation<double>> imputer(info);
+  }
+  else if (strategy == "median")
   {
-    Imputer<double, MapperType, MedianImputation<double>> impu(info);
+    Imputer<double, MapperType, MedianImputation<double>> imputer(info);
   }
   else if (strategy == "listwise_deletion")
   {
-    Imputer<double, MapperType, ListwiseDeletion<double>> impu(info);
+    Imputer<double, MapperType, ListwiseDeletion<double>> imputer(info);
+  }
+  else if (strategy == "custom")
+  {
+    CustomImputation<double> strat(customValue);
+    Imputer<double, MapperType, CustomImputation<double>> imputer(info, strat);
   }
   else
   {
@@ -125,24 +136,15 @@ int main(int argc, char** argv)
         << endl;
   }
 
-  // Initialize imputer class
-
   if (CLI::HasParam("dimension"))
   {
     // when --dimension is specified,
     // the program will apply the changes to only the given dimension.
     Log::Info << "Performing '" << strategy << "' imputation strategy "
-        << "to replace '" << missingValue << "' on all dimensions." << endl;
+        << "to replace '" << missingValue << "' on dimension " << dimension
+        << "." << endl;
 
-    if (strategy == "custom")
-    {
-      Imputer<double, MapperType, CustomImputation<double>> impu(info);
-      impu.Impute(input, output, missingValue, customValue, dimension);
-    }
-    else
-    {
-      impu.Impute(input, output, missingValue, dimension);
-    }
+    imputer.Impute(input, output, missingValue, dimension);
   }
   else
   {
@@ -151,20 +153,9 @@ int main(int argc, char** argv)
     Log::Info << "Performing '" << strategy << "' imputation strategy "
         << "to replace '" << missingValue << "' on all dimensions." << endl;
 
-    if (strategy == "custom")
-    {
-      Imputer<double, MapperType, CustomImputation<double>> impu(info);
-      for (size_t i = 0; i < input.n_rows; ++i)
-      {
-        impu.Impute(input, output, missingValue, customValue, i);
-      }
-    }
-    else
+    for (size_t i = 0; i < input.n_rows; ++i)
     {
-      for (size_t i = 0; i < input.n_rows; ++i)
-      {
-        impu.Impute(input, output, missingValue, i);
-      }
+      imputer.Impute(input, output, missingValue, i);
     }
   }
 
diff --git a/src/mlpack/tests/data/impute_test.csv b/src/mlpack/tests/data/impute_test.csv
new file mode 100644
index 0000000..06256a4
--- /dev/null
+++ b/src/mlpack/tests/data/impute_test.csv
@@ -0,0 +1,3 @@
+a, 2, 3
+5, 6, a
+1, 9, 1
diff --git a/src/mlpack/tests/imputation_test.cpp b/src/mlpack/tests/imputation_test.cpp
index e118bfb..08ef4e1 100644
--- a/src/mlpack/tests/imputation_test.cpp
+++ b/src/mlpack/tests/imputation_test.cpp
@@ -42,9 +42,8 @@ BOOST_AUTO_TEST_CASE(DatasetMapperImputerTest)
 
   std::set<string> mset;
   mset.insert("a");
-  mset.insert("b");
-  MissingPolicy miss(mset);
-  DatasetMapper<MissingPolicy> info(miss);
+  MissingPolicy policy(mset);
+  DatasetMapper<MissingPolicy> info(policy);
   BOOST_REQUIRE(data::Load("test_file.csv", input, info) == true);
 
   // row and column test
@@ -63,10 +62,11 @@ BOOST_AUTO_TEST_CASE(DatasetMapperImputerTest)
   BOOST_REQUIRE(std::isnan(input(2, 1)) == true);
   BOOST_REQUIRE_CLOSE(input(2, 2), 10.0, 1e-5);
 
+  CustomImputation<double> customStrategy(99); // convert missing vals to 99.
   Imputer<double,
           DatasetMapper<MissingPolicy>,
-          CustomImputation<double>> imputer(info);
-  imputer.Impute(input, output, "a", 99, 0); // convert a -> 99 for dimension 0
+          CustomImputation<double>> imputer(info, customStrategy);
+  imputer.Impute(input, output, "a", 0); // convert a -> 99 for dimension 0
 
   // Custom imputation result check
   BOOST_REQUIRE_CLOSE(output(0, 0), 99.0, 1e-5);
@@ -96,10 +96,10 @@ BOOST_AUTO_TEST_CASE(CustomImputationTest)
   double customValue = 99;
   double mappedValue = 0.0;
 
-  CustomImputation<double> imputer;
+  CustomImputation<double> imputer(customValue);
 
   // transposed
-  imputer.Impute(input, outputT, mappedValue, customValue, 0/*dimension*/, true);
+  imputer.Impute(input, outputT, mappedValue, 0/*dimension*/, true);
 
   BOOST_REQUIRE_CLOSE(outputT(0, 0), 3.0, 1e-5);
   BOOST_REQUIRE_CLOSE(outputT(0, 1), 99.0, 1e-5);
@@ -115,7 +115,7 @@ BOOST_AUTO_TEST_CASE(CustomImputationTest)
   BOOST_REQUIRE_CLOSE(outputT(2, 3), 8.0, 1e-5);
 
   // not transposed
-  imputer.Impute(input, output, mappedValue, customValue, 1, false);
+  imputer.Impute(input, output, mappedValue, 1, false);
 
   BOOST_REQUIRE_CLOSE(output(0, 0), 3.0, 1e-5);
   BOOST_REQUIRE_CLOSE(output(0, 1), 99.0, 1e-5);




More information about the mlpack-git mailing list