[mlpack-git] master: MissingPolicy uses NaN instead of numbers (896a018)

gitdub at mlpack.org gitdub at mlpack.org
Mon Jul 25 12:19:00 EDT 2016


Repository : https://github.com/mlpack/mlpack
On branch  : master
Link       : https://github.com/mlpack/mlpack/compare/ecbfd24defe31d9f39708c0b4c6ad352cd46ed5c...7eec0609aa21cb12aeed3cbcaa1e411dad0359f2

>---------------------------------------------------------------

commit 896a01819631f3a4a0d7ab3fb08189d02435f1c9
Author: Keon Kim <kwk236 at gmail.com>
Date:   Mon Jul 4 11:23:27 2016 +0900

    MissingPolicy uses NaN instead of numbers


>---------------------------------------------------------------

896a01819631f3a4a0d7ab3fb08189d02435f1c9
 src/mlpack/core/data/dataset_info.hpp              |  2 +-
 src/mlpack/core/data/dataset_info_impl.hpp         |  2 +-
 .../data/imputation_methods/custom_imputation.hpp  |  7 +++++--
 .../data/imputation_methods/listwise_deletion.hpp  |  7 +++++--
 .../data/imputation_methods/mean_imputation.hpp    |  7 +++++--
 .../data/imputation_methods/median_imputation.hpp  |  6 ++++--
 .../core/data/map_policies/increment_policy.hpp    |  2 +-
 .../core/data/map_policies/missing_policy.hpp      | 14 +++++++++-----
 src/mlpack/tests/imputation_test.cpp               | 22 ++++++++++++++++------
 9 files changed, 47 insertions(+), 22 deletions(-)

diff --git a/src/mlpack/core/data/dataset_info.hpp b/src/mlpack/core/data/dataset_info.hpp
index d87e027..14f2e1c 100644
--- a/src/mlpack/core/data/dataset_info.hpp
+++ b/src/mlpack/core/data/dataset_info.hpp
@@ -108,7 +108,7 @@ class DatasetMapper
   }
 
   //! Return the policy of the mapper.
-  PolicyType& Policy() const;
+  const PolicyType& Policy() const;
 
   //! Modify the policy of the mapper (be careful!).
   PolicyType& Policy();
diff --git a/src/mlpack/core/data/dataset_info_impl.hpp b/src/mlpack/core/data/dataset_info_impl.hpp
index 1c35097..0f88688 100644
--- a/src/mlpack/core/data/dataset_info_impl.hpp
+++ b/src/mlpack/core/data/dataset_info_impl.hpp
@@ -114,7 +114,7 @@ inline size_t DatasetMapper<PolicyType>::Dimensionality() const
 }
 
 template<typename PolicyType>
-inline PolicyType& DatasetMapper<PolicyType>::Policy() const
+inline const PolicyType& DatasetMapper<PolicyType>::Policy() const
 {
   return this->policy;
 }
diff --git a/src/mlpack/core/data/imputation_methods/custom_imputation.hpp b/src/mlpack/core/data/imputation_methods/custom_imputation.hpp
index 73100e2..fc95e30 100644
--- a/src/mlpack/core/data/imputation_methods/custom_imputation.hpp
+++ b/src/mlpack/core/data/imputation_methods/custom_imputation.hpp
@@ -8,6 +8,7 @@
 #define MLPACK_CORE_DATA_IMPUTE_STRATEGIES_CUSTOM_IMPUTATION_HPP
 
 #include <mlpack/core.hpp>
+#include <cmath>
 
 using namespace std;
 
@@ -33,7 +34,8 @@ class CustomImputation
     {
       for (size_t i = 0; i < input.n_cols; ++i)
       {
-        if (input(dimension, i) == mappedValue)
+        if (input(dimension, i) == mappedValue ||
+            std::isnan(input(dimension, i)))
         {
           output(dimension, i) = customValue;
         }
@@ -43,7 +45,8 @@ class CustomImputation
     {
       for (size_t i = 0; i < input.n_rows; ++i)
       {
-        if (input(i, dimension) == mappedValue)
+        if (input(i, dimension) == mappedValue ||
+            std::isnan(input(i, dimension)))
         {
           output(i, dimension) = customValue;
         }
diff --git a/src/mlpack/core/data/imputation_methods/listwise_deletion.hpp b/src/mlpack/core/data/imputation_methods/listwise_deletion.hpp
index f957a85..19487fa 100644
--- a/src/mlpack/core/data/imputation_methods/listwise_deletion.hpp
+++ b/src/mlpack/core/data/imputation_methods/listwise_deletion.hpp
@@ -8,6 +8,7 @@
 #define MLPACK_CORE_DATA_IMPUTE_STRATEGIES_LISTWISE_DELETION_HPP
 
 #include <mlpack/core.hpp>
+#include <cmath>
 
 using namespace std;
 
@@ -36,7 +37,8 @@ class ListwiseDeletion
     {
       for (size_t i = 0; i < input.n_cols; ++i)
       {
-         if (input(dimension, i) == mappedValue)
+         if (input(dimension, i) == mappedValue ||
+             std::isnan(input(dimension, i)))
          {
            output.shed_col(i - count);
            count++;
@@ -47,7 +49,8 @@ class ListwiseDeletion
     {
       for (size_t i = 0; i < input.n_rows; ++i)\
       {
-        if (input(i, dimension) == mappedValue)
+        if (input(i, dimension) == mappedValue ||
+             std::isnan(input(i, dimension)))
         {
            output.shed_row(i - count);
            count++;
diff --git a/src/mlpack/core/data/imputation_methods/mean_imputation.hpp b/src/mlpack/core/data/imputation_methods/mean_imputation.hpp
index 43f14a0..3c3f853 100644
--- a/src/mlpack/core/data/imputation_methods/mean_imputation.hpp
+++ b/src/mlpack/core/data/imputation_methods/mean_imputation.hpp
@@ -8,6 +8,7 @@
 #define MLPACK_CORE_DATA_IMPUTE_STRATEGIES_MEAN_IMPUTATION_HPP
 
 #include <mlpack/core.hpp>
+#include <cmath>
 
 using namespace std;
 
@@ -44,7 +45,8 @@ class MeanImputation
     {
       for (size_t i = 0; i < input.n_cols; ++i)
       {
-        if (input(dimension, i) == mappedValue)
+        if (input(dimension, i) == mappedValue ||
+            std::isnan(input(dimension, i)))
         {
           targets.emplace_back(dimension, i);
         }
@@ -59,7 +61,8 @@ class MeanImputation
     {
       for (size_t i = 0; i < input.n_rows; ++i)
       {
-        if (input(i, dimension) == mappedValue)
+        if (input(i, dimension) == mappedValue ||
+            std::isnan(input(i, dimension)))
         {
           targets.emplace_back(i, dimension);
         }
diff --git a/src/mlpack/core/data/imputation_methods/median_imputation.hpp b/src/mlpack/core/data/imputation_methods/median_imputation.hpp
index 05eff34..c46d326 100644
--- a/src/mlpack/core/data/imputation_methods/median_imputation.hpp
+++ b/src/mlpack/core/data/imputation_methods/median_imputation.hpp
@@ -36,7 +36,8 @@ class MedianImputation
       arma::Mat<T> medianMat = arma::median(input, 1);
       for (size_t i = 0; i < input.n_cols; ++i)
       {
-        if (input(dimension, i) == mappedValue)
+        if (input(dimension, i) == mappedValue ||
+            std::isnan(input(dimension, i)))
         {
           output(dimension, i) = medianMat(dimension, 0);
         }
@@ -47,7 +48,8 @@ class MedianImputation
       arma::Mat<T> medianMat = arma::median(input, 0);
       for (size_t i = 0; i < input.n_rows; ++i)
       {
-        if (input(i, dimension) == mappedValue)
+        if (input(i, dimension) == mappedValue ||
+            std::isnan(input(i, dimension)))
         {
           output(i, dimension) = medianMat(0, dimension);
         }
diff --git a/src/mlpack/core/data/map_policies/increment_policy.hpp b/src/mlpack/core/data/map_policies/increment_policy.hpp
index d4b104b..3aa0956 100644
--- a/src/mlpack/core/data/map_policies/increment_policy.hpp
+++ b/src/mlpack/core/data/map_policies/increment_policy.hpp
@@ -46,7 +46,7 @@ class IncrementPolicy
       if (numMappings == 0)
         types[dimension] = Datatype::categorical;
 
-      typedef boost::bimap<std::string, size_t>::value_type PairType;
+      typedef boost::bimap<std::string, mapped_type>::value_type PairType;
       maps[dimension].first.insert(PairType(string, numMappings));
       return numMappings++;
     }
diff --git a/src/mlpack/core/data/map_policies/missing_policy.hpp b/src/mlpack/core/data/map_policies/missing_policy.hpp
index c5df023..b041fe1 100644
--- a/src/mlpack/core/data/map_policies/missing_policy.hpp
+++ b/src/mlpack/core/data/map_policies/missing_policy.hpp
@@ -11,6 +11,7 @@
 #include <unordered_map>
 #include <boost/bimap.hpp>
 #include <mlpack/core/data/map_policies/datatype.hpp>
+#include <limits>
 
 
 using namespace std;
@@ -25,7 +26,7 @@ class MissingPolicy
 {
  public:
   // typedef of mapped_type
-  using mapped_type = size_t;
+  using mapped_type = double;
 
   MissingPolicy()
   {
@@ -48,21 +49,24 @@ class MissingPolicy
     // If this condition is true, either we have no mapping for the given string
     // or we have no mappings for the given dimension at all.  In either case,
     // we create a mapping.
+    const double NaN = std::numeric_limits<double>::quiet_NaN();
     if (missingSet.count(string) != 0 &&
         (maps.count(dimension) == 0 ||
          maps[dimension].first.left.count(string) == 0))
     {
       // This string does not exist yet.
       size_t& numMappings = maps[dimension].second;
+      numMappings++;
 
-      typedef boost::bimap<std::string, size_t>::value_type PairType;
-      maps[dimension].first.insert(PairType(string, numMappings));
-      return numMappings++;
+      typedef boost::bimap<std::string, mapped_type>::value_type PairType;
+      maps[dimension].first.insert(PairType(string, NaN));
+      return NaN;
     }
     else
     {
       // This string already exists in the mapping.
-      return maps[dimension].first.left.at(string);
+      //return maps[dimension].first.left.at(string);
+      return NaN;
     }
   }
  private:
diff --git a/src/mlpack/tests/imputation_test.cpp b/src/mlpack/tests/imputation_test.cpp
index c7d6d4c..6f88ca2 100644
--- a/src/mlpack/tests/imputation_test.cpp
+++ b/src/mlpack/tests/imputation_test.cpp
@@ -33,13 +33,12 @@ BOOST_AUTO_TEST_CASE(DatasetMapperImputerTest)
   fstream f;
   f.open("test_file.csv", fstream::out);
   f << "a, 2, 3"  << endl;
-  f << "5, 6, b"  << endl;
+  f << "5, 6, a"  << endl;
   f << "8, 9, 10" << endl;
   f.close();
 
   arma::mat input;
   arma::mat output;
-  size_t dimension = 0;
 
   std::set<string> mset;
   mset.insert("a");
@@ -48,17 +47,28 @@ BOOST_AUTO_TEST_CASE(DatasetMapperImputerTest)
   DatasetMapper<MissingPolicy> info(miss);
   BOOST_REQUIRE(data::Load("test_file.csv", input, info) == true);
 
+  // row and column test
   BOOST_REQUIRE_EQUAL(input.n_rows, 3);
   BOOST_REQUIRE_EQUAL(input.n_cols, 3);
 
-  /* TODO: Connect Load with the new DatasetMapper instead of DatasetInfo*/
+  // Load check
+  // MissingPolicy should convert strings to nans
+  BOOST_REQUIRE(std::isnan(output(0, 0)));
+  BOOST_REQUIRE_CLOSE(output(0, 1), 5.0, 1e-5);
+  BOOST_REQUIRE_CLOSE(output(0, 2), 8.0, 1e-5);
+  BOOST_REQUIRE_CLOSE(output(1, 0), 2.0, 1e-5);
+  BOOST_REQUIRE_CLOSE(output(1, 1), 6.0, 1e-5);
+  BOOST_REQUIRE_CLOSE(output(1, 2), 9.0, 1e-5);
+  BOOST_REQUIRE_CLOSE(output(2, 0), 3.0, 1e-5);
+  BOOST_REQUIRE(std::isnan(output(2, 1)));
+  BOOST_REQUIRE_CLOSE(output(2, 2), 10.0, 1e-5);
 
   Imputer<double,
           DatasetMapper<MissingPolicy>,
           CustomImputation<double>> imputer(info);
-  imputer.Impute(input, output, "a", 99, dimension); // convert a -> 99
-  imputer.Impute(input, output, "b", 99, dimension); // convert b -> 99
+  imputer.Impute(input, output, "a", 99, 0); // convert a -> 99 for dimension 0
 
+  // Custom imputation result check
   BOOST_REQUIRE_CLOSE(output(0, 0), 99.0, 1e-5);
   BOOST_REQUIRE_CLOSE(output(0, 1), 5.0, 1e-5);
   BOOST_REQUIRE_CLOSE(output(0, 2), 8.0, 1e-5);
@@ -66,7 +76,7 @@ BOOST_AUTO_TEST_CASE(DatasetMapperImputerTest)
   BOOST_REQUIRE_CLOSE(output(1, 1), 6.0, 1e-5);
   BOOST_REQUIRE_CLOSE(output(1, 2), 9.0, 1e-5);
   BOOST_REQUIRE_CLOSE(output(2, 0), 3.0, 1e-5);
-  BOOST_REQUIRE_CLOSE(output(2, 1), 99.0, 1e-5);
+  BOOST_REQUIRE(std::isnan(output(2, 1))); // remains as NaN
   BOOST_REQUIRE_CLOSE(output(2, 2), 10.0, 1e-5);
 
   // Remove the file.




More information about the mlpack-git mailing list