[mlpack-git] master: update median imputation to exclude missing values (fedc5e0)

gitdub at mlpack.org gitdub at mlpack.org
Sun Jul 10 23:10:32 EDT 2016


Repository : https://github.com/mlpack/mlpack
On branch  : master
Link       : https://github.com/mlpack/mlpack/compare/ecbfd24defe31d9f39708c0b4c6ad352cd46ed5c...7eec0609aa21cb12aeed3cbcaa1e411dad0359f2

>---------------------------------------------------------------

commit fedc5e0ece901746ad15b9c13244713bb36d4f9e
Author: Keon Kim <kwk236 at gmail.com>
Date:   Mon Jul 11 12:10:32 2016 +0900

    update median imputation to exclude missing values


>---------------------------------------------------------------

fedc5e0ece901746ad15b9c13244713bb36d4f9e
 .../data/imputation_methods/mean_imputation.hpp    |  1 -
 .../data/imputation_methods/median_imputation.hpp  | 56 ++++++++++++++++++----
 .../methods/preprocess/preprocess_imputer_main.cpp |  2 -
 src/mlpack/tests/imputation_test.cpp               |  6 +--
 4 files changed, 51 insertions(+), 14 deletions(-)

diff --git a/src/mlpack/core/data/imputation_methods/mean_imputation.hpp b/src/mlpack/core/data/imputation_methods/mean_imputation.hpp
index cfe0de1..6c6a7e4 100644
--- a/src/mlpack/core/data/imputation_methods/mean_imputation.hpp
+++ b/src/mlpack/core/data/imputation_methods/mean_imputation.hpp
@@ -46,7 +46,6 @@ class MeanImputation
     // dimensions and indexes are saved as pairs inside this vector.
     std::vector<PairType> targets;
 
-
     // calculate number of elements and sum of them excluding mapped value or
     // nan. while doing that, remember where mappedValue or NaN exists.
     if (columnMajor)
diff --git a/src/mlpack/core/data/imputation_methods/median_imputation.hpp b/src/mlpack/core/data/imputation_methods/median_imputation.hpp
index cf48241..5c03bc2 100644
--- a/src/mlpack/core/data/imputation_methods/median_imputation.hpp
+++ b/src/mlpack/core/data/imputation_methods/median_imputation.hpp
@@ -40,30 +40,50 @@ class MedianImputation
     //initiate output
     output = input;
 
+    using PairType = std::pair<size_t, size_t>;
+    // dimensions and indexes are saved as pairs inside this vector.
+    std::vector<PairType> targets;
+    // good elements are kept inside this vector.
+    std::vector<double> elemsToKeep;
+
     if (columnMajor)
     {
-      arma::Mat<T> medianMat = arma::median(input, 1);
       for (size_t i = 0; i < input.n_cols; ++i)
       {
         if (input(dimension, i) == mappedValue ||
             std::isnan(input(dimension, i)))
         {
-          output(dimension, i) = medianMat(dimension, 0);
+          targets.emplace_back(dimension, i);
+        }
+        else
+        {
+          elemsToKeep.push_back(input(dimension, i));
         }
       }
     }
     else
     {
-      arma::Mat<T> medianMat = arma::median(input, 0);
       for (size_t i = 0; i < input.n_rows; ++i)
       {
         if (input(i, dimension) == mappedValue ||
             std::isnan(input(i, dimension)))
         {
-          output(i, dimension) = medianMat(0, dimension);
+          targets.emplace_back(i, dimension);
+        }
+        else
+        {
+           elemsToKeep.push_back(input(i, dimension));
         }
       }
     }
+
+    // calculate median
+    const double median = arma::median(arma::vec(elemsToKeep));
+
+    for (const PairType& target : targets)
+    {
+       output(target.first, target.second) = median;
+    }
   }
 
   /**
@@ -81,30 +101,50 @@ class MedianImputation
               const size_t dimension,
               const bool columnMajor = true)
   {
+    using PairType = std::pair<size_t, size_t>;
+    // dimensions and indexes are saved as pairs inside this vector.
+    std::vector<PairType> targets;
+    // good elements are kept inside this vector.
+    std::vector<double> elemsToKeep;
+
     if (columnMajor)
     {
-      arma::Mat<T> medianMat = arma::median(input, 1);
       for (size_t i = 0; i < input.n_cols; ++i)
       {
         if (input(dimension, i) == mappedValue ||
             std::isnan(input(dimension, i)))
         {
-          input(dimension, i) = medianMat(dimension, 0);
+          targets.emplace_back(dimension, i);
+        }
+        else
+        {
+          elemsToKeep.push_back(input(dimension, i));
         }
       }
     }
     else
     {
-      arma::Mat<T> medianMat = arma::median(input, 0);
       for (size_t i = 0; i < input.n_rows; ++i)
       {
         if (input(i, dimension) == mappedValue ||
             std::isnan(input(i, dimension)))
         {
-          input(i, dimension) = medianMat(0, dimension);
+          targets.emplace_back(i, dimension);
+        }
+        else
+        {
+           elemsToKeep.push_back(input(i, dimension));
         }
       }
     }
+
+    // calculate median
+    const double median = arma::median(arma::vec(elemsToKeep));
+
+    for (const PairType& target : targets)
+    {
+       input(target.first, target.second) = median;
+    }
   }
 }; // class MedianImputation
 
diff --git a/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp b/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp
index e367b6a..bacc040 100644
--- a/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp
+++ b/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp
@@ -109,8 +109,6 @@ int main(int argc, char** argv)
         << endl;
   }
 
-  Log::Info << input << endl;
-
   // Initialize imputer class
   Imputer<double, MapperType, MeanImputation<double>> imputer(info);
   if (strategy == "mean")
diff --git a/src/mlpack/tests/imputation_test.cpp b/src/mlpack/tests/imputation_test.cpp
index 9b19262..9d79bd9 100644
--- a/src/mlpack/tests/imputation_test.cpp
+++ b/src/mlpack/tests/imputation_test.cpp
@@ -233,7 +233,7 @@ BOOST_AUTO_TEST_CASE(MedianImputationTest)
   BOOST_REQUIRE_CLOSE(outputT(0, 3), 0.0, 1e-5);
   BOOST_REQUIRE_CLOSE(outputT(1, 0), 5.0, 1e-5);
   BOOST_REQUIRE_CLOSE(outputT(1, 1), 6.0, 1e-5);
-  BOOST_REQUIRE_CLOSE(outputT(1, 2), 5.5, 1e-5);
+  BOOST_REQUIRE_CLOSE(outputT(1, 2), 6.0, 1e-5);
   BOOST_REQUIRE_CLOSE(outputT(1, 3), 6.0, 1e-5);
   BOOST_REQUIRE_CLOSE(outputT(2, 0), 9.0, 1e-5);
   BOOST_REQUIRE_CLOSE(outputT(2, 1), 8.0, 1e-5);
@@ -244,7 +244,7 @@ BOOST_AUTO_TEST_CASE(MedianImputationTest)
   imputer.Impute(input, output, mappedValue, 1, false);
 
   BOOST_REQUIRE_CLOSE(output(0, 0), 3.0, 1e-5);
-  BOOST_REQUIRE_CLOSE(output(0, 1), 6.0, 1e-5);
+  BOOST_REQUIRE_CLOSE(output(0, 1), 7.0, 1e-5);
   BOOST_REQUIRE_CLOSE(output(0, 2), 2.0, 1e-5);
   BOOST_REQUIRE_CLOSE(output(0, 3), 0.0, 1e-5);
   BOOST_REQUIRE_CLOSE(output(1, 0), 5.0, 1e-5);
@@ -264,7 +264,7 @@ BOOST_AUTO_TEST_CASE(MedianImputationTest)
   BOOST_REQUIRE_CLOSE(input(0, 3), 0.0, 1e-5);
   BOOST_REQUIRE_CLOSE(input(1, 0), 5.0, 1e-5);
   BOOST_REQUIRE_CLOSE(input(1, 1), 6.0, 1e-5);
-  BOOST_REQUIRE_CLOSE(input(1, 2), 5.5, 1e-5);
+  BOOST_REQUIRE_CLOSE(input(1, 2), 6.0, 1e-5);
   BOOST_REQUIRE_CLOSE(input(1, 3), 6.0, 1e-5);
   BOOST_REQUIRE_CLOSE(input(2, 0), 9.0, 1e-5);
   BOOST_REQUIRE_CLOSE(input(2, 1), 8.0, 1e-5);




More information about the mlpack-git mailing list