[mlpack-git] master: add input-only overloads to imputation methods (6d43aa3)

gitdub at mlpack.org gitdub at mlpack.org
Sun Jul 10 19:08:24 EDT 2016


Repository : https://github.com/mlpack/mlpack
On branch  : master
Link       : https://github.com/mlpack/mlpack/compare/ecbfd24defe31d9f39708c0b4c6ad352cd46ed5c...7eec0609aa21cb12aeed3cbcaa1e411dad0359f2

>---------------------------------------------------------------

commit 6d43aa3b3dcd93fdc1bd3e9918267b59f762f3a1
Author: Keon Kim <kwk236 at gmail.com>
Date:   Mon Jul 11 08:08:24 2016 +0900

    add input-only overloads to imputation methods


>---------------------------------------------------------------

6d43aa3b3dcd93fdc1bd3e9918267b59f762f3a1
 .../data/imputation_methods/custom_imputation.hpp  | 43 ++++++++++++-
 .../data/imputation_methods/listwise_deletion.hpp  | 62 ++++++++++++++----
 .../data/imputation_methods/mean_imputation.hpp    | 73 ++++++++++++++++++++++
 .../data/imputation_methods/median_imputation.hpp  | 41 ++++++++++++
 src/mlpack/core/data/imputer.hpp                   | 21 ++++++-
 src/mlpack/tests/imputation_test.cpp               | 59 ++++++++++++++++-
 6 files changed, 285 insertions(+), 14 deletions(-)

diff --git a/src/mlpack/core/data/imputation_methods/custom_imputation.hpp b/src/mlpack/core/data/imputation_methods/custom_imputation.hpp
index a34658b..35326a7 100644
--- a/src/mlpack/core/data/imputation_methods/custom_imputation.hpp
+++ b/src/mlpack/core/data/imputation_methods/custom_imputation.hpp
@@ -35,7 +35,7 @@ class CustomImputation
    * @param output Matrix that the result will be saved into.
    * @param mappedValue Value that the user wants to get rid of.
    * @param dimension Index of the dimension of the mappedValue.
-   * @param columnMajor State of whether the input matrix is columnMajord or not.
+   * @param columnMajor State of whether the input matrix is columnMajor or not.
    */
   void Impute(const arma::Mat<T>& input,
               arma::Mat<T>& output,
@@ -71,6 +71,47 @@ class CustomImputation
     }
   }
 
+  /**
+   * In-place overload: scans the given dimension of the input and overwrites
+   * every entry that equals mappedValue or is NaN with the custom value.
+   * The input itself is modified and no copy is created. The custom value
+   * must be set when initializing the CustomImputation object.
+   *
+   * @param input Matrix that contains mappedValue; modified in place.
+   * @param mappedValue Value that the user wants to get rid of.
+   * @param dimension Row index if columnMajor is true, column index otherwise.
+   * @param columnMajor State of whether the input matrix is columnMajor or not.
+   */
+  void Impute(arma::Mat<T>& input,
+              const T& mappedValue,
+              const size_t dimension,
+              const bool columnMajor = true)
+  {
+    // Overwrite each entry equal to mappedValue (or NaN) with customValue.
+    if (columnMajor)
+    {
+      for (size_t i = 0; i < input.n_cols; ++i)
+      {
+        if (input(dimension, i) == mappedValue ||
+            std::isnan(input(dimension, i)))
+        {
+          input(dimension, i) = customValue;
+        }
+      }
+    }
+    else
+    {
+      for (size_t i = 0; i < input.n_rows; ++i)
+      {
+        if (input(i, dimension) == mappedValue ||
+            std::isnan(input(i, dimension)))
+        {
+          input(i, dimension) = customValue;
+        }
+      }
+    }
+  }
+
  private:
   T customValue;
 }; // class CustomImputation
diff --git a/src/mlpack/core/data/imputation_methods/listwise_deletion.hpp b/src/mlpack/core/data/imputation_methods/listwise_deletion.hpp
index 9a695a6..0ac84ae 100644
--- a/src/mlpack/core/data/imputation_methods/listwise_deletion.hpp
+++ b/src/mlpack/core/data/imputation_methods/listwise_deletion.hpp
@@ -36,33 +36,73 @@ class ListwiseDeletion
               const size_t dimension,
               const bool columnMajor = true)
   {
-    // initiate output
-    output = input;
-    size_t count = 0;
+    std::vector<arma::uword> colsToKeep;
 
     if (columnMajor)
     {
       for (size_t i = 0; i < input.n_cols; ++i)
       {
-         if (input(dimension, i) == mappedValue ||
-             std::isnan(input(dimension, i)))
+         if (!(input(dimension, i) == mappedValue ||
+             std::isnan(input(dimension, i))))
          {
-           output.shed_col(i - count);
-           count++;
+           colsToKeep.push_back(i);
          }
       }
+      output = input.cols(arma::uvec(colsToKeep));
     }
     else
     {
       for (size_t i = 0; i < input.n_rows; ++i)
       {
-        if (input(i, dimension) == mappedValue ||
-             std::isnan(input(i, dimension)))
+        if (!(input(i, dimension) == mappedValue ||
+             std::isnan(input(i, dimension))))
         {
-           output.shed_row(i - count);
-           count++;
+           colsToKeep.push_back(i);
         }
       }
+      output = input.rows(arma::uvec(colsToKeep));
+    }
+  }
+
+  /**
+   * In-place overload: removes every column (or row, if not columnMajor) whose
+   * entry in the given dimension is mappedValue or NaN; input may shrink.
+   *
+   * @param input Matrix that contains mappedValue.
+   * @param mappedValue Value that the user wants to get rid of.
+   * @param dimension Index of the dimension of the mappedValue.
+   * @param columnMajor State of whether the input matrix is columnMajor or not.
+   */
+  void Impute(arma::Mat<T>& input,
+              const T& mappedValue,
+              const size_t dimension,
+              const bool columnMajor = true)
+  {
+    std::vector<arma::uword> colsToKeep;
+
+    if (columnMajor)
+    {
+      for (size_t i = 0; i < input.n_cols; ++i)
+      {
+         if (!(input(dimension, i) == mappedValue ||
+             std::isnan(input(dimension, i))))
+         {
+           colsToKeep.push_back(i);
+         }
+      }
+      input = input.cols(arma::uvec(colsToKeep));
+    }
+    else
+    {
+      for (size_t i = 0; i < input.n_rows; ++i)
+      {
+        if (!(input(i, dimension) == mappedValue ||
+             std::isnan(input(i, dimension))))
+        {
+           colsToKeep.push_back(i);
+        }
+      }
+      input = input.rows(arma::uvec(colsToKeep));
     }
   }
 }; // class ListwiseDeletion
diff --git a/src/mlpack/core/data/imputation_methods/mean_imputation.hpp b/src/mlpack/core/data/imputation_methods/mean_imputation.hpp
index c4085c6..cfe0de1 100644
--- a/src/mlpack/core/data/imputation_methods/mean_imputation.hpp
+++ b/src/mlpack/core/data/imputation_methods/mean_imputation.hpp
@@ -96,6 +96,79 @@ class MeanImputation
       output(target.first, target.second) = mean;
     }
   }
+
+  /**
+   * In-place overload: computes the mean of the entries in the given dimension
+   * that are neither mappedValue nor NaN, then overwrites the excluded entries
+   * with that mean. Writes to Log::Fatal when no valid entry exists.
+   *
+   * @param input Matrix that contains mappedValue; modified in place.
+   * @param mappedValue Value that the user wants to get rid of.
+   * @param dimension Index of the dimension of the mappedValue.
+   * @param columnMajor State of whether the input matrix is columnMajor or not.
+   */
+  void Impute(arma::Mat<T>& input,
+              const T& mappedValue,
+              const size_t dimension,
+              const bool columnMajor = true)
+  {
+    double sum = 0;
+    size_t elems = 0; // Number of valid (non-mappedValue, non-NaN) entries.
+
+    using PairType = std::pair<size_t, size_t>;
+    // (row, column) positions of the entries that must be overwritten.
+    std::vector<PairType> targets;
+
+
+    // Single pass: accumulate the sum and count of the valid entries, and
+    // record the position of every entry that equals mappedValue or is NaN.
+    if (columnMajor)
+    {
+      for (size_t i = 0; i < input.n_cols; ++i)
+      {
+        if (input(dimension, i) == mappedValue ||
+            std::isnan(input(dimension, i)))
+        {
+          targets.emplace_back(dimension, i);
+        }
+        else
+        {
+          elems++;
+          sum += input(dimension, i);
+        }
+      }
+    }
+    else
+    {
+      for (size_t i = 0; i < input.n_rows; ++i)
+      {
+        if (input(i, dimension) == mappedValue ||
+            std::isnan(input(i, dimension)))
+        {
+          targets.emplace_back(i, dimension);
+        }
+        else
+        {
+          elems++;
+          sum += input(i, dimension);
+        }
+      }
+    }
+
+    if (elems == 0)
+      Log::Fatal << "it is impossible to calculate mean; no valid elements in "
+          << "the dimension" << std::endl;
+
+    // Mean over the valid entries only.
+    const double mean = sum / elems;
+
+    // Overwrite only the recorded positions instead of rescanning the matrix.
+    // NOTE(review): mean is a double stored into T; confirm T is floating-point.
+    for (const PairType& target : targets)
+    {
+      input(target.first, target.second) = mean;
+    }
+  }
 }; // class MeanImputation
 
 } // namespace data
diff --git a/src/mlpack/core/data/imputation_methods/median_imputation.hpp b/src/mlpack/core/data/imputation_methods/median_imputation.hpp
index 0022366..cf48241 100644
--- a/src/mlpack/core/data/imputation_methods/median_imputation.hpp
+++ b/src/mlpack/core/data/imputation_methods/median_imputation.hpp
@@ -65,6 +65,47 @@ class MedianImputation
       }
     }
   }
+
+  /**
+   * In-place overload: overwrites every entry of the given dimension that
+   * equals mappedValue or is NaN with the median of that dimension. The median
+   * is taken from the unmodified input, so mappedValue entries take part in it.
+   *
+   * @param input Matrix that contains mappedValue; modified in place.
+   * @param mappedValue Value that the user wants to get rid of.
+   * @param dimension Index of the dimension of the mappedValue.
+   * @param columnMajor State of whether the input matrix is columnMajor or not.
+   */
+  void Impute(arma::Mat<T>& input,
+              const T& mappedValue,
+              const size_t dimension,
+              const bool columnMajor = true)
+  {
+    if (columnMajor)
+    {
+      arma::Mat<T> medianMat = arma::median(input, 1);
+      for (size_t i = 0; i < input.n_cols; ++i)
+      {
+        if (input(dimension, i) == mappedValue ||
+            std::isnan(input(dimension, i)))
+        {
+          input(dimension, i) = medianMat(dimension, 0);
+        }
+      }
+    }
+    else
+    {
+      arma::Mat<T> medianMat = arma::median(input, 0);
+      for (size_t i = 0; i < input.n_rows; ++i)
+      {
+        if (input(i, dimension) == mappedValue ||
+            std::isnan(input(i, dimension)))
+        {
+          input(i, dimension) = medianMat(0, dimension);
+        }
+      }
+    }
+  }
 }; // class MedianImputation
 
 } // namespace data
diff --git a/src/mlpack/core/data/imputer.hpp b/src/mlpack/core/data/imputer.hpp
index a30508b..4787343 100644
--- a/src/mlpack/core/data/imputer.hpp
+++ b/src/mlpack/core/data/imputer.hpp
@@ -45,7 +45,8 @@ class Imputer
 
   /**
   * Given an input dataset, replace missing values with given imputation
-  * strategy.
+  * strategy. This overload saves the result into the output matrix and does not
+  * change the input matrix.
   *
   * @param input Input dataset to apply imputation.
   * @param output Armadillo matrix to save the results
@@ -61,6 +62,24 @@ class Imputer
     strategy.Impute(input, output, mappedValue, dimension, columnMajor);
   }
 
+  /**
+  * Given an input dataset, replace missing values with given imputation
+  * strategy. This overload does not produce an output matrix: missingValue is
+  * unmapped via the mapper and the strategy is applied to the input itself.
+  *
+  * @param input Input dataset to apply imputation; overwritten with the result.
+  * @param missingValue User defined missing value; it can be anything.
+  * @param dimension Dimension to apply the imputation.
+  */
+  void Impute(arma::Mat<T>& input,
+              const std::string& missingValue,
+              const size_t dimension)
+  {
+    T mappedValue = static_cast<T>(mapper.UnmapValue(missingValue, dimension));
+    strategy.Impute(input, mappedValue, dimension, columnMajor);
+  }
+
+
   //! Get the strategy
   const StrategyType& Strategy() const { return strategy; }
 
diff --git a/src/mlpack/tests/imputation_test.cpp b/src/mlpack/tests/imputation_test.cpp
index 08ef4e1..9b19262 100644
--- a/src/mlpack/tests/imputation_test.cpp
+++ b/src/mlpack/tests/imputation_test.cpp
@@ -129,6 +129,22 @@ BOOST_AUTO_TEST_CASE(CustomImputationTest)
   BOOST_REQUIRE_CLOSE(output(2, 1), 8.0, 1e-5);
   BOOST_REQUIRE_CLOSE(output(2, 2), 4.0, 1e-5);
   BOOST_REQUIRE_CLOSE(output(2, 3), 8.0, 1e-5);
+
+  // overwrite to the input
+  imputer.Impute(input, mappedValue, 0/*dimension*/, true);
+
+  BOOST_REQUIRE_CLOSE(input(0, 0), 3.0, 1e-5);
+  BOOST_REQUIRE_CLOSE(input(0, 1), 99.0, 1e-5);
+  BOOST_REQUIRE_CLOSE(input(0, 2), 2.0, 1e-5);
+  BOOST_REQUIRE_CLOSE(input(0, 3), 99.0, 1e-5);
+  BOOST_REQUIRE_CLOSE(input(1, 0), 5.0, 1e-5);
+  BOOST_REQUIRE_CLOSE(input(1, 1), 6.0, 1e-5);
+  BOOST_REQUIRE_CLOSE(input(1, 2), 0.0, 1e-5);
+  BOOST_REQUIRE_CLOSE(input(1, 3), 6.0, 1e-5);
+  BOOST_REQUIRE_CLOSE(input(2, 0), 9.0, 1e-5);
+  BOOST_REQUIRE_CLOSE(input(2, 1), 8.0, 1e-5);
+  BOOST_REQUIRE_CLOSE(input(2, 2), 4.0, 1e-5);
+  BOOST_REQUIRE_CLOSE(input(2, 3), 8.0, 1e-5);
 }
 
 /**
@@ -176,6 +192,22 @@ BOOST_AUTO_TEST_CASE(MeanImputationTest)
   BOOST_REQUIRE_CLOSE(output(2, 1), 8.0, 1e-5);
   BOOST_REQUIRE_CLOSE(output(2, 2), 4.0, 1e-5);
   BOOST_REQUIRE_CLOSE(output(2, 3), 8.0, 1e-5);
+
+  // overwrite to the input
+  imputer.Impute(input, mappedValue, 0/*dimension*/, true);
+
+  BOOST_REQUIRE_CLOSE(input(0, 0), 3.0, 1e-5);
+  BOOST_REQUIRE_CLOSE(input(0, 1), 2.5, 1e-5);
+  BOOST_REQUIRE_CLOSE(input(0, 2), 2.0, 1e-5);
+  BOOST_REQUIRE_CLOSE(input(0, 3), 2.5, 1e-5);
+  BOOST_REQUIRE_CLOSE(input(1, 0), 5.0, 1e-5);
+  BOOST_REQUIRE_CLOSE(input(1, 1), 6.0, 1e-5);
+  BOOST_REQUIRE_CLOSE(input(1, 2), 0.0, 1e-5);
+  BOOST_REQUIRE_CLOSE(input(1, 3), 6.0, 1e-5);
+  BOOST_REQUIRE_CLOSE(input(2, 0), 9.0, 1e-5);
+  BOOST_REQUIRE_CLOSE(input(2, 1), 8.0, 1e-5);
+  BOOST_REQUIRE_CLOSE(input(2, 2), 4.0, 1e-5);
+  BOOST_REQUIRE_CLOSE(input(2, 3), 8.0, 1e-5);
 }
 
 /**
@@ -222,7 +254,22 @@ BOOST_AUTO_TEST_CASE(MedianImputationTest)
   BOOST_REQUIRE_CLOSE(output(2, 0), 9.0, 1e-5);
   BOOST_REQUIRE_CLOSE(output(2, 1), 8.0, 1e-5);
   BOOST_REQUIRE_CLOSE(output(2, 2), 4.0, 1e-5);
-  BOOST_REQUIRE_CLOSE(output(2, 3), 8.0, 1e-5);
+
+  // overwrite to the input
+  imputer.Impute(input, mappedValue, 1/*dimension*/, true);
+
+  BOOST_REQUIRE_CLOSE(input(0, 0), 3.0, 1e-5);
+  BOOST_REQUIRE_CLOSE(input(0, 1), 0.0, 1e-5);
+  BOOST_REQUIRE_CLOSE(input(0, 2), 2.0, 1e-5);
+  BOOST_REQUIRE_CLOSE(input(0, 3), 0.0, 1e-5);
+  BOOST_REQUIRE_CLOSE(input(1, 0), 5.0, 1e-5);
+  BOOST_REQUIRE_CLOSE(input(1, 1), 6.0, 1e-5);
+  BOOST_REQUIRE_CLOSE(input(1, 2), 5.5, 1e-5);
+  BOOST_REQUIRE_CLOSE(input(1, 3), 6.0, 1e-5);
+  BOOST_REQUIRE_CLOSE(input(2, 0), 9.0, 1e-5);
+  BOOST_REQUIRE_CLOSE(input(2, 1), 8.0, 1e-5);
+  BOOST_REQUIRE_CLOSE(input(2, 2), 4.0, 1e-5);
+  BOOST_REQUIRE_CLOSE(input(2, 3), 8.0, 1e-5);
 }
 
 /**
@@ -260,6 +307,16 @@ BOOST_AUTO_TEST_CASE(ListwiseDeletionTest)
   BOOST_REQUIRE_CLOSE(output(1, 1), 8.0, 1e-5);
   BOOST_REQUIRE_CLOSE(output(1, 2), 4.0, 1e-5);
   BOOST_REQUIRE_CLOSE(output(1, 3), 8.0, 1e-5);
+
+  // overwrite to the input
+  imputer.Impute(input, mappedValue, 0, true); // transposed
+
+  BOOST_REQUIRE_CLOSE(input(0, 0), 3.0, 1e-5);
+  BOOST_REQUIRE_CLOSE(input(0, 1), 2.0, 1e-5);
+  BOOST_REQUIRE_CLOSE(input(1, 0), 5.0, 1e-5);
+  BOOST_REQUIRE_CLOSE(input(1, 1), 0.0, 1e-5);
+  BOOST_REQUIRE_CLOSE(input(2, 0), 9.0, 1e-5);
+  BOOST_REQUIRE_CLOSE(input(2, 1), 4.0, 1e-5);
 }
 
 




More information about the mlpack-git mailing list