[mlpack-git] master: add input-only overloads to imputation methods (6d43aa3)
gitdub at mlpack.org
gitdub at mlpack.org
Sun Jul 10 19:08:24 EDT 2016
Repository : https://github.com/mlpack/mlpack
On branch : master
Link : https://github.com/mlpack/mlpack/compare/ecbfd24defe31d9f39708c0b4c6ad352cd46ed5c...7eec0609aa21cb12aeed3cbcaa1e411dad0359f2
>---------------------------------------------------------------
commit 6d43aa3b3dcd93fdc1bd3e9918267b59f762f3a1
Author: Keon Kim <kwk236 at gmail.com>
Date: Mon Jul 11 08:08:24 2016 +0900
add input-only overloads to imputation methods
>---------------------------------------------------------------
6d43aa3b3dcd93fdc1bd3e9918267b59f762f3a1
.../data/imputation_methods/custom_imputation.hpp | 43 ++++++++++++-
.../data/imputation_methods/listwise_deletion.hpp | 62 ++++++++++++++----
.../data/imputation_methods/mean_imputation.hpp | 73 ++++++++++++++++++++++
.../data/imputation_methods/median_imputation.hpp | 41 ++++++++++++
src/mlpack/core/data/imputer.hpp | 21 ++++++-
src/mlpack/tests/imputation_test.cpp | 59 ++++++++++++++++-
6 files changed, 285 insertions(+), 14 deletions(-)
diff --git a/src/mlpack/core/data/imputation_methods/custom_imputation.hpp b/src/mlpack/core/data/imputation_methods/custom_imputation.hpp
index a34658b..35326a7 100644
--- a/src/mlpack/core/data/imputation_methods/custom_imputation.hpp
+++ b/src/mlpack/core/data/imputation_methods/custom_imputation.hpp
@@ -35,7 +35,7 @@ class CustomImputation
* @param output Matrix that the result will be saved into.
* @param mappedValue Value that the user wants to get rid of.
* @param dimension Index of the dimension of the mappedValue.
- * @param columnMajor State of whether the input matrix is columnMajord or not.
+ * @param columnMajor State of whether the input matrix is columnMajor or not.
*/
void Impute(const arma::Mat<T>& input,
arma::Mat<T>& output,
@@ -71,6 +71,47 @@ class CustomImputation
}
}
+ /**
+ * Impute function searches through the input looking for mappedValue and
+ * replaces it with the user-defined custom value of the given dimension.
+ * The result is overwritten to the input, not creating any copy. Custom value
+ * must be set when initializing the CustomImputation object.
+ *
+ * @param input Matrix that contains mappedValue.
+ * @param mappedValue Value that the user wants to get rid of.
+ * @param dimension Index of the dimension of the mappedValue.
+ * @param columnMajor State of whether the input matrix is columnMajor or not.
+ */
+ void Impute(arma::Mat<T>& input,
+ const T& mappedValue,
+ const size_t dimension,
+ const bool columnMajor = true)
+ {
+ // Replace every occurrence of the target value with the custom value.
+ if (columnMajor)
+ {
+ for (size_t i = 0; i < input.n_cols; ++i)
+ {
+ if (input(dimension, i) == mappedValue ||
+ std::isnan(input(dimension, i)))
+ {
+ input(dimension, i) = customValue;
+ }
+ }
+ }
+ else
+ {
+ for (size_t i = 0; i < input.n_rows; ++i)
+ {
+ if (input(i, dimension) == mappedValue ||
+ std::isnan(input(i, dimension)))
+ {
+ input(i, dimension) = customValue;
+ }
+ }
+ }
+ }
+
private:
T customValue;
}; // class CustomImputation
diff --git a/src/mlpack/core/data/imputation_methods/listwise_deletion.hpp b/src/mlpack/core/data/imputation_methods/listwise_deletion.hpp
index 9a695a6..0ac84ae 100644
--- a/src/mlpack/core/data/imputation_methods/listwise_deletion.hpp
+++ b/src/mlpack/core/data/imputation_methods/listwise_deletion.hpp
@@ -36,33 +36,73 @@ class ListwiseDeletion
const size_t dimension,
const bool columnMajor = true)
{
- // initiate output
- output = input;
- size_t count = 0;
+ std::vector<arma::uword> colsToKeep;
if (columnMajor)
{
for (size_t i = 0; i < input.n_cols; ++i)
{
- if (input(dimension, i) == mappedValue ||
- std::isnan(input(dimension, i)))
+ if (!(input(dimension, i) == mappedValue ||
+ std::isnan(input(dimension, i))))
{
- output.shed_col(i - count);
- count++;
+ colsToKeep.push_back(i);
}
}
+ output = input.cols(arma::uvec(colsToKeep));
}
else
{
for (size_t i = 0; i < input.n_rows; ++i)
{
- if (input(i, dimension) == mappedValue ||
- std::isnan(input(i, dimension)))
+ if (!(input(i, dimension) == mappedValue ||
+ std::isnan(input(i, dimension))))
{
- output.shed_row(i - count);
- count++;
+ colsToKeep.push_back(i);
}
}
+ output = input.rows(arma::uvec(colsToKeep));
+ }
+ }
+
+ /**
+ * Impute function searches through the input looking for mappedValue and
+ * removes the whole row or column. The result is overwritten to the input.
+ *
+ * @param input Matrix that contains mappedValue.
+ * @param mappedValue Value that the user wants to get rid of.
+ * @param dimension Index of the dimension of the mappedValue.
+ * @param columnMajor State of whether the input matrix is columnMajor or not.
+ */
+ void Impute(arma::Mat<T>& input,
+ const T& mappedValue,
+ const size_t dimension,
+ const bool columnMajor = true)
+ {
+ std::vector<arma::uword> colsToKeep;
+
+ if (columnMajor)
+ {
+ for (size_t i = 0; i < input.n_cols; ++i)
+ {
+ if (!(input(dimension, i) == mappedValue ||
+ std::isnan(input(dimension, i))))
+ {
+ colsToKeep.push_back(i);
+ }
+ }
+ input = input.cols(arma::uvec(colsToKeep));
+ }
+ else
+ {
+ for (size_t i = 0; i < input.n_rows; ++i)
+ {
+ if (!(input(i, dimension) == mappedValue ||
+ std::isnan(input(i, dimension))))
+ {
+ colsToKeep.push_back(i);
+ }
+ }
+ input = input.rows(arma::uvec(colsToKeep));
}
}
}; // class ListwiseDeletion
diff --git a/src/mlpack/core/data/imputation_methods/mean_imputation.hpp b/src/mlpack/core/data/imputation_methods/mean_imputation.hpp
index c4085c6..cfe0de1 100644
--- a/src/mlpack/core/data/imputation_methods/mean_imputation.hpp
+++ b/src/mlpack/core/data/imputation_methods/mean_imputation.hpp
@@ -96,6 +96,79 @@ class MeanImputation
output(target.first, target.second) = mean;
}
}
+
+ /**
+ * Impute function searches through the input looking for mappedValue and
+ * replaces it with the mean of the given dimension. The result is overwritten
+ * to the input matrix.
+ *
+ * @param input Matrix that contains mappedValue.
+ * @param mappedValue Value that the user wants to get rid of.
+ * @param dimension Index of the dimension of the mappedValue.
+ * @param columnMajor State of whether the input matrix is columnMajor or not.
+ */
+ void Impute(arma::Mat<T>& input,
+ const T& mappedValue,
+ const size_t dimension,
+ const bool columnMajor = true)
+ {
+ double sum = 0;
+ size_t elems = 0; // excluding nan or missing target
+
+ using PairType = std::pair<size_t, size_t>;
+ // dimensions and indexes are saved as pairs inside this vector.
+ std::vector<PairType> targets;
+
+
+ // calculate number of elements and sum of them excluding mapped value or
+ // nan. while doing that, remember where mappedValue or NaN exists.
+ if (columnMajor)
+ {
+ for (size_t i = 0; i < input.n_cols; ++i)
+ {
+ if (input(dimension, i) == mappedValue ||
+ std::isnan(input(dimension, i)))
+ {
+ targets.emplace_back(dimension, i);
+ }
+ else
+ {
+ elems++;
+ sum += input(dimension, i);
+ }
+ }
+ }
+ else
+ {
+ for (size_t i = 0; i < input.n_rows; ++i)
+ {
+ if (input(i, dimension) == mappedValue ||
+ std::isnan(input(i, dimension)))
+ {
+ targets.emplace_back(i, dimension);
+ }
+ else
+ {
+ elems++;
+ sum += input(i, dimension);
+ }
+ }
+ }
+
+ if (elems == 0)
+ Log::Fatal << "it is impossible to calculate mean; no valid elements in "
+ << "the dimension" << std::endl;
+
+ // calculate mean;
+ const double mean = sum / elems;
+
+ // Now replace the missing values with the calculated mean.
+ // It only needs to loop through targets vector, not the whole matrix.
+ for (const PairType& target : targets)
+ {
+ input(target.first, target.second) = mean;
+ }
+ }
}; // class MeanImputation
} // namespace data
diff --git a/src/mlpack/core/data/imputation_methods/median_imputation.hpp b/src/mlpack/core/data/imputation_methods/median_imputation.hpp
index 0022366..cf48241 100644
--- a/src/mlpack/core/data/imputation_methods/median_imputation.hpp
+++ b/src/mlpack/core/data/imputation_methods/median_imputation.hpp
@@ -65,6 +65,47 @@ class MedianImputation
}
}
}
+
+ /**
+ * Impute function searches through the input looking for mappedValue and
+ * replaces it with the median of the given dimension. The result is
+ * overwritten to the input matrix.
+ *
+ * @param input Matrix that contains mappedValue.
+ * @param mappedValue Value that the user wants to get rid of.
+ * @param dimension Index of the dimension of the mappedValue.
+ * @param columnMajor State of whether the input matrix is columnMajor or not.
+ */
+ void Impute(arma::Mat<T>& input,
+ const T& mappedValue,
+ const size_t dimension,
+ const bool columnMajor = true)
+ {
+ if (columnMajor)
+ {
+ arma::Mat<T> medianMat = arma::median(input, 1);
+ for (size_t i = 0; i < input.n_cols; ++i)
+ {
+ if (input(dimension, i) == mappedValue ||
+ std::isnan(input(dimension, i)))
+ {
+ input(dimension, i) = medianMat(dimension, 0);
+ }
+ }
+ }
+ else
+ {
+ arma::Mat<T> medianMat = arma::median(input, 0);
+ for (size_t i = 0; i < input.n_rows; ++i)
+ {
+ if (input(i, dimension) == mappedValue ||
+ std::isnan(input(i, dimension)))
+ {
+ input(i, dimension) = medianMat(0, dimension);
+ }
+ }
+ }
+ }
}; // class MedianImputation
} // namespace data
diff --git a/src/mlpack/core/data/imputer.hpp b/src/mlpack/core/data/imputer.hpp
index a30508b..4787343 100644
--- a/src/mlpack/core/data/imputer.hpp
+++ b/src/mlpack/core/data/imputer.hpp
@@ -45,7 +45,8 @@ class Imputer
/**
* Given an input dataset, replace missing values with given imputation
- * strategy.
+ * strategy. This overload saves the result into the output matrix and does not
+ * change the input matrix.
*
* @param input Input dataset to apply imputation.
* @param output Armadillo matrix to save the results
@@ -61,6 +62,24 @@ class Imputer
strategy.Impute(input, output, mappedValue, dimension, columnMajor);
}
+ /**
+ * Given an input dataset, replace missing values with given imputation
+ * strategy. This overload does not produce an output matrix; it overwrites
+ * the input matrix with the result.
+ *
+ * @param input Input dataset to apply imputation.
+ * @param missingValue User defined missing value; it can be anything.
+ * @param dimension Dimension to apply the imputation.
+ */
+ void Impute(arma::Mat<T>& input,
+ const std::string& missingValue,
+ const size_t dimension)
+ {
+ T mappedValue = static_cast<T>(mapper.UnmapValue(missingValue, dimension));
+ strategy.Impute(input, mappedValue, dimension, columnMajor);
+ }
+
+
//! Get the strategy
const StrategyType& Strategy() const { return strategy; }
diff --git a/src/mlpack/tests/imputation_test.cpp b/src/mlpack/tests/imputation_test.cpp
index 08ef4e1..9b19262 100644
--- a/src/mlpack/tests/imputation_test.cpp
+++ b/src/mlpack/tests/imputation_test.cpp
@@ -129,6 +129,22 @@ BOOST_AUTO_TEST_CASE(CustomImputationTest)
BOOST_REQUIRE_CLOSE(output(2, 1), 8.0, 1e-5);
BOOST_REQUIRE_CLOSE(output(2, 2), 4.0, 1e-5);
BOOST_REQUIRE_CLOSE(output(2, 3), 8.0, 1e-5);
+
+ // overwrite to the input
+ imputer.Impute(input, mappedValue, 0/*dimension*/, true);
+
+ BOOST_REQUIRE_CLOSE(input(0, 0), 3.0, 1e-5);
+ BOOST_REQUIRE_CLOSE(input(0, 1), 99.0, 1e-5);
+ BOOST_REQUIRE_CLOSE(input(0, 2), 2.0, 1e-5);
+ BOOST_REQUIRE_CLOSE(input(0, 3), 99.0, 1e-5);
+ BOOST_REQUIRE_CLOSE(input(1, 0), 5.0, 1e-5);
+ BOOST_REQUIRE_CLOSE(input(1, 1), 6.0, 1e-5);
+ BOOST_REQUIRE_CLOSE(input(1, 2), 0.0, 1e-5);
+ BOOST_REQUIRE_CLOSE(input(1, 3), 6.0, 1e-5);
+ BOOST_REQUIRE_CLOSE(input(2, 0), 9.0, 1e-5);
+ BOOST_REQUIRE_CLOSE(input(2, 1), 8.0, 1e-5);
+ BOOST_REQUIRE_CLOSE(input(2, 2), 4.0, 1e-5);
+ BOOST_REQUIRE_CLOSE(input(2, 3), 8.0, 1e-5);
}
/**
@@ -176,6 +192,22 @@ BOOST_AUTO_TEST_CASE(MeanImputationTest)
BOOST_REQUIRE_CLOSE(output(2, 1), 8.0, 1e-5);
BOOST_REQUIRE_CLOSE(output(2, 2), 4.0, 1e-5);
BOOST_REQUIRE_CLOSE(output(2, 3), 8.0, 1e-5);
+
+ // overwrite to the input
+ imputer.Impute(input, mappedValue, 0/*dimension*/, true);
+
+ BOOST_REQUIRE_CLOSE(input(0, 0), 3.0, 1e-5);
+ BOOST_REQUIRE_CLOSE(input(0, 1), 2.5, 1e-5);
+ BOOST_REQUIRE_CLOSE(input(0, 2), 2.0, 1e-5);
+ BOOST_REQUIRE_CLOSE(input(0, 3), 2.5, 1e-5);
+ BOOST_REQUIRE_CLOSE(input(1, 0), 5.0, 1e-5);
+ BOOST_REQUIRE_CLOSE(input(1, 1), 6.0, 1e-5);
+ BOOST_REQUIRE_CLOSE(input(1, 2), 0.0, 1e-5);
+ BOOST_REQUIRE_CLOSE(input(1, 3), 6.0, 1e-5);
+ BOOST_REQUIRE_CLOSE(input(2, 0), 9.0, 1e-5);
+ BOOST_REQUIRE_CLOSE(input(2, 1), 8.0, 1e-5);
+ BOOST_REQUIRE_CLOSE(input(2, 2), 4.0, 1e-5);
+ BOOST_REQUIRE_CLOSE(input(2, 3), 8.0, 1e-5);
}
/**
@@ -222,7 +254,22 @@ BOOST_AUTO_TEST_CASE(MedianImputationTest)
BOOST_REQUIRE_CLOSE(output(2, 0), 9.0, 1e-5);
BOOST_REQUIRE_CLOSE(output(2, 1), 8.0, 1e-5);
BOOST_REQUIRE_CLOSE(output(2, 2), 4.0, 1e-5);
- BOOST_REQUIRE_CLOSE(output(2, 3), 8.0, 1e-5);
+
+ // overwrite to the input
+ imputer.Impute(input, mappedValue, 1/*dimension*/, true);
+
+ BOOST_REQUIRE_CLOSE(input(0, 0), 3.0, 1e-5);
+ BOOST_REQUIRE_CLOSE(input(0, 1), 0.0, 1e-5);
+ BOOST_REQUIRE_CLOSE(input(0, 2), 2.0, 1e-5);
+ BOOST_REQUIRE_CLOSE(input(0, 3), 0.0, 1e-5);
+ BOOST_REQUIRE_CLOSE(input(1, 0), 5.0, 1e-5);
+ BOOST_REQUIRE_CLOSE(input(1, 1), 6.0, 1e-5);
+ BOOST_REQUIRE_CLOSE(input(1, 2), 5.5, 1e-5);
+ BOOST_REQUIRE_CLOSE(input(1, 3), 6.0, 1e-5);
+ BOOST_REQUIRE_CLOSE(input(2, 0), 9.0, 1e-5);
+ BOOST_REQUIRE_CLOSE(input(2, 1), 8.0, 1e-5);
+ BOOST_REQUIRE_CLOSE(input(2, 2), 4.0, 1e-5);
+ BOOST_REQUIRE_CLOSE(input(2, 3), 8.0, 1e-5);
}
/**
@@ -260,6 +307,16 @@ BOOST_AUTO_TEST_CASE(ListwiseDeletionTest)
BOOST_REQUIRE_CLOSE(output(1, 1), 8.0, 1e-5);
BOOST_REQUIRE_CLOSE(output(1, 2), 4.0, 1e-5);
BOOST_REQUIRE_CLOSE(output(1, 3), 8.0, 1e-5);
+
+ // overwrite to the input
+ imputer.Impute(input, mappedValue, 0, true); // transposed
+
+ BOOST_REQUIRE_CLOSE(input(0, 0), 3.0, 1e-5);
+ BOOST_REQUIRE_CLOSE(input(0, 1), 2.0, 1e-5);
+ BOOST_REQUIRE_CLOSE(input(1, 0), 5.0, 1e-5);
+ BOOST_REQUIRE_CLOSE(input(1, 1), 0.0, 1e-5);
+ BOOST_REQUIRE_CLOSE(input(2, 0), 9.0, 1e-5);
+ BOOST_REQUIRE_CLOSE(input(2, 1), 4.0, 1e-5);
}
More information about the mlpack-git
mailing list