[mlpack-git] master: update median imputation to exclude missing values (fedc5e0)
gitdub at mlpack.org
gitdub at mlpack.org
Sun Jul 10 23:10:32 EDT 2016
Repository : https://github.com/mlpack/mlpack
On branch : master
Link : https://github.com/mlpack/mlpack/compare/ecbfd24defe31d9f39708c0b4c6ad352cd46ed5c...7eec0609aa21cb12aeed3cbcaa1e411dad0359f2
>---------------------------------------------------------------
commit fedc5e0ece901746ad15b9c13244713bb36d4f9e
Author: Keon Kim <kwk236 at gmail.com>
Date: Mon Jul 11 12:10:32 2016 +0900
update median imputation to exclude missing values
>---------------------------------------------------------------
fedc5e0ece901746ad15b9c13244713bb36d4f9e
.../data/imputation_methods/mean_imputation.hpp | 1 -
.../data/imputation_methods/median_imputation.hpp | 56 ++++++++++++++++++----
.../methods/preprocess/preprocess_imputer_main.cpp | 2 -
src/mlpack/tests/imputation_test.cpp | 6 +--
4 files changed, 51 insertions(+), 14 deletions(-)
diff --git a/src/mlpack/core/data/imputation_methods/mean_imputation.hpp b/src/mlpack/core/data/imputation_methods/mean_imputation.hpp
index cfe0de1..6c6a7e4 100644
--- a/src/mlpack/core/data/imputation_methods/mean_imputation.hpp
+++ b/src/mlpack/core/data/imputation_methods/mean_imputation.hpp
@@ -46,7 +46,6 @@ class MeanImputation
// dimensions and indexes are saved as pairs inside this vector.
std::vector<PairType> targets;
-
// calculate number of elements and sum of them excluding mapped value or
// nan. while doing that, remember where mappedValue or NaN exists.
if (columnMajor)
diff --git a/src/mlpack/core/data/imputation_methods/median_imputation.hpp b/src/mlpack/core/data/imputation_methods/median_imputation.hpp
index cf48241..5c03bc2 100644
--- a/src/mlpack/core/data/imputation_methods/median_imputation.hpp
+++ b/src/mlpack/core/data/imputation_methods/median_imputation.hpp
@@ -40,30 +40,50 @@ class MedianImputation
//initiate output
output = input;
+ using PairType = std::pair<size_t, size_t>;
+ // dimensions and indexes are saved as pairs inside this vector.
+ std::vector<PairType> targets;
+ // good elements are kept inside this vector.
+ std::vector<double> elemsToKeep;
+
if (columnMajor)
{
- arma::Mat<T> medianMat = arma::median(input, 1);
for (size_t i = 0; i < input.n_cols; ++i)
{
if (input(dimension, i) == mappedValue ||
std::isnan(input(dimension, i)))
{
- output(dimension, i) = medianMat(dimension, 0);
+ targets.emplace_back(dimension, i);
+ }
+ else
+ {
+ elemsToKeep.push_back(input(dimension, i));
}
}
}
else
{
- arma::Mat<T> medianMat = arma::median(input, 0);
for (size_t i = 0; i < input.n_rows; ++i)
{
if (input(i, dimension) == mappedValue ||
std::isnan(input(i, dimension)))
{
- output(i, dimension) = medianMat(0, dimension);
+ targets.emplace_back(i, dimension);
+ }
+ else
+ {
+ elemsToKeep.push_back(input(i, dimension));
}
}
}
+
+ // calculate median
+ const double median = arma::median(arma::vec(elemsToKeep));
+
+ for (const PairType& target : targets)
+ {
+ output(target.first, target.second) = median;
+ }
}
/**
@@ -81,30 +101,50 @@ class MedianImputation
const size_t dimension,
const bool columnMajor = true)
{
+ using PairType = std::pair<size_t, size_t>;
+ // dimensions and indexes are saved as pairs inside this vector.
+ std::vector<PairType> targets;
+ // good elements are kept inside this vector.
+ std::vector<double> elemsToKeep;
+
if (columnMajor)
{
- arma::Mat<T> medianMat = arma::median(input, 1);
for (size_t i = 0; i < input.n_cols; ++i)
{
if (input(dimension, i) == mappedValue ||
std::isnan(input(dimension, i)))
{
- input(dimension, i) = medianMat(dimension, 0);
+ targets.emplace_back(dimension, i);
+ }
+ else
+ {
+ elemsToKeep.push_back(input(dimension, i));
}
}
}
else
{
- arma::Mat<T> medianMat = arma::median(input, 0);
for (size_t i = 0; i < input.n_rows; ++i)
{
if (input(i, dimension) == mappedValue ||
std::isnan(input(i, dimension)))
{
- input(i, dimension) = medianMat(0, dimension);
+ targets.emplace_back(i, dimension);
+ }
+ else
+ {
+ elemsToKeep.push_back(input(i, dimension));
}
}
}
+
+ // calculate median
+ const double median = arma::median(arma::vec(elemsToKeep));
+
+ for (const PairType& target : targets)
+ {
+ input(target.first, target.second) = median;
+ }
}
}; // class MedianImputation
diff --git a/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp b/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp
index e367b6a..bacc040 100644
--- a/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp
+++ b/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp
@@ -109,8 +109,6 @@ int main(int argc, char** argv)
<< endl;
}
- Log::Info << input << endl;
-
// Initialize imputer class
Imputer<double, MapperType, MeanImputation<double>> imputer(info);
if (strategy == "mean")
diff --git a/src/mlpack/tests/imputation_test.cpp b/src/mlpack/tests/imputation_test.cpp
index 9b19262..9d79bd9 100644
--- a/src/mlpack/tests/imputation_test.cpp
+++ b/src/mlpack/tests/imputation_test.cpp
@@ -233,7 +233,7 @@ BOOST_AUTO_TEST_CASE(MedianImputationTest)
BOOST_REQUIRE_CLOSE(outputT(0, 3), 0.0, 1e-5);
BOOST_REQUIRE_CLOSE(outputT(1, 0), 5.0, 1e-5);
BOOST_REQUIRE_CLOSE(outputT(1, 1), 6.0, 1e-5);
- BOOST_REQUIRE_CLOSE(outputT(1, 2), 5.5, 1e-5);
+ BOOST_REQUIRE_CLOSE(outputT(1, 2), 6.0, 1e-5);
BOOST_REQUIRE_CLOSE(outputT(1, 3), 6.0, 1e-5);
BOOST_REQUIRE_CLOSE(outputT(2, 0), 9.0, 1e-5);
BOOST_REQUIRE_CLOSE(outputT(2, 1), 8.0, 1e-5);
@@ -244,7 +244,7 @@ BOOST_AUTO_TEST_CASE(MedianImputationTest)
imputer.Impute(input, output, mappedValue, 1, false);
BOOST_REQUIRE_CLOSE(output(0, 0), 3.0, 1e-5);
- BOOST_REQUIRE_CLOSE(output(0, 1), 6.0, 1e-5);
+ BOOST_REQUIRE_CLOSE(output(0, 1), 7.0, 1e-5);
BOOST_REQUIRE_CLOSE(output(0, 2), 2.0, 1e-5);
BOOST_REQUIRE_CLOSE(output(0, 3), 0.0, 1e-5);
BOOST_REQUIRE_CLOSE(output(1, 0), 5.0, 1e-5);
@@ -264,7 +264,7 @@ BOOST_AUTO_TEST_CASE(MedianImputationTest)
BOOST_REQUIRE_CLOSE(input(0, 3), 0.0, 1e-5);
BOOST_REQUIRE_CLOSE(input(1, 0), 5.0, 1e-5);
BOOST_REQUIRE_CLOSE(input(1, 1), 6.0, 1e-5);
- BOOST_REQUIRE_CLOSE(input(1, 2), 5.5, 1e-5);
+ BOOST_REQUIRE_CLOSE(input(1, 2), 6.0, 1e-5);
BOOST_REQUIRE_CLOSE(input(1, 3), 6.0, 1e-5);
BOOST_REQUIRE_CLOSE(input(2, 0), 9.0, 1e-5);
BOOST_REQUIRE_CLOSE(input(2, 1), 8.0, 1e-5);
More information about the mlpack-git mailing list