[mlpack-git] master: MissingPolicy uses NaN instead of numbers (896a018)
gitdub at mlpack.org
gitdub at mlpack.org
Mon Jul 25 12:19:00 EDT 2016
Repository : https://github.com/mlpack/mlpack
On branch : master
Link : https://github.com/mlpack/mlpack/compare/ecbfd24defe31d9f39708c0b4c6ad352cd46ed5c...7eec0609aa21cb12aeed3cbcaa1e411dad0359f2
>---------------------------------------------------------------
commit 896a01819631f3a4a0d7ab3fb08189d02435f1c9
Author: Keon Kim <kwk236 at gmail.com>
Date: Mon Jul 4 11:23:27 2016 +0900
MissingPolicy uses NaN instead of numbers
>---------------------------------------------------------------
896a01819631f3a4a0d7ab3fb08189d02435f1c9
src/mlpack/core/data/dataset_info.hpp | 2 +-
src/mlpack/core/data/dataset_info_impl.hpp | 2 +-
.../data/imputation_methods/custom_imputation.hpp | 7 +++++--
.../data/imputation_methods/listwise_deletion.hpp | 7 +++++--
.../data/imputation_methods/mean_imputation.hpp | 7 +++++--
.../data/imputation_methods/median_imputation.hpp | 6 ++++--
.../core/data/map_policies/increment_policy.hpp | 2 +-
.../core/data/map_policies/missing_policy.hpp | 14 +++++++++-----
src/mlpack/tests/imputation_test.cpp | 22 ++++++++++++++++------
9 files changed, 47 insertions(+), 22 deletions(-)
diff --git a/src/mlpack/core/data/dataset_info.hpp b/src/mlpack/core/data/dataset_info.hpp
index d87e027..14f2e1c 100644
--- a/src/mlpack/core/data/dataset_info.hpp
+++ b/src/mlpack/core/data/dataset_info.hpp
@@ -108,7 +108,7 @@ class DatasetMapper
}
//! Return the policy of the mapper.
- PolicyType& Policy() const;
+ const PolicyType& Policy() const;
//! Modify the policy of the mapper (be careful!).
PolicyType& Policy();
diff --git a/src/mlpack/core/data/dataset_info_impl.hpp b/src/mlpack/core/data/dataset_info_impl.hpp
index 1c35097..0f88688 100644
--- a/src/mlpack/core/data/dataset_info_impl.hpp
+++ b/src/mlpack/core/data/dataset_info_impl.hpp
@@ -114,7 +114,7 @@ inline size_t DatasetMapper<PolicyType>::Dimensionality() const
}
template<typename PolicyType>
-inline PolicyType& DatasetMapper<PolicyType>::Policy() const
+inline const PolicyType& DatasetMapper<PolicyType>::Policy() const
{
return this->policy;
}
diff --git a/src/mlpack/core/data/imputation_methods/custom_imputation.hpp b/src/mlpack/core/data/imputation_methods/custom_imputation.hpp
index 73100e2..fc95e30 100644
--- a/src/mlpack/core/data/imputation_methods/custom_imputation.hpp
+++ b/src/mlpack/core/data/imputation_methods/custom_imputation.hpp
@@ -8,6 +8,7 @@
#define MLPACK_CORE_DATA_IMPUTE_STRATEGIES_CUSTOM_IMPUTATION_HPP
#include <mlpack/core.hpp>
+#include <cmath>
using namespace std;
@@ -33,7 +34,8 @@ class CustomImputation
{
for (size_t i = 0; i < input.n_cols; ++i)
{
- if (input(dimension, i) == mappedValue)
+ if (input(dimension, i) == mappedValue ||
+ std::isnan(input(dimension, i)))
{
output(dimension, i) = customValue;
}
@@ -43,7 +45,8 @@ class CustomImputation
{
for (size_t i = 0; i < input.n_rows; ++i)
{
- if (input(i, dimension) == mappedValue)
+ if (input(i, dimension) == mappedValue ||
+ std::isnan(input(i, dimension)))
{
output(i, dimension) = customValue;
}
diff --git a/src/mlpack/core/data/imputation_methods/listwise_deletion.hpp b/src/mlpack/core/data/imputation_methods/listwise_deletion.hpp
index f957a85..19487fa 100644
--- a/src/mlpack/core/data/imputation_methods/listwise_deletion.hpp
+++ b/src/mlpack/core/data/imputation_methods/listwise_deletion.hpp
@@ -8,6 +8,7 @@
#define MLPACK_CORE_DATA_IMPUTE_STRATEGIES_LISTWISE_DELETION_HPP
#include <mlpack/core.hpp>
+#include <cmath>
using namespace std;
@@ -36,7 +37,8 @@ class ListwiseDeletion
{
for (size_t i = 0; i < input.n_cols; ++i)
{
- if (input(dimension, i) == mappedValue)
+ if (input(dimension, i) == mappedValue ||
+ std::isnan(input(dimension, i)))
{
output.shed_col(i - count);
count++;
@@ -47,7 +49,8 @@ class ListwiseDeletion
{
for (size_t i = 0; i < input.n_rows; ++i)\
{
- if (input(i, dimension) == mappedValue)
+ if (input(i, dimension) == mappedValue ||
+ std::isnan(input(i, dimension)))
{
output.shed_row(i - count);
count++;
diff --git a/src/mlpack/core/data/imputation_methods/mean_imputation.hpp b/src/mlpack/core/data/imputation_methods/mean_imputation.hpp
index 43f14a0..3c3f853 100644
--- a/src/mlpack/core/data/imputation_methods/mean_imputation.hpp
+++ b/src/mlpack/core/data/imputation_methods/mean_imputation.hpp
@@ -8,6 +8,7 @@
#define MLPACK_CORE_DATA_IMPUTE_STRATEGIES_MEAN_IMPUTATION_HPP
#include <mlpack/core.hpp>
+#include <cmath>
using namespace std;
@@ -44,7 +45,8 @@ class MeanImputation
{
for (size_t i = 0; i < input.n_cols; ++i)
{
- if (input(dimension, i) == mappedValue)
+ if (input(dimension, i) == mappedValue ||
+ std::isnan(input(dimension, i)))
{
targets.emplace_back(dimension, i);
}
@@ -59,7 +61,8 @@ class MeanImputation
{
for (size_t i = 0; i < input.n_rows; ++i)
{
- if (input(i, dimension) == mappedValue)
+ if (input(i, dimension) == mappedValue ||
+ std::isnan(input(i, dimension)))
{
targets.emplace_back(i, dimension);
}
diff --git a/src/mlpack/core/data/imputation_methods/median_imputation.hpp b/src/mlpack/core/data/imputation_methods/median_imputation.hpp
index 05eff34..c46d326 100644
--- a/src/mlpack/core/data/imputation_methods/median_imputation.hpp
+++ b/src/mlpack/core/data/imputation_methods/median_imputation.hpp
@@ -36,7 +36,8 @@ class MedianImputation
arma::Mat<T> medianMat = arma::median(input, 1);
for (size_t i = 0; i < input.n_cols; ++i)
{
- if (input(dimension, i) == mappedValue)
+ if (input(dimension, i) == mappedValue ||
+ std::isnan(input(dimension, i)))
{
output(dimension, i) = medianMat(dimension, 0);
}
@@ -47,7 +48,8 @@ class MedianImputation
arma::Mat<T> medianMat = arma::median(input, 0);
for (size_t i = 0; i < input.n_rows; ++i)
{
- if (input(i, dimension) == mappedValue)
+ if (input(i, dimension) == mappedValue ||
+ std::isnan(input(i, dimension)))
{
output(i, dimension) = medianMat(0, dimension);
}
diff --git a/src/mlpack/core/data/map_policies/increment_policy.hpp b/src/mlpack/core/data/map_policies/increment_policy.hpp
index d4b104b..3aa0956 100644
--- a/src/mlpack/core/data/map_policies/increment_policy.hpp
+++ b/src/mlpack/core/data/map_policies/increment_policy.hpp
@@ -46,7 +46,7 @@ class IncrementPolicy
if (numMappings == 0)
types[dimension] = Datatype::categorical;
- typedef boost::bimap<std::string, size_t>::value_type PairType;
+ typedef boost::bimap<std::string, mapped_type>::value_type PairType;
maps[dimension].first.insert(PairType(string, numMappings));
return numMappings++;
}
diff --git a/src/mlpack/core/data/map_policies/missing_policy.hpp b/src/mlpack/core/data/map_policies/missing_policy.hpp
index c5df023..b041fe1 100644
--- a/src/mlpack/core/data/map_policies/missing_policy.hpp
+++ b/src/mlpack/core/data/map_policies/missing_policy.hpp
@@ -11,6 +11,7 @@
#include <unordered_map>
#include <boost/bimap.hpp>
#include <mlpack/core/data/map_policies/datatype.hpp>
+#include <limits>
using namespace std;
@@ -25,7 +26,7 @@ class MissingPolicy
{
public:
// typedef of mapped_type
- using mapped_type = size_t;
+ using mapped_type = double;
MissingPolicy()
{
@@ -48,21 +49,24 @@ class MissingPolicy
// If this condition is true, either we have no mapping for the given string
// or we have no mappings for the given dimension at all. In either case,
// we create a mapping.
+ const double NaN = std::numeric_limits<double>::quiet_NaN();
if (missingSet.count(string) != 0 &&
(maps.count(dimension) == 0 ||
maps[dimension].first.left.count(string) == 0))
{
// This string does not exist yet.
size_t& numMappings = maps[dimension].second;
+ numMappings++;
- typedef boost::bimap<std::string, size_t>::value_type PairType;
- maps[dimension].first.insert(PairType(string, numMappings));
- return numMappings++;
+ typedef boost::bimap<std::string, mapped_type>::value_type PairType;
+ maps[dimension].first.insert(PairType(string, NaN));
+ return NaN;
}
else
{
// This string already exists in the mapping.
- return maps[dimension].first.left.at(string);
+ //return maps[dimension].first.left.at(string);
+ return NaN;
}
}
private:
diff --git a/src/mlpack/tests/imputation_test.cpp b/src/mlpack/tests/imputation_test.cpp
index c7d6d4c..6f88ca2 100644
--- a/src/mlpack/tests/imputation_test.cpp
+++ b/src/mlpack/tests/imputation_test.cpp
@@ -33,13 +33,12 @@ BOOST_AUTO_TEST_CASE(DatasetMapperImputerTest)
fstream f;
f.open("test_file.csv", fstream::out);
f << "a, 2, 3" << endl;
- f << "5, 6, b" << endl;
+ f << "5, 6, a" << endl;
f << "8, 9, 10" << endl;
f.close();
arma::mat input;
arma::mat output;
- size_t dimension = 0;
std::set<string> mset;
mset.insert("a");
@@ -48,17 +47,28 @@ BOOST_AUTO_TEST_CASE(DatasetMapperImputerTest)
DatasetMapper<MissingPolicy> info(miss);
BOOST_REQUIRE(data::Load("test_file.csv", input, info) == true);
+ // row and column test
BOOST_REQUIRE_EQUAL(input.n_rows, 3);
BOOST_REQUIRE_EQUAL(input.n_cols, 3);
- /* TODO: Connect Load with the new DatasetMapper instead of DatasetInfo*/
+ // Load check
+ // MissingPolicy should convert strings to nans
+ BOOST_REQUIRE(std::isnan(output(0, 0)));
+ BOOST_REQUIRE_CLOSE(output(0, 1), 5.0, 1e-5);
+ BOOST_REQUIRE_CLOSE(output(0, 2), 8.0, 1e-5);
+ BOOST_REQUIRE_CLOSE(output(1, 0), 2.0, 1e-5);
+ BOOST_REQUIRE_CLOSE(output(1, 1), 6.0, 1e-5);
+ BOOST_REQUIRE_CLOSE(output(1, 2), 9.0, 1e-5);
+ BOOST_REQUIRE_CLOSE(output(2, 0), 3.0, 1e-5);
+ BOOST_REQUIRE(std::isnan(output(2, 1)));
+ BOOST_REQUIRE_CLOSE(output(2, 2), 10.0, 1e-5);
Imputer<double,
DatasetMapper<MissingPolicy>,
CustomImputation<double>> imputer(info);
- imputer.Impute(input, output, "a", 99, dimension); // convert a -> 99
- imputer.Impute(input, output, "b", 99, dimension); // convert b -> 99
+ imputer.Impute(input, output, "a", 99, 0); // convert a -> 99 for dimension 0
+ // Custom imputation result check
BOOST_REQUIRE_CLOSE(output(0, 0), 99.0, 1e-5);
BOOST_REQUIRE_CLOSE(output(0, 1), 5.0, 1e-5);
BOOST_REQUIRE_CLOSE(output(0, 2), 8.0, 1e-5);
@@ -66,7 +76,7 @@ BOOST_AUTO_TEST_CASE(DatasetMapperImputerTest)
BOOST_REQUIRE_CLOSE(output(1, 1), 6.0, 1e-5);
BOOST_REQUIRE_CLOSE(output(1, 2), 9.0, 1e-5);
BOOST_REQUIRE_CLOSE(output(2, 0), 3.0, 1e-5);
- BOOST_REQUIRE_CLOSE(output(2, 1), 99.0, 1e-5);
+ BOOST_REQUIRE(std::isnan(output(2, 1))); // remains as NaN
BOOST_REQUIRE_CLOSE(output(2, 2), 10.0, 1e-5);
// Remove the file.
More information about the mlpack-git mailing list