[mlpack-git] master: modify custom impute interface and rename variables (2eb6754)
gitdub at mlpack.org
gitdub at mlpack.org
Sun Jul 10 17:26:48 EDT 2016
Repository : https://github.com/mlpack/mlpack
On branch : master
Link : https://github.com/mlpack/mlpack/compare/ecbfd24defe31d9f39708c0b4c6ad352cd46ed5c...7eec0609aa21cb12aeed3cbcaa1e411dad0359f2
>---------------------------------------------------------------
commit 2eb675467528574b65fa23d78e4e8d3e6e6ea6c5
Author: Keon Kim <kwk236 at gmail.com>
Date: Mon Jul 11 06:26:48 2016 +0900
modify custom impute interface and rename variables
>---------------------------------------------------------------
2eb675467528574b65fa23d78e4e8d3e6e6ea6c5
src/mlpack/core/data/dataset_mapper.hpp | 1 +
.../data/imputation_methods/custom_imputation.hpp | 20 +++++---
.../data/imputation_methods/listwise_deletion.hpp | 6 +--
.../data/imputation_methods/mean_imputation.hpp | 6 +--
.../data/imputation_methods/median_imputation.hpp | 6 +--
src/mlpack/core/data/imputer.hpp | 36 +++----------
.../core/data/map_policies/missing_policy.hpp | 20 ++++----
.../methods/preprocess/preprocess_imputer_main.cpp | 59 +++++++++-------------
src/mlpack/tests/data/impute_test.csv | 3 ++
src/mlpack/tests/imputation_test.cpp | 16 +++---
10 files changed, 76 insertions(+), 97 deletions(-)
diff --git a/src/mlpack/core/data/dataset_mapper.hpp b/src/mlpack/core/data/dataset_mapper.hpp
index ab9340c..0001438 100644
--- a/src/mlpack/core/data/dataset_mapper.hpp
+++ b/src/mlpack/core/data/dataset_mapper.hpp
@@ -131,6 +131,7 @@ class DatasetMapper
// Mappings from strings to integers.
// Map entries will only exist for dimensions that are categorical.
+ // MapType = map<dimension, pair<bimap<string, MappedType>, numMappings>>
using MapType = std::unordered_map<size_t, std::pair<BiMapType, size_t>>;
MapType maps;
diff --git a/src/mlpack/core/data/imputation_methods/custom_imputation.hpp b/src/mlpack/core/data/imputation_methods/custom_imputation.hpp
index 1698ba9..a34658b 100644
--- a/src/mlpack/core/data/imputation_methods/custom_imputation.hpp
+++ b/src/mlpack/core/data/imputation_methods/custom_imputation.hpp
@@ -19,30 +19,35 @@ template <typename T>
class CustomImputation
{
public:
+ CustomImputation(T customValue):
+ customValue(std::move(customValue))
+ {
+ // nothing to initialize here
+ }
+
/**
* Impute function searches through the input looking for mappedValue and
* replaces it with the user-defined custom value of the given dimension.
- * The result is saved to the output.
+ * The result is saved to the output. Custom value must be set when
+ * initializing the CustomImputation object.
*
* @param input Matrix that contains mappedValue.
* @param output Matrix that the result will be saved into.
* @param mappedValue Value that the user wants to get rid of.
- * @param customValue Value that the user wants to replace mappedValue with.
* @param dimension Index of the dimension of the mappedValue.
- * @param transpose State of whether the input matrix is transposed or not.
+ * @param columnMajor State of whether the input matrix is columnMajor or not.
*/
void Impute(const arma::Mat<T>& input,
arma::Mat<T>& output,
const T& mappedValue,
- const T& customValue,
const size_t dimension,
- const bool transpose = true)
+ const bool columnMajor = true)
{
// initiate output
output = input;
// replace the target value to custom value
- if (transpose)
+ if (columnMajor)
{
for (size_t i = 0; i < input.n_cols; ++i)
{
@@ -65,6 +70,9 @@ class CustomImputation
}
}
}
+
+ private:
+ T customValue;
}; // class CustomImputation
} // namespace data
diff --git a/src/mlpack/core/data/imputation_methods/listwise_deletion.hpp b/src/mlpack/core/data/imputation_methods/listwise_deletion.hpp
index 06db83a..9a695a6 100644
--- a/src/mlpack/core/data/imputation_methods/listwise_deletion.hpp
+++ b/src/mlpack/core/data/imputation_methods/listwise_deletion.hpp
@@ -28,19 +28,19 @@ class ListwiseDeletion
* @param output Matrix that the result will be saved into.
* @param mappedValue Value that the user wants to get rid of.
* @param dimension Index of the dimension of the mappedValue.
- * @param transpose State of whether the input matrix is transposed or not.
+ * @param columnMajor State of whether the input matrix is columnMajor or not.
*/
void Impute(const arma::Mat<T>& input,
arma::Mat<T>& output,
const T& mappedValue,
const size_t dimension,
- const bool transpose = true)
+ const bool columnMajor = true)
{
// initiate output
output = input;
size_t count = 0;
- if (transpose)
+ if (columnMajor)
{
for (size_t i = 0; i < input.n_cols; ++i)
{
diff --git a/src/mlpack/core/data/imputation_methods/mean_imputation.hpp b/src/mlpack/core/data/imputation_methods/mean_imputation.hpp
index 05134e5..c4085c6 100644
--- a/src/mlpack/core/data/imputation_methods/mean_imputation.hpp
+++ b/src/mlpack/core/data/imputation_methods/mean_imputation.hpp
@@ -28,13 +28,13 @@ class MeanImputation
* @param output Matrix that the result will be saved into.
* @param mappedValue Value that the user wants to get rid of.
* @param dimension Index of the dimension of the mappedValue.
- * @param transpose State of whether the input matrix is transposed or not.
+ * @param columnMajor State of whether the input matrix is columnMajor or not.
*/
void Impute(const arma::Mat<T>& input,
arma::Mat<T>& output,
const T& mappedValue,
const size_t dimension,
- const bool transpose = true)
+ const bool columnMajor = true)
{
// initiate output
output = input;
@@ -49,7 +49,7 @@ class MeanImputation
// calculate number of elements and sum of them excluding mapped value or
// nan. while doing that, remember where mappedValue or NaN exists.
- if (transpose)
+ if (columnMajor)
{
for (size_t i = 0; i < input.n_cols; ++i)
{
diff --git a/src/mlpack/core/data/imputation_methods/median_imputation.hpp b/src/mlpack/core/data/imputation_methods/median_imputation.hpp
index 8a111d4..0022366 100644
--- a/src/mlpack/core/data/imputation_methods/median_imputation.hpp
+++ b/src/mlpack/core/data/imputation_methods/median_imputation.hpp
@@ -29,18 +29,18 @@ class MedianImputation
* @param output Matrix that the result will be saved into.
* @param mappedValue Value that the user wants to get rid of.
* @param dimension Index of the dimension of the mappedValue.
- * @param transpose State of whether the input matrix is transposed or not.
+ * @param columnMajor State of whether the input matrix is columnMajor or not.
*/
void Impute(const arma::Mat<T>& input,
arma::Mat<T>& output,
const T& mappedValue,
const size_t dimension,
- const bool transpose = true)
+ const bool columnMajor = true)
{
//initiate output
output = input;
- if (transpose)
+ if (columnMajor)
{
arma::Mat<T> medianMat = arma::median(input, 1);
for (size_t i = 0; i < input.n_cols; ++i)
diff --git a/src/mlpack/core/data/imputer.hpp b/src/mlpack/core/data/imputer.hpp
index b719ba2..a30508b 100644
--- a/src/mlpack/core/data/imputer.hpp
+++ b/src/mlpack/core/data/imputer.hpp
@@ -28,17 +28,17 @@ template<typename T, typename MapperType, typename StrategyType>
class Imputer
{
public:
- Imputer(MapperType mapper, bool transpose = true):
+ Imputer(MapperType mapper, bool columnMajor = true):
mapper(std::move(mapper)),
- transpose(transpose)
+ columnMajor(columnMajor)
{
// Nothing to initialize here.
}
- Imputer(MapperType mapper, StrategyType strategy, bool transpose = true):
+ Imputer(MapperType mapper, StrategyType strategy, bool columnMajor = true):
strategy(std::move(strategy)),
mapper(std::move(mapper)),
- transpose(transpose)
+ columnMajor(columnMajor)
{
// Nothing to initialize here.
}
@@ -58,29 +58,7 @@ class Imputer
const size_t dimension)
{
T mappedValue = static_cast<T>(mapper.UnmapValue(missingValue, dimension));
- strategy.Impute(input, output, mappedValue, dimension, transpose);
- }
-
- /**
- * This overload of Impute() lets users to define custom value that can be
- * replaced with the target value.
- *
- * @param input Input dataset to apply imputation.
- * @param output Armadillo matrix to save the results
- * @oaran missingValue User defined missing value; it can be anything.
- * @param customValue The numeric value that a user wants to replace
- * missingValue with.
- * @param dimension Dimension to apply the imputation.
- */
- void Impute(const arma::Mat<T>& input,
- arma::Mat<T>& output,
- const std::string& missingValue,
- const T& customValue,
- const size_t dimension)
- {
- T mappedValue = static_cast<T>(mapper.UnmapValue(missingValue, dimension));
- strategy.Impute(input, output, mappedValue, customValue, dimension,
- transpose);
+ strategy.Impute(input, output, mappedValue, dimension, columnMajor);
}
//! Get the strategy
@@ -102,8 +80,8 @@ class Imputer
// DatasetMapperType<MapPolicy>
MapperType mapper;
- // save transpose as a member variable since it is rarely changed.
- bool transpose;
+ // save columnMajor as a member variable since it is rarely changed.
+ bool columnMajor;
}; // class Imputer
diff --git a/src/mlpack/core/data/map_policies/missing_policy.hpp b/src/mlpack/core/data/map_policies/missing_policy.hpp
index ead543a..ff60a5a 100644
--- a/src/mlpack/core/data/map_policies/missing_policy.hpp
+++ b/src/mlpack/core/data/map_policies/missing_policy.hpp
@@ -63,9 +63,11 @@ class MissingPolicy
template <typename MapType>
MappedType MapString(const std::string& string,
const size_t dimension,
- MapType maps,
+ MapType& maps,
std::vector<Datatype>& types)
{
+ // mute the unused parameter warning (does nothing here.)
+ (void)types;
// If this condition is true, either we have no mapping for the given string
// or we have no mappings for the given dimension at all. In either case,
// we create a mapping.
@@ -75,11 +77,10 @@ class MissingPolicy
maps[dimension].first.left.count(string) == 0))
{
// This string does not exist yet.
- size_t& numMappings = maps[dimension].second;
-
typedef boost::bimap<std::string, MappedType>::value_type PairType;
maps[dimension].first.insert(PairType(string, NaN));
+ size_t& numMappings = maps[dimension].second;
++numMappings;
return NaN;
}
@@ -87,6 +88,9 @@ class MissingPolicy
{
// This string already exists in the mapping
// or is not included in missingSet.
+ // Unlike IncrementPolicy, MissingPolicy counts all mapped values.
+ size_t& numMappings = maps[dimension].second;
+ ++numMappings;
return NaN;
}
}
@@ -121,17 +125,11 @@ class MissingPolicy
std::stringstream token;
for (size_t i = 0; i != tokens.size(); ++i)
{
- // if token is a number, but is included in the missingSet, map it.
- if (missingSet.find(tokens[i]) != std::end(missingSet))
- {
- const eT val = static_cast<eT>(this->MapString(tokens[i], row, maps,
- types));
- matrix.at(row, i) = val;
- }
token.str(tokens[i]);
token>>matrix.at(row, i);
// if the token is not number, map it.
- if (token.fail())
+ // or if token is a number, but is included in the missingSet, map it.
+ if (token.fail() || missingSet.find(tokens[i]) != std::end(missingSet))
{
const eT val = static_cast<eT>(this->MapString(tokens[i], row, maps,
types));
diff --git a/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp b/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp
index 603cdcc..e367b6a 100644
--- a/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp
+++ b/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp
@@ -72,16 +72,16 @@ int main(int argc, char** argv)
// If custom value is specified, and imputation strategy is not,
// set imputation strategy to "custom"
- if (CLI::HasParam("custom_value") && !CLI::HasParam("impute_strategy"))
+ if (CLI::HasParam("custom_value") && !CLI::HasParam("strategy"))
{
strategy = "custom";
- Log::Warn << "--custom_value is specified without --impute_strategy, "
- << "--impute_strategy is automatically set to 'custom'." << endl;
+ Log::Warn << "--custom_value is specified without --strategy, "
+ << "--strategy is automatically set to 'custom'." << endl;
}
// Custom value and any other impute strategies cannot be specified at
// the same time.
- if (CLI::HasParam("custom_value") && CLI::HasParam("impute_strategy") &&
+ if (CLI::HasParam("custom_value") && CLI::HasParam("strategy") &&
strategy != "custom")
Log::Fatal << "--custom_value cannot be specified with "
<< "impute strategies excluding 'custom' strategy" << endl;
@@ -109,15 +109,26 @@ int main(int argc, char** argv)
<< endl;
}
- // default imputer is mean imputation (to provide scope)
- Imputer<double, MapperType, MeanImputation<double>> impu(info);
- if (strategy == "median")
+ Log::Info << input << endl;
+
+ // Initialize imputer class
+ Imputer<double, MapperType, MeanImputation<double>> imputer(info);
+ if (strategy == "mean")
+ {
+ Imputer<double, MapperType, MeanImputation<double>> imputer(info);
+ }
+ else if (strategy == "median")
{
- Imputer<double, MapperType, MedianImputation<double>> impu(info);
+ Imputer<double, MapperType, MedianImputation<double>> imputer(info);
}
else if (strategy == "listwise_deletion")
{
- Imputer<double, MapperType, ListwiseDeletion<double>> impu(info);
+ Imputer<double, MapperType, ListwiseDeletion<double>> imputer(info);
+ }
+ else if (strategy == "custom")
+ {
+ CustomImputation<double> strat(customValue);
+ Imputer<double, MapperType, CustomImputation<double>> imputer(info, strat);
}
else
{
@@ -125,24 +136,15 @@ int main(int argc, char** argv)
<< endl;
}
- // Initialize imputer class
-
if (CLI::HasParam("dimension"))
{
// when --dimension is specified,
// the program will apply the changes to only the given dimension.
Log::Info << "Performing '" << strategy << "' imputation strategy "
- << "to replace '" << missingValue << "' on all dimensions." << endl;
+ << "to replace '" << missingValue << "' on dimension " << dimension
+ << "." << endl;
- if (strategy == "custom")
- {
- Imputer<double, MapperType, CustomImputation<double>> impu(info);
- impu.Impute(input, output, missingValue, customValue, dimension);
- }
- else
- {
- impu.Impute(input, output, missingValue, dimension);
- }
+ imputer.Impute(input, output, missingValue, dimension);
}
else
{
@@ -151,20 +153,9 @@ int main(int argc, char** argv)
Log::Info << "Performing '" << strategy << "' imputation strategy "
<< "to replace '" << missingValue << "' on all dimensions." << endl;
- if (strategy == "custom")
- {
- Imputer<double, MapperType, CustomImputation<double>> impu(info);
- for (size_t i = 0; i < input.n_rows; ++i)
- {
- impu.Impute(input, output, missingValue, customValue, i);
- }
- }
- else
+ for (size_t i = 0; i < input.n_rows; ++i)
{
- for (size_t i = 0; i < input.n_rows; ++i)
- {
- impu.Impute(input, output, missingValue, i);
- }
+ imputer.Impute(input, output, missingValue, i);
}
}
diff --git a/src/mlpack/tests/data/impute_test.csv b/src/mlpack/tests/data/impute_test.csv
new file mode 100644
index 0000000..06256a4
--- /dev/null
+++ b/src/mlpack/tests/data/impute_test.csv
@@ -0,0 +1,3 @@
+a, 2, 3
+5, 6, a
+1, 9, 1
diff --git a/src/mlpack/tests/imputation_test.cpp b/src/mlpack/tests/imputation_test.cpp
index e118bfb..08ef4e1 100644
--- a/src/mlpack/tests/imputation_test.cpp
+++ b/src/mlpack/tests/imputation_test.cpp
@@ -42,9 +42,8 @@ BOOST_AUTO_TEST_CASE(DatasetMapperImputerTest)
std::set<string> mset;
mset.insert("a");
- mset.insert("b");
- MissingPolicy miss(mset);
- DatasetMapper<MissingPolicy> info(miss);
+ MissingPolicy policy(mset);
+ DatasetMapper<MissingPolicy> info(policy);
BOOST_REQUIRE(data::Load("test_file.csv", input, info) == true);
// row and column test
@@ -63,10 +62,11 @@ BOOST_AUTO_TEST_CASE(DatasetMapperImputerTest)
BOOST_REQUIRE(std::isnan(input(2, 1)) == true);
BOOST_REQUIRE_CLOSE(input(2, 2), 10.0, 1e-5);
+ CustomImputation<double> customStrategy(99); // convert missing vals to 99.
Imputer<double,
DatasetMapper<MissingPolicy>,
- CustomImputation<double>> imputer(info);
- imputer.Impute(input, output, "a", 99, 0); // convert a -> 99 for dimension 0
+ CustomImputation<double>> imputer(info, customStrategy);
+ imputer.Impute(input, output, "a", 0); // convert a -> 99 for dimension 0
// Custom imputation result check
BOOST_REQUIRE_CLOSE(output(0, 0), 99.0, 1e-5);
@@ -96,10 +96,10 @@ BOOST_AUTO_TEST_CASE(CustomImputationTest)
double customValue = 99;
double mappedValue = 0.0;
- CustomImputation<double> imputer;
+ CustomImputation<double> imputer(customValue);
// transposed
- imputer.Impute(input, outputT, mappedValue, customValue, 0/*dimension*/, true);
+ imputer.Impute(input, outputT, mappedValue, 0/*dimension*/, true);
BOOST_REQUIRE_CLOSE(outputT(0, 0), 3.0, 1e-5);
BOOST_REQUIRE_CLOSE(outputT(0, 1), 99.0, 1e-5);
@@ -115,7 +115,7 @@ BOOST_AUTO_TEST_CASE(CustomImputationTest)
BOOST_REQUIRE_CLOSE(outputT(2, 3), 8.0, 1e-5);
// not transposed
- imputer.Impute(input, output, mappedValue, customValue, 1, false);
+ imputer.Impute(input, output, mappedValue, 1, false);
BOOST_REQUIRE_CLOSE(output(0, 0), 3.0, 1e-5);
BOOST_REQUIRE_CLOSE(output(0, 1), 99.0, 1e-5);
More information about the mlpack-git
mailing list