[mlpack-git] master: add comments and use more proper names (63268a3)
gitdub at mlpack.org
gitdub at mlpack.org
Mon Jul 25 12:19:16 EDT 2016
Repository : https://github.com/mlpack/mlpack
On branch : master
Link : https://github.com/mlpack/mlpack/compare/ecbfd24defe31d9f39708c0b4c6ad352cd46ed5c...7eec0609aa21cb12aeed3cbcaa1e411dad0359f2
>---------------------------------------------------------------
commit 63268a3f1cc1ace8143ae0e6f9e8d9aa81822fd2
Author: Keon Kim <kwk236 at gmail.com>
Date: Fri Jul 8 03:52:03 2016 +0900
add comments and use more proper names
>---------------------------------------------------------------
63268a3f1cc1ace8143ae0e6f9e8d9aa81822fd2
src/mlpack/core/data/CMakeLists.txt | 4 +-
.../data/{dataset_info.hpp => dataset_mapper.hpp} | 15 ++--
...taset_info_impl.hpp => dataset_mapper_impl.hpp} | 29 +++----
.../data/imputation_methods/custom_imputation.hpp | 20 ++++-
.../data/imputation_methods/listwise_deletion.hpp | 19 +++--
.../data/imputation_methods/mean_imputation.hpp | 18 +++--
.../data/imputation_methods/median_imputation.hpp | 19 +++--
src/mlpack/core/data/imputer.hpp | 29 ++++---
src/mlpack/core/data/load.hpp | 2 +-
src/mlpack/core/data/map_policies/datatype.hpp | 2 -
.../core/data/map_policies/increment_policy.hpp | 53 +++++++++---
.../core/data/map_policies/missing_policy.hpp | 77 ++++++++++++++----
.../methods/preprocess/preprocess_imputer_main.cpp | 94 +++++++++++++---------
src/mlpack/tests/imputation_test.cpp | 2 +-
14 files changed, 258 insertions(+), 125 deletions(-)
diff --git a/src/mlpack/core/data/CMakeLists.txt b/src/mlpack/core/data/CMakeLists.txt
index 65e4fc3..2fbd5d3 100644
--- a/src/mlpack/core/data/CMakeLists.txt
+++ b/src/mlpack/core/data/CMakeLists.txt
@@ -1,8 +1,8 @@
# Define the files that we need to compile.
# Anything not in this list will not be compiled into mlpack.
set(SOURCES
- dataset_info.hpp
- dataset_info_impl.hpp
+ dataset_mapper.hpp
+ dataset_mapper_impl.hpp
extension.hpp
format.hpp
load.hpp
diff --git a/src/mlpack/core/data/dataset_info.hpp b/src/mlpack/core/data/dataset_mapper.hpp
similarity index 94%
rename from src/mlpack/core/data/dataset_info.hpp
rename to src/mlpack/core/data/dataset_mapper.hpp
index c504540..ab9340c 100644
--- a/src/mlpack/core/data/dataset_info.hpp
+++ b/src/mlpack/core/data/dataset_mapper.hpp
@@ -1,6 +1,7 @@
/**
- * @file dataset_info.hpp
+ * @file dataset_mapper.hpp
* @author Ryan Curtin
+ * @author Keon Kim
*
* Defines the DatasetMapper class, which holds information about a dataset.
* This is useful when the dataset contains categorical non-numeric features
@@ -53,7 +54,7 @@ class DatasetMapper
* @param string String to find/create mapping for.
* @param dimension Index of the dimension of the string.
*/
- typename PolicyType::mapped_type MapString(const std::string& string,
+ typename PolicyType::MappedType MapString(const std::string& string,
const size_t dimension);
/**
@@ -75,13 +76,13 @@ class DatasetMapper
* @param string Mapped string for value.
* @param dimension Dimension to unmap string from.
*/
- typename PolicyType::mapped_type UnmapValue(const std::string& string,
+ typename PolicyType::MappedType UnmapValue(const std::string& string,
const size_t dimension);
template <typename eT>
void MapTokens(const std::vector<std::string>& tokens,
- size_t& row,
- arma::Mat<eT>& matrix);
+ size_t& row,
+ arma::Mat<eT>& matrix);
//! Return the type of a given dimension (numeric or categorical).
Datatype Type(const size_t dimension) const;
@@ -126,7 +127,7 @@ class DatasetMapper
std::vector<Datatype> types;
// BiMapType definition
- using BiMapType = boost::bimap<std::string, typename PolicyType::mapped_type>;
+ using BiMapType = boost::bimap<std::string, typename PolicyType::MappedType>;
// Mappings from strings to integers.
// Map entries will only exist for dimensions that are categorical.
@@ -143,6 +144,6 @@ using DatasetInfo = DatasetMapper<data::IncrementPolicy>;
} // namespace data
} // namespace mlpack
-#include "dataset_info_impl.hpp"
+#include "dataset_mapper_impl.hpp"
#endif
diff --git a/src/mlpack/core/data/dataset_info_impl.hpp b/src/mlpack/core/data/dataset_mapper_impl.hpp
similarity index 78%
rename from src/mlpack/core/data/dataset_info_impl.hpp
rename to src/mlpack/core/data/dataset_mapper_impl.hpp
index 6bb7d75..6b291e2 100644
--- a/src/mlpack/core/data/dataset_info_impl.hpp
+++ b/src/mlpack/core/data/dataset_mapper_impl.hpp
@@ -1,6 +1,7 @@
/**
- * @file dataset_info_impl.hpp
+ * @file dataset_mapper_impl.hpp
* @author Ryan Curtin
+ * @author Keon Kim
*
* An implementation of the DatasetMapper<PolicyType> class.
*/
@@ -8,7 +9,7 @@
#define MLPACK_CORE_DATA_DATASET_INFO_IMPL_HPP
// In case it hasn't already been included.
-#include "dataset_info.hpp"
+#include "dataset_mapper.hpp"
namespace mlpack {
namespace data {
@@ -23,7 +24,7 @@ inline DatasetMapper<PolicyType>::DatasetMapper(const size_t dimensionality) :
template<typename PolicyType>
inline DatasetMapper<PolicyType>::DatasetMapper(PolicyType& policy,
- const size_t dimensionality) :
+ const size_t dimensionality) :
types(dimensionality, Datatype::numeric),
policy(std::move(policy))
{
@@ -33,9 +34,9 @@ inline DatasetMapper<PolicyType>::DatasetMapper(PolicyType& policy,
// When we want to insert value into the map,
// we could use the policy to map the string
template<typename PolicyType>
-inline typename PolicyType::mapped_type DatasetMapper<PolicyType>::MapString(
- const std::string& string,
- const size_t dimension)
+inline typename PolicyType::MappedType DatasetMapper<PolicyType>::MapString(
+ const std::string& string,
+ const size_t dimension)
{
return policy.template MapString<MapType>(string, dimension, maps, types);
}
@@ -43,8 +44,8 @@ inline typename PolicyType::mapped_type DatasetMapper<PolicyType>::MapString(
// Return the string corresponding to a value in a given dimension.
template<typename PolicyType>
inline const std::string& DatasetMapper<PolicyType>::UnmapString(
- const size_t value,
- const size_t dimension)
+ const size_t value,
+ const size_t dimension)
{
// Throw an exception if the value doesn't exist.
if (maps[dimension].first.right.count(value) == 0)
@@ -60,9 +61,9 @@ inline const std::string& DatasetMapper<PolicyType>::UnmapString(
// Return the value corresponding to a string in a given dimension.
template<typename PolicyType>
-inline typename PolicyType::mapped_type DatasetMapper<PolicyType>::UnmapValue(
- const std::string& string,
- const size_t dimension)
+inline typename PolicyType::MappedType DatasetMapper<PolicyType>::UnmapValue(
+ const std::string& string,
+ const size_t dimension)
{
// Throw an exception if the value doesn't exist.
if (maps[dimension].first.left.count(string) == 0)
@@ -79,9 +80,9 @@ inline typename PolicyType::mapped_type DatasetMapper<PolicyType>::UnmapValue(
template<typename PolicyType>
template<typename eT>
inline void DatasetMapper<PolicyType>::MapTokens(
- const std::vector<std::string>& tokens,
- size_t& row,
- arma::Mat<eT>& matrix)
+ const std::vector<std::string>& tokens,
+ size_t& row,
+ arma::Mat<eT>& matrix)
{
return policy.template MapTokens<eT, MapType>(tokens, row, matrix, maps,
types);
diff --git a/src/mlpack/core/data/imputation_methods/custom_imputation.hpp b/src/mlpack/core/data/imputation_methods/custom_imputation.hpp
index 83bde9d..1698ba9 100644
--- a/src/mlpack/core/data/imputation_methods/custom_imputation.hpp
+++ b/src/mlpack/core/data/imputation_methods/custom_imputation.hpp
@@ -8,17 +8,29 @@
#define MLPACK_CORE_DATA_IMPUTE_STRATEGIES_CUSTOM_IMPUTATION_HPP
#include <mlpack/core.hpp>
-#include <cmath>
-
-using namespace std;
namespace mlpack {
namespace data {
-
+/**
+ * A simple custom imputation class
+ * @tparam T Type of armadillo matrix
+ */
template <typename T>
class CustomImputation
{
public:
+ /**
+ * Impute function searches through the input looking for mappedValue and
+ * replaces it with the user-defined custom value of the given dimension.
+ * The result is saved to the output.
+ *
+ * @param input Matrix that contains mappedValue.
+ * @param output Matrix that the result will be saved into.
+ * @param mappedValue Value that the user wants to get rid of.
+ * @param customValue Value that the user wants to replace mappedValue with.
+ * @param dimension Index of the dimension of the mappedValue.
+ * @param transpose State of whether the input matrix is transposed or not.
+ */
void Impute(const arma::Mat<T>& input,
arma::Mat<T>& output,
const T& mappedValue,
diff --git a/src/mlpack/core/data/imputation_methods/listwise_deletion.hpp b/src/mlpack/core/data/imputation_methods/listwise_deletion.hpp
index 9f3a7d2..06db83a 100644
--- a/src/mlpack/core/data/imputation_methods/listwise_deletion.hpp
+++ b/src/mlpack/core/data/imputation_methods/listwise_deletion.hpp
@@ -8,21 +8,28 @@
#define MLPACK_CORE_DATA_IMPUTE_STRATEGIES_LISTWISE_DELETION_HPP
#include <mlpack/core.hpp>
-#include <cmath>
-
-using namespace std;
namespace mlpack {
namespace data {
-
/**
- * complete-case analysis.
+ * A complete-case analysis to remove the values containing mappedValue.
* Removes all data for a case that has one or more missing values.
+ * @tparam T Type of armadillo matrix
*/
template <typename T>
class ListwiseDeletion
{
public:
+ /**
+ * Impute function searches through the input looking for mappedValue and
+ * remove the whole row or column. The result is saved to the output.
+ *
+ * @param input Matrix that contains mappedValue.
+ * @param output Matrix that the result will be saved into.
+ * @param mappedValue Value that the user wants to get rid of.
+ * @param dimension Index of the dimension of the mappedValue.
+ * @param transpose State of whether the input matrix is transposed or not.
+ */
void Impute(const arma::Mat<T>& input,
arma::Mat<T>& output,
const T& mappedValue,
@@ -47,7 +54,7 @@ class ListwiseDeletion
}
else
{
- for (size_t i = 0; i < input.n_rows; ++i)\
+ for (size_t i = 0; i < input.n_rows; ++i)
{
if (input(i, dimension) == mappedValue ||
std::isnan(input(i, dimension)))
diff --git a/src/mlpack/core/data/imputation_methods/mean_imputation.hpp b/src/mlpack/core/data/imputation_methods/mean_imputation.hpp
index 157ba53..05134e5 100644
--- a/src/mlpack/core/data/imputation_methods/mean_imputation.hpp
+++ b/src/mlpack/core/data/imputation_methods/mean_imputation.hpp
@@ -8,20 +8,28 @@
#define MLPACK_CORE_DATA_IMPUTE_STRATEGIES_MEAN_IMPUTATION_HPP
#include <mlpack/core.hpp>
-#include <cmath>
-
-using namespace std;
namespace mlpack {
namespace data {
-
/**
- * A simple mean imputation
+ * A simple mean imputation class
+ * @tparam T Type of armadillo matrix
*/
template <typename T>
class MeanImputation
{
public:
+ /**
+ * Impute function searches through the input looking for mappedValue and
+ * replaces it with the mean of the given dimension. The result is saved
+ * to the output.
+ *
+ * @param input Matrix that contains mappedValue.
+ * @param output Matrix that the result will be saved into.
+ * @param mappedValue Value that the user wants to get rid of.
+ * @param dimension Index of the dimension of the mappedValue.
+ * @param transpose State of whether the input matrix is transposed or not.
+ */
void Impute(const arma::Mat<T>& input,
arma::Mat<T>& output,
const T& mappedValue,
diff --git a/src/mlpack/core/data/imputation_methods/median_imputation.hpp b/src/mlpack/core/data/imputation_methods/median_imputation.hpp
index 0035be9..8a111d4 100644
--- a/src/mlpack/core/data/imputation_methods/median_imputation.hpp
+++ b/src/mlpack/core/data/imputation_methods/median_imputation.hpp
@@ -9,19 +9,28 @@
#include <mlpack/core.hpp>
-using namespace std;
-
namespace mlpack {
namespace data {
-
/**
- * A simple median imputation
+ * This is a class implementation of simple median imputation.
* replace missing value with middle or average of middle values
+ * @tparam T Type of armadillo matrix
*/
template <typename T>
class MedianImputation
{
public:
+ /**
+ * Impute function searches through the input looking for mappedValue and
+ * replaces it with the median of the given dimension. The result is saved
+ * to the output.
+ *
+ * @param input Matrix that contains mappedValue.
+ * @param output Matrix that the result will be saved into.
+ * @param mappedValue Value that the user wants to get rid of.
+ * @param dimension Index of the dimension of the mappedValue.
+ * @param transpose State of whether the input matrix is transposed or not.
+ */
void Impute(const arma::Mat<T>& input,
arma::Mat<T>& output,
const T& mappedValue,
@@ -56,7 +65,7 @@ class MedianImputation
}
}
}
-}; // class MeanImputation
+}; // class MedianImputation
} // namespace data
} // namespace mlpack
diff --git a/src/mlpack/core/data/imputer.hpp b/src/mlpack/core/data/imputer.hpp
index 3bc9cb5..b719ba2 100644
--- a/src/mlpack/core/data/imputer.hpp
+++ b/src/mlpack/core/data/imputer.hpp
@@ -9,7 +9,9 @@
#define MLPACK_CORE_DATA_IMPUTER_HPP
#include <mlpack/core.hpp>
-#include "dataset_info.hpp"
+#include "dataset_mapper.hpp"
+#include "map_policies/missing_policy.hpp"
+#include "map_policies/increment_policy.hpp"
namespace mlpack {
namespace data {
@@ -27,22 +29,18 @@ class Imputer
{
public:
Imputer(MapperType mapper, bool transpose = true):
- mapper(std::move(mapper)),
- transpose(transpose)
+ mapper(std::move(mapper)),
+ transpose(transpose)
{
- //static_assert(std::is_same<typename std::decay<MapperType>::type,
- //data::IncrementPolicy>::value, "The type of MapperType must be "
- //"IncrementPolicy");
+ // Nothing to initialize here.
}
Imputer(MapperType mapper, StrategyType strategy, bool transpose = true):
- strategy(std::move(strategy)),
- mapper(std::move(mapper)),
- transpose(transpose)
+ strategy(std::move(strategy)),
+ mapper(std::move(mapper)),
+ transpose(transpose)
{
- //static_assert(std::is_same<typename std::decay<MapperType>::type,
- //data::IncrementPolicy>::value, "The type of MapperType must be "
- //"IncrementPolicy");
+ // Nothing to initialize here.
}
/**
@@ -66,6 +64,13 @@ class Imputer
/**
* This overload of Impute() lets users to define custom value that can be
* replaced with the target value.
+ *
+ * @param input Input dataset to apply imputation.
+ * @param output Armadillo matrix to save the results
+ * @oaran missingValue User defined missing value; it can be anything.
+ * @param customValue The numeric value that a user wants to replace
+ * missingValue with.
+ * @param dimension Dimension to apply the imputation.
*/
void Impute(const arma::Mat<T>& input,
arma::Mat<T>& output,
diff --git a/src/mlpack/core/data/load.hpp b/src/mlpack/core/data/load.hpp
index 4b5debe..40d3834 100644
--- a/src/mlpack/core/data/load.hpp
+++ b/src/mlpack/core/data/load.hpp
@@ -14,7 +14,7 @@
#include <string>
#include "format.hpp"
-#include "dataset_info.hpp"
+#include "dataset_mapper.hpp"
namespace mlpack {
namespace data /** Functions to load and save matrices and models. */ {
diff --git a/src/mlpack/core/data/map_policies/datatype.hpp b/src/mlpack/core/data/map_policies/datatype.hpp
index 0cafba8..3a3b1ac 100644
--- a/src/mlpack/core/data/map_policies/datatype.hpp
+++ b/src/mlpack/core/data/map_policies/datatype.hpp
@@ -10,7 +10,6 @@
namespace mlpack {
namespace data {
-
/**
* The Datatype enum specifies the types of data mlpack algorithms can use.
* The vast majority of mlpack algorithms can only use numeric data (i.e.
@@ -23,7 +22,6 @@ enum Datatype : bool /* [> bool is all the precision we need for two types <] */
categorical = 1
};
-
} // namespace data
} // namespace mlpack
diff --git a/src/mlpack/core/data/map_policies/increment_policy.hpp b/src/mlpack/core/data/map_policies/increment_policy.hpp
index 4a971d2..4ff7341 100644
--- a/src/mlpack/core/data/map_policies/increment_policy.hpp
+++ b/src/mlpack/core/data/map_policies/increment_policy.hpp
@@ -12,26 +12,39 @@
#include <boost/bimap.hpp>
#include <mlpack/core/data/map_policies/datatype.hpp>
-using namespace std;
-
namespace mlpack {
namespace data {
-
/**
- * This class is used to map strings to incrementing unsigned integers (size_t).
- * First string to be mapped will be mapped to 0, next to 1 and so on.
+ * IncrementPolicy is used as a helper class for DatasetMapper. It tells how the
+ * strings should be mapped. Purpose of this policy is to map all dimension if
+ * one if the variables in a dimension turns out to be a categorical variable.
+ * IncrementPolicy maps strings to incrementing unsigned integers (size_t).
+ * The first string to be mapped will be mapped to 0, the next to 1 and so on.
*/
class IncrementPolicy
{
public:
- // typedef of mapped_type
- using mapped_type = size_t;
+ // typedef of MappedType
+ using MappedType = size_t;
+ /**
+ * Given the string and the dimension to which the it belongs, and the maps
+ * and types given by the DatasetMapper class, returns its numeric mapping.
+ * If no mapping yet exists, the string is added to the list of mappings for
+ * the given dimension. This function is used as a helper function for
+ * DatasetMapper class.
+ *
+ * @tparam MapType Type of unordered_map that contains mapped value pairs
+ * @param string String to find/create mapping for.
+ * @param dimension Index of the dimension of the string.
+ * @param maps Unordered map given by the DatasetMapper.
+ * @param types Vector containing the type information about each dimensions.
+ */
template <typename MapType>
- mapped_type MapString(const std::string& string,
- const size_t dimension,
- MapType& maps,
- std::vector<Datatype>& types)
+ MappedType MapString(const std::string& string,
+ const size_t dimension,
+ MapType& maps,
+ std::vector<Datatype>& types)
{
// If this condition is true, either we have no mapping for the given string
// or we have no mappings for the given dimension at all. In either case,
@@ -46,7 +59,7 @@ class IncrementPolicy
if (numMappings == 0)
types[dimension] = Datatype::categorical;
- typedef boost::bimap<std::string, mapped_type>::value_type PairType;
+ typedef boost::bimap<std::string, MappedType>::value_type PairType;
maps[dimension].first.insert(PairType(string, numMappings));
return numMappings++;
}
@@ -57,6 +70,21 @@ class IncrementPolicy
}
}
+ /**
+ * MapTokens turns vector of strings into numeric variables and puts them
+ * into a given matrix. It is used as a helper function when trying to load
+ * files. Each dimension's tokens are given in to this function. If one of the
+ * tokens turns out to be a string, all the tokens should be mapped using the
+ * MapString() funciton.
+ *
+ * @tparam eT Type of armadillo matrix.
+ * @tparam MapType Type of unordered_map that contains mapped value pairs.
+ * @param tokens Vector of variables inside a dimension.
+ * @param row Position of the given tokens.
+ * @param matrix Matrix to save the data into.
+ * @param maps Maps given by the DatasetMapper class.
+ * @param types Types of each dimensions given by the DatasetMapper class.
+ */
template <typename eT, typename MapType>
void MapTokens(const std::vector<std::string>& tokens,
size_t& row,
@@ -81,7 +109,6 @@ class IncrementPolicy
{
const eT val = static_cast<eT>(this->MapString(tokens[i], row, maps,
types));
- double temp = (double) val;
matrix.at(row, i) = val;
}
}
diff --git a/src/mlpack/core/data/map_policies/missing_policy.hpp b/src/mlpack/core/data/map_policies/missing_policy.hpp
index c68fc27..ead543a 100644
--- a/src/mlpack/core/data/map_policies/missing_policy.hpp
+++ b/src/mlpack/core/data/map_policies/missing_policy.hpp
@@ -13,38 +13,58 @@
#include <mlpack/core/data/map_policies/datatype.hpp>
#include <limits>
-
-using namespace std;
-
namespace mlpack {
namespace data {
-
/**
- * Same as increment map policy, but does not change type of features.
+ * MissingPolicy is used as a helper class for DatasetMapper. It tells how the
+ * strings should be mapped. Purpose of this policy is to map all user-defined
+ * missing variables into maps so that users can decide what to do with the
+ * corrupted data. User-defined missing variables are given by the missingSet.
+ * Note that MissingPolicy does not change type of features.
*/
class MissingPolicy
{
public:
- // typedef of mapped_type
- using mapped_type = double;
+ // typedef of MappedType
+ using MappedType = double;
MissingPolicy()
{
// Nothing to initialize here.
}
+ /**
+ * Create the MissingPolicy object with the given missingSet. Note that the
+ * missingSet cannot be changed later; you will have to create a new
+ * MissingPolicy object.
+ *
+ * @param missingSet Set of strings that should be mapped.
+ */
explicit MissingPolicy(std::set<std::string> missingSet) :
- missingSet(std::move(missingSet))
+ missingSet(std::move(missingSet))
{
// Nothing to initialize here.
}
-
+ /**
+ * Given the string and the dimension to which it belongs by the user, and
+ * the maps and types given by the DatasetMapper class, returns its numeric
+ * mapping. If no mapping yet exists and the string is included in the
+ * missingSet, the string is added to the list of mappings for the given
+ * dimension. This function is used as a helper function for DatasetMapper
+ * class.
+ *
+ * @tparam MapType Type of unordered_map that contains mapped value pairs
+ * @param string String to find/create mapping for.
+ * @param dimension Index of the dimension of the string.
+ * @param maps Unordered map given by the DatasetMapper.
+ * @param types Vector containing the type information about each dimensions.
+ */
template <typename MapType>
- mapped_type MapString(const std::string& string,
- const size_t dimension,
- MapType maps,
- std::vector<Datatype>& types)
+ MappedType MapString(const std::string& string,
+ const size_t dimension,
+ MapType maps,
+ std::vector<Datatype>& types)
{
// If this condition is true, either we have no mapping for the given string
// or we have no mappings for the given dimension at all. In either case,
@@ -57,7 +77,7 @@ class MissingPolicy
// This string does not exist yet.
size_t& numMappings = maps[dimension].second;
- typedef boost::bimap<std::string, mapped_type>::value_type PairType;
+ typedef boost::bimap<std::string, MappedType>::value_type PairType;
maps[dimension].first.insert(PairType(string, NaN));
++numMappings;
@@ -65,11 +85,28 @@ class MissingPolicy
}
else
{
- // This string already exists in the mapping or .
+ // This string already exists in the mapping
+ // or not included in missingSet.
return NaN;
}
}
+ /**
+ * MapTokens turns vector of strings into numeric variables and puts them
+ * into a given matrix. It is used as a helper function when trying to load
+ * files. Each dimension's tokens are given in to this function. If one of the
+ * tokens turns out to be a string or one of the missingSet's variables, only
+ * the token responsible for it should be mapped using the MapString()
+ * funciton.
+ *
+ * @tparam eT Type of armadillo matrix.
+ * @tparam MapType Type of unordered_map that contains mapped value pairs.
+ * @param tokens Vector of variables inside a dimension.
+ * @param row Position of the given tokens.
+ * @param matrix Matrix to save the data into.
+ * @param maps Maps given by the DatasetMapper class.
+ * @param types Types of each dimensions given by the DatasetMapper class.
+ */
template <typename eT, typename MapType>
void MapTokens(const std::vector<std::string>& tokens,
size_t& row,
@@ -77,9 +114,14 @@ class MissingPolicy
MapType& maps,
std::vector<Datatype>& types)
{
+ // MissingPolicy allows double type matrix only, because it uses NaN.
+ static_assert(std::is_same<eT, double>::value, "You must use double type "
+ " matrix in order to apply MissingPolicy");
+
std::stringstream token;
for (size_t i = 0; i != tokens.size(); ++i)
{
+ // if token is a number, but is included in the missingSet, map it.
if (missingSet.find(tokens[i]) != std::end(missingSet))
{
const eT val = static_cast<eT>(this->MapString(tokens[i], row, maps,
@@ -88,7 +130,8 @@ class MissingPolicy
}
token.str(tokens[i]);
token>>matrix.at(row, i);
- if (token.fail()) // if not number, map it to datasetmapper
+ // if the token is not number, map it.
+ if (token.fail())
{
const eT val = static_cast<eT>(this->MapString(tokens[i], row, maps,
types));
@@ -99,6 +142,8 @@ class MissingPolicy
}
private:
+ // Note that missingSet and maps are different.
+ // missingSet specifies which value/string should be mapped.
std::set<std::string> missingSet;
}; // class MissingPolicy
diff --git a/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp b/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp
index a0b0a13..603cdcc 100644
--- a/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp
+++ b/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp
@@ -7,7 +7,7 @@
*/
#include <mlpack/core.hpp>
#include <mlpack/core/data/imputer.hpp>
-#include <mlpack/core/data/dataset_info.hpp>
+#include <mlpack/core/data/dataset_mapper.hpp>
#include <mlpack/core/data/map_policies/increment_policy.hpp>
#include <mlpack/core/data/map_policies/missing_policy.hpp>
#include <mlpack/core/data/imputation_methods/mean_imputation.hpp>
@@ -19,8 +19,8 @@ PROGRAM_INFO("Impute Data", "This utility takes a dataset and converts user "
"defined missing variable to another to provide more meaningful analysis "
"\n\n"
"The program does not modify the original file, but instead makes a "
- "separate file to save the output data; The program requires you to "
- "specify the file name with --output_file (-o)."
+ "separate file to save the output data; You can save the output by "
+ "specifying the file name with --output_file (-o)."
"\n\n"
"For example, if we consider 'NULL' in dimension 0 to be a missing "
"variable and want to delete whole row containing the NULL in the "
@@ -43,7 +43,6 @@ using namespace data;
int main(int argc, char** argv)
{
- // Parse command line options.
CLI::ParseCommandLine(argc, argv);
const string inputFile = CLI::GetParam<string>("input_file");
@@ -53,21 +52,25 @@ int main(int argc, char** argv)
const size_t dimension = (size_t) CLI::GetParam<int>("dimension");
string strategy = CLI::GetParam<string>("strategy");
- // missing value should be specified
+ // The program needs user-defined missing values.
+ // Missing values can be any list of strings such as "1", "a", "NULL".
if (!CLI::HasParam("missing_value"))
Log::Fatal << "--missing_value must be specified in order to perform "
<< "any imputation strategies." << endl;
- // warn if user did not specify output_file
+ if (!CLI::HasParam("strategy"))
+ Log::Fatal << "--strategy must be specified in order to perform "
+ << "imputation."<< endl;
+
if (!CLI::HasParam("output_file"))
Log::Warn << "--output_file is not specified, no "
<< "results from this program will be saved!" << endl;
- // warn if user did not specify dimension
if (!CLI::HasParam("dimension"))
- Log::Warn << "--dimension is required to be specified!" << endl;
+ Log::Warn << "--dimension is not specified, the imputation will be "
+ << "applied to all dimensions."<< endl;
- // if custom value is specified, and imputation strategy is not,
+ // If custom value is specified, and imputation strategy is not,
// set imputation strategy to "custom"
if (CLI::HasParam("custom_value") && !CLI::HasParam("impute_strategy"))
{
@@ -76,7 +79,7 @@ int main(int argc, char** argv)
<< "--impute_strategy is automatically set to 'custom'." << endl;
}
- // custom value and any other impute strategies cannot be specified at
+ // Custom value and any other impute strategies cannot be specified at
// the same time.
if (CLI::HasParam("custom_value") && CLI::HasParam("impute_strategy") &&
strategy != "custom")
@@ -89,6 +92,7 @@ int main(int argc, char** argv)
<< "'custom' strategy" << endl;
arma::mat input;
+ arma::mat output;
// Policy tells how the DatasetMapper should map the values.
std::set<std::string> missingSet;
missingSet.insert(missingValue);
@@ -98,9 +102,6 @@ int main(int argc, char** argv)
Load(inputFile, input, info, true, true);
- // for testing purpose
- Log::Info << input << endl;
-
// print how many mapping exist in each dimensions
for (size_t i = 0; i < input.n_rows; ++i)
{
@@ -108,49 +109,68 @@ int main(int argc, char** argv)
<< endl;
}
- arma::Mat<double> output(input);
-
- Log::Info << "Performing '" << strategy << "' imputation strategy "
- << "on dimension '" << dimension << "'." << endl;
-
- // custom strategy only
- if (strategy == "custom")
+ // default imputer is mean imputation (to provide scope)
+ Imputer<double, MapperType, MeanImputation<double>> impu(info);
+ if (strategy == "median")
+ {
+ Imputer<double, MapperType, MedianImputation<double>> impu(info);
+ }
+ else if (strategy == "listwise_deletion")
{
- Log::Info << "Replacing all '" << missingValue << "' with '" << customValue
- << "'." << endl;
- Imputer<double, MapperType, CustomImputation<double>> impu(info);
- impu.Impute(input, output, missingValue, customValue, dimension);
+ Imputer<double, MapperType, ListwiseDeletion<double>> impu(info);
}
else
{
- Log::Info << "Replacing all '" << missingValue << "' with '"
- << strategy << "' strategy." << endl;
+ Log::Fatal << "'" << strategy << "' imputation strategy does not exist"
+ << endl;
+ }
- if (strategy == "mean")
+ // Initialize imputer class
+
+ if (CLI::HasParam("dimension"))
+ {
+ // when --dimension is specified,
+ // the program will apply the changes to only the given dimension.
+ Log::Info << "Performing '" << strategy << "' imputation strategy "
+ << "to replace '" << missingValue << "' on all dimensions." << endl;
+
+ if (strategy == "custom")
{
- Imputer<double, MapperType, MeanImputation<double>> impu(info);
- impu.Impute(input, output, missingValue, dimension);
+ Imputer<double, MapperType, CustomImputation<double>> impu(info);
+ impu.Impute(input, output, missingValue, customValue, dimension);
}
- else if (strategy == "median")
+ else
{
- Imputer<double, MapperType, MedianImputation<double>> impu(info);
impu.Impute(input, output, missingValue, dimension);
}
- else if (strategy == "listwise_deletion")
+ }
+ else
+ {
+ // when --dimension is not specified,
+ // the program will apply the changes to all dimensions.
+ Log::Info << "Performing '" << strategy << "' imputation strategy "
+ << "to replace '" << missingValue << "' on all dimensions." << endl;
+
+ if (strategy == "custom")
{
- Imputer<double, MapperType, ListwiseDeletion<double>> impu(info);
- impu.Impute(input, output, missingValue, dimension);
+ Imputer<double, MapperType, CustomImputation<double>> impu(info);
+ for (size_t i = 0; i < input.n_rows; ++i)
+ {
+ impu.Impute(input, output, missingValue, customValue, i);
+ }
}
else
{
- Log::Warn << "You did not choose any imputation strategy" << endl;
+ for (size_t i = 0; i < input.n_rows; ++i)
+ {
+ impu.Impute(input, output, missingValue, i);
+ }
}
}
-
if (!outputFile.empty())
{
- Log::Info << "Saving model to '" << outputFile << "'." << endl;
+ Log::Info << "Saving results to '" << outputFile << "'." << endl;
Save(outputFile, output, false);
}
}
diff --git a/src/mlpack/tests/imputation_test.cpp b/src/mlpack/tests/imputation_test.cpp
index f02e97e..e118bfb 100644
--- a/src/mlpack/tests/imputation_test.cpp
+++ b/src/mlpack/tests/imputation_test.cpp
@@ -7,7 +7,7 @@
#include <sstream>
#include <mlpack/core.hpp>
-#include <mlpack/core/data/dataset_info.hpp>
+#include <mlpack/core/data/dataset_mapper.hpp>
#include <mlpack/core/data/map_policies/increment_policy.hpp>
#include <mlpack/core/data/map_policies/missing_policy.hpp>
#include <mlpack/core/data/imputer.hpp>
More information about the mlpack-git
mailing list