[mlpack-git] master: add imputer doc (bc187ca)
gitdub at mlpack.org
gitdub at mlpack.org
Mon Jul 25 12:18:50 EDT 2016
Repository : https://github.com/mlpack/mlpack
On branch : master
Link : https://github.com/mlpack/mlpack/compare/ecbfd24defe31d9f39708c0b4c6ad352cd46ed5c...7eec0609aa21cb12aeed3cbcaa1e411dad0359f2
>---------------------------------------------------------------
commit bc187cab7bb3d8847a2bd60343eee8eb7719118e
Author: Keon Kim <kwk236 at gmail.com>
Date: Sat Jul 2 02:14:34 2016 +0900
add imputer doc
>---------------------------------------------------------------
bc187cab7bb3d8847a2bd60343eee8eb7719118e
src/mlpack/core/data/load.hpp | 46 +++++++++++++++++++---
src/mlpack/core/data/load_impl.hpp | 2 +-
.../core/data/map_policies/missing_policy.hpp | 1 -
.../methods/preprocess/preprocess_imputer_main.cpp | 33 ++++++++++------
4 files changed, 62 insertions(+), 20 deletions(-)
diff --git a/src/mlpack/core/data/load.hpp b/src/mlpack/core/data/load.hpp
index 476c3ad..da770b4 100644
--- a/src/mlpack/core/data/load.hpp
+++ b/src/mlpack/core/data/load.hpp
@@ -61,10 +61,10 @@ bool Load(const std::string& filename,
/**
* Loads a matrix from a file, guessing the filetype from the extension and
- * mapping categorical features with a DatasetInfo object. This will transpose
- * the matrix (unless the transpose parameter is set to false). This particular
- * overload of Load() can only load text-based formats, such as those given
- * below:
+ * mapping categorical features with a DatasetMapper object. This will
+ * transpose the matrix (unless the transpose parameter is set to false).
+ * This particular overload of Load() can only load text-based formats, such as
+ * those given below:
*
* - CSV (csv_ascii), denoted by .csv, or optionally .txt
* - TSV (raw_ascii), denoted by .tsv, .csv, or .txt
@@ -81,12 +81,12 @@ bool Load(const std::string& filename,
* mlpack requires column-major matrices, this should be left at its default
* value of 'true'.
*
- * The DatasetInfo object passed to this function will be re-created, so any
+ * The DatasetMapper object passed to this function will be re-created, so any
* mappings from previous loads will be lost.
*
* @param filename Name of file to load.
* @param matrix Matrix to load contents of file into.
- * @param info DatasetInfo object to populate with mappings and data types.
+ * @param info DatasetMapper object to populate with mappings and data types.
* @param fatal If an error should be reported as fatal (default false).
* @param transpose If true, transpose the matrix after loading.
* @return Boolean value indicating success or failure of load.
@@ -98,6 +98,40 @@ bool Load(const std::string& filename,
const bool fatal = false,
const bool transpose = true);
+/**
+ * Loads a matrix from a file, guessing the filetype from the extension and
+ * mapping categorical features with a DatasetMapper object. This will
+ * transpose the matrix (unless the transpose parameter is set to false).
+ * This particular overload of Load() can only load text-based formats, such as
+ * those given below:
+ *
+ * - CSV (csv_ascii), denoted by .csv, or optionally .txt
+ * - TSV (raw_ascii), denoted by .tsv, .csv, or .txt
+ * - ASCII (raw_ascii), denoted by .txt
+ *
+ * If the file extension is not one of those types, an error will be given.
+ * This is preferable to Armadillo's default behavior of loading an unknown
+ * filetype as raw_binary, which can have very confusing effects.
+ *
+ * If the parameter 'fatal' is set to true, a std::runtime_error exception will
+ * be thrown if the matrix does not load successfully. The parameter
+ * 'transpose' controls whether or not the matrix is transposed after loading.
+ * In most cases, because data is generally stored in a row-major format and
+ * mlpack requires column-major matrices, this should be left at its default
+ * value of 'true'.
+ *
+ * The DatasetMapper object passed to this function will be re-created, so any
+ * mappings from previous loads will be lost. The policy is passed to the
+ * constructor of DatasetMapper to create a new instance.
+ *
+ * @param filename Name of file to load.
+ * @param matrix Matrix to load contents of file into.
+ * @param info DatasetMapper object to populate with mappings and data types.
+ * @param policy Policy class that decides how the DatasetMapper should map.
+ * @param fatal If an error should be reported as fatal (default false).
+ * @param transpose If true, transpose the matrix after loading.
+ * @return Boolean value indicating success or failure of load.
+ */
template<typename eT, typename PolicyType>
bool Load(const std::string& filename,
arma::Mat<eT>& matrix,
diff --git a/src/mlpack/core/data/load_impl.hpp b/src/mlpack/core/data/load_impl.hpp
index f1e7651..4f03221 100644
--- a/src/mlpack/core/data/load_impl.hpp
+++ b/src/mlpack/core/data/load_impl.hpp
@@ -369,7 +369,7 @@ bool Load(const std::string& filename,
return success;
}
-// Load with mappings. Unfortunately we have to implement this ourselves.
+// Load with mappings and policy.
template<typename eT, typename PolicyType>
bool Load(const std::string& filename,
arma::Mat<eT>& matrix,
diff --git a/src/mlpack/core/data/map_policies/missing_policy.hpp b/src/mlpack/core/data/map_policies/missing_policy.hpp
index 2611e17..a38d877 100644
--- a/src/mlpack/core/data/map_policies/missing_policy.hpp
+++ b/src/mlpack/core/data/map_policies/missing_policy.hpp
@@ -29,7 +29,6 @@ class MissingPolicy
MissingPolicy()
{
Log::Debug << "MissingPolicy()" << std::endl;
- missingSet.insert("a");
}
explicit MissingPolicy(std::set<std::string> missingSet) :
diff --git a/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp b/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp
index 687e78e..7334407 100644
--- a/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp
+++ b/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp
@@ -15,14 +15,23 @@
#include <mlpack/core/data/imputation_methods/custom_imputation.hpp>
#include <mlpack/core/data/imputation_methods/listwise_deletion.hpp>
-PROGRAM_INFO("Imputer", "This "
- "utility takes an any type of data and provides "
- "imputation strategies for missing data.");
+PROGRAM_INFO("Impute Data", "This utility takes a dataset and converts user "
+ "defined missing variable to another to provide more meaningful analysis "
+ "\n\n"
+ "The program does not modify the original file, but instead makes a "
+ "separate file to save the output data; The program requires you to "
+ "specify the file name with --output_file (-o)."
+ "\n\n"
+ "For example, if we consider 'NULL' in dimension 0 to be a missing "
+ "variable and want to delete whole row containing the NULL in the "
+ "column-wise dataset, and save the result to result.csv, we could run"
+ "\n\n"
+ "$ mlpack_preprocess_imputer -i dataset.csv -o result.csv -m NULL -d 0 \n"
+ "> -s listwise_deletion")
PARAM_STRING_REQ("input_file", "File containing data,", "i");
PARAM_STRING("output_file", "File to save output", "o", "");
PARAM_STRING("missing_value", "User defined missing value", "m", "")
-PARAM_STRING("map_policy", "mapping policy to be used while loading", "p", "")
PARAM_STRING("strategy", "imputation strategy to be applied", "s", "")
PARAM_DOUBLE("custom_value", "user_defined custom value", "c", 0.0)
PARAM_INT("dimension", "the dimension to apply imputation", "d", 0);
@@ -40,7 +49,6 @@ int main(int argc, char** argv)
const string inputFile = CLI::GetParam<string>("input_file");
const string outputFile = CLI::GetParam<string>("output_file");
const string missingValue = CLI::GetParam<string>("missing_value");
- const string mapPolicy = CLI::GetParam<string>("map_policy");
const double customValue = CLI::GetParam<double>("custom_value");
const size_t dimension = (size_t) CLI::GetParam<int>("dimension");
string strategy = CLI::GetParam<string>("strategy");
@@ -81,12 +89,13 @@ int main(int argc, char** argv)
<< "'custom' strategy" << endl;
arma::mat input;
- // DatasetInfo holds how the DatasetMapper should map the values.
+ // The policy tells the DatasetMapper how to map the values.
// It can be specified by passing map_policy classes as template parameters,
// e.g. DatasetMapper<IncrementPolicy> info;
std::set<std::string> missingSet;
missingSet.insert(missingValue);
MissingPolicy policy(missingSet);
+ using MapperType = DatasetMapper<MissingPolicy>;
DatasetMapper<MissingPolicy> info(policy);
Load<double, MissingPolicy>(inputFile, input, info, policy, true, true);
@@ -104,14 +113,14 @@ int main(int argc, char** argv)
arma::Mat<double> output(input);
Log::Info << "Performing '" << strategy << "' imputation strategy "
- << "on dimension '" << dimension << endl;
+ << "on dimension '" << dimension << "'." << endl;
// custom strategy only
if (strategy == "custom")
{
Log::Info << "Replacing all '" << missingValue << "' with '" << customValue
<< "'." << endl;
- Imputer<double, MissingPolicy, CustomImputation<double>> impu(info);
+ Imputer<double, MapperType, CustomImputation<double>> impu(info);
impu.Impute(input, output, missingValue, customValue, dimension);
}
else
@@ -121,17 +130,17 @@ int main(int argc, char** argv)
if (strategy == "mean")
{
- Imputer<double, MissingPolicy, MeanImputation<double>> impu(info);
+ Imputer<double, MapperType, MeanImputation<double>> impu(info);
impu.Impute(input, output, missingValue, dimension);
}
else if (strategy == "median")
{
- Imputer<double, MissingPolicy, MedianImputation<double>> impu(info);
+ Imputer<double, MapperType, MedianImputation<double>> impu(info);
impu.Impute(input, output, missingValue, dimension);
}
- else if (strategy == "listwise")
+ else if (strategy == "listwise_deletion")
{
- Imputer<double, MissingPolicy, ListwiseDeletion<double>> impu(info);
+ Imputer<double, MapperType, ListwiseDeletion<double>> impu(info);
impu.Impute(input, output, missingValue, dimension);
}
else
More information about the mlpack-git
mailing list