[mlpack-git] master: add imputer doc (bc187ca)

gitdub at mlpack.org gitdub at mlpack.org
Mon Jul 25 12:18:50 EDT 2016


Repository : https://github.com/mlpack/mlpack
On branch  : master
Link       : https://github.com/mlpack/mlpack/compare/ecbfd24defe31d9f39708c0b4c6ad352cd46ed5c...7eec0609aa21cb12aeed3cbcaa1e411dad0359f2

>---------------------------------------------------------------

commit bc187cab7bb3d8847a2bd60343eee8eb7719118e
Author: Keon Kim <kwk236 at gmail.com>
Date:   Sat Jul 2 02:14:34 2016 +0900

    add imputer doc


>---------------------------------------------------------------

bc187cab7bb3d8847a2bd60343eee8eb7719118e
 src/mlpack/core/data/load.hpp                      | 46 +++++++++++++++++++---
 src/mlpack/core/data/load_impl.hpp                 |  2 +-
 .../core/data/map_policies/missing_policy.hpp      |  1 -
 .../methods/preprocess/preprocess_imputer_main.cpp | 33 ++++++++++------
 4 files changed, 62 insertions(+), 20 deletions(-)

diff --git a/src/mlpack/core/data/load.hpp b/src/mlpack/core/data/load.hpp
index 476c3ad..da770b4 100644
--- a/src/mlpack/core/data/load.hpp
+++ b/src/mlpack/core/data/load.hpp
@@ -61,10 +61,10 @@ bool Load(const std::string& filename,
 
 /**
  * Loads a matrix from a file, guessing the filetype from the extension and
- * mapping categorical features with a DatasetInfo object.  This will transpose
- * the matrix (unless the transpose parameter is set to false).  This particular
- * overload of Load() can only load text-based formats, such as those given
- * below:
+ * mapping categorical features with a DatasetMapper object.  This will
+ * transpose the matrix (unless the transpose parameter is set to false).
+ * This particular overload of Load() can only load text-based formats, such as
+ * those given below:
  *
  * - CSV (csv_ascii), denoted by .csv, or optionally .txt
  * - TSV (raw_ascii), denoted by .tsv, .csv, or .txt
@@ -81,12 +81,12 @@ bool Load(const std::string& filename,
  * mlpack requires column-major matrices, this should be left at its default
  * value of 'true'.
  *
- * The DatasetInfo object passed to this function will be re-created, so any
+ * The DatasetMapper object passed to this function will be re-created, so any
  * mappings from previous loads will be lost.
  *
  * @param filename Name of file to load.
  * @param matrix Matrix to load contents of file into.
- * @param info DatasetInfo object to populate with mappings and data types.
+ * @param info DatasetMapper object to populate with mappings and data types.
  * @param fatal If an error should be reported as fatal (default false).
  * @param transpose If true, transpose the matrix after loading.
  * @return Boolean value indicating success or failure of load.
@@ -98,6 +98,40 @@ bool Load(const std::string& filename,
           const bool fatal = false,
           const bool transpose = true);
 
+/**
+ * Loads a matrix from a file, guessing the filetype from the extension and
+ * mapping categorical features with a DatasetMapper object.  This will
+ * transpose the matrix (unless the transpose parameter is set to false).
+ * This particular overload of Load() can only load text-based formats, such as
+ * those given below:
+ *
+ * - CSV (csv_ascii), denoted by .csv, or optionally .txt
+ * - TSV (raw_ascii), denoted by .tsv, .csv, or .txt
+ * - ASCII (raw_ascii), denoted by .txt
+ *
+ * If the file extension is not one of those types, an error will be given.
+ * This is preferable to Armadillo's default behavior of loading an unknown
+ * filetype as raw_binary, which can have very confusing effects.
+ *
+ * If the parameter 'fatal' is set to true, a std::runtime_error exception will
+ * be thrown if the matrix does not load successfully.  The parameter
+ * 'transpose' controls whether or not the matrix is transposed after loading.
+ * In most cases, because data is generally stored in a row-major format and
+ * mlpack requires column-major matrices, this should be left at its default
+ * value of 'true'.
+ *
+ * The DatasetMapper object passed to this function will be re-created, so any
+ * mappings from previous loads will be lost.  The policy is passed to the
+ * constructor of DatasetMapper to create a new instance.
+ *
+ * @param filename Name of file to load.
+ * @param matrix Matrix to load contents of file into.
+ * @param info DatasetMapper object to populate with mappings and data types.
+ * @param policy Policy class that decides how the DatasetMapper should map.
+ * @param fatal If an error should be reported as fatal (default false).
+ * @param transpose If true, transpose the matrix after loading.
+ * @return Boolean value indicating success or failure of load.
+ */
 template<typename eT, typename PolicyType>
 bool Load(const std::string& filename,
           arma::Mat<eT>& matrix,
diff --git a/src/mlpack/core/data/load_impl.hpp b/src/mlpack/core/data/load_impl.hpp
index f1e7651..4f03221 100644
--- a/src/mlpack/core/data/load_impl.hpp
+++ b/src/mlpack/core/data/load_impl.hpp
@@ -369,7 +369,7 @@ bool Load(const std::string& filename,
   return success;
 }
 
-// Load with mappings.  Unfortunately we have to implement this ourselves.
+// Load with mappings and policy.
 template<typename eT, typename PolicyType>
 bool Load(const std::string& filename,
           arma::Mat<eT>& matrix,
diff --git a/src/mlpack/core/data/map_policies/missing_policy.hpp b/src/mlpack/core/data/map_policies/missing_policy.hpp
index 2611e17..a38d877 100644
--- a/src/mlpack/core/data/map_policies/missing_policy.hpp
+++ b/src/mlpack/core/data/map_policies/missing_policy.hpp
@@ -29,7 +29,6 @@ class MissingPolicy
   MissingPolicy()
   {
     Log::Debug << "MissingPolicy()" << std::endl;
-    missingSet.insert("a");
   }
 
   explicit MissingPolicy(std::set<std::string> missingSet) :
diff --git a/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp b/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp
index 687e78e..7334407 100644
--- a/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp
+++ b/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp
@@ -15,14 +15,23 @@
 #include <mlpack/core/data/imputation_methods/custom_imputation.hpp>
 #include <mlpack/core/data/imputation_methods/listwise_deletion.hpp>
 
-PROGRAM_INFO("Imputer", "This "
-    "utility takes an any type of data and provides "
-    "imputation strategies for missing data.");
+PROGRAM_INFO("Impute Data", "This utility takes a dataset and converts user "
+    "defined missing variable to another to provide more meaningful analysis "
+    "\n\n"
+    "The program does not modify the original file, but instead makes a "
+    "separate file to save the output data; The program requires you to "
+    "specify the file name with --output_file (-o)."
+    "\n\n"
+    "For example, if we consider 'NULL' in dimension 0 to be a missing "
+    "variable and want to delete whole row containing the NULL in the "
+    "column-wise dataset, and save the result to result.csv, we could run"
+    "\n\n"
+    "$ mlpack_preprocess_imputer -i dataset.csv -o result.csv -m NULL -d 0 \n"
+    "> -s listwise_deletion")
 
 PARAM_STRING_REQ("input_file", "File containing data,", "i");
 PARAM_STRING("output_file", "File to save output", "o", "");
 PARAM_STRING("missing_value", "User defined missing value", "m", "")
-PARAM_STRING("map_policy", "mapping policy to be used while loading", "p", "")
 PARAM_STRING("strategy", "imputation strategy to be applied", "s", "")
 PARAM_DOUBLE("custom_value", "user_defined custom value", "c", 0.0)
 PARAM_INT("dimension", "the dimension to apply imputation", "d", 0);
@@ -40,7 +49,6 @@ int main(int argc, char** argv)
   const string inputFile = CLI::GetParam<string>("input_file");
   const string outputFile = CLI::GetParam<string>("output_file");
   const string missingValue = CLI::GetParam<string>("missing_value");
-  const string mapPolicy = CLI::GetParam<string>("map_policy");
   const double customValue = CLI::GetParam<double>("custom_value");
   const size_t dimension = (size_t) CLI::GetParam<int>("dimension");
   string strategy = CLI::GetParam<string>("strategy");
@@ -81,12 +89,13 @@ int main(int argc, char** argv)
         << "'custom' strategy" << endl;
 
   arma::mat input;
-  // DatasetInfo holds how the DatasetMapper should map the values.
+  // The policy tells the DatasetMapper how to map the values.
   // can be specified by passing map_policy classes as template parameters
   // ex) DatasetMapper<IncrementPolicy> info;
   std::set<std::string> missingSet;
   missingSet.insert(missingValue);
   MissingPolicy policy(missingSet);
+  using MapperType = DatasetMapper<MissingPolicy>;
   DatasetMapper<MissingPolicy> info(policy);
 
   Load<double, MissingPolicy>(inputFile, input, info, policy, true, true);
@@ -104,14 +113,14 @@ int main(int argc, char** argv)
   arma::Mat<double> output(input);
 
   Log::Info << "Performing '" << strategy << "' imputation strategy "
-      << "on dimension '" << dimension << endl;
+      << "on dimension '" << dimension << "'." << endl;
 
   // custom strategy only
   if (strategy == "custom")
   {
     Log::Info << "Replacing all '" << missingValue << "' with '" << customValue
         << "'." << endl;
-    Imputer<double, MissingPolicy, CustomImputation<double>> impu(info);
+    Imputer<double, MapperType, CustomImputation<double>> impu(info);
     impu.Impute(input, output, missingValue, customValue, dimension);
   }
   else
@@ -121,17 +130,17 @@ int main(int argc, char** argv)
 
     if (strategy == "mean")
     {
-      Imputer<double, MissingPolicy, MeanImputation<double>> impu(info);
+      Imputer<double, MapperType, MeanImputation<double>> impu(info);
       impu.Impute(input, output, missingValue, dimension);
     }
     else if (strategy == "median")
     {
-      Imputer<double, MissingPolicy, MedianImputation<double>> impu(info);
+      Imputer<double, MapperType, MedianImputation<double>> impu(info);
       impu.Impute(input, output, missingValue, dimension);
     }
-    else if (strategy == "listwise")
+    else if (strategy == "listwise_deletion")
     {
-      Imputer<double, MissingPolicy, ListwiseDeletion<double>> impu(info);
+      Imputer<double, MapperType, ListwiseDeletion<double>> impu(info);
       impu.Impute(input, output, missingValue, dimension);
     }
     else




More information about the mlpack-git mailing list