[mlpack-git] master: write binarize docs (b54902e)
gitdub at mlpack.org
gitdub at mlpack.org
Sat Jun 18 04:16:25 EDT 2016
Repository : https://github.com/mlpack/mlpack
On branch : master
Link : https://github.com/mlpack/mlpack/compare/a0b31abe5ff69117645c664dbeac1476dd5e48f7...2da9c5bac14a00145c757b8139c245913b86e034
>---------------------------------------------------------------
commit b54902e75ba9c88ab6f25660acde424db805ec3f
Author: Keon Kim <kwk236 at gmail.com>
Date: Sat Jun 18 17:16:25 2016 +0900
write binarize docs
>---------------------------------------------------------------
b54902e75ba9c88ab6f25660acde424db805ec3f
.../preprocess/preprocess_binarize_main.cpp | 52 ++++++++++++++--------
1 file changed, 33 insertions(+), 19 deletions(-)
diff --git a/src/mlpack/methods/preprocess/preprocess_binarize_main.cpp b/src/mlpack/methods/preprocess/preprocess_binarize_main.cpp
index 5a8b278..118fe7c 100644
--- a/src/mlpack/methods/preprocess/preprocess_binarize_main.cpp
+++ b/src/mlpack/methods/preprocess/preprocess_binarize_main.cpp
@@ -2,21 +2,40 @@
* @file preprocess_binarize_main.cpp
* @author Keon Kim
*
- * split data CLI executable
+ * binarize CLI executable
*/
#include <mlpack/core.hpp>
#include <mlpack/core/data/binarize.hpp>
-PROGRAM_INFO("Split Data", "This utility takes a dataset and optionally labels "
- "and splits ");
+PROGRAM_INFO("Binarize Data", "This utility takes a dataset and binarizes the "
+ "variables into either 0 or 1 given threshold. User can apply binarization "
+ "on a dimension or the whole dataset. A dimension can be specified using "
+ "--dimension (-d) option. Threshold can also be specified with the "
+ "--threshold (-t) option; The default is 0.0."
+ "\n\n"
+ "The program does not modify the original file, but instead makes a "
+ "separate file to save the binarized data; The program requires you to "
+ "specify the file name with --output_file (-o)."
+ "\n\n"
+ "For example, if we want to make all variables greater than 5 in dataset "
+ "to 1 and ones that are less than or equal to 5.0 to 0, and save the "
+ "result to result.csv, we could run"
+ "\n\n"
+ "$ mlpack_preprocess_binarize -i dataset.csv -t 5 -o result.csv"
+ "\n\n"
+ "But if we want to apply this to only the first (0th) dimension of the "
+ "dataset, we could run"
+ "\n\n"
+ "$ mlpack_preprocess_binarize -i dataset.csv -t 5 -d 0 -o result.csv");
// Define parameters for data.
PARAM_STRING_REQ("input_file", "File containing data,", "i");
// Define optional parameters.
PARAM_STRING("output_file", "File to save the output,", "o", "");
-PARAM_INT("feature", "File containing labels", "f", 0);
-PARAM_DOUBLE("threshold", "Ratio of test set, if not set,"
- "the threshold defaults to 0.0", "t", 0.0);
+PARAM_INT("dimension", "Dimension to apply the binarization. If not set, the "
+ "program will binarize every dimension by default", "d", 0);
+PARAM_DOUBLE("threshold", "Threshold to be applied for binarization. If not "
+ "set, the threshold defaults to 0.0", "t", 0.0);
using namespace mlpack;
using namespace arma;
@@ -28,21 +47,21 @@ int main(int argc, char** argv)
CLI::ParseCommandLine(argc, argv);
const string inputFile = CLI::GetParam<string>("input_file");
const string outputFile = CLI::GetParam<string>("output_file");
- const size_t feature = (size_t) CLI::GetParam<int>("feature");
+ const size_t dimension = (size_t) CLI::GetParam<int>("dimension");
const double threshold = CLI::GetParam<double>("threshold");
// Check on data parameters.
- if (!CLI::HasParam("feature"))
- Log::Warn << "You did not specify --feature, so the program will perform "
- << "binarize on every features." << endl;
+ if (!CLI::HasParam("dimension"))
+ Log::Warn << "You did not specify --dimension, so the program will perform "
+ << "binarize on every dimensions." << endl;
if (!CLI::HasParam("threshold"))
Log::Warn << "You did not specify --threshold, so the threhold "
- << "will be automatically set to '0.0'." << endl;
+ << "will be automatically set to '0.0'." << endl;
if (!CLI::HasParam("output_file"))
Log::Warn << "You did not specify --output_file, so no result will be"
- << "saved." << endl;
+ << "saved." << endl;
// Load the data.
arma::mat input;
@@ -50,9 +69,9 @@ int main(int argc, char** argv)
data::Load(inputFile, input, true);
Timer::Start("binarize");
- if (CLI::HasParam("feature"))
+ if (CLI::HasParam("dimension"))
{
- data::Binarize<double>(input, output, threshold, feature);
+ data::Binarize<double>(input, output, threshold, dimension);
}
else
{
@@ -61,11 +80,6 @@ int main(int argc, char** argv)
}
Timer::Stop("binarize");
- Log::Info << "input" << endl;
- Log::Info << input << endl;
- Log::Info << "output" << endl;
- Log::Info << output << endl;
-
if (CLI::HasParam("output_file"))
data::Save(outputFile, output, false);
}
More information about the mlpack-git mailing list