[mlpack-git] master: Update some documentation, change --input_labels to --input_labels_file. (6a94eb6)
gitdub at mlpack.org
gitdub at mlpack.org
Thu Jun 2 14:54:46 EDT 2016
Repository : https://github.com/mlpack/mlpack
On branch : master
Link : https://github.com/mlpack/mlpack/compare/1f562a1aba7ae55475afcc95659511c2b7f694e5...5b8fdce471328f722fcd8c0f22a6d995ce22c98b
>---------------------------------------------------------------
commit 6a94eb6efb37eca09e95fd53bd7c21334abf7614
Author: Ryan Curtin <ryan at ratml.org>
Date: Thu Jun 2 14:54:46 2016 -0400
Update some documentation, change --input_labels to --input_labels_file.
>---------------------------------------------------------------
6a94eb6efb37eca09e95fd53bd7c21334abf7614
.../methods/preprocess/preprocess_split_main.cpp | 125 ++++++++++++---------
1 file changed, 73 insertions(+), 52 deletions(-)
diff --git a/src/mlpack/methods/preprocess/preprocess_split_main.cpp b/src/mlpack/methods/preprocess/preprocess_split_main.cpp
index d24b8ff..1e063db 100644
--- a/src/mlpack/methods/preprocess/preprocess_split_main.cpp
+++ b/src/mlpack/methods/preprocess/preprocess_split_main.cpp
@@ -7,27 +7,47 @@
#include <mlpack/core.hpp>
#include <mlpack/core/data/split_data.hpp>
-PROGRAM_INFO("Split Data", "This utility takes data and split into a training "
- "set and a test set. Before the split happens, it shuffles the data in "
- "the each feature. Without (--test_ratio) specified, the default "
- "test-to-training ratio is set to 0.2."
+PROGRAM_INFO("Split Data", "This utility takes a dataset and optionally labels "
+ "and splits them into a training set and a test set. Before the split, the "
+ "points in the dataset are randomly reordered. The percentage of the "
+ "dataset to be used as the test set can be specified with the --test_ratio "
+ "(-r) option; the default is 0.2 (20%)."
"\n\n"
- "The program does not modify or write on the original file, but instead "
- "makes a seperate files to save the training and test files; you can "
- "specify the file names with (-training_file) and (-test_file). If the "
- "names are not specified, the program automatically names the training "
- "and test file by attaching 'train_' and 'test_' in front of the "
- "original file name"
+ "The program does not modify the original file, but instead makes separate "
+ "files to save the training and test files; you can specify the file names "
+ "with --training_file (-t) and --test_file (-T). If these options are not "
+ "specified, the program automatically names the training and test file by "
+ "prepending 'train_' and 'test_' to the dataset filename (which was "
+ "specified by --input_file)."
"\n\n"
- "Optionally, a label can be also be splited along with the data at the "
- "same time by specifying (--input_lables) option. Splitting label works "
- "the same as splitting the data and you can also specify the names using "
- "(--trainning_labels_file) and (--test_labels_file).");
+ "Optionally, labels can be also be split along with the data by specifying "
+ "the --input_labels_file (-I) option. Splitting labels works the same way "
+ "as splitting the data. The output training and test labels will be saved "
+ "to the files specified by --training_labels_file (-l) and "
+ "--test_labels_file (-L), respectively. If these options are not specified,"
+ " then the program will automatically name the training labels and test "
+ "labels file by prepending 'train_' and 'test_' to the labels filename "
+ "(which was specified by --input_labels_file)."
+ "\n\n"
+ "So, a simple example where we want to split dataset.csv into "
+ "train_dataset.csv and test_dataset.csv with 60% of the data in the "
+ "training set and 40% of the dataset in the test set, we could run"
+ "\n\n"
+ "$ mlpack_preprocess_split -i dataset.csv -r 0.4"
+ "\n\n"
+ "If we had a dataset in dataset.csv and associated labels in labels.csv, "
+ "and we wanted to split these into training_set.csv, training_labels.csv, "
+ "test_set.csv, and test_labels.csv, with 30% of the data in the test set, "
+ "we could run"
+ "\n\n"
+ "$ mlpack_preprocess_split -i dataset.csv -I labels.csv -r 0.3\n"
+ "> -t training_set.csv -l training_labels.csv -T test_set.csv\n"
+ "> -L test_labels.csv");
-// Define parameters for data
+// Define parameters for data.
PARAM_STRING_REQ("input_file", "File containing data,", "i");
-// Define optional parameters
-PARAM_STRING("input_labels", "File containing labels", "I", "");
+// Define optional parameters.
+PARAM_STRING("input_labels_file", "File containing labels", "I", "");
PARAM_STRING("training_file", "File name to save train data", "t", "");
PARAM_STRING("test_file", "File name to save test data", "T", "");
PARAM_STRING("training_labels_file", "File name to save train label", "l", "");
@@ -46,44 +66,44 @@ int main(int argc, char** argv)
// Parse command line options.
CLI::ParseCommandLine(argc, argv);
const string inputFile = CLI::GetParam<string>("input_file");
- const string inputLabels = CLI::GetParam<string>("input_labels");
+ const string inputLabels = CLI::GetParam<string>("input_labels_file");
string trainingFile = CLI::GetParam<string>("training_file");
string testFile = CLI::GetParam<string>("test_file");
string trainingLabelsFile = CLI::GetParam<string>("training_labels_file");
string testLabelsFile = CLI::GetParam<string>("test_labels_file");
const double testRatio = CLI::GetParam<double>("test_ratio");
- // check on data parameters
+ // Check on data parameters.
if (trainingFile.empty())
{
trainingFile = "train_" + inputFile;
- Log::Warn << "You did not specify --training_file. "
- << "Training file name is automatically set to: "
- << trainingFile << endl;
+ Log::Warn << "You did not specify --training_file, so the training set file"
+ << " name will be automatically set to '" << trainingFile << "'."
+ << endl;
}
if (testFile.empty())
{
testFile = "test_" + inputFile;
- Log::Warn << "You did not specify --test_file. "
- << "Test file name is automatically set to: " << testFile << endl;
+ Log::Warn << "You did not specify --test_file, so the test set file name "
+ << "will be automatically set to '" << testFile << "'." << endl;
}
- // check on label parameters
+ // Check on label parameters.
if (!inputLabels.empty())
{
if (!CLI::HasParam("training_labels_file"))
{
trainingLabelsFile = "train_" + inputLabels;
- Log::Warn << "You did not specify --training_labels_file. "
- << "Training labels file name is automatically set to: "
- << trainingLabelsFile << endl;
+ Log::Warn << "You did not specify --training_labels_file, so the training"
+ << "set labels file name will be automatically set to '"
+ << trainingLabelsFile << "'." << endl;
}
if (!CLI::HasParam("test_labels_file"))
{
testLabelsFile = "test_" + inputLabels;
- Log::Warn << "You did not specify --test_labels_file. "
- << "Test labels file name is automatically set to: "
- << testLabelsFile << endl;
+ Log::Warn << "You did not specify --test_labels_file, so the test set "
+ << "labels file name will be automatically set to '"
+ << testLabelsFile << "'." << endl;
}
}
else
@@ -92,56 +112,57 @@ int main(int argc, char** argv)
|| CLI::HasParam("test_labels_file"))
{
Log::Fatal << "When specifying --training_labels_file or "
- << "test_labels_file, you must also specify --input_labels. " << endl;
+ << "--test_labels_file, you must also specify --input_labels. "
+ << endl;
}
}
- // check on test_ratio
+ // Check test_ratio.
if (CLI::HasParam("test_ratio"))
{
- //sanity check on test_ratio
if ((testRatio < 0.0) || (testRatio > 1.0))
{
- Log::Fatal << "Invalid parameter for test_ratio. "
- << "test_ratio must be between 0.0 and 1.0" << endl;
+ Log::Fatal << "Invalid parameter for test_ratio; "
+ << "--test_ratio must be between 0.0 and 1.0." << endl;
}
}
- else // if test_ratio is not set
+ else // If test_ratio is not set, warn the user.
{
- Log::Warn << "You did not specify --test_ratio_file. "
- << "Test ratio is automatically set to: 0.2"<< endl;
+ Log::Warn << "You did not specify --test_ratio, so it will be automatically"
+ << " set to 0.2." << endl;
}
- // load data
+ // Load the data.
arma::mat data;
data::Load(inputFile, data, true);
- // if parameters for labels exist
- if (CLI::HasParam("input_labels"))
+ // If parameters for labels exist, we must split the labels too.
+ if (CLI::HasParam("input_labels_file"))
{
arma::mat labels;
data::Load(inputLabels, labels, true);
- arma::rowvec labels_row = labels.row(0); // extract first row
+ arma::rowvec labelsRow = labels.row(0);
- const auto value = data::Split(data, labels_row, testRatio);
- Log::Info << "Train Data Count: " << get<0>(value).n_cols << endl;
- Log::Info << "Test Data Count: " << get<1>(value).n_cols << endl;
- Log::Info << "Train Label Count: " << get<2>(value).n_cols << endl;
- Log::Info << "Test Label Count: " << get<3>(value).n_cols << endl;
+ const auto value = data::Split(data, labelsRow, testRatio);
+ Log::Info << "Training data contains " << get<0>(value).n_cols << " points."
+ << endl;
+ Log::Info << "Test data contains " << get<1>(value).n_cols << " points."
+ << endl;
data::Save(trainingFile, get<0>(value), false);
data::Save(testFile, get<1>(value), false);
data::Save(trainingLabelsFile, get<2>(value), false);
data::Save(testLabelsFile, get<3>(value), false);
}
- else // split without parameters
+ else // We have no labels, so just split the dataset.
{
const auto value = data::Split(data, testRatio);
- Log::Info << "Train Data Count: " << get<0>(value).n_cols << endl;
- Log::Info << "Test Data Count: " << get<1>(value).n_cols << endl;
+ Log::Info << "Training data contains " << get<0>(value).n_cols << " points."
+ << endl;
+ Log::Info << "Test data contains " << get<1>(value).n_cols << " points."
+ << endl;
data::Save(trainingFile, get<0>(value), false);
data::Save(testFile, get<1>(value), false);
}
}
-
More information about the mlpack-git mailing list