[mlpack-git] master: add cli executable for data_split (a35c390)

gitdub at mlpack.org gitdub at mlpack.org
Thu May 26 07:56:15 EDT 2016


Repository : https://github.com/mlpack/mlpack
On branch  : master
Link       : https://github.com/mlpack/mlpack/compare/1f562a1aba7ae55475afcc95659511c2b7f694e5...5b8fdce471328f722fcd8c0f22a6d995ce22c98b

>---------------------------------------------------------------

commit a35c39061245b0ccb757f65dfe6626b8a3c04d9c
Author: Keon Kim <kwk236 at gmail.com>
Date:   Thu May 26 20:25:35 2016 +0900

    add cli executable for data_split


>---------------------------------------------------------------

a35c39061245b0ccb757f65dfe6626b8a3c04d9c
 src/mlpack/methods/CMakeLists.txt                  |  1 +
 .../{adaboost => preprocess}/CMakeLists.txt        |  9 ++-
 .../methods/preprocess/preprocess_split_main.cpp   | 77 ++++++++++++++++++++++
 3 files changed, 82 insertions(+), 5 deletions(-)

diff --git a/src/mlpack/methods/CMakeLists.txt b/src/mlpack/methods/CMakeLists.txt
index 209beef..5734d5c 100644
--- a/src/mlpack/methods/CMakeLists.txt
+++ b/src/mlpack/methods/CMakeLists.txt
@@ -15,6 +15,7 @@ endmacro ()
 
 # Recurse into each method mlpack provides.
 set(DIRS
+  preprocess
   adaboost
   amf
   ann
diff --git a/src/mlpack/methods/adaboost/CMakeLists.txt b/src/mlpack/methods/preprocess/CMakeLists.txt
similarity index 73%
copy from src/mlpack/methods/adaboost/CMakeLists.txt
copy to src/mlpack/methods/preprocess/CMakeLists.txt
index af2c59a..3a2f7bf 100644
--- a/src/mlpack/methods/adaboost/CMakeLists.txt
+++ b/src/mlpack/methods/preprocess/CMakeLists.txt
@@ -1,10 +1,6 @@
-cmake_minimum_required(VERSION 2.8)
-
 # Define the files we need to compile.
 # Anything not in this list will not be compiled into mlpack.
 set(SOURCES
-  adaboost.hpp
-  adaboost_impl.hpp
 )
 
 # Add directory name to sources.
@@ -16,4 +12,7 @@ endforeach()
 # the parent scope).
 set(MLPACK_SRCS ${MLPACK_SRCS} ${DIR_SRCS} PARENT_SCOPE)
 
-add_cli_executable (adaboost)
+#add_cli_executable(preprocess_stats)
+add_cli_executable(preprocess_split)
+#add_cli_executable(preprocess_scan)
+#add_cli_executable(preprocess_imputer)
diff --git a/src/mlpack/methods/preprocess/preprocess_split_main.cpp b/src/mlpack/methods/preprocess/preprocess_split_main.cpp
new file mode 100644
index 0000000..996272a
--- /dev/null
+++ b/src/mlpack/methods/preprocess/preprocess_split_main.cpp
@@ -0,0 +1,77 @@
+/**
+ * @file preprocess_split_main.cpp
+ * @author Keon Woo Kim
+ *
+ * CLI executable that splits data and labels into training and test sets.
+ */
+#include <mlpack/core.hpp>
+#include <mlpack/core/data/split_data.hpp>
+
+PROGRAM_INFO("Split into Train and Test Data", "This "
+    "utility takes data and labels and splits them into a training "
+    "set and a test set.");
+
+// Define the data parameters: the input file and the train/test output files.
+PARAM_STRING_REQ("input_file", "File containing data", "i");
+PARAM_STRING_REQ("output_train_data", "File name to save train data", "d");
+PARAM_STRING_REQ("output_test_data", "File name to save test data", "D");
+
+// Define the label parameters: the input file and the train/test output files.
+PARAM_STRING_REQ("input_label", "File containing labels", "I");
+PARAM_STRING_REQ("output_train_label", "File name to save train label", "l");
+PARAM_STRING_REQ("output_test_label", "File name to save test label", "L");
+
+// Define optional test ratio, default is 0.2 (Test 20% Train 80%)
+PARAM_DOUBLE("test_ratio", "Ratio of test set, defaults to 0.2 "
+    "if not set", "r", 0.2);
+
+using namespace mlpack;
+using namespace arma;
+using namespace std;
+
+int main(int argc, char** argv)
+{
+  // Parse command line options.
+  CLI::ParseCommandLine(argc, argv);
+
+  // Input and output file names for the data.
+  const string inputFile = CLI::GetParam<string>("input_file");
+  const string outputTrainData = CLI::GetParam<string>("output_train_data");
+  const string outputTestData = CLI::GetParam<string>("output_test_data");
+  // Input and output file names for the labels.
+  const string inputLabel = CLI::GetParam<string>("input_label");
+  const string outputTrainLabel = CLI::GetParam<string>("output_train_label");
+  const string outputTestLabel = CLI::GetParam<string>("output_test_label");
+
+  // Fraction of the points that goes to the test set.  Reject values outside
+  // [0, 1] before any file is read: they cannot describe a valid split.
+  const double testRatio = CLI::GetParam<double>("test_ratio");
+  if (testRatio < 0.0 || testRatio > 1.0)
+  {
+    Log::Fatal << "Invalid parameter for test_ratio; it must be between 0.0 "
+        << "and 1.0." << endl;
+  }
+
+  // Containers for the input data and labels.
+  arma::mat data;
+  arma::Mat<size_t> labels;
+
+  // Load the data and labels, passing true as the third argument to Load().
+  data::Load(inputFile, data, true);
+  data::Load(inputLabel, labels, true);
+  arma::Row<size_t> labels_row = labels.row(0); // Only the first row is used.
+
+  // Split into (train data, test data, train labels, test labels).
+  const auto value = data::TrainTestSplit(data, labels_row, testRatio);
+  Log::Info << "Train Data Count: " << get<0>(value).n_cols << endl;
+  Log::Info << "Test Data Count: " << get<1>(value).n_cols << endl;
+  Log::Info << "Train Label Count: " << get<2>(value).n_cols << endl;
+  Log::Info << "Test Label Count: " << get<3>(value).n_cols << endl;
+
+  // Save the four pieces to their respective output files.
+  data::Save(outputTrainData, get<0>(value), false);
+  data::Save(outputTestData, get<1>(value), false);
+  data::Save(outputTrainLabel, get<2>(value), false);
+  data::Save(outputTestLabel, get<3>(value), false);
+}
+




More information about the mlpack-git mailing list