[mlpack-git] master: optimize binarize and add binarize executable (cd0a377)

gitdub at mlpack.org gitdub at mlpack.org
Tue Jun 14 08:11:15 EDT 2016


Repository : https://github.com/mlpack/mlpack
On branch  : master
Link       : https://github.com/mlpack/mlpack/compare/a0b31abe5ff69117645c664dbeac1476dd5e48f7...2da9c5bac14a00145c757b8139c245913b86e034

>---------------------------------------------------------------

commit cd0a3770a12e31e7f443d2672cc48ca9ed6d80e4
Author: Keon Kim <kwk236 at gmail.com>
Date:   Tue Jun 14 20:17:49 2016 +0900

    optimize binarize and add binarize executable


>---------------------------------------------------------------

cd0a3770a12e31e7f443d2672cc48ca9ed6d80e4
 src/mlpack/core/data/binarize.hpp                  | 100 ++++-----------------
 src/mlpack/methods/preprocess/CMakeLists.txt       |   1 +
 .../preprocess/preprocess_binarize_main.cpp        |  71 +++++++++++++++
 .../methods/preprocess/preprocess_split_main.cpp   |   2 +-
 src/mlpack/tests/binarize_test.cpp                 |  68 +++++++-------
 5 files changed, 130 insertions(+), 112 deletions(-)

diff --git a/src/mlpack/core/data/binarize.hpp b/src/mlpack/core/data/binarize.hpp
index 13c0d19..38d22b6 100644
--- a/src/mlpack/core/data/binarize.hpp
+++ b/src/mlpack/core/data/binarize.hpp
@@ -9,41 +9,10 @@
 #define MLPACK_CORE_DATA_BINARIZE_HPP
 
 #include <mlpack/core.hpp>
+#include <omp.h>
 
 namespace mlpack {
 namespace data {
-/**
- * Given an input dataset and threshold, set values greater than threshold to
- * 1 and values less than or equal to the threshold to 0. This overload takes
- * a dimension and applys the changes to the given dimension.
- *
- * @code
- * arma::mat input = loadData();
- * double threshold = 0;
- * size_t dimension = 0;
- *
- * // Binarize the first dimension. All positive values in the first dimension
- * // will be set to 1 and the values less than or equal to 0 will become 0.
- * Binarize(input, threshold, dimension);
- * @endcode
- *
- * @param input Input matrix to Binarize.
- * @param threshold Threshold can by any number.
- * @param dimension Feature to apply the Binarize function.
- */
-template<typename T>
-void Binarize(arma::Mat<T>& input,
-              const double threshold,
-              const size_t dimension)
-{
-  for (size_t i = 0; i < input.n_cols; ++i)
-  {
-    if (input(dimension, i) > threshold)
-      input(dimension, i) = 1;
-    else
-      input(dimension, i) = 0;
-  }
-}
 
 /**
  * Given an input dataset and threshold, set values greater than threshold to
@@ -51,46 +20,13 @@ void Binarize(arma::Mat<T>& input,
  * the changes to all dimensions.
  *
  * @code
- * arma::mat input = loadData();
- * double threshold = 0;
- *
- * // Binarize the whole Matrix. All positive values in will be set to 1 and
- * // the values less than or equal to 0 will become 0.
- * Binarize(input, threshold);
- * @endcode
- *
- * @param input Input matrix to Binarize.
- * @param threshold Threshold can by any number.
- */
-template<typename T>
-void Binarize(arma::Mat<T>& input,
-              const double threshold)
-{
-  for (size_t i = 0; i < input.n_cols; ++i)
-  {
-    for (size_t j = 0; j < input.n_rows; ++j)
-    {
-      if (input(i, j) > threshold)
-        input(i, j) = 1;
-      else
-        input(i, j) = 0;
-    }
-  }
- }
-
-/**
- * Given an input dataset and threshold, set values greater than threshold to
- * 1 and values less than or equal to the threshold to 0. This overload applies
- * the changes to all dimensions.
- *
- * @code
- * arma::mat input = loadData();
- * arma::mat output;
+ * arma::Mat<double> input = loadData();
+ * arma::Mat<double> output;
  * double threshold = 0.5;
  *
  * // Binarize the whole Matrix. All positive values in will be set to 1 and
  * // the values less than or equal to 0.5 will become 0.
- * Binarize(input, output, threshold);
+ * Binarize<double>(input, output, threshold);
  * @endcode
  *
  * @param input Input matrix to Binarize.
@@ -104,15 +40,17 @@ void Binarize(const arma::Mat<T>& input,
 {
   output.copy_size(input);
 
-  for (size_t i = 0; i < input.n_cols; ++i)
+  const size_t totalElems = static_cast<size_t>(input.n_elem);
+  const T *inPtr = input.memptr();
+  T *outPtr = output.memptr();
+
+  #pragma omp parallel for
+  for (size_t i = 0; i < totalElems; ++i)
   {
-    for (size_t j = 0; j < input.n_rows; ++j)
-    {
-      if (input(i, j) > threshold)
-        output(i, j) = 1;
-      else
-        output(i, j) = 0;
-    }
+    // Only a value strictly greater than the threshold becomes 1; everything
+    // else, including a value equal to the threshold, becomes 0.
+    const bool above = inPtr[i] > threshold;
+    outPtr[i] = above ? 1 : 0;
   }
 }
 
@@ -122,14 +60,14 @@ void Binarize(const arma::Mat<T>& input,
  * a dimension and applys the changes to the given dimension.
  *
  * @code
- * arma::mat input = loadData();
- * arma::mat output;
+ * arma::Mat<double> input = loadData();
+ * arma::Mat<double> output;
  * double threshold = 0.5;
  * size_t dimension = 0;
  *
  * // Binarize the first dimension. All positive values in the first dimension
  * // will be set to 1 and the values less than or equal to 0 will become 0.
- * Binarize(input, output, threshold, dimension);
+ * Binarize<double>(input, output, threshold, dimension);
  * @endcode
  *
  * @param input Input matrix to Binarize.
@@ -137,15 +75,15 @@ void Binarize(const arma::Mat<T>& input,
  * @param threshold Threshold can by any number.
  * @param dimension Feature to apply the Binarize function.
  */
-
 template<typename T>
 void Binarize(const arma::Mat<T>& input,
               arma::Mat<T>& output,
               const double threshold,
               const size_t dimension)
 {
-  output(input);
+  output = input;
 
+  #pragma omp parallel for
   for (size_t i = 0; i < input.n_cols; ++i)
   {
     if (input(dimension, i) > threshold)
diff --git a/src/mlpack/methods/preprocess/CMakeLists.txt b/src/mlpack/methods/preprocess/CMakeLists.txt
index 3a2f7bf..b10c8ea 100644
--- a/src/mlpack/methods/preprocess/CMakeLists.txt
+++ b/src/mlpack/methods/preprocess/CMakeLists.txt
@@ -14,5 +14,6 @@ set(MLPACK_SRCS ${MLPACK_SRCS} ${DIR_SRCS} PARENT_SCOPE)
 
 #add_cli_executable(preprocess_stats)
 add_cli_executable(preprocess_split)
+add_cli_executable(preprocess_binarize)
 #add_cli_executable(preprocess_scan)
 #add_cli_executable(preprocess_imputer)
diff --git a/src/mlpack/methods/preprocess/preprocess_binarize_main.cpp b/src/mlpack/methods/preprocess/preprocess_binarize_main.cpp
new file mode 100644
index 0000000..5a8b278
--- /dev/null
+++ b/src/mlpack/methods/preprocess/preprocess_binarize_main.cpp
@@ -0,0 +1,71 @@
+/**
+ * @file preprocess_binarize_main.cpp
+ * @author Keon Kim
+ *
+ * binarize data CLI executable
+ */
+#include <mlpack/core.hpp>
+#include <mlpack/core/data/binarize.hpp>
+
+PROGRAM_INFO("Binarize Data", "This utility takes a dataset and binarizes "
+    "its values to 0 or 1 by comparing them against a threshold.");
+
+// Define parameters for data.
+PARAM_STRING_REQ("input_file", "File containing data.", "i");
+// Define optional parameters.
+PARAM_STRING("output_file", "File to save the output.", "o", "");
+PARAM_INT("feature", "Feature to binarize; all features if unset.", "f", 0);
+PARAM_DOUBLE("threshold", "Threshold to compare values against; if not set, "
+    "the threshold defaults to 0.0.", "t", 0.0);
+
+using namespace mlpack;
+using namespace arma;
+using namespace std;
+
+/**
+ * Binarize one feature, or every feature, of a dataset and optionally save it.
+ */
+int main(int argc, char** argv)
+{
+  // Parse command line options.
+  CLI::ParseCommandLine(argc, argv);
+  const string inputFile = CLI::GetParam<string>("input_file");
+  const string outputFile = CLI::GetParam<string>("output_file");
+  const int featureParam = CLI::GetParam<int>("feature");
+  const double threshold = CLI::GetParam<double>("threshold");
+
+  // Warn about each optional parameter that was left unset.
+  if (!CLI::HasParam("feature"))
+    Log::Warn << "You did not specify --feature, so the program will perform "
+              << "binarize on every feature." << endl;
+
+  if (!CLI::HasParam("threshold"))
+    Log::Warn << "You did not specify --threshold, so the threshold "
+              << "will be automatically set to '0.0'." << endl;
+
+  if (!CLI::HasParam("output_file"))
+    Log::Warn << "You did not specify --output_file, so no result will be "
+              << "saved." << endl;
+
+  // Load the data.
+  arma::mat input;
+  arma::mat output;
+  data::Load(inputFile, input, true);
+
+  // Reject a negative or out-of-range feature before it indexes a row.
+  if (CLI::HasParam("feature") &&
+      (featureParam < 0 || static_cast<size_t>(featureParam) >= input.n_rows))
+    Log::Fatal << "Invalid --feature " << featureParam << "; the dataset has "
+               << input.n_rows << " features." << endl;
+
+  Timer::Start("binarize");
+  if (CLI::HasParam("feature"))
+    data::Binarize<double>(input, output, threshold,
+        static_cast<size_t>(featureParam));
+  else
+    data::Binarize<double>(input, output, threshold); // Whole dataset.
+  Timer::Stop("binarize");
+
+  if (CLI::HasParam("output_file"))
+    data::Save(outputFile, output, false);
+}
diff --git a/src/mlpack/methods/preprocess/preprocess_split_main.cpp b/src/mlpack/methods/preprocess/preprocess_split_main.cpp
index 1e063db..f47c9e1 100644
--- a/src/mlpack/methods/preprocess/preprocess_split_main.cpp
+++ b/src/mlpack/methods/preprocess/preprocess_split_main.cpp
@@ -1,6 +1,6 @@
 /**
  * @file preprocess_split_main.cpp
- * @author Keon Woo Kim
+ * @author Keon Kim
  *
  * split data CLI executable
  */
diff --git a/src/mlpack/tests/binarize_test.cpp b/src/mlpack/tests/binarize_test.cpp
index 5ccaf38..8a98af4 100644
--- a/src/mlpack/tests/binarize_test.cpp
+++ b/src/mlpack/tests/binarize_test.cpp
@@ -17,42 +17,50 @@ using namespace mlpack::data;
 
 BOOST_AUTO_TEST_SUITE(BinarizeTest);
 
-/**
- * Compare the binarized data with answer.
- *
- * @param input The original data set before Binarize.
- * @param answer The data want to compare with the input.
- */
-void CheckAnswer(const mat& input,
-                 const umat& answer)
+BOOST_AUTO_TEST_CASE(BinerizeOneDimension)
 {
-  for (size_t i = 0; i < input.n_cols; ++i)
-  {
-    const mat& lhsCol = input.col(i);
-    const umat& rhsCol = answer.col(i);
-    for (size_t j = 0; j < lhsCol.n_rows; ++j)
-    {
-      if (std::abs(rhsCol(j)) < 1e-5)
-        BOOST_REQUIRE_SMALL(lhsCol(j), 1e-5);
-      else
-        BOOST_REQUIRE_CLOSE(lhsCol(j), rhsCol(j), 1e-5);
-    }
-  }
+  mat input;
+  input << 1 << 2 << 3 << endr
+        << 4 << 5 << 6 << endr // this row will be binarized
+        << 7 << 8 << 9;
+
+  mat output;
+  const double threshold = 5.0;
+  const size_t dimension = 1;
+  Binarize<double>(input, output, threshold, dimension);
+  // The result is written to output; only row 1 may differ from input.
+  BOOST_REQUIRE_CLOSE(output(0, 0), 1.0, 1e-5); // 1, untouched
+  BOOST_REQUIRE_CLOSE(output(0, 1), 2.0, 1e-5); // 2, untouched
+  BOOST_REQUIRE_CLOSE(output(0, 2), 3.0, 1e-5); // 3, untouched
+  BOOST_REQUIRE_SMALL(output(1, 0), 1e-5); // 4 <= 5 -> 0
+  BOOST_REQUIRE_SMALL(output(1, 1), 1e-5); // 5 <= 5 -> 0
+  BOOST_REQUIRE_CLOSE(output(1, 2), 1.0, 1e-5); // 6 > 5 -> 1
+  BOOST_REQUIRE_CLOSE(output(2, 0), 7.0, 1e-5); // 7, untouched
+  BOOST_REQUIRE_CLOSE(output(2, 1), 8.0, 1e-5); // 8, untouched
+  BOOST_REQUIRE_CLOSE(output(2, 2), 9.0, 1e-5); // 9, untouched
 }
 
-BOOST_AUTO_TEST_CASE(BinarizeThreshold)
+BOOST_AUTO_TEST_CASE(BinerizeAll)
 {
-  mat input(10, 10, fill::randu); // fill input with randome Number
-  mat constMat(10, 10);
-  double threshold = math::Random(); // random number threshold
-  constMat.fill(threshold);
-
-  umat answer = input > constMat;
+  mat input;
+  input << 1 << 2 << 3 << endr
+        << 4 << 5 << 6 << endr // 5 equals the threshold and must become 0
+        << 7 << 8 << 9;
 
-  // Binarize every values inside the matrix with threshold of 0;
-  Binarize(input, threshold);
+  mat output;
+  const double threshold = 5.0;
+  // No dimension argument: every element of the matrix is binarized.
+  Binarize<double>(input, output, threshold);
 
-  CheckAnswer(input, answer);
+  BOOST_REQUIRE_SMALL(output(0, 0), 1e-5); // 1 <= 5 -> 0
+  BOOST_REQUIRE_SMALL(output(0, 1), 1e-5); // 2 <= 5 -> 0
+  BOOST_REQUIRE_SMALL(output(0, 2), 1e-5); // 3 <= 5 -> 0
+  BOOST_REQUIRE_SMALL(output(1, 0), 1e-5); // 4 <= 5 -> 0
+  BOOST_REQUIRE_SMALL(output(1, 1), 1e-5); // 5 <= 5 -> 0
+  BOOST_REQUIRE_CLOSE(output(1, 2), 1.0, 1e-5); // 6 > 5 -> 1
+  BOOST_REQUIRE_CLOSE(output(2, 0), 1.0, 1e-5); // 7 > 5 -> 1
+  BOOST_REQUIRE_CLOSE(output(2, 1), 1.0, 1e-5); // 8 > 5 -> 1
+  BOOST_REQUIRE_CLOSE(output(2, 2), 1.0, 1e-5); // 9 > 5 -> 1
 }
 
 BOOST_AUTO_TEST_SUITE_END();




More information about the mlpack-git mailing list