[mlpack-git] master: optimize binarize and add binarize executable (cd0a377)
gitdub at mlpack.org
gitdub at mlpack.org
Tue Jun 14 08:11:15 EDT 2016
Repository : https://github.com/mlpack/mlpack
On branch : master
Link : https://github.com/mlpack/mlpack/compare/a0b31abe5ff69117645c664dbeac1476dd5e48f7...2da9c5bac14a00145c757b8139c245913b86e034
>---------------------------------------------------------------
commit cd0a3770a12e31e7f443d2672cc48ca9ed6d80e4
Author: Keon Kim <kwk236 at gmail.com>
Date: Tue Jun 14 20:17:49 2016 +0900
optimize binarize and add binarize executable
>---------------------------------------------------------------
cd0a3770a12e31e7f443d2672cc48ca9ed6d80e4
src/mlpack/core/data/binarize.hpp | 100 ++++-----------------
src/mlpack/methods/preprocess/CMakeLists.txt | 1 +
.../preprocess/preprocess_binarize_main.cpp | 71 +++++++++++++++
.../methods/preprocess/preprocess_split_main.cpp | 2 +-
src/mlpack/tests/binarize_test.cpp | 68 +++++++-------
5 files changed, 130 insertions(+), 112 deletions(-)
diff --git a/src/mlpack/core/data/binarize.hpp b/src/mlpack/core/data/binarize.hpp
index 13c0d19..38d22b6 100644
--- a/src/mlpack/core/data/binarize.hpp
+++ b/src/mlpack/core/data/binarize.hpp
@@ -9,41 +9,10 @@
#define MLPACK_CORE_DATA_BINARIZE_HPP
#include <mlpack/core.hpp>
+#include <omp.h>
namespace mlpack {
namespace data {
-/**
- * Given an input dataset and threshold, set values greater than threshold to
- * 1 and values less than or equal to the threshold to 0. This overload takes
- * a dimension and applys the changes to the given dimension.
- *
- * @code
- * arma::mat input = loadData();
- * double threshold = 0;
- * size_t dimension = 0;
- *
- * // Binarize the first dimension. All positive values in the first dimension
- * // will be set to 1 and the values less than or equal to 0 will become 0.
- * Binarize(input, threshold, dimension);
- * @endcode
- *
- * @param input Input matrix to Binarize.
- * @param threshold Threshold can by any number.
- * @param dimension Feature to apply the Binarize function.
- */
-template<typename T>
-void Binarize(arma::Mat<T>& input,
- const double threshold,
- const size_t dimension)
-{
- for (size_t i = 0; i < input.n_cols; ++i)
- {
- if (input(dimension, i) > threshold)
- input(dimension, i) = 1;
- else
- input(dimension, i) = 0;
- }
-}
/**
* Given an input dataset and threshold, set values greater than threshold to
@@ -51,46 +20,13 @@ void Binarize(arma::Mat<T>& input,
* the changes to all dimensions.
*
* @code
- * arma::mat input = loadData();
- * double threshold = 0;
- *
- * // Binarize the whole Matrix. All positive values in will be set to 1 and
- * // the values less than or equal to 0 will become 0.
- * Binarize(input, threshold);
- * @endcode
- *
- * @param input Input matrix to Binarize.
- * @param threshold Threshold can by any number.
- */
-template<typename T>
-void Binarize(arma::Mat<T>& input,
- const double threshold)
-{
- for (size_t i = 0; i < input.n_cols; ++i)
- {
- for (size_t j = 0; j < input.n_rows; ++j)
- {
- if (input(i, j) > threshold)
- input(i, j) = 1;
- else
- input(i, j) = 0;
- }
- }
- }
-
-/**
- * Given an input dataset and threshold, set values greater than threshold to
- * 1 and values less than or equal to the threshold to 0. This overload applies
- * the changes to all dimensions.
- *
- * @code
- * arma::mat input = loadData();
- * arma::mat output;
+ * arma::Mat<double> input = loadData();
+ * arma::Mat<double> output;
* double threshold = 0.5;
*
* // Binarize the whole matrix. All values greater than 0.5 will be set to 1
* // and the values less than or equal to 0.5 will become 0.
- * Binarize(input, output, threshold);
+ * Binarize<double>(input, output, threshold);
* @endcode
*
* @param input Input matrix to Binarize.
@@ -104,15 +40,17 @@ void Binarize(const arma::Mat<T>& input,
{
output.copy_size(input);
- for (size_t i = 0; i < input.n_cols; ++i)
+ const size_t totalElems = static_cast<size_t>(input.n_elem);
+ const T *inPtr = input.memptr();
+ T *outPtr = output.memptr();
+
+ #pragma omp parallel for
+ for (size_t i = 0; i < totalElems; ++i)
{
- for (size_t j = 0; j < input.n_rows; ++j)
- {
- if (input(i, j) > threshold)
- output(i, j) = 1;
- else
- output(i, j) = 0;
- }
+ if (inPtr[i] < threshold)
+ outPtr[i] = 0;
+ else
+ outPtr[i] = 1;
}
}
@@ -122,14 +60,14 @@ void Binarize(const arma::Mat<T>& input,
* a dimension and applies the changes to the given dimension.
*
* @code
- * arma::mat input = loadData();
- * arma::mat output;
+ * arma::Mat<double> input = loadData();
+ * arma::Mat<double> output;
* double threshold = 0.5;
* size_t dimension = 0;
*
* // Binarize the first dimension. Values greater than 0.5 in that dimension
* // will be set to 1 and the values less than or equal to 0.5 will become 0.
- * Binarize(input, output, threshold, dimension);
+ * Binarize<double>(input, output, threshold, dimension);
* @endcode
*
* @param input Input matrix to Binarize.
@@ -137,15 +75,15 @@ void Binarize(const arma::Mat<T>& input,
* @param threshold Threshold can be any number.
* @param dimension Feature to apply the Binarize function.
*/
-
template<typename T>
void Binarize(const arma::Mat<T>& input,
arma::Mat<T>& output,
const double threshold,
const size_t dimension)
{
- output(input);
+ output = input;
+ #pragma omp parallel for
for (size_t i = 0; i < input.n_cols; ++i)
{
if (input(dimension, i) > threshold)
diff --git a/src/mlpack/methods/preprocess/CMakeLists.txt b/src/mlpack/methods/preprocess/CMakeLists.txt
index 3a2f7bf..b10c8ea 100644
--- a/src/mlpack/methods/preprocess/CMakeLists.txt
+++ b/src/mlpack/methods/preprocess/CMakeLists.txt
@@ -14,5 +14,6 @@ set(MLPACK_SRCS ${MLPACK_SRCS} ${DIR_SRCS} PARENT_SCOPE)
#add_cli_executable(preprocess_stats)
add_cli_executable(preprocess_split)
+add_cli_executable(preprocess_binarize)
#add_cli_executable(preprocess_scan)
#add_cli_executable(preprocess_imputer)
diff --git a/src/mlpack/methods/preprocess/preprocess_binarize_main.cpp b/src/mlpack/methods/preprocess/preprocess_binarize_main.cpp
new file mode 100644
index 0000000..5a8b278
--- /dev/null
+++ b/src/mlpack/methods/preprocess/preprocess_binarize_main.cpp
@@ -0,0 +1,71 @@
+/**
+ * @file preprocess_binarize_main.cpp
+ * @author Keon Kim
+ *
+ * binarize CLI executable
+ */
+#include <mlpack/core.hpp>
+#include <mlpack/core/data/binarize.hpp>
+
+PROGRAM_INFO("Split Data", "This utility takes a dataset and optionally labels "
+ "and splits ");
+
+// Define parameters for data.
+PARAM_STRING_REQ("input_file", "File containing data,", "i");
+// Define optional parameters.
+PARAM_STRING("output_file", "File to save the output,", "o", "");
+PARAM_INT("feature", "File containing labels", "f", 0);
+PARAM_DOUBLE("threshold", "Ratio of test set, if not set,"
+ "the threshold defaults to 0.0", "t", 0.0);
+
+using namespace mlpack;
+using namespace arma;
+using namespace std;
+
+int main(int argc, char** argv)
+{
+ // Parse command line options.
+ CLI::ParseCommandLine(argc, argv);
+ const string inputFile = CLI::GetParam<string>("input_file");
+ const string outputFile = CLI::GetParam<string>("output_file");
+ const size_t feature = (size_t) CLI::GetParam<int>("feature");
+ const double threshold = CLI::GetParam<double>("threshold");
+
+ // Check on data parameters.
+ if (!CLI::HasParam("feature"))
+ Log::Warn << "You did not specify --feature, so the program will perform "
+ << "binarize on every features." << endl;
+
+ if (!CLI::HasParam("threshold"))
+ Log::Warn << "You did not specify --threshold, so the threhold "
+ << "will be automatically set to '0.0'." << endl;
+
+ if (!CLI::HasParam("output_file"))
+ Log::Warn << "You did not specify --output_file, so no result will be"
+ << "saved." << endl;
+
+ // Load the data.
+ arma::mat input;
+ arma::mat output;
+ data::Load(inputFile, input, true);
+
+ Timer::Start("binarize");
+ if (CLI::HasParam("feature"))
+ {
+ data::Binarize<double>(input, output, threshold, feature);
+ }
+ else
+ {
+ // binarize the whole data
+ data::Binarize<double>(input, output, threshold);
+ }
+ Timer::Stop("binarize");
+
+ Log::Info << "input" << endl;
+ Log::Info << input << endl;
+ Log::Info << "output" << endl;
+ Log::Info << output << endl;
+
+ if (CLI::HasParam("output_file"))
+ data::Save(outputFile, output, false);
+}
diff --git a/src/mlpack/methods/preprocess/preprocess_split_main.cpp b/src/mlpack/methods/preprocess/preprocess_split_main.cpp
index 1e063db..f47c9e1 100644
--- a/src/mlpack/methods/preprocess/preprocess_split_main.cpp
+++ b/src/mlpack/methods/preprocess/preprocess_split_main.cpp
@@ -1,6 +1,6 @@
/**
* @file preprocess_split_main.cpp
- * @author Keon Woo Kim
+ * @author Keon Kim
*
* split data CLI executable
*/
diff --git a/src/mlpack/tests/binarize_test.cpp b/src/mlpack/tests/binarize_test.cpp
index 5ccaf38..8a98af4 100644
--- a/src/mlpack/tests/binarize_test.cpp
+++ b/src/mlpack/tests/binarize_test.cpp
@@ -17,42 +17,50 @@ using namespace mlpack::data;
BOOST_AUTO_TEST_SUITE(BinarizeTest);
-/**
- * Compare the binarized data with answer.
- *
- * @param input The original data set before Binarize.
- * @param answer The data want to compare with the input.
- */
-void CheckAnswer(const mat& input,
- const umat& answer)
+BOOST_AUTO_TEST_CASE(BinerizeOneDimension)
{
- for (size_t i = 0; i < input.n_cols; ++i)
- {
- const mat& lhsCol = input.col(i);
- const umat& rhsCol = answer.col(i);
- for (size_t j = 0; j < lhsCol.n_rows; ++j)
- {
- if (std::abs(rhsCol(j)) < 1e-5)
- BOOST_REQUIRE_SMALL(lhsCol(j), 1e-5);
- else
- BOOST_REQUIRE_CLOSE(lhsCol(j), rhsCol(j), 1e-5);
- }
- }
+ mat input;
+ input << 1 << 2 << 3 << endr
+ << 4 << 5 << 6 << endr // this row will be tested
+ << 7 << 8 << 9;
+
+ mat output;
+ const double threshold = 5.0;
+ const size_t dimension = 1;
+ Binarize<double>(input, output, threshold, dimension);
+
+ BOOST_REQUIRE_CLOSE(input(0, 0), 1, 1e-5); // 1
+ BOOST_REQUIRE_CLOSE(input(0, 1), 2, 1e-5); // 2
+ BOOST_REQUIRE_CLOSE(input(0, 2), 3, 1e-5); // 3
+ BOOST_REQUIRE_SMALL(input(1, 0), 1e-5); // 4 target
+ BOOST_REQUIRE_SMALL(input(1, 1), 1e-5); // 5 target
+ BOOST_REQUIRE_CLOSE(input(1, 2), 1, 1e-5); // 6 target
+ BOOST_REQUIRE_CLOSE(input(2, 0), 7, 1e-5); // 7
+ BOOST_REQUIRE_CLOSE(input(2, 1), 8, 1e-5); // 8
+ BOOST_REQUIRE_CLOSE(input(2, 2), 9, 1e-5); // 9
}
-BOOST_AUTO_TEST_CASE(BinarizeThreshold)
+BOOST_AUTO_TEST_CASE(BinerizeAll)
{
- mat input(10, 10, fill::randu); // fill input with randome Number
- mat constMat(10, 10);
- double threshold = math::Random(); // random number threshold
- constMat.fill(threshold);
-
- umat answer = input > constMat;
+ mat input;
+ input << 1 << 2 << 3 << endr
+ << 4 << 5 << 6 << endr // this row will be tested
+ << 7 << 8 << 9;
- // Binarize every values inside the matrix with threshold of 0;
- Binarize(input, threshold);
+ mat output;
+ const double threshold = 5.0;
+ const size_t dimension = 1;
+ Binarize<double>(input, output, threshold);
- CheckAnswer(input, answer);
+ BOOST_REQUIRE_SMALL(input(0, 0), 1e-5); // 1
+ BOOST_REQUIRE_SMALL(input(0, 1), 1e-5); // 2
+ BOOST_REQUIRE_SMALL(input(0, 2), 1e-5); // 3
+ BOOST_REQUIRE_SMALL(input(1, 0), 1e-5); // 4
+ BOOST_REQUIRE_SMALL(input(1, 1), 1e-5); // 5
+ BOOST_REQUIRE_CLOSE(input(1, 2), 1.0, 1e-5); // 6
+ BOOST_REQUIRE_CLOSE(input(2, 0), 1.0, 1e-5); // 7
+ BOOST_REQUIRE_CLOSE(input(2, 1), 1.0, 1e-5); // 8
+ BOOST_REQUIRE_CLOSE(input(2, 2), 1.0, 1e-5); // 9
}
BOOST_AUTO_TEST_SUITE_END();
More information about the mlpack-git
mailing list