[mlpack-git] master: Add Classify() functions and tests. (5546ebc)

Wed Jun 1 14:27:23 EDT 2016

Repository : https://github.com/mlpack/mlpack
On branch  : master
Link       : https://github.com/mlpack/mlpack/compare/e6d2ca7bf64b47a36ac489335cf0dd8933e13076...5546ebcf02598c9da06e19ed447e73ddcd0d3347

>---------------------------------------------------------------

commit 5546ebcf02598c9da06e19ed447e73ddcd0d3347
Author: Ryan Curtin <ryan at ratml.org>
Date:   Wed Jun 1 11:26:09 2016 -0700

    Add Classify() functions and tests.


>---------------------------------------------------------------

5546ebcf02598c9da06e19ed447e73ddcd0d3347
 .../logistic_regression/logistic_regression.hpp    |  41 ++++++
 .../logistic_regression_impl.hpp                   |  31 +++++
 src/mlpack/tests/logistic_regression_test.cpp      | 150 +++++++++++++++++++++
 3 files changed, 222 insertions(+)

diff --git a/src/mlpack/methods/logistic_regression/logistic_regression.hpp b/src/mlpack/methods/logistic_regression/logistic_regression.hpp
index 0d56e0c..008193a 100644
--- a/src/mlpack/methods/logistic_regression/logistic_regression.hpp
+++ b/src/mlpack/methods/logistic_regression/logistic_regression.hpp
@@ -152,6 +152,8 @@ class LogisticRegression
    * the decision boundary, the response is taken to be 1; otherwise, it is 0.
    * By default the decision boundary is 0.5.
    *
+   * This method is deprecated---you should use Classify() instead.
+   *
    * @param predictors Input predictors.
    * @param responses Vector to put output predictions of responses into.
    * @param decisionBoundary Decision boundary (default 0.5).
@@ -161,6 +163,45 @@ class LogisticRegression
                const double decisionBoundary = 0.5) const;
 
   /**
+   * Classify a single point and return its predicted label.  Logistic
+   * regression produces a value between 0 and 1 for the point; if that value
+   * is greater than the decision boundary the label is 1, otherwise it is 0.
+   * The decision boundary is optional and defaults to 0.5.
+   *
+   * @tparam VecType Type of the point; it is passed to arma::dot().
+   * @param point Point to classify.
+   * @param decisionBoundary Decision boundary (default 0.5).
+   * @return Predicted label of point.
+   */
+  template<typename VecType>
+  size_t Classify(const VecType& point,
+                  const double decisionBoundary = 0.5) const;
+
+  /**
+   * Classify every point in the dataset, writing one label per point.
+   * Logistic regression produces a value between 0 and 1 for each point; if
+   * that value is greater than the decision boundary the label is 1, otherwise
+   * it is 0.  The implementation forwards its arguments to Predict()
+   * unchanged, so both methods give the same labels.
+   *
+   * @param dataset Set of points to classify (one point per column).
+   * @param labels Output: predicted label for each point.
+   * @param decisionBoundary Decision boundary (default 0.5).
+   */
+  void Classify(const MatType& dataset,
+                arma::Row<size_t>& labels,
+                const double decisionBoundary = 0.5) const;
+
+  /**
+   * Compute class probabilities for every point (column) of the dataset.
+   *
+   * @param dataset Set of points to classify, one point per column.
+   * @param probabilities Output (2 x n_cols); row c holds P(class c).
+   */
+  void Classify(const MatType& dataset,
+                arma::mat& probabilities) const;
+
+  /**
    * Compute the accuracy of the model on the given predictors and responses,
    * optionally using the given decision boundary.  The responses should be
    * either 0 or 1.  Logistic regression returns a value between 0 and 1.  If
diff --git a/src/mlpack/methods/logistic_regression/logistic_regression_impl.hpp b/src/mlpack/methods/logistic_regression/logistic_regression_impl.hpp
index ee4396e..5b6a2c6 100644
--- a/src/mlpack/methods/logistic_regression/logistic_regression_impl.hpp
+++ b/src/mlpack/methods/logistic_regression/logistic_regression_impl.hpp
@@ -106,6 +106,37 @@ void LogisticRegression<MatType>::Predict(const MatType& predictors,
 }
 
 template<typename MatType>
+template<typename VecType>
+size_t LogisticRegression<MatType>::Classify(const VecType& point,
+                                             const double decisionBoundary)
+    const
+{
+  return size_t(1.0 / (1.0 + std::exp(-parameters(0) - arma::dot(point,
+      parameters.subvec(1, parameters.n_elem - 1)))) +
+      (1.0 - decisionBoundary));
+}
+
+template<typename MatType>
+void LogisticRegression<MatType>::Classify(const MatType& dataset,
+                                           arma::Row<size_t>& labels,
+                                           const double decisionBoundary) const
+{
+  Predict(dataset, labels, decisionBoundary);  // Forward all arguments as-is.
+}
+
+template<typename MatType>
+void LogisticRegression<MatType>::Classify(const MatType& dataset,
+                                           arma::mat& probabilities) const
+{
+  // Output is 2 x n_cols: one row per class, one column per input point.
+  probabilities.set_size(2, dataset.n_cols);
+  // Row 1 holds the logistic function of each point's linear response.
+  probabilities.row(1) = 1.0 / (1.0 + arma::exp(-parameters(0) - dataset.t() *
+      parameters.subvec(1, parameters.n_elem - 1))).t();
+  probabilities.row(0) = 1.0 - probabilities.row(1);  // Rows sum to one.
+}
+
+template<typename MatType>
 double LogisticRegression<MatType>::ComputeError(
     const MatType& predictors,
     const arma::Row<size_t>& responses) const
diff --git a/src/mlpack/tests/logistic_regression_test.cpp b/src/mlpack/tests/logistic_regression_test.cpp
index f567049..7881bb2 100644
--- a/src/mlpack/tests/logistic_regression_test.cpp
+++ b/src/mlpack/tests/logistic_regression_test.cpp
@@ -807,4 +807,154 @@ BOOST_AUTO_TEST_CASE(LogisticRegressionSparseSGDTest)
     BOOST_REQUIRE_CLOSE(lr.Parameters()[i], lrSparse.Parameters()[i], 1e-5);
 }
 
+/**
+ * Test batch Classify(): >= 900 of 1000 freshly drawn points must be correct.
+ */
+BOOST_AUTO_TEST_CASE(ClassifyTest)
+{
+  // Two 3-d identity-covariance Gaussians, centered at (1,1,1) and (9,9,9).
+  GaussianDistribution g1(arma::vec("1.0 1.0 1.0"), arma::eye<arma::mat>(3, 3));
+  GaussianDistribution g2(arma::vec("9.0 9.0 9.0"), arma::eye<arma::mat>(3, 3));
+
+  arma::mat data(3, 1000);
+  arma::Row<size_t> responses(1000);
+  for (size_t i = 0; i < 500; ++i)
+  {
+    data.col(i) = g1.Random();
+    responses[i] = 0;
+  }
+  for (size_t i = 500; i < 1000; ++i)
+  {
+    data.col(i) = g2.Random();
+    responses[i] = 1;
+  }
+
+  // Train a model on it (0.5 is presumably lambda -- verify against the ctor).
+  LogisticRegression<> lr(data.n_rows, 0.5);
+  lr.Train<>(data, responses);
+
+  // Overwrite data/responses with a fresh test set from the same Gaussians.
+  for (size_t i = 0; i < 500; ++i)
+  {
+    data.col(i) = g1.Random();
+    responses[i] = 0;
+  }
+  for (size_t i = 500; i < 1000; ++i)
+  {
+    data.col(i) = g2.Random();
+    responses[i] = 1;
+  }
+
+  arma::Row<size_t> predictions;
+  lr.Classify(data, predictions);
+  // accu() of the element-wise equality counts the correct predictions.
+  BOOST_REQUIRE_GE((double) arma::accu(predictions == responses), 900);
+}
+
+/**
+ * Test that classifying each point individually yields exactly the same label
+ * as the batch Classify() overload produces for that point.
+ */
+BOOST_AUTO_TEST_CASE(SinglePointClassifyTest)
+{
+  // Two 3-d identity-covariance Gaussians, centered at (1,1,1) and (9,9,9).
+  GaussianDistribution g1(arma::vec("1.0 1.0 1.0"), arma::eye<arma::mat>(3, 3));
+  GaussianDistribution g2(arma::vec("9.0 9.0 9.0"), arma::eye<arma::mat>(3, 3));
+
+  arma::mat data(3, 1000);
+  arma::Row<size_t> responses(1000);
+  for (size_t i = 0; i < 500; ++i)
+  {
+    data.col(i) = g1.Random();
+    responses[i] = 0;
+  }
+  for (size_t i = 500; i < 1000; ++i)
+  {
+    data.col(i) = g2.Random();
+    responses[i] = 1;
+  }
+
+  // Train a model on it (0.5 is presumably lambda -- verify against the ctor).
+  LogisticRegression<> lr(data.n_rows, 0.5);
+  lr.Train<>(data, responses);
+
+  // Overwrite data/responses with a fresh test set from the same Gaussians.
+  for (size_t i = 0; i < 500; ++i)
+  {
+    data.col(i) = g1.Random();
+    responses[i] = 0;
+  }
+  for (size_t i = 500; i < 1000; ++i)
+  {
+    data.col(i) = g2.Random();
+    responses[i] = 1;
+  }
+
+  arma::Row<size_t> predictions;
+  lr.Classify(data, predictions);
+  // Both overloads are called with the default decision boundary here.
+  for (size_t i = 0; i < data.n_cols; ++i)
+  {
+    size_t pred = lr.Classify(data.col(i));
+
+    BOOST_REQUIRE_EQUAL(pred, predictions[i]);
+  }
+}
+
+/**
+ * Test the probability-returning Classify(): shape, normalization, confidence.
+ */
+BOOST_AUTO_TEST_CASE(ClassifyProbabilitiesTest)
+{
+  // Two 3-d identity-covariance Gaussians, centered at (1,1,1) and (9,9,9).
+  GaussianDistribution g1(arma::vec("1.0 1.0 1.0"), arma::eye<arma::mat>(3, 3));
+  GaussianDistribution g2(arma::vec("9.0 9.0 9.0"), arma::eye<arma::mat>(3, 3));
+
+  arma::mat data(3, 1000);
+  arma::Row<size_t> responses(1000);
+  for (size_t i = 0; i < 500; ++i)
+  {
+    data.col(i) = g1.Random();
+    responses[i] = 0;
+  }
+  for (size_t i = 500; i < 1000; ++i)
+  {
+    data.col(i) = g2.Random();
+    responses[i] = 1;
+  }
+
+  // Train a model on it (0.5 is presumably lambda -- verify against the ctor).
+  LogisticRegression<> lr(data.n_rows, 0.5);
+  lr.Train<>(data, responses);
+
+  // Overwrite data/responses with a fresh test set from the same Gaussians.
+  for (size_t i = 0; i < 500; ++i)
+  {
+    data.col(i) = g1.Random();
+    responses[i] = 0;
+  }
+  for (size_t i = 500; i < 1000; ++i)
+  {
+    data.col(i) = g2.Random();
+    responses[i] = 1;
+  }
+
+  arma::mat probabilities;
+  lr.Classify(data, probabilities);
+  // Expect one column per input point and one row per class.
+  BOOST_REQUIRE_EQUAL(probabilities.n_cols, data.n_cols);
+  BOOST_REQUIRE_EQUAL(probabilities.n_rows, 2);
+
+  for (size_t i = 0; i < data.n_cols; ++i)
+  {
+    BOOST_REQUIRE_CLOSE(probabilities(0, i) + probabilities(1, i), 1.0, 1e-5);
+
+    // The true class's probability must be within 10% of 1.0.
+    if (responses[i] == 0)
+      BOOST_REQUIRE_CLOSE(probabilities(0, i), 1.0, 10.0);
+    else
+      BOOST_REQUIRE_CLOSE(probabilities(1, i), 1.0, 10.0);
+  }
+}
+
 BOOST_AUTO_TEST_SUITE_END();