[mlpack-git] master,mlpack-1.0.x: Overhaul implementation; do not use gmm::phi(). This gives serious speedup, as high-dimensional matrix inverses are not being calculated. The previous calls to gmm::phi() would invert a diagonal matrix without being able to assume that the matrix was diagonal. This explains the very very poor benchmarking results for nbc in mlpack. (154cb57)

gitdub at big.cc.gt.atl.ga.us gitdub at big.cc.gt.atl.ga.us
Thu Mar 5 21:46:20 EST 2015


Repository : https://github.com/mlpack/mlpack

On branches: master,mlpack-1.0.x
Link       : https://github.com/mlpack/mlpack/compare/904762495c039e345beba14c1142fd719b3bd50e...f94823c800ad6f7266995c700b1b630d5ffdcf40

>---------------------------------------------------------------

commit 154cb575eb1e9df903c3e74f570dd7ce9261b40f
Author: Ryan Curtin <ryan at ratml.org>
Date:   Mon Apr 14 20:11:16 2014 +0000

    Overhaul implementation; do not use gmm::phi().  This gives serious speedup, as
    high-dimensional matrix inverses are not being calculated.  The previous calls
    to gmm::phi() would invert a diagonal matrix without being able to assume that
    the matrix was diagonal.  This explains the very very poor benchmarking results
    for nbc in mlpack.


>---------------------------------------------------------------

154cb575eb1e9df903c3e74f570dd7ce9261b40f
 .../naive_bayes/naive_bayes_classifier_impl.hpp    | 57 +++++++++++++---------
 1 file changed, 34 insertions(+), 23 deletions(-)

diff --git a/src/mlpack/methods/naive_bayes/naive_bayes_classifier_impl.hpp b/src/mlpack/methods/naive_bayes/naive_bayes_classifier_impl.hpp
index c1b0290..08791ee 100644
--- a/src/mlpack/methods/naive_bayes/naive_bayes_classifier_impl.hpp
+++ b/src/mlpack/methods/naive_bayes/naive_bayes_classifier_impl.hpp
@@ -1,10 +1,11 @@
 /**
- * @file simple_nbc_impl.hpp
+ * @file naive_bayes_classifier_impl.hpp
  * @author Parikshit Ram (pram at cc.gatech.edu)
  *
  * A Naive Bayes Classifier which parametrically estimates the distribution of
- * the features.  It is assumed that the features have been sampled from a
- * Gaussian PDF.
+ * the features.  This classifier makes its predictions based on the assumption
+ * that the features have been sampled from a set of Gaussians with diagonal
+ * covariance.
  */
 #ifndef __MLPACK_METHODS_NAIVE_BAYES_NAIVE_BAYES_CLASSIFIER_IMPL_HPP
 #define __MLPACK_METHODS_NAIVE_BAYES_NAIVE_BAYES_CLASSIFIER_IMPL_HPP
@@ -53,6 +54,11 @@ NaiveBayesClassifier<MatType>::NaiveBayesClassifier(
       means.col(i) /= probabilities[i];
       variances.col(i) /= (probabilities[i] - 1);
     }
+
+    // Make sure variance is invertible.
+    for (size_t j = 0; j < dimensionality; ++j)
+      if (variances(j, i) == 0.0)
+        variances(j, i) = 1e-50;
   }
 
   probabilities /= data.n_cols;
@@ -66,9 +72,12 @@ void NaiveBayesClassifier<MatType>::Classify(const MatType& data,
   // training data.
   Log::Assert(data.n_rows == means.n_rows);
 
-  arma::vec probs(means.n_cols);
+  arma::vec probs = arma::log(probabilities);
+  arma::mat invVar = 1.0 / variances;
+
+  arma::mat testProbs = arma::repmat(probs.t(), data.n_cols, 1);
 
-  results.zeros(data.n_cols);
+  results.set_size(data.n_cols); // No need to fill with anything yet.
 
   Log::Info << "Running Naive Bayes classifier on " << data.n_cols
       << " data points with " << data.n_rows << " features each." << std::endl;
@@ -76,28 +85,30 @@ void NaiveBayesClassifier<MatType>::Classify(const MatType& data,
   // Calculate the joint probability for each of the data points for each of the
   // means.n_cols.
 
-  // Loop over every test case.
-  for (size_t n = 0; n < data.n_cols; n++)
+  // Loop over every class.
+  for (size_t i = 0; i < means.n_cols; i++)
   {
-    // Loop over every class.
-    for (size_t i = 0; i < means.n_cols; i++)
-    {
-      // Use the log values to prevent floating point underflow.
-      probs(i) = log(probabilities(i));
-
-      // Loop over every feature, but avoid inverting empty matrices.
-      if (probabilities[i] != 0)
-      {
-        probs(i) += log(gmm::phi(data.unsafe_col(n), means.unsafe_col(i),
-            diagmat(variances.unsafe_col(i))));
-      }
-    }
+    // This is an adaptation of gmm::phi() for the case where the covariance is
+    // a diagonal matrix.
+    arma::mat diffs = data - arma::repmat(means.col(i), 1, data.n_cols);
+    arma::mat rhs = -0.5 * arma::diagmat(invVar.col(i)) * diffs;
+    arma::vec exponents(diffs.n_cols);
+    for (size_t j = 0; j < diffs.n_cols; ++j)
+      exponents(j) = std::exp(arma::accu(diffs.col(j) % rhs.unsafe_col(j)));
+
+    testProbs.col(i) += log(pow(2 * M_PI, (double) data.n_rows / -2.0) *
+        pow(det(arma::diagmat(invVar.col(i))), -0.5) * exponents);
+  }
 
-    // Find the index of the maximum value in tmp_vals.
+  // Now calculate the label.
+  for (size_t i = 0; i < data.n_cols; ++i)
+  {
+    // Find the index of the class with maximum probability for this point.
     arma::uword maxIndex = 0;
-    probs.max(maxIndex);
+    arma::vec pointProbs = testProbs.row(i).t();
+    pointProbs.max(maxIndex);
 
-    results[n] = maxIndex;
+    results[i] = maxIndex;
   }
 
   return;



More information about the mlpack-git mailing list