[mlpack-git] master, mlpack-1.0.x: Change to two-pass algorithm suggested by Vahab in #344. (eae14ef)

Thu Mar 5 21:46:27 EST 2015

Repository : https://github.com/mlpack/mlpack

On branches: master,mlpack-1.0.x
Link       : https://github.com/mlpack/mlpack/compare/904762495c039e345beba14c1142fd719b3bd50e...f94823c800ad6f7266995c700b1b630d5ffdcf40

>---------------------------------------------------------------

commit eae14efc6102953d78a802f23e9fda46d74207fe
Author: Ryan Curtin <ryan at ratml.org>
Date:   Wed Apr 16 18:18:13 2014 +0000

    Change to two-pass algorithm suggested by Vahab in #344.


>---------------------------------------------------------------

eae14efc6102953d78a802f23e9fda46d74207fe
 .../naive_bayes/naive_bayes_classifier_impl.hpp    | 30 +++++++++++++++-------
 1 file changed, 21 insertions(+), 9 deletions(-)

diff --git a/src/mlpack/methods/naive_bayes/naive_bayes_classifier_impl.hpp b/src/mlpack/methods/naive_bayes/naive_bayes_classifier_impl.hpp
index 2fd92f4..5cd4fb9 100644
--- a/src/mlpack/methods/naive_bayes/naive_bayes_classifier_impl.hpp
+++ b/src/mlpack/methods/naive_bayes/naive_bayes_classifier_impl.hpp
@@ -1,6 +1,7 @@
 /**
  * @file naive_bayes_classifier_impl.hpp
  * @author Parikshit Ram (pram at cc.gatech.edu)
+ * @author Vahab Akbarzadeh (v.akbarzadeh at gmail.com)
  *
  * A Naive Bayes Classifier which parametrically estimates the distribution of
  * the features.  This classifier makes its predictions based on the assumption
@@ -59,25 +60,36 @@ NaiveBayesClassifier<MatType>::NaiveBayesClassifier(
   }
   else
   {
-    // Don't use incremental algorithm.
+    // Don't use incremental algorithm.  This is a two-pass algorithm.  It is
+    // possible to calculate the means and variances using a faster one-pass
+    // algorithm but there are some precision and stability issues.  If this is
+    // too slow, it's an option to use the faster algorithm by default and then
+    // have this (and the incremental algorithm) be other options.
+
+    // Calculate the means.
     for (size_t j = 0; j < data.n_cols; ++j)
     {
       const size_t label = labels[j];
       ++probabilities[label];
-
       means.col(label) += data.col(j);
-      variances.col(label) += square(data.col(j));
     }
 
+    // Normalize means.
     for (size_t i = 0; i < classes; ++i)
-    {
-      if (probabilities[i] != 0)
-      {
-        variances.col(i) -= (square(means.col(i)) / probabilities[i]);
+      if (probabilities[i] != 0.0)
         means.col(i) /= probabilities[i];
-        variances.col(i) /= (probabilities[i] - 1);
-      }
+
+    // Calculate variances.
+    for (size_t j = 0; j < data.n_cols; ++j)
+    {
+      const size_t label = labels[j];
+      variances.col(label) += square(data.col(j) - means.col(label));
     }
+
+    // Normalize variances.
+    for (size_t i = 0; i < classes; ++i)
+      if (probabilities[i] > 1)
+        variances.col(i) /= (probabilities[i] - 1);
   }
 
   // Ensure that the variances are invertible.