[mlpack-git] master: Efficiency and documentation fixes for GammaDistribution. (29cadf0)

Fri Jul 22 07:45:54 EDT 2016

Repository : https://github.com/mlpack/mlpack
On branch  : master
Link       : https://github.com/mlpack/mlpack/compare/ba850f782a53c5a77b7985f7647f609bd96cb5e7...2c026d838925df436d967439899813da5d34c702

>---------------------------------------------------------------

commit 29cadf08524c58565e51133253d217a24c837c64
Author: Yannis Mentekidis <mentekid at gmail.com>
Date:   Fri Jul 22 12:45:54 2016 +0100

    Efficiency and documentation fixes for GammaDistribution.


>---------------------------------------------------------------

29cadf08524c58565e51133253d217a24c837c64
 src/mlpack/core/dists/gamma_distribution.cpp | 36 +++++++++++--------
 src/mlpack/core/dists/gamma_distribution.hpp | 47 +++++++++---------------
 src/mlpack/tests/distribution_test.cpp       | 53 +++++-----------------------
 3 files changed, 45 insertions(+), 91 deletions(-)

diff --git a/src/mlpack/core/dists/gamma_distribution.cpp b/src/mlpack/core/dists/gamma_distribution.cpp
index 4d93917..c014077 100644
--- a/src/mlpack/core/dists/gamma_distribution.cpp
+++ b/src/mlpack/core/dists/gamma_distribution.cpp
@@ -12,20 +12,19 @@ using namespace mlpack;
 using namespace mlpack::distribution;
 
 // Returns true if computation converged.
-inline bool GammaDistribution::converged(double aOld, double aNew)
+inline bool GammaDistribution::converged(const double aOld, 
+                                         const double aNew, 
+                                         const double tol)
 {
   return (std::abs(aNew - aOld) / aNew) < tol;
 }
 
 // Fits an alpha and beta parameter to each dimension of the data.
-void GammaDistribution::Train(const arma::mat& rdata)
+void GammaDistribution::Train(const arma::mat& rdata, const double tol)
 {
-  // Use pseudonym fittingSet regardless of if rdata was provided by user.
-  const arma::mat& fittingSet = 
-    rdata.n_elem == 0 ? referenceSet : rdata;
 
   // If fittingSet is empty, nothing to do.
-  if (arma::size(fittingSet) == arma::size(arma::mat()))
+  if (arma::size(rdata) == arma::size(arma::mat()))
     return;
 
   // Use boost's definitions of digamma and tgamma, and std::log.
@@ -34,16 +33,22 @@ void GammaDistribution::Train(const arma::mat& rdata)
   using std::log;
 
   // Allocate space for alphas and betas (Assume independent rows).
-  alpha.set_size(fittingSet.n_rows);
-  beta.set_size(fittingSet.n_rows);
+  alpha.set_size(rdata.n_rows);
+  beta.set_size(rdata.n_rows);
+   
+  // Calculate log(mean(x)) and mean(log(x)) of each dataset row.
+  const arma::vec meanLogxVec = arma::mean(arma::log(rdata), 1);
+  const arma::vec meanxVec = arma::mean(rdata, 1);
+  const arma::vec logMeanxVec = arma::log(meanxVec);
 
   // Treat each dimension (i.e. row) independently.
-  for (size_t row = 0; row < fittingSet.n_rows; ++row)
+  for (size_t row = 0; row < rdata.n_rows; ++row)
   {
-    // Calculate log(mean(x)) and mean(log(x)) required for fitting process.
-    const double meanLogx = arma::mean(arma::log(fittingSet.row(row)));
-    const double meanx = arma::mean(fittingSet.row(row));
-    const double logMeanx = std::log(arma::mean(meanx));
+
+    // Statistics for this row.
+    const double meanLogx = meanLogxVec(row);
+    const double meanx = meanxVec(row);
+    const double logMeanx = logMeanxVec(row);
 
     // Starting point for Generalized Newton.
     double aEst = 0.5 / (logMeanx - meanLogx);
@@ -63,9 +68,10 @@ void GammaDistribution::Train(const arma::mat& rdata)
       aEst = 1.0 / ((1.0 / aEst) + nominator / denominator);
 
       // Protect against nan values (aEst will be passed to logarithm).
-      assert(aEst > 0);
+      if (aEst <= 0)
+        throw std::logic_error("GammaDistribution parameter alpha will be <=0");
 
-    } while (! converged(aEst, aOld) );
+    } while (! converged(aEst, aOld, tol) );
 
     alpha(row) = aEst;
     beta(row) = meanx/aEst;
diff --git a/src/mlpack/core/dists/gamma_distribution.hpp b/src/mlpack/core/dists/gamma_distribution.hpp
index 18859ef..5763180 100644
--- a/src/mlpack/core/dists/gamma_distribution.hpp
+++ b/src/mlpack/core/dists/gamma_distribution.hpp
@@ -3,8 +3,9 @@
  * @author Yannis Mentekidis
  *
  * Implementation of a Gamma distribution of multidimensional data that fits
- * gamma parameters (alpha, beta) to data. Assumes each data dimension is
- * uncorrelated.
+ * gamma parameters (alpha, beta) to data.
+ * The fitting is done independently for each dataset dimension (row), based on
+ * the assumption that each dimension is fully independent.
  *
  * Based on "Estimating a Gamma Distribution" by Thomas P. Minka:
  * research.microsoft.com/~minka/papers/minka-gamma.pdf
@@ -24,21 +25,12 @@ class GammaDistribution
 {
   public:
     /**
-     * Empty constructor. Set tolerance to default.
+     * Empty constructor.
      */
-    GammaDistribution() : tol(1e-8)
-    { /* Nothing to do. */ };
+    GammaDistribution() { /* Nothing to do. */ };
 
     /**
-     * Constructor with reference data parameter.
-     *
-     * @param rdata Reference data for the object.
-     */
-    GammaDistribution(arma::mat& rdata) : referenceSet(rdata), tol(1e-8)
-    { /* Nothing to do. */ };
-
-    /**
-     * Destructor,
+     * Destructor.
      */
     ~GammaDistribution() {};
 
@@ -46,21 +38,12 @@ class GammaDistribution
      * This function trains (fits distribution parameters) to new data or the
      * dataset the object owns.
      *
-     * @param rdata Reference data to fit parameters to. If not specified,
-     *    reference data will be used. Results are stored in the alpha and beta
-     *    vectors (old results are removed).
+     * @param rdata Reference data to fit parameters to.
+     * @param tol Convergence tolerance. This is *not* an absolute measure:
+     *    the approximation stops once the *relative change* in the estimate
+     *    between iterations is smaller than tol.
      */
-    void Train(const arma::mat& rdata = arma::mat());
-
-    /* Access to referenceSet. */
-    arma::mat& ReferenceSet(void) { return referenceSet; };
-    /* Change referenceSet. */
-    void ReferenceSet(arma::mat& rdata) { referenceSet = rdata; };
-
-    /* Access to tolerance. */
-    double Tol(void) const { return tol; };
-    /* Change tolerance. */
-    void Tol(double tolerance) { tol = tolerance; };
+    void Train(const arma::mat& rdata, const double tol = 1e-8);
 
     // Access to Gamma Distribution Parameters.
     /* Get alpha parameters of each dimension */
@@ -71,8 +54,6 @@ class GammaDistribution
   private:
     arma::Col<double> alpha; // Array of fitted alphas.
     arma::Col<double> beta; // Array of fitted betas.
-    arma::mat referenceSet; // Matrix of reference set.
-    double tol; // Convergence tolerance.
 
     /**
      * This is a small function that returns true if the update of alpha is smaller
@@ -80,8 +61,12 @@ class GammaDistribution
      *
      * @param aOld old value of parameter we want to estimate (alpha in our case).
      * @param aNew new value of parameter (the value after 1 iteration from aOld).
+     * @param tol Convergence tolerance. Relative measure (see documentation of
+     * GammaDistribution::Train)
      */
-    inline bool converged(double aOld, double aNew);
+    inline bool converged(const double aOld, 
+                          const double aNew, 
+                          const double tol);
 };
 
 } // namespace distributions.
diff --git a/src/mlpack/tests/distribution_test.cpp b/src/mlpack/tests/distribution_test.cpp
index 3d754de..3862548 100644
--- a/src/mlpack/tests/distribution_test.cpp
+++ b/src/mlpack/tests/distribution_test.cpp
@@ -390,45 +390,8 @@ BOOST_AUTO_TEST_CASE(GaussianDistributionTrainTest)
 /******************************/
 
 /**
- * Make sure that both the default constructor and parameterized constructor
- * create the GammaDistribution object properly.
- */
-BOOST_AUTO_TEST_CASE(GammaDistributionConstructorTest)
-{
-  // Empty constructor test. Make sure reference set is empty.
-  GammaDistribution gDist;
-  BOOST_REQUIRE_EQUAL(arma::size(gDist.ReferenceSet()),
-      arma::size(arma::mat()));
-
-  // Parameterized constructor test. Make sure reference set is passed
-  // correctly.
-  // Create an Nxd reference set
-  size_t N = 200;
-  size_t d = 3;
-  arma::mat rdata(d, N);
-  
-  double alphaReal = 5.3;
-  double betaReal = 1.5;
-
-  // Create gamma distribution.
-  std::default_random_engine generator;
-  std::gamma_distribution<double> dist(alphaReal, betaReal);
-
-  // Random generation of gamma-like points.
-  for (size_t j = 0; j < d; ++j)
-    for (size_t i = 0; i < N; ++i)
-      rdata(j, i) = dist(generator);
-
-  // Create Gamma object.
-  GammaDistribution gDist2(rdata);
-  BOOST_REQUIRE_EQUAL(arma::size(gDist2.ReferenceSet()),
-      arma::size(arma::mat(d, N)));
-}
-
-/**
- * Make sure that constructing an object with one reference set and then asking
- * to fit another works properly: The object retains the old reference set, but
- * fits parameters (alpha, beta) to the new reference set.
+ * Make sure that using an object to fit one reference set and then asking
+ * to fit another works properly.
  */
 BOOST_AUTO_TEST_CASE(GammaDistributionTrainTest)
 {
@@ -449,8 +412,8 @@ BOOST_AUTO_TEST_CASE(GammaDistributionTrainTest)
       rdata(j, i) = dist(generator);
 
   // Create Gamma object and call Train() on reference set.
-  GammaDistribution gDist(rdata);
-  gDist.Train();
+  GammaDistribution gDist;
+  gDist.Train(rdata);
 
   // Training must estimate d pairs of alpha and beta parameters.
   BOOST_REQUIRE_EQUAL(gDist.Alpha().n_elem, d);
@@ -505,8 +468,8 @@ BOOST_AUTO_TEST_CASE(GammaDistributionFittingTest)
       rdata(j, i) = dist(generator);
 
   // Create Gamma object and call Train() on reference set.
-  GammaDistribution gDist(rdata);
-  gDist.Train();
+  GammaDistribution gDist;
+  gDist.Train(rdata);
 
   // Estimated parameter must be close to real.
   BOOST_REQUIRE_CLOSE(gDist.Alpha()[0], alphaReal, errorTolerance);
@@ -527,8 +490,8 @@ BOOST_AUTO_TEST_CASE(GammaDistributionFittingTest)
       rdata2(j, i) = dist2(generator2);
 
   // Create Gamma object and call Train() on reference set.
-  GammaDistribution gDist2(rdata2);
-  gDist2.Train();
+  GammaDistribution gDist2;
+  gDist2.Train(rdata2);
 
   // Estimated parameter must be close to real.
   BOOST_REQUIRE_CLOSE(gDist2.Alpha()[0], alphaReal2, errorTolerance);