[mlpack-git] master: Adds a GammaDistribution object for estimating gamma parameters (5293b50)

Wed Jul 20 06:58:26 EDT 2016

Repository : https://github.com/mlpack/mlpack
On branch  : master
Link       : https://github.com/mlpack/mlpack/compare/ba850f782a53c5a77b7985f7647f609bd96cb5e7...2c026d838925df436d967439899813da5d34c702

>---------------------------------------------------------------

commit 5293b5037a19735d70937bef7bc97838ca542bbc
Author: Yannis Mentekidis <mentekid at gmail.com>
Date:   Wed Jul 20 11:58:26 2016 +0100

    Adds a GammaDistribution object for estimating gamma parameters


>---------------------------------------------------------------

5293b5037a19735d70937bef7bc97838ca542bbc
 src/mlpack/core.hpp                          |   1 +
 src/mlpack/core/dists/CMakeLists.txt         |   2 +
 src/mlpack/core/dists/gamma_distribution.cpp |  76 +++++++++++++
 src/mlpack/core/dists/gamma_distribution.hpp |  91 +++++++++++++++
 src/mlpack/tests/distribution_test.cpp       | 163 ++++++++++++++++++++++++++-
 5 files changed, 332 insertions(+), 1 deletion(-)

diff --git a/src/mlpack/core.hpp b/src/mlpack/core.hpp
index 1b2226c..e3c8f89 100644
--- a/src/mlpack/core.hpp
+++ b/src/mlpack/core.hpp
@@ -215,6 +215,7 @@
 #include <mlpack/core/dists/discrete_distribution.hpp>
 #include <mlpack/core/dists/gaussian_distribution.hpp>
 #include <mlpack/core/dists/laplace_distribution.hpp>
+#include <mlpack/core/dists/gamma_distribution.hpp>
 //mlpack::backtrace only for linux
 #ifdef HAS_BFD_DL
   #include <mlpack/core/util/backtrace.hpp>
diff --git a/src/mlpack/core/dists/CMakeLists.txt b/src/mlpack/core/dists/CMakeLists.txt
index 536a7b6..a98c8db 100644
--- a/src/mlpack/core/dists/CMakeLists.txt
+++ b/src/mlpack/core/dists/CMakeLists.txt
@@ -9,6 +9,8 @@ set(SOURCES
   laplace_distribution.cpp
   regression_distribution.hpp
   regression_distribution.cpp
+  gamma_distribution.hpp
+  gamma_distribution.cpp
 )
 
 # add directory name to sources
diff --git a/src/mlpack/core/dists/gamma_distribution.cpp b/src/mlpack/core/dists/gamma_distribution.cpp
new file mode 100644
index 0000000..bb52ce6
--- /dev/null
+++ b/src/mlpack/core/dists/gamma_distribution.cpp
@@ -0,0 +1,76 @@
+/**
+ * @file gamma_distribution.cpp
+ * @author Yannis Mentekidis
+ *
+ * Implementation of the methods of GammaDistribution.
+ */
+#include "gamma_distribution.hpp"
+
+#include <boost/math/special_functions/digamma.hpp>
+#include <boost/math/special_functions/trigamma.hpp>
+
+using namespace mlpack;
+using namespace mlpack::distribution;
+
+// Returns true if computation converged.
+inline bool GammaDistribution::converged(double aOld, double aNew)
+{
+  return (std::abs(aNew - aOld) / aNew) < tol;
+}
+
+// Fits an alpha and beta parameter to each dimension of the data.
+void GammaDistribution::Train(const arma::mat& rdata)
+{
+  // Use pseudonym fittingSet regardless of if rdata was provided by user.
+  const arma::mat& fittingSet = 
+    rdata.n_elem == 0 ? referenceSet : rdata;
+
+  // If fittingSet is empty, nothing to do.
+  if (arma::size(fittingSet) == arma::size(arma::mat()))
+    return;
+
+  // Use boost's definitions of digamma and tgamma, and std::log.
+  using boost::math::digamma;
+  using boost::math::trigamma;
+  using std::log;
+
+  // Allocate space for alphas and betas (Assume independent rows).
+  alpha.set_size(fittingSet.n_rows);
+  beta.set_size(fittingSet.n_rows);
+
+  // Treat each dimension (i.e. row) independently.
+  for (size_t row = 0; row < fittingSet.n_rows; ++row)
+  {
+    // Calculate log(mean(x)) and mean(log(x)) required for fitting process.
+    const double meanLogx = arma::mean(arma::log(fittingSet.row(row)));
+    const double meanx = arma::mean(fittingSet.row(row));
+    const double logMeanx = std::log(arma::mean(meanx));
+    
+    // Starting point for Generalized Newton.
+    double aEst = 0.5 / (logMeanx - meanLogx);
+    double aOld;
+
+    // Newton's method: In each step, make an update to aEst. If value didn't
+    // change much (abs(aNew - aEst)/aEst < tol), then stop.
+    do
+    {
+      // Needed for convergence test.
+      aOld = aEst;
+
+      // Calculate new value for alpha. 
+      double nominator = meanLogx - logMeanx + log(aEst) - digamma(aEst);
+      double denominator = pow(aEst, 2) * (1 / aEst - trigamma(aEst));
+      assert (denominator != 0); // Protect against division by 0.
+      aEst = 1.0 / ((1.0 / aEst) + nominator / denominator);
+
+      // Protect against nan values (aEst will be passed to logarithm).
+      assert(aEst > 0);
+
+    } while (! converged(aEst, aOld) );
+    
+    alpha(row) = aEst;
+    beta(row) = meanx/aEst;
+  }
+  return;
+}
+
diff --git a/src/mlpack/core/dists/gamma_distribution.hpp b/src/mlpack/core/dists/gamma_distribution.hpp
new file mode 100644
index 0000000..9562194
--- /dev/null
+++ b/src/mlpack/core/dists/gamma_distribution.hpp
@@ -0,0 +1,91 @@
+/**
+ * @file gamma_distribution.hpp
+ * @author Yannis Mentekidis
+ *
+ * Implementation of a Gamma distribution of multidimensional data that fits
+ * gamma parameters (alpha, beta) to data. Assumes each data dimension is
+ * uncorrelated.
+ *
+ * Based on "Estimating a Gamma Distribution" by Thomas P. Minka:
+ * research.microsoft.com/~minka/papers/minka-gamma.pdf
+ */
+
+#ifndef _MLPACK_CORE_DISTRIBUTIONS_GAMMA_DISTRIBUTION_HPP 
+#define _MLPACK_CORE_DISTRIBUTIONS_GAMMA_DISTRIBUTION_HPP
+
+#include <mlpack/core.hpp>
+namespace mlpack{
+namespace distribution{
+
+
+/**
+ * Class for fitting the Gamma Distribution to a dataset.
+ */
+class GammaDistribution
+{
+  public:
+    /**
+     * Empty constructor. Set tolerance to default.
+     */
+    GammaDistribution() : tol(1e-8)
+    { /* Nothing to do. */ };
+
+    /**
+     * Constructor with reference data parameter.
+     *
+     * @param rdata Reference data for the object.
+     */
+    GammaDistribution(arma::mat& rdata) : referenceSet(rdata), tol(1e-8)
+    { /* Nothing to do. */ };
+
+    /**
+     * Destructor,
+     */
+    ~GammaDistribution() {};
+
+    /**
+     * This function trains (fits distribution parameters) to new data or data
+     * object was constructed with.
+     *
+     * @param rdata Reference data to fit parameters to. If not specified,
+     *    reference data will be used. Results are stored in the alpha and beta
+     *    vectors.
+     */
+    void Train(const arma::mat& rdata = arma::mat());
+
+    /* Access to referenceSet. */
+    arma::mat& ReferenceSet(void) { return referenceSet; };
+    /* Change referenceSet. */
+    void ReferenceSet(arma::mat& rdata) { referenceSet = rdata; };
+
+    /* Access to tolerance. */
+    double Tol(void) const { return tol; };
+    /* Change tolerance. */
+    void Tol(double tolerance) { tol = tolerance; };
+
+    // Access to Gamma Distribution Parameters.
+    /* Get alpha parameters of each dimension */
+    arma::Col<double>& Alpha(void) { return alpha; };
+    /* Get beta parameters of each dimension */
+    arma::Col<double>& Beta(void) { return beta; };
+
+  private:
+    arma::Col<double> alpha; // Array of fitted alphas.
+    arma::Col<double> beta; // Array of fitted betas.
+    arma::mat referenceSet; // Matrix of reference set.
+    double tol; // Convergence tolerance.
+
+    /**
+     * This is a small function that returns true if the update of alpha is smaller
+     * than the tolerance ratio.
+     *
+     * @param aOld old value of parameter we want to estimate (alpha in our case).
+     * @param aNew new value of parameter (the value after 1 iteration from aOld).
+     */
+    inline bool converged(double aOld, double aNew);
+};
+
+} // namespace distributions.
+} // namespace mlpack.
+
+#endif
diff --git a/src/mlpack/tests/distribution_test.cpp b/src/mlpack/tests/distribution_test.cpp
index d0261dd..e605134 100644
--- a/src/mlpack/tests/distribution_test.cpp
+++ b/src/mlpack/tests/distribution_test.cpp
@@ -1,8 +1,12 @@
 /**
  * @file distribution_test.cpp
  * @author Ryan Curtin
+ * @author Yannis Mentekidis
  *
- * Test for the mlpack::distribution::DiscreteDistribution class.
+ * Tests for the classes:
+ *  * mlpack::distribution::DiscreteDistribution
+ *  * mlpack::distribution::GaussianDistribution
+ *  * mlpack::distribution::GammaDistribution
  */
 #include <mlpack/core.hpp>
 
@@ -14,6 +18,10 @@ using namespace mlpack::distribution;
 
 BOOST_AUTO_TEST_SUITE(DistributionTest);
 
+/*********************************/
+/** Discrete Distribution Tests **/
+/*********************************/
+
 /**
  * Make sure we initialize correctly.
  */
@@ -105,6 +113,10 @@ BOOST_AUTO_TEST_CASE(DiscreteDistributionTrainProbTest)
   BOOST_REQUIRE_CLOSE(d.Probability("2"), 0.5, 1e-5);
 }
 
+/*********************************/
+/** Gaussian Distribution Tests **/
+/*********************************/
+
 /**
  * Make sure Gaussian distributions are initialized correctly.
  */
@@ -373,4 +385,153 @@ BOOST_AUTO_TEST_CASE(GaussianDistributionTrainTest)
       BOOST_REQUIRE_SMALL(d.Covariance()(i, j) - actualCov(i, j), 1e-5);
 }
 
+/******************************/
+/** Gamma Distribution Tests **/
+/******************************/
+
+/**
+ * Make sure that both the default constructor and parameterized constructor
+ * create the GammaDistribution object properly.
+ */
+BOOST_AUTO_TEST_CASE(GammaDistributionConstructorTest)
+{
+  // Empty constructor test. Make sure reference set is empty.
+  GammaDistribution gDist;
+  BOOST_REQUIRE_EQUAL(arma::size(gDist.ReferenceSet()), arma::size(arma::mat()));
+
+  // Parameterized constructor test. Make sure reference set is passed
+  // correctly.
+  // Create an Nxd reference set
+  size_t N = 200;
+  size_t d = 3;
+  arma::mat rdata(d, N);
+  
+  double alphaReal = 5.3;
+  double betaReal = 1.5;
+
+  // Create gamma distribution.
+  std::default_random_engine generator;
+  std::gamma_distribution<double> dist(alphaReal, betaReal);
+
+  // Random generation of gamma-like points.
+  for (size_t j = 0; j < d; ++j)
+    for (size_t i = 0; i < N; ++i)
+      rdata(j, i) = dist(generator);
+
+  // Create Gamma object.
+  GammaDistribution gDist2(rdata);
+  BOOST_REQUIRE_EQUAL(arma::size(gDist2.ReferenceSet()), arma::size(arma::mat(d, N)));
+}
+
+/**
+ * Make sure that constructing an object with one reference set and then asking
+ * to fit another works properly: The object retains the old reference set, but
+ * fits parameters (alpha, beta) to the new reference set.
+ */
+BOOST_AUTO_TEST_CASE(GammaDistributionTrainTest)
+{
+  // Create a gamma distribution random generator.
+  double alphaReal = 5.3;
+  double betaReal = 1.5;
+  std::default_random_engine generator;
+  std::gamma_distribution<double> dist(alphaReal, betaReal);
+
+  // Create a N x d gamma distribution data and fit the results.
+  size_t N = 200;
+  size_t d = 2;
+  arma::mat rdata(d, N);
+
+  // Random generation of gamma-like points.
+  for (size_t j = 0; j < d; ++j)
+    for (size_t i = 0; i < N; ++i)
+      rdata(j, i) = dist(generator);
+
+  // Create Gamma object and call Train() on reference set.
+  GammaDistribution gDist(rdata);
+  gDist.Train();
+
+  // Training must estimate d pairs of alpha and beta parameters.
+  BOOST_REQUIRE_EQUAL(gDist.Alpha().n_elem, d);
+  BOOST_REQUIRE_EQUAL(gDist.Beta().n_elem, d);
+
+  // Create a N' x d' gamma distribution, fit results without new object.
+  size_t N2 = 350;
+  size_t d2 = 4;
+  arma::mat rdata2(d2, N2);
+
+  // Random generation of gamma-like points.
+  for (size_t j = 0; j < d2; ++j)
+    for (size_t i = 0; i < N2; ++i)
+      rdata2(j, i) = dist(generator);
+
+  // Fit results using old object.
+  gDist.Train(rdata2);
+
+  // Training must estimate d' pairs of alpha and beta parameters.
+  BOOST_REQUIRE_EQUAL(gDist.Alpha().n_elem, d2);
+  BOOST_REQUIRE_EQUAL(gDist.Beta().n_elem, d2);
+
+}
+
+/**
+ * This test verifies that the fitting procedure for GammaDistribution works
+ * properly and converges near the actual gamma parameters. We do this twice
+ * with different alpha/beta parameters so we make sure we don't have some weird
+ * bug that always converges to the same number.
+ */
+BOOST_AUTO_TEST_CASE(GammaDistributionFittingTest)
+{
+  // Offset from the actual alpha/beta. 10% is quite a relaxed tolerance since
+  // the random points we generate are few (for test speed) and might be fitted
+  // better by a similar distribution.
+  double errorTolerance = 10;
+
+  size_t N = 500;
+  size_t d = 1; // Only 1 dimension is required for this.
+
+  /** Iteration 1 (first parameter set) **/
+  
+  // Create a gamma-random generator and data
+  double alphaReal = 5.3;
+  double betaReal = 1.5;
+  std::default_random_engine generator;
+  std::gamma_distribution<double> dist(alphaReal, betaReal);
+  
+  // Random generation of gamma-like points.
+  arma::mat rdata(d, N);
+  for (size_t j = 0; j < d; ++j)
+    for (size_t i = 0; i < N; ++i)
+      rdata(j, i) = dist(generator);
+
+  // Create Gamma object and call Train() on reference set.
+  GammaDistribution gDist(rdata);
+  gDist.Train();
+
+  // Estimated parameter must be close to real.
+  BOOST_REQUIRE_CLOSE(gDist.Alpha()[0], alphaReal, errorTolerance);
+  BOOST_REQUIRE_CLOSE(gDist.Beta()[0], betaReal, errorTolerance);
+
+  /** Iteration 2 (different parameter set) **/
+
+  // Create a gamma-random generator and data
+  double alphaReal2 = 7.2;
+  double betaReal2 = 0.9;
+  std::default_random_engine generator2;
+  std::gamma_distribution<double> dist2(alphaReal2, betaReal2);
+  
+  // Random generation of gamma-like points.
+  arma::mat rdata2(d, N);
+  for (size_t j = 0; j < d; ++j)
+    for (size_t i = 0; i < N; ++i)
+      rdata2(j, i) = dist2(generator2);
+
+  // Create Gamma object and call Train() on reference set.
+  GammaDistribution gDist2(rdata2);
+  gDist2.Train();
+
+  // Estimated parameter must be close to real.
+  BOOST_REQUIRE_CLOSE(gDist2.Alpha()[0], alphaReal2, errorTolerance);
+  BOOST_REQUIRE_CLOSE(gDist2.Beta()[0], betaReal2, errorTolerance);
+}
+
 BOOST_AUTO_TEST_SUITE_END();