[mlpack-git] master: Adds a GammaDistribution object for estimating gamma parameters (5293b50)
gitdub at mlpack.org
gitdub at mlpack.org
Wed Jul 20 06:58:26 EDT 2016
Repository : https://github.com/mlpack/mlpack
On branch : master
Link : https://github.com/mlpack/mlpack/compare/ba850f782a53c5a77b7985f7647f609bd96cb5e7...2c026d838925df436d967439899813da5d34c702
>---------------------------------------------------------------
commit 5293b5037a19735d70937bef7bc97838ca542bbc
Author: Yannis Mentekidis <mentekid at gmail.com>
Date: Wed Jul 20 11:58:26 2016 +0100
Adds a GammaDistribution object for estimating gamma parameters
>---------------------------------------------------------------
5293b5037a19735d70937bef7bc97838ca542bbc
src/mlpack/core.hpp | 1 +
src/mlpack/core/dists/CMakeLists.txt | 2 +
src/mlpack/core/dists/gamma_distribution.cpp | 76 +++++++++++++
src/mlpack/core/dists/gamma_distribution.hpp | 91 +++++++++++++++
src/mlpack/tests/distribution_test.cpp | 163 ++++++++++++++++++++++++++-
5 files changed, 332 insertions(+), 1 deletion(-)
diff --git a/src/mlpack/core.hpp b/src/mlpack/core.hpp
index 1b2226c..e3c8f89 100644
--- a/src/mlpack/core.hpp
+++ b/src/mlpack/core.hpp
@@ -215,6 +215,7 @@
#include <mlpack/core/dists/discrete_distribution.hpp>
#include <mlpack/core/dists/gaussian_distribution.hpp>
#include <mlpack/core/dists/laplace_distribution.hpp>
+#include <mlpack/core/dists/gamma_distribution.hpp>
//mlpack::backtrace only for linux
#ifdef HAS_BFD_DL
#include <mlpack/core/util/backtrace.hpp>
diff --git a/src/mlpack/core/dists/CMakeLists.txt b/src/mlpack/core/dists/CMakeLists.txt
index 536a7b6..a98c8db 100644
--- a/src/mlpack/core/dists/CMakeLists.txt
+++ b/src/mlpack/core/dists/CMakeLists.txt
@@ -9,6 +9,8 @@ set(SOURCES
laplace_distribution.cpp
regression_distribution.hpp
regression_distribution.cpp
+ gamma_distribution.hpp
+ gamma_distribution.cpp
)
# add directory name to sources
diff --git a/src/mlpack/core/dists/gamma_distribution.cpp b/src/mlpack/core/dists/gamma_distribution.cpp
new file mode 100644
index 0000000..bb52ce6
--- /dev/null
+++ b/src/mlpack/core/dists/gamma_distribution.cpp
@@ -0,0 +1,76 @@
+/**
+ * @file gamma_distribution.cpp
+ * @author Yannis Mentekidis
+ *
+ * Implementation of the methods of GammaDistribution.
+ */
+#include "gamma_distribution.hpp"
+
+#include <boost/math/special_functions/digamma.hpp>
+#include <boost/math/special_functions/trigamma.hpp>
+
+using namespace mlpack;
+using namespace mlpack::distribution;
+
+// Returns true if computation converged.
+inline bool GammaDistribution::converged(double aOld, double aNew)
+{
+ return (std::abs(aNew - aOld) / aNew) < tol;
+}
+
+// Fits an alpha and beta parameter to each dimension of the data.
+void GammaDistribution::Train(const arma::mat& rdata)
+{
+ // Use pseudonym fittingSet regardless of if rdata was provided by user.
+ const arma::mat& fittingSet =
+ rdata.n_elem == 0 ? referenceSet : rdata;
+
+ // If fittingSet is empty, nothing to do.
+ if (arma::size(fittingSet) == arma::size(arma::mat()))
+ return;
+
+ // Use boost's definitions of digamma and tgamma, and std::log.
+ using boost::math::digamma;
+ using boost::math::trigamma;
+ using std::log;
+
+ // Allocate space for alphas and betas (Assume independent rows).
+ alpha.set_size(fittingSet.n_rows);
+ beta.set_size(fittingSet.n_rows);
+
+ // Treat each dimension (i.e. row) independently.
+ for (size_t row = 0; row < fittingSet.n_rows; ++row)
+ {
+ // Calculate log(mean(x)) and mean(log(x)) required for fitting process.
+ const double meanLogx = arma::mean(arma::log(fittingSet.row(row)));
+ const double meanx = arma::mean(fittingSet.row(row));
+ const double logMeanx = std::log(arma::mean(meanx));
+
+ // Starting point for Generalized Newton.
+ double aEst = 0.5 / (logMeanx - meanLogx);
+ double aOld;
+
+ // Newton's method: In each step, make an update to aEst. If value didn't
+ // change much (abs(aNew - aEst)/aEst < tol), then stop.
+ do
+ {
+ // Needed for convergence test.
+ aOld = aEst;
+
+ // Calculate new value for alpha.
+ double nominator = meanLogx - logMeanx + log(aEst) - digamma(aEst);
+ double denominator = pow(aEst, 2) * (1 / aEst - trigamma(aEst));
+ assert (denominator != 0); // Protect against division by 0.
+ aEst = 1.0 / ((1.0 / aEst) + nominator / denominator);
+
+ // Protect against nan values (aEst will be passed to logarithm).
+ assert(aEst > 0);
+
+ } while (! converged(aEst, aOld) );
+
+ alpha(row) = aEst;
+ beta(row) = meanx/aEst;
+ }
+ return;
+}
+
diff --git a/src/mlpack/core/dists/gamma_distribution.hpp b/src/mlpack/core/dists/gamma_distribution.hpp
new file mode 100644
index 0000000..9562194
--- /dev/null
+++ b/src/mlpack/core/dists/gamma_distribution.hpp
@@ -0,0 +1,91 @@
+/**
+ * @file gamma_distribution.hpp
+ * @author Yannis Mentekidis
+ *
+ * Implementation of a Gamma distribution of multidimensional data that fits
+ * gamma parameters (alpha, beta) to data. Assumes each data dimension is
+ * uncorrelated.
+ *
+ * Based on "Estimating a Gamma Distribution" by Thomas P. Minka:
+ * research.microsoft.com/~minka/papers/minka-gamma.pdf
+ */
+
+#ifndef _MLPACK_CORE_DISTRIBUTIONS_GAMMA_DISTRIBUTION_HPP
+#define _MLPACK_CORE_DISTRIBUTIONS_GAMMA_DISTRIBUTION_HPP
+
+#include <mlpack/core.hpp>
+namespace mlpack{
+namespace distribution{
+
+
+/**
+ * Class for fitting the Gamma Distribution to a dataset.
+ */
+class GammaDistribution
+{
+ public:
+ /**
+ * Empty constructor. Set tolerance to default.
+ */
+ GammaDistribution() : tol(1e-8)
+ { /* Nothing to do. */ };
+
+ /**
+ * Constructor with reference data parameter.
+ *
+ * @param rdata Reference data for the object.
+ */
+ GammaDistribution(arma::mat& rdata) : referenceSet(rdata), tol(1e-8)
+ { /* Nothing to do. */ };
+
+ /**
+ * Destructor,
+ */
+ ~GammaDistribution() {};
+
+ /**
+ * This function trains (fits distribution parameters) to new data or data
+ * object was constructed with.
+ *
+ * @param rdata Reference data to fit parameters to. If not specified,
+ * reference data will be used. Results are stored in the alpha and beta
+ * vectors.
+ */
+ void Train(const arma::mat& rdata = arma::mat());
+
+ /* Access to referenceSet. */
+ arma::mat& ReferenceSet(void) { return referenceSet; };
+ /* Change referenceSet. */
+ void ReferenceSet(arma::mat& rdata) { referenceSet = rdata; };
+
+ /* Access to tolerance. */
+ double Tol(void) const { return tol; };
+ /* Change tolerance. */
+ void Tol(double tolerance) { tol = tolerance; };
+
+ // Access to Gamma Distribution Parameters.
+ /* Get alpha parameters of each dimension */
+ arma::Col<double>& Alpha(void) { return alpha; };
+ /* Get beta parameters of each dimension */
+ arma::Col<double>& Beta(void) { return beta; };
+
+ private:
+ arma::Col<double> alpha; // Array of fitted alphas.
+ arma::Col<double> beta; // Array of fitted betas.
+ arma::mat referenceSet; // Matrix of reference set.
+ double tol; // Convergence tolerance.
+
+ /**
+ * This is a small function that returns true if the update of alpha is smaller
+ * than the tolerance ratio.
+ *
+ * @param aOld old value of parameter we want to estimate (alpha in our case).
+ * @param aNew new value of parameter (the value after 1 iteration from aOld).
+ */
+ inline bool converged(double aOld, double aNew);
+};
+
+} // namespace distributions.
+} // namespace mlpack.
+
+#endif
diff --git a/src/mlpack/tests/distribution_test.cpp b/src/mlpack/tests/distribution_test.cpp
index d0261dd..e605134 100644
--- a/src/mlpack/tests/distribution_test.cpp
+++ b/src/mlpack/tests/distribution_test.cpp
@@ -1,8 +1,12 @@
/**
* @file distribution_test.cpp
* @author Ryan Curtin
+ * @author Yannis Mentekidis
*
- * Test for the mlpack::distribution::DiscreteDistribution class.
+ * Tests for the classes:
+ * * mlpack::distribution::DiscreteDistribution
+ * * mlpack::distribution::GaussianDistribution
+ * * mlpack::distribution::GammaDistribution
*/
#include <mlpack/core.hpp>
@@ -14,6 +18,10 @@ using namespace mlpack::distribution;
BOOST_AUTO_TEST_SUITE(DistributionTest);
+/*********************************/
+/** Discrete Distribution Tests **/
+/*********************************/
+
/**
* Make sure we initialize correctly.
*/
@@ -105,6 +113,10 @@ BOOST_AUTO_TEST_CASE(DiscreteDistributionTrainProbTest)
BOOST_REQUIRE_CLOSE(d.Probability("2"), 0.5, 1e-5);
}
+/*********************************/
+/** Gaussian Distribution Tests **/
+/*********************************/
+
/**
* Make sure Gaussian distributions are initialized correctly.
*/
@@ -373,4 +385,153 @@ BOOST_AUTO_TEST_CASE(GaussianDistributionTrainTest)
BOOST_REQUIRE_SMALL(d.Covariance()(i, j) - actualCov(i, j), 1e-5);
}
+/******************************/
+/** Gamma Distribution Tests **/
+/******************************/
+
+/**
+ * Make sure that both the default constructor and parameterized constructor
+ * create the GammaDistribution object properly.
+ */
+BOOST_AUTO_TEST_CASE(GammaDistributionConstructorTest)
+{
+ // Empty constructor test. Make sure reference set is empty.
+ GammaDistribution gDist;
+ BOOST_REQUIRE_EQUAL(arma::size(gDist.ReferenceSet()), arma::size(arma::mat()));
+
+ // Parameterized constructor test. Make sure reference set is passed
+ // correctly.
+ // Create an Nxd reference set
+ size_t N = 200;
+ size_t d = 3;
+ arma::mat rdata(d, N);
+
+ double alphaReal = 5.3;
+ double betaReal = 1.5;
+
+ // Create gamma distribution.
+ std::default_random_engine generator;
+ std::gamma_distribution<double> dist(alphaReal, betaReal);
+
+ // Random generation of gamma-like points.
+ for (size_t j = 0; j < d; ++j)
+ for (size_t i = 0; i < N; ++i)
+ rdata(j, i) = dist(generator);
+
+ // Create Gamma object.
+ GammaDistribution gDist2(rdata);
+ BOOST_REQUIRE_EQUAL(arma::size(gDist2.ReferenceSet()), arma::size(arma::mat(d, N)));
+}
+
+/**
+ * Make sure that constructing an object with one reference set and then asking
+ * to fit another works properly: The object retains the old reference set, but
+ * fits parameters (alpha, beta) to the new reference set.
+ */
+BOOST_AUTO_TEST_CASE(GammaDistributionTrainTest)
+{
+ // Create a gamma distribution random generator.
+ double alphaReal = 5.3;
+ double betaReal = 1.5;
+ std::default_random_engine generator;
+ std::gamma_distribution<double> dist(alphaReal, betaReal);
+
+ // Create a N x d gamma distribution data and fit the results.
+ size_t N = 200;
+ size_t d = 2;
+ arma::mat rdata(d, N);
+
+ // Random generation of gamma-like points.
+ for (size_t j = 0; j < d; ++j)
+ for (size_t i = 0; i < N; ++i)
+ rdata(j, i) = dist(generator);
+
+ // Create Gamma object and call Train() on reference set.
+ GammaDistribution gDist(rdata);
+ gDist.Train();
+
+ // Training must estimate d pairs of alpha and beta parameters.
+ BOOST_REQUIRE_EQUAL(gDist.Alpha().n_elem, d);
+ BOOST_REQUIRE_EQUAL(gDist.Beta().n_elem, d);
+
+ // Create a N' x d' gamma distribution, fit results without new object.
+ size_t N2 = 350;
+ size_t d2 = 4;
+ arma::mat rdata2(d2, N2);
+
+ // Random generation of gamma-like points.
+ for (size_t j = 0; j < d2; ++j)
+ for (size_t i = 0; i < N2; ++i)
+ rdata2(j, i) = dist(generator);
+
+ // Fit results using old object.
+ gDist.Train(rdata2);
+
+ // Training must estimate d' pairs of alpha and beta parameters.
+ BOOST_REQUIRE_EQUAL(gDist.Alpha().n_elem, d2);
+ BOOST_REQUIRE_EQUAL(gDist.Beta().n_elem, d2);
+
+}
+
+/**
+ * This test verifies that the fitting procedure for GammaDistribution works
+ * properly and converges near the actual gamma parameters. We do this twice
+ * with different alpha/beta parameters so we make sure we don't have some weird
+ * bug that always converges to the same number.
+ */
+BOOST_AUTO_TEST_CASE(GammaDistributionFittingTest)
+{
+ // Offset from the actual alpha/beta. 10% is quite a relaxed tolerance since
+ // the random points we generate are few (for test speed) and might be fitted
+ // better by a similar distribution.
+ double errorTolerance = 10;
+
+ size_t N = 500;
+ size_t d = 1; // Only 1 dimension is required for this.
+
+ /** Iteration 1 (first parameter set) **/
+
+ // Create a gamma-random generator and data
+ double alphaReal = 5.3;
+ double betaReal = 1.5;
+ std::default_random_engine generator;
+ std::gamma_distribution<double> dist(alphaReal, betaReal);
+
+ // Random generation of gamma-like points.
+ arma::mat rdata(d, N);
+ for (size_t j = 0; j < d; ++j)
+ for (size_t i = 0; i < N; ++i)
+ rdata(j, i) = dist(generator);
+
+ // Create Gamma object and call Train() on reference set.
+ GammaDistribution gDist(rdata);
+ gDist.Train();
+
+ // Estimated parameter must be close to real.
+ BOOST_REQUIRE_CLOSE(gDist.Alpha()[0], alphaReal, errorTolerance);
+ BOOST_REQUIRE_CLOSE(gDist.Beta()[0], betaReal, errorTolerance);
+
+ /** Iteration 2 (different parameter set) **/
+
+ // Create a gamma-random generator and data
+ double alphaReal2 = 7.2;
+ double betaReal2 = 0.9;
+ std::default_random_engine generator2;
+ std::gamma_distribution<double> dist2(alphaReal2, betaReal2);
+
+ // Random generation of gamma-like points.
+ arma::mat rdata2(d, N);
+ for (size_t j = 0; j < d; ++j)
+ for (size_t i = 0; i < N; ++i)
+ rdata2(j, i) = dist2(generator2);
+
+ // Create Gamma object and call Train() on reference set.
+ GammaDistribution gDist2(rdata2);
+ gDist2.Train();
+
+ // Estimated parameter must be close to real.
+ BOOST_REQUIRE_CLOSE(gDist2.Alpha()[0], alphaReal2, errorTolerance);
+ BOOST_REQUIRE_CLOSE(gDist2.Beta()[0], betaReal2, errorTolerance);
+}
+
BOOST_AUTO_TEST_SUITE_END();
More information about the mlpack-git
mailing list