[mlpack-git] master: Efficiency and documentation fixes for GammDistribution. (29cadf0)
gitdub at mlpack.org
gitdub at mlpack.org
Fri Jul 22 07:45:54 EDT 2016
Repository : https://github.com/mlpack/mlpack
On branch : master
Link : https://github.com/mlpack/mlpack/compare/ba850f782a53c5a77b7985f7647f609bd96cb5e7...2c026d838925df436d967439899813da5d34c702
>---------------------------------------------------------------
commit 29cadf08524c58565e51133253d217a24c837c64
Author: Yannis Mentekidis <mentekid at gmail.com>
Date: Fri Jul 22 12:45:54 2016 +0100
Efficiency and documentation fixes for GammDistribution.
>---------------------------------------------------------------
29cadf08524c58565e51133253d217a24c837c64
src/mlpack/core/dists/gamma_distribution.cpp | 36 +++++++++++--------
src/mlpack/core/dists/gamma_distribution.hpp | 47 +++++++++---------------
src/mlpack/tests/distribution_test.cpp | 53 +++++-----------------------
3 files changed, 45 insertions(+), 91 deletions(-)
diff --git a/src/mlpack/core/dists/gamma_distribution.cpp b/src/mlpack/core/dists/gamma_distribution.cpp
index 4d93917..c014077 100644
--- a/src/mlpack/core/dists/gamma_distribution.cpp
+++ b/src/mlpack/core/dists/gamma_distribution.cpp
@@ -12,20 +12,19 @@ using namespace mlpack;
using namespace mlpack::distribution;
// Returns true if computation converged.
-inline bool GammaDistribution::converged(double aOld, double aNew)
+inline bool GammaDistribution::converged(const double aOld,
+ const double aNew,
+ const double tol)
{
return (std::abs(aNew - aOld) / aNew) < tol;
}
// Fits an alpha and beta parameter to each dimension of the data.
-void GammaDistribution::Train(const arma::mat& rdata)
+void GammaDistribution::Train(const arma::mat& rdata, const double tol)
{
- // Use pseudonym fittingSet regardless of if rdata was provided by user.
- const arma::mat& fittingSet =
- rdata.n_elem == 0 ? referenceSet : rdata;
// If fittingSet is empty, nothing to do.
- if (arma::size(fittingSet) == arma::size(arma::mat()))
+ if (arma::size(rdata) == arma::size(arma::mat()))
return;
// Use boost's definitions of digamma and tgamma, and std::log.
@@ -34,16 +33,22 @@ void GammaDistribution::Train(const arma::mat& rdata)
using std::log;
// Allocate space for alphas and betas (Assume independent rows).
- alpha.set_size(fittingSet.n_rows);
- beta.set_size(fittingSet.n_rows);
+ alpha.set_size(rdata.n_rows);
+ beta.set_size(rdata.n_rows);
+
+ // Calculate log(mean(x)) and mean(log(x)) of each dataset row.
+ const arma::vec meanLogxVec = arma::mean(arma::log(rdata), 1);
+ const arma::vec meanxVec = arma::mean(rdata, 1);
+ const arma::vec logMeanxVec = arma::log(meanxVec);
// Treat each dimension (i.e. row) independently.
- for (size_t row = 0; row < fittingSet.n_rows; ++row)
+ for (size_t row = 0; row < rdata.n_rows; ++row)
{
- // Calculate log(mean(x)) and mean(log(x)) required for fitting process.
- const double meanLogx = arma::mean(arma::log(fittingSet.row(row)));
- const double meanx = arma::mean(fittingSet.row(row));
- const double logMeanx = std::log(arma::mean(meanx));
+
+ // Statistics for this row.
+ const double meanLogx = meanLogxVec(row);
+ const double meanx = meanxVec(row);
+ const double logMeanx = logMeanxVec(row);
// Starting point for Generalized Newton.
double aEst = 0.5 / (logMeanx - meanLogx);
@@ -63,9 +68,10 @@ void GammaDistribution::Train(const arma::mat& rdata)
aEst = 1.0 / ((1.0 / aEst) + nominator / denominator);
// Protect against nan values (aEst will be passed to logarithm).
- assert(aEst > 0);
+ if (aEst <= 0)
+ throw std::logic_error("GammaDistribution parameter alpha will be <=0");
- } while (! converged(aEst, aOld) );
+ } while (! converged(aEst, aOld, tol) );
alpha(row) = aEst;
beta(row) = meanx/aEst;
diff --git a/src/mlpack/core/dists/gamma_distribution.hpp b/src/mlpack/core/dists/gamma_distribution.hpp
index 18859ef..5763180 100644
--- a/src/mlpack/core/dists/gamma_distribution.hpp
+++ b/src/mlpack/core/dists/gamma_distribution.hpp
@@ -3,8 +3,9 @@
* @author Yannis Mentekidis
*
* Implementation of a Gamma distribution of multidimensional data that fits
- * gamma parameters (alpha, beta) to data. Assumes each data dimension is
- * uncorrelated.
+ * gamma parameters (alpha, beta) to data.
+ * The fitting is done independently for each dataset dimension (row), based on
+ * the assumption each dimension is fully indepeendent.
*
* Based on "Estimating a Gamma Distribution" by Thomas P. Minka:
* research.microsoft.com/~minka/papers/minka-gamma.pdf
@@ -24,21 +25,12 @@ class GammaDistribution
{
public:
/**
- * Empty constructor. Set tolerance to default.
+ * Empty constructor.
*/
- GammaDistribution() : tol(1e-8)
- { /* Nothing to do. */ };
+ GammaDistribution() { /* Nothing to do. */ };
/**
- * Constructor with reference data parameter.
- *
- * @param rdata Reference data for the object.
- */
- GammaDistribution(arma::mat& rdata) : referenceSet(rdata), tol(1e-8)
- { /* Nothing to do. */ };
-
- /**
- * Destructor,
+ * Destructor.
*/
~GammaDistribution() {};
@@ -46,21 +38,12 @@ class GammaDistribution
* This function trains (fits distribution parameters) to new data or the
* dataset the object owns.
*
- * @param rdata Reference data to fit parameters to. If not specified,
- * reference data will be used. Results are stored in the alpha and beta
- * vectors (old results are removed).
+ * @param rdata Reference data to fit parameters to.
+ * @param tol Convergence tolerance. This is *not* an absolute measure:
+ * It will stop the approximation once the *change* in the value is
+ * smaller than tol.
*/
- void Train(const arma::mat& rdata = arma::mat());
-
- /* Access to referenceSet. */
- arma::mat& ReferenceSet(void) { return referenceSet; };
- /* Change referenceSet. */
- void ReferenceSet(arma::mat& rdata) { referenceSet = rdata; };
-
- /* Access to tolerance. */
- double Tol(void) const { return tol; };
- /* Change tolerance. */
- void Tol(double tolerance) { tol = tolerance; };
+ void Train(const arma::mat& rdata, const double tol = 1e-8);
// Access to Gamma Distribution Parameters.
/* Get alpha parameters of each dimension */
@@ -71,8 +54,6 @@ class GammaDistribution
private:
arma::Col<double> alpha; // Array of fitted alphas.
arma::Col<double> beta; // Array of fitted betas.
- arma::mat referenceSet; // Matrix of reference set.
- double tol; // Convergence tolerance.
/**
* This is a small function that returns true if the update of alpha is smaller
@@ -80,8 +61,12 @@ class GammaDistribution
*
* @param aOld old value of parameter we want to estimate (alpha in our case).
* @param aNew new value of parameter (the value after 1 iteration from aOld).
+ * @param tol Convergence tolerance. Relative measure (see documentation of
+ * GammaDistribution::Train)
*/
- inline bool converged(double aOld, double aNew);
+ inline bool converged(const double aOld,
+ const double aNew,
+ const double tol);
};
} // namespace distributions.
diff --git a/src/mlpack/tests/distribution_test.cpp b/src/mlpack/tests/distribution_test.cpp
index 3d754de..3862548 100644
--- a/src/mlpack/tests/distribution_test.cpp
+++ b/src/mlpack/tests/distribution_test.cpp
@@ -390,45 +390,8 @@ BOOST_AUTO_TEST_CASE(GaussianDistributionTrainTest)
/******************************/
/**
- * Make sure that both the default constructor and parameterized constructor
- * create the GammaDistribution object properly.
- */
-BOOST_AUTO_TEST_CASE(GammaDistributionConstructorTest)
-{
- // Empty constructor test. Make sure reference set is empty.
- GammaDistribution gDist;
- BOOST_REQUIRE_EQUAL(arma::size(gDist.ReferenceSet()),
- arma::size(arma::mat()));
-
- // Parameterized constructor test. Make sure reference set is passed
- // correctly.
- // Create an Nxd reference set
- size_t N = 200;
- size_t d = 3;
- arma::mat rdata(d, N);
-
- double alphaReal = 5.3;
- double betaReal = 1.5;
-
- // Create gamma distribution.
- std::default_random_engine generator;
- std::gamma_distribution<double> dist(alphaReal, betaReal);
-
- // Random generation of gamma-like points.
- for (size_t j = 0; j < d; ++j)
- for (size_t i = 0; i < N; ++i)
- rdata(j, i) = dist(generator);
-
- // Create Gamma object.
- GammaDistribution gDist2(rdata);
- BOOST_REQUIRE_EQUAL(arma::size(gDist2.ReferenceSet()),
- arma::size(arma::mat(d, N)));
-}
-
-/**
- * Make sure that constructing an object with one reference set and then asking
- * to fit another works properly: The object retains the old reference set, but
- * fits parameters (alpha, beta) to the new reference set.
+ * Make sure that using an object to fit one reference set and then asking
+ * to fit another works properly.
*/
BOOST_AUTO_TEST_CASE(GammaDistributionTrainTest)
{
@@ -449,8 +412,8 @@ BOOST_AUTO_TEST_CASE(GammaDistributionTrainTest)
rdata(j, i) = dist(generator);
// Create Gamma object and call Train() on reference set.
- GammaDistribution gDist(rdata);
- gDist.Train();
+ GammaDistribution gDist;
+ gDist.Train(rdata);
// Training must estimate d pairs of alpha and beta parameters.
BOOST_REQUIRE_EQUAL(gDist.Alpha().n_elem, d);
@@ -505,8 +468,8 @@ BOOST_AUTO_TEST_CASE(GammaDistributionFittingTest)
rdata(j, i) = dist(generator);
// Create Gamma object and call Train() on reference set.
- GammaDistribution gDist(rdata);
- gDist.Train();
+ GammaDistribution gDist;
+ gDist.Train(rdata);
// Estimated parameter must be close to real.
BOOST_REQUIRE_CLOSE(gDist.Alpha()[0], alphaReal, errorTolerance);
@@ -527,8 +490,8 @@ BOOST_AUTO_TEST_CASE(GammaDistributionFittingTest)
rdata2(j, i) = dist2(generator2);
// Create Gamma object and call Train() on reference set.
- GammaDistribution gDist2(rdata2);
- gDist2.Train();
+ GammaDistribution gDist2;
+ gDist2.Train(rdata2);
// Estimated parameter must be close to real.
BOOST_REQUIRE_CLOSE(gDist2.Alpha()[0], alphaReal2, errorTolerance);
More information about the mlpack-git
mailing list