[mlpack-svn] r15761 - mlpack/trunk/src/mlpack/methods/pca
fastlab-svn at coffeetalk-1.cc.gatech.edu
fastlab-svn at coffeetalk-1.cc.gatech.edu
Wed Sep 11 17:22:02 EDT 2013
Author: rcurtin
Date: Wed Sep 11 17:22:01 2013
New Revision: 15761
Log:
Add some new functions from Sumedh.
Modified:
mlpack/trunk/src/mlpack/methods/pca/pca.cpp
mlpack/trunk/src/mlpack/methods/pca/pca.hpp
Modified: mlpack/trunk/src/mlpack/methods/pca/pca.cpp
==============================================================================
--- mlpack/trunk/src/mlpack/methods/pca/pca.cpp (original)
+++ mlpack/trunk/src/mlpack/methods/pca/pca.cpp Wed Sep 11 17:22:01 2013
@@ -31,6 +31,8 @@
arma::vec& eigVal,
arma::mat& coeff) const
{
+ Timer::Start("pca");
+
// This matrix will store the right singular vectors; we do not need them.
arma::mat v;
@@ -72,6 +74,8 @@
// Project the samples to the principals.
transformedData = arma::trans(coeff) * centeredData;
+
+ Timer::Stop("pca");
}
/**
@@ -90,22 +94,77 @@
}
/**
- * Apply Dimensionality Reduction using Principal Component Analysis
- * to the provided data set.
+ * Use PCA for dimensionality reduction on the given dataset. This will save
+ * the newDimension largest principal components of the data and remove the
+ * rest. The parameter returned is the amount of variance of the data that is
+ * retained; this is a value between 0 and 1. For instance, a value of 0.9
+ * indicates that 90% of the variance present in the data was retained.
+ *
+ * @param data Data matrix.
+ * @param newDimension New dimension of the data.
+ * @return Amount of the variance of the data retained (between 0 and 1).
+ */
+double PCA::Apply(arma::mat& data, const size_t newDimension) const
+{
+ // Parameter validation.
+ if (newDimension == 0)
+ Log::Fatal << "PCA::Apply(): newDimension (" << newDimension << ") cannot "
+ << "be zero!" << endl;
+ if (newDimension > data.n_rows)
+ Log::Fatal << "PCA::Apply(): newDimension (" << newDimension << ") cannot "
+ << "be greater than the existing dimensionality of the data ("
+ << data.n_rows << ")!" << endl;
+
+ arma::mat coeffs;
+ arma::vec eigVal;
+
+ // Transform in place: the same matrix is passed as both input and output.
+ Apply(data, data, eigVal, coeffs);
+
+ // Drop unnecessary rows. When newDimension equals the number of rows there
+ // is nothing to drop, and shed_rows(n_rows, n_rows - 1) would be an invalid
+ // (first > last) range, so only shed when rows actually need removing.
+ if (newDimension < data.n_rows)
+ data.shed_rows(newDimension, data.n_rows - 1);
+
+ // Calculate the total amount of variance retained: the share of the
+ // eigenvalue sum that belongs to the first newDimension components.
+ return (sum(eigVal.subvec(0, newDimension - 1)) / sum(eigVal));
+}
+
+/**
+ * Use PCA for dimensionality reduction on the given dataset. This will save
+ * as many dimensions as necessary to retain at least the given amount of
+ * variance (specified by parameter varRetained). The amount should be
+ * between 0 and 1; if the amount is 0, then only 1 dimension will be
+ * retained. If the amount is 1, then all dimensions will be retained.
*
- * @param data - M x N Data matrix
- * @param newDimension - matrix consisting of N column vectors,
- * where each vector is the projection of the corresponding data vector
- * from data matrix onto the basis vectors contained in the columns of
- * coeff/eigen vector matrix with only newDimension number of columns chosen.
+ * The method returns the actual amount of variance retained, which will
+ * always be greater than or equal to the varRetained parameter.
*/
-void PCA::Apply(arma::mat& data, const size_t newDimension) const
+double PCA::Apply(arma::mat& data, const double varRetained) const
{
+ // Parameter validation.
+ if (varRetained < 0)
+ Log::Fatal << "PCA::Apply(): varRetained (" << varRetained << ") must be "
+ << "greater than or equal to 0." << endl;
+ if (varRetained > 1)
+ Log::Fatal << "PCA::Apply(): varRetained (" << varRetained << ") should be "
+ << "less than or equal to 1." << endl;
+
arma::mat coeffs;
arma::vec eigVal;
Apply(data, data, eigVal, coeffs);
- if (newDimension < coeffs.n_rows && newDimension > 0)
+ // Calculate the dimension we should keep.
+ size_t newDimension = 0;
+ double varSum = 0.0;
+ eigVal /= arma::sum(eigVal); // Normalize eigenvalues.
+ // Always accumulate at least one component, so that varRetained == 0 keeps
+ // exactly one dimension (as documented) instead of shedding every row.
+ while (((varSum < varRetained) || (newDimension == 0)) &&
+ (newDimension < eigVal.n_elem))
+ {
+ varSum += eigVal[newDimension];
+ ++newDimension;
+ }
+
+ // varSum is the actual variance we will retain.
+ if (newDimension < eigVal.n_elem)
data.shed_rows(newDimension, data.n_rows - 1);
+
+ return varSum;
}
Modified: mlpack/trunk/src/mlpack/methods/pca/pca.hpp
==============================================================================
--- mlpack/trunk/src/mlpack/methods/pca/pca.hpp (original)
+++ mlpack/trunk/src/mlpack/methods/pca/pca.hpp Wed Sep 11 17:22:01 2013
@@ -32,7 +32,8 @@
PCA(const bool scaleData = false);
/**
- * Apply Principal Component Analysis to the provided data set.
+ * Apply Principal Component Analysis to the provided data set. It is safe to
+ * pass the same matrix reference for both data and transformedData.
*
* @param data Data matrix.
* @param transformedData Matrix to put results of PCA into.
@@ -45,7 +46,8 @@
arma::mat& eigvec) const;
/**
- * Apply Principal Component Analysis to the provided data set.
+ * Apply Principal Component Analysis to the provided data set. It is safe to
+ * pass the same matrix reference for both data and transformedData.
*
* @param data Data matrix.
* @param transformedData Matrix to store results of PCA in.
@@ -58,12 +60,38 @@
/**
* Use PCA for dimensionality reduction on the given dataset. This will save
* the newDimension largest principal components of the data and remove the
- * rest.
+ * rest. The parameter returned is the amount of variance of the data that is
+ * retained; this is a value between 0 and 1. For instance, a value of 0.9
+ * indicates that 90% of the variance present in the data was retained.
*
* @param data Data matrix.
* @param newDimension New dimension of the data.
+ * @return Amount of the variance of the data retained (between 0 and 1).
*/
- void Apply(arma::mat& data, const size_t newDimension) const;
+ double Apply(arma::mat& data, const size_t newDimension) const;
+
+ //! Convenience overload for int arguments: it forwards to the size_t
+ //! (newDimension) version after an explicit conversion, so that a plain
+ //! integer such as 5 selects that version and not the double (varRetained)
+ //! one. Note that a negative int converts to a very large size_t value.
+ inline double Apply(arma::mat& data, const int newDimension) const
+ {
+ return Apply(data, size_t(newDimension));
+ }
+
+ /**
+ * Use PCA for dimensionality reduction on the given dataset. This will save
+ * as many dimensions as necessary to retain at least the given amount of
+ * variance (specified by parameter varRetained). The amount should be
+ * between 0 and 1; if the amount is 0, then only 1 dimension will be
+ * retained. If the amount is 1, then all dimensions will be retained.
+ *
+ * The method returns the actual amount of variance retained, which will
+ * always be greater than or equal to the varRetained parameter.
+ *
+ * @param data Data matrix.
+ * @param varRetained Lower bound on amount of variance to retain; should be
+ * between 0 and 1.
+ * @return Actual amount of variance retained (between 0 and 1).
+ */
+ double Apply(arma::mat& data, const double varRetained) const;
//! Get whether or not this PCA object will scale (by standard deviation) the
//! data when PCA is performed.
More information about the mlpack-svn
mailing list