[mlpack-svn] r15761 - mlpack/trunk/src/mlpack/methods/pca

fastlab-svn at coffeetalk-1.cc.gatech.edu fastlab-svn at coffeetalk-1.cc.gatech.edu
Wed Sep 11 17:22:02 EDT 2013


Author: rcurtin
Date: Wed Sep 11 17:22:01 2013
New Revision: 15761

Log:
Add some new functions from Sumedh.


Modified:
   mlpack/trunk/src/mlpack/methods/pca/pca.cpp
   mlpack/trunk/src/mlpack/methods/pca/pca.hpp

Modified: mlpack/trunk/src/mlpack/methods/pca/pca.cpp
==============================================================================
--- mlpack/trunk/src/mlpack/methods/pca/pca.cpp	(original)
+++ mlpack/trunk/src/mlpack/methods/pca/pca.cpp	Wed Sep 11 17:22:01 2013
@@ -31,6 +31,8 @@
                 arma::vec& eigVal,
                 arma::mat& coeff) const
 {
+  Timer::Start("pca");
+
   // This matrix will store the right singular values; we do not need them.
   arma::mat v;
 
@@ -72,6 +74,8 @@
 
   // Project the samples to the principals.
   transformedData = arma::trans(coeff) * centeredData;
+
+  Timer::Stop("pca");
 }
 
 /**
@@ -90,22 +94,77 @@
 }
 
 /**
- * Apply Dimensionality Reduction using Principal Component Analysis
- * to the provided data set.
+ * Use PCA for dimensionality reduction on the given dataset.  This will save
+ * the newDimension largest principal components of the data and remove the
+ * rest.  The parameter returned is the amount of variance of the data that is
+ * retained; this is a value between 0 and 1.  For instance, a value of 0.9
+ * indicates that 90% of the variance present in the data was retained.
+ *
+ * @param data Data matrix.
+ * @param newDimension New dimension of the data.
+ * @return Amount of the variance of the data retained (between 0 and 1).
+ */
+double PCA::Apply(arma::mat& data, const size_t newDimension) const
+{
+  // Parameter validation.
+  if (newDimension == 0)
+    Log::Fatal << "PCA::Apply(): newDimension (" << newDimension << ") cannot "
+        << "be zero!" << endl;
+  if (newDimension > data.n_rows)
+    Log::Fatal << "PCA::Apply(): newDimension (" << newDimension << ") cannot "
+        << "be greater than the existing dimensionality of the data ("
+        << data.n_rows << ")!" << endl;
+
+  arma::mat coeffs;
+  arma::vec eigVal;
+
+  Apply(data, data, eigVal, coeffs);
+
+  // Drop unnecessary rows.
+  data.shed_rows(newDimension, data.n_rows - 1);
+
+  // Calculate the total amount of variance retained.
+  return (sum(eigVal.subvec(0, newDimension - 1)) / sum(eigVal));
+}
+
+/**
+ * Use PCA for dimensionality reduction on the given dataset.  This will save
+ * as many dimensions as necessary to retain at least the given amount of
+ * variance (specified by parameter varRetained).  The amount should be
+ * between 0 and 1.  Note that, as implemented, an amount of 0 retains no
+ * dimensions.  If the amount is 1, then all dimensions will be retained.
  *
- * @param data - M x N Data matrix
- * @param newDimension - matrix consisting of N column vectors,
- * where each vector is the projection of the corresponding data vector
- * from data matrix onto the basis vectors contained in the columns of
- * coeff/eigen vector matrix with only newDimension number of columns chosen.
+ * The method returns the actual amount of variance retained, which will
+ * always be greater than or equal to the varRetained parameter.
  */
-void PCA::Apply(arma::mat& data, const size_t newDimension) const
+double PCA::Apply(arma::mat& data, const double varRetained) const
 {
+  // Parameter validation.
+  if (varRetained < 0)
+    Log::Fatal << "PCA::Apply(): varRetained (" << varRetained << ") must be "
+        << "greater than or equal to 0." << endl;
+  if (varRetained > 1)
+    Log::Fatal << "PCA::Apply(): varRetained (" << varRetained << ") should be "
+        << "less than or equal to 1." << endl;
+
   arma::mat coeffs;
   arma::vec eigVal;
 
   Apply(data, data, eigVal, coeffs);
 
-  if (newDimension < coeffs.n_rows && newDimension > 0)
+  // Calculate the dimension we should keep.
+  size_t newDimension = 0;
+  double varSum = 0.0;
+  eigVal /= arma::sum(eigVal); // Normalize eigenvalues.
+  while ((varSum < varRetained) && (newDimension < eigVal.n_elem))
+  {
+    varSum += eigVal[newDimension];
+    ++newDimension;
+  }
+
+  // varSum is the actual variance we will retain.
+  if (newDimension < eigVal.n_elem)
     data.shed_rows(newDimension, data.n_rows - 1);
+
+  return varSum;
 }

Modified: mlpack/trunk/src/mlpack/methods/pca/pca.hpp
==============================================================================
--- mlpack/trunk/src/mlpack/methods/pca/pca.hpp	(original)
+++ mlpack/trunk/src/mlpack/methods/pca/pca.hpp	Wed Sep 11 17:22:01 2013
@@ -32,7 +32,8 @@
   PCA(const bool scaleData = false);
 
   /**
-   * Apply Principal Component Analysis to the provided data set.
+   * Apply Principal Component Analysis to the provided data set.  It is safe to
+   * pass the same matrix reference for both data and transformedData.
    *
    * @param data Data matrix.
    * @param transformedData Matrix to put results of PCA into.
@@ -45,7 +46,8 @@
              arma::mat& eigvec) const;
 
   /**
-   * Apply Principal Component Analysis to the provided data set.
+   * Apply Principal Component Analysis to the provided data set.  It is safe to
+   * pass the same matrix reference for both data and transformedData.
    *
    * @param data Data matrix.
    * @param transformedData Matrix to store results of PCA in.
@@ -58,12 +60,38 @@
   /**
    * Use PCA for dimensionality reduction on the given dataset.  This will save
    * the newDimension largest principal components of the data and remove the
-   * rest.
+   * rest.  The parameter returned is the amount of variance of the data that is
+   * retained; this is a value between 0 and 1.  For instance, a value of 0.9
+   * indicates that 90% of the variance present in the data was retained.
    *
    * @param data Data matrix.
    * @param newDimension New dimension of the data.
+   * @return Amount of the variance of the data retained (between 0 and 1).
    */
-  void Apply(arma::mat& data, const size_t newDimension) const;
+  double Apply(arma::mat& data, const size_t newDimension) const;
+
+  //! This overload is here to make sure int gets cast correctly to size_t.
+  inline double Apply(arma::mat& data, const int newDimension) const
+  {
+    return Apply(data, size_t(newDimension));
+  }
+
+  /**
+   * Use PCA for dimensionality reduction on the given dataset.  This will save
+   * as many dimensions as necessary to retain at least the given amount of
+   * variance (specified by parameter varRetained).  The amount should be
+   * between 0 and 1.  Note that, as implemented, an amount of 0 retains no
+   * dimensions.  If the amount is 1, then all dimensions will be retained.
+   *
+   * The method returns the actual amount of variance retained, which will
+   * always be greater than or equal to the varRetained parameter.
+   *
+   * @param data Data matrix.
+   * @param varRetained Lower bound on amount of variance to retain; should be
+   *     between 0 and 1.
+   * @return Actual amount of variance retained (between 0 and 1).
+   */
+  double Apply(arma::mat& data, const double varRetained) const;
 
   //! Get whether or not this PCA object will scale (by standard deviation) the
   //! data when PCA is performed.



More information about the mlpack-svn mailing list