[mlpack-svn] r13047 - mlpack/trunk/src/mlpack/methods/pca

Wed Jun 13 12:18:45 EDT 2012

Author: rmohan
Date: 2012-06-13 12:18:45 -0400 (Wed, 13 Jun 2012)
New Revision: 13047

Added:
   mlpack/trunk/src/mlpack/methods/pca/mdistupdate.hpp
   mlpack/trunk/src/mlpack/methods/pca/mdivupdate.hpp
   mlpack/trunk/src/mlpack/methods/pca/nmf.hpp
   mlpack/trunk/src/mlpack/methods/pca/nmf_impl.hpp
   mlpack/trunk/src/mlpack/methods/pca/nmf_main.cpp
Modified:
   mlpack/trunk/src/mlpack/methods/pca/CMakeLists.txt
Log:
Basic implementation of NMF


Modified: mlpack/trunk/src/mlpack/methods/pca/CMakeLists.txt
===================================================================

--- mlpack/trunk/src/mlpack/methods/pca/CMakeLists.txt	2012-06-13 15:48:27 UTC (rev 13046)
+++ mlpack/trunk/src/mlpack/methods/pca/CMakeLists.txt	2012-06-13 16:18:45 UTC (rev 13047)
@@ -3,8 +3,10 @@
 # Define the files we need to compile
 # Anything not in this list will not be compiled into MLPACK.
 set(SOURCES
-  pca.hpp
-  pca.cpp
+  mdistupdate.hpp
+  mdivupdate.hpp
+  nmf.hpp
+  nmf_impl.hpp
 )
 
 # Add directory name to sources.
@@ -16,10 +18,10 @@
 # the parent scope).
 set(MLPACK_SRCS ${MLPACK_SRCS} ${DIR_SRCS} PARENT_SCOPE)
 
-add_executable(pca
-  pca_main.cpp
+add_executable(nmf
+  nmf_main.cpp
 )
-target_link_libraries(pca
+target_link_libraries(nmf
   mlpack
 )
-install(TARGETS pca RUNTIME DESTINATION bin)
+install(TARGETS nmf RUNTIME DESTINATION bin)

Added: mlpack/trunk/src/mlpack/methods/pca/mdistupdate.hpp
===================================================================
--- mlpack/trunk/src/mlpack/methods/pca/mdistupdate.hpp	                        (rev 0)
+++ mlpack/trunk/src/mlpack/methods/pca/mdistupdate.hpp	2012-06-13 16:18:45 UTC (rev 13047)
@@ -0,0 +1,95 @@
+/**
+ * @file mdistupdate.hpp
+ * @author Mohan Rajendran
+ *
+ * Update rules for the Non-negative Matrix Factorization. This follows a method
+ * described in the paper 'Algorithms for Non-negative Matrix Factorization' 
+ * by D. D. Lee and H. S. Seung. This is a multiplicative rule that ensures
+ * that the Frobenius norm \f$ \sqrt{\sum_i \sum_j(V-WH)^2} \f$ is
+ * non-increasing between subsequent iterations. Both of the update rules
+ * for W and H are defined in this file.
+ *
+ */
+
+#ifndef __MLPACK_METHODS_NMF_MDISTUPDATE_HPP
+#define __MLPACK_METHODS_NMF_MDISTUPDATE_HPP
+
+#include <mlpack/core.hpp>
+
+namespace mlpack {
+namespace nmf {
+
+/**
+ * The update rule for the basis matrix W. The formula used is 
+ * \f[ 
+ * W_{ia} \leftarrow W_{ia} \frac{(VH^T)_{ia}}{(WHH^T)_{ia}}
+ * \f]
+ */
+class MultiplicativeDistanceW
+{
+ public:
+  // Empty constructor required for the WUpdateRule template
+  MultiplicativeDistanceW() { }
+
+  /**
+   * The update function that actually updates the W matrix. The function takes
+   * in all the salient matrices and only changes the value of the W matrix.
+   *
+   * @param V Input matrix to be factorized
+   * @param W Basis matrix to be output
+   * @param H Encoding matrix to output
+   */
+
+  inline static void Update(const arma::mat& V,
+                     arma::mat& W, 
+                     const arma::mat& H)
+  {
+    // Simple implementation. This can be left here.
+    arma::mat t1,t2;
+    
+    t1 = V*H.t();
+    t2 = W*H*H.t();
+    
+    W = (W%t1)/t2;
+  }
+}; // Class MultiplicativeDistanceW
+
+/**
+ * The update rule for the encoding matrix H. The formula used is
+ * \f[
+ * H_{a\mu} \leftarrow H_{a\mu} \frac{(W^T V)_{a\mu}}{(W^T WH)_{a\mu}}
+ * \f]
+ */
+class MultiplicativeDistanceH
+{
+ public:
+  // Empty constructor required for the HUpdateRule template
+  MultiplicativeDistanceH() { }
+
+  /**
+   * The update function that actually updates the H matrix. The function takes
+   * in all the salient matrices and only changes the value of the H matrix.
+   *
+   * @param V Input matrix to be factorized
+   * @param W Basis matrix to be output
+   * @param H Encoding matrix to output
+   */
+
+  inline static void Update(const arma::mat& V,
+                     const arma::mat& W, 
+                     arma::mat& H)
+  {
+    // Simple implementation. This can be left here.
+    arma::mat t1,t2;
+    
+    t1 = W.t()*V;
+    t2 = W.t()*W*H;
+
+    H = (H%t1)/t2;
+  }
+}; // Class MultiplicativeDistanceH
+
+}; // namespace nmf
+}; // namespace mlpack
+
+#endif

Added: mlpack/trunk/src/mlpack/methods/pca/mdivupdate.hpp
===================================================================
--- mlpack/trunk/src/mlpack/methods/pca/mdivupdate.hpp	                        (rev 0)
+++ mlpack/trunk/src/mlpack/methods/pca/mdivupdate.hpp	2012-06-13 16:18:45 UTC (rev 13047)
@@ -0,0 +1,112 @@
+/**
+ * @file mdivupdate.hpp
+ * @author Mohan Rajendran
+ *
+ * Update rules for the Non-negative Matrix Factorization. This follows a method
+ * described in the paper 'Algorithms for Non-negative Matrix Factorization' 
+ * by D. D. Lee and H. S. Seung. This is a multiplicative rule that ensures
+ * that the the 'divergence' 
+ * \f$ \sum_i \sum_j (V_{ij} log\frac{V_{ij}}{(WH)_{ij}}-V_{ij}+(WH)_{ij}) \f$is
+ * non-increasing between subsequent iterations. Both of the update rules
+ * for W and H are defined in this file.
+ *
+ */
+
+#ifndef __MLPACK_METHODS_NMF_MDIVUPDATE_HPP
+#define __MLPACK_METHODS_NMF_MDIVUPDATE_HPP
+
+#include <mlpack/core.hpp>
+
+namespace mlpack {
+namespace nmf {
+
+/**
+ * The update rule for the basis matrix W. The formula used is 
+ * \f[ 
+ * W_{ia} \leftarrow W_{ia} \frac{\sum_{\mu} H_{a\mu} V_{i\mu}/(WH)_{i\mu}}
+ * {\sum_{\nu} H_{a\nu}}
+ * \f]
+ */
+class MultiplicativeDistanceW
+{
+ public:
+  // Empty constructor required for the WUpdateRule template
+  MultiplicativeDivergenceW() { }
+
+  /**
+   * The update function that actually updates the W matrix. The function takes
+   * in all the salient matrices and only changes the value of the W matrix.
+   *
+   * @param V Input matrix to be factorized
+   * @param W Basis matrix to be output
+   * @param H Encoding matrix to output
+   */
+
+  inline static void Update(const arma::mat& V,
+                     arma::mat& W, 
+                     const arma::mat& H)
+  {
+    // Simple implementation. This can be left here.
+    arma::mat t1;
+    arma::rowvec t2;
+
+    t1 = W*H;
+    for(size_t i=0;i<W.n_rows;i++)
+    {
+      for(size_t j=0;j<W.n_cols;j++)
+      {
+        t2 = H.row(j)%V.row(i)/t1.row(i);
+        W(i,j) = W(i,j)*sum(t2)/sum(H.row(i));
+      }
+    }
+
+  }
+}; // Class MultiplicativeDivergenceW
+
+/**
+ * The update rule for the encoding matrix H. The formula used is
+ * \f[
+ * H_{a\mu} \leftarrow H_{a\mu} \frac{\sum_{i} W_{ia} V_{i\mu}/(WH)_{i\mu}}
+ * {\sum_{k} H_{ka}}
+ * \f]
+ */
+class MultiplicativeDivergenceH
+{
+ public:
+  // Empty constructor required for the HUpdateRule template
+  MultiplicativeDistanceH() { }
+
+  /**
+   * The update function that actually updates the H matrix. The function takes
+   * in all the salient matrices and only changes the value of the H matrix.
+   *
+   * @param V Input matrix to be factorized
+   * @param W Basis matrix to be output
+   * @param H Encoding matrix to output
+   */
+
+  inline static void Update(const arma::mat& V,
+                     const arma::mat& W, 
+                     arma::mat& H)
+  {
+    // Simple implementation. This can be left here.
+    arma::mat t1;
+    arma::colvec t2;
+    
+    t1 = W*H;
+    for(size_t i=0;i<H.n_rows;i++)
+    {
+      for(size_t j=0;j<H.n_cols;j++)
+      {
+        t2 = W.col(i)%V.col(j)/t1.col(j);
+        H(i,j) = H(i,j)*sum(t2)/sum(H.col(i));
+      }
+    }
+
+  }
+}; // Class MultiplicativeDivergenceH
+
+}; // namespace nmf
+}; // namespace mlpack
+
+#endif

Added: mlpack/trunk/src/mlpack/methods/pca/nmf.hpp
===================================================================
--- mlpack/trunk/src/mlpack/methods/pca/nmf.hpp	                        (rev 0)
+++ mlpack/trunk/src/mlpack/methods/pca/nmf.hpp	2012-06-13 16:18:45 UTC (rev 13047)
@@ -0,0 +1,101 @@
+/**
+ * @file nmf.hpp
+ * @author Mohan Rajendran
+ *
+ * Defines the NMF class to perform Non-negative Matrix Factorization 
+ * on the given matrix.
+ */
+#ifndef __MLPACK_METHODS_NMF_NMF_HPP
+#define __MLPACK_METHODS_NMF_NMF_HPP
+
+#include <mlpack/core.hpp>
+#include "mdistupdate.hpp"
+
+namespace mlpack {
+namespace nmf {
+
+/**
+ * This class implements the NMF on the given matrix V. Non-negative Matrix 
+ * Factorization decomposes V in the form \f$ V \approx WH \f$ where W is 
+ * called the basis matrix and H is called the encoding matrix. V is taken 
+ * to be of size n*m and the obtained W is n*r and H is r*m. The size r is 
+ * called the rank of the factorization.
+ * 
+ * The implementation requires the supply of two templates. One for the update
+ * rule for updating the W matrix during each iteration and another rule for
+ * updating the H matrix during each iteration. This allows the user to
+ * try out various update rules for performing the factorization.
+ *
+ * A simple example of how to run NMF is shown below.
+ *
+ * @code
+ * extern arma::mat V; // Matrix that we want to perform NMF on.
+ * size_t r = 10; // Rank of decomposition
+ * arma::mat W; // Basis matrix
+ * arma::mat H; // Encoding matrix
+ * 
+ * NMF<> nmf(); // Default options
+ * nmf.Apply(V,W,H,r);
+ * @endcode
+ *
+ * @tparam WUpdateRule The update rule for calculating W matrix at each 
+ * iteration; @see MultiplicativeDistanceW for an example.
+ * @tparam HUpdateRule The update rule for calculating H matrix at each
+ * iteration; @see MultiplicativeDistanceH for an example.
+ */
+template<typename WUpdateRule = MultiplicativeDistanceW,
+         typename HUpdateRule = MultiplicativeDistanceH>
+class NMF
+{
+ public:
+  /**
+   * Create the NMF object and (optionally) set the parameters which NMF will
+   * run with. This implementation allows us to use different update rules for
+   * the updation of the basis and encoding matrices over each iteration.
+   * 
+   * @param maxIterations Maximum number of iterations allowed before giving up
+   * @param maxResidue The maximum root mean square of the difference between 
+   *    two subsequent iteration of product WH at which to terminate iteration. 
+   *    A low residual value denotes that subsequent iterationas are not 
+   *    producing much different values of W and H. Once the difference goes 
+   *    below the supplied value, the iteration terminates.
+   * @param WUpdate Optional WUpdateRule object; for when the update rule for
+   *    the W vector has states that it needs to store.
+   * @param HUpdate Optional HUpdateRule object; for when the update rule for
+   *    the H vector has states that it needs to store.
+   */
+  NMF(const size_t maxIterations = 1000,
+      const double maxResidue = 1e-10,
+      const WUpdateRule WUpdate = WUpdateRule(),
+      const HUpdateRule HUpdate = HUpdateRule());
+
+  /**
+   * Apply the Non-Negative Matrix Factorization on the provided matrix.
+   *
+   * @param V Input matrix to be factorized
+   * @param W Basis matrix to be output
+   * @param H Encoding matrix to output
+   * @param r Rank r of the factorization
+   */
+  void Apply(const arma::mat& V, arma::mat& W, arma::mat& H,
+              size_t& r) const;
+
+  private:
+  //! The  maximum number of iterations allowed before giving up
+  size_t maxIterations;
+  //! The maximum residue below which iteration is considered converged
+  double maxResidue;
+  //! Instantiated W Update Rule
+  WUpdateRule WUpdate;
+  //! Instantiated H Update Rule
+  HUpdateRule HUpdate;
+
+}; // class NMF
+
+}; // namespace nmf
+}; // namespace mlpack
+
+// Include implementation.
+#include "nmf_impl.hpp"
+
+#endif

Added: mlpack/trunk/src/mlpack/methods/pca/nmf_impl.hpp
===================================================================
--- mlpack/trunk/src/mlpack/methods/pca/nmf_impl.hpp	                        (rev 0)
+++ mlpack/trunk/src/mlpack/methods/pca/nmf_impl.hpp	2012-06-13 16:18:45 UTC (rev 13047)
@@ -0,0 +1,89 @@
+/**
+ * @file nmf.cpp
+ * @author Mohan Rajendran
+ *
+ * Implementation of NMF class to perform Non-Negative Matrix Factorization
+ * on the given matrix.
+ */
+#include "nmf.hpp"
+
+namespace mlpack {
+namespace nmf {
+
+/**
+ * Construct the NMF object.
+ */
+template<typename WUpdateRule,
+         typename HUpdateRule>
+NMF<WUpdateRule,
+    HUpdateRule>::
+NMF(const size_t maxIterations,
+      const double maxResidue,
+      const WUpdateRule WUpdate,
+      const HUpdateRule HUpdate) :
+    maxIterations(maxIterations),
+    maxResidue(maxResidue),
+    WUpdate(WUpdate),
+    HUpdate(HUpdate)
+{
+  if (maxResidue < 0.0)
+  {
+    Log::Warn << "NMF::NMF(): maxResidue must be a positive value ("
+        << maxResidue << " given). Setting to the default value of "
+        << "1e-10.\n";
+    this->maxResidue = 1e-10;
+  } 
+}
+
+/**
+ * Apply the Non-Negative Matrix Factorization on the provided matrix.
+ *
+ * @param V Input matrix to be factorized
+ * @param W Basis matrix to be output
+ * @param H Encoding matrix to output
+ * @param r Rank r of the factorization
+ */
+template<typename WUpdateRule,
+         typename HUpdateRule>
+void NMF<WUpdateRule,
+    HUpdateRule>::
+Apply(const arma::mat& V, arma::mat& W, arma::mat& H, size_t& r) const
+{
+  size_t n = V.n_rows;
+  size_t m = V.n_cols;
+  // old and new product WH for residue checking
+  arma::mat WHold,WH,diff;
+  
+  // Allocate random values to the starting iteration
+  W.randu(n,r);
+  H.randu(r,m);
+  // Store the original calculated value for residue checking
+  WHold = W*H;
+  
+  size_t iteration = 0;
+  double residue;
+  double sqrRes = maxResidue*maxResidue;
+
+  do
+  {
+    // Update step.
+    // Update the value of W and H based on the Update Rules provided
+    WUpdate.Update(V,W,H);
+    HUpdate.Update(V,W,H);
+
+    // Calculate square of residue after iteration
+    WH = W*H;
+    diff = WHold-WH;
+    diff = diff%diff;
+    residue = accu(diff)/(double)(n*m);
+    WHold = WH;
+
+    iteration++;
+  
+  } while (residue >= sqrRes  && iteration != maxIterations);
+
+  Log::Debug << "Iterations: " << iteration << std::endl;
+}
+
+}; // namespace nmf
+}; // namespace mlpack

Added: mlpack/trunk/src/mlpack/methods/pca/nmf_main.cpp
===================================================================
--- mlpack/trunk/src/mlpack/methods/pca/nmf_main.cpp	                        (rev 0)
+++ mlpack/trunk/src/mlpack/methods/pca/nmf_main.cpp	2012-06-13 16:18:45 UTC (rev 13047)
@@ -0,0 +1,65 @@
+/**
+ * @file nmf_main.cpp
+ * @author Mohan Rajendran
+ *
+ * Main executable to run NMF.
+ */
+#include <mlpack/core.hpp>
+
+#include "nmf.hpp"
+
+using namespace mlpack;
+using namespace mlpack::nmf;
+using namespace std;
+
+// Document program.
+PROGRAM_INFO("Non-negative Matrix Factorization", "This program performs the "
+    "non-negative matrix factorization on the given vector. It will store the "
+    "calculated factors in the reference matrix arguments supplied.");
+
+// Parameters for program.
+PARAM_STRING_REQ("input_file", "Input matrix to perform NMF on.", "i");
+PARAM_STRING_REQ("W_output_file", "File to save the calculated W matrix to.",
+    "w");
+PARAM_STRING_REQ("H_output_file", "File to save the calculated H matrix to.",
+    "h");
+PARAM_INT_REQ("rank", "Rank of the factorization.", "r");
+PARAM_INT("max_iterations", "Number of iterations before NMF terminates", 
+    "m", 1000);
+PARAM_DOUBLE("max_residue", "The maximum root mean square allowed below which "
+    "the program termiates", "e", 1e-10);
+
+int main(int argc, char** argv)
+{
+  // Parse commandline.
+  CLI::ParseCommandLine(argc, argv);
+
+  // Load input dataset.
+  string inputFile = CLI::GetParam<string>("input_file");
+  arma::mat V;
+  data::Load(inputFile.c_str(), V);
+  arma::mat W;
+  arma::mat H;
+
+  // Find out the rank of the factorization.
+  size_t r = CLI::GetParam<int>("rank");
+  if (r<1)
+  {
+    Log::Fatal << "The rank of the factorization cannot be less than 1. "
+          << std::endl;
+  }
+  
+  size_t maxiterations = CLI::GetParam<int>("max_iterations");
+  double maxresidue = CLI::GetParam<double>("max_residue");
+
+  // Perform NMF. 
+  NMF<> nmf(maxiterations,maxresidue);
+  Log::Info << "Performing NMF on the given matrix..." << endl;
+  nmf.Apply(V,W,H,r);
+
+  // Save results
+  string outputFile = CLI::GetParam<string>("W_output_file");
+  data::Save(outputFile, W);
+  outputFile = CLI::GetParam<string>("H_output_file");
+  data::Save(outputFile, H);
+}