[mlpack-git] master: Refactor CF program to allow specifying the maximum number of iterations. (ddb8ae9)

gitdub at big.cc.gt.atl.ga.us gitdub at big.cc.gt.atl.ga.us
Wed Jul 8 08:51:49 EDT 2015


Repository : https://github.com/mlpack/mlpack

On branch  : master
Link       : https://github.com/mlpack/mlpack/compare/da092bd34072dae51aeca3df868430f34c4cccb5...ddb8ae9e0784f43b51f0868a095b2275aa3b5bda

>---------------------------------------------------------------

commit ddb8ae9e0784f43b51f0868a095b2275aa3b5bda
Author: Ryan Curtin <ryan at ratml.org>
Date:   Wed Jul 8 12:51:01 2015 +0000

    Refactor CF program to allow specifying the maximum number of iterations.


>---------------------------------------------------------------

ddb8ae9e0784f43b51f0868a095b2275aa3b5bda
 src/mlpack/methods/cf/cf_main.cpp | 214 ++++++++++++++++++++++++++------------
 1 file changed, 145 insertions(+), 69 deletions(-)

diff --git a/src/mlpack/methods/cf/cf_main.cpp b/src/mlpack/methods/cf/cf_main.cpp
index b460d8e..a870349 100644
--- a/src/mlpack/methods/cf/cf_main.cpp
+++ b/src/mlpack/methods/cf/cf_main.cpp
@@ -9,6 +9,7 @@
 
 #include <mlpack/methods/amf/amf.hpp>
 #include <mlpack/methods/regularized_svd/regularized_svd.hpp>
+#include <mlpack/methods/amf/termination_policies/max_iteration_termination.hpp>
 #include "cf.hpp"
 
 using namespace mlpack;
@@ -20,15 +21,9 @@ using namespace std;
 // Document program.
 PROGRAM_INFO("Collaborating Filtering", "This program performs collaborative "
     "filtering (CF) on the given dataset. Given a list of user, item and "
-    "preferences (--input_file) the program will output a set of "
-    "recommendations for each user."
-    "\n\n"
-    "Optionally, the set of query users can be specified with the --query_file "
-    "option.  In addition, the number of recommendations to generate can be "
-    "specified with the --recommendations (-r) parameter, and the number of "
-    "similar users (the size of the neighborhood) to be considered when "
-    "generating recommendations can be specified with the --neighborhood (-n) "
-    "option."
+    "preferences (--input_file) the program will perform a matrix decomposition"
+    " and then can perform a series of actions related to collaborative "
+    "filtering."
     "\n\n"
     "The input file should contain a 3-column matrix of ratings, where the "
     "first column is the user, the second column is the item, and the third "
@@ -36,8 +31,17 @@ PROGRAM_INFO("Collaborating Filtering", "This program performs collaborative "
     "should be numeric indices, not names. The indices are assumed to start "
     "from 0."
     "\n\n"
-    "The following optimization algorithms can be used with --algorithm (-a) "
-    "parameter: "
+    "A set of query users for which recommendations can be generated may be "
+    "specified with the --query_file (-q) option; alternately, recommendations "
+    "may be generated for every user in the dataset by specifying the "
+    "--all_user_recommendations (-A) option.  In addition, the number of "
+    "recommendations per user to generate can be specified with the "
+    "--recommendations (-r) parameter, and the number of similar users (the "
+    "size of the neighborhood) to be considered when generating recommendations"
+    " can be specified with the --neighborhood (-n) option."
+    "\n\n"
+    "For performing the matrix decomposition, the following optimization "
+    "algorithms can be specified via the --algorithm (-a) parameter: "
     "\n"
     "'RegSVD' -- Regularized SVD using a SGD optimizer\n"
     "'NMF' -- Non-negative matrix factorization with alternating least squares "
@@ -49,8 +53,9 @@ PROGRAM_INFO("Collaborating Filtering", "This program performs collaborative "
 // Parameters for program.
 PARAM_STRING_REQ("input_file", "Input dataset to perform CF on.", "i");
 PARAM_STRING("query_file", "List of users for which recommendations are to "
-    "be generated (if unspecified, then recommendations are generated for all "
-    "users).", "q", "");
+    "be generated.", "q", "");
+PARAM_FLAG("all_user_recommendations", "Generate recommendations for all "
+    "users.", "A");
 
 PARAM_STRING("output_file","File to save output recommendations to.", "o",
     "recommendations.csv");
@@ -63,20 +68,24 @@ PARAM_INT("recommendations", "Number of recommendations to generate for each "
 PARAM_INT("neighborhood", "Size of the neighborhood of similar users to "
     "consider for each query user.", "n", 5);
 
-PARAM_INT("rank", "Rank of decomposed matrices.", "R", 2);
+PARAM_INT("rank", "Rank of decomposed matrices (if 0, a heuristic is used to "
+    "estimate the rank).", "R", 0);
 
 PARAM_STRING("test_file", "Test set to calculate RMSE on.", "t", "");
 
+// Offer the user the option to set the maximum number of iterations, and
+// terminate only based on the number of iterations.
+PARAM_INT("max_iterations", "Maximum number of iterations.", "m", 1000);
+PARAM_FLAG("iteration_only_termination", "Terminate only when the maximum "
+    "number of iterations is reached.", "I");
+PARAM_DOUBLE("min_residue", "Residue required to terminate the factorization "
+    "(lower values generally mean better fits).", "r", 1e-5);
+
 template<typename Factorizer>
-void ComputeRecommendations(Factorizer factorizer,
-                            arma::mat& dataset,
+void ComputeRecommendations(CF<Factorizer>& cf,
                             const size_t numRecs,
-                            const size_t neighbourhood,
-                            const size_t rank,
                             arma::Mat<size_t>& recommendations)
 {
-  CF<Factorizer> c(dataset, factorizer, neighbourhood, rank);
-
   // Reading users.
   const string queryFile = CLI::GetParam<string>("query_file");
   if (queryFile != "")
@@ -89,23 +98,18 @@ void ComputeRecommendations(Factorizer factorizer,
 
     Log::Info << "Generating recommendations for " << users.n_elem << " users "
         << "in '" << queryFile << "'." << endl;
-    c.GetRecommendations(numRecs, recommendations, users);
+    cf.GetRecommendations(numRecs, recommendations, users);
   }
   else
   {
     Log::Info << "Generating recommendations for all users." << endl;
-    c.GetRecommendations(numRecs, recommendations);
+    cf.GetRecommendations(numRecs, recommendations);
   }
 }
 
 template<typename Factorizer>
-void ComputeRMSE(Factorizer&& factorizer,
-                 const arma::mat& dataset,
-                 const size_t neighborhood,
-                 const size_t rank)
+void ComputeRMSE(CF<Factorizer>& cf)
 {
-  CF<Factorizer> c(dataset, factorizer, neighborhood, rank);
-
   // Now, compute each test point.
   const string testFile = CLI::GetParam<string>("test_file");
   arma::mat testData;
@@ -121,7 +125,7 @@ void ComputeRMSE(Factorizer&& factorizer,
 
   // Now compute the RMSE.
   arma::vec predictions;
-  c.Predict(combinations, predictions);
+  cf.Predict(combinations, predictions);
 
   // Compute the root of the sum of the squared errors, divide by the number of
   // points to get the RMSE.  It turns out this is just the L2-norm divided by
@@ -133,8 +137,94 @@ void ComputeRMSE(Factorizer&& factorizer,
   Log::Info << "RMSE is " << rmse << "." << endl;
 }
 
-#define CR(x) ComputeRecommendations(x, dataset, numRecs, neighborhood, rank, recommendations)
-#define RMSE(x) ComputeRMSE(x, dataset, neighborhood, rank)
+template<typename Factorizer>
+void PerformAction(Factorizer&& factorizer,
+                   arma::mat& dataset,
+                   const size_t rank)
+{
+  // Parameters for generating the CF object.
+  const size_t neighborhood = (size_t) CLI::GetParam<int>("neighborhood");
+  CF<Factorizer> c(dataset, factorizer, neighborhood, rank);
+
+  if (CLI::HasParam("query_file") || CLI::HasParam("all_user_recommendations"))
+  {
+    // Get parameters for generating recommendations.
+    const size_t numRecs = (size_t) CLI::GetParam<int>("recommendations");
+
+    // Get the recommendations.
+    arma::Mat<size_t> recommendations;
+    ComputeRecommendations(c, numRecs, recommendations);
+
+    // Save the output.
+    const string outputFile = CLI::GetParam<string>("output_file");
+    data::Save(outputFile, recommendations);
+  }
+
+  if (CLI::HasParam("test_file"))
+  {
+    ComputeRMSE(c);
+  }
+}
+
+void AssembleFactorizerType(const std::string& algorithm,
+                            arma::mat& dataset,
+                            const bool maxIterationTermination,
+                            const size_t rank)
+{
+  const size_t maxIterations = (size_t) CLI::GetParam<int>("max_iterations");
+  if (maxIterationTermination)
+  {
+    // Force termination when maximum number of iterations reached.
+    MaxIterationTermination mit(maxIterations);
+    if (algorithm == "NMF")
+    {
+      typedef AMF<MaxIterationTermination, RandomInitialization, NMFALSUpdate>
+          FactorizerType;
+      PerformAction(FactorizerType(mit), dataset, rank);
+    }
+    else if (algorithm == "SVDBatch")
+    {
+      typedef AMF<MaxIterationTermination, RandomInitialization,
+          SVDBatchLearning> FactorizerType;
+      PerformAction(FactorizerType(mit), dataset, rank);
+    }
+    else if (algorithm == "SVDIncompleteIncremental")
+    {
+      typedef AMF<MaxIterationTermination, RandomInitialization,
+          SVDIncompleteIncrementalLearning> FactorizerType;
+      PerformAction(FactorizerType(mit), dataset, rank);
+    }
+    else if (algorithm == "SVDCompleteIncremental")
+    {
+      typedef AMF<MaxIterationTermination, RandomInitialization,
+          SVDCompleteIncrementalLearning<arma::sp_mat>> FactorizerType;
+      PerformAction(FactorizerType(mit), dataset, rank);
+    }
+    else if (algorithm == "RegSVD")
+    {
+      Log::Fatal << "--iteration_only_termination not supported with 'RegSVD' "
+          << "algorithm!" << endl;
+    }
+  }
+  else
+  {
+    // Use default termination (SimpleResidueTermination), but set the maximum
+    // number of iterations.
+    const double minResidue = CLI::GetParam<double>("min_residue");
+    SimpleResidueTermination srt(minResidue, maxIterations);
+    if (algorithm == "NMF")
+      PerformAction(NMFALSFactorizer(srt), dataset, rank);
+    else if (algorithm == "SVDBatch")
+      PerformAction(SVDBatchFactorizer(srt), dataset, rank);
+    else if (algorithm == "SVDIncompleteIncremental")
+      PerformAction(SparseSVDIncompleteIncrementalFactorizer(srt), dataset,
+          rank);
+    else if (algorithm == "SVDCompleteIncremental")
+      PerformAction(SparseSVDCompleteIncrementalFactorizer(srt), dataset, rank);
+    else if (algorithm == "RegSVD")
+      PerformAction(RegularizedSVD<>(maxIterations), dataset, rank);
+  }
+}
 
 int main(int argc, char** argv)
 {
@@ -150,50 +240,36 @@ int main(int argc, char** argv)
   arma::Mat<size_t> recommendations;
 
   // Get parameters.
-  const size_t numRecs = (size_t) CLI::GetParam<int>("recommendations");
-  const size_t neighborhood = (size_t) CLI::GetParam<int>("neighborhood");
   const size_t rank = (size_t) CLI::GetParam<int>("rank");
 
+  // Check that nothing stupid is happening.
+  if (CLI::HasParam("query_file") && CLI::HasParam("all_user_recommendations"))
+    Log::Fatal << "Both --query_file and --all_user_recommendations are given, "
+        << "but only one is allowed!" << endl;
+
   // Perform decomposition to prepare for recommendations.
   Log::Info << "Performing CF matrix decomposition on dataset..." << endl;
 
   const string algo = CLI::GetParam<string>("algorithm");
 
-  if (!CLI::HasParam("test_file"))
-  {
-    if (algo == "NMF")
-      CR(NMFALSFactorizer());
-    else if (algo == "SVDBatch")
-      CR(SparseSVDBatchFactorizer());
-    else if (algo == "SVDIncompleteIncremental")
-      CR(SparseSVDIncompleteIncrementalFactorizer());
-    else if (algo == "SVDCompleteIncremental")
-      CR(SparseSVDCompleteIncrementalFactorizer());
-    else if (algo == "RegSVD")
-      CR(RegularizedSVD<>());
-    else
-      Log::Fatal << "Invalid decomposition algorithm.  Choices are 'NMF', "
-          << "'SVDBatch', 'SVDIncompleteIncremental', 'SVDCompleteIncremental',"
-          << " and 'RegSVD'." << endl;
-  }
-  else
-  {
-    if (algo == "NMF")
-      RMSE(NMFALSFactorizer());
-    else if (algo == "SVDBatch")
-      RMSE(SparseSVDBatchFactorizer());
-    else if (algo == "SVDIncompleteIncremental")
-      RMSE(SparseSVDIncompleteIncrementalFactorizer());
-    else if (algo == "SVDCompleteIncremental")
-      RMSE(SparseSVDCompleteIncrementalFactorizer());
-    else if (algo == "RegSVD")
-      RMSE(RegularizedSVD<>());
-    else
-      Log::Fatal << "Invalid decomposition algorithm.  Choices are 'NMF', "
-          << "'SVDBatch', 'SVDIncompleteIncremental', 'SVDCompleteIncremental',"
-          << " and 'RegSVD'." << endl;
-  }
+  // Issue an error if an invalid factorizer is used.
+  if (algo != "NMF" &&
+      algo != "SVDBatch" &&
+      algo != "SVDIncompleteIncremental" &&
+      algo != "SVDCompleteIncremental" &&
+      algo != "RegSVD")
+    Log::Fatal << "Invalid decomposition algorithm.  Choices are 'NMF', "
+        << "'SVDBatch', 'SVDIncompleteIncremental', 'SVDCompleteIncremental',"
+        << " and 'RegSVD'." << endl;
+
+  // Issue a warning if the user provided a minimum residue but it will be
+  // ignored.
+  if (CLI::HasParam("min_residue") &&
+      CLI::HasParam("iteration_only_termination"))
+    Log::Warn << "--min_residue ignored, because --iteration_only_termination "
+        << "is specified." << endl;
 
-  const string outputFile = CLI::GetParam<string>("output_file");
-  data::Save(outputFile, recommendations);
+  // Perform the factorization and do whatever the user wanted.
+  AssembleFactorizerType(algo, dataset,
+      CLI::HasParam("iteration_only_termination"), rank);
 }



More information about the mlpack-git mailing list