[mlpack-git] master: Refactor CF program to allow specifying the maximum number of iterations. (ddb8ae9)
gitdub at big.cc.gt.atl.ga.us
gitdub at big.cc.gt.atl.ga.us
Wed Jul 8 08:51:49 EDT 2015
Repository : https://github.com/mlpack/mlpack
On branch : master
Link : https://github.com/mlpack/mlpack/compare/da092bd34072dae51aeca3df868430f34c4cccb5...ddb8ae9e0784f43b51f0868a095b2275aa3b5bda
>---------------------------------------------------------------
commit ddb8ae9e0784f43b51f0868a095b2275aa3b5bda
Author: Ryan Curtin <ryan at ratml.org>
Date: Wed Jul 8 12:51:01 2015 +0000
Refactor CF program to allow specifying the maximum number of iterations.
>---------------------------------------------------------------
ddb8ae9e0784f43b51f0868a095b2275aa3b5bda
src/mlpack/methods/cf/cf_main.cpp | 214 ++++++++++++++++++++++++++------------
1 file changed, 145 insertions(+), 69 deletions(-)
diff --git a/src/mlpack/methods/cf/cf_main.cpp b/src/mlpack/methods/cf/cf_main.cpp
index b460d8e..a870349 100644
--- a/src/mlpack/methods/cf/cf_main.cpp
+++ b/src/mlpack/methods/cf/cf_main.cpp
@@ -9,6 +9,7 @@
#include <mlpack/methods/amf/amf.hpp>
#include <mlpack/methods/regularized_svd/regularized_svd.hpp>
+#include <mlpack/methods/amf/termination_policies/max_iteration_termination.hpp>
#include "cf.hpp"
using namespace mlpack;
@@ -20,15 +21,9 @@ using namespace std;
// Document program.
PROGRAM_INFO("Collaborating Filtering", "This program performs collaborative "
"filtering (CF) on the given dataset. Given a list of user, item and "
- "preferences (--input_file) the program will output a set of "
- "recommendations for each user."
- "\n\n"
- "Optionally, the set of query users can be specified with the --query_file "
- "option. In addition, the number of recommendations to generate can be "
- "specified with the --recommendations (-r) parameter, and the number of "
- "similar users (the size of the neighborhood) to be considered when "
- "generating recommendations can be specified with the --neighborhood (-n) "
- "option."
+ "preferences (--input_file) the program will perform a matrix decomposition"
+ " and then can perform a series of actions related to collaborative "
+ "filtering."
"\n\n"
"The input file should contain a 3-column matrix of ratings, where the "
"first column is the user, the second column is the item, and the third "
@@ -36,8 +31,17 @@ PROGRAM_INFO("Collaborating Filtering", "This program performs collaborative "
"should be numeric indices, not names. The indices are assumed to start "
"from 0."
"\n\n"
- "The following optimization algorithms can be used with --algorithm (-a) "
- "parameter: "
+ "A set of query users for which recommendations can be generated may be "
+ "specified with the --query_file (-q) option; alternately, recommendations "
+ "may be generated for every user in the dataset by specifying the "
+ "--all_user_recommendations (-A) option. In addition, the number of "
+ "recommendations per user to generate can be specified with the "
+ "--recommendations (-r) parameter, and the number of similar users (the "
+ "size of the neighborhood) to be considered when generating recommendations"
+ " can be specified with the --neighborhood (-n) option."
+ "\n\n"
+ "For performing the matrix decomposition, the following optimization "
+ "algorithms can be specified via the --algorithm (-a) parameter: "
"\n"
"'RegSVD' -- Regularized SVD using a SGD optimizer\n"
"'NMF' -- Non-negative matrix factorization with alternating least squares "
@@ -49,8 +53,9 @@ PROGRAM_INFO("Collaborating Filtering", "This program performs collaborative "
// Parameters for program.
PARAM_STRING_REQ("input_file", "Input dataset to perform CF on.", "i");
PARAM_STRING("query_file", "List of users for which recommendations are to "
- "be generated (if unspecified, then recommendations are generated for all "
- "users).", "q", "");
+ "be generated.", "q", "");
+PARAM_FLAG("all_user_recommendations", "Generate recommendations for all "
+ "users.", "A");
PARAM_STRING("output_file","File to save output recommendations to.", "o",
"recommendations.csv");
@@ -63,20 +68,24 @@ PARAM_INT("recommendations", "Number of recommendations to generate for each "
PARAM_INT("neighborhood", "Size of the neighborhood of similar users to "
"consider for each query user.", "n", 5);
-PARAM_INT("rank", "Rank of decomposed matrices.", "R", 2);
+PARAM_INT("rank", "Rank of decomposed matrices (if 0, a heuristic is used to "
+ "estimate the rank).", "R", 0);
PARAM_STRING("test_file", "Test set to calculate RMSE on.", "t", "");
+// Offer the user the option to set the maximum number of iterations, and
+// terminate only based on the number of iterations.
+PARAM_INT("max_iterations", "Maximum number of iterations.", "m", 1000);
+PARAM_FLAG("iteration_only_termination", "Terminate only when the maximum "
+ "number of iterations is reached.", "I");
+PARAM_DOUBLE("min_residue", "Residue required to terminate the factorization "
+ "(lower values generally mean better fits).", "r", 1e-5);
+
template<typename Factorizer>
-void ComputeRecommendations(Factorizer factorizer,
- arma::mat& dataset,
+void ComputeRecommendations(CF<Factorizer>& cf,
const size_t numRecs,
- const size_t neighbourhood,
- const size_t rank,
arma::Mat<size_t>& recommendations)
{
- CF<Factorizer> c(dataset, factorizer, neighbourhood, rank);
-
// Reading users.
const string queryFile = CLI::GetParam<string>("query_file");
if (queryFile != "")
@@ -89,23 +98,18 @@ void ComputeRecommendations(Factorizer factorizer,
Log::Info << "Generating recommendations for " << users.n_elem << " users "
<< "in '" << queryFile << "'." << endl;
- c.GetRecommendations(numRecs, recommendations, users);
+ cf.GetRecommendations(numRecs, recommendations, users);
}
else
{
Log::Info << "Generating recommendations for all users." << endl;
- c.GetRecommendations(numRecs, recommendations);
+ cf.GetRecommendations(numRecs, recommendations);
}
}
template<typename Factorizer>
-void ComputeRMSE(Factorizer&& factorizer,
- const arma::mat& dataset,
- const size_t neighborhood,
- const size_t rank)
+void ComputeRMSE(CF<Factorizer>& cf)
{
- CF<Factorizer> c(dataset, factorizer, neighborhood, rank);
-
// Now, compute each test point.
const string testFile = CLI::GetParam<string>("test_file");
arma::mat testData;
@@ -121,7 +125,7 @@ void ComputeRMSE(Factorizer&& factorizer,
// Now compute the RMSE.
arma::vec predictions;
- c.Predict(combinations, predictions);
+ cf.Predict(combinations, predictions);
// Compute the root of the sum of the squared errors, divide by the number of
// points to get the RMSE. It turns out this is just the L2-norm divided by
@@ -133,8 +137,94 @@ void ComputeRMSE(Factorizer&& factorizer,
Log::Info << "RMSE is " << rmse << "." << endl;
}
-#define CR(x) ComputeRecommendations(x, dataset, numRecs, neighborhood, rank, recommendations)
-#define RMSE(x) ComputeRMSE(x, dataset, neighborhood, rank)
+template<typename Factorizer>
+void PerformAction(Factorizer&& factorizer,
+ arma::mat& dataset,
+ const size_t rank)
+{
+ // Parameters for generating the CF object.
+ const size_t neighborhood = (size_t) CLI::GetParam<int>("neighborhood");
+ CF<Factorizer> c(dataset, factorizer, neighborhood, rank);
+
+ if (CLI::HasParam("query_file") || CLI::HasParam("all_user_recommendations"))
+ {
+ // Get parameters for generating recommendations.
+ const size_t numRecs = (size_t) CLI::GetParam<int>("recommendations");
+
+ // Get the recommendations.
+ arma::Mat<size_t> recommendations;
+ ComputeRecommendations(c, numRecs, recommendations);
+
+ // Save the output.
+ const string outputFile = CLI::GetParam<string>("output_file");
+ data::Save(outputFile, recommendations);
+ }
+
+ if (CLI::HasParam("test_file"))
+ {
+ ComputeRMSE(c);
+ }
+}
+
+void AssembleFactorizerType(const std::string& algorithm,
+ arma::mat& dataset,
+ const bool maxIterationTermination,
+ const size_t rank)
+{
+ const size_t maxIterations = (size_t) CLI::GetParam<int>("max_iterations");
+ if (maxIterationTermination)
+ {
+ // Force termination when maximum number of iterations reached.
+ MaxIterationTermination mit(maxIterations);
+ if (algorithm == "NMF")
+ {
+ typedef AMF<MaxIterationTermination, RandomInitialization, NMFALSUpdate>
+ FactorizerType;
+ PerformAction(FactorizerType(mit), dataset, rank);
+ }
+ else if (algorithm == "SVDBatch")
+ {
+ typedef AMF<MaxIterationTermination, RandomInitialization,
+ SVDBatchLearning> FactorizerType;
+ PerformAction(FactorizerType(mit), dataset, rank);
+ }
+ else if (algorithm == "SVDIncompleteIncremental")
+ {
+ typedef AMF<MaxIterationTermination, RandomInitialization,
+ SVDIncompleteIncrementalLearning> FactorizerType;
+ PerformAction(FactorizerType(mit), dataset, rank);
+ }
+ else if (algorithm == "SVDCompleteIncremental")
+ {
+ typedef AMF<MaxIterationTermination, RandomInitialization,
+ SVDCompleteIncrementalLearning<arma::sp_mat>> FactorizerType;
+ PerformAction(FactorizerType(mit), dataset, rank);
+ }
+ else if (algorithm == "RegSVD")
+ {
+ Log::Fatal << "--iteration_only_termination not supported with 'RegSVD' "
+ << "algorithm!" << endl;
+ }
+ }
+ else
+ {
+ // Use default termination (SimpleResidueTermination), but set the maximum
+ // number of iterations.
+ const double minResidue = CLI::GetParam<double>("min_residue");
+ SimpleResidueTermination srt(minResidue, maxIterations);
+ if (algorithm == "NMF")
+ PerformAction(NMFALSFactorizer(srt), dataset, rank);
+ else if (algorithm == "SVDBatch")
+ PerformAction(SVDBatchFactorizer(srt), dataset, rank);
+ else if (algorithm == "SVDIncompleteIncremental")
+ PerformAction(SparseSVDIncompleteIncrementalFactorizer(srt), dataset,
+ rank);
+ else if (algorithm == "SVDCompleteIncremental")
+ PerformAction(SparseSVDCompleteIncrementalFactorizer(srt), dataset, rank);
+ else if (algorithm == "RegSVD")
+ PerformAction(RegularizedSVD<>(maxIterations), dataset, rank);
+ }
+}
int main(int argc, char** argv)
{
@@ -150,50 +240,36 @@ int main(int argc, char** argv)
arma::Mat<size_t> recommendations;
// Get parameters.
- const size_t numRecs = (size_t) CLI::GetParam<int>("recommendations");
- const size_t neighborhood = (size_t) CLI::GetParam<int>("neighborhood");
const size_t rank = (size_t) CLI::GetParam<int>("rank");
+ // Check that nothing stupid is happening.
+ if (CLI::HasParam("query_file") && CLI::HasParam("all_user_recommendations"))
+ Log::Fatal << "Both --query_file and --all_user_recommendations are given, "
+ << "but only one is allowed!" << endl;
+
// Perform decomposition to prepare for recommendations.
Log::Info << "Performing CF matrix decomposition on dataset..." << endl;
const string algo = CLI::GetParam<string>("algorithm");
- if (!CLI::HasParam("test_file"))
- {
- if (algo == "NMF")
- CR(NMFALSFactorizer());
- else if (algo == "SVDBatch")
- CR(SparseSVDBatchFactorizer());
- else if (algo == "SVDIncompleteIncremental")
- CR(SparseSVDIncompleteIncrementalFactorizer());
- else if (algo == "SVDCompleteIncremental")
- CR(SparseSVDCompleteIncrementalFactorizer());
- else if (algo == "RegSVD")
- CR(RegularizedSVD<>());
- else
- Log::Fatal << "Invalid decomposition algorithm. Choices are 'NMF', "
- << "'SVDBatch', 'SVDIncompleteIncremental', 'SVDCompleteIncremental',"
- << " and 'RegSVD'." << endl;
- }
- else
- {
- if (algo == "NMF")
- RMSE(NMFALSFactorizer());
- else if (algo == "SVDBatch")
- RMSE(SparseSVDBatchFactorizer());
- else if (algo == "SVDIncompleteIncremental")
- RMSE(SparseSVDIncompleteIncrementalFactorizer());
- else if (algo == "SVDCompleteIncremental")
- RMSE(SparseSVDCompleteIncrementalFactorizer());
- else if (algo == "RegSVD")
- RMSE(RegularizedSVD<>());
- else
- Log::Fatal << "Invalid decomposition algorithm. Choices are 'NMF', "
- << "'SVDBatch', 'SVDIncompleteIncremental', 'SVDCompleteIncremental',"
- << " and 'RegSVD'." << endl;
- }
+ // Issue an error if an invalid factorizer is used.
+ if (algo != "NMF" &&
+ algo != "SVDBatch" &&
+ algo != "SVDIncompleteIncremental" &&
+ algo != "SVDCompleteIncremental" &&
+ algo != "RegSVD")
+ Log::Fatal << "Invalid decomposition algorithm. Choices are 'NMF', "
+ << "'SVDBatch', 'SVDIncompleteIncremental', 'SVDCompleteIncremental',"
+ << " and 'RegSVD'." << endl;
+
+ // Issue a warning if the user provided a minimum residue but it will be
+ // ignored.
+ if (CLI::HasParam("min_residue") &&
+ CLI::HasParam("iteration_only_termination"))
+ Log::Warn << "--min_residue ignored, because --iteration_only_termination "
+ << "is specified." << endl;
- const string outputFile = CLI::GetParam<string>("output_file");
- data::Save(outputFile, recommendations);
+ // Perform the factorization and do whatever the user wanted.
+ AssembleFactorizerType(algo, dataset,
+ CLI::HasParam("iteration_only_termination"), rank);
}
More information about the mlpack-git mailing list