[mlpack-svn] r16345 - in mlpack/trunk/src/mlpack: methods/cf tests

fastlab-svn at coffeetalk-1.cc.gatech.edu fastlab-svn at coffeetalk-1.cc.gatech.edu
Sat Mar 1 23:39:49 EST 2014


Author: rcurtin
Date: Sat Mar  1 23:39:49 2014
New Revision: 16345

Log:
Patch from Siddharth: templatize CF to accept arbitrary types of factorizers.


Added:
   mlpack/trunk/src/mlpack/methods/cf/cf_impl.hpp
Removed:
   mlpack/trunk/src/mlpack/methods/cf/cf.cpp
Modified:
   mlpack/trunk/src/mlpack/methods/cf/CMakeLists.txt
   mlpack/trunk/src/mlpack/methods/cf/cf.hpp
   mlpack/trunk/src/mlpack/methods/cf/cf_main.cpp
   mlpack/trunk/src/mlpack/tests/cf_test.cpp
   mlpack/trunk/src/mlpack/tests/to_string_test.cpp

Modified: mlpack/trunk/src/mlpack/methods/cf/CMakeLists.txt
==============================================================================
--- mlpack/trunk/src/mlpack/methods/cf/CMakeLists.txt	(original)
+++ mlpack/trunk/src/mlpack/methods/cf/CMakeLists.txt	Sat Mar  1 23:39:49 2014
@@ -2,7 +2,7 @@
 # Anything not in this list will not be compiled into MLPACK.
 set(SOURCES
   cf.hpp
-  cf.cpp
+  cf_impl.hpp
 )
 
 # Add directory name to sources.

Modified: mlpack/trunk/src/mlpack/methods/cf/cf.hpp
==============================================================================
--- mlpack/trunk/src/mlpack/methods/cf/cf.hpp	(original)
+++ mlpack/trunk/src/mlpack/methods/cf/cf.hpp	Sat Mar  1 23:39:49 2014
@@ -12,10 +12,14 @@
 
 #include <mlpack/core.hpp>
 #include <mlpack/methods/neighbor_search/neighbor_search.hpp>
+#include <mlpack/methods/nmf/nmf.hpp>
+#include <mlpack/methods/nmf/als_update_rules.hpp>
 #include <set>
 #include <map>
 #include <iostream>
 
+using namespace mlpack::nmf;
+
 namespace mlpack {
 namespace cf /** Collaborative filtering. */{
 
@@ -32,7 +36,7 @@
  * arma::Mat<size_t> recommendations; // Recommendations
  * size_t numRecommendations = 10;
  *
- * CF cf(data); // Default options.
+ * CF<> cf(data); // Default options.
  *
  * // Generate the default number of recommendations for all users.
  * cf.GetRecommendations(recommendations);
@@ -49,8 +53,17 @@
  * should have three rows.  The first represents the user; the second represents
  * the item; and the third represents the rating.  The user and item, while they
  * are in a matrix that holds doubles, should hold integer (or size_t) values.
- * The user and item indices are assumed to be starting from 0.
+ * The user and item indices are assumed to start at 0.
+ *
+ * @tparam FactorizerType The type of matrix factorization to use to decompose
+ *     the rating matrix (a W and H matrix).  This must implement the method
+ *     Apply(arma::sp_mat& data, size_t rank, arma::mat& W, arma::mat& H).
  */
+template<
+    typename FactorizerType = NMF<RandomInitialization,
+                                  WAlternatingLeastSquaresRule,
+                                  HAlternatingLeastSquaresRule>
+>
 class CF
 {
  public:
@@ -82,13 +95,13 @@
     this->numRecs = recs;
   }
 
-  //! Gets numRecs
-  size_t NumRecs()
+  //! Gets the number of recommendations.
+  size_t NumRecs() const
   {
     return numRecs;
   }
 
-  //! Sets number of user for calculating similarity.
+  //! Sets number of users for calculating similarity.
   void NumUsersForSimilarity(const size_t num)
   {
     if (num < 1)
@@ -101,7 +114,7 @@
   }
 
   //! Gets number of users for calculating similarity.
-  size_t NumUsersForSimilarity()
+  size_t NumUsersForSimilarity() const
   {
     return numUsersForSimilarity;
   }
@@ -113,11 +126,17 @@
   }
 
   //! Gets rank parameter for matrix factorization.
-  size_t Rank()
+  size_t Rank() const
   {
     return rank;
   }
 
+  //! Sets factorizer for NMF
+  void Factorizer(const FactorizerType& f)
+  {
+    this->factorizer = f;
+  }
+
   //! Get the User Matrix.
   const arma::mat& W() const { return w; }
   //! Get the Item Matrix.
@@ -182,6 +201,8 @@
   size_t numUsersForSimilarity;
   //! Rank used for matrix factorization.
   size_t rank;
+  //! Instantiated factorizer object.
+  FactorizerType factorizer;
   //! User matrix.
   arma::mat w;
   //! Item matrix.
@@ -214,4 +235,7 @@
 }; // namespace cf
 }; // namespace mlpack
 
+//Include implementation
+#include "cf_impl.hpp"
+
 #endif

Added: mlpack/trunk/src/mlpack/methods/cf/cf_impl.hpp
==============================================================================
--- (empty file)
+++ mlpack/trunk/src/mlpack/methods/cf/cf_impl.hpp	Sat Mar  1 23:39:49 2014
@@ -0,0 +1,272 @@
+/**
+ * @file cf.cpp
+ * @author Mudit Raj Gupta
+ *
+ * Collaborative Filtering.
+ *
+ * Implementation of CF class to perform Collaborative Filtering on the
+ * specified data set.
+ *
+ */
+#include "cf.hpp"
+#include <mlpack/methods/nmf/nmf.hpp>
+#include <mlpack/methods/nmf/als_update_rules.hpp>
+
+using namespace mlpack::nmf;
+using namespace mlpack::neighbor;
+using namespace std;
+
+namespace mlpack {
+namespace cf {
+
+/**
+ * Construct the CF object.
+ */
+template<typename FactorizerType>
+CF<FactorizerType>::CF(arma::mat& data,
+                      const size_t numRecs,
+                      const size_t numUsersForSimilarity,
+                      const size_t rank) :
+    data(data),
+    numRecs(numRecs),
+    numUsersForSimilarity(numUsersForSimilarity),
+    rank(rank)
+{
+  // Validate number of recommendation factor.
+  if (numRecs < 1)
+  {
+    Log::Warn << "CF::CF(): number of recommendations should be > 0("
+        << numRecs << " given). Setting value to 5.\n";
+    //Setting Default Value of 5
+    this->numRecs = 5;
+  }
+
+  // Validate neighbourhood size.
+  if (numUsersForSimilarity < 1)
+  {
+    Log::Warn << "CF::CF(): neighbourhood size should be > 0("
+        << numUsersForSimilarity << " given). Setting value to 5.\n";
+    //Setting Default Value of 5
+    this->numUsersForSimilarity = 5;
+  }
+
+  //Set default factorizer
+  FactorizerType f(10000, 1e-5);
+  Factorizer(f);
+
+  CleanData();
+}
+
+template<typename FactorizerType>
+void CF<FactorizerType>::GetRecommendations(arma::Mat<size_t>& recommendations)
+{
+  // Used to save user IDs.
+  arma::Col<size_t> users =
+    arma::zeros<arma::Col<size_t> >(cleanedData.n_cols, 1);
+  // Getting all user IDs.
+  for (size_t i = 0; i < cleanedData.n_cols; i++)
+    users(i) = i;
+
+  // Calling base function for recommendations.
+  GetRecommendations(recommendations, users);
+}
+
+template<typename FactorizerType>
+void CF<FactorizerType>::GetRecommendations(arma::Mat<size_t>& recommendations,
+                                            arma::Col<size_t>& users)
+{
+  // Base function for calculating recommendations.
+
+  // Check if the user wanted us to choose a rank for them.
+  if (rank == 0)
+  {
+    // This is a simple heuristic that picks a rank based on the density of the
+    // dataset between 5 and 105.
+    const double density = (cleanedData.n_nonzero * 100.0) / cleanedData.n_elem;
+    const size_t rankEstimate = size_t(density) + 5;
+
+    // Set to heuristic value.
+    Log::Info << "No rank given for decomposition; using rank of "
+        << rankEstimate << " calculated by density-based heuristic."
+        << std::endl;
+    rank = rankEstimate;
+  }
+
+  // Operations independent of the query:
+  // Decompose the sparse data matrix to user and data matrices.
+  // Presently only ALS (via NMF) is supported as an optimizer.
+  factorizer.Apply(cleanedData, rank, w, h);
+
+  // Generate new table by multiplying approximate values.
+  rating = w * h;
+
+  // Now, we will use the decomposed w and h matrices to estimate what the user
+  // would have rated items as, and then pick the best items.
+
+  // Temporarily store feature vector of queried users.
+  arma::mat query(rating.n_rows, users.n_elem);
+
+  // Select feature vectors of queried users.
+  for (size_t i = 0; i < users.n_elem; i++)
+    query.col(i) = rating.col(users(i));
+
+  // Temporary storage for neighborhood of the queried users.
+  arma::Mat<size_t> neighborhood;
+
+  // Calculate the neighborhood of the queried users.
+  // This should be a templatized option.
+  AllkNN a(rating, query);
+  arma::mat resultingDistances; // Temporary storage.
+  a.Search(numUsersForSimilarity, neighborhood, resultingDistances);
+
+  // Temporary storage for storing the average rating for each user in their
+  // neighborhood.
+  arma::mat averages = arma::zeros<arma::mat>(rating.n_rows, query.n_cols);
+
+  // Iterate over each query user.
+  for (size_t i = 0; i < neighborhood.n_cols; ++i)
+  {
+    // Iterate over each neighbor of the query user.
+    for (size_t j = 0; j < neighborhood.n_rows; ++j)
+      averages.col(i) += rating.col(neighborhood(j, i));
+    // Normalize average.
+    averages.col(i) /= neighborhood.n_rows;
+  }
+
+  // Generate recommendations for each query user by finding the maximum numRecs
+  // elements in the averages matrix.
+  recommendations.set_size(numRecs, users.n_elem);
+  recommendations.fill(cleanedData.n_rows); // Invalid item number.
+  arma::mat values(numRecs, users.n_elem);
+  values.fill(-DBL_MAX); // The smallest possible value.
+  for (size_t i = 0; i < users.n_elem; i++)
+  {
+    // Look through the averages column corresponding to the current user.
+    for (size_t j = 0; j < averages.n_rows; ++j)
+    {
+      // Ensure that the user hasn't already rated the item.
+      if (cleanedData(j, users(i)) != 0.0)
+        continue; // The user already rated the item.
+
+      // Is the estimated value better than the worst candidate?
+      const double value = averages(j, i);
+      if (value > values(values.n_rows - 1, i))
+      {
+        // It should be inserted.  Which position?
+        size_t insertPosition = values.n_rows - 1;
+        while (insertPosition > 0)
+        {
+          if (value <= values(insertPosition - 1, i))
+            break; // The current value is the right one.
+          insertPosition--;
+        }
+
+        // Now insert it into the list.
+        InsertNeighbor(i, insertPosition, j, value, recommendations,
+            values);
+      }
+    }
+
+    // If we were not able to come up with enough recommendations, issue a
+    // warning.
+    if (recommendations(values.n_rows - 1, i) == cleanedData.n_rows + 1)
+      Log::Warn << "Could not provide " << values.n_rows << " recommendations "
+          << "for user " << users(i) << " (not enough un-rated items)!" << endl;
+  }
+}
+
+template<typename FactorizerType>
+void CF<FactorizerType>::GetRecommendations(arma::Mat<size_t>& recommendations,
+                                            arma::Col<size_t>& users,size_t num)
+{
+  //Setting Number of Recommendations
+  NumRecs(num);
+  //Calling Base Function for Recommendations
+  GetRecommendations(recommendations,users);
+}
+
+template<typename FactorizerType>
+void CF<FactorizerType>::GetRecommendations(arma::Mat<size_t>& recommendations,
+                                            arma::Col<size_t>& users,size_t num,
+                                            size_t s)
+{
+  //Setting number of users that should be used for calculating
+  //neighbours
+  NumUsersForSimilarity(s);
+  //Setting Number of Recommendations
+  NumRecs(num);
+  //Calling Base Function for Recommendations
+  GetRecommendations(recommendations,users,num);
+}
+
+template<typename FactorizerType>
+void CF<FactorizerType>::CleanData()
+{
+  // Generate list of locations for batch insert constructor for sparse
+  // matrices.
+  arma::umat locations(2, data.n_cols);
+  arma::vec values(data.n_cols);
+  for (size_t i = 0; i < data.n_cols; ++i)
+  {
+    // We have to transpose it because items are rows, and users are columns.
+    locations(1, i) = ((arma::uword) data(0, i));
+    locations(0, i) = ((arma::uword) data(1, i));
+    values(i) = data(2, i);
+  }
+
+  // Find maximum user and item IDs.
+  const size_t maxItemID = (size_t) max(locations.row(0)) + 1;
+  const size_t maxUserID = (size_t) max(locations.row(1)) + 1;
+
+  // Fill sparse matrix.
+  cleanedData = arma::sp_mat(locations, values, maxItemID, maxUserID);
+}
+
+/**
+ * Helper function to insert a point into the recommendation matrices.
+ *
+ * @param queryIndex Index of point whose recommendations we are inserting into.
+ * @param pos Position in list to insert into.
+ * @param neighbor Index of item being inserted as a recommendation.
+ * @param value Value of recommendation.
+ */
+template<typename FactorizerType>
+void CF<FactorizerType>::InsertNeighbor(const size_t queryIndex,
+                                        const size_t pos,
+                                        const size_t neighbor,
+                                        const double value,
+                                        arma::Mat<size_t>& recommendations,
+                                        arma::mat& values) const
+{
+  // We only memmove() if there is actually a need to shift something.
+  if (pos < (recommendations.n_rows - 1))
+  {
+    const int len = (values.n_rows - 1) - pos;
+    memmove(values.colptr(queryIndex) + (pos + 1),
+        values.colptr(queryIndex) + pos,
+        sizeof(double) * len);
+    memmove(recommendations.colptr(queryIndex) + (pos + 1),
+        recommendations.colptr(queryIndex) + pos,
+        sizeof(size_t) * len);
+  }
+
+  // Now put the new information in the right index.
+  values(pos, queryIndex) = value;
+  recommendations(pos, queryIndex) = neighbor;
+}
+
+// Return string of object.
+template<typename FactorizerType>
+std::string CF<FactorizerType>::ToString() const
+{
+  std::ostringstream convert;
+  convert << "Collaborative Filtering [" << this << "]" << std::endl;
+  //convert << "  Number of Recommendations: " << numRecs << std::endl;
+  //convert << "  Number of Users for Similarity: " << numUsersForSimilarity;
+  //convert << std::endl;
+  //convert << "  Data: " << data.n_rows << "x" << data.n_cols << std::endl;
+  return convert.str();
+}
+
+}; // namespace mlpack
+}; // namespace cf

Modified: mlpack/trunk/src/mlpack/methods/cf/cf_main.cpp
==============================================================================
--- mlpack/trunk/src/mlpack/methods/cf/cf_main.cpp	(original)
+++ mlpack/trunk/src/mlpack/methods/cf/cf_main.cpp	Sat Mar  1 23:39:49 2014
@@ -70,7 +70,7 @@
 
   // Perform decomposition to prepare for recommendations.
   Log::Info << "Performing CF matrix decomposition on dataset..." << endl;
-  CF c(dataset);
+  CF<> c(dataset);
   c.NumRecs(numRecs);
   c.NumUsersForSimilarity(neighborhood);
 

Modified: mlpack/trunk/src/mlpack/tests/cf_test.cpp
==============================================================================
--- mlpack/trunk/src/mlpack/tests/cf_test.cpp	(original)
+++ mlpack/trunk/src/mlpack/tests/cf_test.cpp	Sat Mar  1 23:39:49 2014
@@ -33,7 +33,7 @@
   // Number of users for similarity (not the default).
   const size_t numUsersForSimilarity = 8;
 
-  CF c(dataset, numRecs, numUsersForSimilarity);
+  CF<> c(dataset, numRecs, numUsersForSimilarity);
 
   // Check parameters.
   BOOST_REQUIRE_EQUAL(c.NumRecs(), numRecs);
@@ -68,7 +68,7 @@
   data::Load("GroupLens100k.csv", dataset);
 
   // Creat a CF object
-  CF c(dataset);
+  CF<> c(dataset);
 
   // Set number of recommendations.
   c.NumRecs(numRecs);
@@ -106,7 +106,7 @@
   arma::mat dataset;
   data::Load("GroupLens100k.csv", dataset);
 
-  CF c(dataset);
+  CF<> c(dataset);
 
   // Generate recommendations when query set is specified.
   c.GetRecommendations(recommendations, users);
@@ -161,7 +161,7 @@
   }
 
   // Now create the CF object.
-  CF c(dataset);
+  CF<> c(dataset);
 
   // Obtain 150 recommendations for the users in savedCols, and make sure the
   // missing item shows up in most of them.  First, create the list of users,

Modified: mlpack/trunk/src/mlpack/tests/to_string_test.cpp
==============================================================================
--- mlpack/trunk/src/mlpack/tests/to_string_test.cpp	(original)
+++ mlpack/trunk/src/mlpack/tests/to_string_test.cpp	Sat Mar  1 23:39:49 2014
@@ -292,7 +292,7 @@
   c(0, 2) = 1;
   c(1, 2) = 3;
   c(2, 2) = 0.7;
-  mlpack::cf::CF d(c, a, a);
+  mlpack::cf::CF<> d(c, a, a);
   Log::Debug << d;
   std::string s = d.ToString();
   BOOST_REQUIRE_NE(s, "");



More information about the mlpack-svn mailing list