[mlpack-git] master: Add Predict() method for predicting individual ratings. (191713c)

gitdub at big.cc.gt.atl.ga.us gitdub at big.cc.gt.atl.ga.us
Mon Apr 27 15:26:43 EDT 2015


Repository : https://github.com/mlpack/mlpack

On branch  : master
Link       : https://github.com/mlpack/mlpack/compare/1352c1d37a82b0454df6bd1385734b5965c54002...191713c35eee8fd7370287c2b4ec7b17e01aec5a

>---------------------------------------------------------------

commit 191713c35eee8fd7370287c2b4ec7b17e01aec5a
Author: Ryan Curtin <ryan at ratml.org>
Date:   Mon Apr 27 15:26:27 2015 -0400

    Add Predict() method for predicting individual ratings.


>---------------------------------------------------------------

191713c35eee8fd7370287c2b4ec7b17e01aec5a
 src/mlpack/methods/cf/cf.hpp      |  8 ++++++
 src/mlpack/methods/cf/cf_impl.hpp | 45 ++++++++++++++++++++++++++++-
 src/mlpack/tests/cf_test.cpp      | 60 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 112 insertions(+), 1 deletion(-)

diff --git a/src/mlpack/methods/cf/cf.hpp b/src/mlpack/methods/cf/cf.hpp
index 03e953d..5c4ce2b 100644
--- a/src/mlpack/methods/cf/cf.hpp
+++ b/src/mlpack/methods/cf/cf.hpp
@@ -167,6 +167,14 @@ class CF
                           arma::Col<size_t>& users);
 
   /**
+   * Predict the rating of an item by a particular user.
+   *
+   * @param user User to predict for.
+   * @param item Item to predict for.
+   */
+  double Predict(const size_t user, const size_t item) const;
+
+  /**
    * Returns a string representation of this object.
    */
   std::string ToString() const;
diff --git a/src/mlpack/methods/cf/cf_impl.hpp b/src/mlpack/methods/cf/cf_impl.hpp
index d857166..5ad7e45 100644
--- a/src/mlpack/methods/cf/cf_impl.hpp
+++ b/src/mlpack/methods/cf/cf_impl.hpp
@@ -96,7 +96,8 @@ CF<FactorizerType>::CF(arma::mat& data,
 
   // Operations independent of the query:
   // Decompose the sparse data matrix to user and data matrices.
-  ApplyFactorizer<FactorizerType>(data, cleanedData, factorizer, this->rank, w, h);
+  ApplyFactorizer<FactorizerType>(data, cleanedData, factorizer, this->rank, w,
+      h);
 }
 
 template<typename FactorizerType>
@@ -199,6 +200,48 @@ void CF<FactorizerType>::GetRecommendations(const size_t numRecs,
   }
 }
 
+// Predict the rating for a single user/item combination.
+template<typename FactorizerType>
+double CF<FactorizerType>::Predict(const size_t user, const size_t item) const
+{
+  // First, we need to find the nearest neighbors of the given user.
+  // We'll use the same technique as for GetRecommendations().
+
+  // We want to avoid calculating the full rating matrix, so we will do nearest
+  // neighbor search only on the H matrix, using the observation that if the
+  // rating matrix X = W*H, then d(X.col(i), X.col(j)) = d(W H.col(i), W
+  // H.col(j)).  This can be seen as nearest neighbor search on the H matrix
+  // with the Mahalanobis distance where M^{-1} = W^T W.  So, we'll decompose
+  // M^{-1} = L L^T (the Cholesky decomposition), and then multiply H by L^T.
+  // Then we can perform nearest neighbor search.
+  arma::mat l = arma::chol(w.t() * w);
+  arma::mat stretchedH = l * h; // Due to the Armadillo API, l is L^T.
+
+  // Now, we will use the decomposed w and h matrices to estimate what the
+  // given user would have rated the given item as.
+
+  // Temporarily store the (stretched) feature vector of the queried user.
+  arma::mat query = stretchedH.col(user);
+
+  // Temporary storage for the neighborhood of the queried user.
+  arma::Mat<size_t> neighborhood;
+
+  // Calculate the neighborhood of the queried user.
+  // This should be a templatized option.
+  neighbor::AllkNN a(stretchedH, false, true /* single-tree mode */);
+  arma::mat resultingDistances; // Temporary storage; not used afterwards.
+
+  a.Search(query, numUsersForSimilarity, neighborhood, resultingDistances);
+
+  double rating = 0; // Average of the neighbors' estimated ratings for item.
+
+  for (size_t j = 0; j < neighborhood.n_rows; ++j)
+    rating += arma::as_scalar(w.row(item) * h.col(neighborhood(j, 0)));
+  rating /= neighborhood.n_rows;
+
+  return rating;
+}
+
 template<typename FactorizerType>
 void CF<FactorizerType>::CleanData(const arma::mat& data)
 {
diff --git a/src/mlpack/tests/cf_test.cpp b/src/mlpack/tests/cf_test.cpp
index 6b4d41e..fff769a 100644
--- a/src/mlpack/tests/cf_test.cpp
+++ b/src/mlpack/tests/cf_test.cpp
@@ -175,4 +175,64 @@ BOOST_AUTO_TEST_CASE(RecommendationAccuracyTest)
   BOOST_REQUIRE_LT(failures, 100);
 }
 
+// Make sure that Predict() is returning reasonable results.
+BOOST_AUTO_TEST_CASE(CFPredictTest)
+{
+  // Load the GroupLens dataset; then, we will remove some values from it.
+  arma::mat dataset;
+  data::Load("GroupLens100k.csv", dataset);
+
+  // Save the columns we remove so predictions can be checked against them.
+  arma::mat savedCols(3, 300); // Room for 300 removed 5-star ratings.
+  size_t currentCol = 0;
+  for (size_t i = 0; i < dataset.n_cols; ++i)
+  {
+    if (currentCol == 300)
+      break;
+
+    if (dataset(2, i) > 4.5) // 5-star rating.
+    {
+      // Make sure we don't have this user yet.  This is a slow way to do this
+      // but I don't particularly care here because it's in the tests.
+      bool found = false;
+      for (size_t j = 0; j < currentCol; ++j)
+      {
+        if (savedCols(0, j) == dataset(0, i))
+        {
+          found = true;
+          break;
+        }
+      }
+
+      // If this user doesn't already exist in savedCols, add them.  Otherwise
+      // ignore this point.
+      if (!found)
+      {
+        savedCols.col(currentCol) = dataset.col(i);
+        dataset.shed_col(i); // NOTE(review): ++i then skips the shifted column?
+        ++currentCol;
+      }
+    }
+  }
+
+  // Now create the CF object.
+  CF<> c(dataset);
+
+  // Now, for each removed rating, make sure the prediction is... reasonably
+  // accurate.
+  double totalError = 0.0;
+  for (size_t i = 0; i < savedCols.n_cols; ++i)
+  {
+    const double prediction = c.Predict(savedCols(0, i), savedCols(1, i));
+
+    const double error = std::pow(prediction - savedCols(2, i), 2.0);
+    totalError += error;
+  }
+
+  totalError = std::sqrt(totalError) / savedCols.n_cols;
+
+  // sqrt(sum of squared errors) / number of predictions must be below 0.5.
+  BOOST_REQUIRE_LT(totalError, 0.5);
+}
+
 BOOST_AUTO_TEST_SUITE_END();



More information about the mlpack-git mailing list