[mlpack-git] master: Add Predict() method for predicting individual ratings. (191713c)
gitdub at big.cc.gt.atl.ga.us
gitdub at big.cc.gt.atl.ga.us
Mon Apr 27 15:26:43 EDT 2015
Repository : https://github.com/mlpack/mlpack
On branch : master
Link : https://github.com/mlpack/mlpack/compare/1352c1d37a82b0454df6bd1385734b5965c54002...191713c35eee8fd7370287c2b4ec7b17e01aec5a
>---------------------------------------------------------------
commit 191713c35eee8fd7370287c2b4ec7b17e01aec5a
Author: Ryan Curtin <ryan at ratml.org>
Date: Mon Apr 27 15:26:27 2015 -0400
Add Predict() method for predicting individual ratings.
>---------------------------------------------------------------
191713c35eee8fd7370287c2b4ec7b17e01aec5a
src/mlpack/methods/cf/cf.hpp | 8 ++++++
src/mlpack/methods/cf/cf_impl.hpp | 45 ++++++++++++++++++++++++++++-
src/mlpack/tests/cf_test.cpp | 60 +++++++++++++++++++++++++++++++++++++++
3 files changed, 112 insertions(+), 1 deletion(-)
diff --git a/src/mlpack/methods/cf/cf.hpp b/src/mlpack/methods/cf/cf.hpp
index 03e953d..5c4ce2b 100644
--- a/src/mlpack/methods/cf/cf.hpp
+++ b/src/mlpack/methods/cf/cf.hpp
@@ -167,6 +167,14 @@ class CF
arma::Col<size_t>& users);
/**
+ * Predict the rating of an item by a particular user and return it.
+ *
+ * @param user Index of the user to predict for.
+ * @param item Index of the item to predict for.
+ */
+ double Predict(const size_t user, const size_t item) const;
+
+ /**
* Returns a string representation of this object.
*/
std::string ToString() const;
diff --git a/src/mlpack/methods/cf/cf_impl.hpp b/src/mlpack/methods/cf/cf_impl.hpp
index d857166..5ad7e45 100644
--- a/src/mlpack/methods/cf/cf_impl.hpp
+++ b/src/mlpack/methods/cf/cf_impl.hpp
@@ -96,7 +96,8 @@ CF<FactorizerType>::CF(arma::mat& data,
// Operations independent of the query:
// Decompose the sparse data matrix to user and data matrices.
- ApplyFactorizer<FactorizerType>(data, cleanedData, factorizer, this->rank, w, h);
+ ApplyFactorizer<FactorizerType>(data, cleanedData, factorizer, this->rank, w,
+ h);
}
template<typename FactorizerType>
@@ -199,6 +200,48 @@ void CF<FactorizerType>::GetRecommendations(const size_t numRecs,
}
}
+// Predict a single user/item rating by averaging over the user's neighbors.
+template<typename FactorizerType>
+double CF<FactorizerType>::Predict(const size_t user, const size_t item) const
+{
+ // First, we need to find the nearest neighbors of the given user.
+ // We'll use the same technique as for GetRecommendations().
+
+ // We want to avoid calculating the full rating matrix, so we will do nearest
+ // neighbor search only on the H matrix, using the observation that if the
+ // rating matrix X = W*H, then d(X.col(i), X.col(j)) = d(W H.col(i), W
+ // H.col(j)). This can be seen as nearest neighbor search on the H matrix
+ // with the Mahalanobis distance where M^{-1} = W^T W. So, we'll decompose
+ // M^{-1} = L L^T (the Cholesky decomposition), and then multiply H by L^T.
+ // Then we can perform nearest neighbor search.
+ arma::mat l = arma::chol(w.t() * w);
+ arma::mat stretchedH = l * h; // Due to the Armadillo API, l is L^T.
+
+ // Now, we will use the decomposed w and h matrices to estimate what each
+ // neighbor would have rated the item as, and then average those estimates.
+
+ // Temporarily store the feature vector of the queried user.
+ arma::mat query = stretchedH.col(user);
+
+ // Temporary storage for the neighborhood of the queried user.
+ arma::Mat<size_t> neighborhood;
+
+ // Calculate the neighborhood of the queried user.
+ // TODO: the neighbor search type should be a templatized option.
+ neighbor::AllkNN a(stretchedH, false, true /* single-tree mode */);
+ arma::mat resultingDistances; // Filled by Search() but not used afterwards.
+
+ a.Search(query, numUsersForSimilarity, neighborhood, resultingDistances);
+
+ double rating = 0; // Sum of the neighbors' estimates; averaged below.
+
+ for (size_t j = 0; j < neighborhood.n_rows; ++j)
+ rating += arma::as_scalar(w.row(item) * h.col(neighborhood(j, 0)));
+ rating /= neighborhood.n_rows;
+
+ return rating;
+}
+
template<typename FactorizerType>
void CF<FactorizerType>::CleanData(const arma::mat& data)
{
diff --git a/src/mlpack/tests/cf_test.cpp b/src/mlpack/tests/cf_test.cpp
index 6b4d41e..fff769a 100644
--- a/src/mlpack/tests/cf_test.cpp
+++ b/src/mlpack/tests/cf_test.cpp
@@ -175,4 +175,64 @@ BOOST_AUTO_TEST_CASE(RecommendationAccuracyTest)
BOOST_REQUIRE_LT(failures, 100);
}
+// Make sure that Predict() is returning reasonable results.
+BOOST_AUTO_TEST_CASE(CFPredictTest)
+{
+ // Load the GroupLens dataset; then, we will remove some values from it.
+ arma::mat dataset;
+ data::Load("GroupLens100k.csv", dataset);
+
+ // Holds the removed columns: one (user, item, rating) triple per column.
+ arma::mat savedCols(3, 300); // Room for 300 removed 5-star ratings.
+ size_t currentCol = 0;
+ for (size_t i = 0; i < dataset.n_cols; ++i)
+ {
+ if (currentCol == 300)
+ break;
+
+ if (dataset(2, i) > 4.5) // 5-star rating.
+ {
+ // Make sure we don't have this user yet. This is a slow way to do this
+ // but I don't particularly care here because it's in the tests.
+ bool found = false;
+ for (size_t j = 0; j < currentCol; ++j)
+ {
+ if (savedCols(0, j) == dataset(0, i))
+ {
+ found = true;
+ break;
+ }
+ }
+
+ // If this user doesn't already exist in savedCols, add them. Otherwise
+ // ignore this point. NOTE: after shed_col(i), the ++i skips one column.
+ if (!found)
+ {
+ savedCols.col(currentCol) = dataset.col(i);
+ dataset.shed_col(i);
+ ++currentCol;
+ }
+ }
+ }
+
+ // Now create the CF object.
+ CF<> c(dataset);
+
+ // Now, for each removed rating, accumulate the squared difference between
+ // the prediction and the true rating.
+ double totalError = 0.0;
+ for (size_t i = 0; i < savedCols.n_cols; ++i)
+ {
+ const double prediction = c.Predict(savedCols(0, i), savedCols(1, i));
+
+ const double error = std::pow(prediction - savedCols(2, i), 2.0);
+ totalError += error;
+ }
+
+ totalError = std::sqrt(totalError) / savedCols.n_cols;
+
+ // NOTE: this is sqrt(sum of squared errors) / n; it must be less than 0.5.
+ BOOST_REQUIRE_LT(totalError, 0.5);
+}
+
BOOST_AUTO_TEST_SUITE_END();
More information about the mlpack-git
mailing list