[mlpack-git] master: Merge pull request #437 from sumedhghaisas/master (d2f2976)
gitdub at big.cc.gt.atl.ga.us
gitdub at big.cc.gt.atl.ga.us
Thu May 21 21:05:13 EDT 2015
Repository : https://github.com/mlpack/mlpack
On branch : master
Link : https://github.com/mlpack/mlpack/compare/7e9cd46afb53817ae93ccbd02637d7726137ce4d...d2f2976c7a43f8ab9139064ae33304bcf9f4f884
>---------------------------------------------------------------
commit d2f2976c7a43f8ab9139064ae33304bcf9f4f884
Merge: 7e9cd46 436c65b
Author: Ryan Curtin <ryan at ratml.org>
Date: Thu May 21 21:05:07 2015 -0400
Merge pull request #437 from sumedhghaisas/master
CF module refactoring
>---------------------------------------------------------------
d2f2976c7a43f8ab9139064ae33304bcf9f4f884
src/mlpack/methods/cf/cf.hpp | 30 +++++++++++++++++++++++--
src/mlpack/methods/cf/cf_impl.hpp | 46 +++++++++++++++++++++++++++++++++++++--
2 files changed, 72 insertions(+), 4 deletions(-)
diff --cc src/mlpack/methods/cf/cf.hpp
index 33e967d,835c62a..1a063f6
--- a/src/mlpack/methods/cf/cf.hpp
+++ b/src/mlpack/methods/cf/cf.hpp
@@@ -165,30 -191,10 +190,33 @@@ class C
arma::Mat<size_t>& recommendations,
arma::Col<size_t>& users);
+ //! Converts the User, Item, Value Matrix to User-Item Table
+ static void CleanData(const arma::mat& data, arma::sp_mat& cleanedData);
+
/**
+ * Predict the rating of an item by a particular user.
+ *
+ * @param user User to predict for.
+ * @param item Item to predict for.
+ */
+ double Predict(const size_t user, const size_t item) const;
+
+ /**
+ * Predict ratings for each user-item combination in the given coordinate list
+ * matrix. The matrix 'combinations' should have two rows and number of
+ * columns equal to the number of desired predictions. The first element of
+ * each column corresponds to the user index, and the second element of each
+ * column corresponds to the item index. The output vector 'predictions' will
+ * have length equal to combinations.n_cols, and predictions[i] will be equal
+ * to the prediction for the user/item combination in combinations.col(i).
+ *
+ * @param combinations User/item combinations to predict.
+ * @param predictions Predicted ratings for each user/item combination.
+ */
+ void Predict(const arma::Mat<size_t>& combinations,
+ arma::vec& predictions) const;
+
+ /**
* Returns a string representation of this object.
*/
std::string ToString() const;
diff --cc src/mlpack/methods/cf/cf_impl.hpp
index 63df8d1,63dbbbd..c2bd2d6
--- a/src/mlpack/methods/cf/cf_impl.hpp
+++ b/src/mlpack/methods/cf/cf_impl.hpp
@@@ -96,10 -96,51 +96,52 @@@ CF<FactorizerType>::CF(arma::mat& data
// Operations independent of the query:
// Decompose the sparse data matrix to user and data matrices.
- ApplyFactorizer<FactorizerType>(data, cleanedData, factorizer, this->rank, w, h);
+ ApplyFactorizer<FactorizerType>(data, cleanedData, factorizer, this->rank, w,
+ h);
}
+ /**
+ * Construct the CF object using an instantiated factorizer.
+ */
+ template<typename FactorizerType>
+ template<typename U, class>
+ CF<FactorizerType>::CF(const arma::sp_mat& data,
+ FactorizerType factorizer,
+ const size_t numUsersForSimilarity,
+ const size_t rank) :
+ numUsersForSimilarity(numUsersForSimilarity),
+ rank(rank),
+ factorizer(factorizer)
+ {
+ // Validate neighbourhood size.
+ if (numUsersForSimilarity < 1)
+ {
+ Log::Warn << "CF::CF(): neighbourhood size should be > 0("
+ << numUsersForSimilarity << " given). Setting value to 5.\n";
+ //Setting Default Value of 5
+ this->numUsersForSimilarity = 5;
+ }
+
+ cleanedData = data;
+
+ // Check if the user wanted us to choose a rank for them.
+ if (rank == 0)
+ {
+ // This is a simple heuristic that picks a rank based on the density of the
+ // dataset between 5 and 105.
+ const double density = (cleanedData.n_nonzero * 100.0) / cleanedData.n_elem;
+ const size_t rankEstimate = size_t(density) + 5;
+
+ // Set to heuristic value.
+ Log::Info << "No rank given for decomposition; using rank of "
+ << rankEstimate << " calculated by density-based heuristic."
+ << std::endl;
+ this->rank = rankEstimate;
+ }
+
+ factorizer.Apply(cleanedData, rank, w, h);
+ }
+
template<typename FactorizerType>
void CF<FactorizerType>::GetRecommendations(const size_t numRecs,
arma::Mat<size_t>& recommendations)
@@@ -200,105 -241,8 +242,105 @@@ void CF<FactorizerType>::GetRecommendat
}
}
+// Predict the rating for a single user/item combination.
+template<typename FactorizerType>
+double CF<FactorizerType>::Predict(const size_t user, const size_t item) const
+{
+ // First, we need to find the nearest neighbors of the given user.
+ // We'll use the same technique as for GetRecommendations().
+
+ // We want to avoid calculating the full rating matrix, so we will do nearest
+ // neighbor search only on the H matrix, using the observation that if the
+ // rating matrix X = W*H, then d(X.col(i), X.col(j)) = d(W H.col(i), W
+ // H.col(j)). This can be seen as nearest neighbor search on the H matrix
+ // with the Mahalanobis distance where M^{-1} = W^T W. So, we'll decompose
+ // M^{-1} = L L^T (the Cholesky decomposition), and then multiply H by L^T.
+ // Then we can perform nearest neighbor search.
+ arma::mat l = arma::chol(w.t() * w);
+ arma::mat stretchedH = l * h; // Due to the Armadillo API, l is L^T.
+
+ // Now, we will use the decomposed w and h matrices to estimate what the user
+ // would have rated items as, and then pick the best items.
+
+ // Temporarily store feature vector of queried users.
+ arma::mat query = stretchedH.col(user);
+
+ // Temporary storage for neighborhood of the queried users.
+ arma::Mat<size_t> neighborhood;
+
+ // Calculate the neighborhood of the queried users.
+ // This should be a templatized option.
+ neighbor::AllkNN a(stretchedH, false, true /* single-tree mode */);
+ arma::mat resultingDistances; // Temporary storage.
+
+ a.Search(query, numUsersForSimilarity, neighborhood, resultingDistances);
+
+ double rating = 0; // We'll take the average of neighborhood values.
+
+ for (size_t j = 0; j < neighborhood.n_rows; ++j)
+ rating += arma::as_scalar(w.row(item) * h.col(neighborhood(j, 0)));
+ rating /= neighborhood.n_rows;
+
+ return rating;
+}
+
+// Predict the rating for a group of user/item combinations.
+template<typename FactorizerType>
+void CF<FactorizerType>::Predict(const arma::Mat<size_t>& combinations,
+ arma::vec& predictions) const
+{
+ // First, for nearest neighbor search, stretch the H matrix.
+ arma::mat l = arma::chol(w.t() * w);
+ arma::mat stretchedH = l * h; // Due to the Armadillo API, l is L^T.
+
+ // Now, we must determine those query indices we need to find the nearest
+ // neighbors for. This is easiest if we just sort the combinations matrix.
+ arma::Mat<size_t> sortedCombinations(combinations.n_rows,
+ combinations.n_cols);
+ arma::uvec ordering = arma::sort_index(combinations.row(0).t());
+ for (size_t i = 0; i < ordering.n_elem; ++i)
+ sortedCombinations.col(i) = combinations.col(ordering[i]);
+
+ // Now, we have to get the list of unique users we will be searching for.
+ arma::Col<size_t> users = arma::unique(combinations.row(0).t());
+
+ // Assemble our query matrix from the stretchedH matrix.
+ arma::mat queries(stretchedH.n_rows, users.n_elem);
+ for (size_t i = 0; i < queries.n_cols; ++i)
+ queries.col(i) = stretchedH.col(users[i]);
+
+ // Now calculate the neighborhood of these users.
+ neighbor::AllkNN a(stretchedH);
+ arma::mat distances;
+ arma::Mat<size_t> neighborhood;
+
+ a.Search(queries, numUsersForSimilarity, neighborhood, distances);
+
+ // Now that we have the neighborhoods we need, calculate the predictions.
+ predictions.set_size(combinations.n_cols);
+
+ size_t user = 0; // Cumulative user count, because we are doing it in order.
+ for (size_t i = 0; i < sortedCombinations.n_cols; ++i)
+ {
+ // Could this be made faster by calculating dot products for multiple items
+ // at once?
+ double rating = 0.0;
+
+ // Map the combination's user to the user ID used for kNN.
+ while (users[user] < sortedCombinations(0, i))
+ ++user;
+
+ for (size_t j = 0; j < neighborhood.n_rows; ++j)
+ rating += arma::as_scalar(w.row(sortedCombinations(1, i)) *
+ h.col(neighborhood(j, user)));
+ rating /= neighborhood.n_rows;
+
+ predictions(ordering[i]) = rating;
+ }
+}
+
template<typename FactorizerType>
- void CF<FactorizerType>::CleanData(const arma::mat& data)
+ void CF<FactorizerType>::CleanData(const arma::mat& data, arma::sp_mat& cleanedData)
{
// Generate list of locations for batch insert constructor for sparse
// matrices.
More information about the mlpack-git
mailing list