[mlpack-git] master: Merge pull request #437 from sumedhghaisas/master (d2f2976)

gitdub at big.cc.gt.atl.ga.us gitdub at big.cc.gt.atl.ga.us
Thu May 21 21:05:13 EDT 2015


Repository : https://github.com/mlpack/mlpack

On branch  : master
Link       : https://github.com/mlpack/mlpack/compare/7e9cd46afb53817ae93ccbd02637d7726137ce4d...d2f2976c7a43f8ab9139064ae33304bcf9f4f884

>---------------------------------------------------------------

commit d2f2976c7a43f8ab9139064ae33304bcf9f4f884
Merge: 7e9cd46 436c65b
Author: Ryan Curtin <ryan at ratml.org>
Date:   Thu May 21 21:05:07 2015 -0400

    Merge pull request #437 from sumedhghaisas/master
    
    CF module refactoring


>---------------------------------------------------------------

d2f2976c7a43f8ab9139064ae33304bcf9f4f884
 src/mlpack/methods/cf/cf.hpp      | 30 +++++++++++++++++++++++--
 src/mlpack/methods/cf/cf_impl.hpp | 46 +++++++++++++++++++++++++++++++++++++--
 2 files changed, 72 insertions(+), 4 deletions(-)

diff --cc src/mlpack/methods/cf/cf.hpp
index 33e967d,835c62a..1a063f6
--- a/src/mlpack/methods/cf/cf.hpp
+++ b/src/mlpack/methods/cf/cf.hpp
@@@ -165,30 -191,10 +190,33 @@@ class C
                            arma::Mat<size_t>& recommendations,
                            arma::Col<size_t>& users);
                            
+   //! Converts the User, Item, Value Matrix to User-Item Table
+   static void CleanData(const arma::mat& data, arma::sp_mat& cleanedData);
+ 
    /**
 +   * Predict the rating of an item by a particular user.
 +   *
 +   * @param user User to predict for.
 +   * @param item Item to predict for.
 +   */
 +  double Predict(const size_t user, const size_t item) const;
 +
 +  /**
 +   * Predict ratings for each user-item combination in the given coordinate list
 +   * matrix.  The matrix 'combinations' should have two rows and number of
 +   * columns equal to the number of desired predictions.  The first element of
 +   * each column corresponds to the user index, and the second element of each
 +   * column corresponds to the item index.  The output vector 'predictions' will
 +   * have length equal to combinations.n_cols, and predictions[i] will be equal
 +   * to the prediction for the user/item combination in combinations.col(i).
 +   *
 +   * @param combinations User/item combinations to predict.
 +   * @param predictions Predicted ratings for each user/item combination.
 +   */
 +  void Predict(const arma::Mat<size_t>& combinations,
 +               arma::vec& predictions) const;
 +
 +  /**
     * Returns a string representation of this object.
     */
    std::string ToString() const;
diff --cc src/mlpack/methods/cf/cf_impl.hpp
index 63df8d1,63dbbbd..c2bd2d6
--- a/src/mlpack/methods/cf/cf_impl.hpp
+++ b/src/mlpack/methods/cf/cf_impl.hpp
@@@ -96,10 -96,51 +96,52 @@@ CF<FactorizerType>::CF(arma::mat& data
  
    // Operations independent of the query:
    // Decompose the sparse data matrix to user and data matrices.
 -  ApplyFactorizer<FactorizerType>(data, cleanedData, factorizer, this->rank, w, h);
 +  ApplyFactorizer<FactorizerType>(data, cleanedData, factorizer, this->rank, w,
 +      h);
  }
  
+ /**
+  * Construct the CF object using an instantiated factorizer.
+  */
+ template<typename FactorizerType>
+ template<typename U, class>
+ CF<FactorizerType>::CF(const arma::sp_mat& data,
+             FactorizerType factorizer,
+             const size_t numUsersForSimilarity,
+             const size_t rank) :
+     numUsersForSimilarity(numUsersForSimilarity),
+     rank(rank),
+     factorizer(factorizer)
+ {
+   // Validate neighbourhood size.
+   if (numUsersForSimilarity < 1)
+   {
+     Log::Warn << "CF::CF(): neighbourhood size should be > 0("
+         << numUsersForSimilarity << " given). Setting value to 5.\n";
+     //Setting Default Value of 5
+     this->numUsersForSimilarity = 5;
+   }
+ 
+   cleanedData = data;
+ 
+   // Check if the user wanted us to choose a rank for them.
+   if (rank == 0)
+   {
+     // This is a simple heuristic that picks a rank based on the density of the
+     // dataset between 5 and 105.
+     const double density = (cleanedData.n_nonzero * 100.0) / cleanedData.n_elem;
+     const size_t rankEstimate = size_t(density) + 5;
+ 
+     // Set to heuristic value.
+     Log::Info << "No rank given for decomposition; using rank of "
+         << rankEstimate << " calculated by density-based heuristic."
+         << std::endl;
+     this->rank = rankEstimate;
+   }
+ 
+   factorizer.Apply(cleanedData, rank, w, h);
+ }
+ 
  template<typename FactorizerType>
  void CF<FactorizerType>::GetRecommendations(const size_t numRecs,
                                              arma::Mat<size_t>& recommendations)
@@@ -200,105 -241,8 +242,105 @@@ void CF<FactorizerType>::GetRecommendat
    }
  }
  
 +// Predict the rating for a single user/item combination.
 +template<typename FactorizerType>
 +double CF<FactorizerType>::Predict(const size_t user, const size_t item) const
 +{
 +  // First, we need to find the nearest neighbors of the given user.
 +  // We'll use the same technique as for GetRecommendations().
 +
 +  // We want to avoid calculating the full rating matrix, so we will do nearest
 +  // neighbor search only on the H matrix, using the observation that if the
 +  // rating matrix X = W*H, then d(X.col(i), X.col(j)) = d(W H.col(i), W
 +  // H.col(j)).  This can be seen as nearest neighbor search on the H matrix
 +  // with the Mahalanobis distance where M^{-1} = W^T W.  So, we'll decompose
 +  // M^{-1} = L L^T (the Cholesky decomposition), and then multiply H by L^T.
 +  // Then we can perform nearest neighbor search.
 +  arma::mat l = arma::chol(w.t() * w);
 +  arma::mat stretchedH = l * h; // Due to the Armadillo API, l is L^T.
 +
 +  // Now, we will use the decomposed w and h matrices to estimate what the user
 +  // would have rated items as, and then pick the best items.
 +
 +  // Temporarily store feature vector of queried users.
 +  arma::mat query = stretchedH.col(user);
 +
 +  // Temporary storage for neighborhood of the queried users.
 +  arma::Mat<size_t> neighborhood;
 +
 +  // Calculate the neighborhood of the queried users.
 +  // This should be a templatized option.
 +  neighbor::AllkNN a(stretchedH, false, true /* single-tree mode */);
 +  arma::mat resultingDistances; // Temporary storage.
 +
 +  a.Search(query, numUsersForSimilarity, neighborhood, resultingDistances);
 +
 +  double rating = 0; // We'll take the average of neighborhood values.
 +
 +  for (size_t j = 0; j < neighborhood.n_rows; ++j)
 +    rating += arma::as_scalar(w.row(item) * h.col(neighborhood(j, 0)));
 +  rating /= neighborhood.n_rows;
 +
 +  return rating;
 +}
 +
 +// Predict the rating for a group of user/item combinations.
 +template<typename FactorizerType>
 +void CF<FactorizerType>::Predict(const arma::Mat<size_t>& combinations,
 +                                 arma::vec& predictions) const
 +{
 +  // First, for nearest neighbor search, stretch the H matrix.
 +  arma::mat l = arma::chol(w.t() * w);
 +  arma::mat stretchedH = l * h; // Due to the Armadillo API, l is L^T.
 +
 +  // Now, we must determine those query indices we need to find the nearest
 +  // neighbors for.  This is easiest if we just sort the combinations matrix.
 +  arma::Mat<size_t> sortedCombinations(combinations.n_rows,
 +                                       combinations.n_cols);
 +  arma::uvec ordering = arma::sort_index(combinations.row(0).t());
 +  for (size_t i = 0; i < ordering.n_elem; ++i)
 +    sortedCombinations.col(i) = combinations.col(ordering[i]);
 +
 +  // Now, we have to get the list of unique users we will be searching for.
 +  arma::Col<size_t> users = arma::unique(combinations.row(0).t());
 +
 +  // Assemble our query matrix from the stretchedH matrix.
 +  arma::mat queries(stretchedH.n_rows, users.n_elem);
 +  for (size_t i = 0; i < queries.n_cols; ++i)
 +    queries.col(i) = stretchedH.col(users[i]);
 +
 +  // Now calculate the neighborhood of these users.
 +  neighbor::AllkNN a(stretchedH);
 +  arma::mat distances;
 +  arma::Mat<size_t> neighborhood;
 +
 +  a.Search(queries, numUsersForSimilarity, neighborhood, distances);
 +
 +  // Now that we have the neighborhoods we need, calculate the predictions.
 +  predictions.set_size(combinations.n_cols);
 +
 +  size_t user = 0; // Cumulative user count, because we are doing it in order.
 +  for (size_t i = 0; i < sortedCombinations.n_cols; ++i)
 +  {
 +    // Could this be made faster by calculating dot products for multiple items
 +    // at once?
 +    double rating = 0.0;
 +
 +    // Map the combination's user to the user ID used for kNN.
 +    while (users[user] < sortedCombinations(0, i))
 +      ++user;
 +
 +    for (size_t j = 0; j < neighborhood.n_rows; ++j)
 +      rating += arma::as_scalar(w.row(sortedCombinations(1, i)) *
 +          h.col(neighborhood(j, user)));
 +    rating /= neighborhood.n_rows;
 +
 +    predictions(ordering[i]) = rating;
 +  }
 +}
 +
  template<typename FactorizerType>
- void CF<FactorizerType>::CleanData(const arma::mat& data)
+ void CF<FactorizerType>::CleanData(const arma::mat& data, arma::sp_mat& cleanedData)
  {
    // Generate list of locations for batch insert constructor for sparse
    // matrices.



More information about the mlpack-git mailing list