[mlpack-git] master: * Adding support for user cleaned matrix in CF (b28b2de)

gitdub at big.cc.gt.atl.ga.us gitdub at big.cc.gt.atl.ga.us
Thu May 21 21:05:08 EDT 2015


Repository : https://github.com/mlpack/mlpack

On branch  : master
Link       : https://github.com/mlpack/mlpack/compare/7e9cd46afb53817ae93ccbd02637d7726137ce4d...d2f2976c7a43f8ab9139064ae33304bcf9f4f884

>---------------------------------------------------------------

commit b28b2deb137beaca385e3e9a6ef07bcc27652973
Author: sumedhghaisas <sumedhghaisas at gmail.com>
Date:   Tue Apr 28 03:50:17 2015 +0530

    * Adding support for user cleaned matrix in CF


>---------------------------------------------------------------

b28b2deb137beaca385e3e9a6ef07bcc27652973
 src/mlpack/methods/cf/cf.hpp      | 24 ++++++++++++++++++--
 src/mlpack/methods/cf/cf_impl.hpp | 48 +++++++++++++++++++++++++++++++++++++--
 2 files changed, 68 insertions(+), 4 deletions(-)

diff --git a/src/mlpack/methods/cf/cf.hpp b/src/mlpack/methods/cf/cf.hpp
index 03e953d..61b9135 100644
--- a/src/mlpack/methods/cf/cf.hpp
+++ b/src/mlpack/methods/cf/cf.hpp
@@ -95,6 +95,25 @@ class CF
      const size_t numUsersForSimilarity = 5,
      const size_t rank = 0);
      
+  /**
+   * Initialize the CF object using an instantiated factorizer. Store a
+   * reference to the data that we will be using. There are parameters that can
+   * be set; default values are provided for each of them. If the rank is left
+   * unset (or is set to 0), a simple density-based heuristic will be used to
+   * choose a rank. Data will be considered in the format of items vs. users and 
+   * will be passed directly to the factorizer without cleaning.
+   *
+   * @param data Already-cleaned sparse rating matrix (items vs. users).
+   * @param factorizer Instantiated factorizer object.
+   * @param numUsersForSimilarity Size of the neighborhood.
+   * @param rank Rank parameter for matrix factorization.
+   * @param isCleaned If the data passed is cleaned for CF
+   */
+  CF(arma::sp_mat& data,
+     FactorizerType factorizer = FactorizerType(),
+     const size_t numUsersForSimilarity = 5,
+     const size_t rank = 0);
+   
   /*void ApplyFactorizer(arma::mat& data, const typename boost::enable_if_c<
       FactorizerTraits<FactorizerType>::IsCleaned == false, int*>::type);
       
@@ -166,6 +185,9 @@ class CF
                           arma::Mat<size_t>& recommendations,
                           arma::Col<size_t>& users);
                           
+  //! Converts the (user, item, value) matrix to a user-item table, cleanedData.
+  static void CleanData(const arma::mat& data, arma::sp_mat& cleanedData);
+
   /**
    * Returns a string representation of this object.
    */
@@ -186,8 +208,6 @@ class CF
   arma::mat rating;
   //! Cleaned data matrix.
   arma::sp_mat cleanedData;
-  //! Converts the User, Item, Value Matrix to User-Item Table
-  void CleanData(const arma::mat& data);
 
   /**
    * Helper function to insert a point into the recommendation matrices.
diff --git a/src/mlpack/methods/cf/cf_impl.hpp b/src/mlpack/methods/cf/cf_impl.hpp
index d857166..0b0eee4 100644
--- a/src/mlpack/methods/cf/cf_impl.hpp
+++ b/src/mlpack/methods/cf/cf_impl.hpp
@@ -63,6 +63,50 @@ template<typename FactorizerType>
 CF<FactorizerType>::CF(arma::mat& data,
                        FactorizerType factorizer,
                        const size_t numUsersForSimilarity,
+                       const size_t rank,
+                       bool isCleaned) :
+    numUsersForSimilarity(numUsersForSimilarity),
+    rank(rank),
+    factorizer(factorizer)
+{
+  // Validate neighbourhood size.
+  if (numUsersForSimilarity < 1)
+  {
+    Log::Warn << "CF::CF(): neighbourhood size should be > 0("
+        << numUsersForSimilarity << " given). Setting value to 5.\n";
+    // Fall back to the default neighbourhood size of 5.
+    this->numUsersForSimilarity = 5;
+  }
+
+  CleanData(data, cleanedData);
+
+  // Check if the user wanted us to choose a rank for them.
+  if (rank == 0)
+  {
+    // This is a simple heuristic that picks a rank between 5 and 105 based on
+    // the density of the dataset.
+    const double density = (cleanedData.n_nonzero * 100.0) / cleanedData.n_elem;
+    const size_t rankEstimate = size_t(density) + 5;
+
+    // Set to heuristic value.
+    Log::Info << "No rank given for decomposition; using rank of "
+        << rankEstimate << " calculated by density-based heuristic."
+        << std::endl;
+    this->rank = rankEstimate;
+  }
+
+  // Operations independent of the query:
+  // Decompose the sparse data matrix to user and data matrices.
+  ApplyFactorizer<FactorizerType>(data, cleanedData, factorizer, this->rank, w, h);
+}
+
+/**
+ * Construct the CF object using an instantiated factorizer.
+ */
+template<typename FactorizerType>
+CF<FactorizerType>::CF(arma::mat& data,
+                       FactorizerType factorizer,
+                       const size_t numUsersForSimilarity,
                        const size_t rank) :
     numUsersForSimilarity(numUsersForSimilarity),
     rank(rank),
@@ -77,7 +121,7 @@ CF<FactorizerType>::CF(arma::mat& data,
     this->numUsersForSimilarity = 5;
   }
 
-  CleanData(data);
+  CleanData(data, cleanedData);
 
   // Check if the user wanted us to choose a rank for them.
   if (rank == 0)
@@ -200,7 +244,7 @@ void CF<FactorizerType>::GetRecommendations(const size_t numRecs,
 }
 
 template<typename FactorizerType>
-void CF<FactorizerType>::CleanData(const arma::mat& data)
+void CF<FactorizerType>::CleanData(const arma::mat& data, arma::sp_mat& cleanedData)
 {
   // Generate list of locations for batch insert constructor for sparse
   // matrices.



More information about the mlpack-git mailing list