[mlpack-svn] r16345 - in mlpack/trunk/src/mlpack: methods/cf tests
fastlab-svn at coffeetalk-1.cc.gatech.edu
fastlab-svn at coffeetalk-1.cc.gatech.edu
Sat Mar 1 23:39:49 EST 2014
Author: rcurtin
Date: Sat Mar 1 23:39:49 2014
New Revision: 16345
Log:
Patch from Siddharth: templatize CF to accept arbitrary types of factorizers.
Added:
mlpack/trunk/src/mlpack/methods/cf/cf_impl.hpp
Removed:
mlpack/trunk/src/mlpack/methods/cf/cf.cpp
Modified:
mlpack/trunk/src/mlpack/methods/cf/CMakeLists.txt
mlpack/trunk/src/mlpack/methods/cf/cf.hpp
mlpack/trunk/src/mlpack/methods/cf/cf_main.cpp
mlpack/trunk/src/mlpack/tests/cf_test.cpp
mlpack/trunk/src/mlpack/tests/to_string_test.cpp
Modified: mlpack/trunk/src/mlpack/methods/cf/CMakeLists.txt
==============================================================================
--- mlpack/trunk/src/mlpack/methods/cf/CMakeLists.txt (original)
+++ mlpack/trunk/src/mlpack/methods/cf/CMakeLists.txt Sat Mar 1 23:39:49 2014
@@ -2,7 +2,7 @@
# Anything not in this list will not be compiled into MLPACK.
set(SOURCES
cf.hpp
- cf.cpp
+ cf_impl.hpp
)
# Add directory name to sources.
Modified: mlpack/trunk/src/mlpack/methods/cf/cf.hpp
==============================================================================
--- mlpack/trunk/src/mlpack/methods/cf/cf.hpp (original)
+++ mlpack/trunk/src/mlpack/methods/cf/cf.hpp Sat Mar 1 23:39:49 2014
@@ -12,10 +12,14 @@
#include <mlpack/core.hpp>
#include <mlpack/methods/neighbor_search/neighbor_search.hpp>
+#include <mlpack/methods/nmf/nmf.hpp>
+#include <mlpack/methods/nmf/als_update_rules.hpp>
#include <set>
#include <map>
#include <iostream>
+using namespace mlpack::nmf;
+
namespace mlpack {
namespace cf /** Collaborative filtering. */{
@@ -32,7 +36,7 @@
* arma::Mat<size_t> recommendations; // Recommendations
* size_t numRecommendations = 10;
*
- * CF cf(data); // Default options.
+ * CF<> cf(data); // Default options.
*
* // Generate the default number of recommendations for all users.
* cf.GetRecommendations(recommendations);
@@ -49,8 +53,17 @@
* should have three rows. The first represents the user; the second represents
* the item; and the third represents the rating. The user and item, while they
* are in a matrix that holds doubles, should hold integer (or size_t) values.
- * The user and item indices are assumed to be starting from 0.
+ * The user and item indices are assumed to start at 0.
+ *
+ * @tparam FactorizerType The type of matrix factorization to use to decompose
+ * the rating matrix (a W and H matrix). This must implement the method
+ * Apply(arma::sp_mat& data, size_t rank, arma::mat& W, arma::mat& H).
*/
+template<
+ typename FactorizerType = NMF<RandomInitialization,
+ WAlternatingLeastSquaresRule,
+ HAlternatingLeastSquaresRule>
+>
class CF
{
public:
@@ -82,13 +95,13 @@
this->numRecs = recs;
}
- //! Gets numRecs
- size_t NumRecs()
+ //! Gets the number of recommendations.
+ size_t NumRecs() const
{
return numRecs;
}
- //! Sets number of user for calculating similarity.
+ //! Sets number of users for calculating similarity.
void NumUsersForSimilarity(const size_t num)
{
if (num < 1)
@@ -101,7 +114,7 @@
}
//! Gets number of users for calculating similarity.
- size_t NumUsersForSimilarity()
+ size_t NumUsersForSimilarity() const
{
return numUsersForSimilarity;
}
@@ -113,11 +126,17 @@
}
//! Gets rank parameter for matrix factorization.
- size_t Rank()
+ size_t Rank() const
{
return rank;
}
+ //! Sets factorizer for NMF
+ void Factorizer(const FactorizerType& f)
+ {
+ this->factorizer = f;
+ }
+
//! Get the User Matrix.
const arma::mat& W() const { return w; }
//! Get the Item Matrix.
@@ -182,6 +201,8 @@
size_t numUsersForSimilarity;
//! Rank used for matrix factorization.
size_t rank;
+ //! Instantiated factorizer object.
+ FactorizerType factorizer;
//! User matrix.
arma::mat w;
//! Item matrix.
@@ -214,4 +235,7 @@
}; // namespace cf
}; // namespace mlpack
+//Include implementation
+#include "cf_impl.hpp"
+
#endif
Added: mlpack/trunk/src/mlpack/methods/cf/cf_impl.hpp
==============================================================================
--- (empty file)
+++ mlpack/trunk/src/mlpack/methods/cf/cf_impl.hpp Sat Mar 1 23:39:49 2014
@@ -0,0 +1,272 @@
+/**
+ * @file cf.cpp
+ * @author Mudit Raj Gupta
+ *
+ * Collaborative Filtering.
+ *
+ * Implementation of CF class to perform Collaborative Filtering on the
+ * specified data set.
+ *
+ */
+#include "cf.hpp"
+#include <mlpack/methods/nmf/nmf.hpp>
+#include <mlpack/methods/nmf/als_update_rules.hpp>
+
+using namespace mlpack::nmf;
+using namespace mlpack::neighbor;
+using namespace std;
+
+namespace mlpack {
+namespace cf {
+
+/**
+ * Construct the CF object.
+ */
+template<typename FactorizerType>
+CF<FactorizerType>::CF(arma::mat& data,
+ const size_t numRecs,
+ const size_t numUsersForSimilarity,
+ const size_t rank) :
+ data(data),
+ numRecs(numRecs),
+ numUsersForSimilarity(numUsersForSimilarity),
+ rank(rank)
+{
+ // Validate number of recommendation factor.
+ if (numRecs < 1)
+ {
+ Log::Warn << "CF::CF(): number of recommendations should be > 0("
+ << numRecs << " given). Setting value to 5.\n";
+ //Setting Default Value of 5
+ this->numRecs = 5;
+ }
+
+ // Validate neighbourhood size.
+ if (numUsersForSimilarity < 1)
+ {
+ Log::Warn << "CF::CF(): neighbourhood size should be > 0("
+ << numUsersForSimilarity << " given). Setting value to 5.\n";
+ //Setting Default Value of 5
+ this->numUsersForSimilarity = 5;
+ }
+
+ //Set default factorizer
+ FactorizerType f(10000, 1e-5);
+ Factorizer(f);
+
+ CleanData();
+}
+
+template<typename FactorizerType>
+void CF<FactorizerType>::GetRecommendations(arma::Mat<size_t>& recommendations)
+{
+ // Used to save user IDs.
+ arma::Col<size_t> users =
+ arma::zeros<arma::Col<size_t> >(cleanedData.n_cols, 1);
+ // Getting all user IDs.
+ for (size_t i = 0; i < cleanedData.n_cols; i++)
+ users(i) = i;
+
+ // Calling base function for recommendations.
+ GetRecommendations(recommendations, users);
+}
+
+template<typename FactorizerType>
+void CF<FactorizerType>::GetRecommendations(arma::Mat<size_t>& recommendations,
+ arma::Col<size_t>& users)
+{
+ // Base function for calculating recommendations.
+
+ // Check if the user wanted us to choose a rank for them.
+ if (rank == 0)
+ {
+ // This is a simple heuristic that picks a rank based on the density of the
+ // dataset between 5 and 105.
+ const double density = (cleanedData.n_nonzero * 100.0) / cleanedData.n_elem;
+ const size_t rankEstimate = size_t(density) + 5;
+
+ // Set to heuristic value.
+ Log::Info << "No rank given for decomposition; using rank of "
+ << rankEstimate << " calculated by density-based heuristic."
+ << std::endl;
+ rank = rankEstimate;
+ }
+
+ // Operations independent of the query:
+ // Decompose the sparse data matrix to user and data matrices.
+ // Presently only ALS (via NMF) is supported as an optimizer.
+ factorizer.Apply(cleanedData, rank, w, h);
+
+ // Generate new table by multiplying approximate values.
+ rating = w * h;
+
+ // Now, we will use the decomposed w and h matrices to estimate what the user
+ // would have rated items as, and then pick the best items.
+
+ // Temporarily store feature vector of queried users.
+ arma::mat query(rating.n_rows, users.n_elem);
+
+ // Select feature vectors of queried users.
+ for (size_t i = 0; i < users.n_elem; i++)
+ query.col(i) = rating.col(users(i));
+
+ // Temporary storage for neighborhood of the queried users.
+ arma::Mat<size_t> neighborhood;
+
+ // Calculate the neighborhood of the queried users.
+ // This should be a templatized option.
+ AllkNN a(rating, query);
+ arma::mat resultingDistances; // Temporary storage.
+ a.Search(numUsersForSimilarity, neighborhood, resultingDistances);
+
+ // Temporary storage for storing the average rating for each user in their
+ // neighborhood.
+ arma::mat averages = arma::zeros<arma::mat>(rating.n_rows, query.n_cols);
+
+ // Iterate over each query user.
+ for (size_t i = 0; i < neighborhood.n_cols; ++i)
+ {
+ // Iterate over each neighbor of the query user.
+ for (size_t j = 0; j < neighborhood.n_rows; ++j)
+ averages.col(i) += rating.col(neighborhood(j, i));
+ // Normalize average.
+ averages.col(i) /= neighborhood.n_rows;
+ }
+
+ // Generate recommendations for each query user by finding the maximum numRecs
+ // elements in the averages matrix.
+ recommendations.set_size(numRecs, users.n_elem);
+ recommendations.fill(cleanedData.n_rows); // Invalid item number.
+ arma::mat values(numRecs, users.n_elem);
+ values.fill(-DBL_MAX); // The smallest possible value.
+ for (size_t i = 0; i < users.n_elem; i++)
+ {
+ // Look through the averages column corresponding to the current user.
+ for (size_t j = 0; j < averages.n_rows; ++j)
+ {
+ // Ensure that the user hasn't already rated the item.
+ if (cleanedData(j, users(i)) != 0.0)
+ continue; // The user already rated the item.
+
+ // Is the estimated value better than the worst candidate?
+ const double value = averages(j, i);
+ if (value > values(values.n_rows - 1, i))
+ {
+ // It should be inserted. Which position?
+ size_t insertPosition = values.n_rows - 1;
+ while (insertPosition > 0)
+ {
+ if (value <= values(insertPosition - 1, i))
+ break; // The current value is the right one.
+ insertPosition--;
+ }
+
+ // Now insert it into the list.
+ InsertNeighbor(i, insertPosition, j, value, recommendations,
+ values);
+ }
+ }
+
+ // If we were not able to come up with enough recommendations, issue a
+ // warning.
+ if (recommendations(values.n_rows - 1, i) == cleanedData.n_rows + 1)
+ Log::Warn << "Could not provide " << values.n_rows << " recommendations "
+ << "for user " << users(i) << " (not enough un-rated items)!" << endl;
+ }
+}
+
+template<typename FactorizerType>
+void CF<FactorizerType>::GetRecommendations(arma::Mat<size_t>& recommendations,
+ arma::Col<size_t>& users,size_t num)
+{
+ //Setting Number of Recommendations
+ NumRecs(num);
+ //Calling Base Function for Recommendations
+ GetRecommendations(recommendations,users);
+}
+
+template<typename FactorizerType>
+void CF<FactorizerType>::GetRecommendations(arma::Mat<size_t>& recommendations,
+ arma::Col<size_t>& users,size_t num,
+ size_t s)
+{
+ //Setting number of users that should be used for calculating
+ //neighbours
+ NumUsersForSimilarity(s);
+ //Setting Number of Recommendations
+ NumRecs(num);
+ //Calling Base Function for Recommendations
+ GetRecommendations(recommendations,users,num);
+}
+
+template<typename FactorizerType>
+void CF<FactorizerType>::CleanData()
+{
+ // Generate list of locations for batch insert constructor for sparse
+ // matrices.
+ arma::umat locations(2, data.n_cols);
+ arma::vec values(data.n_cols);
+ for (size_t i = 0; i < data.n_cols; ++i)
+ {
+ // We have to transpose it because items are rows, and users are columns.
+ locations(1, i) = ((arma::uword) data(0, i));
+ locations(0, i) = ((arma::uword) data(1, i));
+ values(i) = data(2, i);
+ }
+
+ // Find maximum user and item IDs.
+ const size_t maxItemID = (size_t) max(locations.row(0)) + 1;
+ const size_t maxUserID = (size_t) max(locations.row(1)) + 1;
+
+ // Fill sparse matrix.
+ cleanedData = arma::sp_mat(locations, values, maxItemID, maxUserID);
+}
+
+/**
+ * Helper function to insert a point into the recommendation matrices.
+ *
+ * @param queryIndex Index of point whose recommendations we are inserting into.
+ * @param pos Position in list to insert into.
+ * @param neighbor Index of item being inserted as a recommendation.
+ * @param value Value of recommendation.
+ */
+template<typename FactorizerType>
+void CF<FactorizerType>::InsertNeighbor(const size_t queryIndex,
+ const size_t pos,
+ const size_t neighbor,
+ const double value,
+ arma::Mat<size_t>& recommendations,
+ arma::mat& values) const
+{
+ // We only memmove() if there is actually a need to shift something.
+ if (pos < (recommendations.n_rows - 1))
+ {
+ const int len = (values.n_rows - 1) - pos;
+ memmove(values.colptr(queryIndex) + (pos + 1),
+ values.colptr(queryIndex) + pos,
+ sizeof(double) * len);
+ memmove(recommendations.colptr(queryIndex) + (pos + 1),
+ recommendations.colptr(queryIndex) + pos,
+ sizeof(size_t) * len);
+ }
+
+ // Now put the new information in the right index.
+ values(pos, queryIndex) = value;
+ recommendations(pos, queryIndex) = neighbor;
+}
+
+// Return string of object.
+template<typename FactorizerType>
+std::string CF<FactorizerType>::ToString() const
+{
+ std::ostringstream convert;
+ convert << "Collaborative Filtering [" << this << "]" << std::endl;
+ //convert << " Number of Recommendations: " << numRecs << std::endl;
+ //convert << " Number of Users for Similarity: " << numUsersForSimilarity;
+ //convert << std::endl;
+ //convert << " Data: " << data.n_rows << "x" << data.n_cols << std::endl;
+ return convert.str();
+}
+
+}; // namespace mlpack
+}; // namespace cf
Modified: mlpack/trunk/src/mlpack/methods/cf/cf_main.cpp
==============================================================================
--- mlpack/trunk/src/mlpack/methods/cf/cf_main.cpp (original)
+++ mlpack/trunk/src/mlpack/methods/cf/cf_main.cpp Sat Mar 1 23:39:49 2014
@@ -70,7 +70,7 @@
// Perform decomposition to prepare for recommendations.
Log::Info << "Performing CF matrix decomposition on dataset..." << endl;
- CF c(dataset);
+ CF<> c(dataset);
c.NumRecs(numRecs);
c.NumUsersForSimilarity(neighborhood);
Modified: mlpack/trunk/src/mlpack/tests/cf_test.cpp
==============================================================================
--- mlpack/trunk/src/mlpack/tests/cf_test.cpp (original)
+++ mlpack/trunk/src/mlpack/tests/cf_test.cpp Sat Mar 1 23:39:49 2014
@@ -33,7 +33,7 @@
// Number of users for similarity (not the default).
const size_t numUsersForSimilarity = 8;
- CF c(dataset, numRecs, numUsersForSimilarity);
+ CF<> c(dataset, numRecs, numUsersForSimilarity);
// Check parameters.
BOOST_REQUIRE_EQUAL(c.NumRecs(), numRecs);
@@ -68,7 +68,7 @@
data::Load("GroupLens100k.csv", dataset);
// Creat a CF object
- CF c(dataset);
+ CF<> c(dataset);
// Set number of recommendations.
c.NumRecs(numRecs);
@@ -106,7 +106,7 @@
arma::mat dataset;
data::Load("GroupLens100k.csv", dataset);
- CF c(dataset);
+ CF<> c(dataset);
// Generate recommendations when query set is specified.
c.GetRecommendations(recommendations, users);
@@ -161,7 +161,7 @@
}
// Now create the CF object.
- CF c(dataset);
+ CF<> c(dataset);
// Obtain 150 recommendations for the users in savedCols, and make sure the
// missing item shows up in most of them. First, create the list of users,
Modified: mlpack/trunk/src/mlpack/tests/to_string_test.cpp
==============================================================================
--- mlpack/trunk/src/mlpack/tests/to_string_test.cpp (original)
+++ mlpack/trunk/src/mlpack/tests/to_string_test.cpp Sat Mar 1 23:39:49 2014
@@ -292,7 +292,7 @@
c(0, 2) = 1;
c(1, 2) = 3;
c(2, 2) = 0.7;
- mlpack::cf::CF d(c, a, a);
+ mlpack::cf::CF<> d(c, a, a);
Log::Debug << d;
std::string s = d.ToString();
BOOST_REQUIRE_NE(s, "");
More information about the mlpack-svn
mailing list