[mlpack-git] master: Adds new constructor to LSHSearch and merges Train() and BuildHash() methods (06fdfa8)

gitdub at mlpack.org
Thu Jun 2 07:39:07 EDT 2016


Repository : https://github.com/mlpack/mlpack
On branch  : master
Link       : https://github.com/mlpack/mlpack/compare/eba4f9924694bc10daec74ff5059dbb8af001416...e3a23c256f017ebb8185b15847c82f51d359cdfd

>---------------------------------------------------------------

commit 06fdfa866a1d85bc8213c0f86777865174fafd84
Author: Yannis Mentekidis <mentekid at gmail.com>
Date:   Thu Jun 2 14:39:07 2016 +0300

    Adds new constructor to LSHSearch and merges Train() and BuildHash() methods
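
    For context, a minimal sketch of how the new cube-based constructor and
    the Projections() accessors might be used (the dataset, its size, and
    the table counts below are made up for illustration):

        #include <mlpack/core.hpp>
        #include <mlpack/methods/lsh/lsh_search.hpp>

        using namespace mlpack::neighbor;

        // Hypothetical 100-dimensional reference set with 1000 points.
        arma::mat referenceSet(100, 1000, arma::fill::randu);

        // Cube of projection tables, sized (dimensionality x numProj x
        // numTables); here numProj = 10 and numTables = 30.
        arma::cube projections(100, 10, 30, arma::fill::randn);

        // Build the hash on the reference set using the given tables.
        LSHSearch<> lsh(referenceSet, projections);

        // Projections() returns the tables by const reference; assigning a
        // new cube through Projections(arma::cube) retrains the object.
        arma::cube tables = lsh.Projections();
        lsh.Projections(tables);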


>---------------------------------------------------------------

06fdfa866a1d85bc8213c0f86777865174fafd84
 HISTORY.md                                 |  10 +
 src/mlpack/methods/lsh/lsh_search.hpp      |  27 ++-
 src/mlpack/methods/lsh/lsh_search_impl.hpp | 331 +++++++++++++++--------------
 3 files changed, 205 insertions(+), 163 deletions(-)

diff --git a/HISTORY.md b/HISTORY.md
index cddb28a..de21f89 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -1,5 +1,15 @@
 ### mlpack 2.0.2
 ###### 2016-??-??
+  * LSHSearch::Projection(size_t) that returned a single projection matrix has
+    been removed. In its place, LSHSearch::Projections() has been added, which
+    returns an arma::cube with each projection table in a slice (#663).
+
+  * A new constructor has been added to LSHSearch that creates objects using
+    projection tables provided in an arma::cube (#663).
+
+  * LSHSearch::Projections(arma::cube) has been added that allows users to
+    change the projection tables of an LSHSearch object (#663).
+
   * Handle zero-variance dimensions in DET (#515).
 
   * Add MiniBatchSGD optimizer (src/mlpack/core/optimizers/minibatch_sgd/) and
diff --git a/src/mlpack/methods/lsh/lsh_search.hpp b/src/mlpack/methods/lsh/lsh_search.hpp
index d3bc2f9..9c7c1d6 100644
--- a/src/mlpack/methods/lsh/lsh_search.hpp
+++ b/src/mlpack/methods/lsh/lsh_search.hpp
@@ -65,6 +65,31 @@ class LSHSearch
    *     Default values are already provided here.
    */
   LSHSearch(const arma::mat& referenceSet,
+            const arma::cube& projections,
+            const double hashWidth = 0.0,
+            const size_t secondHashSize = 99901,
+            const size_t bucketSize = 500);
+
+  /**
+   * This constructor initializes the LSH class. It builds the hash on the
+   * reference set using the provided projections. See the individual
+   * functions performing the hashing for details on how the hashing is done.
+   *
+   * @param referenceSet Set of reference points.
+   * @param projections Cube of projection tables. For a cube of size
+   *     (a, b, c), a is the reference set dimensionality, numProj = b, and
+   *     numTables = c.
+   * @param hashWidth The hash width for every table. If 0 (the default) is
+   *     provided, then the hash width is automatically computed as the
+   *     average distance of 25 sampled pairs of points, which should be a
+   *     reasonable upper bound on the nearest-neighbor distance in general.
+   * @param secondHashSize The size of the second hash table. This should be a
+   *     large prime number.
+   * @param bucketSize The size of the bucket in the second hash table. This is
+   *     the maximum number of points that can be hashed into a single bucket.
+   *     Default values are already provided here.
+   */
+  LSHSearch(const arma::mat& referenceSet,
             const size_t numProj,
             const size_t numTables,
             const double hashWidth = 0.0,
@@ -177,7 +202,7 @@ class LSHSearch
   const arma::Mat<size_t>& SecondHashTable() const { return secondHashTable; }
 
   //! Get the projection tables.
-  const arma::cube Projections() { return projections; }
+  const arma::cube& Projections() { return projections; }
 
   //! Change the projection tables (Retrains object)
   void Projections(const arma::cube &projTables)
diff --git a/src/mlpack/methods/lsh/lsh_search_impl.hpp b/src/mlpack/methods/lsh/lsh_search_impl.hpp
index afeaf05..c0fc57e 100644
--- a/src/mlpack/methods/lsh/lsh_search_impl.hpp
+++ b/src/mlpack/methods/lsh/lsh_search_impl.hpp
@@ -12,7 +12,7 @@
 namespace mlpack {
 namespace neighbor {
 
-// Construct the object.
+// Construct the object, generating random projection tables.
 template<typename SortPolicy>
 LSHSearch<SortPolicy>::
 LSHSearch(const arma::mat& referenceSet,
@@ -35,6 +35,28 @@ LSHSearch(const arma::mat& referenceSet,
       bucketSize);
 }
 
+// Construct the object with the given projection tables.
+template<typename SortPolicy>
+LSHSearch<SortPolicy>::
+LSHSearch(const arma::mat& referenceSet,
+          const arma::cube& projections,
+          const double hashWidthIn,
+          const size_t secondHashSize,
+          const size_t bucketSize) :
+  referenceSet(NULL), // This will be set in Train().
+  ownsSet(false),
+  numProj(projections.n_cols),
+  numTables(projections.n_slices),
+  hashWidth(hashWidthIn),
+  secondHashSize(secondHashSize),
+  bucketSize(bucketSize),
+  distanceEvaluations(0)
+{
+  // Pass work to the training function.
+  Train(referenceSet, numProj, numTables, hashWidthIn, secondHashSize,
+      bucketSize, projections);
+}
+
 // Empty constructor.
 template<typename SortPolicy>
 LSHSearch<SortPolicy>::LSHSearch() :
@@ -98,7 +120,152 @@ void LSHSearch<SortPolicy>::Train(const arma::mat& referenceSet,
 
   Log::Info << "Hash width chosen as: " << hashWidth << std::endl;
 
-  BuildHash(projection);
+  // Hash building procedure:
+  // The first level hash for a single table outputs a 'numProj'-dimensional
+  // integer key for each point in the set -- (key, pointID). The key
+  // creation details are presented below. The second level hash then maps
+  // each key to an integer in the range [0, 'secondHashSize') (see Step V).
+
+  // Step I: Prepare the second level hash.
+
+  // Obtain the weights for the second hash.
+  secondHashWeights = arma::floor(arma::randu(numProj) *
+                                  (double) secondHashSize);
+
+  // The 'secondHashTable' is initially an empty matrix of size
+  // ('secondHashSize' x 'bucketSize'). Filling the buckets only as points
+  // land in them allows us to shrink the size of the 'secondHashTable' at
+  // the end of the hashing.
+
+  // Fill the second hash table with the value n = referenceSet.n_cols. No
+  // point has index 'n', so the presence of this value in a bucket denotes
+  // that there are no more points in that bucket.
+  secondHashTable.set_size(secondHashSize, bucketSize);
+  secondHashTable.fill(referenceSet.n_cols);
+
+  // Keep track of the size of each bucket in the hash.  At the end of hashing
+  // most buckets will be empty.
+  bucketContentSize.zeros(secondHashSize);
+
+  // Instead of putting a point in the row corresponding to its bucket, we
+  // choose the next empty row and keep track of the row in which the bucket
+  // lies. This allows us to stack the non-empty rows together and slice out
+  // the empty buckets at the end of the hashing.
+  bucketRowInHashTable.set_size(secondHashSize);
+  bucketRowInHashTable.fill(secondHashSize);
+
+  // Keep track of the number of non-empty rows in the 'secondHashTable'.
+  size_t numRowsInTable = 0;
+
+  // Step II: Generate the offsets for all projections in all tables.
+  // Each offset lies in [0, hashWidth], so we obtain the 'offsets' as
+  // randu(numProj, numTables) * hashWidth.
+  offsets.randu(numProj, numTables);
+  offsets *= hashWidth;
+
+  // Step III: Obtain the 'numProj' projections for each table.
+  projections.clear(); // Reset the projections cube.
+
+  if (projection.n_slices == 0) // Randomly generate tables.
+  {
+    // For L2 metric, 2-stable distributions are used, and
+    // the normal Z ~ N(0, 1) is a 2-stable distribution.
+
+    // 'numTables' random tables arranged in a cube.
+    projections.randn(referenceSet.n_rows, numProj, numTables);
+  }
+  else if (projection.n_slices == numTables) // User-defined tables.
+  {
+    projections = projection;
+  }
+  else // Invalid argument.
+  {
+    throw std::invalid_argument(
+        "the number of projection tables provided must be equal to "
+        "numTables");
+  }
+
+  for (size_t i = 0; i < numTables; i++)
+  {
+    // Step IV: Create the 'numProj'-dimensional key for each point in each
+    // table.
+
+    // The following code performs the task of hashing each point to a
+    // 'numProj'-dimensional integer key.  Hence you get a ('numProj' x
+    // 'referenceSet.n_cols') key matrix.
+    //
+    // For a single table, let the 'numProj' projections be denoted by 'proj_i'
+    // and the corresponding offset be 'offset_i'.  Then the key of a single
+    // point is obtained as:
+    // key = { floor( (<proj_i, point> + offset_i) / 'hashWidth' ) forall i }
+    arma::mat offsetMat = arma::repmat(offsets.unsafe_col(i), 1,
+                                       referenceSet.n_cols);
+    arma::mat hashMat = projections.slice(i).t() * referenceSet;
+    hashMat += offsetMat;
+    hashMat /= hashWidth;
+
+    // Step V: Put the points into the 'secondHashTable' by hashing the key.
+    // We hash every (key, point ID) pair to its corresponding bucket.
+    arma::rowvec secondHashVec = secondHashWeights.t() * arma::floor(hashMat);
+
+    // This gives us the bucket for the corresponding point ID.
+    for (size_t j = 0; j < secondHashVec.n_elem; j++)
+      secondHashVec[j] = (double)((size_t) secondHashVec[j] % secondHashSize);
+
+    Log::Assert(secondHashVec.n_elem == referenceSet.n_cols);
+
+    // Insert the point in the corresponding row to its bucket in the
+    // 'secondHashTable'.
+    for (size_t j = 0; j < secondHashVec.n_elem; j++)
+    {
+      // This is the bucket number.
+      size_t hashInd = (size_t) secondHashVec[j];
+      // The point ID is 'j'.
+
+      // If this is currently an empty bucket, start a new row and keep track
+      // of which row corresponds to the bucket.
+      if (bucketContentSize[hashInd] == 0)
+      {
+        // Start a new row for this bucket.
+        bucketRowInHashTable[hashInd] = numRowsInTable;
+        secondHashTable(numRowsInTable, 0) = j;
+
+        numRowsInTable++;
+      }
+
+      else
+      {
+        // If bucket is already present in the 'secondHashTable', find the
+        // corresponding row and insert the point ID in this row unless the
+        // bucket is full, in which case, do nothing.
+        if (bucketContentSize[hashInd] < bucketSize)
+          secondHashTable(bucketRowInHashTable[hashInd],
+                          bucketContentSize[hashInd]) = j;
+      }
+
+      // Increment the count of the points in this bucket.
+      if (bucketContentSize[hashInd] < bucketSize)
+        bucketContentSize[hashInd]++;
+    } // Loop over all points in the reference set.
+  } // Loop over tables.
+
+  // Step VI: Condensing the 'secondHashTable'.
+  size_t maxBucketSize = 0;
+  for (size_t i = 0; i < bucketContentSize.n_elem; i++)
+    if (bucketContentSize[i] > maxBucketSize)
+      maxBucketSize = bucketContentSize[i];
+
+  Log::Info << "Final hash table size: (" << numRowsInTable << " x "
+            << maxBucketSize << ")" << std::endl;
+  secondHashTable.resize(numRowsInTable, maxBucketSize);
 }
 
 template<typename SortPolicy>
@@ -359,166 +526,6 @@ Search(const size_t k,
 template<typename SortPolicy>
 void LSHSearch<SortPolicy>::BuildHash(const arma::cube &projection)
 {
-  // The first level hash for a single table outputs a 'numProj'-dimensional
-  // integer key for each point in the set -- (key, pointID)
-  // The key creation details are presented below
-  //
-  // The second level hash is performed by hashing the key to
-  // an integer in the range [0, 'secondHashSize').
-  //
-  // This is done by creating a weight vector 'secondHashWeights' of
-  // length 'numProj' with each entry an integer randomly chosen
-  // between [0, 'secondHashSize').
-  //
-  // Then the bucket for any key and its corresponding point is
-  // given by <key, 'secondHashWeights'> % 'secondHashSize'
-  // and the corresponding point ID is put into that bucket.
-
-  // Step I: Prepare the second level hash.
-
-  // Obtain the weights for the second hash.
-  secondHashWeights = arma::floor(arma::randu(numProj) *
-                                  (double) secondHashSize);
-
-  // The 'secondHashTable' is initially an empty matrix of size
-  // ('secondHashSize' x 'bucketSize'). But by only filling the buckets
-  // as points land in them allows us to shrink the size of the
-  // 'secondHashTable' at the end of the hashing.
-
-  // Fill the second hash table n = referenceSet.n_cols.  This is because no
-  // point has index 'n' so the presence of this in the bucket denotes that
-  // there are no more points in this bucket.
-  secondHashTable.set_size(secondHashSize, bucketSize);
-  secondHashTable.fill(referenceSet->n_cols);
-
-  // Keep track of the size of each bucket in the hash.  At the end of hashing
-  // most buckets will be empty.
-  bucketContentSize.zeros(secondHashSize);
-
-  // Instead of putting the points in the row corresponding to the bucket, we
-  // chose the next empty row and keep track of the row in which the bucket
-  // lies. This allows us to stack together and slice out the empty buckets at
-  // the end of the hashing.
-  bucketRowInHashTable.set_size(secondHashSize);
-  bucketRowInHashTable.fill(secondHashSize);
-
-  // Keep track of number of non-empty rows in the 'secondHashTable'.
-  size_t numRowsInTable = 0;
-
-  // Step II: The offsets for all projections in all tables.
-  // Since the 'offsets' are in [0, hashWidth], we obtain the 'offsets'
-  // as randu(numProj, numTables) * hashWidth.
-  offsets.randu(numProj, numTables);
-  offsets *= hashWidth;
-
-  // Step III: Create each hash table in the first level hash one by one and
-  // putting them directly into the 'secondHashTable' for memory efficiency.
-  //projections.clear(); // Reset projections vector.
-
-
-
-  // Step IV: Obtain the 'numProj' projections for each table.
-
-  if (projection.n_slices == 0) //random generation of tables
-  {
-    // For L2 metric, 2-stable distributions are used, and
-    // the normal Z ~ N(0, 1) is a 2-stable distribution.
-
-    //numTables random tables arranged in a cube
-    projections.randn(
-        referenceSet->n_rows,
-        numProj,
-        numTables
-    );
-  }
-  else if (projection.n_slices == numTables) //user defined tables
-  {
-    projections = projection;
-  }
-  else //invalid argument
-  {
-    throw std::invalid_argument(
-        "number of projection tables provided must be equal to numProj"
-        );
-  }
-    
-
-  for (size_t i = 0; i < numTables; i++)
-  {
-
-
-    // Step V: create the 'numProj'-dimensional key for each point in each
-    // table.
-
-    // The following code performs the task of hashing each point to a
-    // 'numProj'-dimensional integer key.  Hence you get a ('numProj' x
-    // 'referenceSet.n_cols') key matrix.
-    //
-    // For a single table, let the 'numProj' projections be denoted by 'proj_i'
-    // and the corresponding offset be 'offset_i'.  Then the key of a single
-    // point is obtained as:
-    // key = { floor( (<proj_i, point> + offset_i) / 'hashWidth' ) forall i }
-    arma::mat offsetMat = arma::repmat(offsets.unsafe_col(i), 1,
-                                       referenceSet->n_cols);
-    // arma::mat hashMat = projMat.t() * (*referenceSet);
-    arma::mat hashMat = projections.slice(i).t() * (*referenceSet);
-    hashMat += offsetMat;
-    hashMat /= hashWidth;
-
-    // Step VI: Putting the points in the 'secondHashTable' by hashing the key.
-    // Now we hash every key, point ID to its corresponding bucket.
-    arma::rowvec secondHashVec = secondHashWeights.t() * arma::floor(hashMat);
-
-    // This gives us the bucket for the corresponding point ID.
-    for (size_t j = 0; j < secondHashVec.n_elem; j++)
-      secondHashVec[j] = (double)((size_t) secondHashVec[j] % secondHashSize);
-
-    Log::Assert(secondHashVec.n_elem == referenceSet->n_cols);
-
-    // Insert the point in the corresponding row to its bucket in the
-    // 'secondHashTable'.
-    for (size_t j = 0; j < secondHashVec.n_elem; j++)
-    {
-      // This is the bucket number.
-      size_t hashInd = (size_t) secondHashVec[j];
-      // The point ID is 'j'.
-
-      // If this is currently an empty bucket, start a new row keep track of
-      // which row corresponds to the bucket.
-      if (bucketContentSize[hashInd] == 0)
-      {
-        // Start a new row for hash.
-        bucketRowInHashTable[hashInd] = numRowsInTable;
-        secondHashTable(numRowsInTable, 0) = j;
-
-        numRowsInTable++;
-      }
-
-      else
-      {
-        // If bucket is already present in the 'secondHashTable', find the
-        // corresponding row and insert the point ID in this row unless the
-        // bucket is full, in which case, do nothing.
-        if (bucketContentSize[hashInd] < bucketSize)
-          secondHashTable(bucketRowInHashTable[hashInd],
-                          bucketContentSize[hashInd]) = j;
-      }
-
-      // Increment the count of the points in this bucket.
-      if (bucketContentSize[hashInd] < bucketSize)
-        bucketContentSize[hashInd]++;
-    } // Loop over all points in the reference set.
-  } // Loop over tables.
-
-  // Step VII: Condensing the 'secondHashTable'.
-  size_t maxBucketSize = 0;
-  for (size_t i = 0; i < bucketContentSize.n_elem; i++)
-    if (bucketContentSize[i] > maxBucketSize)
-      maxBucketSize = bucketContentSize[i];
-
-  Log::Info << "Final hash table size: (" << numRowsInTable << " x "
-            << maxBucketSize << ")" << std::endl;
-  secondHashTable.resize(numRowsInTable, maxBucketSize);
 }
 
 template<typename SortPolicy>
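
For readers following the merged Train() code above, here is a standalone
Armadillo sketch of the two hashing levels for a single table. Variable names
mirror the code; the data and constants are made up for illustration:

    #include <armadillo>

    int main()
    {
      const size_t numProj = 10;
      const size_t secondHashSize = 99901;
      const double hashWidth = 1.0;

      // Hypothetical 100-dimensional reference set with 1000 points.
      arma::mat referenceSet(100, 1000, arma::fill::randu);

      // One projection table (one slice of the projections cube) and its
      // offsets; N(0, 1) entries give a 2-stable projection for the L2
      // metric.
      arma::mat projTable(100, numProj, arma::fill::randn);
      arma::vec offsets = arma::randu<arma::vec>(numProj) * hashWidth;

      // Weights for the second-level hash: integers in [0, secondHashSize).
      arma::vec secondHashWeights = arma::floor(
          arma::randu<arma::vec>(numProj) * (double) secondHashSize);

      // First level: key = { floor((<proj_i, point> + offset_i) / hashWidth)
      // forall i }, giving a (numProj x n_points) key matrix.
      arma::mat hashMat = projTable.t() * referenceSet;
      hashMat.each_col() += offsets;
      hashMat /= hashWidth;

      // Second level: bucket = <key, secondHashWeights> % secondHashSize.
      arma::rowvec secondHashVec =
          secondHashWeights.t() * arma::floor(hashMat);
      for (size_t j = 0; j < secondHashVec.n_elem; ++j)
        secondHashVec[j] =
            (double) ((size_t) secondHashVec[j] % secondHashSize);

      // secondHashVec[j] is now the bucket of point j. Train() records point
      // IDs per bucket and finally shrinks the table to its non-empty rows
      // (the condensing step).
      return 0;
    }

A usage note: because the projections are random, some first-level keys can be
negative; the cast to size_t above simply mirrors the behavior of the code in
this commit.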




More information about the mlpack-git mailing list