[mlpack-git] master: Adds a deterministic test for multiprobe (f0638a0)

gitdub at mlpack.org gitdub at mlpack.org
Thu Jun 30 15:11:48 EDT 2016


Repository : https://github.com/mlpack/mlpack
On branch  : master
Link       : https://github.com/mlpack/mlpack/compare/eaa7182ebed8cce3fd6191dc1f8170546ea297da...812048c7c6bee0b6c8d936677f23bbb5930c6cfc

>---------------------------------------------------------------

commit f0638a0314c1502f960c71cd0347bf1ecc888556
Author: Yannis Mentekidis <mentekid at gmail.com>
Date:   Tue Jun 14 11:42:05 2016 +0300

    Adds a deterministic test for multiprobe


>---------------------------------------------------------------

f0638a0314c1502f960c71cd0347bf1ecc888556
 src/mlpack/methods/lsh/lsh_search_impl.hpp | 51 +++++++++---------------
 src/mlpack/tests/lsh_test.cpp              | 62 +++++++++++++++++++++++++++++-
 2 files changed, 79 insertions(+), 34 deletions(-)

diff --git a/src/mlpack/methods/lsh/lsh_search_impl.hpp b/src/mlpack/methods/lsh/lsh_search_impl.hpp
index 6b061e2..bc49662 100644
--- a/src/mlpack/methods/lsh/lsh_search_impl.hpp
+++ b/src/mlpack/methods/lsh/lsh_search_impl.hpp
@@ -9,6 +9,8 @@
 
 #include <mlpack/core.hpp>
 
+using std::cout; using std::endl; //TODO: remove
+
 namespace mlpack {
 namespace neighbor {
 
@@ -436,6 +438,8 @@ void LSHSearch<SortPolicy>::GetAdditionalProbingBins(
   if (T == 0)
     return;
 
+
+
   // Each column of additionalProbingBins is the code of a bin.
   additionalProbingBins.set_size(numProj, T);
   
@@ -475,34 +479,10 @@ void LSHSearch<SortPolicy>::GetAdditionalProbingBins(
     arma::linspace< arma::Col<size_t> >(0, numProj - 1, numProj);
 
   // optimization: no need to create heap for 1 or 2 codes
-  if (T == 1)
+  if (T <= 2)
   {
-    // special case: 1 code only
-    // simply find location of minimum score, generate 1 perturbation vector,
-    // and add its code to additionalProbingBins column 0
 
-    // find location and value of smallest element of scores vector
-    double minscore = scores[0];
-    size_t minloc = 0;
-    for (size_t s = 0; s < 2 * numProj; ++s)
-    {
-      if (minscore > scores[s])
-      {
-        minscore = scores[s];
-        minloc = s;
-      }
-    }
-    
-    // add or subtract 1 to dimension corresponding to minimum score
-    additionalProbingBins(positions[minloc], 0) += actions[minloc];
-    
-    // done - only 1 code was needed
-    return;
-  }
-
-  if (T == 2)
-  {
-    // special case: 2 codes only
+    // special case: 1 or 2 codes only
     // The second perturbation vector still can't comprise of more than one
     // change in the bin codes. This is because of the way perturbation vectors
     // are generated: First we create the one with the smallest score (Ao) and
@@ -514,7 +494,7 @@ void LSHSearch<SortPolicy>::GetAdditionalProbingBins(
     
 
     // First, find location of minimum score, generate 1 perturbation vector,
-    // and add its code to additionalProbingBins column 0, like in T==1.
+    // and add its code to additionalProbingBins column 0.
 
     // find location and value of smallest element of scores vector
     double minscore = scores[0];
@@ -530,6 +510,8 @@ void LSHSearch<SortPolicy>::GetAdditionalProbingBins(
     
     // add or subtract 1 to dimension corresponding to minimum score
     additionalProbingBins(positions[minloc], 0) += actions[minloc];
+    if (T == 1)
+      return; //done if asked for only 1 code
 
     // Now, find location of second smallest score and generate one more vector.
     
@@ -537,15 +519,17 @@ void LSHSearch<SortPolicy>::GetAdditionalProbingBins(
     size_t minloc2 = 0;
     for (size_t s = 0; s < 2 * numProj; ++s) // here we can't start from 0
     {
-      if ( minscore2 > scores[s] && scores[s] > minscore) //second smallest
+      if ( minscore2 > scores[s] && s != minloc) //second smallest
       {
         minscore2 = scores[s];
         minloc2 = s;
       }
     }
 
-    // add or subtract 1 to second smallest score
+    // add or subtract 1 to create second-lowest scoring vector
     additionalProbingBins(positions[minloc2], 1) += actions[minloc2];
+    return;
+    
   }
 
   // sort everything in increasing order
@@ -556,8 +540,8 @@ void LSHSearch<SortPolicy>::GetAdditionalProbingBins(
 
 
   // Theory:
-  // From the paper: This is the part that creates the probing sequence
-  // A probing sequence is a sequence of T probing bins where query's
+  // From the paper: This is the part that creates the probing sequence.
+  // A probing sequence is a sequence of T probing bins where a query's
   // neighbors are most likely to be. Likelihood is dependent only on a bin's
   // score, which is the sum of scores of all dimension-action pairs, so we
   // need to calculate the T smallest sums of scores that are not conflicting.
@@ -671,12 +655,13 @@ void LSHSearch<SortPolicy>::ReturnIndicesFromTable(
   for (size_t i = 0; i < numTablesToSearch; i++)
     queryCodesNotFloored.unsafe_col(i) = projections.slice(i).t() * queryPoint;
   queryCodesNotFloored += offsets.cols(0, numTablesToSearch - 1);
-  allProjInTables = queryCodesNotFloored/hashWidth;
+  allProjInTables = arma::floor(queryCodesNotFloored/hashWidth);
 
   // Compute the hash value of each key of the query into a bucket of the
   // 'secondHashTable' using the 'secondHashWeights'.
-  arma::rowvec hashVec = secondHashWeights.t() * arma::floor(allProjInTables);
+  arma::rowvec hashVec = secondHashWeights.t() * allProjInTables;
 
+  // mod and floor hashVec to compute 2nd-level codes
   for (size_t i = 0; i < hashVec.n_elem; i++)
     hashVec[i] = (double) ((size_t) hashVec[i] % secondHashSize);
 
diff --git a/src/mlpack/tests/lsh_test.cpp b/src/mlpack/tests/lsh_test.cpp
index c144f3d..c66c8ef 100644
--- a/src/mlpack/tests/lsh_test.cpp
+++ b/src/mlpack/tests/lsh_test.cpp
@@ -563,7 +563,7 @@ BOOST_AUTO_TEST_CASE(MultiprobeTest)
       if (p > 0)
       {
         // more probes should at the very least not lower recall...
-        BOOST_REQUIRE(recall >= prevRecall);
+        BOOST_REQUIRE_GE(recall, prevRecall);
 
         // ... and should ideally increase it a bit
         if (recall > prevRecall + epsilonIncrease)
@@ -576,6 +576,66 @@ BOOST_AUTO_TEST_CASE(MultiprobeTest)
   BOOST_REQUIRE(foundIncrease);
 }
 
+/**
+ * Test: This is a deterministic test that verifies multiprobe LSH works
+ * correctly. To do this, we generate a query , q1, that will end up
+ * outside of the bucket of its nearest neighbors in C1. The neighbors will be
+ * found only by searching additional probing bins.
+ *
+ * We verify this actually is the case.
+ */
+BOOST_AUTO_TEST_CASE(MultiprobeDeterministicTest)
+{
+  // generate known deterministic clusters of points
+  const size_t N = 40;
+  arma::mat rdata;
+  GetPointset(N, rdata);
+
+  const int k = N/2;
+  const double hashWidth = 1;
+  const int secondHashSize = 99901;
+  const int bucketSize = 500;
+
+  // 1 table, projections on orthonormal plane
+  arma::cube projections(2, 2, 1);
+  projections(0, 0, 0) = 1;
+  projections(1, 0, 0) = 0;
+  projections(0, 1, 0) = 0;
+  projections(1, 1, 0) = 1;
+
+  // construct LSH object with given tables
+  LSHSearch<> lshTest(rdata, projections,
+                      hashWidth, secondHashSize, bucketSize);
+
+  // get offset of each projection dimension. Since projection table is I(2),
+  // offsets will map to offsets of actual dimensions. This is to enforce that
+  // q1 and q4 are right outside the bounding boxes of C1 and C4 respectively.
+  arma::mat offsets = lshTest.Offsets();
+
+
+  // two points near the clusters but outside their bins. q1's lowest scoring
+  // perturbation vector will map to C1 cluster's bin. q4's two-lowest scoring
+  // vectors will map to empty bins, and its third vector will map to C4.
+  arma::mat q1;
+  q1 << 1.1 << arma::endr << 3.3; // vector [1.1, 3.3]
+  q1 += offsets;
+
+  arma::mat q4;
+  q4 << 3.3 << arma::endr << 1.1; // vector [3.3, 1.1]
+  q4 += offsets;
+
+  arma::Mat<size_t> neighbors;
+  arma::mat distances;
+
+  // Test that q1 simple search comes up empty
+  lshTest.Search(q1, k, neighbors, distances);
+  BOOST_REQUIRE( arma::all(neighbors.col(0) == N) );
+
+  // Searching with 3 additional probing bins should find neighbors
+  lshTest.Search(q1, k, neighbors, distances, 0, 3);
+  BOOST_REQUIRE( arma::all(neighbors.col(0) == N || neighbors.col(0) < 10) );
+}
+
 BOOST_AUTO_TEST_CASE(LSHTrainTest)
 {
   // This is a not very good test that simply checks that the re-trained LSH




More information about the mlpack-git mailing list