[mlpack-git] master: Adds a deterministic test for multiprobe (f0638a0)
gitdub at mlpack.org
gitdub at mlpack.org
Thu Jun 30 15:11:48 EDT 2016
Repository : https://github.com/mlpack/mlpack
On branch : master
Link : https://github.com/mlpack/mlpack/compare/eaa7182ebed8cce3fd6191dc1f8170546ea297da...812048c7c6bee0b6c8d936677f23bbb5930c6cfc
>---------------------------------------------------------------
commit f0638a0314c1502f960c71cd0347bf1ecc888556
Author: Yannis Mentekidis <mentekid at gmail.com>
Date: Tue Jun 14 11:42:05 2016 +0300
Adds a deterministic test for multiprobe
>---------------------------------------------------------------
f0638a0314c1502f960c71cd0347bf1ecc888556
src/mlpack/methods/lsh/lsh_search_impl.hpp | 51 +++++++++---------------
src/mlpack/tests/lsh_test.cpp | 62 +++++++++++++++++++++++++++++-
2 files changed, 79 insertions(+), 34 deletions(-)
diff --git a/src/mlpack/methods/lsh/lsh_search_impl.hpp b/src/mlpack/methods/lsh/lsh_search_impl.hpp
index 6b061e2..bc49662 100644
--- a/src/mlpack/methods/lsh/lsh_search_impl.hpp
+++ b/src/mlpack/methods/lsh/lsh_search_impl.hpp
@@ -9,6 +9,8 @@
#include <mlpack/core.hpp>
+using std::cout; using std::endl; //TODO: remove
+
namespace mlpack {
namespace neighbor {
@@ -436,6 +438,8 @@ void LSHSearch<SortPolicy>::GetAdditionalProbingBins(
if (T == 0)
return;
+
+
// Each column of additionalProbingBins is the code of a bin.
additionalProbingBins.set_size(numProj, T);
@@ -475,34 +479,10 @@ void LSHSearch<SortPolicy>::GetAdditionalProbingBins(
arma::linspace< arma::Col<size_t> >(0, numProj - 1, numProj);
// optimization: no need to create heap for 1 or 2 codes
- if (T == 1)
+ if (T <= 2)
{
- // special case: 1 code only
- // simply find location of minimum score, generate 1 perturbation vector,
- // and add its code to additionalProbingBins column 0
- // find location and value of smallest element of scores vector
- double minscore = scores[0];
- size_t minloc = 0;
- for (size_t s = 0; s < 2 * numProj; ++s)
- {
- if (minscore > scores[s])
- {
- minscore = scores[s];
- minloc = s;
- }
- }
-
- // add or subtract 1 to dimension corresponding to minimum score
- additionalProbingBins(positions[minloc], 0) += actions[minloc];
-
- // done - only 1 code was needed
- return;
- }
-
- if (T == 2)
- {
- // special case: 2 codes only
+ // special case: 1 or 2 codes only
// The second perturbation vector still can't comprise of more than one
// change in the bin codes. This is because of the way perturbation vectors
// are generated: First we create the one with the smallest score (Ao) and
@@ -514,7 +494,7 @@ void LSHSearch<SortPolicy>::GetAdditionalProbingBins(
// First, find location of minimum score, generate 1 perturbation vector,
- // and add its code to additionalProbingBins column 0, like in T==1.
+ // and add its code to additionalProbingBins column 0.
// find location and value of smallest element of scores vector
double minscore = scores[0];
@@ -530,6 +510,8 @@ void LSHSearch<SortPolicy>::GetAdditionalProbingBins(
// add or subtract 1 to dimension corresponding to minimum score
additionalProbingBins(positions[minloc], 0) += actions[minloc];
+ if (T == 1)
+ return; //done if asked for only 1 code
// Now, find location of second smallest score and generate one more vector.
@@ -537,15 +519,17 @@ void LSHSearch<SortPolicy>::GetAdditionalProbingBins(
size_t minloc2 = 0;
for (size_t s = 0; s < 2 * numProj; ++s) // here we can't start from 0
{
- if ( minscore2 > scores[s] && scores[s] > minscore) //second smallest
+ if ( minscore2 > scores[s] && s != minloc) //second smallest
{
minscore2 = scores[s];
minloc2 = s;
}
}
- // add or subtract 1 to second smallest score
+ // add or subtract 1 to create second-lowest scoring vector
additionalProbingBins(positions[minloc2], 1) += actions[minloc2];
+ return;
+
}
// sort everything in increasing order
@@ -556,8 +540,8 @@ void LSHSearch<SortPolicy>::GetAdditionalProbingBins(
// Theory:
- // From the paper: This is the part that creates the probing sequence
- // A probing sequence is a sequence of T probing bins where query's
+ // From the paper: This is the part that creates the probing sequence.
+ // A probing sequence is a sequence of T probing bins where a query's
// neighbors are most likely to be. Likelihood is dependent only on a bin's
// score, which is the sum of scores of all dimension-action pairs, so we
// need to calculate the T smallest sums of scores that are not conflicting.
@@ -671,12 +655,13 @@ void LSHSearch<SortPolicy>::ReturnIndicesFromTable(
for (size_t i = 0; i < numTablesToSearch; i++)
queryCodesNotFloored.unsafe_col(i) = projections.slice(i).t() * queryPoint;
queryCodesNotFloored += offsets.cols(0, numTablesToSearch - 1);
- allProjInTables = queryCodesNotFloored/hashWidth;
+ allProjInTables = arma::floor(queryCodesNotFloored/hashWidth);
// Compute the hash value of each key of the query into a bucket of the
// 'secondHashTable' using the 'secondHashWeights'.
- arma::rowvec hashVec = secondHashWeights.t() * arma::floor(allProjInTables);
+ arma::rowvec hashVec = secondHashWeights.t() * allProjInTables;
+ // mod and floor hashVec to compute 2nd-level codes
for (size_t i = 0; i < hashVec.n_elem; i++)
hashVec[i] = (double) ((size_t) hashVec[i] % secondHashSize);
diff --git a/src/mlpack/tests/lsh_test.cpp b/src/mlpack/tests/lsh_test.cpp
index c144f3d..c66c8ef 100644
--- a/src/mlpack/tests/lsh_test.cpp
+++ b/src/mlpack/tests/lsh_test.cpp
@@ -563,7 +563,7 @@ BOOST_AUTO_TEST_CASE(MultiprobeTest)
if (p > 0)
{
// more probes should at the very least not lower recall...
- BOOST_REQUIRE(recall >= prevRecall);
+ BOOST_REQUIRE_GE(recall, prevRecall);
// ... and should ideally increase it a bit
if (recall > prevRecall + epsilonIncrease)
@@ -576,6 +576,66 @@ BOOST_AUTO_TEST_CASE(MultiprobeTest)
BOOST_REQUIRE(foundIncrease);
}
+/**
+ * Test: This is a deterministic test that verifies multiprobe LSH works
+ * correctly. To do this, we generate a query , q1, that will end up
+ * outside of the bucket of its nearest neighbors in C1. The neighbors will be
+ * found only by searching additional probing bins.
+ *
+ * We verify this actually is the case.
+ */
+BOOST_AUTO_TEST_CASE(MultiprobeDeterministicTest)
+{
+ // generate known deterministic clusters of points
+ const size_t N = 40;
+ arma::mat rdata;
+ GetPointset(N, rdata);
+
+ const int k = N/2;
+ const double hashWidth = 1;
+ const int secondHashSize = 99901;
+ const int bucketSize = 500;
+
+ // 1 table, projections on orthonormal plane
+ arma::cube projections(2, 2, 1);
+ projections(0, 0, 0) = 1;
+ projections(1, 0, 0) = 0;
+ projections(0, 1, 0) = 0;
+ projections(1, 1, 0) = 1;
+
+ // construct LSH object with given tables
+ LSHSearch<> lshTest(rdata, projections,
+ hashWidth, secondHashSize, bucketSize);
+
+ // get offset of each projection dimension. Since projection table is I(2),
+ // offsets will map to offsets of actual dimensions. This is to enforce that
+ // q1 and q4 are right outside the bounding boxes of C1 and C4 respectively.
+ arma::mat offsets = lshTest.Offsets();
+
+
+ // two points near the clusters but outside their bins. q1's lowest scoring
+ // perturbation vector will map to C1 cluster's bin. q4's two-lowest scoring
+ // vectors will map to empty bins, and its third vector will map to C4.
+ arma::mat q1;
+ q1 << 1.1 << arma::endr << 3.3; // vector [1.1, 3.3]
+ q1 += offsets;
+
+ arma::mat q4;
+ q4 << 3.3 << arma::endr << 1.1; // vector [3.3, 1.1]
+ q4 += offsets;
+
+ arma::Mat<size_t> neighbors;
+ arma::mat distances;
+
+ // Test that q1 simple search comes up empty
+ lshTest.Search(q1, k, neighbors, distances);
+ BOOST_REQUIRE( arma::all(neighbors.col(0) == N) );
+
+ // Searching with 3 additional probing bins should find neighbors
+ lshTest.Search(q1, k, neighbors, distances, 0, 3);
+ BOOST_REQUIRE( arma::all(neighbors.col(0) == N || neighbors.col(0) < 10) );
+}
+
BOOST_AUTO_TEST_CASE(LSHTrainTest)
{
// This is a not very good test that simply checks that the re-trained LSH
More information about the mlpack-git
mailing list