[mlpack-git] master: Add 3 more recall-based tests for LSHSearch (1708dfd)
gitdub at mlpack.org
gitdub at mlpack.org
Tue Apr 5 06:38:39 EDT 2016
Repository : https://github.com/mlpack/mlpack
On branch : master
Link : https://github.com/mlpack/mlpack/compare/5bc514c122d53590397fdfad42c7845d9ad91fa1...f0675d7789b69746f7c337c3ec4a778cef932924
>---------------------------------------------------------------
commit 1708dfdc24c8ed06a57cacd536813c4437eeadfb
Author: Yannis Mentekidis <mentekid at gmail.com>
Date: Tue Apr 5 11:38:39 2016 +0100
Add 3 more recall-based tests for LSHSearch
>---------------------------------------------------------------
1708dfdc24c8ed06a57cacd536813c4437eeadfb
src/mlpack/tests/lsh_test.cpp | 107 +++++++++++++++++++++++++++++++++++-------
1 file changed, 89 insertions(+), 18 deletions(-)
diff --git a/src/mlpack/tests/lsh_test.cpp b/src/mlpack/tests/lsh_test.cpp
index 9561dda..0f1e5e6 100644
--- a/src/mlpack/tests/lsh_test.cpp
+++ b/src/mlpack/tests/lsh_test.cpp
@@ -41,7 +41,7 @@ BOOST_AUTO_TEST_CASE(LSHSearchTest)
math::RandomSeed(time(0));
//kNN and LSH parameters (use LSH default parameters)
const int k = 4;
- //const int numTables = 30;
+ const int numTables = 30;
const int numProj = 10;
const double hashWidth = 0;
const int secondHashSize = 99901;
@@ -71,47 +71,118 @@ BOOST_AUTO_TEST_CASE(LSHSearchTest)
//tables will increase recall. Epsilon ensures that if noise lightly affects
//the projections, the test will not fail.
- //Run LSH for different number of tables
- const int L_table[] = {1, 8, 16, 32, 64, 128}; //number of tables
- const int Lsize = 6;
- double recall_L[Lsize] = {0.0}; //recall of each LSH run
+ const int numTries = 5; //tries for each test before declaring failure
+ const int Lsize = 6; //number of runs
+ const int L_value[] = {1, 8, 16, 32, 64, 128}; //number of tables
+ double L_value_recall[Lsize] = {0.0}; //recall of each LSH run
for (int l=0; l < Lsize; ++l)
{
//run LSH with only numTables varying (other values default)
- LSHSearch<> lsh_test1(rdata, numProj, L_table[l],
+ LSHSearch<> lsh_test1(rdata, numProj, L_value[l],
hashWidth, secondHashSize, bucketSize);
arma::Mat<size_t> LSHneighbors;
arma::mat LSHdistances;
lsh_test1.Search(qdata, k, LSHneighbors, LSHdistances);
//compute recall for each query
- recall_L[l] = compute_recall(LSHneighbors, groundTruth);
+ L_value_recall[l] = compute_recall(LSHneighbors, groundTruth);
if (l > 0)
- BOOST_CHECK(recall_L[l] >= recall_L[l-1]-epsilon);
+ BOOST_CHECK(L_value_recall[l] >= L_value_recall[l-1]-epsilon);
+ }
+
+ //Test: Run LSH with varying hash width, keeping all other parameters
+ //constant. Compute the recall, i.e. the number of reported neighbors that
+ //are real neighbors of the query.
+ //LSH's property is that (with high probability), increasing the hash width
+ //will increase recall. Epsilon ensures that if noise lightly affects the
+ //projections, the test will not fail.
+
+ const int Hsize = 7; //number of runs
+ const double H_value[] = {0.1, 0.5, 1, 5, 10, 50, 500}; //hash width
+ double H_value_recall[Hsize] = {0.0}; //recall of each run
+
+ for (int h=0; h < Hsize; ++h)
+ {
+ //run LSH with only hashWidth varying (other values default)
+ LSHSearch<> lsh_test2(rdata, numProj, numTables,
+ H_value[h], secondHashSize, bucketSize);
+ arma::Mat<size_t> LSHneighbors;
+ arma::mat LSHdistances;
+ lsh_test2.Search(qdata, k, LSHneighbors, LSHdistances);
+
+ //compute recall for each query
+ H_value_recall[h] = compute_recall(LSHneighbors, groundTruth);
+
+ if (h > 0)
+ BOOST_CHECK(H_value_recall[h] >= H_value_recall[h-1]-epsilon);
}
+ //Test: Run LSH with varying number of Projections, keeping all other parameters
+ //constant. Compute the recall, i.e. the number of reported neighbors that
+ //are real neighbors of the query.
+ //LSH's property is that (with high probability), increasing the number of
+ //projections per table will decrease recall. Epsilon ensures that if noise lightly
+ //affects the projections, the test will not fail.
+
+ const int Psize = 5; //number of runs
+ const int P_value[] = {1, 10, 20, 50, 100}; //number of projections
+ double P_value_recall[Psize] = {0.0}; //recall of each run
+
+ for (int p=0; p < Psize; ++p)
+ {
+ //run LSH with only numProj varying (other values default)
+ LSHSearch<> lsh_test3(rdata, P_value[p], numTables,
+ hashWidth, secondHashSize, bucketSize);
+ arma::Mat<size_t> LSHneighbors;
+ arma::mat LSHdistances;
+ lsh_test3.Search(qdata, k, LSHneighbors, LSHdistances);
+
+ //compute recall for each query
+ P_value_recall[p] = compute_recall(LSHneighbors, groundTruth);
+
+ if (p > 0) //don't check first run, only that increasing P decreases recall
+ BOOST_CHECK(P_value_recall[p] - epsilon <= P_value_recall[p-1]);
+ }
+
//Test: Run a very expensive LSH search, with a large number of hash tables
//and a large hash width. This run should return an acceptable recall. We set
//the bar very low (recall >= 50%) to make sure that a test fail means bad
//implementation.
- const int Ht2 = 10000; //first-level hash width
- const int Kt2 = 128; //projections per table
- const int Tt2 = 128; //number of tables
- const double recall_thresh_t2 = 0.5;
+ const int H_exp = 10000; //first-level hash width
+ const int K_exp = 1; //projections per table
+ const int T_exp = 128; //number of tables
+ const double recall_thresh_exp = 0.5;
+
+ LSHSearch<> lsh_test_exp(rdata, K_exp, T_exp, H_exp, secondHashSize, bucketSize);
+ arma::Mat<size_t> LSHneighbors_exp;
+ arma::mat LSHdistances_exp;
+ lsh_test_exp.Search(qdata, k, LSHneighbors_exp, LSHdistances_exp);
+
+ const double recall_exp = compute_recall(LSHneighbors_exp, groundTruth);
+
+ BOOST_CHECK(recall_exp >= recall_thresh_exp);
- LSHSearch<> lsh_test2(rdata, Kt2, Tt2, Ht2, secondHashSize, bucketSize);
- arma::Mat<size_t> LSHneighbors;
- arma::mat LSHdistances;
- lsh_test2.Search(qdata, k, LSHneighbors, LSHdistances);
+ //Test: Run a very cheap LSH search, with parameters that should cause recall
+ //to be very low. Set the threshhold very high (recall <= 25%) to make sure
+ //that a test fail means bad implementation.
+ //This mainly checks that user-specified parameters are not ignored.
- const double recall_t2 = compute_recall(LSHneighbors, groundTruth);
+ const int H_chp = 1; //small first-level hash width
+ const int K_chp = 1000; //large number of projections per table
+ const int T_chp = 1; //only one table
+ const double recall_thresh_chp = 0.25; //recall threshold
- BOOST_CHECK(recall_t2 >= recall_thresh_t2);
+ LSHSearch<> lsh_test_chp(rdata, K_chp, T_chp, H_chp, secondHashSize, bucketSize);
+ arma::Mat<size_t> LSHneighbors_chp;
+ arma::mat LSHdistances_chp;
+ lsh_test_chp.Search(qdata, k, LSHneighbors_chp, LSHdistances_chp);
+ const double recall_chp = compute_recall(LSHneighbors_chp, groundTruth);
+ BOOST_CHECK(recall_chp <= recall_thresh_chp);
}
BOOST_AUTO_TEST_CASE(LSHTrainTest)
More information about the mlpack-git
mailing list