[mlpack-git] master: Separate tests into Boost test cases (f229e4e)
gitdub at mlpack.org
gitdub at mlpack.org
Wed Apr 6 04:52:59 EDT 2016
Repository : https://github.com/mlpack/mlpack
On branch : master
Link : https://github.com/mlpack/mlpack/compare/5bc514c122d53590397fdfad42c7845d9ad91fa1...f0675d7789b69746f7c337c3ec4a778cef932924
>---------------------------------------------------------------
commit f229e4e7555a62e51889d91eb99db336c309c794
Author: Yannis Mentekidis <mentekid at gmail.com>
Date: Wed Apr 6 09:52:59 2016 +0100
Separate tests into Boost test cases
>---------------------------------------------------------------
f229e4e7555a62e51889d91eb99db336c309c794
src/mlpack/tests/lsh_test.cpp | 193 ++++++++++++++++++++++++++++++------------
1 file changed, 141 insertions(+), 52 deletions(-)
diff --git a/src/mlpack/tests/lsh_test.cpp b/src/mlpack/tests/lsh_test.cpp
index d5533c1..a04a459 100644
--- a/src/mlpack/tests/lsh_test.cpp
+++ b/src/mlpack/tests/lsh_test.cpp
@@ -22,27 +22,36 @@ double compute_recall(
const size_t queries = LSHneighbors.n_cols;
const size_t neigh = LSHneighbors.n_rows;
- int found_same = 0;
+ int same = 0;
for (size_t q = 0; q < queries; ++q)
{
for (size_t n = 0; n < neigh; ++n)
{
- found_same+=(LSHneighbors(n,q)==groundTruth(n,q));
+ same+=(LSHneighbors(n,q)==groundTruth(n,q));
}
}
- return static_cast<double>(found_same)/
+ return static_cast<double>(same)/
(static_cast<double>(queries*neigh));
}
BOOST_AUTO_TEST_SUITE(LSHTest);
-BOOST_AUTO_TEST_CASE(LSHSearchTest)
+/**
+ * Test: Run LSH with varying number of tables, keeping all other parameters
+ * constant. Compute the recall, i.e. the number of reported neighbors that
+ * are real neighbors of the query.
+ * LSH's property is that (with high probability), increasing the number of
+ * tables will increase recall. Epsilon ensures that if noise lightly affects
+ * the projections, the test will not fail.
+ * This produces false negatives, so we attempt the test numTries times and
+ * only declare failure if all of them fail.
+ */
+BOOST_AUTO_TEST_CASE(numTablesTest)
{
- math::RandomSeed(time(0));
+ //math::RandomSeed(time(0));
//kNN and LSH parameters (use LSH default parameters)
const int k = 4;
- const int numTables = 30;
const int numProj = 10;
const double hashWidth = 0;
const int secondHashSize = 99901;
@@ -66,18 +75,10 @@ BOOST_AUTO_TEST_CASE(LSHSearchTest)
arma::mat groundDistances;
knn.Search(qdata, k, groundTruth, groundDistances);
- //Test: Run LSH with varying number of tables, keeping all other parameters
- //constant. Compute the recall, i.e. the number of reported neighbors that
- //are real neighbors of the query.
- //LSH's property is that (with high probability), increasing the number of
- //tables will increase recall. Epsilon ensures that if noise lightly affects
- //the projections, the test will not fail.
- //This produces false negatives, so we attempt the test numTries times and
- //only declare failure if all of them fail.
bool fail;
-
- for (int t = 0; t < numTries; ++t){
+ for (int t = 0; t < numTries; ++t)
+ {
fail = false;
const int lSize = 6; //number of runs
@@ -87,37 +88,65 @@ BOOST_AUTO_TEST_CASE(LSHSearchTest)
for (size_t l=0; l < lSize; ++l)
{
//run LSH with only numTables varying (other values default)
- LSHSearch<> lsh_test1(rdata, numProj, lValue[l],
+ LSHSearch<> lshTest(rdata, numProj, lValue[l],
hashWidth, secondHashSize, bucketSize);
arma::Mat<size_t> LSHneighbors;
arma::mat LSHdistances;
- lsh_test1.Search(qdata, k, LSHneighbors, LSHdistances);
+ lshTest.Search(qdata, k, LSHneighbors, LSHdistances);
//compute recall for each query
lValueRecall[l] = compute_recall(LSHneighbors, groundTruth);
- if (l > 0){
- if(lValueRecall[l] < lValueRecall[l-1]-epsilon){
+ if (l > 0)
+ {
+ if(lValueRecall[l] < lValueRecall[l-1]-epsilon)
+ {
fail = true; //if test fails at one point, stop and retry
break;
}
}
}
+
if ( !fail )
- {
break; //if test passes one time, it is sufficient
- }
-
}
BOOST_REQUIRE(fail == false);
+}
+
+/*Test: Run LSH with varying hash width, keeping all other parameters
+ * constant. Compute the recall, i.e. the number of reported neighbors that
+ * are real neighbors of the query.
+ * LSH's property is that (with high probability), increasing the hash width
+ * will increase recall. Epsilon ensures that if noise lightly affects the
+ * projections, the test will not fail.
+ */
+BOOST_AUTO_TEST_CASE(hashWidthTest)
+{
+
+ //math::RandomSeed(time(0));
+ //kNN and LSH parameters (use LSH default parameters)
+ const int k = 4;
+ const int numTables = 30;
+ const int numProj = 10;
+ const int secondHashSize = 99901;
+ const int bucketSize = 500;
+
+ //test parameters
+ const double epsilon = 0.1; //allowed deviation from expected monotonicity
- //Test: Run LSH with varying hash width, keeping all other parameters
- //constant. Compute the recall, i.e. the number of reported neighbors that
- //are real neighbors of the query.
- //LSH's property is that (with high probability), increasing the hash width
- //will increase recall. Epsilon ensures that if noise lightly affects the
- //projections, the test will not fail.
+ //read iris training and testing data as reference and query
+ const string trainSet="iris_train.csv";
+ const string testSet="iris_test.csv";
+ arma::mat rdata;
+ arma::mat qdata;
+ data::Load(trainSet, rdata, true);
+ data::Load(testSet, qdata, true);
+ //Run classic knn on reference data
+ AllkNN knn(rdata);
+ arma::Mat<size_t> groundTruth;
+ arma::mat groundDistances;
+ knn.Search(qdata, k, groundTruth, groundDistances);
const int hSize = 7; //number of runs
const double hValue[] = {0.1, 0.5, 1, 5, 10, 50, 500}; //hash width
double hValueRecall[hSize] = {0.0}; //recall of each run
@@ -125,7 +154,7 @@ BOOST_AUTO_TEST_CASE(LSHSearchTest)
for (size_t h=0; h < hSize; ++h)
{
//run LSH with only hashWidth varying (other values default)
- LSHSearch<> lsh_test2(
+ LSHSearch<> lshTest(
rdata,
numProj,
numTables,
@@ -135,7 +164,7 @@ BOOST_AUTO_TEST_CASE(LSHSearchTest)
arma::Mat<size_t> LSHneighbors;
arma::mat LSHdistances;
- lsh_test2.Search(qdata, k, LSHneighbors, LSHdistances);
+ lshTest.Search(qdata, k, LSHneighbors, LSHdistances);
//compute recall for each query
hValueRecall[h] = compute_recall(LSHneighbors, groundTruth);
@@ -144,14 +173,45 @@ BOOST_AUTO_TEST_CASE(LSHSearchTest)
BOOST_REQUIRE_GE(hValueRecall[h], hValueRecall[h-1]-epsilon);
}
+}
+
+/**
+ * Test: Run LSH with varying number of projections, keeping other parameters
+ * constant. Compute the recall, i.e. the number of reported neighbors that
+ * are real neighbors of the query.
+ * LSH's property is that (with high probability), increasing the number of
+ * projections per table will decrease recall. Epsilon ensures that if noise
+ * lightly affects the projections, the test will not fail.
+ */
+BOOST_AUTO_TEST_CASE(numProjTest)
+{
+
+ //math::RandomSeed(time(0));
+ //kNN and LSH parameters (use LSH default parameters)
+ const int k = 4;
+ const int numTables = 30;
+ const double hashWidth = 0;
+ const int secondHashSize = 99901;
+ const int bucketSize = 500;
+
+ //test parameters
+ const double epsilon = 0.1; //allowed deviation from expected monotonicity
+
+ //read iris training and testing data as reference and query
+ const string trainSet="iris_train.csv";
+ const string testSet="iris_test.csv";
+ arma::mat rdata;
+ arma::mat qdata;
+ data::Load(trainSet, rdata, true);
+ data::Load(testSet, qdata, true);
- //Test: Run LSH with varying number of projections, keeping other parameters
- //constant. Compute the recall, i.e. the number of reported neighbors that
- //are real neighbors of the query.
- //LSH's property is that (with high probability), increasing the number of
- //projections per table will decrease recall. Epsilon ensures that if noise
- //lightly affects the projections, the test will not fail.
+ //Run classic knn on reference data
+ AllkNN knn(rdata);
+ arma::Mat<size_t> groundTruth;
+ arma::mat groundDistances;
+ knn.Search(qdata, k, groundTruth, groundDistances);
+ //LSH test parameters for numProj
const int pSize = 5; //number of runs
const int pValue[] = {1, 10, 20, 50, 100}; //number of projections
double pValueRecall[pSize] = {0.0}; //recall of each run
@@ -159,7 +219,7 @@ BOOST_AUTO_TEST_CASE(LSHSearchTest)
for (size_t p=0; p < pSize; ++p)
{
//run LSH with only numProj varying (other values default)
- LSHSearch<> lsh_test3(
+ LSHSearch<> lshTest(
rdata,
pValue[p],
numTables,
@@ -169,7 +229,7 @@ BOOST_AUTO_TEST_CASE(LSHSearchTest)
arma::Mat<size_t> LSHneighbors;
arma::mat LSHdistances;
- lsh_test3.Search(qdata, k, LSHneighbors, LSHdistances);
+ lshTest.Search(qdata, k, LSHneighbors, LSHdistances);
//compute recall for each query
pValueRecall[p] = compute_recall(LSHneighbors, groundTruth);
@@ -177,18 +237,48 @@ BOOST_AUTO_TEST_CASE(LSHSearchTest)
if (p > 0) //don't check first run, only that increasing P decreases recall
BOOST_REQUIRE_LE(pValueRecall[p] - epsilon, pValueRecall[p-1]);
}
+}
+
+/**
+ * Test: Run two LSH searches:
+ * First, a very expensive LSH search, with a large number of hash tables
+ * and a large hash width. This run should return an acceptable recall. We set
+ * the bar very low (recall >= 50%) to make sure that a test fail means bad
+ * implementation.
+ * Second, a very cheap LSH search, with parameters that should cause recall
+ * to be very low. Set the threshhold very high (recall <= 25%) to make sure
+ * that a test fail means bad implementation.
+ */
+BOOST_AUTO_TEST_CASE(recallTest)
+{
+ //math::RandomSeed(time(0));
+ //kNN and LSH parameters (use LSH default parameters)
+ const int k = 4;
+ const int secondHashSize = 99901;
+ const int bucketSize = 500;
+
+
+ //read iris training and testing data as reference and query
+ const string trainSet="iris_train.csv";
+ const string testSet="iris_test.csv";
+ arma::mat rdata;
+ arma::mat qdata;
+ data::Load(trainSet, rdata, true);
+ data::Load(testSet, qdata, true);
- //Test: Run a very expensive LSH search, with a large number of hash tables
- //and a large hash width. This run should return an acceptable recall. We set
- //the bar very low (recall >= 50%) to make sure that a test fail means bad
- //implementation.
+ //Run classic knn on reference data
+ AllkNN knn(rdata);
+ arma::Mat<size_t> groundTruth;
+ arma::mat groundDistances;
+ knn.Search(qdata, k, groundTruth, groundDistances);
+ //Expensive LSH run
const int hExp = 10000; //first-level hash width
const int kExp = 1; //projections per table
const int tExp = 128; //number of tables
const double recallThreshExp = 0.5;
- LSHSearch<> lsh_test_exp(
+ LSHSearch<> lshTestExp(
rdata,
kExp,
tExp,
@@ -197,23 +287,20 @@ BOOST_AUTO_TEST_CASE(LSHSearchTest)
bucketSize);
arma::Mat<size_t> LSHneighborsExp;
arma::mat LSHdistancesExp;
- lsh_test_exp.Search(qdata, k, LSHneighborsExp, LSHdistancesExp);
+ lshTestExp.Search(qdata, k, LSHneighborsExp, LSHdistancesExp);
const double recallExp = compute_recall(LSHneighborsExp, groundTruth);
+ //This run should have recall higher than the threshold
BOOST_REQUIRE_GE(recallExp, recallThreshExp);
- //Test: Run a very cheap LSH search, with parameters that should cause recall
- //to be very low. Set the threshhold very high (recall <= 25%) to make sure
- //that a test fail means bad implementation.
- //This mainly checks that user-specified parameters are not ignored.
-
+ //Cheap LSH Run
const int hChp = 1; //small first-level hash width
const int kChp = 1000; //large number of projections per table
const int tChp = 1; //only one table
const double recallThreshChp = 0.25; //recall threshold
- LSHSearch<> lsh_test_chp(
+ LSHSearch<> lshTestChp(
rdata,
kChp,
tChp,
@@ -222,9 +309,11 @@ BOOST_AUTO_TEST_CASE(LSHSearchTest)
bucketSize);
arma::Mat<size_t> LSHneighborsChp;
arma::mat LSHdistancesChp;
- lsh_test_chp.Search(qdata, k, LSHneighborsChp, LSHdistancesChp);
+ lshTestChp.Search(qdata, k, LSHneighborsChp, LSHdistancesChp);
const double recallChp = compute_recall(LSHneighborsChp, groundTruth);
+
+ //This run should have recall lower than the threshold
BOOST_REQUIRE_LE(recallChp, recallThreshChp);
}
More information about the mlpack-git
mailing list