[mlpack-git] master: Merged changes from #690 and #675 (840efe9)
gitdub at mlpack.org
gitdub at mlpack.org
Wed Jun 22 14:31:02 EDT 2016
Repository : https://github.com/mlpack/mlpack
On branch : master
Link : https://github.com/mlpack/mlpack/compare/eaa7182ebed8cce3fd6191dc1f8170546ea297da...812048c7c6bee0b6c8d936677f23bbb5930c6cfc
>---------------------------------------------------------------
commit 840efe976789936d718501b0c392b5b082520f5b
Merge: ae81ee5 a9f5622
Author: Yannis Mentekidis <mentekid at gmail.com>
Date: Wed Jun 22 19:31:02 2016 +0100
Merged changes from #690 and #675
>---------------------------------------------------------------
840efe976789936d718501b0c392b5b082520f5b
.appveyor.yml | 12 +-
.gitignore | 5 -
CMakeLists.txt | 2 +-
HISTORY.md | 9 +-
LICENSE.txt | 12 +-
README.md | 2 +-
doc/tutorials/kmeans/kmeans.txt | 10 +-
src/mlpack/core.hpp | 2 +-
src/mlpack/core/arma_extend/CMakeLists.txt | 4 -
src/mlpack/core/arma_extend/Mat_extra_bones.hpp | 10 +
src/mlpack/core/arma_extend/Mat_extra_meat.hpp | 10 +
src/mlpack/core/arma_extend/README.md | 23 +
src/mlpack/core/arma_extend/SpMat_extra_bones.hpp | 40 +-
src/mlpack/core/arma_extend/SpMat_extra_meat.hpp | 323 +------------
src/mlpack/core/arma_extend/arma_extend.hpp | 4 -
src/mlpack/core/arma_extend/fn_ind2sub.hpp | 8 +
src/mlpack/core/arma_extend/hdf5_misc.hpp | 11 +
src/mlpack/core/arma_extend/operator_minus.hpp | 10 +
src/mlpack/core/arma_extend/promote_type.hpp | 50 --
src/mlpack/core/arma_extend/restrictors.hpp | 22 -
src/mlpack/core/arma_extend/traits.hpp | 49 --
src/mlpack/core/arma_extend/typedef.hpp | 30 --
src/mlpack/core/boost_backport/LICENSE.txt | 23 +
src/mlpack/core/boost_backport/README.md | 11 +-
.../unordered_collections_load_imp.hpp | 3 +
.../unordered_collections_save_imp.hpp | 3 +
src/mlpack/core/boost_backport/unordered_map.hpp | 5 +-
src/mlpack/core/data/CMakeLists.txt | 1 +
src/mlpack/core/data/binarize.hpp | 99 ++++
src/mlpack/core/metrics/ip_metric.hpp | 2 +-
src/mlpack/core/metrics/ip_metric_impl.hpp | 10 +-
src/mlpack/core/metrics/lmetric.hpp | 3 +-
src/mlpack/core/metrics/lmetric_impl.hpp | 57 ++-
src/mlpack/core/optimizers/lbfgs/lbfgs.hpp | 4 +-
.../binary_space_tree/binary_space_tree_impl.hpp | 1 -
.../core/tree/cover_tree/cover_tree_impl.hpp | 1 -
.../core/tree/rectangle_tree/r_star_tree_split.hpp | 12 +-
.../tree/rectangle_tree/r_star_tree_split_impl.hpp | 25 +-
.../core/tree/rectangle_tree/r_tree_split.hpp | 18 +-
.../core/tree/rectangle_tree/r_tree_split_impl.hpp | 51 +-
.../core/tree/rectangle_tree/rectangle_tree.hpp | 4 +-
.../tree/rectangle_tree/rectangle_tree_impl.hpp | 6 +-
.../core/tree/rectangle_tree/x_tree_split.hpp | 9 +-
.../core/tree/rectangle_tree/x_tree_split_impl.hpp | 24 +-
src/mlpack/core/util/CMakeLists.txt | 2 -
src/mlpack/core/util/cli.cpp | 57 +--
src/mlpack/core/util/cli.hpp | 18 +-
src/mlpack/core/util/cli_impl.hpp | 45 +-
src/mlpack/core/util/prefixedoutstream.hpp | 1 -
src/mlpack/core/util/string_util.cpp | 47 --
src/mlpack/core/util/string_util.hpp | 23 -
src/mlpack/methods/ann/ffn.hpp | 24 +-
src/mlpack/methods/ann/rnn.hpp | 20 +-
src/mlpack/methods/cf/cf_main.cpp | 2 +-
src/mlpack/methods/det/det_main.cpp | 2 +-
src/mlpack/methods/emst/dtb_rules.hpp | 28 --
src/mlpack/methods/emst/dtb_rules_impl.hpp | 46 --
src/mlpack/methods/emst/emst_main.cpp | 25 +-
src/mlpack/methods/gmm/gmm_generate_main.cpp | 11 +-
src/mlpack/methods/gmm/gmm_probability_main.cpp | 17 +-
src/mlpack/methods/hmm/hmm_generate_main.cpp | 16 +-
src/mlpack/methods/hmm/hmm_train_main.cpp | 7 +-
src/mlpack/methods/hmm/hmm_viterbi_main.cpp | 13 +-
.../hoeffding_trees/hoeffding_tree_main.cpp | 45 +-
src/mlpack/methods/kmeans/CMakeLists.txt | 1 +
src/mlpack/methods/kmeans/allow_empty_clusters.hpp | 15 +-
.../methods/kmeans/dual_tree_kmeans_impl.hpp | 1 -
src/mlpack/methods/kmeans/elkan_kmeans_impl.hpp | 2 -
src/mlpack/methods/kmeans/hamerly_kmeans_impl.hpp | 2 -
..._empty_clusters.hpp => kill_empty_clusters.hpp} | 25 +-
src/mlpack/methods/kmeans/kmeans_main.cpp | 26 +-
src/mlpack/methods/kmeans/naive_kmeans.hpp | 5 +-
src/mlpack/methods/kmeans/naive_kmeans_impl.hpp | 2 -
.../methods/kmeans/pelleg_moore_kmeans_impl.hpp | 6 +-
src/mlpack/methods/lars/lars_main.cpp | 37 +-
.../linear_regression/linear_regression_main.cpp | 54 ++-
src/mlpack/methods/lsh/lsh_main.cpp | 17 +-
src/mlpack/methods/lsh/lsh_search.hpp | 71 ++-
src/mlpack/methods/lsh/lsh_search_impl.hpp | 241 ++++++----
src/mlpack/methods/mvu/mvu_main.cpp | 18 +-
src/mlpack/methods/neighbor_search/CMakeLists.txt | 30 +-
.../methods/neighbor_search/neighbor_search.hpp | 4 +-
src/mlpack/methods/neighbor_search/ns_model.hpp | 210 +++++++-
.../methods/neighbor_search/ns_model_impl.hpp | 532 +++++++++------------
src/mlpack/methods/pca/pca_main.cpp | 7 +-
src/mlpack/methods/perceptron/perceptron_main.cpp | 15 +-
src/mlpack/methods/preprocess/CMakeLists.txt | 1 +
.../preprocess/preprocess_binarize_main.cpp | 85 ++++
.../methods/preprocess/preprocess_split_main.cpp | 69 +--
src/mlpack/methods/radical/radical_main.cpp | 6 +-
src/mlpack/methods/rann/CMakeLists.txt | 20 +-
.../rann/{allkrann_main.cpp => krann_main.cpp} | 6 +-
src/mlpack/methods/rann/ra_typedef.hpp | 43 +-
.../softmax_regression/softmax_regression_main.cpp | 78 ++-
src/mlpack/tests/CMakeLists.txt | 3 +-
src/mlpack/tests/binarize_test.cpp | 66 +++
...krann_search_test.cpp => krann_search_test.cpp} | 10 +-
src/mlpack/tests/lsh_test.cpp | 218 +++++----
src/mlpack/tests/metric_test.cpp | 4 +-
src/mlpack/tests/rectangle_tree_test.cpp | 1 -
src/mlpack/tests/serialization_test.cpp | 12 +-
101 files changed, 1713 insertions(+), 1699 deletions(-)
diff --cc src/mlpack/methods/lsh/lsh_main.cpp
index b2681da,c4d6ff6..0c22f93
--- a/src/mlpack/methods/lsh/lsh_main.cpp
+++ b/src/mlpack/methods/lsh/lsh_main.cpp
@@@ -64,11 -64,10 +64,12 @@@ PARAM_INT("tables", "The number of has
PARAM_DOUBLE("hash_width", "The hash width for the first-level hashing in the "
"LSH preprocessing. By default, the LSH class automatically estimates a "
"hash width for its use.", "H", 0.0);
+PARAM_INT("num_probes", "Number of additional probes for Multiprobe LSH"
+ " If 0, traditional LSH is used.", "T", 0);
PARAM_INT("second_hash_size", "The size of the second level hash table.", "S",
99901);
- PARAM_INT("bucket_size", "The size of a bucket in the second level hash.", "B",
+ PARAM_INT("bucket_size", "The maximum size of a bucket in the second level "
+ "hash; 0 indicates no limit (so the table can be arbitrarily large!).", "B",
500);
PARAM_INT("seed", "Random seed. If 0, 'std::time(NULL)' is used.", "s", 0);
diff --cc src/mlpack/methods/lsh/lsh_search_impl.hpp
index a89ea36,149beab..2f81b66
--- a/src/mlpack/methods/lsh/lsh_search_impl.hpp
+++ b/src/mlpack/methods/lsh/lsh_search_impl.hpp
@@@ -106,8 -105,8 +106,9 @@@ void LSHSearch<SortPolicy>::Train(cons
if (hashWidth == 0.0) // The user has not provided any value.
{
++ const size_t numSamples = 25;
// Compute a heuristic hash width from the data.
-- for (size_t i = 0; i < 25; i++)
++ for (size_t i = 0; i < numSamples; i++)
{
size_t p1 = (size_t) math::RandInt(referenceSet.n_cols);
size_t p2 = (size_t) math::RandInt(referenceSet.n_cols);
@@@ -116,7 -115,7 +117,7 @@@
referenceSet.unsafe_col(p1), referenceSet.unsafe_col(p2)));
}
-- hashWidth /= 25;
++ hashWidth /= numSamples;
}
Log::Info << "Hash width chosen as: " << hashWidth << std::endl;
@@@ -625,69 -354,31 +618,70 @@@ void LSHSearch<SortPolicy>::ReturnIndic
// Compute the projection of the query in each table.
arma::mat allProjInTables(numProj, numTablesToSearch);
+ arma::mat queryCodesNotFloored(numProj, numTablesToSearch);
for (size_t i = 0; i < numTablesToSearch; i++)
- //allProjInTables.unsafe_col(i) = projections[i].t() * queryPoint;
- allProjInTables.unsafe_col(i) = projections.slice(i).t() * queryPoint;
- allProjInTables += offsets.cols(0, numTablesToSearch - 1);
- allProjInTables /= hashWidth;
+ queryCodesNotFloored.unsafe_col(i) = projections.slice(i).t() * queryPoint;
+ queryCodesNotFloored += offsets.cols(0, numTablesToSearch - 1);
+ allProjInTables = arma::floor(queryCodesNotFloored/hashWidth);
-
// Compute the hash value of each key of the query into a bucket of the
// 'secondHashTable' using the 'secondHashWeights'.
- arma::rowvec hashVec = secondHashWeights.t() * arma::floor(allProjInTables);
-
+ arma::Row<size_t> hashVec =
+ arma::conv_to< arma::Row<size_t> >::
+ from( secondHashWeights.t() * allProjInTables ); // typecast to floor
+ // mod to compute 2nd-level codes
for (size_t i = 0; i < hashVec.n_elem; i++)
- hashVec[i] = (double) ((size_t) hashVec[i] % secondHashSize);
+ hashVec[i] = (hashVec[i] % secondHashSize);
Log::Assert(hashVec.n_elem == numTablesToSearch);
+ // Compute hashVectors of additional probing bins
+ arma::Mat<size_t> hashMat;
+ if (T > 0)
+ {
+ hashMat.zeros(T, numTablesToSearch);
+
+ for (size_t i = 0; i < numTablesToSearch; ++i)
+ {
+ // construct this table's probing sequence of length T
+ arma::mat additionalProbingBins;
+ GetAdditionalProbingBins(allProjInTables.unsafe_col(i),
+ queryCodesNotFloored.unsafe_col(i),
+ T,
+ additionalProbingBins);
+
+ // map the probing bin to second hash table bins
+ hashMat.col(i) =
+ arma::conv_to< arma::Col<size_t> >::
+ from(additionalProbingBins.t() * secondHashWeights); // typecast floor
+ for (size_t p = 0; p < T; ++p)
+ hashMat(p, i) = (hashMat(p, i) % secondHashSize);
+ }
+
+ // top row of hashMat is primary bins for each table
+ hashMat = arma::join_vert(hashVec, hashMat);
+ }
+ else
+ {
+ // if not multiprobe, hashMat is only hashVec's elements
+ hashMat.set_size(1, numTablesToSearch);
+ hashMat.row(0) = hashVec;
+ }
+
- // Count number of points hashed in the same bucket as the query
++
+ // Count number of points hashed in the same bucket as the query.
size_t maxNumPoints = 0;
- for (size_t i = 0; i < numTablesToSearch; ++i) //For all tables
+ for (size_t i = 0; i < numTablesToSearch; ++i)
{
- const size_t hashInd = (size_t) hashVec[i];
- const size_t tableRow = bucketRowInHashTable[hashInd];
- if (tableRow != secondHashSize)
- maxNumPoints += bucketContentSize[tableRow];
+ for (size_t p = 0; p < T + 1; ++p)
+ {
- size_t hashInd = (size_t) hashMat(p, i); //find query's bucket
- maxNumPoints += bucketContentSize[hashInd]; //count bucket contents
++ const size_t hashInd = hashMat(p, i); // find query's bucket
++ const size_t tableRow = bucketRowInHashTable[hashInd];
++ if (tableRow != secondHashSize)
++ maxNumPoints += bucketContentSize[hashInd]; // count bucket contents
+ }
}
-
// There are two ways to proceed here:
// Either allocate a maxNumPoints-size vector, place all candidates, and run
// unique on the vector to discard duplicates.
@@@ -707,25 -398,15 +701,25 @@@
arma::Col<size_t> refPointsConsidered;
refPointsConsidered.zeros(referenceSet->n_cols);
- for (size_t i = 0; i < hashVec.n_elem; ++i)
+ for (size_t i = 0; i < numTablesToSearch; ++i) // for all tables
{
- const size_t hashInd = (size_t) hashVec[i];
- const size_t tableRow = bucketRowInHashTable[hashInd];
+ for (size_t p = 0; p < T + 1; ++p) // for entire probing sequence
+ {
- // Pick the indices in the bucket corresponding to 'hashInd'.
- if (tableRow != secondHashSize)
- for (size_t j = 0; j < bucketContentSize[tableRow]; j++)
- refPointsConsidered[secondHashTable[tableRow](j)]++;
+ // get the sequence code
- size_t hashInd = (size_t) hashMat(p, i);
++ size_t hashInd = hashMat(p, i);
+
+ if (bucketContentSize[hashInd] > 0)
+ {
+ // Pick the indices in the bucket corresponding to hashInd.
+ size_t tableRow = bucketRowInHashTable[hashInd];
- assert(tableRow < secondHashSize);
- assert(tableRow < secondHashTable.n_rows);
-
- for (size_t j = 0; j < bucketContentSize[hashInd]; ++j)
- refPointsConsidered[secondHashTable(tableRow, j)]++;
++ if (tableRow != secondHashSize)
++ {
++ for (size_t j = 0; j < bucketContentSize[tableRow]; ++j)
++ refPointsConsidered[ secondHashTable[tableRow](j) ]++;
++ }
+ }
+ }
}
// Only keep reference points found in at least one bucket.
@@@ -744,22 -425,13 +738,16 @@@
size_t start = 0;
for (size_t i = 0; i < numTablesToSearch; ++i) // For all tables
{
- const size_t hashInd = (size_t) hashVec[i]; // Find the query's bucket.
- const size_t tableRow = bucketRowInHashTable[hashInd];
+ for (size_t p = 0; p < T + 1; ++p)
+ {
- size_t hashInd = hashMat(p, i); // Find the query's bucket.
++ const size_t hashInd = hashMat(p, i); // Find the query's bucket.
++ const size_t tableRow = bucketRowInHashTable[hashInd];
- if (bucketContentSize[hashInd] > 0)
- {
- // tableRow hash indices corresponding to query.
- size_t tableRow = bucketRowInHashTable[hashInd];
- assert(tableRow < secondHashSize);
- assert(tableRow < secondHashTable.n_rows);
-
- // Store all secondHashTable points in the candidates set.
- for (size_t j = 0; j < bucketContentSize[hashInd]; ++j)
- refPointsConsideredSmall(start++) = secondHashTable(tableRow, j);
- }
- // Store all secondHashTable points in the candidates set.
- if (tableRow != secondHashSize)
- for (size_t j = 0; j < bucketContentSize[tableRow]; ++j)
- refPointsConsideredSmall(start++) = secondHashTable[tableRow][j];
++ if (tableRow != secondHashSize)
++ // Store all secondHashTable points in the candidates set.
++ for (size_t j = 0; j < bucketContentSize[tableRow]; ++j)
++ refPointsConsideredSmall(start++) = secondHashTable[tableRow][j];
+ }
}
// Only keep unique candidates.
diff --cc src/mlpack/tests/lsh_test.cpp
index 2d9f556,65d1d78..7ae48d1
--- a/src/mlpack/tests/lsh_test.cpp
+++ b/src/mlpack/tests/lsh_test.cpp
@@@ -477,160 -454,18 +454,155 @@@ BOOST_AUTO_TEST_CASE(DeterministicNoMer
if (neighbors(j, 0) == N || neighbors(j, 1) == N)
continue;
- //query 1 is in cluster 3, which is points 20:29
+ // Query 1 is in cluster 3, which is points 20:29.
q = 0;
- BOOST_REQUIRE(
- neighbors(j, q) < 3 * N / 4 &&
- neighbors(j, q) >= N / 2
- );
+ BOOST_REQUIRE_LT(neighbors(j, q), 3 * N / 4);
+ BOOST_REQUIRE_GE(neighbors(j, q), N / 2);
- //query 2 is in cluster 2, which is points 10:19
+ // Query 2 is in cluster 2, which is points 10:19.
q = 1;
- BOOST_REQUIRE(
- neighbors(j, q) < N / 2 &&
- neighbors(j, q) >= N / 4
- );
+ BOOST_REQUIRE_LT(neighbors(j, q), N / 2);
+ BOOST_REQUIRE_GE(neighbors(j, q), N / 4);
}
-
}
+/**
+ * Test: Create an LSHSearch object and use an increasing number of probes to
+ * search for points. Require that recall for the same object doesn't decrease
+ * with increasing number of probes. Also require that at least a few times
+ * there's some increase in recall.
+ */
+BOOST_AUTO_TEST_CASE(MultiprobeTest)
+{
+ const double epsilonIncrease = 0.05;
+ const size_t repetitions = 5; // train five objects
+
+ const size_t probeTrials = 5;
+ const size_t numProbes[probeTrials] = {0, 1, 2, 3, 4};
+
+
+ /// algorithm parameters
+ const int k = 4;
+ const int numTables = 16;
+ const int numProj = 3;
+ const double hashWidth = 0;
+ const int secondHashSize = 99901;
+ const int bucketSize = 500;
+
+ const string trainSet = "iris_train.csv";
+ const string testSet = "iris_test.csv";
+ arma::mat rdata;
+ arma::mat qdata;
+ data::Load(trainSet, rdata, true);
+ data::Load(testSet, qdata, true);
+
+ // Run classic knn on reference set
+ KNN knn(rdata);
+ arma::Mat<size_t> groundTruth;
+ arma::mat groundDistances;
+ knn.Search(qdata, k, groundTruth, groundDistances);
+
+ bool foundIncrease = 0;
+
+ for (size_t rep = 0; rep < repetitions; ++rep)
+ {
+ // train a model
+ LSHSearch<> multiprobeTest(
+ rdata,
+ numProj,
+ numTables,
+ hashWidth,
+ secondHashSize,
+ bucketSize);
+
+ double prevRecall = 0;
+ // search with varying number of probes
+ for (size_t p = 0; p < probeTrials; ++p)
+ {
+ arma::Mat<size_t> lshNeighbors;
+ arma::mat lshDistances; //move outside of loop for speed?
+
+ multiprobeTest.Search(
+ qdata,
+ k,
+ lshNeighbors,
+ lshDistances,
+ 0,
+ numProbes[p]);
+
+ // compute recall
+ double recall = ComputeRecall(lshNeighbors, groundTruth); //TODO: change to LSHSearch::ComputeRecall??
+ if (p > 0)
+ {
+ // more probes should at the very least not lower recall...
+ BOOST_REQUIRE_GE(recall, prevRecall);
+
+ // ... and should ideally increase it a bit
+ if (recall > prevRecall + epsilonIncrease)
+ foundIncrease = true;
+ prevRecall = recall;
+ }
+ }
+ }
+
+ BOOST_REQUIRE(foundIncrease);
+}
+
+/**
+ * Test: This is a deterministic test that verifies multiprobe LSH works
+ * correctly. To do this, we generate a query , q1, that will end up
+ * outside of the bucket of its nearest neighbors in C1. The neighbors will be
+ * found only by searching additional probing bins.
+ *
+ * We verify this actually is the case.
+ */
+BOOST_AUTO_TEST_CASE(MultiprobeDeterministicTest)
+{
+ // generate known deterministic clusters of points
+ const size_t N = 40;
+ arma::mat rdata;
+ GetPointset(N, rdata);
+
+ const int k = N/2;
+ const double hashWidth = 1;
+ const int secondHashSize = 99901;
+ const int bucketSize = 500;
+
+ // 1 table, projections on orthonormal plane
+ arma::cube projections(2, 2, 1);
+ projections(0, 0, 0) = 1;
+ projections(1, 0, 0) = 0;
+ projections(0, 1, 0) = 0;
+ projections(1, 1, 0) = 1;
+
+ // construct LSH object with given tables
+ LSHSearch<> lshTest(rdata, projections,
+ hashWidth, secondHashSize, bucketSize);
+
+ // get offset of each projection dimension. Since projection table is I(2),
+ // offsets will map to offsets of actual dimensions. This is to enforce that
+ // q1 is right outside the bounding box of C1.
+ arma::mat offsets = lshTest.Offsets();
+
+
+ // two points near the clusters but outside their bins. q1's lowest scoring
+ // perturbation vectors will map to C1 cluster's bins.
+ arma::mat q1;
+ q1 << 1.1 << arma::endr << 3.3; // vector [1.1, 3.3]
+ q1 += offsets;
+
+ arma::Mat<size_t> neighbors;
+ arma::mat distances;
+
+ // Test that q1 simple search comes up empty
+ lshTest.Search(q1, k, neighbors, distances);
+ BOOST_REQUIRE( arma::all(neighbors.col(0) == N) );
+
+ // Searching with 3 additional probing bins should find neighbors
+ lshTest.Search(q1, k, neighbors, distances, 0, 3);
+ BOOST_REQUIRE( arma::all(neighbors.col(0) == N || neighbors.col(0) < 10) );
+}
+
BOOST_AUTO_TEST_CASE(LSHTrainTest)
{
// This is a not very good test that simply checks that the re-trained LSH
More information about the mlpack-git
mailing list