[mlpack-git] master: Allow 0 as a bucketSize option. (9c28c08)
gitdub at mlpack.org
gitdub at mlpack.org
Tue Jun 14 17:52:19 EDT 2016
Repository : https://github.com/mlpack/mlpack
On branch : master
Link : https://github.com/mlpack/mlpack/compare/8d7e5db0bed8fc236407bdc5dee00d716d72a5ab...ae6c9e63b56c1ed1faa9aef9352854bbeb826a2f
>---------------------------------------------------------------
commit 9c28c08ea7255721caef85927d74fe27aa1d47a3
Author: Ryan Curtin <ryan at ratml.org>
Date: Tue Jun 14 17:52:19 2016 -0400
Allow 0 as a bucketSize option.
>---------------------------------------------------------------
9c28c08ea7255721caef85927d74fe27aa1d47a3
src/mlpack/methods/lsh/lsh_main.cpp | 3 +-
src/mlpack/methods/lsh/lsh_search.hpp | 49 +++++++++++++++++++++---------
src/mlpack/methods/lsh/lsh_search_impl.hpp | 5 +--
3 files changed, 40 insertions(+), 17 deletions(-)
diff --git a/src/mlpack/methods/lsh/lsh_main.cpp b/src/mlpack/methods/lsh/lsh_main.cpp
index 2894411..7412619 100644
--- a/src/mlpack/methods/lsh/lsh_main.cpp
+++ b/src/mlpack/methods/lsh/lsh_main.cpp
@@ -65,7 +65,8 @@ PARAM_DOUBLE("hash_width", "The hash width for the first-level hashing in the "
"hash width for its use.", "H", 0.0);
PARAM_INT("second_hash_size", "The size of the second level hash table.", "S",
99901);
-PARAM_INT("bucket_size", "The size of a bucket in the second level hash.", "B",
+PARAM_INT("bucket_size", "The maximum size of a bucket in the second level "
+ "hash; 0 indicates no limit (so the table can be arbitrarily large!).", "B",
500);
PARAM_INT("seed", "Random seed. If 0, 'std::time(NULL)' is used.", "s", 0);
diff --git a/src/mlpack/methods/lsh/lsh_search.hpp b/src/mlpack/methods/lsh/lsh_search.hpp
index 7cbe1e6..ad285ea 100644
--- a/src/mlpack/methods/lsh/lsh_search.hpp
+++ b/src/mlpack/methods/lsh/lsh_search.hpp
@@ -50,10 +50,9 @@ class LSHSearch
* performing the hashing for details on how the hashing is done.
*
* @param referenceSet Set of reference points and the set of queries.
- * @param numProj Number of projections in each hash table (anything between
- * 10-50 might be a decent choice).
- * @param numTables Total number of hash tables (anything between 10-20
- * should suffice).
+ * @param projections Cube of projection tables. For a cube of size (a, b, c)
+ * we set numProj = a, numTables = c. b is the reference set
+ * dimensionality.
* @param hashWidth The width of hash for every table. If 0 (the default) is
* provided, then the hash width is automatically obtained by computing
* the average pairwise distance of 25 pairs. This should be a reasonable
@@ -61,8 +60,9 @@ class LSHSearch
* @param secondHashSize The size of the second hash table. This should be a
* large prime number.
* @param bucketSize The size of the bucket in the second hash table. This is
- * the maximum number of points that can be hashed into single bucket.
- * Default values are already provided here.
+ * the maximum number of points that can be hashed into single bucket. A
+ * value of 0 indicates that there is no limit (so the second hash table
+ * can be arbitrarily large---be careful!).
*/
LSHSearch(const arma::mat& referenceSet,
const arma::cube& projections,
@@ -76,9 +76,10 @@ class LSHSearch
* performing the hashing for details on how the hashing is done.
*
* @param referenceSet Set of reference points and the set of queries.
- * @param projections Cube of projection tables. For a cube of size (a, b, c)
- * we set numProj = a, numTables = c. b is the reference set
- * dimensionality.
+ * @param numProj Number of projections in each hash table (anything between
+ * 10-50 might be a decent choice).
+ * @param numTables Total number of hash tables (anything between 10-20
+ * should suffice).
* @param hashWidth The width of hash for every table. If 0 (the default) is
* provided, then the hash width is automatically obtained by computing
* the average pairwise distance of 25 pairs. This should be a reasonable
@@ -86,8 +87,9 @@ class LSHSearch
* @param secondHashSize The size of the second hash table. This should be a
* large prime number.
* @param bucketSize The size of the bucket in the second hash table. This is
- * the maximum number of points that can be hashed into single bucket.
- * Default values are already provided here.
+ * the maximum number of points that can be hashed into single bucket. A
+ * value of 0 indicates that there is no limit (so the second hash table
+ * can be arbitrarily large---be careful!).
*/
LSHSearch(const arma::mat& referenceSet,
const size_t numProj,
@@ -108,9 +110,28 @@ class LSHSearch
~LSHSearch();
/**
- * Train the LSH model on the given dataset. If a correct vector is not
- * provided, this means building new hash tables. Otherwise, we use the ones
- * provided by the user.
+ * Train the LSH model on the given dataset. If a correctly-sized projection
+ * cube is not provided, this means building new hash tables. Otherwise, we
+ * use the projections provided by the user.
+ *
+ * @param referenceSet Set of reference points and the set of queries.
+ * @param numProj Number of projections in each hash table (anything between
+ * 10-50 might be a decent choice).
+ * @param numTables Total number of hash tables (anything between 10-20
+ * should suffice).
+ * @param hashWidth The width of hash for every table. If 0 (the default) is
+ * provided, then the hash width is automatically obtained by computing
+ * the average pairwise distance of 25 pairs. This should be a reasonable
+ * upper bound on the nearest-neighbor distance in general.
+ * @param secondHashSize The size of the second hash table. This should be a
+ * large prime number.
+ * @param bucketSize The size of the bucket in the second hash table. This is
+ * the maximum number of points that can be hashed into single bucket. A
+ * value of 0 indicates that there is no limit (so the second hash table
+ * can be arbitrarily large---be careful!).
+ * @param projections Cube of projection tables. For a cube of size (a, b, c)
+ * we set numProj = a, numTables = c. b is the reference set
+ * dimensionality.
*/
void Train(const arma::mat& referenceSet,
const size_t numProj,
diff --git a/src/mlpack/methods/lsh/lsh_search_impl.hpp b/src/mlpack/methods/lsh/lsh_search_impl.hpp
index ac65a86..98acad1 100644
--- a/src/mlpack/methods/lsh/lsh_search_impl.hpp
+++ b/src/mlpack/methods/lsh/lsh_search_impl.hpp
@@ -205,8 +205,9 @@ void LSHSearch<SortPolicy>::Train(const arma::mat& referenceSet,
secondHashBinCounts[secondHashVectors[i]]++;
// Enforce the maximum bucket size.
- secondHashBinCounts.transform([bucketSize](size_t val)
- { return std::min(val, bucketSize); });
+ const size_t effectiveBucketSize = (bucketSize == 0) ? SIZE_MAX : bucketSize;
+ secondHashBinCounts.transform([effectiveBucketSize](size_t val)
+ { return std::min(val, effectiveBucketSize); });
const size_t numRowsInTable = arma::accu(secondHashBinCounts > 0);
bucketContentSize.zeros(numRowsInTable);
More information about the mlpack-git
mailing list