[mlpack-git] master: Allow 0 as a bucketSize option. (9c28c08)

gitdub at mlpack.org gitdub at mlpack.org
Tue Jun 14 17:52:19 EDT 2016


Repository : https://github.com/mlpack/mlpack
On branch  : master
Link       : https://github.com/mlpack/mlpack/compare/8d7e5db0bed8fc236407bdc5dee00d716d72a5ab...ae6c9e63b56c1ed1faa9aef9352854bbeb826a2f

>---------------------------------------------------------------

commit 9c28c08ea7255721caef85927d74fe27aa1d47a3
Author: Ryan Curtin <ryan at ratml.org>
Date:   Tue Jun 14 17:52:19 2016 -0400

    Allow 0 as a bucketSize option.


>---------------------------------------------------------------

9c28c08ea7255721caef85927d74fe27aa1d47a3
 src/mlpack/methods/lsh/lsh_main.cpp        |  3 +-
 src/mlpack/methods/lsh/lsh_search.hpp      | 49 +++++++++++++++++++++---------
 src/mlpack/methods/lsh/lsh_search_impl.hpp |  5 +--
 3 files changed, 40 insertions(+), 17 deletions(-)

diff --git a/src/mlpack/methods/lsh/lsh_main.cpp b/src/mlpack/methods/lsh/lsh_main.cpp
index 2894411..7412619 100644
--- a/src/mlpack/methods/lsh/lsh_main.cpp
+++ b/src/mlpack/methods/lsh/lsh_main.cpp
@@ -65,7 +65,8 @@ PARAM_DOUBLE("hash_width", "The hash width for the first-level hashing in the "
     "hash width for its use.", "H", 0.0);
 PARAM_INT("second_hash_size", "The size of the second level hash table.", "S",
     99901);
-PARAM_INT("bucket_size", "The size of a bucket in the second level hash.", "B",
+PARAM_INT("bucket_size", "The maximum size of a bucket in the second level "
+    "hash; 0 indicates no limit (so the table can be arbitrarily large!).", "B",
     500);
 PARAM_INT("seed", "Random seed.  If 0, 'std::time(NULL)' is used.", "s", 0);
 
diff --git a/src/mlpack/methods/lsh/lsh_search.hpp b/src/mlpack/methods/lsh/lsh_search.hpp
index 7cbe1e6..ad285ea 100644
--- a/src/mlpack/methods/lsh/lsh_search.hpp
+++ b/src/mlpack/methods/lsh/lsh_search.hpp
@@ -50,10 +50,9 @@ class LSHSearch
    * performing the hashing for details on how the hashing is done.
    *
    * @param referenceSet Set of reference points and the set of queries.
-   * @param numProj Number of projections in each hash table (anything between
-   *     10-50 might be a decent choice).
-   * @param numTables Total number of hash tables (anything between 10-20
-   *     should suffice).
+   * @param projections Cube of projection tables. For a cube of size (a, b, c)
+   *     we set numProj = a, numTables = c. b is the reference set
+   *     dimensionality.
    * @param hashWidth The width of hash for every table. If 0 (the default) is
    *     provided, then the hash width is automatically obtained by computing
    *     the average pairwise distance of 25 pairs.  This should be a reasonable
@@ -61,8 +60,9 @@ class LSHSearch
    * @param secondHashSize The size of the second hash table. This should be a
    *     large prime number.
    * @param bucketSize The size of the bucket in the second hash table. This is
-   *     the maximum number of points that can be hashed into single bucket.
-   *     Default values are already provided here.
+   *     the maximum number of points that can be hashed into a single bucket.
+   *     value of 0 indicates that there is no limit (so the second hash table
+   *     can be arbitrarily large---be careful!).
    */
   LSHSearch(const arma::mat& referenceSet,
             const arma::cube& projections,
@@ -76,9 +76,10 @@ class LSHSearch
    * performing the hashing for details on how the hashing is done.
    *
    * @param referenceSet Set of reference points and the set of queries.
-   * @param projections Cube of projection tables. For a cube of size (a, b, c)
-   *     we set numProj = a, numTables = c. b is the reference set
-   *     dimensionality.
+   * @param numProj Number of projections in each hash table (anything between
+   *     10-50 might be a decent choice).
+   * @param numTables Total number of hash tables (anything between 10-20
+   *     should suffice).
    * @param hashWidth The width of hash for every table. If 0 (the default) is
    *     provided, then the hash width is automatically obtained by computing
    *     the average pairwise distance of 25 pairs.  This should be a reasonable
@@ -86,8 +87,9 @@ class LSHSearch
    * @param secondHashSize The size of the second hash table. This should be a
    *     large prime number.
    * @param bucketSize The size of the bucket in the second hash table. This is
-   *     the maximum number of points that can be hashed into single bucket.
-   *     Default values are already provided here.
+   *     the maximum number of points that can be hashed into a single bucket.
+   *     value of 0 indicates that there is no limit (so the second hash table
+   *     can be arbitrarily large---be careful!).
    */
   LSHSearch(const arma::mat& referenceSet,
             const size_t numProj,
@@ -108,9 +110,28 @@ class LSHSearch
   ~LSHSearch();
 
   /**
-   * Train the LSH model on the given dataset.  If a correct vector is not
-   * provided, this means building new hash tables. Otherwise, we use the ones
-   * provided by the user.
+   * Train the LSH model on the given dataset.  If a correctly-sized projection
+   * cube is not provided, this means building new hash tables. Otherwise, we
+   * use the projections provided by the user.
+   *
+   * @param referenceSet Set of reference points and the set of queries.
+   * @param numProj Number of projections in each hash table (anything between
+   *     10-50 might be a decent choice).
+   * @param numTables Total number of hash tables (anything between 10-20
+   *     should suffice).
+   * @param hashWidth The width of hash for every table. If 0 (the default) is
+   *     provided, then the hash width is automatically obtained by computing
+   *     the average pairwise distance of 25 pairs.  This should be a reasonable
+   *     upper bound on the nearest-neighbor distance in general.
+   * @param secondHashSize The size of the second hash table. This should be a
+   *     large prime number.
+   * @param bucketSize The size of the bucket in the second hash table. This is
+   *     the maximum number of points that can be hashed into a single bucket.
+   *     value of 0 indicates that there is no limit (so the second hash table
+   *     can be arbitrarily large---be careful!).
+   * @param projections Cube of projection tables. For a cube of size (a, b, c)
+   *     we set numProj = a, numTables = c. b is the reference set
+   *     dimensionality.
    */
   void Train(const arma::mat& referenceSet,
              const size_t numProj,
diff --git a/src/mlpack/methods/lsh/lsh_search_impl.hpp b/src/mlpack/methods/lsh/lsh_search_impl.hpp
index ac65a86..98acad1 100644
--- a/src/mlpack/methods/lsh/lsh_search_impl.hpp
+++ b/src/mlpack/methods/lsh/lsh_search_impl.hpp
@@ -205,8 +205,9 @@ void LSHSearch<SortPolicy>::Train(const arma::mat& referenceSet,
     secondHashBinCounts[secondHashVectors[i]]++;
 
   // Enforce the maximum bucket size.
-  secondHashBinCounts.transform([bucketSize](size_t val)
-      { return std::min(val, bucketSize); });
+  const size_t effectiveBucketSize = (bucketSize == 0) ? SIZE_MAX : bucketSize;
+  secondHashBinCounts.transform([effectiveBucketSize](size_t val)
+      { return std::min(val, effectiveBucketSize); });
 
   const size_t numRowsInTable = arma::accu(secondHashBinCounts > 0);
   bucketContentSize.zeros(numRowsInTable);




More information about the mlpack-git mailing list