[mlpack-git] master: Adds new constructor to LSHSearch and merges Train() and BuildHash() methods (06fdfa8)
gitdub at mlpack.org
Thu Jun 2 07:39:07 EDT 2016
Repository : https://github.com/mlpack/mlpack
On branch : master
Link : https://github.com/mlpack/mlpack/compare/eba4f9924694bc10daec74ff5059dbb8af001416...e3a23c256f017ebb8185b15847c82f51d359cdfd
>---------------------------------------------------------------
commit 06fdfa866a1d85bc8213c0f86777865174fafd84
Author: Yannis Mentekidis <mentekid at gmail.com>
Date: Thu Jun 2 14:39:07 2016 +0300
Adds new constructor to LSHSearch and merges Train() and BuildHash() methods
>---------------------------------------------------------------
06fdfa866a1d85bc8213c0f86777865174fafd84
HISTORY.md | 10 +
src/mlpack/methods/lsh/lsh_search.hpp | 27 ++-
src/mlpack/methods/lsh/lsh_search_impl.hpp | 331 +++++++++++++++--------------
3 files changed, 205 insertions(+), 163 deletions(-)
diff --git a/HISTORY.md b/HISTORY.md
index cddb28a..de21f89 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -1,5 +1,15 @@
### mlpack 2.0.2
###### 2016-??-??
+ * LSHSearch::Projection(size_t) that returned a single projection matrix has
+ been removed. In its place, LSHSearch::Projections() has been added, which
+ returns an arma::cube with each projection table in a slice (#663).
+
+ * A new constructor has been added to LSHSearch that creates objects using
+ projection tables provided in an arma::cube (#663).
+
+ * LSHSearch::Projections(arma::cube) has been added, which allows users to
+ change the projection tables of an LSHSearch object (#663).
+
* Handle zero-variance dimensions in DET (#515).
* Add MiniBatchSGD optimizer (src/mlpack/core/optimizers/minibatch_sgd/) and
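(Not part of the commit: a minimal usage sketch of the new cube-based constructor described in the HISTORY entries above. The dataset, dimensions, and number of neighbors below are made up for illustration.)

```cpp
#include <mlpack/core.hpp>
#include <mlpack/methods/lsh/lsh_search.hpp>

using namespace mlpack::neighbor;

int main()
{
  // Hypothetical reference set: 100 points in 5 dimensions (one per column).
  arma::mat referenceSet = arma::randu<arma::mat>(5, 100);

  // 4 projection tables of 10 projections each, one table per slice; the cube
  // has size (dimensionality, numProj, numTables).
  arma::cube projections = arma::randn<arma::cube>(5, 10, 4);

  // New constructor (#663): build the hash directly from the given tables.
  LSHSearch<> lsh(referenceSet, projections);

  // Monochromatic search for 3 approximate nearest neighbors.
  arma::Mat<size_t> neighbors;
  arma::mat distances;
  lsh.Search(3, neighbors, distances);

  return 0;
}
```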
diff --git a/src/mlpack/methods/lsh/lsh_search.hpp b/src/mlpack/methods/lsh/lsh_search.hpp
index d3bc2f9..9c7c1d6 100644
--- a/src/mlpack/methods/lsh/lsh_search.hpp
+++ b/src/mlpack/methods/lsh/lsh_search.hpp
@@ -65,6 +65,31 @@ class LSHSearch
* Default values are already provided here.
*/
LSHSearch(const arma::mat& referenceSet,
+ const arma::cube& projections,
+ const double hashWidth = 0.0,
+ const size_t secondHashSize = 99901,
+ const size_t bucketSize = 500);
+
+ /**
+ * This function initializes the LSH class. It builds the hash on the
+ * reference set using the provided projections. See the individual functions
+ * performing the hashing for details on how the hashing is done.
+ *
+ * @param referenceSet Set of reference points and the set of queries.
+ * @param projections Cube of projection tables. For a cube of size (a, b, c),
+ * a is the reference set dimensionality, and we set numProj = b and
+ * numTables = c.
+ * @param hashWidth The hash width for every table. If 0 (the default) is
+ * provided, then the hash width is automatically obtained by computing
+ * the average pairwise distance of 25 pairs. This should be a reasonable
+ * upper bound on the nearest-neighbor distance in general.
+ * @param secondHashSize The size of the second hash table. This should be a
+ * large prime number.
+ * @param bucketSize The size of the bucket in the second hash table. This is
+ * the maximum number of points that can be hashed into a single bucket.
+ * Default values are already provided here.
+ */
+ LSHSearch(const arma::mat& referenceSet,
const size_t numProj,
const size_t numTables,
const double hashWidth = 0.0,
@@ -177,7 +202,7 @@ class LSHSearch
const arma::Mat<size_t>& SecondHashTable() const { return secondHashTable; }
//! Get the projection tables.
- const arma::cube Projections() { return projections; }
+ const arma::cube& Projections() { return projections; }
//! Change the projection tables (this retrains the object).
void Projections(const arma::cube &projTables)
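(Again, not part of the commit: the getter now returns a const reference to the cube instead of a copy, and the cube-taking overload replaces the tables and retrains. A small illustrative snippet; the `lsh` object and its dimensions are hypothetical.)

```cpp
// Assume `lsh` is an existing LSHSearch<> built on some reference set.
const arma::cube& tables = lsh.Projections();  // No copy of the tables is made.

// Swap in freshly drawn Gaussian tables of the same shape; this retrains `lsh`.
arma::cube newTables = arma::randn<arma::cube>(tables.n_rows, tables.n_cols,
                                               tables.n_slices);
lsh.Projections(newTables);
```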
diff --git a/src/mlpack/methods/lsh/lsh_search_impl.hpp b/src/mlpack/methods/lsh/lsh_search_impl.hpp
index afeaf05..c0fc57e 100644
--- a/src/mlpack/methods/lsh/lsh_search_impl.hpp
+++ b/src/mlpack/methods/lsh/lsh_search_impl.hpp
@@ -12,7 +12,7 @@
namespace mlpack {
namespace neighbor {
-// Construct the object.
+// Construct the object with random tables
template<typename SortPolicy>
LSHSearch<SortPolicy>::
LSHSearch(const arma::mat& referenceSet,
@@ -35,6 +35,28 @@ LSHSearch(const arma::mat& referenceSet,
bucketSize);
}
+// Construct the object with given tables
+template<typename SortPolicy>
+LSHSearch<SortPolicy>::
+LSHSearch(const arma::mat& referenceSet,
+ const arma::cube& projections,
+ const double hashWidthIn,
+ const size_t secondHashSize,
+ const size_t bucketSize) :
+ referenceSet(NULL), // This will be set in Train().
+ ownsSet(false),
+ numProj(projections.n_cols),
+ numTables(projections.n_slices),
+ hashWidth(hashWidthIn),
+ secondHashSize(secondHashSize),
+ bucketSize(bucketSize),
+ distanceEvaluations(0)
+{
+ // Pass work to training function
+ Train(referenceSet, numProj, numTables, hashWidthIn, secondHashSize,
+ bucketSize, projections);
+}
+
// Empty constructor.
template<typename SortPolicy>
LSHSearch<SortPolicy>::LSHSearch() :
@@ -98,7 +120,152 @@ void LSHSearch<SortPolicy>::Train(const arma::mat& referenceSet,
Log::Info << "Hash width chosen as: " << hashWidth << std::endl;
- BuildHash(projection);
+ // Hash building procedure:
+ // The first-level hash for a single table outputs a 'numProj'-dimensional
+ // integer key for each point in the set -- (key, pointID).
+ // The key creation details are presented below.
+
+ // Step I: Prepare the second level hash.
+
+ // Obtain the weights for the second hash.
+ secondHashWeights = arma::floor(arma::randu(numProj) *
+ (double) secondHashSize);
+
+ // The 'secondHashTable' is initially an empty matrix of size
+ // ('secondHashSize' x 'bucketSize'). Filling the buckets only as points land
+ // in them allows us to shrink the size of the 'secondHashTable' at the end
+ // of the hashing.
+
+ // Fill the second hash table with n = referenceSet.n_cols. Since no point
+ // has index 'n', its presence in a bucket denotes that there are no more
+ // points in that bucket.
+ secondHashTable.set_size(secondHashSize, bucketSize);
+ secondHashTable.fill(referenceSet.n_cols);
+
+ // Keep track of the size of each bucket in the hash. At the end of hashing
+ // most buckets will be empty.
+ bucketContentSize.zeros(secondHashSize);
+
+ // Instead of putting the points in the row corresponding to the bucket, we
+ // choose the next empty row and keep track of the row in which the bucket
+ // lies. This allows us to stack the filled rows together and slice out the
+ // empty buckets at the end of the hashing.
+ bucketRowInHashTable.set_size(secondHashSize);
+ bucketRowInHashTable.fill(secondHashSize);
+
+ // Keep track of number of non-empty rows in the 'secondHashTable'.
+ size_t numRowsInTable = 0;
+
+ // Step II: The offsets for all projections in all tables.
+ // Since the 'offsets' are in [0, hashWidth], we obtain the 'offsets'
+ // as randu(numProj, numTables) * hashWidth.
+ offsets.randu(numProj, numTables);
+ offsets *= hashWidth;
+
+
+
+
+ // Step III: Obtain the 'numProj' projections for each table.
+ projections.clear(); // Reset projections vector.
+
+ if (projection.n_slices == 0) // Random generation of tables.
+ {
+ // For L2 metric, 2-stable distributions are used, and
+ // the normal Z ~ N(0, 1) is a 2-stable distribution.
+
+ // numTables random tables arranged in a cube.
+ projections.randn(
+ referenceSet.n_rows,
+ numProj,
+ numTables
+ );
+ }
+ else if (projection.n_slices == numTables) // User-defined tables.
+ {
+ projections = projection;
+ }
+ else // Invalid argument.
+ {
+ throw std::invalid_argument(
+ "number of projection tables provided must be equal to numProj"
+ );
+ }
+
+
+ for (size_t i = 0; i < numTables; i++)
+ {
+ // Step IV: create the 'numProj'-dimensional key for each point in each
+ // table.
+
+ // The following code performs the task of hashing each point to a
+ // 'numProj'-dimensional integer key. Hence you get a ('numProj' x
+ // 'referenceSet.n_cols') key matrix.
+ //
+ // For a single table, let the 'numProj' projections be denoted by 'proj_i'
+ // and the corresponding offset be 'offset_i'. Then the key of a single
+ // point is obtained as:
+ // key = { floor( (<proj_i, point> + offset_i) / 'hashWidth' ) forall i }
+ arma::mat offsetMat = arma::repmat(offsets.unsafe_col(i), 1,
+ referenceSet.n_cols);
+ arma::mat hashMat = projections.slice(i).t() * referenceSet;
+ hashMat += offsetMat;
+ hashMat /= hashWidth;
+
+ // Step V: Putting the points in the 'secondHashTable' by hashing the key.
+ // Now we hash every key, point ID to its corresponding bucket.
+ arma::rowvec secondHashVec = secondHashWeights.t() * arma::floor(hashMat);
+
+ // This gives us the bucket for the corresponding point ID.
+ for (size_t j = 0; j < secondHashVec.n_elem; j++)
+ secondHashVec[j] = (double)((size_t) secondHashVec[j] % secondHashSize);
+
+ Log::Assert(secondHashVec.n_elem == referenceSet.n_cols);
+
+ // Insert the point in the corresponding row to its bucket in the
+ // 'secondHashTable'.
+ for (size_t j = 0; j < secondHashVec.n_elem; j++)
+ {
+ // This is the bucket number.
+ size_t hashInd = (size_t) secondHashVec[j];
+ // The point ID is 'j'.
+
+ // If this is currently an empty bucket, start a new row and keep track of
+ // which row corresponds to the bucket.
+ if (bucketContentSize[hashInd] == 0)
+ {
+ // Start a new row for hash.
+ bucketRowInHashTable[hashInd] = numRowsInTable;
+ secondHashTable(numRowsInTable, 0) = j;
+
+ numRowsInTable++;
+ }
+
+ else
+ {
+ // If bucket is already present in the 'secondHashTable', find the
+ // corresponding row and insert the point ID in this row unless the
+ // bucket is full, in which case, do nothing.
+ if (bucketContentSize[hashInd] < bucketSize)
+ secondHashTable(bucketRowInHashTable[hashInd],
+ bucketContentSize[hashInd]) = j;
+ }
+
+ // Increment the count of the points in this bucket.
+ if (bucketContentSize[hashInd] < bucketSize)
+ bucketContentSize[hashInd]++;
+ } // Loop over all points in the reference set.
+ } // Loop over tables.
+
+ // Step VI: Condensing the 'secondHashTable'.
+ size_t maxBucketSize = 0;
+ for (size_t i = 0; i < bucketContentSize.n_elem; i++)
+ if (bucketContentSize[i] > maxBucketSize)
+ maxBucketSize = bucketContentSize[i];
+
+ Log::Info << "Final hash table size: (" << numRowsInTable << " x "
+ << maxBucketSize << ")" << std::endl;
+ secondHashTable.resize(numRowsInTable, maxBucketSize);
}
template<typename SortPolicy>
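(To make the two-level hashing that the merged Train() performs more concrete, here is a stripped-down, self-contained Armadillo sketch of Steps IV and V for a single table. The sizes and seed are arbitrary, and the snippet is illustrative rather than the library code itself.)

```cpp
#include <armadillo>

int main()
{
  arma::arma_rng::set_seed(42);

  const size_t numProj = 10;
  const size_t secondHashSize = 99901;
  const double hashWidth = 1.0;

  // Hypothetical 5-dimensional reference set with 100 points (one per column).
  arma::mat referenceSet = arma::randu<arma::mat>(5, 100);

  // One projection table and its offsets, as in Steps II and III above.
  arma::mat projection = arma::randn<arma::mat>(5, numProj);
  arma::vec offsets = hashWidth * arma::randu<arma::vec>(numProj);
  arma::vec secondHashWeights =
      arma::floor(arma::randu<arma::vec>(numProj) * (double) secondHashSize);

  // Step IV: key = { floor((<proj_i, point> + offset_i) / hashWidth) forall i }.
  arma::mat hashMat = projection.t() * referenceSet;
  hashMat.each_col() += offsets;
  hashMat /= hashWidth;

  // Step V: bucket = <key, secondHashWeights> % secondHashSize.
  arma::rowvec secondHashVec = secondHashWeights.t() * arma::floor(hashMat);
  for (size_t j = 0; j < secondHashVec.n_elem; ++j)
    secondHashVec[j] = (double) ((size_t) secondHashVec[j] % secondHashSize);

  secondHashVec.print("Bucket index for each reference point:");
  return 0;
}
```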
@@ -359,166 +526,6 @@ Search(const size_t k,
template<typename SortPolicy>
void LSHSearch<SortPolicy>::BuildHash(const arma::cube &projection)
{
- // The first level hash for a single table outputs a 'numProj'-dimensional
- // integer key for each point in the set -- (key, pointID)
- // The key creation details are presented below
- //
- // The second level hash is performed by hashing the key to
- // an integer in the range [0, 'secondHashSize').
- //
- // This is done by creating a weight vector 'secondHashWeights' of
- // length 'numProj' with each entry an integer randomly chosen
- // between [0, 'secondHashSize').
- //
- // Then the bucket for any key and its corresponding point is
- // given by <key, 'secondHashWeights'> % 'secondHashSize'
- // and the corresponding point ID is put into that bucket.
-
- // Step I: Prepare the second level hash.
-
- // Obtain the weights for the second hash.
- secondHashWeights = arma::floor(arma::randu(numProj) *
- (double) secondHashSize);
-
- // The 'secondHashTable' is initially an empty matrix of size
- // ('secondHashSize' x 'bucketSize'). But by only filling the buckets
- // as points land in them allows us to shrink the size of the
- // 'secondHashTable' at the end of the hashing.
-
- // Fill the second hash table n = referenceSet.n_cols. This is because no
- // point has index 'n' so the presence of this in the bucket denotes that
- // there are no more points in this bucket.
- secondHashTable.set_size(secondHashSize, bucketSize);
- secondHashTable.fill(referenceSet->n_cols);
-
- // Keep track of the size of each bucket in the hash. At the end of hashing
- // most buckets will be empty.
- bucketContentSize.zeros(secondHashSize);
-
- // Instead of putting the points in the row corresponding to the bucket, we
- // chose the next empty row and keep track of the row in which the bucket
- // lies. This allows us to stack together and slice out the empty buckets at
- // the end of the hashing.
- bucketRowInHashTable.set_size(secondHashSize);
- bucketRowInHashTable.fill(secondHashSize);
-
- // Keep track of number of non-empty rows in the 'secondHashTable'.
- size_t numRowsInTable = 0;
-
- // Step II: The offsets for all projections in all tables.
- // Since the 'offsets' are in [0, hashWidth], we obtain the 'offsets'
- // as randu(numProj, numTables) * hashWidth.
- offsets.randu(numProj, numTables);
- offsets *= hashWidth;
-
- // Step III: Create each hash table in the first level hash one by one and
- // putting them directly into the 'secondHashTable' for memory efficiency.
- //projections.clear(); // Reset projections vector.
-
-
-
- // Step IV: Obtain the 'numProj' projections for each table.
-
- if (projection.n_slices == 0) //random generation of tables
- {
- // For L2 metric, 2-stable distributions are used, and
- // the normal Z ~ N(0, 1) is a 2-stable distribution.
-
- //numTables random tables arranged in a cube
- projections.randn(
- referenceSet->n_rows,
- numProj,
- numTables
- );
- }
- else if (projection.n_slices == numTables) //user defined tables
- {
- projections = projection;
- }
- else //invalid argument
- {
- throw std::invalid_argument(
- "number of projection tables provided must be equal to numProj"
- );
- }
-
-
- for (size_t i = 0; i < numTables; i++)
- {
-
-
- // Step V: create the 'numProj'-dimensional key for each point in each
- // table.
-
- // The following code performs the task of hashing each point to a
- // 'numProj'-dimensional integer key. Hence you get a ('numProj' x
- // 'referenceSet.n_cols') key matrix.
- //
- // For a single table, let the 'numProj' projections be denoted by 'proj_i'
- // and the corresponding offset be 'offset_i'. Then the key of a single
- // point is obtained as:
- // key = { floor( (<proj_i, point> + offset_i) / 'hashWidth' ) forall i }
- arma::mat offsetMat = arma::repmat(offsets.unsafe_col(i), 1,
- referenceSet->n_cols);
- // arma::mat hashMat = projMat.t() * (*referenceSet);
- arma::mat hashMat = projections.slice(i).t() * (*referenceSet);
- hashMat += offsetMat;
- hashMat /= hashWidth;
-
- // Step VI: Putting the points in the 'secondHashTable' by hashing the key.
- // Now we hash every key, point ID to its corresponding bucket.
- arma::rowvec secondHashVec = secondHashWeights.t() * arma::floor(hashMat);
-
- // This gives us the bucket for the corresponding point ID.
- for (size_t j = 0; j < secondHashVec.n_elem; j++)
- secondHashVec[j] = (double)((size_t) secondHashVec[j] % secondHashSize);
-
- Log::Assert(secondHashVec.n_elem == referenceSet->n_cols);
-
- // Insert the point in the corresponding row to its bucket in the
- // 'secondHashTable'.
- for (size_t j = 0; j < secondHashVec.n_elem; j++)
- {
- // This is the bucket number.
- size_t hashInd = (size_t) secondHashVec[j];
- // The point ID is 'j'.
-
- // If this is currently an empty bucket, start a new row keep track of
- // which row corresponds to the bucket.
- if (bucketContentSize[hashInd] == 0)
- {
- // Start a new row for hash.
- bucketRowInHashTable[hashInd] = numRowsInTable;
- secondHashTable(numRowsInTable, 0) = j;
-
- numRowsInTable++;
- }
-
- else
- {
- // If bucket is already present in the 'secondHashTable', find the
- // corresponding row and insert the point ID in this row unless the
- // bucket is full, in which case, do nothing.
- if (bucketContentSize[hashInd] < bucketSize)
- secondHashTable(bucketRowInHashTable[hashInd],
- bucketContentSize[hashInd]) = j;
- }
-
- // Increment the count of the points in this bucket.
- if (bucketContentSize[hashInd] < bucketSize)
- bucketContentSize[hashInd]++;
- } // Loop over all points in the reference set.
- } // Loop over tables.
-
- // Step VII: Condensing the 'secondHashTable'.
- size_t maxBucketSize = 0;
- for (size_t i = 0; i < bucketContentSize.n_elem; i++)
- if (bucketContentSize[i] > maxBucketSize)
- maxBucketSize = bucketContentSize[i];
-
- Log::Info << "Final hash table size: (" << numRowsInTable << " x "
- << maxBucketSize << ")" << std::endl;
- secondHashTable.resize(numRowsInTable, maxBucketSize);
}
template<typename SortPolicy>
More information about the mlpack-git mailing list