[mlpack-git] master: Workaround to avoid copy of hashMat (e2596c5)
gitdub at mlpack.org
gitdub at mlpack.org
Thu Jun 23 12:17:23 EDT 2016
Repository : https://github.com/mlpack/mlpack
On branch : master
Link : https://github.com/mlpack/mlpack/compare/eaa7182ebed8cce3fd6191dc1f8170546ea297da...812048c7c6bee0b6c8d936677f23bbb5930c6cfc
>---------------------------------------------------------------
commit e2596c5ac626fe6264d89ec4d546506228e16d56
Author: Yannis Mentekidis <mentekid at gmail.com>
Date: Thu Jun 23 17:17:23 2016 +0100
Workaround to avoid copy of hashMat
>---------------------------------------------------------------
e2596c5ac626fe6264d89ec4d546506228e16d56
src/mlpack/methods/lsh/lsh_search_impl.hpp | 110 +++++++++++++----------------
1 file changed, 50 insertions(+), 60 deletions(-)
diff --git a/src/mlpack/methods/lsh/lsh_search_impl.hpp b/src/mlpack/methods/lsh/lsh_search_impl.hpp
index f7883bc..5bf5318 100644
--- a/src/mlpack/methods/lsh/lsh_search_impl.hpp
+++ b/src/mlpack/methods/lsh/lsh_search_impl.hpp
@@ -509,29 +509,27 @@ void LSHSearch<SortPolicy>::GetAdditionalProbingBins(
additionalProbingBins.col(c) = queryCode;
- // Calculate query point's projection position
+ // Calculate query point's projection position.
arma::mat projection = queryCodeNotFloored;
- // Use projection to calculate query's distance from hash limits
+ // Use projection to calculate query's distance from hash limits.
arma::vec limLow = projection - queryCode * hashWidth;
arma::vec limHigh = hashWidth - limLow;
- // calculate scores = distances^2
+ // Calculate scores. score = distance^2.
arma::vec scores(2 * numProj);
scores.rows(0, numProj - 1) = arma::pow(limLow, 2);
scores.rows(numProj, (2 * numProj) - 1) = arma::pow(limHigh, 2);
- // actions vector shows what transformation to apply to a coordinate
+ // Actions vector describes what perturbation (-1/+1) corresponds to a score.
arma::Col<short int> actions(2 * numProj); // will be [-1 ... 1 ...]
-
actions.rows(0, numProj - 1) = // first numProj rows
-1 * arma::ones< arma::Col<short int> > (numProj); // -1s
-
actions.rows(numProj, (2 * numProj) - 1) = // last numProj rows
arma::ones< arma::Col<short int> > (numProj); // 1s
- // acting dimension vector shows which coordinate to transform according to
+ // Acting dimension vector shows which coordinate to transform according to
// actions described by actions vector
arma::Col<size_t> positions(2 * numProj); // will be [0 1 2 ... 0 1 2 ...]
positions.rows(0, numProj - 1) =
@@ -539,14 +537,14 @@ void LSHSearch<SortPolicy>::GetAdditionalProbingBins(
positions.rows(numProj, 2 * numProj - 1) =
arma::linspace< arma::Col<size_t> >(0, numProj - 1, numProj);
- // optimization: no need to create heap for 1 or 2 codes
+ // Special case: No need to create heap for 1 or 2 codes.
if (T <= 2)
{
// First, find location of minimum score, generate 1 perturbation vector,
// and add its code to additionalProbingBins column 0.
- // find location and value of smallest element of scores vector
+ // Find location and value of smallest element of scores vector.
double minscore = scores[0];
size_t minloc = 0;
for (size_t s = 1; s < (2 * numProj); ++s)
@@ -558,14 +556,14 @@ void LSHSearch<SortPolicy>::GetAdditionalProbingBins(
}
}
- // add or subtract 1 to dimension corresponding to minimum score
+ // Add or subtract 1 to dimension corresponding to minimum score.
additionalProbingBins(positions[minloc], 0) += actions[minloc];
if (T == 1)
- return; //done if asked for only 1 code
+ return; // Done if asked for only 1 code.
// Now, find location of second smallest score and generate one more vector.
// The second perturbation vector still can't comprise of more than one
- // change in the bin codes. This is because of the way perturbation vectors
+ // change in the bin codes, because of the way perturbation vectors
// are generated: First we create the one with the smallest score (Ao) and
// then we either add 1 extra dimension to it (Ae) or shift it by one (As).
// Since As contains the second smallest score, and Ae contains both the
@@ -573,7 +571,6 @@ void LSHSearch<SortPolicy>::GetAdditionalProbingBins(
// score(As). Therefore the second perturbation vector is ALWAYS the vector
// containing only the second-lowest scoring perturbation.
-
double minscore2 = scores[0];
size_t minloc2 = 0;
for (size_t s = 0; s < (2 * numProj); ++s) // here we can't start from 1
@@ -585,13 +582,14 @@ void LSHSearch<SortPolicy>::GetAdditionalProbingBins(
}
}
- // add or subtract 1 to create second-lowest scoring vector
+ // Add or subtract 1 to create second-lowest scoring vector.
additionalProbingBins(positions[minloc2], 1) += actions[minloc2];
return;
}
+
// General case: more than 2 perturbation vectors require use of minheap.
- // sort everything in increasing order
+ // Sort everything in increasing order.
arma::Col<long long unsigned int> sortidx = arma::sort_index(scores);
scores = scores(sortidx);
actions = actions(sortidx);
@@ -599,7 +597,6 @@ void LSHSearch<SortPolicy>::GetAdditionalProbingBins(
// Theory:
- // This is the part that creates the probing sequence.
// A probing sequence is a sequence of T probing bins where a query's
// neighbors are most likely to be. Likelihood is dependent only on a bin's
// score, which is the sum of scores of all dimension-action pairs, so we
@@ -686,22 +683,20 @@ void LSHSearch<SortPolicy>::GetAdditionalProbingBins(
Ao[0] = 1; // Smallest vector includes only smallest score.
std::vector< std::vector<bool> > perturbationSets;
- perturbationSets.push_back(Ao); // storage of perturbation sets
+ perturbationSets.push_back(Ao); // Storage of perturbation sets.
std::priority_queue<
std::pair<double, size_t>, // contents: pairs of (score, index)
std::vector< // container: vector of pairs
std::pair<double, size_t>
>,
- std::greater< std::pair<double, size_t> > // comparator of pairs(compare scores)
+ std::greater< std::pair<double, size_t> > // comparator of pairs
> minHeap; // our minheap
- // Start by adding the lowest scoring set to the minheap
- // std::pair<double, size_t> pair0( perturbationScore(Ao, scores), 0 );
- // minHeap.push(pair0);
+ // Start by adding the lowest scoring set to the minheap.
minHeap.push( std::make_pair(perturbationScore(Ao, scores), 0) );
- // loop invariable: after pvec iterations, additionalProbingBins contains pvec
+ // Loop invariable: after pvec iterations, additionalProbingBins contains pvec
// valid codes of the lowest-scoring bins (bins most likely to contain
// neighbors of the query).
for (size_t pvec = 0; pvec < T; ++pvec)
@@ -709,13 +704,14 @@ void LSHSearch<SortPolicy>::GetAdditionalProbingBins(
std::vector<bool> Ai;
do
{
- // get the perturbation set corresponding to the minimum score
+ // Get the perturbation set corresponding to the minimum score.
Ai = perturbationSets[ minHeap.top().second ];
minHeap.pop(); // .top() returns, .pop() removes
- // modify Ai (shift)
+ // Shift operation on Ai (replace max with max+1).
std::vector<bool> As = Ai;
- if ( perturbationShift(As) && perturbationValid(As, numProj) ) // Don't add invalid sets.
+ if ( perturbationShift(As) && perturbationValid(As, numProj) )
+ // Don't add invalid sets.
{
perturbationSets.push_back(As); // add shifted set to sets
minHeap.push(
@@ -725,9 +721,10 @@ void LSHSearch<SortPolicy>::GetAdditionalProbingBins(
);
}
- // modify Ai (expand)
+ // Expand operation on Ai (add max+1 to set).
std::vector<bool> Ae = Ai;
- if ( perturbationExpand(Ae) && perturbationValid(Ae, numProj) ) // Don't add invalid sets.
+ if ( perturbationExpand(Ae) && perturbationValid(Ae, numProj) )
+ // Don't add invalid sets.
{
perturbationSets.push_back(Ae); // add expanded set to sets
minHeap.push(
@@ -739,11 +736,11 @@ void LSHSearch<SortPolicy>::GetAdditionalProbingBins(
}while (! perturbationValid(Ai, numProj) );//Discard invalid perturbations
- // add perturbation vector to probing sequence if valid
- for (size_t i = 0; i < Ai.size(); ++i)
- additionalProbingBins(positions(i), pvec)
- += Ai[i] ? actions(i) : 0; // if A(i) marked, add action to probing vector
- }
+ // Found valid perturbation set Ai. Construct perturbation vector from set.
+ for (size_t pos = 0; pos < Ai.size(); ++pos)
+ // If Ai[pos] is marked, add action to probing vector.
+ additionalProbingBins(positions(pos), pvec)
+ += Ai[pos] ? actions(pos) : 0; }
}
@@ -777,48 +774,41 @@ void LSHSearch<SortPolicy>::ReturnIndicesFromTable(
queryCodesNotFloored += offsets.cols(0, numTablesToSearch - 1);
allProjInTables = arma::floor(queryCodesNotFloored/hashWidth);
- // Compute the hash value of each key of the query into a bucket of the
- // 'secondHashTable' using the 'secondHashWeights'.
- arma::Row<size_t> hashVec =
- arma::conv_to< arma::Row<size_t> >::
- from( secondHashWeights.t() * allProjInTables ); // typecast to floor
- // mod to compute 2nd-level codes
- for (size_t i = 0; i < hashVec.n_elem; i++)
- hashVec[i] = (hashVec[i] % secondHashSize);
-
- Log::Assert(hashVec.n_elem == numTablesToSearch);
- // Compute hashVectors of additional probing bins
+ // Use hashMat to store the primary probing codes and any additional codes
+ // from multiprobe LSH.
arma::Mat<size_t> hashMat;
+ hashMat.set_size(T + 1, numTablesToSearch);
+
+ // Compute the primary hash value of each key of the query into a bucket of
+ // the secondHashTable using the secondHashWeights.
+ hashMat.row(0) =
+ arma::conv_to< arma::Row<size_t> >:: // floor by typecasting to size_t
+ from( secondHashWeights.t() * allProjInTables );
+ // mod to compute 2nd-level codes
+ for (size_t i = 0; i < numTablesToSearch; i++)
+ hashMat(0, i) = (hashMat(0, i) % secondHashSize);
+
+ // Compute hash codes of additional probing bins
if (T > 0)
{
- hashMat.zeros(T, numTablesToSearch);
-
for (size_t i = 0; i < numTablesToSearch; ++i)
{
- // construct this table's probing sequence of length T
+ // Construct this table's probing sequence of length T.
arma::mat additionalProbingBins;
GetAdditionalProbingBins(allProjInTables.unsafe_col(i),
queryCodesNotFloored.unsafe_col(i),
T,
additionalProbingBins);
- // map the probing bin to second hash table bins
- hashMat.col(i) =
- arma::conv_to< arma::Col<size_t> >::
- from(additionalProbingBins.t() * secondHashWeights); // typecast floor
- for (size_t p = 0; p < T; ++p)
+ // Map each probing bin to a bin in secondHashTable (just like we did for
+ // the primary hash table).
+ hashMat(arma::span(1, T), i) = // Compute code of rows 1:end of column i
+ arma::conv_to< arma::Col<size_t> >:: // floor by typecasting to size_t
+ from( secondHashWeights.t() * additionalProbingBins );
+ for (size_t p = 1; p < T + 1; ++p)
hashMat(p, i) = (hashMat(p, i) % secondHashSize);
}
-
- // top row of hashMat is primary bins for each table
- hashMat = arma::join_vert(hashVec, hashMat);
- }
- else
- {
- // if not multiprobe, hashMat is only hashVec's elements
- hashMat.set_size(1, numTablesToSearch);
- hashMat.row(0) = hashVec;
}
More information about the mlpack-git
mailing list