[mlpack-git] master: Fixes style, adds documentation (fa7f62d)
gitdub at mlpack.org
gitdub at mlpack.org
Fri Jun 17 08:29:33 EDT 2016
Repository : https://github.com/mlpack/mlpack
On branch : master
Link : https://github.com/mlpack/mlpack/compare/eaa7182ebed8cce3fd6191dc1f8170546ea297da...812048c7c6bee0b6c8d936677f23bbb5930c6cfc
>---------------------------------------------------------------
commit fa7f62da6a4cfe7fa45e297d7a4a1491c9c39bb1
Author: Yannis Mentekidis <mentekid at gmail.com>
Date: Fri Jun 17 15:29:33 2016 +0300
Fixes style, adds documentation
>---------------------------------------------------------------
fa7f62da6a4cfe7fa45e297d7a4a1491c9c39bb1
src/mlpack/methods/lsh/lsh_search.hpp | 33 ++++++++++++++++++-
src/mlpack/methods/lsh/lsh_search_impl.hpp | 53 +++++++++++++++---------------
2 files changed, 59 insertions(+), 27 deletions(-)
diff --git a/src/mlpack/methods/lsh/lsh_search.hpp b/src/mlpack/methods/lsh/lsh_search.hpp
index 3319e57..364ef19 100644
--- a/src/mlpack/methods/lsh/lsh_search.hpp
+++ b/src/mlpack/methods/lsh/lsh_search.hpp
@@ -19,7 +19,23 @@
* organization={ACM}
* }
*
+ * Additionally, the class implements Multiprobe LSH, which improves
+ * approximation results during the search for approximate nearest neighbors.
+ * The Multiprobe LSH algorithm was presented in the paper:
+ *
+ * @inproceedings{Lv2007multiprobe,
+ * tile={Multi-probe LSH: efficient indexing for high-dimensional similarity
+ * search},
+ * author={Lv, Qin and Josephson, William and Wang, Zhe and Charikar, Moses and
+ * Li, Kai},
+ * booktitle={Proceedings of the 33rd international conference on Very large
+ * data bases},
+ * year={2007},
+ * pages={950--961}
+ * }
+ *
*/
+
#ifndef MLPACK_METHODS_NEIGHBOR_SEARCH_LSH_SEARCH_HPP
#define MLPACK_METHODS_NEIGHBOR_SEARCH_LSH_SEARCH_HPP
@@ -235,6 +251,10 @@ class LSHSearch
* @param referenceIndices The list of neighbor candidates obtained from
* hashing the query into all the hash tables and eventually into
* multiple buckets of the second hash table.
+ * @param numTablesToSearch The number of tables to perform the search in. If
+ * 0, all tables are searched.
+ * @param T The number of additional probing bins for multiprobe LSH. If 0,
+ * single-probe is used.
*/
template<typename VecType>
void ReturnIndicesFromTable(const VecType& queryPoint,
@@ -298,7 +318,18 @@ class LSHSearch
const double distance) const;
/**
- TODO: Document this
+ * This function implements the core idea behind Multiprobe LSH. It is called
+ * by ReturnIndicesFromTables when T > 0. Given a query's code and its
+ * projection location, GetAdditionalProbingBins will calculate the T most
+ * likely alternative bin codes (other than queryCode) where a query's
+ * neighbors might be found in.
+ *
+ * @param queryCode vector containing the numProj-dimensional query code.
+ * @param queryCodeNotFloored vector containing the projection location of the
+ * query.
+ * @param T number of additional probing bins.
+ * @param additionalProbingBins matrix. Each column will hold one additional
+ * bin.
*/
void GetAdditionalProbingBins(const arma::vec &queryCode,
const arma::vec &queryCodeNotFloored,
diff --git a/src/mlpack/methods/lsh/lsh_search_impl.hpp b/src/mlpack/methods/lsh/lsh_search_impl.hpp
index bc49662..36e4afa 100644
--- a/src/mlpack/methods/lsh/lsh_search_impl.hpp
+++ b/src/mlpack/methods/lsh/lsh_search_impl.hpp
@@ -9,7 +9,6 @@
#include <mlpack/core.hpp>
-using std::cout; using std::endl; //TODO: remove
namespace mlpack {
namespace neighbor {
@@ -342,7 +341,7 @@ void LSHSearch<SortPolicy>::BaseCase(const size_t queryIndex,
}
-// Compare class for <double, size_t> pair, used in GetAdditionalProbingBins
+// Compare class for <double, size_t> pair, used in GetAdditionalProbingBins.
class CompareGreater
{
public:
@@ -354,9 +353,9 @@ class CompareGreater
}
};
-//Returns the score of a perturbation vector generated by perturbation set A
+//Returns the score of a perturbation vector generated by perturbation set A.
//The score of a pertubation set (vector) is the sum of scores of the
-//participating actions
+//participating actions.
inline double perturbationScore(
const std::vector<size_t> &A,
const arma::vec &scores)
@@ -367,7 +366,8 @@ inline double perturbationScore(
return score;
}
-// Replace max element with max element+1 in perturbation set A
+// Inline function used by GetAdditionalProbingBins. The vector shift operation
+// replaces the largest element of a vector A with (largest element) + 1.
inline void perturbationShift(std::vector<size_t> &A)
{
size_t max_pos = 0;
@@ -383,7 +383,9 @@ inline void perturbationShift(std::vector<size_t> &A)
A[max_pos]++;
}
-// Add 1+max element to perturbation set A
+// Inline function used by GetAdditionalProbingBins. The vector expansion
+// operation adds the element [1 + (largest_element)] to a vector A, where
+// largest_element is the largest element of A.
inline void perturbationExpand(std::vector<size_t> &A)
{
size_t max = A[0];
@@ -400,19 +402,20 @@ inline bool perturbationValid(
const std::vector<size_t> &A,
const size_t numProj)
{
- //stack allocation and initialization to 0 (bool check[numProj] = {0}) made
- //some compilers complain so use new to be safe...
+ // Stack allocation and initialization to 0 (bool check[numProj] = {0}) made
+ // some compilers complain. We use new just to be safe.
bool *check = new bool[numProj]();
for (size_t i = 0; i < A.size(); ++i)
{
+ // Check that we only use valid dimensions. If not, vector is not valid.
if ( A[i] >= 2*numProj)
{
delete []check;
return false;
}
- //check that we only see each dimension once
+ // Check that we only see each dimension once. If not, vector is not valid.
if (check[A[i] % numProj ] == 0)
check[A[i] % numProj ] = 1;
else
@@ -435,24 +438,24 @@ void LSHSearch<SortPolicy>::GetAdditionalProbingBins(
arma::mat &additionalProbingBins) const
{
+ // No additional bins requested. Our work is done.
if (T == 0)
return;
-
-
// Each column of additionalProbingBins is the code of a bin.
additionalProbingBins.set_size(numProj, T);
- // Copy the query's code, then add/subtract according to perturbations
+ // Copy the query's code, then in the end we will add/subtract according
+ // to perturbations we calculated.
for (size_t c = 0; c < T; ++c)
additionalProbingBins.col(c) = queryCode;
// Calculate query point's projection position
- arma::mat projection = queryCode * hashWidth;
+ arma::mat projection = queryCodeNotFloored;
// Use projection to calculate query's distance from hash limits
- arma::vec limLow = queryCodeNotFloored - projection;
+ arma::vec limLow = projection - queryCode * hashWidth;
arma::vec limHigh = hashWidth - limLow;
// calculate scores = distances^2
@@ -482,17 +485,6 @@ void LSHSearch<SortPolicy>::GetAdditionalProbingBins(
if (T <= 2)
{
- // special case: 1 or 2 codes only
- // The second perturbation vector still can't comprise of more than one
- // change in the bin codes. This is because of the way perturbation vectors
- // are generated: First we create the one with the smallest score (Ao) and
- // then we either add 1 extra dimension to it (Ae) or shift it by one (As).
- // Since As contains the second smallest score, and Ae contains both the
- // smallest and the second smallest, it's obvious that score(Ae) >
- // score(As). Therefore the second perturbation vector is ALWAYS the vector
- // containing only the second-lowest scoring perturbation.
-
-
// First, find location of minimum score, generate 1 perturbation vector,
// and add its code to additionalProbingBins column 0.
@@ -514,10 +506,19 @@ void LSHSearch<SortPolicy>::GetAdditionalProbingBins(
return; //done if asked for only 1 code
// Now, find location of second smallest score and generate one more vector.
+ // The second perturbation vector still can't comprise of more than one
+ // change in the bin codes. This is because of the way perturbation vectors
+ // are generated: First we create the one with the smallest score (Ao) and
+ // then we either add 1 extra dimension to it (Ae) or shift it by one (As).
+ // Since As contains the second smallest score, and Ae contains both the
+ // smallest and the second smallest, it's obvious that score(Ae) >
+ // score(As). Therefore the second perturbation vector is ALWAYS the vector
+ // containing only the second-lowest scoring perturbation.
+
double minscore2 = scores[0];
size_t minloc2 = 0;
- for (size_t s = 0; s < 2 * numProj; ++s) // here we can't start from 0
+ for (size_t s = 0; s < 2 * numProj; ++s) // here we can't start from 1
{
if ( minscore2 > scores[s] && s != minloc) //second smallest
{
More information about the mlpack-git
mailing list