[mlpack-git] master: Fixes style, adds documentation (fa7f62d)

gitdub at mlpack.org gitdub at mlpack.org
Fri Jun 17 08:29:33 EDT 2016


Repository : https://github.com/mlpack/mlpack
On branch  : master
Link       : https://github.com/mlpack/mlpack/compare/eaa7182ebed8cce3fd6191dc1f8170546ea297da...812048c7c6bee0b6c8d936677f23bbb5930c6cfc

>---------------------------------------------------------------

commit fa7f62da6a4cfe7fa45e297d7a4a1491c9c39bb1
Author: Yannis Mentekidis <mentekid at gmail.com>
Date:   Fri Jun 17 15:29:33 2016 +0300

    Fixes style, adds documentation


>---------------------------------------------------------------

fa7f62da6a4cfe7fa45e297d7a4a1491c9c39bb1
 src/mlpack/methods/lsh/lsh_search.hpp      | 33 ++++++++++++++++++-
 src/mlpack/methods/lsh/lsh_search_impl.hpp | 53 +++++++++++++++---------------
 2 files changed, 59 insertions(+), 27 deletions(-)

diff --git a/src/mlpack/methods/lsh/lsh_search.hpp b/src/mlpack/methods/lsh/lsh_search.hpp
index 3319e57..364ef19 100644
--- a/src/mlpack/methods/lsh/lsh_search.hpp
+++ b/src/mlpack/methods/lsh/lsh_search.hpp
@@ -19,7 +19,23 @@
  *  organization={ACM}
  * }
  *
+ * Additionally, the class implements Multiprobe LSH, which improves 
+ * approximation results during the search for approximate nearest neighbors.
+ * The Multiprobe LSH algorithm was presented in the paper:
+ *
+ * @inproceedings{Lv2007multiprobe,
+ *  tile={Multi-probe LSH: efficient indexing for high-dimensional similarity
+ *  search},
+ *  author={Lv, Qin and Josephson, William and Wang, Zhe and Charikar, Moses and
+ *  Li, Kai},
+ *  booktitle={Proceedings of the 33rd international conference on Very large
+ *  data bases},
+ *  year={2007},
+ *  pages={950--961}
+ * }
+ *
  */
+
 #ifndef MLPACK_METHODS_NEIGHBOR_SEARCH_LSH_SEARCH_HPP
 #define MLPACK_METHODS_NEIGHBOR_SEARCH_LSH_SEARCH_HPP
 
@@ -235,6 +251,10 @@ class LSHSearch
    * @param referenceIndices The list of neighbor candidates obtained from
    *    hashing the query into all the hash tables and eventually into
    *    multiple buckets of the second hash table.
+   * @param numTablesToSearch The number of tables to perform the search in. If
+   * 0, all tables are searched.
+   * @param T The number of additional probing bins for multiprobe LSH. If 0,
+   * single-probe is used.
    */
   template<typename VecType>
   void ReturnIndicesFromTable(const VecType& queryPoint,
@@ -298,7 +318,18 @@ class LSHSearch
                       const double distance) const;
 
   /**
-  TODO: Document this
+   * This function implements the core idea behind Multiprobe LSH. It is called
+   * by ReturnIndicesFromTables when T > 0. Given a query's code and its
+   * projection location, GetAdditionalProbingBins will calculate the T most
+   * likely alternative bin codes (other than queryCode) where a query's
+   * neighbors might be found in.
+   *
+   * @param queryCode vector containing the numProj-dimensional query code.
+   * @param queryCodeNotFloored vector containing the projection location of the
+   * query.
+   * @param T number of additional probing bins.
+   * @param additionalProbingBins matrix. Each column will hold one additional
+   * bin.
   */
   void GetAdditionalProbingBins(const arma::vec &queryCode,
                             const arma::vec &queryCodeNotFloored,
diff --git a/src/mlpack/methods/lsh/lsh_search_impl.hpp b/src/mlpack/methods/lsh/lsh_search_impl.hpp
index bc49662..36e4afa 100644
--- a/src/mlpack/methods/lsh/lsh_search_impl.hpp
+++ b/src/mlpack/methods/lsh/lsh_search_impl.hpp
@@ -9,7 +9,6 @@
 
 #include <mlpack/core.hpp>
 
-using std::cout; using std::endl; //TODO: remove
 
 namespace mlpack {
 namespace neighbor {
@@ -342,7 +341,7 @@ void LSHSearch<SortPolicy>::BaseCase(const size_t queryIndex,
 }
 
 
-// Compare class for <double, size_t> pair, used in GetAdditionalProbingBins
+// Compare class for <double, size_t> pair, used in GetAdditionalProbingBins.
 class CompareGreater
 {
   public:
@@ -354,9 +353,9 @@ class CompareGreater
     }
 };
 
-//Returns the score of a perturbation vector generated by perturbation set A
+//Returns the score of a perturbation vector generated by perturbation set A.
 //The score of a pertubation set (vector) is the sum of scores of the
-//participating actions
+//participating actions.
 inline double perturbationScore(
               const std::vector<size_t> &A,
               const arma::vec &scores)
@@ -367,7 +366,8 @@ inline double perturbationScore(
   return score;
 }
 
-// Replace max element with max element+1 in perturbation set A
+// Inline function used by GetAdditionalProbingBins. The vector shift operation
+// replaces the largest element of a vector A with (largest element) + 1.
 inline void perturbationShift(std::vector<size_t> &A)
 {
   size_t max_pos = 0;
@@ -383,7 +383,9 @@ inline void perturbationShift(std::vector<size_t> &A)
   A[max_pos]++;
 }
 
-// Add 1+max element to perturbation set A
+// Inline function used by GetAdditionalProbingBins. The vector expansion
+// operation adds the element [1 + (largest_element)] to a vector A, where
+// largest_element is the largest element of A.
 inline void perturbationExpand(std::vector<size_t> &A)
 {
   size_t max = A[0];
@@ -400,19 +402,20 @@ inline bool perturbationValid(
     const std::vector<size_t> &A,
     const size_t numProj)
 {
-  //stack allocation and initialization to 0 (bool check[numProj] = {0}) made
-  //some compilers complain so use new to be safe...
+  // Stack allocation and initialization to 0 (bool check[numProj] = {0}) made
+  // some compilers complain. We use new just to be safe.
   bool *check = new bool[numProj]();
 
   for (size_t i = 0; i < A.size(); ++i)
   {
+    // Check that we only use valid dimensions. If not, vector is not valid.
     if ( A[i] >= 2*numProj)
     {
       delete []check;
       return false;
     }
 
-    //check that we only see each dimension once
+    // Check that we only see each dimension once. If not, vector is not valid.
     if (check[A[i] % numProj ] == 0)
       check[A[i] % numProj ] = 1;
     else
@@ -435,24 +438,24 @@ void LSHSearch<SortPolicy>::GetAdditionalProbingBins(
                             arma::mat &additionalProbingBins) const
 {
 
+  // No additional bins requested. Our work is done.
   if (T == 0)
     return;
 
-
-
   // Each column of additionalProbingBins is the code of a bin.
   additionalProbingBins.set_size(numProj, T);
   
-  // Copy the query's code, then add/subtract according to perturbations
+  // Copy the query's code, then in the end we will  add/subtract according 
+  // to perturbations we calculated.
   for (size_t c = 0; c < T; ++c)
     additionalProbingBins.col(c) = queryCode;
 
 
   // Calculate query point's projection position
-  arma::mat projection = queryCode * hashWidth;
+  arma::mat projection = queryCodeNotFloored;
 
   // Use projection to calculate query's distance from hash limits
-  arma::vec limLow = queryCodeNotFloored - projection;
+  arma::vec limLow = projection - queryCode * hashWidth;
   arma::vec limHigh = hashWidth - limLow;
 
   // calculate scores = distances^2
@@ -482,17 +485,6 @@ void LSHSearch<SortPolicy>::GetAdditionalProbingBins(
   if (T <= 2)
   {
 
-    // special case: 1 or 2 codes only
-    // The second perturbation vector still can't comprise of more than one
-    // change in the bin codes. This is because of the way perturbation vectors
-    // are generated: First we create the one with the smallest score (Ao) and
-    // then we either add 1 extra dimension to it (Ae) or shift it by one (As).
-    // Since As contains the second smallest score, and Ae contains both the
-    // smallest and the second smallest, it's obvious that score(Ae) >
-    // score(As). Therefore the second perturbation vector is ALWAYS the vector
-    // containing only the second-lowest scoring perturbation.
-    
-
     // First, find location of minimum score, generate 1 perturbation vector,
     // and add its code to additionalProbingBins column 0.
 
@@ -514,10 +506,19 @@ void LSHSearch<SortPolicy>::GetAdditionalProbingBins(
       return; //done if asked for only 1 code
 
     // Now, find location of second smallest score and generate one more vector.
+    // The second perturbation vector still can't comprise of more than one
+    // change in the bin codes. This is because of the way perturbation vectors
+    // are generated: First we create the one with the smallest score (Ao) and
+    // then we either add 1 extra dimension to it (Ae) or shift it by one (As).
+    // Since As contains the second smallest score, and Ae contains both the
+    // smallest and the second smallest, it's obvious that score(Ae) >
+    // score(As). Therefore the second perturbation vector is ALWAYS the vector
+    // containing only the second-lowest scoring perturbation.
+    
     
     double minscore2 = scores[0];
     size_t minloc2 = 0;
-    for (size_t s = 0; s < 2 * numProj; ++s) // here we can't start from 0
+    for (size_t s = 0; s < 2 * numProj; ++s) // here we can't start from 1
     {
       if ( minscore2 > scores[s] && s != minloc) //second smallest
       {




More information about the mlpack-git mailing list