[mlpack-git] master: Changes BaseCase to include loop over candidate set. Changes loop variable to signed for OpenMP loops. (b95a3ce)

gitdub at mlpack.org gitdub at mlpack.org
Fri Jul 1 09:44:39 EDT 2016


Repository : https://github.com/mlpack/mlpack
On branch  : master
Link       : https://github.com/mlpack/mlpack/compare/34cf8d94f79c9a72ff4199676033b060cd039fcd...425324bf7fb7c86c85d10a909d8a59d4f69b7164

>---------------------------------------------------------------

commit b95a3ce6f77fffad549e5558d879c2148c28d765
Author: Yannis Mentekidis <mentekid at gmail.com>
Date:   Fri Jul 1 14:44:09 2016 +0100

    Changes BaseCase to include loop over candidate set. Changes loop variable to signed for OpenMP loops.


>---------------------------------------------------------------

b95a3ce6f77fffad549e5558d879c2148c28d765
 src/mlpack/methods/lsh/lsh_search.hpp      |  40 ++++++-
 src/mlpack/methods/lsh/lsh_search_impl.hpp | 177 ++++++++++++++++++-----------
 src/mlpack/tests/lsh_test.cpp              |  51 ++++++++-
 3 files changed, 195 insertions(+), 73 deletions(-)

diff --git a/src/mlpack/methods/lsh/lsh_search.hpp b/src/mlpack/methods/lsh/lsh_search.hpp
index 2e99462..6227ca4 100644
--- a/src/mlpack/methods/lsh/lsh_search.hpp
+++ b/src/mlpack/methods/lsh/lsh_search.hpp
@@ -272,6 +272,40 @@ class LSHSearch
                               arma::uvec& referenceIndices,
                               size_t numTablesToSearch);
 
+//  /**
+//   * This is a helper function that computes the distance of the query to the
+//   * neighbor candidates and appropriately stores the best 'k' candidates.  This
+//   * is specific to the monochromatic search case, where the query set is the
+//   * reference set.
+//   *
+//   * @param queryIndex The index of the query in question
+//   * @param referenceIndex The index of the neighbor candidate in question
+//   * @param neighbors Matrix holding output neighbors.
+//   * @param distances Matrix holding output distances.
+//   */
+//  void BaseCase(const size_t queryIndex,
+//                const size_t referenceIndex,
+//                arma::Mat<size_t>& neighbors,
+//                arma::mat& distances) const;
+//
+//  /**
+//   * This is a helper function that computes the distance of the query to the
+//   * neighbor candidates and appropriately stores the best 'k' candidates.  This
+//   * is specific to bichromatic search, where the query set is not the same as
+//   * the reference set.
+//   *
+//   * @param queryIndex The index of the query in question
+//   * @param referenceIndex The index of the neighbor candidate in question
+//   * @param querySet Set of query points.
+//   * @param neighbors Matrix holding output neighbors.
+//   * @param distances Matrix holding output distances.
+//   */
+//  void BaseCase(const size_t queryIndex,
+//                const size_t referenceIndex,
+//                const arma::mat& querySet,
+//                arma::Mat<size_t>& neighbors,
+//                arma::mat& distances) const;
+
   /**
    * This is a helper function that computes the distance of the query to the
    * neighbor candidates and appropriately stores the best 'k' candidates.  This
@@ -283,8 +317,9 @@ class LSHSearch
    * @param neighbors Matrix holding output neighbors.
    * @param distances Matrix holding output distances.
    */
+  // TODO: change documentation above
   void BaseCase(const size_t queryIndex,
-                const size_t referenceIndex,
+                const arma::uvec& referenceIndices,
                 arma::Mat<size_t>& neighbors,
                 arma::mat& distances) const;
 
@@ -300,8 +335,9 @@ class LSHSearch
    * @param neighbors Matrix holding output neighbors.
    * @param distances Matrix holding output distances.
    */
+  //TODO: change documentation above.
   void BaseCase(const size_t queryIndex,
-                const size_t referenceIndex,
+                const arma::uvec& referenceIndices,
                 const arma::mat& querySet,
                 arma::Mat<size_t>& neighbors,
                 arma::mat& distances) const;
diff --git a/src/mlpack/methods/lsh/lsh_search_impl.hpp b/src/mlpack/methods/lsh/lsh_search_impl.hpp
index 2a76904..dae15a6 100644
--- a/src/mlpack/methods/lsh/lsh_search_impl.hpp
+++ b/src/mlpack/methods/lsh/lsh_search_impl.hpp
@@ -293,6 +293,7 @@ void LSHSearch<SortPolicy>::InsertNeighbor(arma::mat& distances,
   neighbors(pos, queryIndex) = neighbor;
 }
 
+/*
 // Base case where the query set is the reference set.  (So, we can't return
 // ourselves as the nearest neighbor.)
 template<typename SortPolicy>
@@ -319,8 +320,13 @@ void LSHSearch<SortPolicy>::BaseCase(const size_t queryIndex,
 
   // SortDistance() returns (size_t() - 1) if we shouldn't add it.
   if (insertPosition != (size_t() - 1))
-    InsertNeighbor(distances, neighbors, queryIndex, insertPosition,
-        referenceIndex, distance);
+  {
+    #pragma omp critical
+    {
+      InsertNeighbor(distances, neighbors, queryIndex, insertPosition,
+          referenceIndex, distance);
+    }
+  }
 }
 
 // Base case for bichromatic search.
@@ -345,10 +351,79 @@ void LSHSearch<SortPolicy>::BaseCase(const size_t queryIndex,
 
   // SortDistance() returns (size_t() - 1) if we shouldn't add it.
   if (insertPosition != (size_t() - 1))
-    InsertNeighbor(distances, neighbors, queryIndex, insertPosition,
-        referenceIndex, distance);
+  {  
+    #pragma omp critical
+    {
+      InsertNeighbor(distances, neighbors, queryIndex, insertPosition,
+          referenceIndex, distance);
+    }
+  }
 }
+*/
 
+// Base case where the query set is the reference set.  (So, we can't return
+// ourselves as the nearest neighbor.)
+template<typename SortPolicy>
+inline force_inline
+void LSHSearch<SortPolicy>::BaseCase(const size_t queryIndex,
+                                     const arma::uvec& referenceIndices,
+                                     arma::Mat<size_t>& neighbors,
+                                     arma::mat& distances) const
+{
+  for (size_t j = 0; j < referenceIndices.n_elem; ++j)
+  {
+    const size_t referenceIndex = referenceIndices[j];
+    // If the points are the same, skip this point.
+    if (queryIndex == referenceIndex)
+      continue;
+
+    const double distance = metric::EuclideanDistance::Evaluate(
+        referenceSet->unsafe_col(queryIndex),
+        referenceSet->unsafe_col(referenceIndex));
+
+    // If this distance is better than any of the current candidates, the
+    // SortDistance() function will give us the position to insert it into.
+    arma::vec queryDist = distances.unsafe_col(queryIndex);
+    arma::Col<size_t> queryIndices = neighbors.unsafe_col(queryIndex);
+    size_t insertPosition = SortPolicy::SortDistance(queryDist, queryIndices,
+        distance);
+
+    // SortDistance() returns (size_t() - 1) if we shouldn't add it.
+    if (insertPosition != (size_t() - 1))
+      InsertNeighbor(distances, neighbors, queryIndex, insertPosition,
+          referenceIndex, distance);
+  }
+}
+
+// Base case for bichromatic search.
+template<typename SortPolicy>
+inline force_inline
+void LSHSearch<SortPolicy>::BaseCase(const size_t queryIndex,
+                                     const arma::uvec& referenceIndices,
+                                     const arma::mat& querySet,
+                                     arma::Mat<size_t>& neighbors,
+                                     arma::mat& distances) const
+{
+  for (size_t j = 0; j < referenceIndices.n_elem; ++j)
+  {
+    const size_t referenceIndex = referenceIndices[j];
+    const double distance = metric::EuclideanDistance::Evaluate(
+        querySet.unsafe_col(queryIndex),
+        referenceSet->unsafe_col(referenceIndex));
+
+    // If this distance is better than any of the current candidates, the
+    // SortDistance() function will give us the position to insert it into.
+    arma::vec queryDist = distances.unsafe_col(queryIndex);
+    arma::Col<size_t> queryIndices = neighbors.unsafe_col(queryIndex);
+    size_t insertPosition = SortPolicy::SortDistance(queryDist, queryIndices,
+        distance);
+
+    // SortDistance() returns (size_t() - 1) if we shouldn't add it.
+    if (insertPosition != (size_t() - 1))
+      InsertNeighbor(distances, neighbors, queryIndex, insertPosition,
+          referenceIndex, distance);
+  }
+}
 template<typename SortPolicy>
 template<typename VecType>
 void LSHSearch<SortPolicy>::ReturnIndicesFromTable(
@@ -416,19 +491,7 @@ void LSHSearch<SortPolicy>::ReturnIndicesFromTable(
     arma::Col<size_t> refPointsConsidered;
     refPointsConsidered.zeros(referenceSet->n_cols);
 
-    // Define the number of threads used to process this.
-    size_t numThreadsUsed = std::min(maxThreads, numTablesToSearch);
-
-    // Parallelization: By default nested parallelism is off, so this won't be
-    // parallel. The user might turn nested parallelism on if (for example) they
-    // have a query-by-query processing scheme and so processing more than one
-    // query at the same time doesn't make sense for them.
-
-    #pragma omp parallel for \
-    num_threads (numThreadsUsed) \
-    shared (hashVec, refPointsConsidered) \
-    schedule(dynamic)
-    for (size_t i = 0; i < numTablesToSearch; ++i)
+    for (long long int i = 0; i < numTablesToSearch; ++i)
     {
 
       const size_t hashInd = (size_t) hashVec[i];
@@ -436,25 +499,13 @@ void LSHSearch<SortPolicy>::ReturnIndicesFromTable(
 
       // Pick the indices in the bucket corresponding to 'hashInd'.
       if (tableRow != secondHashSize)
-      {
         for (size_t j = 0; j < bucketContentSize[tableRow]; j++)
-        {
-          #pragma omp atomic
           refPointsConsidered[secondHashTable[tableRow](j)]++;
-        }
-      }
     }
 
-    // Only keep reference points found in at least one bucket. If OpenMP is
-    // found, do it in parallel
-    #ifdef OPENMP_FOUND
-      // TODO: change this to our own function?
-      referenceIndices = arma::find(refPointsConsidered > 0);
-      return;
-    #else
-      referenceIndices = arma::find(refPointsConsidered > 0);
-      return;
-    #endif
+    // Only keep reference points found in at least one bucket.
+    referenceIndices = arma::find(refPointsConsidered > 0);
+    return;
   }
   else
   {
@@ -467,45 +518,20 @@ void LSHSearch<SortPolicy>::ReturnIndicesFromTable(
     // Retrieve candidates.
     size_t start = 0;
 
-    // Define the number of threads used to process this.
-    size_t numThreadsUsed = std::min(maxThreads, numTablesToSearch);
-
-    // Parallelization: By default nested parallelism is off, so this won't be
-    // parallel. The user might turn nested parallelism on if (for example) they
-    // have a query-by-query processing scheme and so processing more than one
-    // query at the same time doesn't make sense for them.
-
-    #pragma omp parallel for \
-    num_threads (numThreadsUsed) \
-    shared (hashVec, refPointsConsideredSmall, start) \
-    schedule(dynamic)
-    for (size_t i = 0; i < numTablesToSearch; ++i) // For all tables
+    for (long long int i = 0; i < numTablesToSearch; ++i) // For all tables
     {
       const size_t hashInd = (size_t) hashVec[i]; // Find the query's bucket.
       const size_t tableRow = bucketRowInHashTable[hashInd];
 
       // Store all secondHashTable points in the candidates set.
       if (tableRow != secondHashSize)
-      { 
         for (size_t j = 0; j < bucketContentSize[tableRow]; ++j)
-        {
-          #pragma omp critical
-          {
-            refPointsConsideredSmall(start++) = secondHashTable[tableRow][j];
-          }
-        }
-      }
+          refPointsConsideredSmall(start++) = secondHashTable[tableRow][j];
     }
 
-    // Only keep unique candidates. If OpenMP is found, do it in parallel.
-    #ifdef OPENMP_FOUND
-      // TODO: change this to our own function?
-      referenceIndices = arma::unique(refPointsConsideredSmall);
-      return;
-    #else
-      referenceIndices = arma::unique(refPointsConsideredSmall);
-      return;
-    #endif
+    // Keep only one copy of each candidate.
+    referenceIndices = arma::unique(refPointsConsideredSmall);
+    return;
   }
 }
 
@@ -557,8 +583,9 @@ void LSHSearch<SortPolicy>::Search(const arma::mat& querySet,
     num_threads ( numThreadsUsed )\
     shared(avgIndicesReturned, resultingNeighbors, distances) \
     schedule(dynamic)
-  // Go through every query point.
-  for (size_t i = 0; i < querySet.n_cols; i++)
+  // Go through every query point. Use long int because some compilers complain
+  // for openMP unsigned index variables.
+  for (long long int i = 0; i < querySet.n_cols; i++)
   {
 
     // Hash every query into every hash table and eventually into the
@@ -574,9 +601,17 @@ void LSHSearch<SortPolicy>::Search(const arma::mat& querySet,
 
     // Sequentially go through all the candidates and save the best 'k'
     // candidates.
+    /*
+    numTheadsUsed = std::min( (arma::uword) maxThreads, refIndices.n_elem);
+    #pragma omp parallel for\
+    num_threads( numThreadsUsed )\
+    shared(refIndices, resultingNeighbors, distances, querySet)\
+    schedule(dynamic)
     for (size_t j = 0; j < refIndices.n_elem; j++)
       BaseCase(i, (size_t) refIndices[j], querySet, resultingNeighbors,
           distances);
+    */
+    BaseCase(i, refIndices, querySet, resultingNeighbors, distances);
   }
 
   Timer::Stop("computing_neighbors");
@@ -613,8 +648,9 @@ Search(const size_t k,
     num_threads ( numThreadsUsed )\
     shared(avgIndicesReturned, resultingNeighbors, distances) \
     schedule(dynamic)
-  // Go through every query point.
-  for (size_t i = 0; i < referenceSet->n_cols; i++)
+  // Go through every query point. Use long int because some compilers complain
+  // for openMP unsigned index variables.
+  for (long long int i = 0; i < referenceSet->n_cols; i++)
   {
     // Hash every query into every hash table and eventually into the
     // 'secondHashTable' to obtain the neighbor candidates.
@@ -629,8 +665,17 @@ Search(const size_t k,
 
     // Sequentially go through all the candidates and save the best 'k'
     // candidates.
+
+    /*
+    numTheadsUsed = std::min( (arma::uword) maxThreads, refIndices.n_elem);
+    #pragma omp parallel for\
+    num_threads( numThreadsUsed )\
+    shared(refIndices, resultingNeighbors, distances)\
+    schedule(dynamic)
     for (size_t j = 0; j < refIndices.n_elem; j++)
       BaseCase(i, (size_t) refIndices[j], resultingNeighbors, distances);
+    */
+    BaseCase(i, refIndices, resultingNeighbors, distances);
 
   }
 
diff --git a/src/mlpack/tests/lsh_test.cpp b/src/mlpack/tests/lsh_test.cpp
index ec2e394..c594991 100644
--- a/src/mlpack/tests/lsh_test.cpp
+++ b/src/mlpack/tests/lsh_test.cpp
@@ -509,7 +509,7 @@ BOOST_AUTO_TEST_CASE(ParallelBichromatic)
  */
 BOOST_AUTO_TEST_CASE(ParallelMonochromatic)
 {
-  // kNN and LSH parameters (use LSH default parameters).
+  // kNN and LSH parameters.
   const int k = 4;
   const int numTables = 16;
   const int numProj = 3;
@@ -524,15 +524,56 @@ BOOST_AUTO_TEST_CASE(ParallelMonochromatic)
   arma::Mat<size_t> parallelNeighbors;
   arma::mat distances;
 
-  // Construct an LSH object. By default, it uses the maximum number of threads
-  LSHSearch<> lshTest(rdata, numProj, numTables); //default parameters
+  // Construct an LSH object, using maximum number of available threads.
+  LSHSearch<> lshTest(rdata, numProj, numTables);
   lshTest.Search(k, parallelNeighbors, distances);
 
-  // Now perform same search but with 1 thread
+  // Now perform same search but with 1 thread.
   lshTest.MaxThreads(1);
   lshTest.Search(k, sequentialNeighbors, distances);
 
-  // Require both have same results
+  // Require both have same results.
+  double recall = LSHSearch<>::ComputeRecall(sequentialNeighbors, parallelNeighbors);
+  BOOST_REQUIRE_EQUAL(recall, 1);
+}
+
+/**
+ * Test: This test verifies that processing a query in parallel returns the same
+ * results with processing it sequentially. 
+ * Requires OMP_NESTED environment variable to be set to TRUE. To set it,
+ * execute:
+ * ```
+ * export OMP_NESTED=TRUE
+ * ```
+ */
+BOOST_AUTO_TEST_CASE(ParallelSingleQuery)
+{
+  // kNN and LSH parameters.
+  const int k = 4;
+  const int numTables = 16;
+  const int numProj = 3;
+
+  // Read iris training data as reference and query set.
+  const string trainSet = "iris_train.csv";
+  arma::mat rdata;
+  data::Load(trainSet, rdata, true);
+
+  arma::mat qdata = rdata.col(0); // Only 1 query.
+
+  // Where to store neighbors and distances.
+  arma::Mat<size_t> sequentialNeighbors;
+  arma::Mat<size_t> parallelNeighbors;
+  arma::mat distances;
+
+  // Construct an LSH object. By default, maximum number of threads are used.
+  LSHSearch<> lshTest(rdata, numProj, numTables);
+  lshTest.Search(qdata, k, parallelNeighbors, distances);
+
+  // Now perform same search but with 1 thread.
+  lshTest.MaxThreads(1);
+  lshTest.Search(qdata, k, sequentialNeighbors, distances);
+
+  // Require both have same results.
   double recall = LSHSearch<>::ComputeRecall(sequentialNeighbors, parallelNeighbors);
   BOOST_REQUIRE_EQUAL(recall, 1);
 }




More information about the mlpack-git mailing list