[mlpack-git] master: Changes BaseCase to include loop over candidate set. Changes loop variable to signed for OpenMP loops. (b95a3ce)
gitdub at mlpack.org
gitdub at mlpack.org
Fri Jul 1 09:44:39 EDT 2016
Repository : https://github.com/mlpack/mlpack
On branch : master
Link : https://github.com/mlpack/mlpack/compare/34cf8d94f79c9a72ff4199676033b060cd039fcd...425324bf7fb7c86c85d10a909d8a59d4f69b7164
>---------------------------------------------------------------
commit b95a3ce6f77fffad549e5558d879c2148c28d765
Author: Yannis Mentekidis <mentekid at gmail.com>
Date: Fri Jul 1 14:44:09 2016 +0100
Changes BaseCase to include loop over candidate set. Changes loop variable to signed for OpenMP loops.
>---------------------------------------------------------------
b95a3ce6f77fffad549e5558d879c2148c28d765
src/mlpack/methods/lsh/lsh_search.hpp | 40 ++++++-
src/mlpack/methods/lsh/lsh_search_impl.hpp | 177 ++++++++++++++++++-----------
src/mlpack/tests/lsh_test.cpp | 51 ++++++++-
3 files changed, 195 insertions(+), 73 deletions(-)
diff --git a/src/mlpack/methods/lsh/lsh_search.hpp b/src/mlpack/methods/lsh/lsh_search.hpp
index 2e99462..6227ca4 100644
--- a/src/mlpack/methods/lsh/lsh_search.hpp
+++ b/src/mlpack/methods/lsh/lsh_search.hpp
@@ -272,6 +272,40 @@ class LSHSearch
arma::uvec& referenceIndices,
size_t numTablesToSearch);
+// /**
+// * This is a helper function that computes the distance of the query to the
+// * neighbor candidates and appropriately stores the best 'k' candidates. This
+// * is specific to the monochromatic search case, where the query set is the
+// * reference set.
+// *
+// * @param queryIndex The index of the query in question
+// * @param referenceIndex The index of the neighbor candidate in question
+// * @param neighbors Matrix holding output neighbors.
+// * @param distances Matrix holding output distances.
+// */
+// void BaseCase(const size_t queryIndex,
+// const size_t referenceIndex,
+// arma::Mat<size_t>& neighbors,
+// arma::mat& distances) const;
+//
+// /**
+// * This is a helper function that computes the distance of the query to the
+// * neighbor candidates and appropriately stores the best 'k' candidates. This
+// * is specific to bichromatic search, where the query set is not the same as
+// * the reference set.
+// *
+// * @param queryIndex The index of the query in question
+// * @param referenceIndex The index of the neighbor candidate in question
+// * @param querySet Set of query points.
+// * @param neighbors Matrix holding output neighbors.
+// * @param distances Matrix holding output distances.
+// */
+// void BaseCase(const size_t queryIndex,
+// const size_t referenceIndex,
+// const arma::mat& querySet,
+// arma::Mat<size_t>& neighbors,
+// arma::mat& distances) const;
+
/**
* This is a helper function that computes the distance of the query to the
* neighbor candidates and appropriately stores the best 'k' candidates. This
@@ -283,8 +317,9 @@ class LSHSearch
* @param neighbors Matrix holding output neighbors.
* @param distances Matrix holding output distances.
*/
+ // TODO: change documentation above
void BaseCase(const size_t queryIndex,
- const size_t referenceIndex,
+ const arma::uvec& referenceIndices,
arma::Mat<size_t>& neighbors,
arma::mat& distances) const;
@@ -300,8 +335,9 @@ class LSHSearch
* @param neighbors Matrix holding output neighbors.
* @param distances Matrix holding output distances.
*/
+ //TODO: change documentation above.
void BaseCase(const size_t queryIndex,
- const size_t referenceIndex,
+ const arma::uvec& referenceIndices,
const arma::mat& querySet,
arma::Mat<size_t>& neighbors,
arma::mat& distances) const;
diff --git a/src/mlpack/methods/lsh/lsh_search_impl.hpp b/src/mlpack/methods/lsh/lsh_search_impl.hpp
index 2a76904..dae15a6 100644
--- a/src/mlpack/methods/lsh/lsh_search_impl.hpp
+++ b/src/mlpack/methods/lsh/lsh_search_impl.hpp
@@ -293,6 +293,7 @@ void LSHSearch<SortPolicy>::InsertNeighbor(arma::mat& distances,
neighbors(pos, queryIndex) = neighbor;
}
+/*
// Base case where the query set is the reference set. (So, we can't return
// ourselves as the nearest neighbor.)
template<typename SortPolicy>
@@ -319,8 +320,13 @@ void LSHSearch<SortPolicy>::BaseCase(const size_t queryIndex,
// SortDistance() returns (size_t() - 1) if we shouldn't add it.
if (insertPosition != (size_t() - 1))
- InsertNeighbor(distances, neighbors, queryIndex, insertPosition,
- referenceIndex, distance);
+ {
+ #pragma omp critical
+ {
+ InsertNeighbor(distances, neighbors, queryIndex, insertPosition,
+ referenceIndex, distance);
+ }
+ }
}
// Base case for bichromatic search.
@@ -345,10 +351,79 @@ void LSHSearch<SortPolicy>::BaseCase(const size_t queryIndex,
// SortDistance() returns (size_t() - 1) if we shouldn't add it.
if (insertPosition != (size_t() - 1))
- InsertNeighbor(distances, neighbors, queryIndex, insertPosition,
- referenceIndex, distance);
+ {
+ #pragma omp critical
+ {
+ InsertNeighbor(distances, neighbors, queryIndex, insertPosition,
+ referenceIndex, distance);
+ }
+ }
}
+*/
+// Base case where the query set is the reference set. (So, we can't return
+// ourselves as the nearest neighbor.)
+template<typename SortPolicy>
+inline force_inline
+void LSHSearch<SortPolicy>::BaseCase(const size_t queryIndex,
+ const arma::uvec& referenceIndices,
+ arma::Mat<size_t>& neighbors,
+ arma::mat& distances) const
+{
+ for (size_t j = 0; j < referenceIndices.n_elem; ++j)
+ {
+ const size_t referenceIndex = referenceIndices[j];
+ // If the points are the same, skip this point.
+ if (queryIndex == referenceIndex)
+ continue;
+
+ const double distance = metric::EuclideanDistance::Evaluate(
+ referenceSet->unsafe_col(queryIndex),
+ referenceSet->unsafe_col(referenceIndex));
+
+ // If this distance is better than any of the current candidates, the
+ // SortDistance() function will give us the position to insert it into.
+ arma::vec queryDist = distances.unsafe_col(queryIndex);
+ arma::Col<size_t> queryIndices = neighbors.unsafe_col(queryIndex);
+ size_t insertPosition = SortPolicy::SortDistance(queryDist, queryIndices,
+ distance);
+
+ // SortDistance() returns (size_t() - 1) if we shouldn't add it.
+ if (insertPosition != (size_t() - 1))
+ InsertNeighbor(distances, neighbors, queryIndex, insertPosition,
+ referenceIndex, distance);
+ }
+}
+
+// Base case for bichromatic search.
+template<typename SortPolicy>
+inline force_inline
+void LSHSearch<SortPolicy>::BaseCase(const size_t queryIndex,
+ const arma::uvec& referenceIndices,
+ const arma::mat& querySet,
+ arma::Mat<size_t>& neighbors,
+ arma::mat& distances) const
+{
+ for (size_t j = 0; j < referenceIndices.n_elem; ++j)
+ {
+ const size_t referenceIndex = referenceIndices[j];
+ const double distance = metric::EuclideanDistance::Evaluate(
+ querySet.unsafe_col(queryIndex),
+ referenceSet->unsafe_col(referenceIndex));
+
+ // If this distance is better than any of the current candidates, the
+ // SortDistance() function will give us the position to insert it into.
+ arma::vec queryDist = distances.unsafe_col(queryIndex);
+ arma::Col<size_t> queryIndices = neighbors.unsafe_col(queryIndex);
+ size_t insertPosition = SortPolicy::SortDistance(queryDist, queryIndices,
+ distance);
+
+ // SortDistance() returns (size_t() - 1) if we shouldn't add it.
+ if (insertPosition != (size_t() - 1))
+ InsertNeighbor(distances, neighbors, queryIndex, insertPosition,
+ referenceIndex, distance);
+ }
+}
template<typename SortPolicy>
template<typename VecType>
void LSHSearch<SortPolicy>::ReturnIndicesFromTable(
@@ -416,19 +491,7 @@ void LSHSearch<SortPolicy>::ReturnIndicesFromTable(
arma::Col<size_t> refPointsConsidered;
refPointsConsidered.zeros(referenceSet->n_cols);
- // Define the number of threads used to process this.
- size_t numThreadsUsed = std::min(maxThreads, numTablesToSearch);
-
- // Parallelization: By default nested parallelism is off, so this won't be
- // parallel. The user might turn nested parallelism on if (for example) they
- // have a query-by-query processing scheme and so processing more than one
- // query at the same time doesn't make sense for them.
-
- #pragma omp parallel for \
- num_threads (numThreadsUsed) \
- shared (hashVec, refPointsConsidered) \
- schedule(dynamic)
- for (size_t i = 0; i < numTablesToSearch; ++i)
+ for (long long int i = 0; i < numTablesToSearch; ++i)
{
const size_t hashInd = (size_t) hashVec[i];
@@ -436,25 +499,13 @@ void LSHSearch<SortPolicy>::ReturnIndicesFromTable(
// Pick the indices in the bucket corresponding to 'hashInd'.
if (tableRow != secondHashSize)
- {
for (size_t j = 0; j < bucketContentSize[tableRow]; j++)
- {
- #pragma omp atomic
refPointsConsidered[secondHashTable[tableRow](j)]++;
- }
- }
}
- // Only keep reference points found in at least one bucket. If OpenMP is
- // found, do it in parallel
- #ifdef OPENMP_FOUND
- // TODO: change this to our own function?
- referenceIndices = arma::find(refPointsConsidered > 0);
- return;
- #else
- referenceIndices = arma::find(refPointsConsidered > 0);
- return;
- #endif
+ // Only keep reference points found in at least one bucket.
+ referenceIndices = arma::find(refPointsConsidered > 0);
+ return;
}
else
{
@@ -467,45 +518,20 @@ void LSHSearch<SortPolicy>::ReturnIndicesFromTable(
// Retrieve candidates.
size_t start = 0;
- // Define the number of threads used to process this.
- size_t numThreadsUsed = std::min(maxThreads, numTablesToSearch);
-
- // Parallelization: By default nested parallelism is off, so this won't be
- // parallel. The user might turn nested parallelism on if (for example) they
- // have a query-by-query processing scheme and so processing more than one
- // query at the same time doesn't make sense for them.
-
- #pragma omp parallel for \
- num_threads (numThreadsUsed) \
- shared (hashVec, refPointsConsideredSmall, start) \
- schedule(dynamic)
- for (size_t i = 0; i < numTablesToSearch; ++i) // For all tables
+ for (long long int i = 0; i < numTablesToSearch; ++i) // For all tables
{
const size_t hashInd = (size_t) hashVec[i]; // Find the query's bucket.
const size_t tableRow = bucketRowInHashTable[hashInd];
// Store all secondHashTable points in the candidates set.
if (tableRow != secondHashSize)
- {
for (size_t j = 0; j < bucketContentSize[tableRow]; ++j)
- {
- #pragma omp critical
- {
- refPointsConsideredSmall(start++) = secondHashTable[tableRow][j];
- }
- }
- }
+ refPointsConsideredSmall(start++) = secondHashTable[tableRow][j];
}
- // Only keep unique candidates. If OpenMP is found, do it in parallel.
- #ifdef OPENMP_FOUND
- // TODO: change this to our own function?
- referenceIndices = arma::unique(refPointsConsideredSmall);
- return;
- #else
- referenceIndices = arma::unique(refPointsConsideredSmall);
- return;
- #endif
+ // Keep only one copy of each candidate.
+ referenceIndices = arma::unique(refPointsConsideredSmall);
+ return;
}
}
@@ -557,8 +583,9 @@ void LSHSearch<SortPolicy>::Search(const arma::mat& querySet,
num_threads ( numThreadsUsed )\
shared(avgIndicesReturned, resultingNeighbors, distances) \
schedule(dynamic)
- // Go through every query point.
- for (size_t i = 0; i < querySet.n_cols; i++)
+ // Go through every query point. Use long int because some compilers complain
+ // for openMP unsigned index variables.
+ for (long long int i = 0; i < querySet.n_cols; i++)
{
// Hash every query into every hash table and eventually into the
@@ -574,9 +601,17 @@ void LSHSearch<SortPolicy>::Search(const arma::mat& querySet,
// Sequentially go through all the candidates and save the best 'k'
// candidates.
+ /*
+ numTheadsUsed = std::min( (arma::uword) maxThreads, refIndices.n_elem);
+ #pragma omp parallel for\
+ num_threads( numThreadsUsed )\
+ shared(refIndices, resultingNeighbors, distances, querySet)\
+ schedule(dynamic)
for (size_t j = 0; j < refIndices.n_elem; j++)
BaseCase(i, (size_t) refIndices[j], querySet, resultingNeighbors,
distances);
+ */
+ BaseCase(i, refIndices, querySet, resultingNeighbors, distances);
}
Timer::Stop("computing_neighbors");
@@ -613,8 +648,9 @@ Search(const size_t k,
num_threads ( numThreadsUsed )\
shared(avgIndicesReturned, resultingNeighbors, distances) \
schedule(dynamic)
- // Go through every query point.
- for (size_t i = 0; i < referenceSet->n_cols; i++)
+ // Go through every query point. Use long int because some compilers complain
+ // for openMP unsigned index variables.
+ for (long long int i = 0; i < referenceSet->n_cols; i++)
{
// Hash every query into every hash table and eventually into the
// 'secondHashTable' to obtain the neighbor candidates.
@@ -629,8 +665,17 @@ Search(const size_t k,
// Sequentially go through all the candidates and save the best 'k'
// candidates.
+
+ /*
+ numTheadsUsed = std::min( (arma::uword) maxThreads, refIndices.n_elem);
+ #pragma omp parallel for\
+ num_threads( numThreadsUsed )\
+ shared(refIndices, resultingNeighbors, distances)\
+ schedule(dynamic)
for (size_t j = 0; j < refIndices.n_elem; j++)
BaseCase(i, (size_t) refIndices[j], resultingNeighbors, distances);
+ */
+ BaseCase(i, refIndices, resultingNeighbors, distances);
}
diff --git a/src/mlpack/tests/lsh_test.cpp b/src/mlpack/tests/lsh_test.cpp
index ec2e394..c594991 100644
--- a/src/mlpack/tests/lsh_test.cpp
+++ b/src/mlpack/tests/lsh_test.cpp
@@ -509,7 +509,7 @@ BOOST_AUTO_TEST_CASE(ParallelBichromatic)
*/
BOOST_AUTO_TEST_CASE(ParallelMonochromatic)
{
- // kNN and LSH parameters (use LSH default parameters).
+ // kNN and LSH parameters.
const int k = 4;
const int numTables = 16;
const int numProj = 3;
@@ -524,15 +524,56 @@ BOOST_AUTO_TEST_CASE(ParallelMonochromatic)
arma::Mat<size_t> parallelNeighbors;
arma::mat distances;
- // Construct an LSH object. By default, it uses the maximum number of threads
- LSHSearch<> lshTest(rdata, numProj, numTables); //default parameters
+ // Construct an LSH object, using maximum number of available threads.
+ LSHSearch<> lshTest(rdata, numProj, numTables);
lshTest.Search(k, parallelNeighbors, distances);
- // Now perform same search but with 1 thread
+ // Now perform same search but with 1 thread.
lshTest.MaxThreads(1);
lshTest.Search(k, sequentialNeighbors, distances);
- // Require both have same results
+ // Require both have same results.
+ double recall = LSHSearch<>::ComputeRecall(sequentialNeighbors, parallelNeighbors);
+ BOOST_REQUIRE_EQUAL(recall, 1);
+}
+
+/**
+ * Test: This test verifies that processing a query in parallel returns the same
+ * results with processing it sequentially.
+ * Requires OMP_NESTED environment variable to be set to TRUE. To set it,
+ * execute:
+ * ```
+ * export OMP_NESTED=TRUE
+ * ```
+ */
+BOOST_AUTO_TEST_CASE(ParallelSingleQuery)
+{
+ // kNN and LSH parameters.
+ const int k = 4;
+ const int numTables = 16;
+ const int numProj = 3;
+
+ // Read iris training data as reference and query set.
+ const string trainSet = "iris_train.csv";
+ arma::mat rdata;
+ data::Load(trainSet, rdata, true);
+
+ arma::mat qdata = rdata.col(0); // Only 1 query.
+
+ // Where to store neighbors and distances.
+ arma::Mat<size_t> sequentialNeighbors;
+ arma::Mat<size_t> parallelNeighbors;
+ arma::mat distances;
+
+ // Construct an LSH object. By default, maximum number of threads are used.
+ LSHSearch<> lshTest(rdata, numProj, numTables);
+ lshTest.Search(qdata, k, parallelNeighbors, distances);
+
+ // Now perform same search but with 1 thread.
+ lshTest.MaxThreads(1);
+ lshTest.Search(qdata, k, sequentialNeighbors, distances);
+
+ // Require both have same results.
double recall = LSHSearch<>::ComputeRecall(sequentialNeighbors, parallelNeighbors);
BOOST_REQUIRE_EQUAL(recall, 1);
}
More information about the mlpack-git
mailing list