[mlpack-git] master: Refactor utility functionality into RAUtil class. (9768ed1)
gitdub at big.cc.gt.atl.ga.us
gitdub at big.cc.gt.atl.ga.us
Wed Jul 29 16:42:32 EDT 2015
Repository : https://github.com/mlpack/mlpack
On branch : master
Link : https://github.com/mlpack/mlpack/compare/f8ceffae0613b350f4d6bdd46c6c8633a40b4897...6ee21879488fe98612a4619b17f8b51e8da5215b
>---------------------------------------------------------------
commit 9768ed1e2fed54224f43481853055a1fee3c26a4
Author: ryan <ryan at ratml.org>
Date: Sun Jul 26 23:08:33 2015 -0400
Refactor utility functionality into RAUtil class.
>---------------------------------------------------------------
9768ed1e2fed54224f43481853055a1fee3c26a4
src/mlpack/methods/rann/ra_util.cpp | 182 ++++++++++++++++++++++++++++++++++++
src/mlpack/methods/rann/ra_util.hpp | 64 +++++++++++++
2 files changed, 246 insertions(+)
diff --git a/src/mlpack/methods/rann/ra_util.cpp b/src/mlpack/methods/rann/ra_util.cpp
new file mode 100644
index 0000000..579e03c
--- /dev/null
+++ b/src/mlpack/methods/rann/ra_util.cpp
@@ -0,0 +1,182 @@
+/**
+ * @file ra_util.cpp
+ * @author Parikshit Ram
+ * @author Ryan Curtin
+ *
+ * Utilities for rank-approximate neighbor search.
+ */
+#include "ra_util.hpp"
+
+using namespace mlpack;
+using namespace mlpack::neighbor;
+
+size_t mlpack::neighbor::RAUtil::MinimumSamplesReqd(const size_t n,
+ const size_t k,
+ const double tau,
+ const double alpha)
+{
+ size_t ub = n; // The upper bound on the binary search.
+ size_t lb = k; // The lower bound on the binary search.
+ size_t m = lb; // The minimum number of random samples.
+
+ // The rank-approximation.
+ const size_t t = (size_t) std::ceil(tau * (double) n / 100.0);
+
+ double prob;
+ Log::Assert(alpha <= 1.0);
+
+ // going through all values of sample sizes
+ // to find the minimum samples required to satisfy the
+ // desired bound
+ bool done = false;
+
+ // This performs a binary search on the integer values between 'lb = k'
+ // and 'ub = n' to find the minimum number of samples 'm' required to obtain
+ // the desired success probability 'alpha'.
+ do
+ {
+ prob = SuccessProbability(n, k, m, t);
+
+ if (prob > alpha)
+ {
+ if (prob - alpha < 0.001 || ub < lb + 2) {
+ done = true;
+ break;
+ }
+ else
+ ub = m;
+ }
+ else
+ {
+ if (prob < alpha)
+ {
+ if (m == lb)
+ {
+ m++;
+ continue;
+ }
+ else
+ lb = m;
+ }
+ else
+ {
+ done = true;
+ break;
+ }
+ }
+ m = (ub + lb) / 2;
+
+ } while (!done);
+
+ return (std::min(m + 1, n));
+}
+
+double mlpack::neighbor::RAUtil::SuccessProbability(const size_t n,
+ const size_t k,
+ const size_t m,
+ const size_t t)
+{
+ if (k == 1)
+ {
+ if (m > n - t)
+ return 1.0;
+
+ double eps = (double) t / (double) n;
+
+ return 1.0 - std::pow(1.0 - eps, (double) m);
+
+ } // Faster implementation for topK = 1.
+ else
+ {
+ if (m < k)
+ return 0.0;
+
+ if (m > n - t + k - 1)
+ return 1.0;
+
+ double eps = (double) t / (double) n;
+ double sum = 0.0;
+
+ // The probability that 'k' of the 'm' samples lie within the top 't'
+ // of the neighbors is given by:
+ // sum_{j = k}^m Choose(m, j) (t/n)^j (1 - t/n)^{m - j}
+ // which is also equal to
+ // 1 - sum_{j = 0}^{k - 1} Choose(m, j) (t/n)^j (1 - t/n)^{m - j}
+ //
+ // So this is a m - k term summation or a k term summation. So if
+ // m > 2k, do the k term summation, otherwise do the m term summation.
+
+ size_t lb;
+ size_t ub;
+ bool topHalf;
+
+ if (2 * k < m)
+ {
+ // Compute 1 - sum_{j = 0}^{k - 1} Choose(m, j) eps^j (1 - eps)^{m - j}
+ // eps = t/n.
+ //
+ // Choosing 'lb' as 1 and 'ub' as k so as to sum from 1 to (k - 1), and
+ // add the term (1 - eps)^m term separately.
+ lb = 1;
+ ub = k;
+ topHalf = true;
+ sum = std::pow(1 - eps, (double) m);
+ }
+ else
+ {
+ // Compute sum_{j = k}^m Choose(m, j) eps^j (1 - eps)^{m - j}
+ // eps = t/n.
+ //
+ // Choosing 'lb' as k and 'ub' as m so as to sum from k to (m - 1), and
+ // add the term eps^m term separately.
+ lb = k;
+ ub = m;
+ topHalf = false;
+ sum = std::pow(eps, (double) m);
+ }
+
+ for (size_t j = lb; j < ub; j++)
+ {
+ // Compute Choose(m, j).
+ double mCj = (double) m;
+ size_t jTrans;
+
+ // If j < m - j, compute Choose(m, j).
+ // If j > m - j, compute Choose(m, m - j).
+ if (topHalf)
+ jTrans = j;
+ else
+ jTrans = m - j;
+
+ for(size_t i = 2; i <= jTrans; i++)
+ {
+ mCj *= (double) (m - (i - 1));
+ mCj /= (double) i;
+ }
+
+ sum += (mCj * std::pow(eps, (double) j)
+ * std::pow(1.0 - eps, (double) (m - j)));
+ }
+
+ if (topHalf)
+ sum = 1.0 - sum;
+
+ return sum;
+ } // For k > 1.
+}
+
+void mlpack::neighbor::RAUtil::ObtainDistinctSamples(
+ const size_t numSamples,
+ const size_t rangeUpperBound,
+ arma::uvec& distinctSamples)
+{
+ // Keep track of the points that are sampled.
+ arma::Col<size_t> sampledPoints;
+ sampledPoints.zeros(rangeUpperBound);
+
+ for (size_t i = 0; i < numSamples; i++)
+ sampledPoints[(size_t) math::RandInt(rangeUpperBound)]++;
+
+ distinctSamples = arma::find(sampledPoints > 0);
+ return;
+}
diff --git a/src/mlpack/methods/rann/ra_util.hpp b/src/mlpack/methods/rann/ra_util.hpp
new file mode 100644
index 0000000..126e547
--- /dev/null
+++ b/src/mlpack/methods/rann/ra_util.hpp
@@ -0,0 +1,64 @@
+/**
+ * @file ra_util.hpp
+ * @author Parikshit Ram
+ * @author Ryan Curtin
+ *
+ * Utilities for rank-approximate neighbor search.
+ */
+#ifndef __MLPACK_METHODS_RANN_RA_UTIL_HPP
+#define __MLPACK_METHODS_RANN_RA_UTIL_HPP
+
+#include <mlpack/core.hpp>
+
+namespace mlpack {
+namespace neighbor {
+
+class RAUtil
+{
+ public:
+ /**
+ * Compute the minimum number of samples required to guarantee
+ * the given rank-approximation and success probability.
+ *
+ * @param n Size of the set to be sampled from.
+ * @param k The number of neighbors required within the rank-approximation.
+ * @param tau The rank-approximation in percentile of the data.
+ * @param alpha The success probability desired.
+ */
+ static size_t MinimumSamplesReqd(const size_t n,
+ const size_t k,
+ const double tau,
+ const double alpha);
+
+ /**
+ * Compute the success probability of obtaining 'k'-neighbors from a
+ * set of size 'n' within the top 't' neighbors if 'm' samples are made.
+ *
+ * @param n Size of the set being sampled from.
+ * @param k The number of neighbors required within the rank-approximation.
+ * @param m The number of random samples.
+ * @param t The desired rank-approximation.
+ */
+ static double SuccessProbability(const size_t n,
+ const size_t k,
+ const size_t m,
+ const size_t t);
+
+ /**
+ * Pick up desired number of samples (with replacement) from a given range
+ * of integers so that only the distinct samples are returned from
+ * the range [0 - specified upper bound)
+ *
+ * @param numSamples Number of random samples.
+ * @param rangeUpperBound The upper bound on the range of integers.
+ * @param distinctSamples The list of the distinct samples.
+ */
+ static void ObtainDistinctSamples(const size_t numSamples,
+ const size_t rangeUpperBound,
+ arma::uvec& distinctSamples);
+};
+
+} // namespace neighbor
+} // namespace mlpack
+
+#endif
More information about the mlpack-git
mailing list