[mlpack-git] master: Refactor utility functionality into RAUtil class. (9768ed1)

gitdub at big.cc.gt.atl.ga.us gitdub at big.cc.gt.atl.ga.us
Wed Jul 29 16:42:32 EDT 2015


Repository : https://github.com/mlpack/mlpack

On branch  : master
Link       : https://github.com/mlpack/mlpack/compare/f8ceffae0613b350f4d6bdd46c6c8633a40b4897...6ee21879488fe98612a4619b17f8b51e8da5215b

>---------------------------------------------------------------

commit 9768ed1e2fed54224f43481853055a1fee3c26a4
Author: ryan <ryan at ratml.org>
Date:   Sun Jul 26 23:08:33 2015 -0400

    Refactor utility functionality into RAUtil class.


>---------------------------------------------------------------

9768ed1e2fed54224f43481853055a1fee3c26a4
 src/mlpack/methods/rann/ra_util.cpp | 182 ++++++++++++++++++++++++++++++++++++
 src/mlpack/methods/rann/ra_util.hpp |  64 +++++++++++++
 2 files changed, 246 insertions(+)

diff --git a/src/mlpack/methods/rann/ra_util.cpp b/src/mlpack/methods/rann/ra_util.cpp
new file mode 100644
index 0000000..579e03c
--- /dev/null
+++ b/src/mlpack/methods/rann/ra_util.cpp
@@ -0,0 +1,182 @@
+/**
+ * @file ra_util.cpp
+ * @author Parikshit Ram
+ * @author Ryan Curtin
+ *
+ * Utilities for rank-approximate neighbor search.
+ */
+#include "ra_util.hpp"
+
+using namespace mlpack;
+using namespace mlpack::neighbor;
+
+size_t mlpack::neighbor::RAUtil::MinimumSamplesReqd(const size_t n,
+                                                    const size_t k,
+                                                    const double tau,
+                                                    const double alpha)
+{
+  size_t ub = n; // The upper bound on the binary search.
+  size_t lb = k; // The lower bound on the binary search.
+  size_t  m = lb; // The minimum number of random samples.
+
+  // The rank-approximation.
+  const size_t t = (size_t) std::ceil(tau * (double) n / 100.0);
+
+  double prob;
+  Log::Assert(alpha <= 1.0);
+
+  // going through all values of sample sizes
+  // to find the minimum samples required to satisfy the
+  // desired bound
+  bool done = false;
+
+  // This performs a binary search on the integer values between 'lb = k'
+  // and 'ub = n' to find the minimum number of samples 'm' required to obtain
+  // the desired success probability 'alpha'.
+  do
+  {
+    prob = SuccessProbability(n, k, m, t);
+
+    if (prob > alpha)
+    {
+      if (prob - alpha < 0.001 || ub < lb + 2) {
+        done = true;
+        break;
+      }
+      else
+        ub = m;
+    }
+    else
+    {
+      if (prob < alpha)
+      {
+        if (m == lb)
+        {
+          m++;
+          continue;
+        }
+        else
+          lb = m;
+      }
+      else
+      {
+        done = true;
+        break;
+      }
+    }
+    m = (ub + lb) / 2;
+
+  } while (!done);
+
+  return (std::min(m + 1, n));
+}
+
+double mlpack::neighbor::RAUtil::SuccessProbability(const size_t n,
+                                                    const size_t k,
+                                                    const size_t m,
+                                                    const size_t t)
+{
+  if (k == 1)
+  {
+    if (m > n - t)
+      return 1.0;
+
+    double eps = (double) t / (double) n;
+
+    return 1.0 - std::pow(1.0 - eps, (double) m);
+
+  } // Faster implementation for topK = 1.
+  else
+  {
+    if (m < k)
+      return 0.0;
+
+    if (m > n - t + k - 1)
+      return 1.0;
+
+    double eps = (double) t / (double) n;
+    double sum = 0.0;
+
+    // The probability that 'k' of the 'm' samples lie within the top 't'
+    // of the neighbors is given by:
+    // sum_{j = k}^m Choose(m, j) (t/n)^j (1 - t/n)^{m - j}
+    // which is also equal to
+    // 1 - sum_{j = 0}^{k - 1} Choose(m, j) (t/n)^j (1 - t/n)^{m - j}
+    //
+    // So this is a m - k term summation or a k term summation. So if
+    // m > 2k, do the k term summation, otherwise do the m term summation.
+
+    size_t lb;
+    size_t ub;
+    bool topHalf;
+
+    if (2 * k < m)
+    {
+      // Compute 1 - sum_{j = 0}^{k - 1} Choose(m, j) eps^j (1 - eps)^{m - j}
+      // eps = t/n.
+      //
+      // Choosing 'lb' as 1 and 'ub' as k so as to sum from 1 to (k - 1), and
+      // add the term (1 - eps)^m term separately.
+      lb = 1;
+      ub = k;
+      topHalf = true;
+      sum = std::pow(1 - eps, (double) m);
+    }
+    else
+    {
+      // Compute sum_{j = k}^m Choose(m, j) eps^j (1 - eps)^{m - j}
+      // eps = t/n.
+      //
+      // Choosing 'lb' as k and 'ub' as m so as to sum from k to (m - 1), and
+      // add the term eps^m term separately.
+      lb = k;
+      ub = m;
+      topHalf = false;
+      sum = std::pow(eps, (double) m);
+    }
+
+    for (size_t j = lb; j < ub; j++)
+    {
+      // Compute Choose(m, j).
+      double mCj = (double) m;
+      size_t jTrans;
+
+      // If j < m - j, compute Choose(m, j).
+      // If j > m - j, compute Choose(m, m - j).
+      if (topHalf)
+        jTrans = j;
+      else
+        jTrans = m - j;
+
+      for(size_t i = 2; i <= jTrans; i++)
+      {
+        mCj *= (double) (m - (i - 1));
+        mCj /= (double) i;
+      }
+
+      sum += (mCj * std::pow(eps, (double) j)
+              * std::pow(1.0 - eps, (double) (m - j)));
+    }
+
+    if (topHalf)
+      sum = 1.0 - sum;
+
+    return sum;
+  } // For k > 1.
+}
+
+void mlpack::neighbor::RAUtil::ObtainDistinctSamples(
+    const size_t numSamples,
+    const size_t rangeUpperBound,
+    arma::uvec& distinctSamples)
+{
+  // Keep track of the points that are sampled.
+  arma::Col<size_t> sampledPoints;
+  sampledPoints.zeros(rangeUpperBound);
+
+  for (size_t i = 0; i < numSamples; i++)
+    sampledPoints[(size_t) math::RandInt(rangeUpperBound)]++;
+
+  distinctSamples = arma::find(sampledPoints > 0);
+  return;
+}
diff --git a/src/mlpack/methods/rann/ra_util.hpp b/src/mlpack/methods/rann/ra_util.hpp
new file mode 100644
index 0000000..126e547
--- /dev/null
+++ b/src/mlpack/methods/rann/ra_util.hpp
@@ -0,0 +1,64 @@
+/**
+ * @file ra_util.hpp
+ * @author Parikshit Ram
+ * @author Ryan Curtin
+ *
+ * Utilities for rank-approximate neighbor search.
+ */
+#ifndef __MLPACK_METHODS_RANN_RA_UTIL_HPP
+#define __MLPACK_METHODS_RANN_RA_UTIL_HPP
+
+#include <mlpack/core.hpp>
+
+namespace mlpack {
+namespace neighbor {
+
+class RAUtil
+{
+ public:
+  /**
+   * Compute the minimum number of samples required to guarantee
+   * the given rank-approximation and success probability.
+   *
+   * @param n Size of the set to be sampled from.
+   * @param k The number of neighbors required within the rank-approximation.
+   * @param tau The rank-approximation in percentile of the data.
+   * @param alpha The success probability desired.
+   */
+  static size_t MinimumSamplesReqd(const size_t n,
+                                   const size_t k,
+                                   const double tau,
+                                   const double alpha);
+
+  /**
+   * Compute the success probability of obtaining 'k'-neighbors from a
+   * set of size 'n' within the top 't' neighbors if 'm' samples are made.
+   *
+   * @param n Size of the set being sampled from.
+   * @param k The number of neighbors required within the rank-approximation.
+   * @param m The number of random samples.
+   * @param t The desired rank-approximation.
+   */
+  static double SuccessProbability(const size_t n,
+                                   const size_t k,
+                                   const size_t m,
+                                   const size_t t);
+
+  /**
+   * Pick up desired number of samples (with replacement) from a given range
+   * of integers so that only the distinct samples are returned from
+   * the range [0 - specified upper bound)
+   *
+   * @param numSamples Number of random samples.
+   * @param rangeUpperBound The upper bound on the range of integers.
+   * @param distinctSamples The list of the distinct samples.
+   */
+  static void ObtainDistinctSamples(const size_t numSamples,
+                                    const size_t rangeUpperBound,
+                                    arma::uvec& distinctSamples);
+};
+
+} // namespace neighbor
+} // namespace mlpack
+
+#endif



More information about the mlpack-git mailing list