[mlpack-git] master: Fixed VantagePointSplit removal. (8e6223c)

gitdub at mlpack.org gitdub at mlpack.org
Fri Aug 12 14:09:01 EDT 2016


Repository : https://github.com/mlpack/mlpack
On branch  : master
Link       : https://github.com/mlpack/mlpack/compare/8a0ad2fbe4db5614a6aad27cfb5e101ae8b1db96...dc6bae4e8634486b384b67e3ae7a690f34bdc677

>---------------------------------------------------------------

commit 8e6223ce9cbc05b5cb68e6f4e7a96d8cb41bf5ff
Author: Mikhail Lozhnikov <lozhnikovma at gmail.com>
Date:   Fri Aug 12 21:09:01 2016 +0300

    Fixed VantagePointSplit removal.


>---------------------------------------------------------------

8e6223ce9cbc05b5cb68e6f4e7a96d8cb41bf5ff
 .../tree/binary_space_tree/vantage_point_split.hpp | 157 ++++++++++++++
 .../binary_space_tree/vantage_point_split_impl.hpp | 233 +++++++++++++++++++++
 2 files changed, 390 insertions(+)

diff --git a/src/mlpack/core/tree/binary_space_tree/vantage_point_split.hpp b/src/mlpack/core/tree/binary_space_tree/vantage_point_split.hpp
new file mode 100644
index 0000000..0542433
--- /dev/null
+++ b/src/mlpack/core/tree/binary_space_tree/vantage_point_split.hpp
@@ -0,0 +1,157 @@
+/**
+ * @file vantage_point_split.hpp
+ * @author Mikhail Lozhnikov
+ *
+ * Definition of class VantagePointSplit, a class that splits a vantage point
+ * tree into two parts using the distance to a certain vantage point.
+ */
+#ifndef MLPACK_CORE_TREE_BINARY_SPACE_TREE_VANTAGE_POINT_SPLIT_HPP
+#define MLPACK_CORE_TREE_BINARY_SPACE_TREE_VANTAGE_POINT_SPLIT_HPP
+
+#include <mlpack/core.hpp>
+
+namespace mlpack {
+namespace tree /** Trees and tree-building procedures. */ {
+
+template<typename BoundType,
+         typename MatType = arma::mat,
+         size_t MaxNumSamples = 100>
+class VantagePointSplit
+{
+ public:
+  //! The matrix element type.
+  typedef typename MatType::elem_type ElemType;
+  //! The bounding shape type.
+  typedef typename BoundType::MetricType MetricType;
+
+  /**
+   * Split the node according to the distance to a vantage point.
+   *
+   * @param bound The bound used by the tree.
+   * @param data The dataset used by the tree.
+   * @param begin Index of the starting point in the dataset that belongs to
+   *    this node.
+   * @param count Number of points in this node.
+   * @param splitCol The index at which the dataset is divided into two parts
+   *    after the rearrangement.
+   */
+  static bool SplitNode(const BoundType& bound,
+                        MatType& data,
+                        const size_t begin,
+                        const size_t count,
+                        size_t& splitCol);
+
+  /**
+   * Split the node according to the distance to a vantage point.
+   *
+   * @param bound The bound used by the tree.
+   * @param data The dataset used by the tree.
+   * @param begin Index of the starting point in the dataset that belongs to
+   *    this node.
+   * @param count Number of points in this node.
+   * @param splitCol The index at which the dataset is divided into two parts
+   *    after the rearrangement.
+   * @param oldFromNew Vector which will be filled with the old positions for
+   *    each new point.
+   */
+  static bool SplitNode(const BoundType& bound,
+                        MatType& data,
+                        const size_t begin,
+                        const size_t count,
+                        size_t& splitCol,
+                        std::vector<size_t>& oldFromNew);
+ private:
+  /**
+   * Select the best vantage point, i.e., the point with the largest second
+   * moment of the distance from a number of random node points to the vantage
+   * point.  Firstly this method selects no more than MaxNumSamples random
+   * points.  Then it evaluates each point, i.e., calculates the corresponding
+   * second moment and selects the point with the largest moment. Each random
+   * point belongs to the node.
+   *
+   * @param metric The metric used by the tree.
+   * @param data The dataset used by the tree.
+   * @param begin Index of the starting point in the dataset that belongs to
+   *    this node.
+   * @param count Number of points in this node.
+   * @param vantagePoint The index of the vantage point in the dataset.
+   * @param mu The median value of distance form the vantage point to
+   * a number of random points.
+   */
+  static void SelectVantagePoint(const MetricType& metric,
+                                 const MatType& data,
+                                 const size_t begin,
+                                 const size_t count,
+                                 size_t& vantagePoint,
+                                 ElemType& mu);
+
+  /**
+   * This method returns true if a point should be assigned to the left subtree,
+   * i.e., if the distance from the point to the vantage point is less then the
+   * median value. Otherwise it returns false.
+   *
+   * @param metric The metric used by the tree.
+   * @param data The dataset used by the tree.
+   * @param vantagePoint The vantage point.
+   * @param point The point that is being assigned.
+   * @param mu The median value.
+   */
+  template<typename VecType>
+  static bool AssignToLeftSubtree(const MetricType& metric,
+                                  const MatType& mat,
+                                  const VecType& vantagePoint,
+                                  const size_t point,
+                                  const ElemType mu)
+  {
+    return (metric.Evaluate(vantagePoint, mat.col(point)) < mu);
+  }
+
+  /**
+   * Perform split according to the median value and the vantage point.
+   *
+   * @param metric The metric used by the tree.
+   * @param data The dataset used by the tree.
+   * @param begin Index of the starting point in the dataset that belongs to
+   *      this node.
+   * @param count Number of points in this node.
+   * @param vantagePoint The vantage point.
+   * @param mu The median value.
+   */
+  template<typename VecType>
+  static size_t PerformSplit(const MetricType& metric,
+                             MatType& data,
+                             const size_t begin,
+                             const size_t count,
+                             const VecType& vantagePoint,
+                             const ElemType mu);
+
+  /**
+   * Perform split according to the median value and the vantage point.
+   *
+   * @param metric The metric used by the tree.
+   * @param data The dataset used by the tree.
+   * @param begin Index of the starting point in the dataset that belongs to
+   *    this node.
+   * @param count Number of points in this node.
+   * @param vantagePoint The vantage point.
+   * @param mu The median value.
+   * @param oldFromNew Vector which will be filled with the old positions for
+   *    each new point.
+   */
+  template<typename VecType>
+  static size_t PerformSplit(const MetricType& metric,
+                             MatType& data,
+                             const size_t begin,
+                             const size_t count,
+                             const VecType& vantagePoint,
+                             const ElemType mu,
+                             std::vector<size_t>& oldFromNew);
+};
+
+} // namespace tree
+} // namespace mlpack
+
+// Include implementation.
+#include "vantage_point_split_impl.hpp"
+
+#endif  //  MLPACK_CORE_TREE_BINARY_SPACE_TREE_VANTAGE_POINT_SPLIT_HPP
diff --git a/src/mlpack/core/tree/binary_space_tree/vantage_point_split_impl.hpp b/src/mlpack/core/tree/binary_space_tree/vantage_point_split_impl.hpp
new file mode 100644
index 0000000..92e00b9
--- /dev/null
+++ b/src/mlpack/core/tree/binary_space_tree/vantage_point_split_impl.hpp
@@ -0,0 +1,233 @@
+/**
+ * @file vantage_point_split_impl.hpp
+ * @author Mikhail Lozhnikov
+ *
+ * Implementation of class (VantagePointSplit) to split a vantage point
+ * tree according to the median value of the distance to a certain vantage point.
+ */
+#ifndef MLPACK_CORE_TREE_BINARY_SPACE_TREE_VANTAGE_POINT_SPLIT_IMPL_HPP
+#define MLPACK_CORE_TREE_BINARY_SPACE_TREE_VANTAGE_POINT_SPLIT_IMPL_HPP
+
+#include "vantage_point_split.hpp"
+#include <mlpack/core/tree/bounds.hpp>
+
+namespace mlpack {
+namespace tree {
+
+template<typename BoundType, typename MatType, size_t MaxNumSamples>
+bool VantagePointSplit<BoundType, MatType, MaxNumSamples>::
+SplitNode(const BoundType& bound, MatType& data, const size_t begin,
+    const size_t count, size_t& splitCol)
+{
+  ElemType mu = 0;
+  size_t vantagePointIndex;
+
+  // Find the best vantage point.
+  SelectVantagePoint(bound.Metric(), data, begin, count, vantagePointIndex, mu);
+
+  // If all points are equal, we can't split.
+  if (mu == 0)
+    return false;
+
+  // The first point of the left child is centroid.
+  data.swap_cols(begin, vantagePointIndex);
+
+  arma::Col<ElemType> vantagePoint = data.col(begin);
+  splitCol = PerformSplit(bound.Metric(), data, begin, count, vantagePoint, mu);
+
+  assert(splitCol > begin);
+  assert(splitCol < begin + count);
+  return true;
+}
+
+template<typename BoundType, typename MatType, size_t MaxNumSamples>
+bool VantagePointSplit<BoundType, MatType, MaxNumSamples>::
+SplitNode(const BoundType& bound, MatType& data, const size_t begin,
+    const size_t count, size_t& splitCol, std::vector<size_t>& oldFromNew)
+{
+  ElemType mu = 0;
+  size_t vantagePointIndex;
+
+  // Find the best vantage point.
+  SelectVantagePoint(bound.Metric(), data, begin, count, vantagePointIndex, mu);
+
+  // If all points are equal, we can't split.
+  if (mu == 0)
+    return false;
+
+  // The first point of the left child is centroid.
+  data.swap_cols(begin, vantagePointIndex);
+  const size_t t = oldFromNew[begin];
+  oldFromNew[begin] = oldFromNew[vantagePointIndex];
+  oldFromNew[vantagePointIndex] = t;
+
+  arma::Col<ElemType> vantagePoint = data.col(begin);
+
+  splitCol = PerformSplit(bound.Metric(), data, begin, count, vantagePoint, mu,
+      oldFromNew);
+
+  assert(splitCol > begin);
+  assert(splitCol < begin + count);
+  return true;
+}
+
+template<typename BoundType, typename MatType, size_t MaxNumSamples>
+template <typename VecType>
+size_t VantagePointSplit<BoundType, MatType, MaxNumSamples>::PerformSplit(
+    const MetricType& metric,
+    MatType& data,
+    const size_t begin,
+    const size_t count,
+    const VecType& vantagePoint,
+    const ElemType mu)
+{
+  // This method modifies the input dataset.  We loop both from the left and
+  // right sides of the points contained in this node.  The points closer to
+  // the vantage point should be on the left side of the matrix, and the farther
+  // from the vantage point should be on the right side of the matrix.
+  size_t left = begin;
+  size_t right = begin + count - 1;
+
+  // First half-iteration of the loop is out here because the termination
+  // condition is in the middle.
+  while (AssignToLeftSubtree(metric, data, vantagePoint, left, mu) &&
+      (left <= right))
+    left++;
+
+  while ((!AssignToLeftSubtree(metric, data, vantagePoint, right, mu)) &&
+      (left <= right) && (right > 0))
+    right--;
+
+  while (left <= right)
+  {
+    // Swap columns.
+    data.swap_cols(left, right);
+
+    // See how many points on the left are correct.  When they are correct,
+    // increase the left counter accordingly.  When we encounter one that isn't
+    // correct, stop.  We will switch it later.
+    while ((AssignToLeftSubtree(metric, data, vantagePoint, left, mu)) &&
+        (left <= right))
+      left++;
+
+    // Now see how many points on the right are correct.  When they are correct,
+    // decrease the right counter accordingly.  When we encounter one that isn't
+    // correct, stop.  We will switch it with the wrong point we found in the
+    // previous loop.
+    while ((!AssignToLeftSubtree(metric, data, vantagePoint, right, mu)) &&
+        (left <= right))
+      right--;
+  }
+
+  Log::Assert(left == right + 1);
+
+  return left;
+}
+
+template<typename BoundType, typename MatType, size_t MaxNumSamples>
+template<typename VecType>
+size_t VantagePointSplit<BoundType, MatType, MaxNumSamples>::PerformSplit(
+    const MetricType& metric,
+    MatType& data,
+    const size_t begin,
+    const size_t count,
+    const VecType& vantagePoint,
+    const ElemType mu,
+    std::vector<size_t>& oldFromNew)
+{
+  // This method modifies the input dataset.  We loop both from the left and
+  // right sides of the points contained in this node.  The points closer to
+  // the vantage point should be on the left side of the matrix, and the farther
+  // from the vantage point should be on the right side of the matrix.
+  size_t left = begin;
+  size_t right = begin + count - 1;
+
+  // First half-iteration of the loop is out here because the termination
+  // condition is in the middle.
+
+  while (AssignToLeftSubtree(metric, data, vantagePoint, left, mu) &&
+      (left <= right))
+    left++;
+
+  while ((!AssignToLeftSubtree(metric, data, vantagePoint, right, mu)) &&
+      (left <= right) && (right > 0))
+    right--;
+
+  while (left <= right)
+  {
+    // Swap columns.
+    data.swap_cols(left, right);
+
+    // Update the indices for what we changed.
+    size_t t = oldFromNew[left];
+    oldFromNew[left] = oldFromNew[right];
+    oldFromNew[right] = t;
+
+    // See how many points on the left are correct.  When they are correct,
+    // increase the left counter accordingly.  When we encounter one that isn't
+    // correct, stop.  We will switch it later.
+    while (AssignToLeftSubtree(metric, data, vantagePoint, left, mu) &&
+        (left <= right))
+      left++;
+
+    // Now see how many points on the right are correct.  When they are correct,
+    // decrease the right counter accordingly.  When we encounter one that isn't
+    // correct, stop.  We will switch it with the wrong point we found in the
+    // previous loop.
+    while ((!AssignToLeftSubtree(metric, data, vantagePoint, right, mu)) &&
+        (left <= right))
+      right--;
+  }
+
+  Log::Assert(left == right + 1);
+
+  return left;
+}
+
+template<typename BoundType, typename MatType, size_t MaxNumSamples>
+void VantagePointSplit<BoundType, MatType, MaxNumSamples>::
+SelectVantagePoint(const MetricType& metric, const MatType& data,
+    const size_t begin, const size_t count, size_t& vantagePoint, ElemType& mu)
+{
+  arma::uvec vantagePointCandidates;
+  arma::Col<ElemType> distances(MaxNumSamples);
+
+  // Get no more than max(MaxNumSamples, count) vantage point candidates
+  math::ObtainDistinctSamples(begin, begin + count, MaxNumSamples,
+      vantagePointCandidates);
+
+  ElemType bestSpread = 0;
+
+  arma::uvec samples;
+  //  Evaluate each candidate
+  for (size_t i = 0; i < vantagePointCandidates.n_elem; i++)
+  {
+    // Get no more than min(MaxNumSamples, count) random samples
+    math::ObtainDistinctSamples(begin, begin + count, MaxNumSamples, samples);
+
+    // Calculate the second moment of the distance to the vantage point
+    // candidate using these random samples.
+    distances.set_size(samples.n_elem);
+
+    for (size_t j = 0; j < samples.n_elem; j++)
+      distances[j] = metric.Evaluate(data.col(vantagePointCandidates[i]),
+          data.col(samples[j]));
+
+    const ElemType spread = arma::sum(distances % distances) / samples.n_elem;
+
+    if (spread > bestSpread)
+    {
+      bestSpread = spread;
+      vantagePoint = vantagePointCandidates[i];
+      // Calculate the median value of the distance from the vantage point
+      // candidate to these samples.
+      mu = arma::median(distances);
+    }
+  }
+  assert(bestSpread > 0);
+}
+
+} // namespace tree
+} // namespace mlpack
+
+#endif  // MLPACK_CORE_TREE_BINARY_SPACE_TREE_VANTAGE_POINT_SPLIT_IMPL_HPP




More information about the mlpack-git mailing list