[mlpack-git] master: Fixed VantagePointSplit removal. (8e6223c)
gitdub at mlpack.org
gitdub at mlpack.org
Fri Aug 12 14:09:01 EDT 2016
Repository : https://github.com/mlpack/mlpack
On branch : master
Link : https://github.com/mlpack/mlpack/compare/8a0ad2fbe4db5614a6aad27cfb5e101ae8b1db96...dc6bae4e8634486b384b67e3ae7a690f34bdc677
>---------------------------------------------------------------
commit 8e6223ce9cbc05b5cb68e6f4e7a96d8cb41bf5ff
Author: Mikhail Lozhnikov <lozhnikovma at gmail.com>
Date: Fri Aug 12 21:09:01 2016 +0300
Fixed VantagePointSplit removal.
>---------------------------------------------------------------
8e6223ce9cbc05b5cb68e6f4e7a96d8cb41bf5ff
.../tree/binary_space_tree/vantage_point_split.hpp | 157 ++++++++++++++
.../binary_space_tree/vantage_point_split_impl.hpp | 233 +++++++++++++++++++++
2 files changed, 390 insertions(+)
diff --git a/src/mlpack/core/tree/binary_space_tree/vantage_point_split.hpp b/src/mlpack/core/tree/binary_space_tree/vantage_point_split.hpp
new file mode 100644
index 0000000..0542433
--- /dev/null
+++ b/src/mlpack/core/tree/binary_space_tree/vantage_point_split.hpp
@@ -0,0 +1,157 @@
+/**
+ * @file vantage_point_split.hpp
+ * @author Mikhail Lozhnikov
+ *
+ * Definition of class VantagePointSplit, a class that splits a vantage point
+ * tree into two parts using the distance to a certain vantage point.
+ */
+#ifndef MLPACK_CORE_TREE_BINARY_SPACE_TREE_VANTAGE_POINT_SPLIT_HPP
+#define MLPACK_CORE_TREE_BINARY_SPACE_TREE_VANTAGE_POINT_SPLIT_HPP
+
+#include <mlpack/core.hpp>
+
+namespace mlpack {
+namespace tree /** Trees and tree-building procedures. */ {
+
+template<typename BoundType,
+ typename MatType = arma::mat,
+ size_t MaxNumSamples = 100>
+class VantagePointSplit
+{
+ public:
+ //! The matrix element type.
+ typedef typename MatType::elem_type ElemType;
+ //! The bounding shape type.
+ typedef typename BoundType::MetricType MetricType;
+
+ /**
+ * Split the node according to the distance to a vantage point.
+ *
+ * @param bound The bound used by the tree.
+ * @param data The dataset used by the tree.
+ * @param begin Index of the starting point in the dataset that belongs to
+ * this node.
+ * @param count Number of points in this node.
+ * @param splitCol The index at which the dataset is divided into two parts
+ * after the rearrangement.
+ */
+ static bool SplitNode(const BoundType& bound,
+ MatType& data,
+ const size_t begin,
+ const size_t count,
+ size_t& splitCol);
+
+ /**
+ * Split the node according to the distance to a vantage point.
+ *
+ * @param bound The bound used by the tree.
+ * @param data The dataset used by the tree.
+ * @param begin Index of the starting point in the dataset that belongs to
+ * this node.
+ * @param count Number of points in this node.
+ * @param splitCol The index at which the dataset is divided into two parts
+ * after the rearrangement.
+ * @param oldFromNew Vector which will be filled with the old positions for
+ * each new point.
+ */
+ static bool SplitNode(const BoundType& bound,
+ MatType& data,
+ const size_t begin,
+ const size_t count,
+ size_t& splitCol,
+ std::vector<size_t>& oldFromNew);
+ private:
+ /**
+ * Select the best vantage point, i.e., the point with the largest second
+ * moment of the distance from a number of random node points to the vantage
+ * point. Firstly this method selects no more than MaxNumSamples random
+ * points. Then it evaluates each point, i.e., calculates the corresponding
+ * second moment and selects the point with the largest moment. Each random
+ * point belongs to the node.
+ *
+ * @param metric The metric used by the tree.
+ * @param data The dataset used by the tree.
+ * @param begin Index of the starting point in the dataset that belongs to
+ * this node.
+ * @param count Number of points in this node.
+ * @param vantagePoint The index of the vantage point in the dataset.
+ * @param mu The median value of distance form the vantage point to
+ * a number of random points.
+ */
+ static void SelectVantagePoint(const MetricType& metric,
+ const MatType& data,
+ const size_t begin,
+ const size_t count,
+ size_t& vantagePoint,
+ ElemType& mu);
+
+ /**
+ * This method returns true if a point should be assigned to the left subtree,
+ * i.e., if the distance from the point to the vantage point is less then the
+ * median value. Otherwise it returns false.
+ *
+ * @param metric The metric used by the tree.
+ * @param data The dataset used by the tree.
+ * @param vantagePoint The vantage point.
+ * @param point The point that is being assigned.
+ * @param mu The median value.
+ */
+ template<typename VecType>
+ static bool AssignToLeftSubtree(const MetricType& metric,
+ const MatType& mat,
+ const VecType& vantagePoint,
+ const size_t point,
+ const ElemType mu)
+ {
+ return (metric.Evaluate(vantagePoint, mat.col(point)) < mu);
+ }
+
+ /**
+ * Perform split according to the median value and the vantage point.
+ *
+ * @param metric The metric used by the tree.
+ * @param data The dataset used by the tree.
+ * @param begin Index of the starting point in the dataset that belongs to
+ * this node.
+ * @param count Number of points in this node.
+ * @param vantagePoint The vantage point.
+ * @param mu The median value.
+ */
+ template<typename VecType>
+ static size_t PerformSplit(const MetricType& metric,
+ MatType& data,
+ const size_t begin,
+ const size_t count,
+ const VecType& vantagePoint,
+ const ElemType mu);
+
+ /**
+ * Perform split according to the median value and the vantage point.
+ *
+ * @param metric The metric used by the tree.
+ * @param data The dataset used by the tree.
+ * @param begin Index of the starting point in the dataset that belongs to
+ * this node.
+ * @param count Number of points in this node.
+ * @param vantagePoint The vantage point.
+ * @param mu The median value.
+ * @param oldFromNew Vector which will be filled with the old positions for
+ * each new point.
+ */
+ template<typename VecType>
+ static size_t PerformSplit(const MetricType& metric,
+ MatType& data,
+ const size_t begin,
+ const size_t count,
+ const VecType& vantagePoint,
+ const ElemType mu,
+ std::vector<size_t>& oldFromNew);
+};
+
+} // namespace tree
+} // namespace mlpack
+
+// Include implementation.
+#include "vantage_point_split_impl.hpp"
+
+#endif // MLPACK_CORE_TREE_BINARY_SPACE_TREE_VANTAGE_POINT_SPLIT_HPP
diff --git a/src/mlpack/core/tree/binary_space_tree/vantage_point_split_impl.hpp b/src/mlpack/core/tree/binary_space_tree/vantage_point_split_impl.hpp
new file mode 100644
index 0000000..92e00b9
--- /dev/null
+++ b/src/mlpack/core/tree/binary_space_tree/vantage_point_split_impl.hpp
@@ -0,0 +1,233 @@
+/**
+ * @file vantage_point_split_impl.hpp
+ * @author Mikhail Lozhnikov
+ *
+ * Implementation of class (VantagePointSplit) to split a vantage point
+ * tree according to the median value of the distance to a certain vantage point.
+ */
+#ifndef MLPACK_CORE_TREE_BINARY_SPACE_TREE_VANTAGE_POINT_SPLIT_IMPL_HPP
+#define MLPACK_CORE_TREE_BINARY_SPACE_TREE_VANTAGE_POINT_SPLIT_IMPL_HPP
+
+#include "vantage_point_split.hpp"
+#include <mlpack/core/tree/bounds.hpp>
+
+namespace mlpack {
+namespace tree {
+
+template<typename BoundType, typename MatType, size_t MaxNumSamples>
+bool VantagePointSplit<BoundType, MatType, MaxNumSamples>::
+SplitNode(const BoundType& bound, MatType& data, const size_t begin,
+ const size_t count, size_t& splitCol)
+{
+ ElemType mu = 0;
+ size_t vantagePointIndex;
+
+ // Find the best vantage point.
+ SelectVantagePoint(bound.Metric(), data, begin, count, vantagePointIndex, mu);
+
+ // If all points are equal, we can't split.
+ if (mu == 0)
+ return false;
+
+ // The first point of the left child is centroid.
+ data.swap_cols(begin, vantagePointIndex);
+
+ arma::Col<ElemType> vantagePoint = data.col(begin);
+ splitCol = PerformSplit(bound.Metric(), data, begin, count, vantagePoint, mu);
+
+ assert(splitCol > begin);
+ assert(splitCol < begin + count);
+ return true;
+}
+
+template<typename BoundType, typename MatType, size_t MaxNumSamples>
+bool VantagePointSplit<BoundType, MatType, MaxNumSamples>::
+SplitNode(const BoundType& bound, MatType& data, const size_t begin,
+ const size_t count, size_t& splitCol, std::vector<size_t>& oldFromNew)
+{
+ ElemType mu = 0;
+ size_t vantagePointIndex;
+
+ // Find the best vantage point.
+ SelectVantagePoint(bound.Metric(), data, begin, count, vantagePointIndex, mu);
+
+ // If all points are equal, we can't split.
+ if (mu == 0)
+ return false;
+
+ // The first point of the left child is centroid.
+ data.swap_cols(begin, vantagePointIndex);
+ const size_t t = oldFromNew[begin];
+ oldFromNew[begin] = oldFromNew[vantagePointIndex];
+ oldFromNew[vantagePointIndex] = t;
+
+ arma::Col<ElemType> vantagePoint = data.col(begin);
+
+ splitCol = PerformSplit(bound.Metric(), data, begin, count, vantagePoint, mu,
+ oldFromNew);
+
+ assert(splitCol > begin);
+ assert(splitCol < begin + count);
+ return true;
+}
+
+template<typename BoundType, typename MatType, size_t MaxNumSamples>
+template <typename VecType>
+size_t VantagePointSplit<BoundType, MatType, MaxNumSamples>::PerformSplit(
+ const MetricType& metric,
+ MatType& data,
+ const size_t begin,
+ const size_t count,
+ const VecType& vantagePoint,
+ const ElemType mu)
+{
+ // This method modifies the input dataset. We loop both from the left and
+ // right sides of the points contained in this node. The points closer to
+ // the vantage point should be on the left side of the matrix, and the farther
+ // from the vantage point should be on the right side of the matrix.
+ size_t left = begin;
+ size_t right = begin + count - 1;
+
+ // First half-iteration of the loop is out here because the termination
+ // condition is in the middle.
+ while (AssignToLeftSubtree(metric, data, vantagePoint, left, mu) &&
+ (left <= right))
+ left++;
+
+ while ((!AssignToLeftSubtree(metric, data, vantagePoint, right, mu)) &&
+ (left <= right) && (right > 0))
+ right--;
+
+ while (left <= right)
+ {
+ // Swap columns.
+ data.swap_cols(left, right);
+
+ // See how many points on the left are correct. When they are correct,
+ // increase the left counter accordingly. When we encounter one that isn't
+ // correct, stop. We will switch it later.
+ while ((AssignToLeftSubtree(metric, data, vantagePoint, left, mu)) &&
+ (left <= right))
+ left++;
+
+ // Now see how many points on the right are correct. When they are correct,
+ // decrease the right counter accordingly. When we encounter one that isn't
+ // correct, stop. We will switch it with the wrong point we found in the
+ // previous loop.
+ while ((!AssignToLeftSubtree(metric, data, vantagePoint, right, mu)) &&
+ (left <= right))
+ right--;
+ }
+
+ Log::Assert(left == right + 1);
+
+ return left;
+}
+
+template<typename BoundType, typename MatType, size_t MaxNumSamples>
+template<typename VecType>
+size_t VantagePointSplit<BoundType, MatType, MaxNumSamples>::PerformSplit(
+ const MetricType& metric,
+ MatType& data,
+ const size_t begin,
+ const size_t count,
+ const VecType& vantagePoint,
+ const ElemType mu,
+ std::vector<size_t>& oldFromNew)
+{
+ // This method modifies the input dataset. We loop both from the left and
+ // right sides of the points contained in this node. The points closer to
+ // the vantage point should be on the left side of the matrix, and the farther
+ // from the vantage point should be on the right side of the matrix.
+ size_t left = begin;
+ size_t right = begin + count - 1;
+
+ // First half-iteration of the loop is out here because the termination
+ // condition is in the middle.
+
+ while (AssignToLeftSubtree(metric, data, vantagePoint, left, mu) &&
+ (left <= right))
+ left++;
+
+ while ((!AssignToLeftSubtree(metric, data, vantagePoint, right, mu)) &&
+ (left <= right) && (right > 0))
+ right--;
+
+ while (left <= right)
+ {
+ // Swap columns.
+ data.swap_cols(left, right);
+
+ // Update the indices for what we changed.
+ size_t t = oldFromNew[left];
+ oldFromNew[left] = oldFromNew[right];
+ oldFromNew[right] = t;
+
+ // See how many points on the left are correct. When they are correct,
+ // increase the left counter accordingly. When we encounter one that isn't
+ // correct, stop. We will switch it later.
+ while (AssignToLeftSubtree(metric, data, vantagePoint, left, mu) &&
+ (left <= right))
+ left++;
+
+ // Now see how many points on the right are correct. When they are correct,
+ // decrease the right counter accordingly. When we encounter one that isn't
+ // correct, stop. We will switch it with the wrong point we found in the
+ // previous loop.
+ while ((!AssignToLeftSubtree(metric, data, vantagePoint, right, mu)) &&
+ (left <= right))
+ right--;
+ }
+
+ Log::Assert(left == right + 1);
+
+ return left;
+}
+
+template<typename BoundType, typename MatType, size_t MaxNumSamples>
+void VantagePointSplit<BoundType, MatType, MaxNumSamples>::
+SelectVantagePoint(const MetricType& metric, const MatType& data,
+ const size_t begin, const size_t count, size_t& vantagePoint, ElemType& mu)
+{
+ arma::uvec vantagePointCandidates;
+ arma::Col<ElemType> distances(MaxNumSamples);
+
+ // Get no more than max(MaxNumSamples, count) vantage point candidates
+ math::ObtainDistinctSamples(begin, begin + count, MaxNumSamples,
+ vantagePointCandidates);
+
+ ElemType bestSpread = 0;
+
+ arma::uvec samples;
+ // Evaluate each candidate
+ for (size_t i = 0; i < vantagePointCandidates.n_elem; i++)
+ {
+ // Get no more than min(MaxNumSamples, count) random samples
+ math::ObtainDistinctSamples(begin, begin + count, MaxNumSamples, samples);
+
+ // Calculate the second moment of the distance to the vantage point
+ // candidate using these random samples.
+ distances.set_size(samples.n_elem);
+
+ for (size_t j = 0; j < samples.n_elem; j++)
+ distances[j] = metric.Evaluate(data.col(vantagePointCandidates[i]),
+ data.col(samples[j]));
+
+ const ElemType spread = arma::sum(distances % distances) / samples.n_elem;
+
+ if (spread > bestSpread)
+ {
+ bestSpread = spread;
+ vantagePoint = vantagePointCandidates[i];
+ // Calculate the median value of the distance from the vantage point
+ // candidate to these samples.
+ mu = arma::median(distances);
+ }
+ }
+ assert(bestSpread > 0);
+}
+
+} // namespace tree
+} // namespace mlpack
+
+#endif // MLPACK_CORE_TREE_BINARY_SPACE_TREE_VANTAGE_POINT_SPLIT_IMPL_HPP
More information about the mlpack-git
mailing list