[mlpack-git] master: Various rp-tree fixes. Fixed debug build. (ea66e5d)

gitdub at mlpack.org gitdub at mlpack.org
Tue Aug 16 13:24:05 EDT 2016


Repository : https://github.com/mlpack/mlpack
On branch  : master
Link       : https://github.com/mlpack/mlpack/compare/a7794bde8082c691553152393e1e230098f5e920...87776e52cf9ead63fa458118a0cfd2fe46b23466

>---------------------------------------------------------------

commit ea66e5d17914460289974cc61f0669941edc2524
Author: Mikhail Lozhnikov <lozhnikovma at gmail.com>
Date:   Tue Aug 16 20:24:05 2016 +0300

    Various rp-tree fixes. Fixed debug build.


>---------------------------------------------------------------

ea66e5d17914460289974cc61f0669941edc2524
 .../binary_space_tree/binary_space_tree_impl.hpp   |  8 ++++-
 .../tree/binary_space_tree/rp_tree_max_split.hpp   | 14 --------
 .../binary_space_tree/rp_tree_max_split_impl.hpp   | 38 +++++++---------------
 .../tree/binary_space_tree/rp_tree_mean_split.hpp  | 14 --------
 .../binary_space_tree/rp_tree_mean_split_impl.hpp  | 37 ++++++++-------------
 .../binary_space_tree/vantage_point_split_impl.hpp |  2 --
 6 files changed, 32 insertions(+), 81 deletions(-)

diff --git a/src/mlpack/core/tree/binary_space_tree/binary_space_tree_impl.hpp b/src/mlpack/core/tree/binary_space_tree/binary_space_tree_impl.hpp
index 95403c2..6059a3c 100644
--- a/src/mlpack/core/tree/binary_space_tree/binary_space_tree_impl.hpp
+++ b/src/mlpack/core/tree/binary_space_tree/binary_space_tree_impl.hpp
@@ -667,7 +667,10 @@ void BinarySpaceTree<MetricType, StatisticType, MatType, BoundType, SplitType>::
   // Perform the actual splitting.  This will order the dataset such that points
   // that belong to the left subtree are on the left of splitCol, and points
   // from the right subtree are on the right side of splitCol.
-   splitCol = PerformSplit(*dataset, begin, count, splitInfo);
+  splitCol = PerformSplit(*dataset, begin, count, splitInfo);
+
+  assert(splitCol > begin);
+  assert(splitCol < begin + count);
 
   // Now that we know the split column, we will recursively split the children
   // by calling their constructors (which perform this splitting process).
@@ -732,6 +735,9 @@ SplitNode(std::vector<size_t>& oldFromNew,
   // from the right subtree are on the right side of splitCol.
   splitCol = PerformSplit(*dataset, begin, count, splitInfo, oldFromNew);
 
+  assert(splitCol > begin);
+  assert(splitCol < begin + count);
+
   // Now that we know the split column, we will recursively split the children
   // by calling their constructors (which perform this splitting process).
   left = new BinarySpaceTree(this, begin, splitCol - begin, oldFromNew,
diff --git a/src/mlpack/core/tree/binary_space_tree/rp_tree_max_split.hpp b/src/mlpack/core/tree/binary_space_tree/rp_tree_max_split.hpp
index 5c31e84..52b2be8 100644
--- a/src/mlpack/core/tree/binary_space_tree/rp_tree_max_split.hpp
+++ b/src/mlpack/core/tree/binary_space_tree/rp_tree_max_split.hpp
@@ -83,20 +83,6 @@ class RPTreeMaxSplit
                                      const arma::Col<ElemType>& direction);
 
   /**
-   * Obtain a number of random distinct samples from the dataset. All samples
-   * belong to [begin, begin + count).
-   *
-   * @param distinctSamples The indices of the samples.
-   * @param begin The lower bound of indices.
-   * @param count The number of point candidates.
-   * @param numSamples The maximum number of samples.
-   */
-  static void GetDistinctSamples(arma::uvec& distinctSamples,
-                                 const size_t begin,
-                                 const size_t count,
-                                 const size_t numSamples);
-
-  /**
    * This method finds the position of the hyperplane that will split the node.
    *
    * @param data The dataset used by the binary space tree.
diff --git a/src/mlpack/core/tree/binary_space_tree/rp_tree_max_split_impl.hpp b/src/mlpack/core/tree/binary_space_tree/rp_tree_max_split_impl.hpp
index d5cd714..50f6a89 100644
--- a/src/mlpack/core/tree/binary_space_tree/rp_tree_max_split_impl.hpp
+++ b/src/mlpack/core/tree/binary_space_tree/rp_tree_max_split_impl.hpp
@@ -69,25 +69,6 @@ GetRandomDeviation(const MatType& data,
 }
 
 template<typename BoundType, typename MatType>
-void RPTreeMaxSplit<BoundType, MatType>::GetDistinctSamples(
-    arma::uvec& distinctSamples,
-    const size_t begin,
-    const size_t count,
-    const size_t numSamples)
-{
-  arma::Col<size_t> samples;
-
-  samples.zeros(count);
-
-  for (size_t i = 0; i < numSamples; i++)
-    samples [ (size_t) math::RandInt(count) ]++;
-
-  distinctSamples = arma::find(samples > 0);
-
-  distinctSamples += begin;
-}
-
-template<typename BoundType, typename MatType>
 bool RPTreeMaxSplit<BoundType, MatType>::GetSplitVal(
     const MatType& data,
     const size_t begin,
@@ -100,26 +81,29 @@ bool RPTreeMaxSplit<BoundType, MatType>::GetSplitVal(
   arma::uvec samples;
 
   // Get no more than numSamples distinct samples.
-  GetDistinctSamples(samples, begin, count, numSamples);
+  math::ObtainDistinctSamples(begin, begin + count, numSamples, samples);
 
-  std::vector<ElemType> values(samples.n_elem);
+  arma::Col<ElemType> values(samples.n_elem);
 
   // Find the median of scalar products of the samples and the normal vector.
   for (size_t k = 0; k < samples.n_elem; k++)
     values[k] = arma::dot(data.col(samples[k]), direction);
 
-  std::sort(values.begin(), values.end());
-
-  if (values[0] == values[values.size() - 1])
+  const ElemType maximum = arma::max(values);
+  const ElemType minimum = arma::min(values);
+  if (minimum == maximum)
     return false;
 
-  splitVal = values[values.size() / 2];
+  splitVal = arma::median(values);
 
   // Add a random deviation to the median.
   // This algorithm differs from the method suggested in the
   // random projection tree paper.
-  splitVal += math::Random((values[values.size() / 2] - values[0]) * 0.75,
-      (values[values.size() - 1] - values[values.size() / 2]) * 0.75);
+  splitVal += math::Random((minimum - splitVal) * 0.75,
+      (maximum - splitVal) * 0.75);
+
+  if (splitVal == maximum)
+    splitVal = minimum;
 
   return true;
 }
diff --git a/src/mlpack/core/tree/binary_space_tree/rp_tree_mean_split.hpp b/src/mlpack/core/tree/binary_space_tree/rp_tree_mean_split.hpp
index ad198a2..93708ed 100644
--- a/src/mlpack/core/tree/binary_space_tree/rp_tree_mean_split.hpp
+++ b/src/mlpack/core/tree/binary_space_tree/rp_tree_mean_split.hpp
@@ -85,20 +85,6 @@ class RPTreeMeanSplit
   static void GetRandomDirection(arma::Col<ElemType>& direction);
 
   /**
-   * Obtain a number of random distinct samples from the dataset. All samples
-   * belong to [begin, begin + count).
-   *
-   * @param distinctSamples The indices of the samples.
-   * @param begin The lower bound of indices.
-   * @param count The number of point candidates.
-   * @param numSamples The maximum number of samples.
-   */
-  static void GetDistinctSamples(arma::uvec& distinctSamples,
-                                 const size_t begin,
-                                 const size_t count,
-                                 const size_t numSamples);
-
-  /**
    * Get the average distance between points in the dataset.
    *
    * @param data The dataset used by the binary space tree.
diff --git a/src/mlpack/core/tree/binary_space_tree/rp_tree_mean_split_impl.hpp b/src/mlpack/core/tree/binary_space_tree/rp_tree_mean_split_impl.hpp
index ec5ad1d..14ebb53 100644
--- a/src/mlpack/core/tree/binary_space_tree/rp_tree_mean_split_impl.hpp
+++ b/src/mlpack/core/tree/binary_space_tree/rp_tree_mean_split_impl.hpp
@@ -25,7 +25,7 @@ bool RPTreeMeanSplit<BoundType, MatType>::SplitNode(const BoundType&  bound,
   arma::uvec samples;
 
   // Get no more than numSamples distinct samples.
-  GetDistinctSamples(samples, begin, count, numSamples);
+  math::ObtainDistinctSamples(begin, begin + count, numSamples, samples);
 
   // Find the average distance between points.
   ElemType averageDistanceSq = GetAveragePointDistance(data, samples);
@@ -62,25 +62,6 @@ bool RPTreeMeanSplit<BoundType, MatType>::SplitNode(const BoundType&  bound,
 }
 
 template<typename BoundType, typename MatType>
-void RPTreeMeanSplit<BoundType, MatType>::GetDistinctSamples(
-    arma::uvec& distinctSamples,
-    const size_t begin,
-    const size_t count,
-    const size_t numSamples)
-{
-  arma::Col<size_t> samples;
-
-  samples.zeros(count);
-
-  for (size_t i = 0; i < numSamples; i++)
-    samples [ (size_t) math::RandInt(count) ]++;
-
-  distinctSamples = arma::find(samples > 0);
-
-  distinctSamples += begin;
-}
-
-template<typename BoundType, typename MatType>
 typename MatType::elem_type RPTreeMeanSplit<BoundType, MatType>::
 GetAveragePointDistance(
     MatType& data,
@@ -131,11 +112,16 @@ bool RPTreeMeanSplit<BoundType, MatType>::GetDotMedian(
   for (size_t k = 0; k < samples.n_elem; k++)
     values[k] = arma::dot(data.col(samples[k]), direction);
 
-  if (arma::min(values) == arma::max(values))
+  const ElemType maximum = arma::max(values);
+  const ElemType minimum = arma::min(values);
+  if (minimum == maximum)
     return false;
 
   splitVal = arma::median(values);
 
+  if (splitVal == maximum)
+    splitVal = minimum;
+
   return true;
 }
 
@@ -148,7 +134,7 @@ bool RPTreeMeanSplit<BoundType, MatType>::GetMeanMedian(
 {
   arma::Col<ElemType> values(samples.n_elem);
 
-  mean = arma::mean(data.cols(samples));
+  mean = arma::mean(data.cols(samples), 1);
 
   arma::Col<ElemType> tmp(data.n_rows);
 
@@ -160,11 +146,16 @@ bool RPTreeMeanSplit<BoundType, MatType>::GetMeanMedian(
     values[k] = arma::dot(tmp, tmp);
   }
 
-  if (arma::min(values) == arma::max(values))
+  const ElemType maximum = arma::max(values);
+  const ElemType minimum = arma::min(values);
+  if (minimum == maximum)
     return false;
 
   splitVal = arma::median(values);
 
+  if (splitVal == maximum)
+    splitVal = minimum;
+
   return true;
 }
 
diff --git a/src/mlpack/core/tree/binary_space_tree/vantage_point_split_impl.hpp b/src/mlpack/core/tree/binary_space_tree/vantage_point_split_impl.hpp
index 94ec63b..3028d53 100644
--- a/src/mlpack/core/tree/binary_space_tree/vantage_point_split_impl.hpp
+++ b/src/mlpack/core/tree/binary_space_tree/vantage_point_split_impl.hpp
@@ -31,8 +31,6 @@ SplitNode(const BoundType& bound, MatType& data, const size_t begin,
 
   splitInfo = SplitInfo(bound.Metric(), data.col(vantagePointIndex), mu);
 
-  assert(splitCol > begin);
-  assert(splitCol < begin + count);
   return true;
 }
 




More information about the mlpack-git mailing list