[mlpack-git] master: Use arma::Col instead of std::vector. Also, avoid resizing in SplitPoints(), doing a previous calculation of the number of points to be stored. (388941a)

gitdub at mlpack.org gitdub at mlpack.org
Sat Aug 6 11:33:18 EDT 2016


Repository : https://github.com/mlpack/mlpack
On branch  : master
Link       : https://github.com/mlpack/mlpack/compare/0f4b25acd6aaa14294c044874ba6cc0751712baa...0a19d07bd39e6223991976474bc79671ba8aa0f0

>---------------------------------------------------------------

commit 388941a2cb3c661c7024ccb847a9c1fe3abff57d
Author: MarcosPividori <marcos.pividori at gmail.com>
Date:   Sat Aug 6 12:21:42 2016 -0300

    Use arma::Col instead of std::vector. Also, avoid resizing in SplitPoints(), doing a previous calculation of the number of points to be stored.


>---------------------------------------------------------------

388941a2cb3c661c7024ccb847a9c1fe3abff57d
 src/mlpack/core/tree/spill_tree/space_split.hpp    |   8 +-
 .../core/tree/spill_tree/space_split_impl.hpp      |  18 +--
 src/mlpack/core/tree/spill_tree/spill_tree.hpp     |  12 +-
 .../core/tree/spill_tree/spill_tree_impl.hpp       | 124 +++++++++++++--------
 4 files changed, 96 insertions(+), 66 deletions(-)

diff --git a/src/mlpack/core/tree/spill_tree/space_split.hpp b/src/mlpack/core/tree/spill_tree/space_split.hpp
index 62548c2..da2d9b5 100644
--- a/src/mlpack/core/tree/spill_tree/space_split.hpp
+++ b/src/mlpack/core/tree/spill_tree/space_split.hpp
@@ -33,7 +33,7 @@ class MeanSpaceSplit
   static bool SplitSpace(
       const typename HyperplaneType::BoundType& bound,
       const MatType& data,
-      const std::vector<size_t>& points,
+      const arma::Col<size_t>& points,
       HyperplaneType& hyp);
 };
 
@@ -55,7 +55,7 @@ class MidpointSpaceSplit
   static bool SplitSpace(
       const typename HyperplaneType::BoundType& bound,
       const MatType& data,
-      const std::vector<size_t>& points,
+      const arma::Col<size_t>& points,
       HyperplaneType& hyp);
 };
 
@@ -78,7 +78,7 @@ class SpaceSplit
   static bool GetProjVector(
       const bound::HRectBound<MetricType>& bound,
       const MatType& data,
-      const std::vector<size_t>& points,
+      const arma::Col<size_t>& points,
       AxisParallelProjVector& projVector,
       double& midValue);
 
@@ -98,7 +98,7 @@ class SpaceSplit
   static bool GetProjVector(
       const BoundType& bound,
       const MatType& data,
-      const std::vector<size_t>& points,
+      const arma::Col<size_t>& points,
       ProjVector& projVector,
       double& midValue);
 };
diff --git a/src/mlpack/core/tree/spill_tree/space_split_impl.hpp b/src/mlpack/core/tree/spill_tree/space_split_impl.hpp
index db7e7c3..a9628c0 100644
--- a/src/mlpack/core/tree/spill_tree/space_split_impl.hpp
+++ b/src/mlpack/core/tree/spill_tree/space_split_impl.hpp
@@ -19,7 +19,7 @@ template<typename HyperplaneType>
 bool MeanSpaceSplit<MetricType, MatType>::SplitSpace(
     const typename HyperplaneType::BoundType& bound,
     const MatType& data,
-    const std::vector<size_t>& points,
+    const arma::Col<size_t>& points,
     HyperplaneType& hyp)
 {
   typename HyperplaneType::ProjVectorType projVector;
@@ -30,9 +30,9 @@ bool MeanSpaceSplit<MetricType, MatType>::SplitSpace(
     return false;
 
   double splitVal = 0.0;
-  for (size_t i = 0; i < points.size(); i++)
+  for (size_t i = 0; i < points.n_elem; i++)
     splitVal += projVector.Project(data.col(points[i]));
-  splitVal /= points.size();
+  splitVal /= points.n_elem;
 
   hyp = HyperplaneType(projVector, splitVal);
 
@@ -44,7 +44,7 @@ template<typename HyperplaneType>
 bool MidpointSpaceSplit<MetricType, MatType>::SplitSpace(
     const typename HyperplaneType::BoundType& bound,
     const MatType& data,
-    const std::vector<size_t>& points,
+    const arma::Col<size_t>& points,
     HyperplaneType& hyp)
 {
   typename HyperplaneType::ProjVectorType projVector;
@@ -63,7 +63,7 @@ template<typename MetricType, typename MatType>
 bool SpaceSplit<MetricType, MatType>::GetProjVector(
     const bound::HRectBound<MetricType>& bound,
     const MatType& data,
-    const std::vector<size_t>& /* points */,
+    const arma::Col<size_t>& /* points */,
     AxisParallelProjVector& projVector,
     double& midValue)
 {
@@ -97,18 +97,18 @@ template<typename BoundType>
 bool SpaceSplit<MetricType, MatType>::GetProjVector(
     const BoundType& /* bound */,
     const MatType& data,
-    const std::vector<size_t>& points,
+    const arma::Col<size_t>& points,
     ProjVector& projVector,
     double& midValue)
 {
   MetricType metric;
 
   // Efficiently estimate the farthest pair of points in the given set.
-  size_t fst = points[rand() % points.size()];
+  size_t fst = points[rand() % points.n_elem];
   size_t snd = points[0];
   double max = metric.Evaluate(data.col(fst), data.col(snd));
 
-  for (size_t i = 1; i < points.size(); i++)
+  for (size_t i = 1; i < points.n_elem; i++)
   {
     double dist = metric.Evaluate(data.col(fst), data.col(points[i]));
     if (dist > max)
@@ -120,7 +120,7 @@ bool SpaceSplit<MetricType, MatType>::GetProjVector(
 
   std::swap(fst, snd);
 
-  for (size_t i = 0; i < points.size(); i++)
+  for (size_t i = 0; i < points.n_elem; i++)
   {
     double dist = metric.Evaluate(data.col(fst), data.col(points[i]));
     if (dist > max)
diff --git a/src/mlpack/core/tree/spill_tree/spill_tree.hpp b/src/mlpack/core/tree/spill_tree/spill_tree.hpp
index ac22be3..7e99d6f 100644
--- a/src/mlpack/core/tree/spill_tree/spill_tree.hpp
+++ b/src/mlpack/core/tree/spill_tree/spill_tree.hpp
@@ -89,7 +89,7 @@ class SpillTree
   size_t count;
   //! The list of indexes of points contained in this node (non-null for
   //! leaf nodes).
-  std::vector<size_t>* pointsIndex;
+  arma::Col<size_t>* pointsIndex;
   //! Flag to distinguish overlapping nodes from non-overlapping nodes.
   bool overlappingNode;
   //! Splitting hyperplane represented by this node.
@@ -164,7 +164,7 @@ class SpillTree
    * @param rho Balance threshold.
    */
   SpillTree(SpillTree* parent,
-            std::vector<size_t>& points,
+            arma::Col<size_t>& points,
             const double tau = 0,
             const size_t maxLeafSize = 20,
             const double rho = 0.7);
@@ -379,7 +379,7 @@ class SpillTree
    * @param tau Overlapping size.
    * @param rho Balance threshold.
    */
-  void SplitNode(std::vector<size_t>& points,
+  void SplitNode(arma::Col<size_t>& points,
                  const size_t maxLeafSize,
                  const double tau,
                  const double rho);
@@ -396,9 +396,9 @@ class SpillTree
    */
   bool SplitPoints(const double tau,
                    const double rho,
-                   const std::vector<size_t>& points,
-                   std::vector<size_t>& leftPoints,
-                   std::vector<size_t>& rightPoints);
+                   const arma::Col<size_t>& points,
+                   arma::Col<size_t>& leftPoints,
+                   arma::Col<size_t>& rightPoints);
  protected:
   /**
    * A default constructor.  This is meant to only be used with
diff --git a/src/mlpack/core/tree/spill_tree/spill_tree_impl.hpp b/src/mlpack/core/tree/spill_tree/spill_tree_impl.hpp
index 6b39210..6b19f06 100644
--- a/src/mlpack/core/tree/spill_tree/spill_tree_impl.hpp
+++ b/src/mlpack/core/tree/spill_tree/spill_tree_impl.hpp
@@ -36,10 +36,9 @@ SpillTree(
     dataset(&data),
     localDataset(false)
 {
-  std::vector<size_t> points;
-  points.reserve(dataset->n_cols);
-  for (size_t i = 0; i < dataset->n_cols; i++)
-    points.push_back(i);
+  arma::Col<size_t> points(dataset->n_cols);
+  for (size_t i = 0; i < points.n_elem; i++)
+    points[i] = i;
 
   // Do the actual splitting of this node.
   SplitNode(points, maxLeafSize, tau, rho);
@@ -72,10 +71,9 @@ SpillTree(
     dataset(new MatType(std::move(data))),
     localDataset(true)
 {
-  std::vector<size_t> points;
-  points.reserve(dataset->n_cols);
-  for (size_t i = 0; i < dataset->n_cols; i++)
-    points.push_back(i);
+  arma::Col<size_t> points(dataset->n_cols);
+  for (size_t i = 0; i < points.n_elem; i++)
+    points[i] = i;
 
   // Do the actual splitting of this node.
   SplitNode(points, maxLeafSize, tau, rho);
@@ -93,7 +91,7 @@ template<typename MetricType,
 SpillTree<MetricType, StatisticType, MatType, HyperplaneType, SplitType>::
 SpillTree(
     SpillTree* parent,
-    std::vector<size_t>& points,
+    arma::Col<size_t>& points,
     const double tau,
     const size_t maxLeafSize,
     const double rho) :
@@ -157,7 +155,7 @@ SpillTree(const SpillTree& other) :
 
   // If vector of indexes, copy it.
   if (other.pointsIndex)
-    pointsIndex = new std::vector<size_t>(*other.pointsIndex);
+    pointsIndex = new arma::Col<size_t>(*other.pointsIndex);
 
   // Propagate matrix, but only if we are the root.
   if (parent == NULL)
@@ -500,24 +498,24 @@ template<typename MetricType,
          template<typename SplitMetricType, typename SplitMatType>
              class SplitType>
 void SpillTree<MetricType, StatisticType, MatType, HyperplaneType, SplitType>::
-    SplitNode(std::vector<size_t>& points,
+    SplitNode(arma::Col<size_t>& points,
               const size_t maxLeafSize,
               const double tau,
               const double rho)
 {
   // We need to expand the bounds of this node properly.
-  for (size_t i = 0; i < points.size(); i++)
-    bound |= dataset->cols(points[i], points[i]);
+  for (size_t i = 0; i < points.n_elem; i++)
+    bound |= dataset->col(points[i]);
 
   // Calculate the furthest descendant distance.
   furthestDescendantDistance = 0.5 * bound.Diameter();
 
   // Now, check if we need to split at all.
-  if (points.size() <= maxLeafSize)
+  if (points.n_elem <= maxLeafSize)
   {
-    pointsIndex = new std::vector<size_t>();
+    pointsIndex = new arma::Col<size_t>();
     pointsIndex->swap(points);
-    count = pointsIndex->size();
+    count = pointsIndex->n_elem;
     return; // We can't split this.
   }
 
@@ -527,18 +525,18 @@ void SpillTree<MetricType, StatisticType, MatType, HyperplaneType, SplitType>::
   // same, we can't split them.
   if (!split)
   {
-    pointsIndex = new std::vector<size_t>();
+    pointsIndex = new arma::Col<size_t>();
     pointsIndex->swap(points);
-    count = pointsIndex->size();
+    count = pointsIndex->n_elem;
     return; // We can't split this.
   }
 
-  std::vector<size_t> leftPoints, rightPoints;
+  arma::Col<size_t> leftPoints, rightPoints;
   // Split the node.
   overlappingNode = SplitPoints(tau, rho, points, leftPoints, rightPoints);
 
   // We don't need the information in points, so lets clean it.
-  std::vector<size_t>().swap(points);
+  arma::Col<size_t>().swap(points);
 
   // Now we will recursively split the children by calling their constructors
   // (which perform this splitting process).
@@ -571,48 +569,80 @@ template<typename MetricType,
 bool SpillTree<MetricType, StatisticType, MatType, HyperplaneType, SplitType>::
     SplitPoints(const double tau,
                 const double rho,
-                const std::vector<size_t>& points,
-                std::vector<size_t>& leftPoints,
-                std::vector<size_t>& rightPoints)
+                const arma::Col<size_t>& points,
+                arma::Col<size_t>& leftPoints,
+                arma::Col<size_t>& rightPoints)
 {
-  std::vector<size_t> leftFrontier, rightFrontier;
+  arma::vec projections(points.n_elem);
+  size_t left = 0, right = 0, leftFrontier = 0, rightFrontier = 0;
 
-  // Perform the actual splitting.  This will order the dataset such that points
-  // with projection value less than or equal to zero are included in leftPoints
-  // and points with projection value greater than zero are included in
-  // rightPoints.
-  for (size_t i = 0; i < points.size(); i++)
+  // Count the number of points to the left/right of the splitting hyperplane.
+  for (size_t i = 0; i < points.n_elem; i++)
   {
-    const double proj = hyperplane.Project(dataset->col(points[i]));
-    if (proj <= 0)
+    // Store projection value for future use.
+    projections[i] = hyperplane.Project(dataset->col(points[i]));
+    if (projections[i] <= 0)
     {
-      leftPoints.push_back(points[i]);
-      if (proj > -tau)
-        leftFrontier.push_back(points[i]);
+      left++;
+      if (projections[i] > -tau)
+        leftFrontier++;
     }
     else
     {
-      rightPoints.push_back(points[i]);
-      if (proj < tau)
-        rightFrontier.push_back(points[i]);
+      right++;
+      if (projections[i] < tau)
+        rightFrontier++;
     }
   }
 
-  const double p1 = double (leftPoints.size() + rightFrontier.size()) /
-      points.size();
-  const double p2 = double (rightPoints.size() + leftFrontier.size()) /
-      points.size();
+  const double p1 = double (left + rightFrontier) / points.n_elem;
+  const double p2 = double (right + leftFrontier) / points.n_elem;
 
-  if ((p1 <= rho || rightFrontier.empty()) &&
-      (p2 <= rho || leftFrontier.empty()))
+  if ((p1 <= rho || rightFrontier == 0) &&
+      (p2 <= rho || leftFrontier == 0))
   {
-    leftPoints.insert(leftPoints.end(), rightFrontier.begin(),
-        rightFrontier.end());
-    rightPoints.insert(rightPoints.end(), leftFrontier.begin(),
-        leftFrontier.end());
+    // Perform the actual splitting considering the overlapping buffer.  Points
+    // with projection value in the range (-tau, tau) are included in both,
+    // leftPoints and rightPoints.
+    leftPoints.resize(left + rightFrontier);
+    rightPoints.resize(right + leftFrontier);
+    for (size_t i = 0, rc = 0, lc = 0; i < points.n_elem; i++)
+    {
+      if (projections[i] < tau || projections[i] <= 0)
+      {
+        leftPoints[lc] = points[i];
+        lc++;
+      }
+      if (projections[i] > -tau)
+      {
+        rightPoints[rc] = points[i];
+        rc++;
+      }
+    }
+    // Return true, because it is a overlapping node.
     return true;
   }
 
+  // Perform the actual splitting ignoring the overlapping buffer.  Points
+  // with projection value less than or equal to zero are included in leftPoints
+  // and points with projection value greater than zero are included in
+  // rightPoints.
+  leftPoints.resize(left);
+  rightPoints.resize(right);
+  for (size_t i = 0, rc = 0, lc = 0; i < points.n_elem; i++)
+  {
+    if (projections[i] <= 0)
+    {
+      leftPoints[lc] = points[i];
+      lc++;
+    }
+    else
+    {
+      rightPoints[rc] = points[i];
+      rc++;
+    }
+  }
+  // Return false, because it isn't a overlapping node.
   return false;
 }
 




More information about the mlpack-git mailing list