[mlpack-git] master: Use arma::Col instead of std::vector. Also, avoid resizing in SplitPoints(), doing a previous calculation of the number of points to be stored. (388941a)
gitdub at mlpack.org
gitdub at mlpack.org
Sat Aug 6 11:33:18 EDT 2016
Repository : https://github.com/mlpack/mlpack
On branch : master
Link : https://github.com/mlpack/mlpack/compare/0f4b25acd6aaa14294c044874ba6cc0751712baa...0a19d07bd39e6223991976474bc79671ba8aa0f0
>---------------------------------------------------------------
commit 388941a2cb3c661c7024ccb847a9c1fe3abff57d
Author: MarcosPividori <marcos.pividori at gmail.com>
Date: Sat Aug 6 12:21:42 2016 -0300
Use arma::Col instead of std::vector. Also, avoid resizing in SplitPoints(), doing a previous calculation of the number of points to be stored.
>---------------------------------------------------------------
388941a2cb3c661c7024ccb847a9c1fe3abff57d
src/mlpack/core/tree/spill_tree/space_split.hpp | 8 +-
.../core/tree/spill_tree/space_split_impl.hpp | 18 +--
src/mlpack/core/tree/spill_tree/spill_tree.hpp | 12 +-
.../core/tree/spill_tree/spill_tree_impl.hpp | 124 +++++++++++++--------
4 files changed, 96 insertions(+), 66 deletions(-)
diff --git a/src/mlpack/core/tree/spill_tree/space_split.hpp b/src/mlpack/core/tree/spill_tree/space_split.hpp
index 62548c2..da2d9b5 100644
--- a/src/mlpack/core/tree/spill_tree/space_split.hpp
+++ b/src/mlpack/core/tree/spill_tree/space_split.hpp
@@ -33,7 +33,7 @@ class MeanSpaceSplit
static bool SplitSpace(
const typename HyperplaneType::BoundType& bound,
const MatType& data,
- const std::vector<size_t>& points,
+ const arma::Col<size_t>& points,
HyperplaneType& hyp);
};
@@ -55,7 +55,7 @@ class MidpointSpaceSplit
static bool SplitSpace(
const typename HyperplaneType::BoundType& bound,
const MatType& data,
- const std::vector<size_t>& points,
+ const arma::Col<size_t>& points,
HyperplaneType& hyp);
};
@@ -78,7 +78,7 @@ class SpaceSplit
static bool GetProjVector(
const bound::HRectBound<MetricType>& bound,
const MatType& data,
- const std::vector<size_t>& points,
+ const arma::Col<size_t>& points,
AxisParallelProjVector& projVector,
double& midValue);
@@ -98,7 +98,7 @@ class SpaceSplit
static bool GetProjVector(
const BoundType& bound,
const MatType& data,
- const std::vector<size_t>& points,
+ const arma::Col<size_t>& points,
ProjVector& projVector,
double& midValue);
};
diff --git a/src/mlpack/core/tree/spill_tree/space_split_impl.hpp b/src/mlpack/core/tree/spill_tree/space_split_impl.hpp
index db7e7c3..a9628c0 100644
--- a/src/mlpack/core/tree/spill_tree/space_split_impl.hpp
+++ b/src/mlpack/core/tree/spill_tree/space_split_impl.hpp
@@ -19,7 +19,7 @@ template<typename HyperplaneType>
bool MeanSpaceSplit<MetricType, MatType>::SplitSpace(
const typename HyperplaneType::BoundType& bound,
const MatType& data,
- const std::vector<size_t>& points,
+ const arma::Col<size_t>& points,
HyperplaneType& hyp)
{
typename HyperplaneType::ProjVectorType projVector;
@@ -30,9 +30,9 @@ bool MeanSpaceSplit<MetricType, MatType>::SplitSpace(
return false;
double splitVal = 0.0;
- for (size_t i = 0; i < points.size(); i++)
+ for (size_t i = 0; i < points.n_elem; i++)
splitVal += projVector.Project(data.col(points[i]));
- splitVal /= points.size();
+ splitVal /= points.n_elem;
hyp = HyperplaneType(projVector, splitVal);
@@ -44,7 +44,7 @@ template<typename HyperplaneType>
bool MidpointSpaceSplit<MetricType, MatType>::SplitSpace(
const typename HyperplaneType::BoundType& bound,
const MatType& data,
- const std::vector<size_t>& points,
+ const arma::Col<size_t>& points,
HyperplaneType& hyp)
{
typename HyperplaneType::ProjVectorType projVector;
@@ -63,7 +63,7 @@ template<typename MetricType, typename MatType>
bool SpaceSplit<MetricType, MatType>::GetProjVector(
const bound::HRectBound<MetricType>& bound,
const MatType& data,
- const std::vector<size_t>& /* points */,
+ const arma::Col<size_t>& /* points */,
AxisParallelProjVector& projVector,
double& midValue)
{
@@ -97,18 +97,18 @@ template<typename BoundType>
bool SpaceSplit<MetricType, MatType>::GetProjVector(
const BoundType& /* bound */,
const MatType& data,
- const std::vector<size_t>& points,
+ const arma::Col<size_t>& points,
ProjVector& projVector,
double& midValue)
{
MetricType metric;
// Efficiently estimate the farthest pair of points in the given set.
- size_t fst = points[rand() % points.size()];
+ size_t fst = points[rand() % points.n_elem];
size_t snd = points[0];
double max = metric.Evaluate(data.col(fst), data.col(snd));
- for (size_t i = 1; i < points.size(); i++)
+ for (size_t i = 1; i < points.n_elem; i++)
{
double dist = metric.Evaluate(data.col(fst), data.col(points[i]));
if (dist > max)
@@ -120,7 +120,7 @@ bool SpaceSplit<MetricType, MatType>::GetProjVector(
std::swap(fst, snd);
- for (size_t i = 0; i < points.size(); i++)
+ for (size_t i = 0; i < points.n_elem; i++)
{
double dist = metric.Evaluate(data.col(fst), data.col(points[i]));
if (dist > max)
diff --git a/src/mlpack/core/tree/spill_tree/spill_tree.hpp b/src/mlpack/core/tree/spill_tree/spill_tree.hpp
index ac22be3..7e99d6f 100644
--- a/src/mlpack/core/tree/spill_tree/spill_tree.hpp
+++ b/src/mlpack/core/tree/spill_tree/spill_tree.hpp
@@ -89,7 +89,7 @@ class SpillTree
size_t count;
//! The list of indexes of points contained in this node (non-null for
//! leaf nodes).
- std::vector<size_t>* pointsIndex;
+ arma::Col<size_t>* pointsIndex;
//! Flag to distinguish overlapping nodes from non-overlapping nodes.
bool overlappingNode;
//! Splitting hyperplane represented by this node.
@@ -164,7 +164,7 @@ class SpillTree
* @param rho Balance threshold.
*/
SpillTree(SpillTree* parent,
- std::vector<size_t>& points,
+ arma::Col<size_t>& points,
const double tau = 0,
const size_t maxLeafSize = 20,
const double rho = 0.7);
@@ -379,7 +379,7 @@ class SpillTree
* @param tau Overlapping size.
* @param rho Balance threshold.
*/
- void SplitNode(std::vector<size_t>& points,
+ void SplitNode(arma::Col<size_t>& points,
const size_t maxLeafSize,
const double tau,
const double rho);
@@ -396,9 +396,9 @@ class SpillTree
*/
bool SplitPoints(const double tau,
const double rho,
- const std::vector<size_t>& points,
- std::vector<size_t>& leftPoints,
- std::vector<size_t>& rightPoints);
+ const arma::Col<size_t>& points,
+ arma::Col<size_t>& leftPoints,
+ arma::Col<size_t>& rightPoints);
protected:
/**
* A default constructor. This is meant to only be used with
diff --git a/src/mlpack/core/tree/spill_tree/spill_tree_impl.hpp b/src/mlpack/core/tree/spill_tree/spill_tree_impl.hpp
index 6b39210..6b19f06 100644
--- a/src/mlpack/core/tree/spill_tree/spill_tree_impl.hpp
+++ b/src/mlpack/core/tree/spill_tree/spill_tree_impl.hpp
@@ -36,10 +36,9 @@ SpillTree(
dataset(&data),
localDataset(false)
{
- std::vector<size_t> points;
- points.reserve(dataset->n_cols);
- for (size_t i = 0; i < dataset->n_cols; i++)
- points.push_back(i);
+ arma::Col<size_t> points(dataset->n_cols);
+ for (size_t i = 0; i < points.n_elem; i++)
+ points[i] = i;
// Do the actual splitting of this node.
SplitNode(points, maxLeafSize, tau, rho);
@@ -72,10 +71,9 @@ SpillTree(
dataset(new MatType(std::move(data))),
localDataset(true)
{
- std::vector<size_t> points;
- points.reserve(dataset->n_cols);
- for (size_t i = 0; i < dataset->n_cols; i++)
- points.push_back(i);
+ arma::Col<size_t> points(dataset->n_cols);
+ for (size_t i = 0; i < points.n_elem; i++)
+ points[i] = i;
// Do the actual splitting of this node.
SplitNode(points, maxLeafSize, tau, rho);
@@ -93,7 +91,7 @@ template<typename MetricType,
SpillTree<MetricType, StatisticType, MatType, HyperplaneType, SplitType>::
SpillTree(
SpillTree* parent,
- std::vector<size_t>& points,
+ arma::Col<size_t>& points,
const double tau,
const size_t maxLeafSize,
const double rho) :
@@ -157,7 +155,7 @@ SpillTree(const SpillTree& other) :
// If vector of indexes, copy it.
if (other.pointsIndex)
- pointsIndex = new std::vector<size_t>(*other.pointsIndex);
+ pointsIndex = new arma::Col<size_t>(*other.pointsIndex);
// Propagate matrix, but only if we are the root.
if (parent == NULL)
@@ -500,24 +498,24 @@ template<typename MetricType,
template<typename SplitMetricType, typename SplitMatType>
class SplitType>
void SpillTree<MetricType, StatisticType, MatType, HyperplaneType, SplitType>::
- SplitNode(std::vector<size_t>& points,
+ SplitNode(arma::Col<size_t>& points,
const size_t maxLeafSize,
const double tau,
const double rho)
{
// We need to expand the bounds of this node properly.
- for (size_t i = 0; i < points.size(); i++)
- bound |= dataset->cols(points[i], points[i]);
+ for (size_t i = 0; i < points.n_elem; i++)
+ bound |= dataset->col(points[i]);
// Calculate the furthest descendant distance.
furthestDescendantDistance = 0.5 * bound.Diameter();
// Now, check if we need to split at all.
- if (points.size() <= maxLeafSize)
+ if (points.n_elem <= maxLeafSize)
{
- pointsIndex = new std::vector<size_t>();
+ pointsIndex = new arma::Col<size_t>();
pointsIndex->swap(points);
- count = pointsIndex->size();
+ count = pointsIndex->n_elem;
return; // We can't split this.
}
@@ -527,18 +525,18 @@ void SpillTree<MetricType, StatisticType, MatType, HyperplaneType, SplitType>::
// same, we can't split them.
if (!split)
{
- pointsIndex = new std::vector<size_t>();
+ pointsIndex = new arma::Col<size_t>();
pointsIndex->swap(points);
- count = pointsIndex->size();
+ count = pointsIndex->n_elem;
return; // We can't split this.
}
- std::vector<size_t> leftPoints, rightPoints;
+ arma::Col<size_t> leftPoints, rightPoints;
// Split the node.
overlappingNode = SplitPoints(tau, rho, points, leftPoints, rightPoints);
// We don't need the information in points, so lets clean it.
- std::vector<size_t>().swap(points);
+ arma::Col<size_t>().swap(points);
// Now we will recursively split the children by calling their constructors
// (which perform this splitting process).
@@ -571,48 +569,80 @@ template<typename MetricType,
bool SpillTree<MetricType, StatisticType, MatType, HyperplaneType, SplitType>::
SplitPoints(const double tau,
const double rho,
- const std::vector<size_t>& points,
- std::vector<size_t>& leftPoints,
- std::vector<size_t>& rightPoints)
+ const arma::Col<size_t>& points,
+ arma::Col<size_t>& leftPoints,
+ arma::Col<size_t>& rightPoints)
{
- std::vector<size_t> leftFrontier, rightFrontier;
+ arma::vec projections(points.n_elem);
+ size_t left = 0, right = 0, leftFrontier = 0, rightFrontier = 0;
- // Perform the actual splitting. This will order the dataset such that points
- // with projection value less than or equal to zero are included in leftPoints
- // and points with projection value greater than zero are included in
- // rightPoints.
- for (size_t i = 0; i < points.size(); i++)
+ // Count the number of points to the left/right of the splitting hyperplane.
+ for (size_t i = 0; i < points.n_elem; i++)
{
- const double proj = hyperplane.Project(dataset->col(points[i]));
- if (proj <= 0)
+ // Store projection value for future use.
+ projections[i] = hyperplane.Project(dataset->col(points[i]));
+ if (projections[i] <= 0)
{
- leftPoints.push_back(points[i]);
- if (proj > -tau)
- leftFrontier.push_back(points[i]);
+ left++;
+ if (projections[i] > -tau)
+ leftFrontier++;
}
else
{
- rightPoints.push_back(points[i]);
- if (proj < tau)
- rightFrontier.push_back(points[i]);
+ right++;
+ if (projections[i] < tau)
+ rightFrontier++;
}
}
- const double p1 = double (leftPoints.size() + rightFrontier.size()) /
- points.size();
- const double p2 = double (rightPoints.size() + leftFrontier.size()) /
- points.size();
+ const double p1 = double (left + rightFrontier) / points.n_elem;
+ const double p2 = double (right + leftFrontier) / points.n_elem;
- if ((p1 <= rho || rightFrontier.empty()) &&
- (p2 <= rho || leftFrontier.empty()))
+ if ((p1 <= rho || rightFrontier == 0) &&
+ (p2 <= rho || leftFrontier == 0))
{
- leftPoints.insert(leftPoints.end(), rightFrontier.begin(),
- rightFrontier.end());
- rightPoints.insert(rightPoints.end(), leftFrontier.begin(),
- leftFrontier.end());
+ // Perform the actual splitting considering the overlapping buffer. Points
+ // with projection value in the range (-tau, tau) are included in both,
+ // leftPoints and rightPoints.
+ leftPoints.resize(left + rightFrontier);
+ rightPoints.resize(right + leftFrontier);
+ for (size_t i = 0, rc = 0, lc = 0; i < points.n_elem; i++)
+ {
+ if (projections[i] < tau || projections[i] <= 0)
+ {
+ leftPoints[lc] = points[i];
+ lc++;
+ }
+ if (projections[i] > -tau)
+ {
+ rightPoints[rc] = points[i];
+ rc++;
+ }
+ }
+ // Return true, because it is a overlapping node.
return true;
}
+ // Perform the actual splitting ignoring the overlapping buffer. Points
+ // with projection value less than or equal to zero are included in leftPoints
+ // and points with projection value greater than zero are included in
+ // rightPoints.
+ leftPoints.resize(left);
+ rightPoints.resize(right);
+ for (size_t i = 0, rc = 0, lc = 0; i < points.n_elem; i++)
+ {
+ if (projections[i] <= 0)
+ {
+ leftPoints[lc] = points[i];
+ lc++;
+ }
+ else
+ {
+ rightPoints[rc] = points[i];
+ rc++;
+ }
+ }
+ // Return false, because it isn't a overlapping node.
return false;
}
More information about the mlpack-git
mailing list