[mlpack-git] master: Remove unnecessary copying of dataset in SpillTrees. (498e74d)
gitdub at mlpack.org
gitdub at mlpack.org
Thu Aug 18 13:39:15 EDT 2016
Repository : https://github.com/mlpack/mlpack
On branch : master
Link : https://github.com/mlpack/mlpack/compare/0f4b25acd6aaa14294c044874ba6cc0751712baa...0a19d07bd39e6223991976474bc79671ba8aa0f0
>---------------------------------------------------------------
commit 498e74d8ad71bbee84ec570b9b12381e0a8b5a63
Author: MarcosPividori <marcos.pividori at gmail.com>
Date: Thu Jul 28 12:24:22 2016 -0300
Remove unnecessary copying of dataset in SpillTrees.
>---------------------------------------------------------------
498e74d8ad71bbee84ec570b9b12381e0a8b5a63
src/mlpack/core/tree/spill_tree/spill_tree.hpp | 10 ++++----
.../core/tree/spill_tree/spill_tree_impl.hpp | 28 +++++++++++++++-------
2 files changed, 24 insertions(+), 14 deletions(-)
diff --git a/src/mlpack/core/tree/spill_tree/spill_tree.hpp b/src/mlpack/core/tree/spill_tree/spill_tree.hpp
index 270a9b9..c48fbd5 100644
--- a/src/mlpack/core/tree/spill_tree/spill_tree.hpp
+++ b/src/mlpack/core/tree/spill_tree/spill_tree.hpp
@@ -103,7 +103,9 @@ class SpillTree
ElemType minimumBoundDistance;
//! The dataset. If we are the root of the tree, we own the dataset and must
//! delete it.
- MatType* dataset;
+ const MatType* dataset;
+ //! If true, we own the dataset and need to destroy it in the destructor.
+ bool localDataset;
public:
//! A single-tree traverser for hybrid spill trees; see
@@ -117,8 +119,8 @@ class SpillTree
/**
* Construct this as the root node of a hybrid spill tree using the given
- * dataset. This will copy the input matrix; if you don't want this, consider
- * using the constructor that takes an rvalue reference and use std::move().
+ * dataset. The dataset will not be modified during the building procedure
+ * (unlike BinarySpaceTree).
*
* @param data Dataset to create tree from. This will be copied!
* @param tau Overlapping size.
@@ -229,8 +231,6 @@ class SpillTree
//! Get the dataset which the tree is built on.
const MatType& Dataset() const { return *dataset; }
- //! Modify the dataset which the tree is built on. Be careful!
- MatType& Dataset() { return *dataset; }
//! Distinguish overlapping nodes from non-overlapping nodes.
bool Overlap() const { return overlappingNode; }
diff --git a/src/mlpack/core/tree/spill_tree/spill_tree_impl.hpp b/src/mlpack/core/tree/spill_tree/spill_tree_impl.hpp
index cf8d5e9..b4b6557 100644
--- a/src/mlpack/core/tree/spill_tree/spill_tree_impl.hpp
+++ b/src/mlpack/core/tree/spill_tree/spill_tree_impl.hpp
@@ -33,7 +33,8 @@ SpillTree(
splitValue(0),
bound(data.n_rows),
parentDistance(0), // Parent distance for the root is 0: it has no parent.
- dataset(new MatType(data)) // Copies the dataset.
+ dataset(&data),
+ localDataset(false)
{
std::vector<size_t> points;
points.reserve(dataset->n_cols);
@@ -68,7 +69,8 @@ SpillTree(
splitValue(0),
bound(data.n_rows),
parentDistance(0), // Parent distance for the root is 0: it has no parent.
- dataset(new MatType(std::move(data)))
+ dataset(new MatType(std::move(data))),
+ localDataset(true)
{
std::vector<size_t> points;
points.reserve(dataset->n_cols);
@@ -104,7 +106,8 @@ SpillTree(
splitDimension(0),
splitValue(0),
bound(parent->Dataset().n_rows),
- dataset(&parent->Dataset()) // Point to the parent's dataset.
+ dataset(&parent->Dataset()), // Point to the parent's dataset.
+ localDataset(false)
{
// Perform the actual splitting.
SplitNode(points, overlapIndex, maxLeafSize, tau, rho);
@@ -137,7 +140,8 @@ SpillTree(const SpillTree& other) :
parentDistance(other.parentDistance),
furthestDescendantDistance(other.furthestDescendantDistance),
// Copy matrix, but only if we are the root.
- dataset((other.parent == NULL) ? new MatType(*other.dataset) : NULL)
+ dataset((other.parent == NULL) ? new MatType(*other.dataset) : NULL),
+ localDataset(other.parent == NULL)
{
// Create left and right children (if any).
if (other.Left())
@@ -201,7 +205,8 @@ SpillTree(SpillTree&& other) :
parentDistance(other.parentDistance),
furthestDescendantDistance(other.furthestDescendantDistance),
minimumBoundDistance(other.minimumBoundDistance),
- dataset(other.dataset)
+ dataset(other.dataset),
+ localDataset(other.localDataset)
{
// Now we are a clone of the other tree. But we must also clear the other
// tree's contents, so it doesn't delete anything when it is destructed.
@@ -213,6 +218,7 @@ SpillTree(SpillTree&& other) :
other.furthestDescendantDistance = 0.0;
other.minimumBoundDistance = 0.0;
other.dataset = NULL;
+ other.localDataset = false;
}
/**
@@ -252,8 +258,8 @@ SpillTree<MetricType, StatisticType, MatType, SplitType>::
delete right;
delete pointsIndex;
- // If we're the root, delete the matrix.
- if (!parent)
+ // If we're the root and we own the dataset, delete it.
+ if (!parent && localDataset)
delete dataset;
}
@@ -576,7 +582,8 @@ SpillTree<MetricType, StatisticType, MatType, SplitType>::
stat(*this),
parentDistance(0),
furthestDescendantDistance(0),
- dataset(NULL)
+ dataset(NULL),
+ localDataset(false)
{
// Nothing to do.
}
@@ -602,7 +609,7 @@ void SpillTree<MetricType, StatisticType, MatType, SplitType>::
delete left;
if (right)
delete right;
- if (!parent)
+ if (!parent && localDataset)
delete dataset;
}
@@ -616,6 +623,9 @@ void SpillTree<MetricType, StatisticType, MatType, SplitType>::
ar & CreateNVP(furthestDescendantDistance, "furthestDescendantDistance");
ar & CreateNVP(dataset, "dataset");
+ if (Archive::is_loading::value && parent == NULL)
+ localDataset = true;
+
// Save children last; otherwise boost::serialization gets confused.
ar & CreateNVP(left, "left");
ar & CreateNVP(right, "right");
More information about the mlpack-git
mailing list