[mlpack-git] master: Remove unnecessary copying of dataset in SpillTrees. (498e74d)

gitdub at mlpack.org gitdub at mlpack.org
Thu Aug 18 13:39:15 EDT 2016


Repository : https://github.com/mlpack/mlpack
On branch  : master
Link       : https://github.com/mlpack/mlpack/compare/0f4b25acd6aaa14294c044874ba6cc0751712baa...0a19d07bd39e6223991976474bc79671ba8aa0f0

>---------------------------------------------------------------

commit 498e74d8ad71bbee84ec570b9b12381e0a8b5a63
Author: MarcosPividori <marcos.pividori at gmail.com>
Date:   Thu Jul 28 12:24:22 2016 -0300

    Remove unnecessary copying of dataset in SpillTrees.


>---------------------------------------------------------------

498e74d8ad71bbee84ec570b9b12381e0a8b5a63
 src/mlpack/core/tree/spill_tree/spill_tree.hpp     | 10 ++++----
 .../core/tree/spill_tree/spill_tree_impl.hpp       | 28 +++++++++++++++-------
 2 files changed, 24 insertions(+), 14 deletions(-)

diff --git a/src/mlpack/core/tree/spill_tree/spill_tree.hpp b/src/mlpack/core/tree/spill_tree/spill_tree.hpp
index 270a9b9..c48fbd5 100644
--- a/src/mlpack/core/tree/spill_tree/spill_tree.hpp
+++ b/src/mlpack/core/tree/spill_tree/spill_tree.hpp
@@ -103,7 +103,9 @@ class SpillTree
   ElemType minimumBoundDistance;
   //! The dataset.  If we are the root of the tree, we own the dataset and must
   //! delete it.
-  MatType* dataset;
+  const MatType* dataset;
+  //! If true, we own the dataset and need to destroy it in the destructor.
+  bool localDataset;
 
  public:
   //! A single-tree traverser for hybrid spill trees; see
@@ -117,8 +119,8 @@ class SpillTree
 
   /**
    * Construct this as the root node of a hybrid spill tree using the given
-   * dataset.  This will copy the input matrix; if you don't want this, consider
-   * using the constructor that takes an rvalue reference and use std::move().
+   * dataset.  The dataset will not be modified during the building procedure
+   * (unlike BinarySpaceTree).
    *
    * @param data Dataset to create tree from.  This will be copied!
    * @param tau Overlapping size.
@@ -229,8 +231,6 @@ class SpillTree
 
   //! Get the dataset which the tree is built on.
   const MatType& Dataset() const { return *dataset; }
-  //! Modify the dataset which the tree is built on.  Be careful!
-  MatType& Dataset() { return *dataset; }
 
   //! Distinguish overlapping nodes from non-overlapping nodes.
   bool Overlap() const { return overlappingNode; }
diff --git a/src/mlpack/core/tree/spill_tree/spill_tree_impl.hpp b/src/mlpack/core/tree/spill_tree/spill_tree_impl.hpp
index cf8d5e9..b4b6557 100644
--- a/src/mlpack/core/tree/spill_tree/spill_tree_impl.hpp
+++ b/src/mlpack/core/tree/spill_tree/spill_tree_impl.hpp
@@ -33,7 +33,8 @@ SpillTree(
     splitValue(0),
     bound(data.n_rows),
     parentDistance(0), // Parent distance for the root is 0: it has no parent.
-    dataset(new MatType(data)) // Copies the dataset.
+    dataset(&data),
+    localDataset(false)
 {
   std::vector<size_t> points;
   points.reserve(dataset->n_cols);
@@ -68,7 +69,8 @@ SpillTree(
     splitValue(0),
     bound(data.n_rows),
     parentDistance(0), // Parent distance for the root is 0: it has no parent.
-    dataset(new MatType(std::move(data)))
+    dataset(new MatType(std::move(data))),
+    localDataset(true)
 {
   std::vector<size_t> points;
   points.reserve(dataset->n_cols);
@@ -104,7 +106,8 @@ SpillTree(
     splitDimension(0),
     splitValue(0),
     bound(parent->Dataset().n_rows),
-    dataset(&parent->Dataset()) // Point to the parent's dataset.
+    dataset(&parent->Dataset()), // Point to the parent's dataset.
+    localDataset(false)
 {
   // Perform the actual splitting.
   SplitNode(points, overlapIndex, maxLeafSize, tau, rho);
@@ -137,7 +140,8 @@ SpillTree(const SpillTree& other) :
     parentDistance(other.parentDistance),
     furthestDescendantDistance(other.furthestDescendantDistance),
     // Copy matrix, but only if we are the root.
-    dataset((other.parent == NULL) ? new MatType(*other.dataset) : NULL)
+    dataset((other.parent == NULL) ? new MatType(*other.dataset) : NULL),
+    localDataset(other.parent == NULL)
 {
   // Create left and right children (if any).
   if (other.Left())
@@ -201,7 +205,8 @@ SpillTree(SpillTree&& other) :
     parentDistance(other.parentDistance),
     furthestDescendantDistance(other.furthestDescendantDistance),
     minimumBoundDistance(other.minimumBoundDistance),
-    dataset(other.dataset)
+    dataset(other.dataset),
+    localDataset(other.localDataset)
 {
   // Now we are a clone of the other tree.  But we must also clear the other
   // tree's contents, so it doesn't delete anything when it is destructed.
@@ -213,6 +218,7 @@ SpillTree(SpillTree&& other) :
   other.furthestDescendantDistance = 0.0;
   other.minimumBoundDistance = 0.0;
   other.dataset = NULL;
+  other.localDataset = false;
 }
 
 /**
@@ -252,8 +258,8 @@ SpillTree<MetricType, StatisticType, MatType, SplitType>::
   delete right;
   delete pointsIndex;
 
-  // If we're the root, delete the matrix.
-  if (!parent)
+  // If we're the root and we own the dataset, delete it.
+  if (!parent && localDataset)
     delete dataset;
 }
 
@@ -576,7 +582,8 @@ SpillTree<MetricType, StatisticType, MatType, SplitType>::
     stat(*this),
     parentDistance(0),
     furthestDescendantDistance(0),
-    dataset(NULL)
+    dataset(NULL),
+    localDataset(false)
 {
   // Nothing to do.
 }
@@ -602,7 +609,7 @@ void SpillTree<MetricType, StatisticType, MatType, SplitType>::
       delete left;
     if (right)
       delete right;
-    if (!parent)
+    if (!parent && localDataset)
       delete dataset;
   }
 
@@ -616,6 +623,9 @@ void SpillTree<MetricType, StatisticType, MatType, SplitType>::
   ar & CreateNVP(furthestDescendantDistance, "furthestDescendantDistance");
   ar & CreateNVP(dataset, "dataset");
 
+  if (Archive::is_loading::value && parent == NULL)
+    localDataset = true;
+
   // Save children last; otherwise boost::serialization gets confused.
   ar & CreateNVP(left, "left");
   ar & CreateNVP(right, "right");




More information about the mlpack-git mailing list