[mlpack-git] master: A first pass at refactoring BinarySpaceTree. (4329aa9)

gitdub at big.cc.gt.atl.ga.us gitdub at big.cc.gt.atl.ga.us
Fri Jul 10 19:00:05 EDT 2015


Repository : https://github.com/mlpack/mlpack

On branch  : master
Link       : https://github.com/mlpack/mlpack/compare/4a97187bbba7ce8a6191b714949dd818ef0f37d2...e5905e62c15d1bcff21e6359b11efcd7ab6d7ca0

>---------------------------------------------------------------

commit 4329aa9cb7a2c771f96c8f301e12b1d7661753f0
Author: ryan <ryan at ratml.org>
Date:   Wed Apr 22 16:53:24 2015 -0400

    A first pass at refactoring BinarySpaceTree.


>---------------------------------------------------------------

4329aa9cb7a2c771f96c8f301e12b1d7661753f0
 .../tree/binary_space_tree/binary_space_tree.hpp   | 78 +++++++++----------
 .../binary_space_tree/binary_space_tree_impl.hpp   | 88 +++++++++++-----------
 2 files changed, 83 insertions(+), 83 deletions(-)

diff --git a/src/mlpack/core/tree/binary_space_tree/binary_space_tree.hpp b/src/mlpack/core/tree/binary_space_tree/binary_space_tree.hpp
index 9af121b..d17532d 100644
--- a/src/mlpack/core/tree/binary_space_tree/binary_space_tree.hpp
+++ b/src/mlpack/core/tree/binary_space_tree/binary_space_tree.hpp
@@ -67,8 +67,9 @@ class BinarySpaceTree
   double furthestDescendantDistance;
   //! The minimum distance from the center to any edge of the bound.
   double minimumBoundDistance;
-  //! The dataset.
-  MatType& dataset;
+  //! The dataset.  If we are the root of the tree, we own the dataset and must
+  //! delete it.
+  MatType* dataset;
 
  public:
   //! So other classes can use TreeType::Mat.
@@ -88,92 +89,95 @@ class BinarySpaceTree
 
   /**
    * Construct this as the root node of a binary space tree using the given
-   * dataset.  This will modify the ordering of the points in the dataset!
+   * dataset.  This will copy the input matrix; if you don't want this, consider
+   * using the constructor that takes an rvalue reference and use std::move().
    *
-   * @param data Dataset to create tree from.  This will be modified!
+   * @param data Dataset to create tree from.  This will be copied!
    * @param maxLeafSize Size of each leaf in the tree.
    */
-  BinarySpaceTree(MatType& data, const size_t maxLeafSize = 20);
+  BinarySpaceTree(const MatType& data, const size_t maxLeafSize = 20);
 
   /**
    * Construct this as the root node of a binary space tree using the given
-   * dataset.  This will modify the ordering of points in the dataset!  A
-   * mapping of the old point indices to the new point indices is filled.
+   * dataset.  This will copy the input matrix and modify its ordering; a
+   * mapping of the old point indices to the new point indices is filled.  If
+   * you don't want the matrix to be copied, consider using the constructor that
+   * takes an rvalue reference and use std::move().
    *
-   * @param data Dataset to create tree from.  This will be modified!
+   * @param data Dataset to create tree from.  This will be copied!
    * @param oldFromNew Vector which will be filled with the old positions for
    *     each new point.
    * @param maxLeafSize Size of each leaf in the tree.
    */
-  BinarySpaceTree(MatType& data,
+  BinarySpaceTree(const MatType& data,
                   std::vector<size_t>& oldFromNew,
                   const size_t maxLeafSize = 20);
 
   /**
    * Construct this as the root node of a binary space tree using the given
-   * dataset.  This will modify the ordering of points in the dataset!  A
+   * dataset.  This will copy the input matrix and modify its ordering; a
    * mapping of the old point indices to the new point indices is filled, as
-   * well as a mapping of the new point indices to the old point indices.
+   * well as a mapping of the new point indices to the old point indices.  If
+   * you don't want the matrix to be copied, consider using the constructor that
+   * takes an rvalue reference and use std::move().
    *
-   * @param data Dataset to create tree from.  This will be modified!
+   * @param data Dataset to create tree from.  This will be copied!
    * @param oldFromNew Vector which will be filled with the old positions for
    *     each new point.
    * @param newFromOld Vector which will be filled with the new positions for
    *     each old point.
    * @param maxLeafSize Size of each leaf in the tree.
    */
-  BinarySpaceTree(MatType& data,
+  BinarySpaceTree(const MatType& data,
                   std::vector<size_t>& oldFromNew,
                   std::vector<size_t>& newFromOld,
                   const size_t maxLeafSize = 20);
 
   /**
-   * Construct this node on a subset of the given matrix, starting at column
-   * begin and using count points.  The ordering of that subset of points
-   * will be modified!  This is used for recursive tree-building by the other
-   * constructors which don't specify point indices.
+   * Construct this node as a child of the given parent, starting at column
+   * begin_in and using count_in points.  The ordering of that subset of points
+   * in the parent's data matrix will be modified!  This is used for recursive
+   * tree-building by the other constructors which don't specify point indices.
    *
-   * @param data Dataset to create tree from.  This will be modified!
+   * @param parent Parent of this node.  Its dataset will be modified!
    * @param begin Index of point to start tree construction with.
    * @param count Number of points to use to construct tree.
    * @param maxLeafSize Size of each leaf in the tree.
    */
-  BinarySpaceTree(MatType& data,
+  BinarySpaceTree(BinarySpaceTree* parent,
                   const size_t begin,
                   const size_t count,
-                  BinarySpaceTree* parent = NULL,
                   const size_t maxLeafSize = 20);
 
   /**
-   * Construct this node on a subset of the given matrix, starting at column
+   * Construct this node as a child of the given parent, starting at column
    * begin_in and using count_in points.  The ordering of that subset of points
-   * will be modified!  This is used for recursive tree-building by the other
-   * constructors which don't specify point indices.
+   * in the parent's data matrix will be modified!  This is used for recursive
+   * tree-building by the other constructors which don't specify point indices.
    *
    * A mapping of the old point indices to the new point indices is filled, but
    * it is expected that the vector is already allocated with size greater than
    * or equal to (begin_in + count_in), and if that is not true, invalid memory
    * reads (and writes) will occur.
    *
-   * @param data Dataset to create tree from.  This will be modified!
+   * @param parent Parent of this node.  Its dataset will be modified!
    * @param begin Index of point to start tree construction with.
    * @param count Number of points to use to construct tree.
    * @param oldFromNew Vector which will be filled with the old positions for
    *     each new point.
    * @param maxLeafSize Size of each leaf in the tree.
    */
-  BinarySpaceTree(MatType& data,
+  BinarySpaceTree(BinarySpaceTree* parent,
                   const size_t begin,
                   const size_t count,
                   std::vector<size_t>& oldFromNew,
-                  BinarySpaceTree* parent = NULL,
                   const size_t maxLeafSize = 20);
 
   /**
-   * Construct this node on a subset of the given matrix, starting at column
+   * Construct this node as a child of the given parent, starting at column
    * begin_in and using count_in points.  The ordering of that subset of points
-   * will be modified!  This is used for recursive tree-building by the other
-   * constructors which don't specify point indices.
+   * in the parent's data matrix will be modified!  This is used for recursive
+   * tree-building by the other constructors which don't specify point indices.
    *
    * A mapping of the old point indices to the new point indices is filled, as
    * well as a mapping of the new point indices to the old point indices.  It is
@@ -181,7 +185,7 @@ class BinarySpaceTree
    * equal to (begin_in + count_in), and if that is not true, invalid memory
    * reads (and writes) will occur.
    *
-   * @param data Dataset to create tree from.  This will be modified!
+   * @param parent Parent of this node.  Its dataset will be modified!
    * @param begin Index of point to start tree construction with.
    * @param count Number of points to use to construct tree.
    * @param oldFromNew Vector which will be filled with the old positions for
@@ -190,12 +194,11 @@ class BinarySpaceTree
    *     each old point.
    * @param maxLeafSize Size of each leaf in the tree.
    */
-  BinarySpaceTree(MatType& data,
+  BinarySpaceTree(BinarySpaceTree* parent,
                   const size_t begin,
                   const size_t count,
                   std::vector<size_t>& oldFromNew,
                   std::vector<size_t>& newFromOld,
-                  BinarySpaceTree* parent = NULL,
                   const size_t maxLeafSize = 20);
 
   /**
@@ -269,9 +272,9 @@ class BinarySpaceTree
   BinarySpaceTree*& Parent() { return parent; }
 
   //! Get the dataset which the tree is built on.
-  const MatType& Dataset() const { return dataset; }
+  const MatType& Dataset() const { return *dataset; }
   //! Modify the dataset which the tree is built on.  Be careful!
-  MatType& Dataset() { return dataset; }
+  MatType& Dataset() { return *dataset; }
 
   //! Get the metric which the tree uses.
   typename BoundType::MetricType Metric() const { return bound.Metric(); }
@@ -425,11 +428,9 @@ class BinarySpaceTree
   /**
    * Splits the current node, assigning its left and right children recursively.
    *
-   * @param data Dataset which we are using.
    * @param maxLeafSize Maximum number of points held in a leaf.
    */
-  void SplitNode(MatType& data,
-                 const size_t maxLeafSize);
+  void SplitNode(const size_t maxLeafSize);
 
   /**
    * Splits the current node, assigning its left and right children recursively.
@@ -439,8 +440,7 @@ class BinarySpaceTree
    * @param oldFromNew Vector holding permuted indices.
    * @param maxLeafSize Maximum number of points held in a leaf.
    */
-  void SplitNode(MatType& data,
-                 std::vector<size_t>& oldFromNew,
+  void SplitNode(std::vector<size_t>& oldFromNew,
                  const size_t maxLeafSize);
 
  public:
diff --git a/src/mlpack/core/tree/binary_space_tree/binary_space_tree_impl.hpp b/src/mlpack/core/tree/binary_space_tree/binary_space_tree_impl.hpp
index 9764578..ebc4c23 100644
--- a/src/mlpack/core/tree/binary_space_tree/binary_space_tree_impl.hpp
+++ b/src/mlpack/core/tree/binary_space_tree/binary_space_tree_impl.hpp
@@ -23,7 +23,7 @@ template<typename BoundType,
          typename MatType,
          typename SplitType>
 BinarySpaceTree<BoundType, StatisticType, MatType, SplitType>::BinarySpaceTree(
-    MatType& data,
+    const MatType& data,
     const size_t maxLeafSize) :
     left(NULL),
     right(NULL),
@@ -32,10 +32,10 @@ BinarySpaceTree<BoundType, StatisticType, MatType, SplitType>::BinarySpaceTree(
     count(data.n_cols), /* and spans all of the dataset. */
     bound(data.n_rows),
     parentDistance(0), // Parent distance for the root is 0: it has no parent.
-    dataset(data)
+    dataset(new MatType(data)) // Copies the dataset.
 {
   // Do the actual splitting of this node.
-  SplitNode(data, maxLeafSize);
+  SplitNode(maxLeafSize);
 
   // Create the statistic depending on if we are a leaf or not.
   stat = StatisticType(*this);
@@ -46,7 +46,7 @@ template<typename BoundType,
          typename MatType,
          typename SplitType>
 BinarySpaceTree<BoundType, StatisticType, MatType, SplitType>::BinarySpaceTree(
-    MatType& data,
+    const MatType& data,
     std::vector<size_t>& oldFromNew,
     const size_t maxLeafSize) :
     left(NULL),
@@ -56,7 +56,7 @@ BinarySpaceTree<BoundType, StatisticType, MatType, SplitType>::BinarySpaceTree(
     count(data.n_cols),
     bound(data.n_rows),
     parentDistance(0), // Parent distance for the root is 0: it has no parent.
-    dataset(data)
+    dataset(new MatType(data)) // Copies the dataset.
 {
   // Initialize oldFromNew correctly.
   oldFromNew.resize(data.n_cols);
@@ -64,7 +64,7 @@ BinarySpaceTree<BoundType, StatisticType, MatType, SplitType>::BinarySpaceTree(
     oldFromNew[i] = i; // Fill with unharmed indices.
 
   // Now do the actual splitting.
-  SplitNode(data, oldFromNew, maxLeafSize);
+  SplitNode(oldFromNew, maxLeafSize);
 
   // Create the statistic depending on if we are a leaf or not.
   stat = StatisticType(*this);
@@ -75,7 +75,7 @@ template<typename BoundType,
          typename MatType,
          typename SplitType>
 BinarySpaceTree<BoundType, StatisticType, MatType, SplitType>::BinarySpaceTree(
-    MatType& data,
+    const MatType& data,
     std::vector<size_t>& oldFromNew,
     std::vector<size_t>& newFromOld,
     const size_t maxLeafSize) :
@@ -86,7 +86,7 @@ BinarySpaceTree<BoundType, StatisticType, MatType, SplitType>::BinarySpaceTree(
     count(data.n_cols),
     bound(data.n_rows),
     parentDistance(0), // Parent distance for the root is 0: it has no parent.
-    dataset(data)
+    dataset(new MatType(data)) // Copies the dataset.
 {
   // Initialize the oldFromNew vector correctly.
   oldFromNew.resize(data.n_cols);
@@ -94,7 +94,7 @@ BinarySpaceTree<BoundType, StatisticType, MatType, SplitType>::BinarySpaceTree(
     oldFromNew[i] = i; // Fill with unharmed indices.
 
   // Now do the actual splitting.
-  SplitNode(data, oldFromNew, maxLeafSize);
+  SplitNode(oldFromNew, maxLeafSize);
 
   // Create the statistic depending on if we are a leaf or not.
   stat = StatisticType(*this);
@@ -110,21 +110,20 @@ template<typename BoundType,
          typename MatType,
          typename SplitType>
 BinarySpaceTree<BoundType, StatisticType, MatType, SplitType>::BinarySpaceTree(
-    MatType& data,
+    BinarySpaceTree* parent,
     const size_t begin,
     const size_t count,
-    BinarySpaceTree* parent,
     const size_t maxLeafSize) :
     left(NULL),
     right(NULL),
     parent(parent),
     begin(begin),
     count(count),
-    bound(data.n_rows),
-    dataset(data)
+    bound(parent->Dataset().n_rows),
+    dataset(&parent->Dataset()) // Point to the parent's dataset.
 {
   // Perform the actual splitting.
-  SplitNode(data, maxLeafSize);
+  SplitNode(maxLeafSize);
 
   // Create the statistic depending on if we are a leaf or not.
   stat = StatisticType(*this);
@@ -135,26 +134,25 @@ template<typename BoundType,
          typename MatType,
          typename SplitType>
 BinarySpaceTree<BoundType, StatisticType, MatType, SplitType>::BinarySpaceTree(
-    MatType& data,
+    BinarySpaceTree* parent,
     const size_t begin,
     const size_t count,
     std::vector<size_t>& oldFromNew,
-    BinarySpaceTree* parent,
     const size_t maxLeafSize) :
     left(NULL),
     right(NULL),
     parent(parent),
     begin(begin),
     count(count),
-    bound(data.n_rows),
-    dataset(data)
+    bound(parent->Dataset().n_rows),
+    dataset(&parent->Dataset())
 {
   // Hopefully the vector is initialized correctly!  We can't check that
   // entirely but we can do a minor sanity check.
-  assert(oldFromNew.size() == data.n_cols);
+  assert(oldFromNew.size() == dataset->n_cols);
 
   // Perform the actual splitting.
-  SplitNode(data, oldFromNew, maxLeafSize);
+  SplitNode(oldFromNew, maxLeafSize);
 
   // Create the statistic depending on if we are a leaf or not.
   stat = StatisticType(*this);
@@ -165,34 +163,33 @@ template<typename BoundType,
          typename MatType,
          typename SplitType>
 BinarySpaceTree<BoundType, StatisticType, MatType, SplitType>::BinarySpaceTree(
-    MatType& data,
+    BinarySpaceTree* parent,
     const size_t begin,
     const size_t count,
     std::vector<size_t>& oldFromNew,
     std::vector<size_t>& newFromOld,
-    BinarySpaceTree* parent,
     const size_t maxLeafSize) :
     left(NULL),
     right(NULL),
     parent(parent),
     begin(begin),
     count(count),
-    bound(data.n_rows),
-    dataset(data)
+    bound(parent->Dataset()->n_rows),
+    dataset(&parent->Dataset())
 {
   // Hopefully the vector is initialized correctly!  We can't check that
   // entirely but we can do a minor sanity check.
-  Log::Assert(oldFromNew.size() == data.n_cols);
+  Log::Assert(oldFromNew.size() == dataset->n_cols);
 
   // Perform the actual splitting.
-  SplitNode(data, oldFromNew, maxLeafSize);
+  SplitNode(oldFromNew, maxLeafSize);
 
   // Create the statistic depending on if we are a leaf or not.
   stat = StatisticType(*this);
 
   // Map the newFromOld indices correctly.
-  newFromOld.resize(data.n_cols);
-  for (size_t i = 0; i < data.n_cols; i++)
+  newFromOld.resize(dataset->n_cols);
+  for (size_t i = 0; i < dataset->n_cols; i++)
     newFromOld[oldFromNew[i]] = i;
 }
 
@@ -215,7 +212,7 @@ BinarySpaceTree<BoundType, StatisticType, MatType, SplitType>::BinarySpaceTree(
     stat(other.stat),
     parentDistance(other.parentDistance),
     furthestDescendantDistance(other.furthestDescendantDistance),
-    dataset(other.dataset)
+    dataset(new MatType(*other.dataset)) // Copy matrix.
 {
   // Create left and right children (if any).
   if (other.Left())
@@ -247,6 +244,10 @@ BinarySpaceTree<BoundType, StatisticType, MatType, SplitType>::
     delete left;
   if (right)
     delete right;
+
+  // If we're the root, delete the matrix.
+  if (!parent)
+    delete dataset;
 }
 
 /**
@@ -514,11 +515,10 @@ template<typename BoundType,
          typename MatType,
          typename SplitType>
 void BinarySpaceTree<BoundType, StatisticType, MatType, SplitType>::SplitNode(
-    MatType& data,
     const size_t maxLeafSize)
 {
   // We need to expand the bounds of this node properly.
-  bound |= data.cols(begin, begin + count - 1);
+  bound |= dataset->cols(begin, begin + count - 1);
 
   // Calculate the furthest descendant distance.
   furthestDescendantDistance = 0.5 * bound.Diameter();
@@ -534,7 +534,8 @@ void BinarySpaceTree<BoundType, StatisticType, MatType, SplitType>::SplitNode(
 
   // Split the node. The elements of 'data' are reordered by the splitting
   // algorithm. This function call updates splitCol.
-  const bool split = SplitType::SplitNode(bound, data, begin, count, splitCol);
+  const bool split = SplitType::SplitNode(bound, *dataset, begin, count,
+      splitCol);
 
   // The node may not be always split. For instance, if all the points are the
   // same, we can't split them.
@@ -543,10 +544,10 @@ void BinarySpaceTree<BoundType, StatisticType, MatType, SplitType>::SplitNode(
 
   // Now that we know the split column, we will recursively split the children
   // by calling their constructors (which perform this splitting process).
-  left = new BinarySpaceTree<BoundType, StatisticType, MatType>(data, begin,
-      splitCol - begin, this, maxLeafSize);
-  right = new BinarySpaceTree<BoundType, StatisticType, MatType>(data, splitCol,
-      begin + count - splitCol, this, maxLeafSize);
+  left = new BinarySpaceTree<BoundType, StatisticType, MatType>(this, begin,
+      splitCol - begin, maxLeafSize);
+  right = new BinarySpaceTree<BoundType, StatisticType, MatType>(this, splitCol,
+      begin + count - splitCol, maxLeafSize);
 
   // Calculate parent distances for those two nodes.
   arma::vec centroid, leftCentroid, rightCentroid;
@@ -568,13 +569,12 @@ template<typename BoundType,
          typename MatType,
          typename SplitType>
 void BinarySpaceTree<BoundType, StatisticType, MatType, SplitType>::SplitNode(
-    MatType& data,
     std::vector<size_t>& oldFromNew,
     const size_t maxLeafSize)
 {
   // This should be a single function for Bound.
   // We need to expand the bounds of this node properly.
-  bound |= data.cols(begin, begin + count - 1);
+  bound |= dataset->cols(begin, begin + count - 1);
 
   // Calculate the furthest descendant distance.
   furthestDescendantDistance = 0.5 * bound.Diameter();
@@ -590,8 +590,8 @@ void BinarySpaceTree<BoundType, StatisticType, MatType, SplitType>::SplitNode(
 
   // Split the node. The elements of 'data' are reordered by the splitting
   // algorithm. This function call updates splitCol and oldFromNew.
-  const bool split = SplitType::SplitNode(bound, data, begin, count, splitCol,
-      oldFromNew);
+  const bool split = SplitType::SplitNode(bound, *dataset, begin, count,
+      splitCol, oldFromNew);
 
   // The node may not be always split. For instance, if all the points are the
   // same, we can't split them.
@@ -600,10 +600,10 @@ void BinarySpaceTree<BoundType, StatisticType, MatType, SplitType>::SplitNode(
 
   // Now that we know the split column, we will recursively split the children
   // by calling their constructors (which perform this splitting process).
-  left = new BinarySpaceTree<BoundType, StatisticType, MatType>(data, begin,
-      splitCol - begin, oldFromNew, this, maxLeafSize);
-  right = new BinarySpaceTree<BoundType, StatisticType, MatType>(data, splitCol,
-      begin + count - splitCol, oldFromNew, this, maxLeafSize);
+  left = new BinarySpaceTree<BoundType, StatisticType, MatType>(this, begin,
+      splitCol - begin, oldFromNew, maxLeafSize);
+  right = new BinarySpaceTree<BoundType, StatisticType, MatType>(this, splitCol,
+      begin + count - splitCol, oldFromNew, maxLeafSize);
 
   // Calculate parent distances for those two nodes.
   arma::vec centroid, leftCentroid, rightCentroid;



More information about the mlpack-git mailing list