[mlpack-git] master: Remove unnecessary splitDimension and maxLeafSize. This results in a pretty trivial speedup (like, 0.5%?), but it makes the tree require less memory. Also I've removed a couple of entirely unused functions. (46e038c)

gitdub at big.cc.gt.atl.ga.us gitdub at big.cc.gt.atl.ga.us
Tue Apr 7 17:27:56 EDT 2015


Repository : https://github.com/mlpack/mlpack

On branch  : master
Link       : https://github.com/mlpack/mlpack/compare/81fe638aa0c8d9592bbf7b434110140eb9bb86e7...46e038cdb082ccbd44e0583e1bae6d6da03e26d1

>---------------------------------------------------------------

commit 46e038cdb082ccbd44e0583e1bae6d6da03e26d1
Author: ryan <ryan at ratml.org>
Date:   Tue Apr 7 17:27:12 2015 -0400

    Remove unnecessary splitDimension and maxLeafSize.
    This results in a pretty trivial speedup (like, 0.5%?), but it makes the tree require less memory.
    Also I've removed a couple of entirely unused functions.


>---------------------------------------------------------------

46e038cdb082ccbd44e0583e1bae6d6da03e26d1
 .../tree/binary_space_tree/binary_space_tree.hpp   | 53 ++------------
 .../binary_space_tree/binary_space_tree_impl.hpp   | 83 +++++-----------------
 .../core/tree/binary_space_tree/mean_split.hpp     |  2 -
 .../tree/binary_space_tree/mean_split_impl.hpp     |  6 +-
 src/mlpack/methods/neighbor_search/allknn_main.cpp |  7 ++
 5 files changed, 32 insertions(+), 119 deletions(-)

diff --git a/src/mlpack/core/tree/binary_space_tree/binary_space_tree.hpp b/src/mlpack/core/tree/binary_space_tree/binary_space_tree.hpp
index 6f98c6c..9af121b 100644
--- a/src/mlpack/core/tree/binary_space_tree/binary_space_tree.hpp
+++ b/src/mlpack/core/tree/binary_space_tree/binary_space_tree.hpp
@@ -56,14 +56,10 @@ class BinarySpaceTree
   //! The number of points of the dataset contained in this node (and its
   //! children).
   size_t count;
-  //! The max leaf size.
-  size_t maxLeafSize;
   //! The bound object for this node.
   BoundType bound;
   //! Any extra data contained in the node.
   StatisticType stat;
-  //! The dimension this node split on if it is a parent.
-  size_t splitDimension;
   //! The distance from the centroid of this node to the centroid of the parent.
   double parentDistance;
   //! The worst possible distance to the furthest descendant, cached to speed
@@ -257,14 +253,6 @@ class BinarySpaceTree
   //! Return whether or not this node is a leaf (true if it has no children).
   bool IsLeaf() const;
 
-  //! Return the max leaf size.
-  size_t MaxLeafSize() const { return maxLeafSize; }
-  //! Modify the max leaf size.
-  size_t& MaxLeafSize() { return maxLeafSize; }
-
-  //! Fills the tree to the specified level.
-  size_t ExtendTree(const size_t level);
-
   //! Gets the left child of this node.
   BinarySpaceTree* Left() const { return left; }
   //! Modify the left child of this node.
@@ -280,11 +268,6 @@ class BinarySpaceTree
   //! Modify the parent of this node.
   BinarySpaceTree*& Parent() { return parent; }
 
-  //! Get the split dimension for this node.
-  size_t SplitDimension() const { return splitDimension; }
-  //! Modify the split dimension for this node.
-  size_t& SplitDimension() { return splitDimension; }
-
   //! Get the dataset which the tree is built on.
   const MatType& Dataset() const { return dataset; }
   //! Modify the dataset which the tree is built on.  Be careful!
@@ -410,11 +393,6 @@ class BinarySpaceTree
   }
 
   /**
-  * Returns the dimension this parent's children are split on.
-  */
-  size_t GetSplitDimension() const;
-
-  /**
    * Obtains the number of nodes in the tree, starting with this.
    */
   size_t TreeSize() const;
@@ -445,33 +423,13 @@ class BinarySpaceTree
 
  private:
   /**
-   * Private copy constructor, available only to fill (pad) the tree to a
-   * specified level.
-   */
-  BinarySpaceTree(const size_t begin,
-                  const size_t count,
-                  BoundType bound,
-                  StatisticType stat,
-                  const int maxLeafSize = 20) :
-      left(NULL),
-      right(NULL),
-      begin(begin),
-      count(count),
-      bound(bound),
-      stat(stat),
-      maxLeafSize(maxLeafSize) { }
-
-  BinarySpaceTree* CopyMe()
-  {
-    return new BinarySpaceTree(begin, count, bound, stat, maxLeafSize);
-  }
-
-  /**
    * Splits the current node, assigning its left and right children recursively.
    *
    * @param data Dataset which we are using.
+   * @param maxLeafSize Maximum number of points held in a leaf.
    */
-  void SplitNode(MatType& data);
+  void SplitNode(MatType& data,
+                 const size_t maxLeafSize);
 
   /**
    * Splits the current node, assigning its left and right children recursively.
@@ -479,8 +437,11 @@ class BinarySpaceTree
    *
    * @param data Dataset which we are using.
    * @param oldFromNew Vector holding permuted indices.
+   * @param maxLeafSize Maximum number of points held in a leaf.
    */
-  void SplitNode(MatType& data, std::vector<size_t>& oldFromNew);
+  void SplitNode(MatType& data,
+                 std::vector<size_t>& oldFromNew,
+                 const size_t maxLeafSize);
 
  public:
   /**
diff --git a/src/mlpack/core/tree/binary_space_tree/binary_space_tree_impl.hpp b/src/mlpack/core/tree/binary_space_tree/binary_space_tree_impl.hpp
index 35db29a..a068899 100644
--- a/src/mlpack/core/tree/binary_space_tree/binary_space_tree_impl.hpp
+++ b/src/mlpack/core/tree/binary_space_tree/binary_space_tree_impl.hpp
@@ -30,13 +30,12 @@ BinarySpaceTree<BoundType, StatisticType, MatType, SplitType>::BinarySpaceTree(
     parent(NULL),
     begin(0), /* This root node starts at index 0, */
     count(data.n_cols), /* and spans all of the dataset. */
-    maxLeafSize(maxLeafSize),
     bound(data.n_rows),
     parentDistance(0), // Parent distance for the root is 0: it has no parent.
     dataset(data)
 {
   // Do the actual splitting of this node.
-  SplitNode(data);
+  SplitNode(data, maxLeafSize);
 
   // Create the statistic depending on if we are a leaf or not.
   stat = StatisticType(*this);
@@ -55,7 +54,6 @@ BinarySpaceTree<BoundType, StatisticType, MatType, SplitType>::BinarySpaceTree(
     parent(NULL),
     begin(0),
     count(data.n_cols),
-    maxLeafSize(maxLeafSize),
     bound(data.n_rows),
     parentDistance(0), // Parent distance for the root is 0: it has no parent.
     dataset(data)
@@ -66,7 +64,7 @@ BinarySpaceTree<BoundType, StatisticType, MatType, SplitType>::BinarySpaceTree(
     oldFromNew[i] = i; // Fill with unharmed indices.
 
   // Now do the actual splitting.
-  SplitNode(data, oldFromNew);
+  SplitNode(data, oldFromNew, maxLeafSize);
 
   // Create the statistic depending on if we are a leaf or not.
   stat = StatisticType(*this);
@@ -86,7 +84,6 @@ BinarySpaceTree<BoundType, StatisticType, MatType, SplitType>::BinarySpaceTree(
     parent(NULL),
     begin(0),
     count(data.n_cols),
-    maxLeafSize(maxLeafSize),
     bound(data.n_rows),
     parentDistance(0), // Parent distance for the root is 0: it has no parent.
     dataset(data)
@@ -97,7 +94,7 @@ BinarySpaceTree<BoundType, StatisticType, MatType, SplitType>::BinarySpaceTree(
     oldFromNew[i] = i; // Fill with unharmed indices.
 
   // Now do the actual splitting.
-  SplitNode(data, oldFromNew);
+  SplitNode(data, oldFromNew, maxLeafSize);
 
   // Create the statistic depending on if we are a leaf or not.
   stat = StatisticType(*this);
@@ -123,12 +120,11 @@ BinarySpaceTree<BoundType, StatisticType, MatType, SplitType>::BinarySpaceTree(
     parent(parent),
     begin(begin),
     count(count),
-    maxLeafSize(maxLeafSize),
     bound(data.n_rows),
     dataset(data)
 {
   // Perform the actual splitting.
-  SplitNode(data);
+  SplitNode(data, maxLeafSize);
 
   // Create the statistic depending on if we are a leaf or not.
   stat = StatisticType(*this);
@@ -150,7 +146,6 @@ BinarySpaceTree<BoundType, StatisticType, MatType, SplitType>::BinarySpaceTree(
     parent(parent),
     begin(begin),
     count(count),
-    maxLeafSize(maxLeafSize),
     bound(data.n_rows),
     dataset(data)
 {
@@ -159,7 +154,7 @@ BinarySpaceTree<BoundType, StatisticType, MatType, SplitType>::BinarySpaceTree(
   assert(oldFromNew.size() == data.n_cols);
 
   // Perform the actual splitting.
-  SplitNode(data, oldFromNew);
+  SplitNode(data, oldFromNew, maxLeafSize);
 
   // Create the statistic depending on if we are a leaf or not.
   stat = StatisticType(*this);
@@ -182,7 +177,6 @@ BinarySpaceTree<BoundType, StatisticType, MatType, SplitType>::BinarySpaceTree(
     parent(parent),
     begin(begin),
     count(count),
-    maxLeafSize(maxLeafSize),
     bound(data.n_rows),
     dataset(data)
 {
@@ -191,7 +185,7 @@ BinarySpaceTree<BoundType, StatisticType, MatType, SplitType>::BinarySpaceTree(
   Log::Assert(oldFromNew.size() == data.n_cols);
 
   // Perform the actual splitting.
-  SplitNode(data, oldFromNew);
+  SplitNode(data, oldFromNew, maxLeafSize);
 
   // Create the statistic depending on if we are a leaf or not.
   stat = StatisticType(*this);
@@ -202,21 +196,6 @@ BinarySpaceTree<BoundType, StatisticType, MatType, SplitType>::BinarySpaceTree(
     newFromOld[oldFromNew[i]] = i;
 }
 
-/*
-template<typename BoundType, typename StatisticType, typename MatType>
-BinarySpaceTree<BoundType, StatisticType, MatType>::BinarySpaceTree() :
-    left(NULL),
-    right(NULL),
-    parent(NULL),
-    begin(0),
-    count(0),
-    bound(),
-    stat(),
-    maxLeafSize(20) // Default max leaf size is 20.
-{
-  // Nothing to do.
-}*/
-
 /**
  * Create a binary space tree by copying the other tree.  Be careful!  This can
  * take a long time and use a lot of memory.
@@ -232,10 +211,8 @@ BinarySpaceTree<BoundType, StatisticType, MatType, SplitType>::BinarySpaceTree(
     parent(other.parent),
     begin(other.begin),
     count(other.count),
-    maxLeafSize(other.maxLeafSize),
     bound(other.bound),
     stat(other.stat),
-    splitDimension(other.splitDimension),
     parentDistance(other.parentDistance),
     furthestDescendantDistance(other.furthestDescendantDistance),
     dataset(other.dataset)
@@ -340,32 +317,6 @@ BinarySpaceTree<BoundType, StatisticType, MatType, SplitType>::FindByBeginCount(
     return NULL;
 }
 
-template<typename BoundType,
-         typename StatisticType,
-         typename MatType,
-         typename SplitType>
-size_t BinarySpaceTree<BoundType, StatisticType, MatType, SplitType>::ExtendTree(
-    size_t level)
-{
-  --level;
-  // Return the number of nodes duplicated.
-  size_t nodesDuplicated = 0;
-  if (level > 0)
-  {
-    if (!left)
-    {
-      left = CopyMe();
-      ++nodesDuplicated;
-    }
-    nodesDuplicated += left->ExtendTree(level);
-    if (right)
-    {
-      nodesDuplicated += right->ExtendTree(level);
-    }
-  }
-  return nodesDuplicated;
-}
-
 /* TODO: we can likely calculate this earlier, then store the
  *   result in a private member variable; for now, we can
  *   just calculate as needed...
@@ -563,7 +514,8 @@ template<typename BoundType,
          typename MatType,
          typename SplitType>
 void BinarySpaceTree<BoundType, StatisticType, MatType, SplitType>::SplitNode(
-    MatType& data)
+    MatType& data,
+    const size_t maxLeafSize)
 {
   // We need to expand the bounds of this node properly.
   bound |= data.cols(begin, begin + count - 1);
@@ -581,9 +533,8 @@ void BinarySpaceTree<BoundType, StatisticType, MatType, SplitType>::SplitNode(
   size_t splitCol;
 
   // Split the node. The elements of 'data' are reordered by the splitting
-  // algorithm. This function call updates splitDimension and splitCol.
-  const bool split = SplitType::SplitNode(bound, data, begin, count,
-      splitDimension, splitCol);
+  // algorithm. This function call updates splitCol.
+  const bool split = SplitType::SplitNode(bound, data, begin, count, splitCol);
 
   // The node may not be always split. For instance, if all the points are the
   // same, we can't split them.
@@ -618,7 +569,8 @@ template<typename BoundType,
          typename SplitType>
 void BinarySpaceTree<BoundType, StatisticType, MatType, SplitType>::SplitNode(
     MatType& data,
-    std::vector<size_t>& oldFromNew)
+    std::vector<size_t>& oldFromNew,
+    const size_t maxLeafSize)
 {
   // This should be a single function for Bound.
   // We need to expand the bounds of this node properly.
@@ -637,10 +589,9 @@ void BinarySpaceTree<BoundType, StatisticType, MatType, SplitType>::SplitNode(
   size_t splitCol;
 
   // Split the node. The elements of 'data' are reordered by the splitting
-  // algorithm. This function call updates splitDimension, splitCol and
-  // oldFromNew.
-  const bool split = SplitType::SplitNode(bound, data, begin, count,
-      splitDimension, splitCol, oldFromNew);
+  // algorithm. This function call updates splitCol and oldFromNew.
+  const bool split = SplitType::SplitNode(bound, data, begin, count, splitCol,
+      oldFromNew);
 
   // The node may not be always split. For instance, if all the points are the
   // same, we can't split them.
@@ -686,9 +637,7 @@ std::string BinarySpaceTree<BoundType, StatisticType, MatType, SplitType>::
   convert << "  Bound: " << std::endl;
   convert << mlpack::util::Indent(bound.ToString(), 2);
   convert << "  Statistic: " << std::endl;
-  convert << mlpack::util::Indent(stat.ToString(), 2);
-  convert << "  Max leaf size: " << maxLeafSize << std::endl;
-  convert << "  Split dimension: " << splitDimension << std::endl;
+  convert << mlpack::util::Indent(stat->ToString(), 2);
 
   // How many levels should we print?  This will print the top two tree levels.
   if (left != NULL && parent == NULL)
diff --git a/src/mlpack/core/tree/binary_space_tree/mean_split.hpp b/src/mlpack/core/tree/binary_space_tree/mean_split.hpp
index c6e3c04..219cc8c 100644
--- a/src/mlpack/core/tree/binary_space_tree/mean_split.hpp
+++ b/src/mlpack/core/tree/binary_space_tree/mean_split.hpp
@@ -41,7 +41,6 @@ class MeanSplit
                         MatType& data,
                         const size_t begin,
                         const size_t count,
-                        size_t& splitDimension,
                         size_t& splitCol);
 
   /**
@@ -64,7 +63,6 @@ class MeanSplit
                         MatType& data,
                         const size_t begin,
                         const size_t count,
-                        size_t& splitDimension,
                         size_t& splitCol,
                         std::vector<size_t>& oldFromNew);
 
diff --git a/src/mlpack/core/tree/binary_space_tree/mean_split_impl.hpp b/src/mlpack/core/tree/binary_space_tree/mean_split_impl.hpp
index d4c638b..f9d07fe 100644
--- a/src/mlpack/core/tree/binary_space_tree/mean_split_impl.hpp
+++ b/src/mlpack/core/tree/binary_space_tree/mean_split_impl.hpp
@@ -18,10 +18,9 @@ bool MeanSplit<BoundType, MatType>::SplitNode(const BoundType& bound,
                                               MatType& data,
                                               const size_t begin,
                                               const size_t count,
-                                              size_t& splitDimension,
                                               size_t& splitCol)
 {
-  splitDimension = data.n_rows; // Indicate invalid.
+  size_t splitDimension = data.n_rows; // Indicate invalid.
   double maxWidth = -1;
 
   // Find the split dimension.
@@ -56,11 +55,10 @@ bool MeanSplit<BoundType, MatType>::SplitNode(const BoundType& bound,
                                               MatType& data,
                                               const size_t begin,
                                               const size_t count,
-                                              size_t& splitDimension,
                                               size_t& splitCol,
                                               std::vector<size_t>& oldFromNew)
 {
-  splitDimension = data.n_rows; // Indicate invalid.
+  size_t splitDimension = data.n_rows; // Indicate invalid.
   double maxWidth = -1;
 
   // Find the split dimension.
diff --git a/src/mlpack/methods/neighbor_search/allknn_main.cpp b/src/mlpack/methods/neighbor_search/allknn_main.cpp
index 06ebc28..3aaf45e 100644
--- a/src/mlpack/methods/neighbor_search/allknn_main.cpp
+++ b/src/mlpack/methods/neighbor_search/allknn_main.cpp
@@ -68,6 +68,13 @@ int main(int argc, char *argv[])
   // Give CLI the command line parameters the user passed in.
   CLI::ParseCommandLine(argc, argv);
 
+      Log::Info << "sizeof(BinarySpaceTree<>): " << sizeof(BinarySpaceTree<bound::HRectBound<2>>) << ".\n";
+      Log::Info << "sizeof(HRectBound<2>): " << sizeof(bound::HRectBound<2>) << ".\n";
+      Log::Info << "sizeof(NeighborSearchStat): " << sizeof(NeighborSearchStat<NearestNeighborSort>) << ".\n";
+      Log::Info << "sizeof(TreeType): " <<
+sizeof(BinarySpaceTree<bound::HRectBound<2>,
+NeighborSearchStat<NearestNeighborSort>>) << ".\n";
+
   if (CLI::GetParam<int>("seed") != 0)
     math::RandomSeed((size_t) CLI::GetParam<int>("seed"));
   else



More information about the mlpack-git mailing list