[mlpack-svn] r12571 - mlpack/trunk/src/mlpack/core/tree

Mon Apr 30 12:58:18 EDT 2012

Author: rcurtin
Date: 2012-04-30 12:58:18 -0400 (Mon, 30 Apr 2012)
New Revision: 12571

Modified:
   mlpack/trunk/src/mlpack/core/tree/cover_tree.hpp
   mlpack/trunk/src/mlpack/core/tree/cover_tree_impl.hpp
Log:
Comment the CoverTree implementation a lot better.  Still a few things to do...


Modified: mlpack/trunk/src/mlpack/core/tree/cover_tree.hpp
===================================================================

--- mlpack/trunk/src/mlpack/core/tree/cover_tree.hpp	2012-04-30 15:55:58 UTC (rev 12570)
+++ mlpack/trunk/src/mlpack/core/tree/cover_tree.hpp	2012-04-30 16:58:18 UTC (rev 12571)
@@ -12,16 +12,108 @@
 namespace mlpack {
 namespace tree {
 
+/**
+ * A cover tree is a tree specifically designed to speed up nearest-neighbor
+ * computation in high-dimensional spaces.  Each non-leaf node references a
+ * point and has a nonzero number of children, including a "self-child" which
+ * references the same point.  A leaf node represents only one point.
+ *
+ * The tree can be thought of as a hierarchy with the root node at the top level
+ * and the leaf nodes at the bottom level.  Each level in the tree has an
+ * assigned 'scale' i.  The tree follows these three conditions:
+ *
+ * - nesting: the level C_i is a subset of the level C_{i - 1}.
+ * - covering: all node in level C_{i - 1} have at least one node in the
+ *     level C_i with distance less than or equal to EC^i (exactly one of these
+ *     is a parent of the point in level C_{i - 1}.
+ * - separation: all nodes in level C_i have distance greater than EC^i to all
+ *     other nodes in level C_i.
+ *
+ * The value 'EC' refers to the expansion constant, which is a parameter of the
+ * tree.  These three properties make the cover tree very good for fast,
+ * high-dimensional nearest-neighbor search.
+ *
+ * The theoretical structure of the tree contains many 'implicit' nodes which
+ * only have a "self-child" (a child referencing the same point, but at a lower
+ * scale level).  This practical implementation only constructs explicit nodes
+ * -- non-leaf nodes with more than one child.  A leaf node has no children, and
+ * its scale level is INT_MIN.
+ *
+ * For more information on cover trees, see
+ *
+ * @code
+ * @inproceedings{
+ *   author = {Beygelzimer, Alina and Kakade, Sham and Langford, John},
+ *   title = {Cover trees for nearest neighbor},
+ *   booktitle = {Proceedings of the 23rd International Conference on Machine
+ *     Learning},
+ *   series = {ICML '06},
+ *   year = {2006},
+ *   pages = {97--104]
+ * }
+ * @endcode
+ *
+ * For information on runtime bounds of the nearest-neighbor computation using
+ * cover trees, see the following paper, presented at NIPS 2009:
+ *
+ * @code
+ * @inproceedings{
+ *   author = {Ram, P., and Lee, D., and March, W.B., and Gray, A.G.},
+ *   title = {Linear-time Algorithms for Pairwise Statistical Problems},
+ *   booktitle = {Advances in Neural Information Processing Systems 22},
+ *   editor = {Y. Bengio and D. Schuurmans and J. Lafferty and C.K.I. Williams
+ *     and A. Culotta},
+ *   pages = {1527--1535},
+ *   year = {2009}
+ * }
+ * @endcode
+ */
 template<typename StatisticType = EmptyStatistic>
 class CoverTree
 {
  public:
   /**
-   * Create the cover tree.
+   * Create the cover tree with the given dataset and given expansion constant.
+   * The dataset will not be modified during the building procedure (unlike
+   * BinarySpaceTree).
+   *
+   * @param dataset Reference to the dataset to build a tree on.
+   * @param expansionConstant Expansion constant (EC) to use during tree
+   *      building (default 2.0).
    */
   CoverTree(const arma::mat& dataset,
             const double expansionConstant = 2.0);
 
+  /**
+   * Construct a child cover tree node.  This constructor is not meant to be
+   * used externally, but it could be used to insert another node into a tree.
+   * This procedure uses only one vector for the near set, the far set, and the
+   * used set (this is to prevent unnecessary memory allocation in recursive
+   * calls to this constructor).  Therefore, the size of the near set, far set,
+   * and used set must be passed in.  The near set will be entirely used up, and
+   * some of the far set may be used.  The value of usedSetSize will be set to
+   * the number of points used in the construction of this node, and the value
+   * of farSetSize will be modified to reflect the number of points in the far
+   * set _after_ the construction of this node.
+   *
+   * If you are calling this manually, be careful that the given scale is
+   * as small as possible, or you may be creating an implicit node in your tree.
+   *
+   * @param dataset Reference to the dataset to build a tree on.
+   * @param expansionConstant Expansion constant (EC) to use during tree
+   *     building.
+   * @param pointIndex Index of the point this node references.
+   * @param scale Scale of this level in the tree.
+   * @param indices Array of indices, ordered [ nearSet | farSet | usedSet ];
+   *     will be modified to [ farSet | usedSet ].
+   * @param distances Array of distances, ordered the same way as the indices.
+   *     These represent the distances between the point specified by pointIndex
+   *     and each point in the indices array.
+   * @param nearSetSize Size of the near set; if 0, this will be a leaf.
+   * @param farSetSize Size of the far set; may be modified (if this node uses
+   *     any points in the far set).
+   * @param usedSetSize The number of points used will be added to this number.
+   */
   CoverTree(const arma::mat& dataset,
             const double expansionConstant,
             const size_t pointIndex,
@@ -32,6 +124,9 @@
             size_t& farSetSize,
             size_t& usedSetSize);
 
+  /**
+   * Delete this cover tree node and its children.
+   */
   ~CoverTree();
 
   //! Get a reference to the dataset.
@@ -65,17 +160,9 @@
   //! Index of the point in the matrix which this node represents.
   size_t point;
 
-  //! The distance to the furthest descendant.
-  double furthestDistance; // Better name?
-
-  //! The distance to the parent.
-  double parentDistance; // Better name?
-
-  //! The list of children; the first is the "self child" (what is this?).
+  //! The list of children; the first is the self-child.
   std::vector<CoverTree*> children;
 
-  //! Depth of the node in terms of scale (in terms of what?).
-//  size_t scaleDepth;
   //! Scale level of the node.
   int scale;
 
@@ -84,31 +171,70 @@
 
   //! The instantiated statistic.
   StatisticType stat;
-};
 
-// Utility functions; these should probably be incorporated into the class
-// itself sometime soon.
+  /**
+   * Fill the vector of distances with the distances between the point specified
+   * by pointIndex and each point in the indices array.  The distances of the
+   * first pointSetSize points in indices are calculated (so, this does not
+   * necessarily need to use all of the points in the arrays).
+   *
+   * @param pointIndex Point to build the distances for.
+   * @param indices List of indices to compute distances for.
+   * @param distances Vector to store calculated distances in.
+   * @param pointSetSize Number of points in arrays to calculate distances for.
+   */
+  void ComputeDistances(const size_t pointIndex,
+                        const arma::Col<size_t>& indices,
+                        arma::vec& distances,
+                        const size_t pointSetSize);
+  /**
+   * Split the given indices and distances into a near and a far set, returning
+   * the number of points in the near set.  The distances must already be
+   * initialized.  This will order the indices and distances such that the
+   * points in the near set make up the first part of the array and the far set
+   * makes up the rest:  [ nearSet | farSet ].
+   *
+   * @param indices List of indices; will be reordered.
+   * @param distances List of distances; will be reordered.
+   * @param bound If the distance is less than or equal to this bound, the point
+   *      is placed into the near set.
+   * @param pointSetSize Size of point set (because we may be sorting a smaller
+   *      list than the indices vector will hold).
+   */
+  size_t SplitNearFar(arma::Col<size_t>& indices,
+                      arma::vec& distances,
+                      const double bound,
+                      const size_t pointSetSize);
 
-size_t SplitNearFar(arma::Col<size_t>& indices,
-                    arma::vec& distances,
-                    const double bound,
-                    const size_t pointSetSize);
+  /**
+   * Assuming that the list of indices and distances is sorted as
+   * [ childFarSet | childUsedSet | farSet | usedSet ],
+   * resort the sets so the organization is
+   * [ childFarSet | farSet | childUsedSet | usedSet ].
+   *
+   * The size_t parameters specify the sizes of each set in the array.  Only the
+   * ordering of the indices and distances arrays will be modified (not their
+   * actual contents).
+   *
+   * The size of any of the four sets can be zero and this method will handle
+   * that case accordingly.
+   *
+   * @param indices List of indices to sort.
+   * @param distances List of distances to sort.
+   * @param childFarSetSize Number of points in child far set (childFarSet).
+   * @param childUsedSetSize Number of points in child used set (childUsedSet).
+   * @param farSetSize Number of points in far set (farSet).
+   */
+  size_t SortPointSet(arma::Col<size_t>& indices,
+                      arma::vec& distances,
+                      const size_t childFarSetSize,
+                      const size_t childUsedSetSize,
+                      const size_t farSetSize);
+};
 
-void BuildDistances(const arma::mat& dataset,
-                    const size_t pointIndex,
-                    const arma::Col<size_t>& indices,
-                    arma::vec& distances,
-                    const size_t pointSetSize);
+}; // namespace tree
+}; // namespace mlpack
 
-size_t SortPointSet(arma::Col<size_t>& indices,
-                    arma::vec& distances,
-                    const size_t childFarSetSize,
-                    const size_t childUsedSetSize,
-                    const size_t farSetSize);
-
-};
-};
-
 // Include implementation.
 #include "cover_tree_impl.hpp"
 

Modified: mlpack/trunk/src/mlpack/core/tree/cover_tree_impl.hpp
===================================================================
--- mlpack/trunk/src/mlpack/core/tree/cover_tree_impl.hpp	2012-04-30 15:55:58 UTC (rev 12570)
+++ mlpack/trunk/src/mlpack/core/tree/cover_tree_impl.hpp	2012-04-30 16:58:18 UTC (rev 12571)
@@ -27,8 +27,7 @@
   arma::vec distances(dataset.n_cols - 1);
 
   // Build the initial distances.
-  BuildDistances(dataset, 0 /* default */, indices, distances,
-      dataset.n_cols - 1);
+  ComputeDistances(0 /* default */, indices, distances, dataset.n_cols - 1);
 
   // Now determine the scale factor of the root node.
   const double maxDistance = max(distances);
@@ -46,8 +45,6 @@
       indices, distances, childNearSetSize, childFarSetSize, childUsedSetSize));
 
   size_t nearSetSize = (dataset.n_cols - 1) - childUsedSetSize;
-//  Log::Warn << "before non-self children, near set " << nearSetSize << " and "
-//      << "child used set size " << childUsedSetSize << std::endl;
 
   // We have no far set, so the array is organized thusly:
   // [ near | used ].  No resorting is necessary.
@@ -66,8 +63,6 @@
     // accordingly.  We don't have to worry about the fact that the point we
     // swapped is actually in the far set but grouped with the near set, because
     // we're about to rebuild that anyway.
-//    Log::Warn << "nearSetSize is AOIAJA " << nearSetSize << std::endl;
-//    Log::Warn << trans(nearSet);
     size_t setIndex;
     for (size_t k = 0; k < nearSetSize; ++k)
       if (indices[k] == newPointIndex)
@@ -92,7 +87,7 @@
     nearSetSize--;
 
     // Rebuild the distances for this point.
-    BuildDistances(dataset, newPointIndex, indices, distances, nearSetSize);
+    ComputeDistances(newPointIndex, indices, distances, nearSetSize);
 
     // Split into near and far sets for this point.
     childNearSetSize = SplitNearFar(indices, distances, bound, nearSetSize);
@@ -136,20 +131,10 @@
     scale(scale),
     expansionConstant(expansionConstant)
 {
-//  Log::Debug << "Making child.  pointIndex " << pointIndex << ", scale " <<
-//      scale << ", nearSetSize " << nearSetSize << ", farSetSize " << farSetSize
-//      << ", usedSetSize " << usedSetSize << std::endl;
-//  if ((nearSetSize + farSetSize) > 0)
-//    Log::Debug << trans(indices.rows(0, nearSetSize + farSetSize - 1))
-//        << trans(distances.rows(0, nearSetSize + farSetSize - 1)) << std::endl;
 
   // If the size of the near set is 0, this is a leaf.
   if (nearSetSize == 0)
-  {
-    // Make this a leaf...
-//    Log::Warn << "This one is a leaf." << std::endl;
     return;
-  }
 
   // Determine the next scale level.  This should be the first level where there
   // are any points in the far set.  So, if we know the maximum distance in the
@@ -166,9 +151,6 @@
     children.push_back(new CoverTree(dataset, expansionConstant, pointIndex,
         INT_MIN, indices, distances, 0, farSetSize, usedSetSize));
 
-//    Log::Debug << "Every point in the near set will be a leaf." << std::endl;
-//    Log::Debug << trans(indices.rows(0, (nearSetSize - 1))) << std::endl;
-
     // Every point in the near set should be a leaf.
     for (size_t i = 0; i < nearSetSize; ++i)
     {
@@ -182,12 +164,7 @@
     // [ used | far | other used ]
     // and we want
     // [ far | all used ].
-//    Log::Debug << "Sorting with usedSetSize " << usedSetSize
-//        << " and farSetSize " << farSetSize << std::endl;
     SortPointSet(indices, distances, 0, usedSetSize, farSetSize);
-//    Log::Debug << "After sorting: " << std::endl;
-//    Log::Debug << trans(indices.rows(0, nearSetSize + farSetSize - 1))
-//        << trans(distances.rows(0, nearSetSize + farSetSize - 1)) << std::endl;
 
     return;
   }
@@ -196,9 +173,6 @@
       (int) ceil(log(maxDistance) / log(expansionConstant)) - 1);
   const double bound = pow(expansionConstant, nextScale);
 
-//  Log::Debug << "maxDistance is " << maxDistance << "; nextScale is " <<
-//      nextScale << "; bound is " << bound << std::endl;
-
   // This needs to be taken out.  It's a sanity check for now.
   Log::Assert(nextScale < scale);
 
@@ -214,20 +188,6 @@
       nextScale, indices, distances, childNearSetSize, childFarSetSize,
       childUsedSetSize));
 
-  // This will report a used set size which is one too large, because when the
-  // point itself is made into a leaf, it is counted as used (but that point is
-  // not part of the near or far set).  This only needs to be done by the parent
-  // of the self-leaf.
-//  if (children[0]->NumChildren() == 0)
-//    childUsedSetSize--;
-
-//  Log::Warn << "Back in frame pointIndex " << pointIndex << " scale " <<
-//      scale << std::endl;
-//  Log::Debug << trans(indices.rows(0, nearSetSize + farSetSize - 1))
-//      << trans(distances.rows(0, nearSetSize + farSetSize - 1)) << std::endl;
-//  Log::Debug << "That child used " << childUsedSetSize << " points." <<
-//      std::endl;
-
   // Now the arrays, in memory, look like this:
   // [ childFar | childUsed | far | used ]
   // but we need to move the used points past our far set:
@@ -235,14 +195,8 @@
   // and keeping in mind that childFar = our near set,
   // [ near | far | childUsed + used ]
   // is what we are trying to make.
-//  Log::Debug << "Sorting with childFarSetSize " << childFarSetSize << " and "
-//      << "childUsedSetSize " << childUsedSetSize << " and farSetSize " <<
-//      farSetSize << std::endl;
   SortPointSet(indices, distances, childFarSetSize, childUsedSetSize,
       farSetSize);
-//  Log::Debug << "After sorting: " << std::endl;
-//  Log::Debug << trans(indices.rows(0, nearSetSize + farSetSize - 1))
-//      << trans(distances.rows(0, nearSetSize + farSetSize - 1)) << std::endl;
 
   // The self-child should not have used all the points in the near set.  If it
   // did, this is an implicit node.
@@ -252,9 +206,6 @@
   nearSetSize -= childUsedSetSize;
   usedSetSize += childUsedSetSize;
 
-//  Log::Debug << "The near set size is " << nearSetSize << "; we will need to "
-//      << "make children until those points are used." << std::endl;
-
   // Now for each point in the near set, we need to make children.  To save
   // computation later, we'll create an array holding the points in the near
   // set, and then after each run we'll check which of those (if any) were used
@@ -293,16 +244,13 @@
     }
 
     // Update the near set size.  The used set size is updated by the recursive
-    // child constructor.
+    // child constructor (but we have to add one for the point we are using,
+    // because the child constructor will not count that).
     nearSetSize--;
     usedSetSize++;
 
-//    Log::Debug << "Before rebuild we have farSetSize " << farSetSize
-//        << " nearSetSize " << nearSetSize << " usedSetSize " << usedSetSize
-//        << std::endl;
-
     // Rebuild the distances for this point.
-    BuildDistances(dataset, newPointIndex, indices, distances,
+    ComputeDistances(newPointIndex, indices, distances,
         nearSetSize + farSetSize);
 
     // Split into near and far sets for this point.
@@ -316,19 +264,6 @@
         nextScale, indices, distances, childNearSetSize, childFarSetSize,
         childUsedSetSize));
 
-//    Log::Warn << "Back in frame pointIndex " << pointIndex << " scale " <<
-//        scale << std::endl;
-//    Log::Debug << "That non-self child used " << childUsedSetSize << " points."
-//        << std::endl;
-//    Log::Debug << "Now we have farSetSize " << farSetSize << " childFarSetSize "
-//        << childFarSetSize << " childUsedSetSize " << childUsedSetSize <<
-//        " usedSetSize " << usedSetSize << std::endl;
-//    Log::Debug << trans(indices.rows(0, nearSetSize + farSetSize - 1))
-//        << trans(distances.rows(0, nearSetSize + farSetSize - 1)) << std::endl;
-
-    Log::Assert(childUsedSetSize <= (nearSetSize + farSetSize));
-    Log::Assert(childUsedSetSize >= childNearSetSize);
-
     // Now the arrays, in memory, look like this:
     // [ childFar | childUsed | used ]
     // So we don't really need to do anything to them to get ready for the next
@@ -362,16 +297,9 @@
   // We need to rebuild the distances and the set so it looks like this:
   // [ far | used ]
   // because all the points in the near set should be used up.
-//  Log::Assert(nearSetSize == 0);
   farSetSize = childFarSetSize;
 
-  BuildDistances(dataset, pointIndex, indices, distances, farSetSize);
-
-  //farSetSize = SortPointSet(indices, distances, childFarSetSize,
-//      childUsedSetSize, farSetSize);
-
-//  Log::Warn << "Node creation complete; farSetSize " << farSetSize <<
-//      " usedSetSize " << usedSetSize << std::endl;
+  ComputeDistances(pointIndex, indices, distances, farSetSize);
 }
 
 template<typename StatisticType>
@@ -382,20 +310,17 @@
     delete children[i];
 }
 
-// Needs a better name...
-size_t SplitNearFar(arma::Col<size_t>& indices,
-                    arma::vec& distances,
-                    const double bound,
-                    const size_t pointSetSize)
+template<typename StatisticType>
+size_t CoverTree<StatisticType>::SplitNearFar(arma::Col<size_t>& indices,
+                                              arma::vec& distances,
+                                              const double bound,
+                                              const size_t pointSetSize)
 {
   // Sanity check; there is no guarantee that this condition will not be true.
   // ...or is there?
   if (pointSetSize <= 1)
     return 0;
 
-//  Log::Debug << "Splitting dataset using bound " << bound << " and set size "
-//      << pointSetSize << std::endl;
-
   // We'll traverse from both left and right.
   size_t left = 0;
   size_t right = pointSetSize - 1;
@@ -437,11 +362,12 @@
 }
 
 // Returns the maximum distance between points.
-void BuildDistances(const arma::mat& dataset,
-                    const size_t pointIndex,
-                    const arma::Col<size_t>& indices,
-                    arma::vec& distances,
-                    const size_t pointSetSize)
+template<typename StatisticType>
+void CoverTree<StatisticType>::ComputeDistances(
+    const size_t pointIndex,
+    const arma::Col<size_t>& indices,
+    arma::vec& distances,
+    const size_t pointSetSize)
 {
   // For each point, rebuild the distances.  The indices do not need to be
   // modified.
@@ -452,11 +378,12 @@
   }
 }
 
-size_t SortPointSet(arma::Col<size_t>& indices,
-                    arma::vec& distances,
-                    const size_t childFarSetSize,
-                    const size_t childUsedSetSize,
-                    const size_t farSetSize)
+template<typename StatisticType>
+size_t CoverTree<StatisticType>::SortPointSet(arma::Col<size_t>& indices,
+                                              arma::vec& distances,
+                                              const size_t childFarSetSize,
+                                              const size_t childUsedSetSize,
+                                              const size_t farSetSize)
 {
   // We'll use low-level memcpy calls ourselves, just to ensure it's done
   // quickly and the way we want it to be.  Unfortunately this takes up more
@@ -472,9 +399,6 @@
   size_t* indicesBuffer = new size_t[bufferSize];
   double* distancesBuffer = new double[bufferSize];
 
-//  Log::Warn << "buffer size " << bufferSize << " and bigCopySize "
-//      << bigCopySize << std::endl;
-
   // The start of the memory region to copy to the buffer.
   const size_t bufferFromLocation = ((bufferSize == farSetSize) ?
       (childFarSetSize + childUsedSetSize) : childFarSetSize);
@@ -488,11 +412,7 @@
   const size_t directToLocation = ((bufferSize == farSetSize) ?
       (childFarSetSize + farSetSize) : childFarSetSize);
 
-//  Log::Warn << "bufferFromLocation " << bufferFromLocation
-//      << " bufferToLocation " << bufferToLocation << " directFromLocation "
-//      << directFromLocation << " directToLocation " << directToLocation
-//      << std::endl;
-
+  // Copy the smaller piece to the buffer.
   memcpy(indicesBuffer, indices.memptr() + bufferFromLocation,
       sizeof(size_t) * bufferSize);
   memcpy(distancesBuffer, distances.memptr() + bufferFromLocation,