[mlpack-svn] r14962 - mlpack/trunk/src/mlpack/core/tree/cover_tree

fastlab-svn at coffeetalk-1.cc.gatech.edu fastlab-svn at coffeetalk-1.cc.gatech.edu
Thu Apr 25 17:51:26 EDT 2013


Author: rcurtin
Date: 2013-04-25 17:51:26 -0400 (Thu, 25 Apr 2013)
New Revision: 14962

Modified:
   mlpack/trunk/src/mlpack/core/tree/cover_tree/cover_tree.hpp
   mlpack/trunk/src/mlpack/core/tree/cover_tree/cover_tree_impl.hpp
Log:
Refactor cover tree constructors into one CreateChildren() function, which saves
a lot of space.  A little bit of changing needed to be done to set the scale
right in the root node, and as a result the assert that checks that the furthest
descendant distance is within the scale has been removed (since that has never
been a problem).  This is for #274, but the other constructor will not be
changed until mlpack 1.1.0.


Modified: mlpack/trunk/src/mlpack/core/tree/cover_tree/cover_tree.hpp
===================================================================
--- mlpack/trunk/src/mlpack/core/tree/cover_tree/cover_tree.hpp	2013-04-25 21:43:08 UTC (rev 14961)
+++ mlpack/trunk/src/mlpack/core/tree/cover_tree/cover_tree.hpp	2013-04-25 21:51:26 UTC (rev 14962)
@@ -95,6 +95,8 @@
    * The dataset will not be modified during the building procedure (unlike
    * BinarySpaceTree).
    *
+   * The last argument will be removed in mlpack 1.1.0 (see #274 and #273).
+   *
    * @param dataset Reference to the dataset to build a tree on.
    * @param base Base to use during tree building (default 2.0).
    */
@@ -103,6 +105,19 @@
             MetricType* metric = NULL);
 
   /**
+   * Create the cover tree with the given dataset and the given instantiated
+   * metric.  Optionally, set the base.  The dataset will not be modified during
+   * the building procedure (unlike BinarySpaceTree).
+   *
+   * @param dataset Reference to the dataset to build a tree on.
+   * @param metric Instantiated metric to use during tree building.
+   * @param base Base to use during tree building (default 2.0).
+   */
+  CoverTree(const arma::mat& dataset,
+            MetricType& metric,
+            const double base = 2.0);
+
+  /**
    * Construct a child cover tree node.  This constructor is not meant to be
    * used externally, but it could be used to insert another node into a tree.
    * This procedure uses only one vector for the near set, the far set, and the
@@ -326,6 +341,15 @@
   MetricType* metric;
 
   /**
+   * Create the children for this node.
+   */
+  void CreateChildren(arma::Col<size_t>& indices,
+                      arma::vec& distances,
+                      size_t nearSetSize,
+                      size_t& farSetSize,
+                      size_t& usedSetSize);
+
+  /**
    * Fill the vector of distances with the distances between the point specified
    * by pointIndex and each point in the indices array.  The distances of the
    * first pointSetSize points in indices are calculated (so, this does not

Modified: mlpack/trunk/src/mlpack/core/tree/cover_tree/cover_tree_impl.hpp
===================================================================
--- mlpack/trunk/src/mlpack/core/tree/cover_tree/cover_tree_impl.hpp	2013-04-25 21:43:08 UTC (rev 14961)
+++ mlpack/trunk/src/mlpack/core/tree/cover_tree/cover_tree_impl.hpp	2013-04-25 21:51:26 UTC (rev 14962)
@@ -24,6 +24,7 @@
     MetricType* metric) :
     dataset(dataset),
     point(RootPointPolicy::ChooseRoot(dataset)),
+    scale(INT_MAX),
     base(base),
     parent(NULL),
     parentDistance(0),
@@ -35,6 +36,10 @@
   if (localMetric)
     this->metric = new MetricType();
 
+  // If there is only one point in the dataset... uh, we're done.
+  if (dataset.n_cols == 1)
+    return;
+
   // Kick off the building.  Create the indices array and the distances array.
   arma::Col<size_t> indices = arma::linspace<arma::Col<size_t> >(1,
       dataset.n_cols - 1, dataset.n_cols - 1);
@@ -48,147 +53,62 @@
   // Build the initial distances.
   ComputeDistances(point, indices, distances, dataset.n_cols - 1);
 
-  // Now determine the scale factor of the root node.
-  const double maxDistance = max(distances);
-  scale = (int) ceil(log(maxDistance) / log(base));
-  const double bound = pow(base, scale - 1);
-
-  // Unfortunately, we can't call out to other constructors, so we have to copy
-  // a little bit of code from the other constructor.  First we build the self
-  // child.
-  size_t childNearSetSize = SplitNearFar(indices, distances, bound,
-      dataset.n_cols - 1);
-  size_t childFarSetSize = (dataset.n_cols - 1) - childNearSetSize;
+  // Create the children.
+  size_t farSetSize = 0;
   size_t usedSetSize = 0;
-  children.push_back(new CoverTree(dataset, base, point, scale - 1,
-      this, 0, indices, distances, childNearSetSize, childFarSetSize,
-      usedSetSize, *metric));
+  CreateChildren(indices, distances, dataset.n_cols - 1, farSetSize,
+      usedSetSize);
 
-  furthestDescendantDistance = children[0]->FurthestDescendantDistance();
+  // Use the furthest descendant distance to determine the scale of the root
+  // node.
+  scale = (int) ceil(log(furthestDescendantDistance) / log(base));
 
-  // If we created an implicit node, take its self-child instead (this could
-  // happen multiple times).
-  while (children[children.size() - 1]->NumChildren() == 1)
-  {
-    CoverTree* old = children[children.size() - 1];
-    children.erase(children.begin() + children.size() - 1);
+  // Initialize statistic.
+  stat = StatisticType(*this);
+}
 
-    // Now take its child.
-    children.push_back(&(old->Child(0)));
+template<typename MetricType, typename RootPointPolicy, typename StatisticType>
+CoverTree<MetricType, RootPointPolicy, StatisticType>::CoverTree(
+    const arma::mat& dataset,
+    MetricType& metric,
+    const double base) :
+    dataset(dataset),
+    point(RootPointPolicy::ChooseRoot(dataset)),
+    scale(INT_MAX),
+    base(base),
+    parent(NULL),
+    parentDistance(0),
+    furthestDescendantDistance(0),
+    localMetric(false),
+    metric(&metric)
+{
+  // If there is only one point in the dataset, uh, we're done.
+  if (dataset.n_cols == 1)
+    return;
 
-    // Set its parent correctly.
-    old->Child(0).Parent() = this;
+  // Kick off the building.  Create the indices array and the distances array.
+  arma::Col<size_t> indices = arma::linspace<arma::Col<size_t> >(1,
+      dataset.n_cols - 1, dataset.n_cols - 1);
+  // This is now [1 2 3 4 ... n].  We must be sure that our point does not
+  // occur.
+  if (point != 0)
+    indices[point - 1] = 0; // Put 0 back into the set; remove what was there.
 
-    // Remove its child (so it doesn't delete it).
-    old->Children().erase(old->Children().begin() + old->Children().size() - 1);
+  arma::vec distances(dataset.n_cols - 1);
 
-    // Now delete it.
-    delete old;
-  }
+  // Build the initial distances.
+  ComputeDistances(point, indices, distances, dataset.n_cols - 1);
 
-  size_t nearSetSize = (dataset.n_cols - 1) - usedSetSize;
+  // Create the children.
+  size_t farSetSize = 0;
+  size_t usedSetSize = 0;
+  CreateChildren(indices, distances, dataset.n_cols - 1, farSetSize,
+      usedSetSize);
 
-  // We have no far set, so the array is organized thusly:
-  // [ near | used ].  No resorting is necessary.
-  // Therefore, go ahead and build the children.
-  while (nearSetSize > 0)
-  {
-    // We want to select the furthest point in the near set as the next child.
-    size_t newPointIndex = nearSetSize - 1;
+  // Use the furthest descendant distance to determine the scale of the root
+  // node.
+  scale = (int) ceil(log(furthestDescendantDistance) / log(base));
 
-    // Swap to front if necessary.
-    if (newPointIndex != 0)
-    {
-      const size_t tempIndex = indices[newPointIndex];
-      const double tempDist = distances[newPointIndex];
-
-      indices[newPointIndex] = indices[0];
-      distances[newPointIndex] = distances[0];
-
-      indices[0] = tempIndex;
-      distances[0] = tempDist;
-    }
-
-    if (distances[0] > furthestDescendantDistance)
-      furthestDescendantDistance = distances[0];
-
-    size_t childUsedSetSize = 0;
-
-    // If there's only one point left, we don't need this crap.
-    if (nearSetSize == 1)
-    {
-      size_t childNearSetSize = 0;
-      size_t childFarSetSize = 0;
-      children.push_back(new CoverTree(dataset, base,
-          indices[0], scale - 1, this, distances[0], indices, distances,
-          childNearSetSize, childFarSetSize, usedSetSize, *metric));
-
-      // And we're done.
-      break;
-    }
-
-    // Create the near and far set indices and distance vectors.
-    arma::Col<size_t> childIndices(nearSetSize);
-    childIndices.rows(0, (nearSetSize - 2)) = indices.rows(1, nearSetSize - 1);
-    // Put the current point into the used set, so when we move our indices to
-    // the used set, this will be done for us.
-    childIndices(nearSetSize - 1) = indices[0];
-    arma::vec childDistances(nearSetSize);
-
-    // Build distances for the child.
-    ComputeDistances(indices[0], childIndices, childDistances,
-        nearSetSize - 1);
-    childDistances(nearSetSize - 1) = 0;
-
-    // Split into near and far sets for this point.
-    childNearSetSize = SplitNearFar(childIndices, childDistances, bound,
-        nearSetSize - 1);
-
-    // Build this child (recursively).
-    childUsedSetSize = 1; // Mark self point as used.
-    childFarSetSize = ((nearSetSize - 1) - childNearSetSize);
-    children.push_back(new CoverTree(dataset, base, indices[0],
-        scale - 1, this, distances[0], childIndices, childDistances,
-        childNearSetSize, childFarSetSize, childUsedSetSize, *metric));
-
-    // If we created an implicit node, take its self-child instead (this could
-    // happen multiple times).
-    while (children[children.size() - 1]->NumChildren() == 1)
-    {
-      CoverTree* old = children[children.size() - 1];
-      children.erase(children.begin() + children.size() - 1);
-
-      // Now take its child.
-      children.push_back(&(old->Child(0)));
-
-      // Set its parent correctly.
-      old->Child(0).Parent() = this;
-
-      // Remove its child (so it doesn't delete it).
-      old->Children().erase(old->Children().begin() + old->Children().size()
-          - 1);
-
-      // Now delete it.
-      delete old;
-    }
-
-    // Now with the child created, it returns the childIndices and
-    // childDistances vectors in this form:
-    // [ childFar | childUsed ]
-    // For each point in the childUsed set, we must move that point to the used
-    // set in our own vector.
-    size_t farSetSize = 0;
-    MoveToUsedSet(indices, distances, nearSetSize, farSetSize, usedSetSize,
-        childIndices, childFarSetSize, childUsedSetSize);
-  }
-
-  // Calculate furthest descendant.
-  for (size_t i = 0; i < usedSetSize; ++i)
-    if (distances[i] > furthestDescendantDistance)
-      furthestDescendantDistance = distances[i];
-
-  Log::Assert(furthestDescendantDistance <= pow(base, scale + 1));
-
   // Initialize statistic.
   stat = StatisticType(*this);
 }
@@ -225,10 +145,26 @@
     return;
   }
 
+  // Otherwise, create the children.
+  CreateChildren(indices, distances, nearSetSize, farSetSize, usedSetSize);
+
+  // Initialize statistic.
+  stat = StatisticType(*this);
+}
+
+template<typename MetricType, typename RootPointPolicy, typename StatisticType>
+inline void
+CoverTree<MetricType, RootPointPolicy, StatisticType>::CreateChildren(
+    arma::Col<size_t>& indices,
+    arma::vec& distances,
+    size_t nearSetSize,
+    size_t& farSetSize,
+    size_t& usedSetSize)
+{
   // Determine the next scale level.  This should be the first level where there
   // are any points in the far set.  So, if we know the maximum distance in the
   // distances array, this will be the largest i such that
-  //   maxDistance > pow(ec, i)
+  //   maxDistance > pow(base, i)
   // and using this for the scale factor should guarantee we are not creating an
   // implicit node.  If the maximum distance is 0, every point in the near set
   // will be created as a leaf, and a child to this node.  We also do not need
@@ -240,9 +176,8 @@
     // Make the self child at the lowest possible level.
     // This should not modify farSetSize or usedSetSize.
     size_t tempSize = 0;
-    children.push_back(new CoverTree(dataset, base, pointIndex,
-        INT_MIN, this, 0, indices, distances, 0, tempSize, usedSetSize,
-        metric));
+    children.push_back(new CoverTree(dataset, base, point, INT_MIN, this, 0,
+        indices, distances, 0, tempSize, usedSetSize, *metric));
 
     // Every point in the near set should be a leaf.
     for (size_t i = 0; i < nearSetSize; ++i)
@@ -250,7 +185,7 @@
       // farSetSize and usedSetSize will not be modified.
       children.push_back(new CoverTree(dataset, base, indices[i],
           INT_MIN, this, distances[i], indices, distances, 0, tempSize,
-          usedSetSize, metric));
+          usedSetSize, *metric));
       usedSetSize++;
     }
 
@@ -270,9 +205,6 @@
       (int) ceil(log(maxDistance) / log(base))) - 1;
   const double bound = pow(base, nextScale);
 
-  // This needs to be taken out.  It's a sanity check for now.
-  Log::Assert(nextScale < scale);
-
   // First, make the self child.  We must split the given near set into the near
   // set and far set for the self child.
   size_t childNearSetSize =
@@ -281,9 +213,9 @@
   // Build the self child (recursively).
   size_t childFarSetSize = nearSetSize - childNearSetSize;
   size_t childUsedSetSize = 0;
-  children.push_back(new CoverTree(dataset, base, pointIndex,
-      nextScale, this, 0, indices, distances, childNearSetSize, childFarSetSize,
-      childUsedSetSize, metric));
+  children.push_back(new CoverTree(dataset, base, point, nextScale, this, 0,
+      indices, distances, childNearSetSize, childFarSetSize, childUsedSetSize,
+      *metric));
 
   // The self-child can't modify the furthestChildDistance away from 0, but it
   // can modify the furthestDescendantDistance.
@@ -352,9 +284,9 @@
     if ((nearSetSize == 1) && (farSetSize == 0))
     {
       size_t childNearSetSize = 0;
-      children.push_back(new CoverTree(dataset, base,
-          indices[0], nextScale, this, distances[0], indices, distances,
-          childNearSetSize, farSetSize, usedSetSize, metric));
+      children.push_back(new CoverTree(dataset, base, indices[0], nextScale,
+          this, distances[0], indices, distances, childNearSetSize, farSetSize,
+          usedSetSize, *metric));
 
       // Because the far set size is 0, we don't have to do any swapping to
       // move the point into the used set.
@@ -392,9 +324,9 @@
 
     // Build this child (recursively).
     childUsedSetSize = 1; // Mark self point as used.
-    children.push_back(new CoverTree(dataset, base, indices[0],
-        nextScale, this, distances[0], childIndices, childDistances,
-        childNearSetSize, childFarSetSize, childUsedSetSize, metric));
+    children.push_back(new CoverTree(dataset, base, indices[0], nextScale,
+        this, distances[0], childIndices, childDistances, childNearSetSize,
+        childFarSetSize, childUsedSetSize, *metric));
 
     // If we created an implicit node, take its self-child instead (this could
     // happen multiple times).
@@ -431,11 +363,6 @@
       usedSetSize); ++i)
     if (distances[i] > furthestDescendantDistance)
       furthestDescendantDistance = distances[i];
-
-  Log::Assert(furthestDescendantDistance <= pow(base, scale + 1));
-
-  // Lastly, initialize the statistic.
-  stat = StatisticType(*this);
 }
 
 // Manually create a cover tree node.




More information about the mlpack-svn mailing list