[mlpack-svn] r12571 - mlpack/trunk/src/mlpack/core/tree
fastlab-svn at coffeetalk-1.cc.gatech.edu
fastlab-svn at coffeetalk-1.cc.gatech.edu
Mon Apr 30 12:58:18 EDT 2012
Author: rcurtin
Date: 2012-04-30 12:58:18 -0400 (Mon, 30 Apr 2012)
New Revision: 12571
Modified:
mlpack/trunk/src/mlpack/core/tree/cover_tree.hpp
mlpack/trunk/src/mlpack/core/tree/cover_tree_impl.hpp
Log:
Comment the CoverTree implementation a lot better. Still a few things to do...
Modified: mlpack/trunk/src/mlpack/core/tree/cover_tree.hpp
===================================================================
--- mlpack/trunk/src/mlpack/core/tree/cover_tree.hpp 2012-04-30 15:55:58 UTC (rev 12570)
+++ mlpack/trunk/src/mlpack/core/tree/cover_tree.hpp 2012-04-30 16:58:18 UTC (rev 12571)
@@ -12,16 +12,108 @@
namespace mlpack {
namespace tree {
+/**
+ * A cover tree is a tree specifically designed to speed up nearest-neighbor
+ * computation in high-dimensional spaces. Each non-leaf node references a
+ * point and has a nonzero number of children, including a "self-child" which
+ * references the same point. A leaf node represents only one point.
+ *
+ * The tree can be thought of as a hierarchy with the root node at the top level
+ * and the leaf nodes at the bottom level. Each level in the tree has an
+ * assigned 'scale' i. The tree follows these three conditions:
+ *
+ * - nesting: the level C_i is a subset of the level C_{i - 1}.
+ * - covering: all node in level C_{i - 1} have at least one node in the
+ * level C_i with distance less than or equal to EC^i (exactly one of these
+ * is a parent of the point in level C_{i - 1}.
+ * - separation: all nodes in level C_i have distance greater than EC^i to all
+ * other nodes in level C_i.
+ *
+ * The value 'EC' refers to the expansion constant, which is a parameter of the
+ * tree. These three properties make the cover tree very good for fast,
+ * high-dimensional nearest-neighbor search.
+ *
+ * The theoretical structure of the tree contains many 'implicit' nodes which
+ * only have a "self-child" (a child referencing the same point, but at a lower
+ * scale level). This practical implementation only constructs explicit nodes
+ * -- non-leaf nodes with more than one child. A leaf node has no children, and
+ * its scale level is INT_MIN.
+ *
+ * For more information on cover trees, see
+ *
+ * @code
+ * @inproceedings{
+ * author = {Beygelzimer, Alina and Kakade, Sham and Langford, John},
+ * title = {Cover trees for nearest neighbor},
+ * booktitle = {Proceedings of the 23rd International Conference on Machine
+ * Learning},
+ * series = {ICML '06},
+ * year = {2006},
+ * pages = {97--104]
+ * }
+ * @endcode
+ *
+ * For information on runtime bounds of the nearest-neighbor computation using
+ * cover trees, see the following paper, presented at NIPS 2009:
+ *
+ * @code
+ * @inproceedings{
+ * author = {Ram, P., and Lee, D., and March, W.B., and Gray, A.G.},
+ * title = {Linear-time Algorithms for Pairwise Statistical Problems},
+ * booktitle = {Advances in Neural Information Processing Systems 22},
+ * editor = {Y. Bengio and D. Schuurmans and J. Lafferty and C.K.I. Williams
+ * and A. Culotta},
+ * pages = {1527--1535},
+ * year = {2009}
+ * }
+ * @endcode
+ */
template<typename StatisticType = EmptyStatistic>
class CoverTree
{
public:
/**
- * Create the cover tree.
+ * Create the cover tree with the given dataset and given expansion constant.
+ * The dataset will not be modified during the building procedure (unlike
+ * BinarySpaceTree).
+ *
+ * @param dataset Reference to the dataset to build a tree on.
+ * @param expansionConstant Expansion constant (EC) to use during tree
+ * building (default 2.0).
*/
CoverTree(const arma::mat& dataset,
const double expansionConstant = 2.0);
+ /**
+ * Construct a child cover tree node. This constructor is not meant to be
+ * used externally, but it could be used to insert another node into a tree.
+ * This procedure uses only one vector for the near set, the far set, and the
+ * used set (this is to prevent unnecessary memory allocation in recursive
+ * calls to this constructor). Therefore, the size of the near set, far set,
+ * and used set must be passed in. The near set will be entirely used up, and
+ * some of the far set may be used. The value of usedSetSize will be set to
+ * the number of points used in the construction of this node, and the value
+ * of farSetSize will be modified to reflect the number of points in the far
+ * set _after_ the construction of this node.
+ *
+ * If you are calling this manually, be careful that the given scale is
+ * as small as possible, or you may be creating an implicit node in your tree.
+ *
+ * @param dataset Reference to the dataset to build a tree on.
+ * @param expansionConstant Expansion constant (EC) to use during tree
+ * building.
+ * @param pointIndex Index of the point this node references.
+ * @param scale Scale of this level in the tree.
+ * @param indices Array of indices, ordered [ nearSet | farSet | usedSet ];
+ * will be modified to [ farSet | usedSet ].
+ * @param distances Array of distances, ordered the same way as the indices.
+ * These represent the distances between the point specified by pointIndex
+ * and each point in the indices array.
+ * @param nearSetSize Size of the near set; if 0, this will be a leaf.
+ * @param farSetSize Size of the far set; may be modified (if this node uses
+ * any points in the far set).
+ * @param usedSetSize The number of points used will be added to this number.
+ */
CoverTree(const arma::mat& dataset,
const double expansionConstant,
const size_t pointIndex,
@@ -32,6 +124,9 @@
size_t& farSetSize,
size_t& usedSetSize);
+ /**
+ * Delete this cover tree node and its children.
+ */
~CoverTree();
//! Get a reference to the dataset.
@@ -65,17 +160,9 @@
//! Index of the point in the matrix which this node represents.
size_t point;
- //! The distance to the furthest descendant.
- double furthestDistance; // Better name?
-
- //! The distance to the parent.
- double parentDistance; // Better name?
-
- //! The list of children; the first is the "self child" (what is this?).
+ //! The list of children; the first is the self-child.
std::vector<CoverTree*> children;
- //! Depth of the node in terms of scale (in terms of what?).
-// size_t scaleDepth;
//! Scale level of the node.
int scale;
@@ -84,31 +171,70 @@
//! The instantiated statistic.
StatisticType stat;
-};
-// Utility functions; these should probably be incorporated into the class
-// itself sometime soon.
+ /**
+ * Fill the vector of distances with the distances between the point specified
+ * by pointIndex and each point in the indices array. The distances of the
+ * first pointSetSize points in indices are calculated (so, this does not
+ * necessarily need to use all of the points in the arrays).
+ *
+ * @param pointIndex Point to build the distances for.
+ * @param indices List of indices to compute distances for.
+ * @param distances Vector to store calculated distances in.
+ * @param pointSetSize Number of points in arrays to calculate distances for.
+ */
+ void ComputeDistances(const size_t pointIndex,
+ const arma::Col<size_t>& indices,
+ arma::vec& distances,
+ const size_t pointSetSize);
+ /**
+ * Split the given indices and distances into a near and a far set, returning
+ * the number of points in the near set. The distances must already be
+ * initialized. This will order the indices and distances such that the
+ * points in the near set make up the first part of the array and the far set
+ * makes up the rest: [ nearSet | farSet ].
+ *
+ * @param indices List of indices; will be reordered.
+ * @param distances List of distances; will be reordered.
+ * @param bound If the distance is less than or equal to this bound, the point
+ * is placed into the near set.
+ * @param pointSetSize Size of point set (because we may be sorting a smaller
+ * list than the indices vector will hold).
+ */
+ size_t SplitNearFar(arma::Col<size_t>& indices,
+ arma::vec& distances,
+ const double bound,
+ const size_t pointSetSize);
-size_t SplitNearFar(arma::Col<size_t>& indices,
- arma::vec& distances,
- const double bound,
- const size_t pointSetSize);
+ /**
+ * Assuming that the list of indices and distances is sorted as
+ * [ childFarSet | childUsedSet | farSet | usedSet ],
+ * resort the sets so the organization is
+ * [ childFarSet | farSet | childUsedSet | usedSet ].
+ *
+ * The size_t parameters specify the sizes of each set in the array. Only the
+ * ordering of the indices and distances arrays will be modified (not their
+ * actual contents).
+ *
+ * The size of any of the four sets can be zero and this method will handle
+ * that case accordingly.
+ *
+ * @param indices List of indices to sort.
+ * @param distances List of distances to sort.
+ * @param childFarSetSize Number of points in child far set (childFarSet).
+ * @param childUsedSetSize Number of points in child used set (childUsedSet).
+ * @param farSetSize Number of points in far set (farSet).
+ */
+ size_t SortPointSet(arma::Col<size_t>& indices,
+ arma::vec& distances,
+ const size_t childFarSetSize,
+ const size_t childUsedSetSize,
+ const size_t farSetSize);
+};
-void BuildDistances(const arma::mat& dataset,
- const size_t pointIndex,
- const arma::Col<size_t>& indices,
- arma::vec& distances,
- const size_t pointSetSize);
+}; // namespace tree
+}; // namespace mlpack
-size_t SortPointSet(arma::Col<size_t>& indices,
- arma::vec& distances,
- const size_t childFarSetSize,
- const size_t childUsedSetSize,
- const size_t farSetSize);
-
-};
-};
-
// Include implementation.
#include "cover_tree_impl.hpp"
Modified: mlpack/trunk/src/mlpack/core/tree/cover_tree_impl.hpp
===================================================================
--- mlpack/trunk/src/mlpack/core/tree/cover_tree_impl.hpp 2012-04-30 15:55:58 UTC (rev 12570)
+++ mlpack/trunk/src/mlpack/core/tree/cover_tree_impl.hpp 2012-04-30 16:58:18 UTC (rev 12571)
@@ -27,8 +27,7 @@
arma::vec distances(dataset.n_cols - 1);
// Build the initial distances.
- BuildDistances(dataset, 0 /* default */, indices, distances,
- dataset.n_cols - 1);
+ ComputeDistances(0 /* default */, indices, distances, dataset.n_cols - 1);
// Now determine the scale factor of the root node.
const double maxDistance = max(distances);
@@ -46,8 +45,6 @@
indices, distances, childNearSetSize, childFarSetSize, childUsedSetSize));
size_t nearSetSize = (dataset.n_cols - 1) - childUsedSetSize;
-// Log::Warn << "before non-self children, near set " << nearSetSize << " and "
-// << "child used set size " << childUsedSetSize << std::endl;
// We have no far set, so the array is organized thusly:
// [ near | used ]. No resorting is necessary.
@@ -66,8 +63,6 @@
// accordingly. We don't have to worry about the fact that the point we
// swapped is actually in the far set but grouped with the near set, because
// we're about to rebuild that anyway.
-// Log::Warn << "nearSetSize is AOIAJA " << nearSetSize << std::endl;
-// Log::Warn << trans(nearSet);
size_t setIndex;
for (size_t k = 0; k < nearSetSize; ++k)
if (indices[k] == newPointIndex)
@@ -92,7 +87,7 @@
nearSetSize--;
// Rebuild the distances for this point.
- BuildDistances(dataset, newPointIndex, indices, distances, nearSetSize);
+ ComputeDistances(newPointIndex, indices, distances, nearSetSize);
// Split into near and far sets for this point.
childNearSetSize = SplitNearFar(indices, distances, bound, nearSetSize);
@@ -136,20 +131,10 @@
scale(scale),
expansionConstant(expansionConstant)
{
-// Log::Debug << "Making child. pointIndex " << pointIndex << ", scale " <<
-// scale << ", nearSetSize " << nearSetSize << ", farSetSize " << farSetSize
-// << ", usedSetSize " << usedSetSize << std::endl;
-// if ((nearSetSize + farSetSize) > 0)
-// Log::Debug << trans(indices.rows(0, nearSetSize + farSetSize - 1))
-// << trans(distances.rows(0, nearSetSize + farSetSize - 1)) << std::endl;
// If the size of the near set is 0, this is a leaf.
if (nearSetSize == 0)
- {
- // Make this a leaf...
-// Log::Warn << "This one is a leaf." << std::endl;
return;
- }
// Determine the next scale level. This should be the first level where there
// are any points in the far set. So, if we know the maximum distance in the
@@ -166,9 +151,6 @@
children.push_back(new CoverTree(dataset, expansionConstant, pointIndex,
INT_MIN, indices, distances, 0, farSetSize, usedSetSize));
-// Log::Debug << "Every point in the near set will be a leaf." << std::endl;
-// Log::Debug << trans(indices.rows(0, (nearSetSize - 1))) << std::endl;
-
// Every point in the near set should be a leaf.
for (size_t i = 0; i < nearSetSize; ++i)
{
@@ -182,12 +164,7 @@
// [ used | far | other used ]
// and we want
// [ far | all used ].
-// Log::Debug << "Sorting with usedSetSize " << usedSetSize
-// << " and farSetSize " << farSetSize << std::endl;
SortPointSet(indices, distances, 0, usedSetSize, farSetSize);
-// Log::Debug << "After sorting: " << std::endl;
-// Log::Debug << trans(indices.rows(0, nearSetSize + farSetSize - 1))
-// << trans(distances.rows(0, nearSetSize + farSetSize - 1)) << std::endl;
return;
}
@@ -196,9 +173,6 @@
(int) ceil(log(maxDistance) / log(expansionConstant)) - 1);
const double bound = pow(expansionConstant, nextScale);
-// Log::Debug << "maxDistance is " << maxDistance << "; nextScale is " <<
-// nextScale << "; bound is " << bound << std::endl;
-
// This needs to be taken out. It's a sanity check for now.
Log::Assert(nextScale < scale);
@@ -214,20 +188,6 @@
nextScale, indices, distances, childNearSetSize, childFarSetSize,
childUsedSetSize));
- // This will report a used set size which is one too large, because when the
- // point itself is made into a leaf, it is counted as used (but that point is
- // not part of the near or far set). This only needs to be done by the parent
- // of the self-leaf.
-// if (children[0]->NumChildren() == 0)
-// childUsedSetSize--;
-
-// Log::Warn << "Back in frame pointIndex " << pointIndex << " scale " <<
-// scale << std::endl;
-// Log::Debug << trans(indices.rows(0, nearSetSize + farSetSize - 1))
-// << trans(distances.rows(0, nearSetSize + farSetSize - 1)) << std::endl;
-// Log::Debug << "That child used " << childUsedSetSize << " points." <<
-// std::endl;
-
// Now the arrays, in memory, look like this:
// [ childFar | childUsed | far | used ]
// but we need to move the used points past our far set:
@@ -235,14 +195,8 @@
// and keeping in mind that childFar = our near set,
// [ near | far | childUsed + used ]
// is what we are trying to make.
-// Log::Debug << "Sorting with childFarSetSize " << childFarSetSize << " and "
-// << "childUsedSetSize " << childUsedSetSize << " and farSetSize " <<
-// farSetSize << std::endl;
SortPointSet(indices, distances, childFarSetSize, childUsedSetSize,
farSetSize);
-// Log::Debug << "After sorting: " << std::endl;
-// Log::Debug << trans(indices.rows(0, nearSetSize + farSetSize - 1))
-// << trans(distances.rows(0, nearSetSize + farSetSize - 1)) << std::endl;
// The self-child should not have used all the points in the near set. If it
// did, this is an implicit node.
@@ -252,9 +206,6 @@
nearSetSize -= childUsedSetSize;
usedSetSize += childUsedSetSize;
-// Log::Debug << "The near set size is " << nearSetSize << "; we will need to "
-// << "make children until those points are used." << std::endl;
-
// Now for each point in the near set, we need to make children. To save
// computation later, we'll create an array holding the points in the near
// set, and then after each run we'll check which of those (if any) were used
@@ -293,16 +244,13 @@
}
// Update the near set size. The used set size is updated by the recursive
- // child constructor.
+ // child constructor (but we have to add one for the point we are using,
+ // because the child constructor will not count that).
nearSetSize--;
usedSetSize++;
-// Log::Debug << "Before rebuild we have farSetSize " << farSetSize
-// << " nearSetSize " << nearSetSize << " usedSetSize " << usedSetSize
-// << std::endl;
-
// Rebuild the distances for this point.
- BuildDistances(dataset, newPointIndex, indices, distances,
+ ComputeDistances(newPointIndex, indices, distances,
nearSetSize + farSetSize);
// Split into near and far sets for this point.
@@ -316,19 +264,6 @@
nextScale, indices, distances, childNearSetSize, childFarSetSize,
childUsedSetSize));
-// Log::Warn << "Back in frame pointIndex " << pointIndex << " scale " <<
-// scale << std::endl;
-// Log::Debug << "That non-self child used " << childUsedSetSize << " points."
-// << std::endl;
-// Log::Debug << "Now we have farSetSize " << farSetSize << " childFarSetSize "
-// << childFarSetSize << " childUsedSetSize " << childUsedSetSize <<
-// " usedSetSize " << usedSetSize << std::endl;
-// Log::Debug << trans(indices.rows(0, nearSetSize + farSetSize - 1))
-// << trans(distances.rows(0, nearSetSize + farSetSize - 1)) << std::endl;
-
- Log::Assert(childUsedSetSize <= (nearSetSize + farSetSize));
- Log::Assert(childUsedSetSize >= childNearSetSize);
-
// Now the arrays, in memory, look like this:
// [ childFar | childUsed | used ]
// So we don't really need to do anything to them to get ready for the next
@@ -362,16 +297,9 @@
// We need to rebuild the distances and the set so it looks like this:
// [ far | used ]
// because all the points in the near set should be used up.
-// Log::Assert(nearSetSize == 0);
farSetSize = childFarSetSize;
- BuildDistances(dataset, pointIndex, indices, distances, farSetSize);
-
- //farSetSize = SortPointSet(indices, distances, childFarSetSize,
-// childUsedSetSize, farSetSize);
-
-// Log::Warn << "Node creation complete; farSetSize " << farSetSize <<
-// " usedSetSize " << usedSetSize << std::endl;
+ ComputeDistances(pointIndex, indices, distances, farSetSize);
}
template<typename StatisticType>
@@ -382,20 +310,17 @@
delete children[i];
}
-// Needs a better name...
-size_t SplitNearFar(arma::Col<size_t>& indices,
- arma::vec& distances,
- const double bound,
- const size_t pointSetSize)
+template<typename StatisticType>
+size_t CoverTree<StatisticType>::SplitNearFar(arma::Col<size_t>& indices,
+ arma::vec& distances,
+ const double bound,
+ const size_t pointSetSize)
{
// Sanity check; there is no guarantee that this condition will not be true.
// ...or is there?
if (pointSetSize <= 1)
return 0;
-// Log::Debug << "Splitting dataset using bound " << bound << " and set size "
-// << pointSetSize << std::endl;
-
// We'll traverse from both left and right.
size_t left = 0;
size_t right = pointSetSize - 1;
@@ -437,11 +362,12 @@
}
// Returns the maximum distance between points.
-void BuildDistances(const arma::mat& dataset,
- const size_t pointIndex,
- const arma::Col<size_t>& indices,
- arma::vec& distances,
- const size_t pointSetSize)
+template<typename StatisticType>
+void CoverTree<StatisticType>::ComputeDistances(
+ const size_t pointIndex,
+ const arma::Col<size_t>& indices,
+ arma::vec& distances,
+ const size_t pointSetSize)
{
// For each point, rebuild the distances. The indices do not need to be
// modified.
@@ -452,11 +378,12 @@
}
}
-size_t SortPointSet(arma::Col<size_t>& indices,
- arma::vec& distances,
- const size_t childFarSetSize,
- const size_t childUsedSetSize,
- const size_t farSetSize)
+template<typename StatisticType>
+size_t CoverTree<StatisticType>::SortPointSet(arma::Col<size_t>& indices,
+ arma::vec& distances,
+ const size_t childFarSetSize,
+ const size_t childUsedSetSize,
+ const size_t farSetSize)
{
// We'll use low-level memcpy calls ourselves, just to ensure it's done
// quickly and the way we want it to be. Unfortunately this takes up more
@@ -472,9 +399,6 @@
size_t* indicesBuffer = new size_t[bufferSize];
double* distancesBuffer = new double[bufferSize];
-// Log::Warn << "buffer size " << bufferSize << " and bigCopySize "
-// << bigCopySize << std::endl;
-
// The start of the memory region to copy to the buffer.
const size_t bufferFromLocation = ((bufferSize == farSetSize) ?
(childFarSetSize + childUsedSetSize) : childFarSetSize);
@@ -488,11 +412,7 @@
const size_t directToLocation = ((bufferSize == farSetSize) ?
(childFarSetSize + farSetSize) : childFarSetSize);
-// Log::Warn << "bufferFromLocation " << bufferFromLocation
-// << " bufferToLocation " << bufferToLocation << " directFromLocation "
-// << directFromLocation << " directToLocation " << directToLocation
-// << std::endl;
-
+ // Copy the smaller piece to the buffer.
memcpy(indicesBuffer, indices.memptr() + bufferFromLocation,
sizeof(size_t) * bufferSize);
memcpy(distancesBuffer, distances.memptr() + bufferFromLocation,
More information about the mlpack-svn
mailing list