[mlpack-git] master: Add Pelleg-Moore k-means. This implementation is faster and prunes more tightly than my previous attempts (which I didn't check in). (That, of course, simply means that my previous implementations were wrong, but this one isn't.) (1701b9a)

Thu Mar 5 22:01:10 EST 2015

Repository : https://github.com/mlpack/mlpack

On branch  : master
Link       : https://github.com/mlpack/mlpack/compare/904762495c039e345beba14c1142fd719b3bd50e...f94823c800ad6f7266995c700b1b630d5ffdcf40

>---------------------------------------------------------------

commit 1701b9a29287b8a3a12d30acb78f58a3a74db915
Author: Ryan Curtin <ryan at ratml.org>
Date:   Sun Oct 12 20:22:29 2014 +0000

    Add Pelleg-Moore k-means.  This implementation is faster and prunes more tightly
    than my previous attempts (which I didn't check in).  (That, of course, simply
    means that my previous implementations were wrong, but this one isn't.)


>---------------------------------------------------------------

1701b9a29287b8a3a12d30acb78f58a3a74db915
 src/mlpack/methods/kmeans/CMakeLists.txt           |   5 +
 src/mlpack/methods/kmeans/kmeans_main.cpp          |  15 +-
 src/mlpack/methods/kmeans/pelleg_moore_kmeans.hpp  |  70 ++++++++
 .../methods/kmeans/pelleg_moore_kmeans_impl.hpp    |  90 ++++++++++
 .../methods/kmeans/pelleg_moore_kmeans_rules.hpp   |  81 +++++++++
 .../kmeans/pelleg_moore_kmeans_rules_impl.hpp      | 199 +++++++++++++++++++++
 .../kmeans/pelleg_moore_kmeans_statistic.hpp       |  81 +++++++++
 7 files changed, 539 insertions(+), 2 deletions(-)

diff --git a/src/mlpack/methods/kmeans/CMakeLists.txt b/src/mlpack/methods/kmeans/CMakeLists.txt
index 5913ce6..2cf72d6 100644
--- a/src/mlpack/methods/kmeans/CMakeLists.txt
+++ b/src/mlpack/methods/kmeans/CMakeLists.txt
@@ -12,6 +12,11 @@ set(SOURCES
   max_variance_new_cluster_impl.hpp
   naive_kmeans.hpp
   naive_kmeans_impl.hpp
+  pelleg_moore_kmeans.hpp
+  pelleg_moore_kmeans_impl.hpp
+  pelleg_moore_kmeans_rules.hpp
+  pelleg_moore_kmeans_rules_impl.hpp
+  pelleg_moore_kmeans_statistic.hpp
   random_partition.hpp
   refined_start.hpp
   refined_start_impl.hpp
diff --git a/src/mlpack/methods/kmeans/kmeans_main.cpp b/src/mlpack/methods/kmeans/kmeans_main.cpp
index 9c911c1..1a4dcd7 100644
--- a/src/mlpack/methods/kmeans/kmeans_main.cpp
+++ b/src/mlpack/methods/kmeans/kmeans_main.cpp
@@ -11,6 +11,7 @@
 #include "refined_start.hpp"
 #include "elkan_kmeans.hpp"
 #include "hamerly_kmeans.hpp"
+#include "pelleg_moore_kmeans.hpp"
 
 using namespace mlpack;
 using namespace mlpack::kmeans;
@@ -32,6 +33,13 @@ PROGRAM_INFO("K-Means Clustering", "This program performs K-Means clustering "
     "to be used in each sample, the --percentage parameter is used (it should "
     "be a value between 0.0 and 1.0)."
     "\n\n"
+    "There are several options available for the algorithm used for each Lloyd "
+    "iteration, specified with the --algorithm (-a) option.  The standard O(kN)"
+    " approach can be used ('naive').  Other options include the Pelleg-Moore "
+    "tree-based algorithm ('pelleg-moore'), Elkan's triangle-inequality based "
+    "algorithm ('elkan'), and Hamerly's modification to Elkan's algorithm "
+    "('hamerly')."
+    "\n\n"
     "As of October 2014, the --overclustering option has been removed.  If you "
     "want this support back, let us know -- file a bug at "
     "http://www.mlpack.org/trac/ or get in touch through another means.");
@@ -67,7 +75,7 @@ PARAM_DOUBLE("percentage", "Percentage of dataset to use for each refined start"
     " sampling (use when --refined_start is specified).", "p", 0.02);
 
 PARAM_STRING("algorithm", "Algorithm to use for the Lloyd iteration ('naive', "
-    "'elkan', or 'hamerly').", "a", "naive");
+    "'pelleg-moore', 'elkan', or 'hamerly').", "a", "naive");
 
 // Given the type of initial partition policy, figure out the empty cluster
 // policy and run k-means.
@@ -139,11 +147,14 @@ void FindLloydStepType(const InitialPartitionPolicy& ipp)
     RunKMeans<InitialPartitionPolicy, EmptyClusterPolicy, ElkanKMeans>(ipp);
   else if (algorithm == "hamerly")
     RunKMeans<InitialPartitionPolicy, EmptyClusterPolicy, HamerlyKMeans>(ipp);
+  else if (algorithm == "pelleg-moore")
+    RunKMeans<InitialPartitionPolicy, EmptyClusterPolicy,
+        PellegMooreKMeans>(ipp);
   else if (algorithm == "naive")
     RunKMeans<InitialPartitionPolicy, EmptyClusterPolicy, NaiveKMeans>(ipp);
   else
     Log::Fatal << "Unknown algorithm: '" << algorithm << "'.  Supported options"
-        << " are 'naive', 'elkan', and 'hamerly'." << endl;
+        << " are 'naive', 'pelleg-moore', 'elkan', and 'hamerly'." << endl;
 }
 
 // Given the template parameters, sanitize/load input and run k-means.
diff --git a/src/mlpack/methods/kmeans/pelleg_moore_kmeans.hpp b/src/mlpack/methods/kmeans/pelleg_moore_kmeans.hpp
new file mode 100644
index 0000000..7a766f5
--- /dev/null
+++ b/src/mlpack/methods/kmeans/pelleg_moore_kmeans.hpp
@@ -0,0 +1,70 @@
+/**
+ * @file pelleg_moore_kmeans.hpp
+ * @author Ryan Curtin
+ *
+ * An implementation of Pelleg-Moore's 'blacklist' algorithm for k-means
+ * clustering.
+ */
+#ifndef __MLPACK_METHODS_KMEANS_PELLEG_MOORE_KMEANS_HPP
+#define __MLPACK_METHODS_KMEANS_PELLEG_MOORE_KMEANS_HPP
+
+#include <mlpack/core/tree/binary_space_tree.hpp>
+#include "pelleg_moore_kmeans_statistic.hpp"
+
+namespace mlpack {
+namespace kmeans {
+
+template<typename MetricType, typename MatType>
+class PellegMooreKMeans
+{
+ public:
+  /**
+   * Construct the PellegMooreKMeans object, which must construct a tree.
+   */
+  PellegMooreKMeans(const MatType& dataset, MetricType& metric);
+
+  /**
+   * Delete the tree constructed by the PellegMooreKMeans object.
+   */
+  ~PellegMooreKMeans();
+
+  /**
+   * Run a single iteration of the Pelleg-Moore blacklist algorithm, updating
+   * the given centroids into the newCentroids matrix.
+   *
+   * @param centroids Current cluster centroids.
+   * @param newCentroids New cluster centroids.
+   * @param counts Current counts, to be overwritten with new counts.
+   */
+  double Iterate(const arma::mat& centroids,
+                 arma::mat& newCentroids,
+                 arma::Col<size_t>& counts);
+
+  size_t DistanceCalculations() const { return distanceCalculations; }
+
+  typedef tree::BinarySpaceTree<bound::HRectBound<2, true>,
+      PellegMooreKMeansStatistic, MatType> TreeType;
+
+ private:
+  //! The original dataset reference.
+  const MatType& datasetOrig; // Maybe not necessary.
+  //! The dataset we are using.
+  const MatType& dataset;
+  //! A copy of the dataset, if necessary.
+  MatType datasetCopy;
+  //! The metric.
+  MetricType& metric;
+
+  //! The tree built on the points.
+  TreeType* tree;
+
+  //! Track distance calculations.
+  size_t distanceCalculations;
+};
+
+} // namespace kmeans
+} // namespace mlpack
+
+#include "pelleg_moore_kmeans_impl.hpp"
+
+#endif
diff --git a/src/mlpack/methods/kmeans/pelleg_moore_kmeans_impl.hpp b/src/mlpack/methods/kmeans/pelleg_moore_kmeans_impl.hpp
new file mode 100644
index 0000000..51dbd66
--- /dev/null
+++ b/src/mlpack/methods/kmeans/pelleg_moore_kmeans_impl.hpp
@@ -0,0 +1,90 @@
+/**
+ * @file pelleg_moore_kmeans_impl.hpp
+ * @author Ryan Curtin
+ *
+ * An implementation of Pelleg-Moore's 'blacklist' algorithm for k-means
+ * clustering.
+ */
+#ifndef __MLPACK_METHODS_KMEANS_PELLEG_MOORE_KMEANS_IMPL_HPP
+#define __MLPACK_METHODS_KMEANS_PELLEG_MOORE_KMEANS_IMPL_HPP
+
+#include "pelleg_moore_kmeans.hpp"
+#include "pelleg_moore_kmeans_rules.hpp"
+
+namespace mlpack {
+namespace kmeans {
+
+template<typename MetricType, typename MatType>
+PellegMooreKMeans<MetricType, MatType>::PellegMooreKMeans(
+    const MatType& dataset,
+    MetricType& metric) :
+    datasetOrig(dataset),
+    dataset(tree::TreeTraits<TreeType>::RearrangesDataset ? datasetCopy :
+        datasetOrig),
+    metric(metric),
+    distanceCalculations(0)
+{
+  Timer::Start("tree_building");
+
+  // Copy the dataset, if necessary.
+  if (tree::TreeTraits<TreeType>::RearrangesDataset)
+    datasetCopy = datasetOrig;
+
+  // Now build the tree.  We don't need any mappings.
+  tree = new TreeType(const_cast<typename TreeType::Mat&>(this->dataset));
+
+  Timer::Stop("tree_building");
+}
+
+template<typename MetricType, typename MatType>
+PellegMooreKMeans<MetricType, MatType>::~PellegMooreKMeans()
+{
+  delete tree;
+}
+
+// Run a single iteration.
+template<typename MetricType, typename MatType>
+double PellegMooreKMeans<MetricType, MatType>::Iterate(
+    const arma::mat& centroids,
+    arma::mat& newCentroids,
+    arma::Col<size_t>& counts)
+{
+  newCentroids.zeros(centroids.n_rows, centroids.n_cols);
+  counts.zeros(centroids.n_cols);
+
+  // Create rules object.
+  typedef PellegMooreKMeansRules<MetricType, TreeType> RulesType;
+  RulesType rules(dataset, centroids, newCentroids, counts, metric);
+
+  // Use single-tree traverser.
+  typename TreeType::template SingleTreeTraverser<RulesType> traverser(rules);
+
+  // Now, do a traversal with a fake query index (since the query index is
+  // irrelevant; we are checking each node with all clusters.
+  traverser.Traverse(0, *tree);
+
+  distanceCalculations += rules.BaseCases() + rules.Scores();
+
+  // Now, calculate how far the clusters moved, after normalizing them.
+  double residual = 0.0;
+  for (size_t c = 0; c < centroids.n_cols; ++c)
+  {
+    if (counts[c] == 0)
+    {
+      newCentroids.col(c).fill(DBL_MAX); // Should have happened anyway I think.
+    }
+    else
+    {
+      newCentroids.col(c) /= counts(c);
+      residual += std::pow(metric.Evaluate(centroids.col(c),
+                                           newCentroids.col(c)), 2.0);
+    }
+  }
+
+  return std::sqrt(residual);
+}
+
+} // namespace kmeans
+} // namespace mlpack
+
+#endif
diff --git a/src/mlpack/methods/kmeans/pelleg_moore_kmeans_rules.hpp b/src/mlpack/methods/kmeans/pelleg_moore_kmeans_rules.hpp
new file mode 100644
index 0000000..9bb808d
--- /dev/null
+++ b/src/mlpack/methods/kmeans/pelleg_moore_kmeans_rules.hpp
@@ -0,0 +1,81 @@
+/**
+ * @file kmeans_rules.hpp
+ * @author Ryan Curtin
+ *
+ * Defines the pruning rules and base cases rules necessary to perform
+ * single-tree k-means clustering using the Pelleg-Moore fast k-means algorithm,
+ * which has been shoehorned to fit into the mlpack tree abstractions.
+ */
+#ifndef __MLPACK_METHODS_KMEANS_PELLEG_MOORE_KMEANS_RULES_HPP
+#define __MLPACK_METHODS_KMEANS_PELLEG_MOORE_KMEANS_RULES_HPP
+
+#include <mlpack/methods/neighbor_search/ns_traversal_info.hpp>
+
+namespace mlpack {
+namespace kmeans {
+
+template<typename MetricType, typename TreeType>
+class PellegMooreKMeansRules
+{
+ public:
+  PellegMooreKMeansRules(const typename TreeType::Mat& dataset,
+                         const arma::mat& centroids,
+                         arma::mat& newCentroids,
+                         arma::Col<size_t>& counts,
+                         MetricType& metric);
+
+  double BaseCase(const size_t queryIndex, const size_t referenceIndex);
+
+  double Score(const size_t queryIndex, TreeType& referenceNode);
+
+  double Rescore(const size_t queryIndex,
+                 TreeType& referenceNode,
+                 const double oldScore);
+
+  //! Get the number of base cases that have been performed.
+  size_t BaseCases() const { return baseCases; }
+  //! Modify the number of base cases that have been performed.
+  size_t& BaseCases() { return baseCases; }
+
+  //! Get the number of scores that have been performed.
+  size_t Scores() const { return scores; }
+  //! Modify the number of scores that have been performed.
+  size_t& Scores() { return scores; }
+
+  //! Convenience typedef.
+  typedef neighbor::NeighborSearchTraversalInfo<TreeType> TraversalInfoType;
+
+  //! Get the traversal info.
+  const TraversalInfoType& TraversalInfo() const { return traversalInfo; }
+  //! Modify the traversal info.
+  TraversalInfoType& TraversalInfo() { return traversalInfo; }
+
+ private:
+  //! The dataset.
+  const typename TreeType::Mat& dataset;
+  //! The clusters.
+  const arma::mat& centroids;
+  //! The new centroids.
+  arma::mat& newCentroids;
+  //! The counts of points in each cluster.
+  arma::Col<size_t>& counts;
+  //! Instantiated metric.
+  MetricType& metric;
+
+  //! The number of base cases that have been performed.
+  size_t baseCases;
+  //! The number of scores that have been performed.
+  size_t scores;
+
+  TraversalInfoType traversalInfo;
+
+  arma::uvec spareBlacklist;
+};
+
+}; // namespace kmeans
+}; // namespace mlpack
+
+// Include implementation.
+#include "pelleg_moore_kmeans_rules_impl.hpp"
+
+#endif
diff --git a/src/mlpack/methods/kmeans/pelleg_moore_kmeans_rules_impl.hpp b/src/mlpack/methods/kmeans/pelleg_moore_kmeans_rules_impl.hpp
new file mode 100644
index 0000000..1ad6fd4
--- /dev/null
+++ b/src/mlpack/methods/kmeans/pelleg_moore_kmeans_rules_impl.hpp
@@ -0,0 +1,199 @@
+/**
+ * @file pelleg_moore_kmeans_rules_impl.hpp
+ * @author Ryan Curtin
+ *
+ * Implementation of the pruning rules and base cases necessary to perform
+ * single-tree k-means clustering using the fast Pelleg-Moore k-means algorithm,
+ * which has been shoehorned into the mlpack tree abstractions.
+ */
+#ifndef __MLPACK_METHODS_KMEANS_PELLEG_MOORE_KMEANS_RULES_IMPL_HPP
+#define __MLPACK_METHODS_KMEANS_PELLEG_MOORE_KMEANS_RULES_IMPL_HPP
+
+namespace mlpack {
+namespace kmeans {
+
+template<typename MetricType, typename TreeType>
+PellegMooreKMeansRules<MetricType, TreeType>::PellegMooreKMeansRules(
+    const typename TreeType::Mat& dataset,
+    const arma::mat& centroids,
+    arma::mat& newCentroids,
+    arma::Col<size_t>& counts,
+    MetricType& metric) :
+    dataset(dataset),
+    centroids(centroids),
+    newCentroids(newCentroids),
+    counts(counts),
+    metric(metric),
+    baseCases(0),
+    scores(0),
+    spareBlacklist(centroids.n_cols)
+{
+  // Nothing to do.
+  spareBlacklist.zeros();
+}
+
+template<typename MetricType, typename TreeType>
+inline force_inline
+double PellegMooreKMeansRules<MetricType, TreeType>::BaseCase(
+    const size_t /* queryIndex */,
+    const size_t /* referenceIndex */)
+{
+  return 0.0;
+}
+
+template<typename MetricType, typename TreeType>
+double PellegMooreKMeansRules<MetricType, TreeType>::Score(
+    const size_t /* queryIndex */,
+    TreeType& referenceNode)
+{
+  // Obtain the parent's blacklist.  If this is the root node, we'll start with
+  // an empty blacklist.  This means that after each iteration, we don't need to
+  // reset any statistics.
+  arma::uvec* blacklistPtr = NULL;
+  if (referenceNode.Parent() == NULL ||
+      referenceNode.Parent()->Stat().Blacklist().size() == 0)
+    blacklistPtr = &spareBlacklist;
+  else
+    blacklistPtr = &referenceNode.Parent()->Stat().Blacklist();
+
+  // If the blacklist hasn't been initialized, fill it with zeros.
+  if (blacklistPtr->n_elem == 0)
+    blacklistPtr->zeros(centroids.n_cols);
+  referenceNode.Stat().Blacklist() = *blacklistPtr;
+
+  // The query index is a fake index that we won't use, and the reference node
+  // holds all of the points in the dataset.  Our goal is to determine whether
+  // or not this node is dominated by a single cluster.
+  const size_t whitelisted = centroids.n_cols - arma::accu(*blacklistPtr);
+
+  scores += whitelisted;
+
+  arma::vec minDistances(whitelisted);
+  minDistances.fill(DBL_MAX);
+  arma::Col<size_t> indexMappings(whitelisted);
+  size_t index = 0;
+  for (size_t i = 0; i < centroids.n_cols; ++i)
+  {
+    if ((*blacklistPtr)[i] == 0)
+    {
+      minDistances(index) = referenceNode.MinDistance(centroids.col(i));
+      indexMappings(index) = i;
+      ++index;
+    }
+  }
+
+  // Which cluster has minimum distance to the node?  Sort by distance.
+  // This should probably be rewritten -- we only need the minimum, not the
+  // entire sorted list.  That'll cost O(k) not O(k log k) (depending on sort
+  // type).
+  arma::uvec sortedClusterIndices = sort_index(minDistances);
+  const size_t closestCluster = indexMappings(sortedClusterIndices[0]);
+
+
+  // Now, for every other whitelisted cluster, determine if the closest cluster
+  // owns the point.  This calculation is specific to hyperrectangle trees (but,
+  // this implementation is specific to kd-trees, so that's okay).  For
+  // circular-bound trees, the condition should be simpler and can probably be
+  // expressed as a comparison between minimum and maximum distances.
+  size_t newBlacklisted = 0;
+  for (size_t c = 0; c < centroids.n_cols; ++c)
+  {
+    if (referenceNode.Stat().Blacklist()[c] == 1 || c == closestCluster)
+      continue;
+
+    // This algorithm comes from the proof of Lemma 4 in the extended version
+    // of the Pelleg-Moore paper (the CMU tech report, that is).  It has been
+    // adapted for speed.
+    arma::vec cornerPoint(centroids.n_rows);
+    for (size_t d = 0; d < referenceNode.Bound().Dim(); ++d)
+    {
+      if (centroids(d, c) > centroids(d, closestCluster))
+        cornerPoint(d) = referenceNode.Bound()[d].Hi();
+      else
+        cornerPoint(d) = referenceNode.Bound()[d].Lo();
+    }
+
+    const double closestDist = metric.Evaluate(cornerPoint,
+        centroids.col(closestCluster));
+    const double otherDist = metric.Evaluate(cornerPoint, centroids.col(c));
+
+    if (closestDist < otherDist)
+    {
+      // The closest cluster dominates the node with respect to the cluster c.
+      // So we can blacklist c.
+      referenceNode.Stat().Blacklist()[c] = 1;
+      ++newBlacklisted;
+    }
+  }
+
+  if (whitelisted - newBlacklisted == 1)
+  {
+    // This node is dominated by the first cluster.
+    const size_t cluster = indexMappings(sortedClusterIndices[0]);
+    counts[cluster] += referenceNode.NumDescendants();
+    newCentroids.col(cluster) += referenceNode.NumDescendants() *
+        referenceNode.Stat().Centroid();
+
+    if (referenceNode.Parent() == NULL ||
+        referenceNode.Parent()->Stat().Blacklist().size() == 0)
+    {
+      spareBlacklist.zeros(centroids.n_cols);
+    }
+
+    return DBL_MAX;
+  }
+
+  if (referenceNode.Parent() == NULL ||
+      referenceNode.Parent()->Stat().Blacklist().size() == 0)
+  {
+    spareBlacklist.zeros(centroids.n_cols);
+  }
+
+  // Perform the base case here.
+  for (size_t i = 0; i < referenceNode.NumPoints(); ++i)
+  {
+    size_t bestCluster = centroids.n_cols;
+    double bestDistance = DBL_MAX;
+    for (size_t c = 0; c < centroids.n_cols; ++c)
+    {
+      if (referenceNode.Stat().Blacklist()[c] == 1)
+        continue;
+
+      ++baseCases;
+
+      // The reference index is the index of the data point.
+      const double distance = metric.Evaluate(centroids.col(c),
+          dataset.col(referenceNode.Point(i)));
+
+      if (distance < bestDistance)
+      {
+        bestDistance = distance;
+        bestCluster = c;
+      }
+    }
+
+    // Add to resulting centroid.
+    newCentroids.col(bestCluster) += dataset.col(referenceNode.Point(i));
+    ++counts(bestCluster);
+  }
+
+  // Otherwise, we're not sure, so we can't prune.  Recursion order doesn't make
+  // a difference, so we'll just return a score of 0.
+  return 0.0;
+}
+
+template<typename MetricType, typename TreeType>
+double PellegMooreKMeansRules<MetricType, TreeType>::Rescore(
+    const size_t /* queryIndex */,
+    TreeType& /* referenceNode */,
+    const double oldScore)
+{
+  // There's no possible way that calling Rescore() can produce a prune now when
+  // it couldn't before.
+  return oldScore;
+}
+
+}; // namespace kmeans
+}; // namespace mlpack
+
+#endif
diff --git a/src/mlpack/methods/kmeans/pelleg_moore_kmeans_statistic.hpp b/src/mlpack/methods/kmeans/pelleg_moore_kmeans_statistic.hpp
new file mode 100644
index 0000000..2f8244e
--- /dev/null
+++ b/src/mlpack/methods/kmeans/pelleg_moore_kmeans_statistic.hpp
@@ -0,0 +1,81 @@
+/**
+ * @file pelleg_moore_kmeans_statistic.hpp
+ * @author Ryan Curtin
+ *
+ * A StatisticType for trees which holds the blacklist for various k-means
+ * clusters.  See the Pelleg and Moore paper for more details.
+ */
+#ifndef __MLPACK_METHODS_KMEANS_PELLEG_MOORE_KMEANS_STATISTIC_HPP
+#define __MLPACK_METHODS_KMEANS_PELLEG_MOORE_KMEANS_STATISTIC_HPP
+
+namespace mlpack {
+namespace kmeans {
+
+/**
+ * A statistic for trees which holds the blacklist for Pelleg-Moore k-means
+ * clustering (which represents the clusters that cannot possibly own any points
+ * in a node).
+ */
+class PellegMooreKMeansStatistic
+{
+ public:
+  //! Initialize the statistic without a node (this does nothing).
+  PellegMooreKMeansStatistic() { }
+
+  //! Initialize the statistic for a node; this calculates the centroid and
+  //! caches it.
+  template<typename TreeType>
+  PellegMooreKMeansStatistic(TreeType& node)
+  {
+    centroid.zeros(node.Dataset().n_rows);
+
+    // Hope it's a depth-first build procedure.  Also, this won't work right for
+    // trees that have self-children or stuff like that.
+    for (size_t i = 0; i < node.NumChildren(); ++i)
+    {
+      centroid += node.Child(i).NumDescendants() *
+          node.Child(i).Stat().Centroid();
+    }
+
+    for (size_t i = 0; i < node.NumPoints(); ++i)
+    {
+      centroid += node.Dataset().col(node.Point(i));
+    }
+
+    if (node.NumDescendants() > 0)
+      centroid /= node.NumDescendants();
+    else
+      centroid.fill(DBL_MAX); // Invalid centroid.  What else can we do?
+  }
+
+  //! Get the cluster blacklist.
+  const arma::uvec& Blacklist() const { return blacklist; }
+  //! Modify the cluster blacklist.
+  arma::uvec& Blacklist() { return blacklist; }
+
+  //! Get the node's centroid.
+  const arma::vec& Centroid() const { return centroid; }
+  //! Modify the node's centroid (be careful!).
+  arma::vec& Centroid() { return centroid; }
+
+  //! Return the object as a string.
+  std::string ToString() const
+  {
+    std::ostringstream convert;
+    convert << "KMeansStatistic [" << this << "]" << std::endl;
+    convert << "  Blacklist: " << blacklist.t();
+    convert << "  Centroid: " << centroid.t();
+    return convert.str();
+  }
+
+ private:
+  //! The cluster blacklist for the node.
+  arma::uvec blacklist;
+  //! The centroid of the node, cached for use during prunes.
+  arma::vec centroid;
+};
+
+}; // namespace kmeans
+}; // namespace mlpack
+
+#endif