[mlpack-git] master: Clean up an unnecessary sort, and remove spareBlacklist. (ecb190f)

gitdub at big.cc.gt.atl.ga.us gitdub at big.cc.gt.atl.ga.us
Thu Mar 5 22:01:16 EST 2015


Repository : https://github.com/mlpack/mlpack

On branch  : master
Link       : https://github.com/mlpack/mlpack/compare/904762495c039e345beba14c1142fd719b3bd50e...f94823c800ad6f7266995c700b1b630d5ffdcf40

>---------------------------------------------------------------

commit ecb190fac4ddddacfccba9a9e22e941ae4ed8bd1
Author: Ryan Curtin <ryan at ratml.org>
Date:   Sun Oct 12 20:45:13 2014 +0000

    Clean up an unnecessary sort, and remove spareBlacklist.


>---------------------------------------------------------------

ecb190fac4ddddacfccba9a9e22e941ae4ed8bd1
 .../methods/kmeans/pelleg_moore_kmeans_rules.hpp   |  3 -
 .../kmeans/pelleg_moore_kmeans_rules_impl.hpp      | 65 +++++++---------------
 2 files changed, 20 insertions(+), 48 deletions(-)

diff --git a/src/mlpack/methods/kmeans/pelleg_moore_kmeans_rules.hpp b/src/mlpack/methods/kmeans/pelleg_moore_kmeans_rules.hpp
index 874723d..c4fc419 100644
--- a/src/mlpack/methods/kmeans/pelleg_moore_kmeans_rules.hpp
+++ b/src/mlpack/methods/kmeans/pelleg_moore_kmeans_rules.hpp
@@ -51,9 +51,6 @@ class PellegMooreKMeansRules
 
   //! The number of O(d) distance calculations that have been performed.
   size_t distanceCalculations;
-
-  //! Spare blacklist; I think it's only used by the root node.
-  arma::uvec spareBlacklist;
 };
 
 }; // namespace kmeans
diff --git a/src/mlpack/methods/kmeans/pelleg_moore_kmeans_rules_impl.hpp b/src/mlpack/methods/kmeans/pelleg_moore_kmeans_rules_impl.hpp
index d0cced5..4ef735e 100644
--- a/src/mlpack/methods/kmeans/pelleg_moore_kmeans_rules_impl.hpp
+++ b/src/mlpack/methods/kmeans/pelleg_moore_kmeans_rules_impl.hpp
@@ -24,11 +24,9 @@ PellegMooreKMeansRules<MetricType, TreeType>::PellegMooreKMeansRules(
     newCentroids(newCentroids),
     counts(counts),
     metric(metric),
-    distanceCalculations(0),
-    spareBlacklist(centroids.n_cols)
+    distanceCalculations(0)
 {
   // Nothing to do.
-  spareBlacklist.zeros();
 }
 
 template<typename MetricType, typename TreeType>
@@ -48,47 +46,37 @@ double PellegMooreKMeansRules<MetricType, TreeType>::Score(
   // Obtain the parent's blacklist.  If this is the root node, we'll start with
   // an empty blacklist.  This means that after each iteration, we don't need to
   // reset any statistics.
-  arma::uvec* blacklistPtr = NULL;
   if (referenceNode.Parent() == NULL ||
-      referenceNode.Parent()->Stat().Blacklist().size() == 0)
-    blacklistPtr = &spareBlacklist;
+      referenceNode.Parent()->Stat().Blacklist().n_elem == 0)
+    referenceNode.Stat().Blacklist().zeros(centroids.n_cols);
   else
-    blacklistPtr = &referenceNode.Parent()->Stat().Blacklist();
-
-  // If the blacklist hasn't been initialized, fill it with zeros.
-  if (blacklistPtr->n_elem == 0)
-    blacklistPtr->zeros(centroids.n_cols);
-  referenceNode.Stat().Blacklist() = *blacklistPtr;
+    referenceNode.Stat().Blacklist() =
+        referenceNode.Parent()->Stat().Blacklist();
 
   // The query index is a fake index that we won't use, and the reference node
   // holds all of the points in the dataset.  Our goal is to determine whether
   // or not this node is dominated by a single cluster.
-  const size_t whitelisted = centroids.n_cols - arma::accu(*blacklistPtr);
+  const size_t whitelisted = centroids.n_cols -
+      arma::accu(referenceNode.Stat().Blacklist());
 
   distanceCalculations += whitelisted;
 
-  arma::vec minDistances(whitelisted);
-  minDistances.fill(DBL_MAX);
-  arma::Col<size_t> indexMappings(whitelisted);
-  size_t index = 0;
+  // Which cluster has minimum distance to the node?
+  size_t closestCluster = centroids.n_cols;
+  double minMinDistance = DBL_MAX;
   for (size_t i = 0; i < centroids.n_cols; ++i)
   {
-    if ((*blacklistPtr)[i] == 0)
+    if (referenceNode.Stat().Blacklist()[i] == 0)
     {
-      minDistances(index) = referenceNode.MinDistance(centroids.col(i));
-      indexMappings(index) = i;
-      ++index;
+      const double minDistance = referenceNode.MinDistance(centroids.col(i));
+      if (minDistance < minMinDistance)
+      {
+        minMinDistance = minDistance;
+        closestCluster = i;
+      }
     }
   }
 
-  // Which cluster has minimum distance to the node?  Sort by distance.
-  // This should probably be rewritten -- we only need the minimum, not the
-  // entire sorted list.  That'll cost O(k) not O(k log k) (depending on sort
-  // type).
-  arma::uvec sortedClusterIndices = sort_index(minDistances);
-  const size_t closestCluster = indexMappings(sortedClusterIndices[0]);
-
-
   // Now, for every other whitelisted cluster, determine if the closest cluster
   // owns the point.  This calculation is specific to hyperrectangle trees (but,
   // this implementation is specific to kd-trees, so that's okay).  For
@@ -129,27 +117,14 @@ double PellegMooreKMeansRules<MetricType, TreeType>::Score(
 
   if (whitelisted - newBlacklisted == 1)
   {
-    // This node is dominated by the first cluster.
-    const size_t cluster = indexMappings(sortedClusterIndices[0]);
-    counts[cluster] += referenceNode.NumDescendants();
-    newCentroids.col(cluster) += referenceNode.NumDescendants() *
+    // This node is dominated by the closest cluster.
+    counts[closestCluster] += referenceNode.NumDescendants();
+    newCentroids.col(closestCluster) += referenceNode.NumDescendants() *
         referenceNode.Stat().Centroid();
 
-    if (referenceNode.Parent() == NULL ||
-        referenceNode.Parent()->Stat().Blacklist().size() == 0)
-    {
-      spareBlacklist.zeros(centroids.n_cols);
-    }
-
     return DBL_MAX;
   }
 
-  if (referenceNode.Parent() == NULL ||
-      referenceNode.Parent()->Stat().Blacklist().size() == 0)
-  {
-    spareBlacklist.zeros(centroids.n_cols);
-  }
-
   // Perform the base case here.
   for (size_t i = 0; i < referenceNode.NumPoints(); ++i)
   {



More information about the mlpack-git mailing list