[mlpack-git] master: Further refactoring of KMeans. Fix MaxVarianceNewCluster, and also change it so that assignments are not passed in (since the inner KMeans loop doesn't really keep track of these now). (ca8cbba)

gitdub at big.cc.gt.atl.ga.us gitdub at big.cc.gt.atl.ga.us
Thu Mar 5 21:55:26 EST 2015


Repository : https://github.com/mlpack/mlpack

On branch  : master
Link       : https://github.com/mlpack/mlpack/compare/904762495c039e345beba14c1142fd719b3bd50e...f94823c800ad6f7266995c700b1b630d5ffdcf40

>---------------------------------------------------------------

commit ca8cbbaee0af81b254a07d0a391c0cc15bdb0387
Author: Ryan Curtin <ryan at ratml.org>
Date:   Wed Jul 30 16:33:07 2014 +0000

    Further refactoring of KMeans.  Fix MaxVarianceNewCluster, and also change it so
    that assignments are not passed in (since the inner KMeans loop doesn't really
    keep track of these now).


>---------------------------------------------------------------

ca8cbbaee0af81b254a07d0a391c0cc15bdb0387
 src/mlpack/methods/kmeans/allow_empty_clusters.hpp |  4 +-
 src/mlpack/methods/kmeans/kmeans_impl.hpp          | 61 +++++++---------------
 .../methods/kmeans/max_variance_new_cluster.hpp    |  4 +-
 .../kmeans/max_variance_new_cluster_impl.hpp       | 37 +++++++++----
 src/mlpack/methods/kmeans/naive_kmeans_impl.hpp    |  2 +
 5 files changed, 52 insertions(+), 56 deletions(-)

diff --git a/src/mlpack/methods/kmeans/allow_empty_clusters.hpp b/src/mlpack/methods/kmeans/allow_empty_clusters.hpp
index ae476b7..f2af635 100644
--- a/src/mlpack/methods/kmeans/allow_empty_clusters.hpp
+++ b/src/mlpack/methods/kmeans/allow_empty_clusters.hpp
@@ -36,12 +36,12 @@ class AllowEmptyClusters
    *
    * @return Number of points changed (0).
    */
-  template<typename MatType>
+  template<typename MetricType, typename MatType>
   static size_t EmptyCluster(const MatType& /* data */,
                              const size_t /* emptyCluster */,
                              const arma::mat& /* centroids */,
                              arma::Col<size_t>& /* clusterCounts */,
-                             arma::Col<size_t>& /* assignments */)
+                             MetricType& /* assignments */)
   {
     // Empty clusters are okay!  Do nothing.
     return 0;
diff --git a/src/mlpack/methods/kmeans/kmeans_impl.hpp b/src/mlpack/methods/kmeans/kmeans_impl.hpp
index 936831c..0254e21 100644
--- a/src/mlpack/methods/kmeans/kmeans_impl.hpp
+++ b/src/mlpack/methods/kmeans/kmeans_impl.hpp
@@ -122,19 +122,6 @@ Cluster(const MatType& data,
       Log::Fatal << "KMeans::Cluster(): initial cluster assignments (length "
           << assignments.n_elem << ") not the same size as the dataset (size "
           << data.n_cols << ")!" << std::endl;
-
-    // Calculate initial centroids.
-    counts.zeros(actualClusters);
-    centroids.zeros(data.n_rows, actualClusters);
-    for (size_t i = 0; i < data.n_cols; ++i)
-    {
-      centroids.col(assignments[i]) += data.col(i);
-      counts[assignments[i]]++;
-    }
-
-    for (size_t i = 0; i < actualClusters; ++i)
-      if (counts[i] != 0)
-        centroids.col(i) /= counts[i];
   }
   else if (initialCentroidGuess)
   {
@@ -147,36 +134,16 @@ Cluster(const MatType& data,
       Log::Fatal << "KMeans::Cluster(): initial cluster centroids have wrong "
         << " dimensionality (" << centroids.n_rows << ", should be "
         << data.n_rows << ")!" << std::endl;
-
-    // If there were no problems, construct the initial assignments from the
-    // given centroids.
-    assignments.set_size(data.n_cols);
-    for (size_t i = 0; i < data.n_cols; ++i)
-    {
-      // Find the closest centroid to this point.
-      double minDistance = std::numeric_limits<double>::infinity();
-      size_t closestCluster = clusters; // Invalid value.
-
-      for (size_t j = 0; j < clusters; j++)
-      {
-        double distance = metric.Evaluate(data.col(i), centroids.col(j));
-
-        if (distance < minDistance)
-        {
-          minDistance = distance;
-          closestCluster = j;
-        }
-      }
-
-      // Assign the point to the closest cluster that we found.
-      assignments[i] = closestCluster;
-    }
   }
   else
   {
     // Use the partitioner to come up with the partition assignments.
     partitioner.Cluster(data, actualClusters, assignments);
+  }
 
+  // Calculate the initial centroids, if we need to.
+  if (!initialCentroidGuess || (initialAssignmentGuess && initialCentroidGuess))
+  {
     // Calculate initial centroids.
     counts.zeros(actualClusters);
     centroids.zeros(data.n_rows, actualClusters);
@@ -191,7 +158,6 @@ Cluster(const MatType& data,
         centroids.col(i) /= counts[i];
   }
 
-  size_t changedAssignments = 0;
   size_t iteration = 0;
 
   LloydStepType<MetricType, MatType> lloydStep(data, metric);
@@ -210,9 +176,17 @@ Cluster(const MatType& data,
     // If we are not allowing empty clusters, then check that all of our
     // clusters have points.
     for (size_t i = 0; i < actualClusters; i++)
+    {
       if (counts[i] == 0)
-        changedAssignments += emptyClusterAction.EmptyCluster(data, i,
-            centroids, counts, assignments);
+      {
+        Log::Debug << "Cluster " << i << " is empty.\n";
+        if (iteration % 2 == 0)
+          emptyClusterAction.EmptyCluster(data, i, centroidsOther, counts,
+              metric);
+        else
+          emptyClusterAction.EmptyCluster(data, i, centroids, counts, metric);
+      }
+    }
 
     // Calculate cluster distortion for this iteration.
     cNorm = 0.0;
@@ -228,9 +202,11 @@ Cluster(const MatType& data,
 
   } while (cNorm > 1e-5 && iteration != maxIterations);
 
-  // Unfortunate copy that is sometimes necessary.
+  // If we ended on an even iteration, then the centroids are in the
+  // centroidsOther matrix, and we need to steal its memory (steal_mem() avoids
+  // a copy if possible).
   if (iteration % 2 == 0)
-    centroids = centroidsOther;
+    centroids.steal_mem(centroidsOther);
 
   if (iteration != maxIterations)
   {
@@ -244,6 +220,7 @@ Cluster(const MatType& data,
   }
 
   // Calculate final assignments.
+  assignments.set_size(data.n_cols);
   for (size_t i = 0; i < data.n_cols; ++i)
   {
     // Find the closest centroid to this point.
diff --git a/src/mlpack/methods/kmeans/max_variance_new_cluster.hpp b/src/mlpack/methods/kmeans/max_variance_new_cluster.hpp
index af14ca1..1bd7753 100644
--- a/src/mlpack/methods/kmeans/max_variance_new_cluster.hpp
+++ b/src/mlpack/methods/kmeans/max_variance_new_cluster.hpp
@@ -37,12 +37,12 @@ class MaxVarianceNewCluster
    *
    * @return Number of points changed.
    */
-  template<typename MatType>
+  template<typename MetricType, typename MatType>
   static size_t EmptyCluster(const MatType& data,
                              const size_t emptyCluster,
                              arma::mat& centroids,
                              arma::Col<size_t>& clusterCounts,
-                             arma::Col<size_t>& assignments);
+                             MetricType& metric);
 };
 
 }; // namespace kmeans
diff --git a/src/mlpack/methods/kmeans/max_variance_new_cluster_impl.hpp b/src/mlpack/methods/kmeans/max_variance_new_cluster_impl.hpp
index 91b40f5..6ba376b 100644
--- a/src/mlpack/methods/kmeans/max_variance_new_cluster_impl.hpp
+++ b/src/mlpack/methods/kmeans/max_variance_new_cluster_impl.hpp
@@ -16,24 +16,41 @@ namespace kmeans {
 /**
  * Take action about an empty cluster.
  */
-template<typename MatType>
+template<typename MetricType, typename MatType>
 size_t MaxVarianceNewCluster::EmptyCluster(const MatType& data,
                                            const size_t emptyCluster,
                                            arma::mat& centroids,
                                            arma::Col<size_t>& clusterCounts,
-                                           arma::Col<size_t>& assignments)
+                                           MetricType& metric)
 {
   // First, we need to find the cluster with maximum variance (by which I mean
   // the sum of the covariance matrices).
   arma::vec variances;
   variances.zeros(clusterCounts.n_elem); // Start with 0.
+  arma::Col<size_t> assignments(data.n_cols);
 
   // Add the variance of each point's distance away from the cluster.  I think
   // this is the sensible thing to do.
   for (size_t i = 0; i < data.n_cols; ++i)
   {
-    variances[assignments[i]] += metric::SquaredEuclideanDistance::Evaluate(
-        data.col(i), centroids.col(assignments[i]));
+    // Find the closest centroid to this point.
+    double minDistance = std::numeric_limits<double>::infinity();
+    size_t closestCluster = centroids.n_cols; // Invalid value.
+
+    for (size_t j = 0; j < centroids.n_cols; j++)
+    { 
+      const double distance = metric.Evaluate(data.col(i), centroids.col(j));
+
+      if (distance < minDistance)
+      {
+        minDistance = distance;
+        closestCluster = j;
+      }
+    }
+
+    assignments[i] = closestCluster;
+    variances[closestCluster] += metric.Evaluate(data.col(i),
+        centroids.col(closestCluster));
   }
 
   // Divide by the number of points in the cluster to produce the variance.
@@ -55,8 +72,8 @@ size_t MaxVarianceNewCluster::EmptyCluster(const MatType& data,
   {
     if (assignments[i] == maxVarCluster)
     {
-      double distance = arma::as_scalar(
-          arma::var(data.col(i) - centroids.col(maxVarCluster)));
+      const double distance = metric.Evaluate(data.col(i),
+          centroids.col(maxVarCluster));
 
       if (distance > maxDistance)
       {
@@ -67,15 +84,15 @@ size_t MaxVarianceNewCluster::EmptyCluster(const MatType& data,
   }
 
   // Take that point and add it to the empty cluster.
-  centroids.col(maxVarCluster) *= (clusterCounts[maxVarCluster] /
-      --clusterCounts[maxVarCluster]);
-  centroids.col(maxVarCluster) -= (1.0 / clusterCounts[maxVarCluster]) *
+  centroids.col(maxVarCluster) *= (double(clusterCounts[maxVarCluster]) /
+      double(clusterCounts[maxVarCluster] - 1));
+  centroids.col(maxVarCluster) -= (1.0 / (clusterCounts[maxVarCluster] - 1.0)) *
       data.col(furthestPoint);
+  clusterCounts[maxVarCluster]--;
   clusterCounts[emptyCluster]++;
   centroids.col(emptyCluster) = arma::vec(data.col(furthestPoint));
   assignments[furthestPoint] = emptyCluster;
 
-
   // Output some debugging information.
   Log::Debug << "Point " << furthestPoint << " assigned to empty cluster " <<
       emptyCluster << ".\n";
diff --git a/src/mlpack/methods/kmeans/naive_kmeans_impl.hpp b/src/mlpack/methods/kmeans/naive_kmeans_impl.hpp
index 4e9163c..d042bac 100644
--- a/src/mlpack/methods/kmeans/naive_kmeans_impl.hpp
+++ b/src/mlpack/methods/kmeans/naive_kmeans_impl.hpp
@@ -60,6 +60,8 @@ void NaiveKMeans<MetricType, MatType>::Iterate(const arma::mat& centroids,
   for (size_t i = 0; i < centroids.n_cols; ++i)
     if (counts(i) != 0)
       newCentroids.col(i) /= counts(i);
+    else
+      newCentroids.col(i).fill(DBL_MAX); // Invalid value.
 }
 
 } // namespace kmeans



More information about the mlpack-git mailing list