[mlpack-git] master: Refactor EmptyClusterPolicy to use both the old and the new clusters. (9264f75)

gitdub at big.cc.gt.atl.ga.us gitdub at big.cc.gt.atl.ga.us
Tue Jun 16 14:40:29 EDT 2015


Repository : https://github.com/mlpack/mlpack

On branch  : master
Link       : https://github.com/mlpack/mlpack/compare/4156e3f42f988e383c984e799f84453b6ca68acc...9264f7544f7c4d93ff735f00f35b0f5287abf59d

>---------------------------------------------------------------

commit 9264f7544f7c4d93ff735f00f35b0f5287abf59d
Author: Ryan Curtin <ryan at ratml.org>
Date:   Tue Jun 16 14:40:15 2015 -0400

    Refactor EmptyClusterPolicy to use both the old and the new clusters.


>---------------------------------------------------------------

9264f7544f7c4d93ff735f00f35b0f5287abf59d
 src/mlpack/methods/kmeans/allow_empty_clusters.hpp |  8 ++++--
 src/mlpack/methods/kmeans/kmeans_impl.hpp          |  8 +++---
 .../methods/kmeans/max_variance_new_cluster.hpp    | 10 +++++---
 .../kmeans/max_variance_new_cluster_impl.hpp       | 30 ++++++++++++----------
 4 files changed, 33 insertions(+), 23 deletions(-)

diff --git a/src/mlpack/methods/kmeans/allow_empty_clusters.hpp b/src/mlpack/methods/kmeans/allow_empty_clusters.hpp
index e231a23..2fb4534 100644
--- a/src/mlpack/methods/kmeans/allow_empty_clusters.hpp
+++ b/src/mlpack/methods/kmeans/allow_empty_clusters.hpp
@@ -30,7 +30,10 @@ class AllowEmptyClusters
    * @tparam MatType Type of data (arma::mat or arma::spmat).
    * @param data Dataset on which clustering is being performed.
    * @param emptyCluster Index of cluster which is empty.
-   * @param centroids Centroids of each cluster (one per column).
+   * @param oldCentroids Centroids of each cluster (one per column) at the start
+   *      of the iteration.
+   * @param newCentroids Centroids of each cluster (one per column) at the end
+   *      of the iteration.
    * @param clusterCounts Number of points in each cluster.
    * @param assignments Cluster assignments of each point.
    * @param iteration Number of iteration.
@@ -41,7 +44,8 @@ class AllowEmptyClusters
   static inline force_inline size_t EmptyCluster(
       const MatType& /* data */,
       const size_t /* emptyCluster */,
-      const arma::mat& /* centroids */,
+      const arma::mat& /* oldCentroids */,
+      arma::mat& /* newCentroids */,
       arma::Col<size_t>& /* clusterCounts */,
       MetricType& /* metric */,
       const size_t /* iteration */)
diff --git a/src/mlpack/methods/kmeans/kmeans_impl.hpp b/src/mlpack/methods/kmeans/kmeans_impl.hpp
index 7123fb4..16ba7e5 100644
--- a/src/mlpack/methods/kmeans/kmeans_impl.hpp
+++ b/src/mlpack/methods/kmeans/kmeans_impl.hpp
@@ -157,11 +157,11 @@ Cluster(const MatType& data,
       {
         Log::Info << "Cluster " << i << " is empty.\n";
         if (iteration % 2 == 0)
-          emptyClusterAction.EmptyCluster(data, i, centroidsOther, counts,
-              metric, iteration);
+          emptyClusterAction.EmptyCluster(data, i, centroids, centroidsOther,
+              counts, metric, iteration);
         else
-          emptyClusterAction.EmptyCluster(data, i, centroids, counts, metric,
-              iteration);
+          emptyClusterAction.EmptyCluster(data, i, centroidsOther, centroids,
+              counts, metric, iteration);
       }
     }
 
diff --git a/src/mlpack/methods/kmeans/max_variance_new_cluster.hpp b/src/mlpack/methods/kmeans/max_variance_new_cluster.hpp
index 8a18496..ad962fc 100644
--- a/src/mlpack/methods/kmeans/max_variance_new_cluster.hpp
+++ b/src/mlpack/methods/kmeans/max_variance_new_cluster.hpp
@@ -31,7 +31,10 @@ class MaxVarianceNewCluster
    * @tparam MatType Type of data (arma::mat or arma::sp_mat).
    * @param data Dataset on which clustering is being performed.
    * @param emptyCluster Index of cluster which is empty.
-   * @param centroids Centroids of each cluster (one per column).
+   * @param oldCentroids Centroids of each cluster (one per column), at the
+   *      start of the iteration.
+   * @param newCentroids Centroids of each cluster (one per column), at the end
+   *      of the iteration.  This will be modified!
    * @param clusterCounts Number of points in each cluster.
    * @param assignments Cluster assignments of each point.
    *
@@ -40,7 +43,8 @@ class MaxVarianceNewCluster
   template<typename MetricType, typename MatType>
   size_t EmptyCluster(const MatType& data,
                       const size_t emptyCluster,
-                      arma::mat& centroids,
+                      const arma::mat& oldCentroids,
+                      arma::mat& newCentroids,
                       arma::Col<size_t>& clusterCounts,
                       MetricType& metric,
                       const size_t iteration);
@@ -56,7 +60,7 @@ class MaxVarianceNewCluster
   //! Called when we are on a new iteration.
   template<typename MetricType, typename MatType>
   void Precalculate(const MatType& data,
-                    arma::mat& centroids,
+                    const arma::mat& oldCentroids,
                     arma::Col<size_t>& clusterCounts,
                     MetricType& metric);
 };
diff --git a/src/mlpack/methods/kmeans/max_variance_new_cluster_impl.hpp b/src/mlpack/methods/kmeans/max_variance_new_cluster_impl.hpp
index 2e6c117..c87200d 100644
--- a/src/mlpack/methods/kmeans/max_variance_new_cluster_impl.hpp
+++ b/src/mlpack/methods/kmeans/max_variance_new_cluster_impl.hpp
@@ -19,14 +19,15 @@ namespace kmeans {
 template<typename MetricType, typename MatType>
 size_t MaxVarianceNewCluster::EmptyCluster(const MatType& data,
                                            const size_t emptyCluster,
-                                           arma::mat& centroids,
+                                           const arma::mat& oldCentroids,
+                                           arma::mat& newCentroids,
                                            arma::Col<size_t>& clusterCounts,
                                            MetricType& metric,
                                            const size_t iteration)
 {
   // If necessary, calculate the variances and assignments.
   if (iteration != this->iteration || assignments.n_elem != data.n_cols)
-    Precalculate(data, centroids, clusterCounts, metric);
+    Precalculate(data, oldCentroids, clusterCounts, metric);
   this->iteration = iteration;
 
   // Now find the cluster with maximum variance.
@@ -41,7 +42,7 @@ size_t MaxVarianceNewCluster::EmptyCluster(const MatType& data,
     if (assignments[i] == maxVarCluster)
     {
       const double distance = std::pow(metric.Evaluate(data.col(i),
-          centroids.col(maxVarCluster)), 2.0);
+          newCentroids.col(maxVarCluster)), 2.0);
 
       if (distance > maxDistance)
       {
@@ -52,19 +53,20 @@ size_t MaxVarianceNewCluster::EmptyCluster(const MatType& data,
   }
 
   // Take that point and add it to the empty cluster.
-  centroids.col(maxVarCluster) *= (double(clusterCounts[maxVarCluster]) /
+  newCentroids.col(maxVarCluster) *= (double(clusterCounts[maxVarCluster]) /
       double(clusterCounts[maxVarCluster] - 1));
-  centroids.col(maxVarCluster) -= (1.0 / (clusterCounts[maxVarCluster] - 1.0)) *
+  newCentroids.col(maxVarCluster) -= (1.0 / (clusterCounts[maxVarCluster] - 1.0)) *
       arma::vec(data.col(furthestPoint));
   clusterCounts[maxVarCluster]--;
   clusterCounts[emptyCluster]++;
-  centroids.col(emptyCluster) = arma::vec(data.col(furthestPoint));
+  newCentroids.col(emptyCluster) = arma::vec(data.col(furthestPoint));
   assignments[furthestPoint] = emptyCluster;
 
   // Modify the variances, as necessary.
   variances[emptyCluster] = 0;
-  variances[maxVarCluster] = (1.0 / (clusterCounts[maxVarCluster] - 1)) *
-      (variances[maxVarCluster] - maxDistance);
+  // One has already been subtracted from clusterCounts[maxVarCluster].
+  variances[maxVarCluster] = (1.0 / (clusterCounts[maxVarCluster])) *
+      ((clusterCounts[maxVarCluster] + 1) * variances[maxVarCluster] - maxDistance);
 
   // Output some debugging information.
   Log::Debug << "Point " << furthestPoint << " assigned to empty cluster " <<
@@ -75,14 +77,14 @@ size_t MaxVarianceNewCluster::EmptyCluster(const MatType& data,
 
 template<typename MetricType, typename MatType>
 void MaxVarianceNewCluster::Precalculate(const MatType& data,
-                                         arma::mat& centroids,
+                                         const arma::mat& oldCentroids,
                                          arma::Col<size_t>& clusterCounts,
                                          MetricType& metric)
 {
   // We have to calculate the variances of each cluster and the assignments of
   // each point.  This is most easily done by iterating through the entire
   // dataset.
-  variances.zeros(centroids.n_cols);
+  variances.zeros(oldCentroids.n_cols);
   assignments.set_size(data.n_cols);
 
   // Add the variance of each point's distance away from the cluster.  I think
@@ -91,11 +93,11 @@ void MaxVarianceNewCluster::Precalculate(const MatType& data,
   {
     // Find the closest centroid to this point.
     double minDistance = std::numeric_limits<double>::infinity();
-    size_t closestCluster = centroids.n_cols; // Invalid value.
+    size_t closestCluster = oldCentroids.n_cols; // Invalid value.
 
-    for (size_t j = 0; j < centroids.n_cols; j++)
+    for (size_t j = 0; j < oldCentroids.n_cols; j++)
     {
-      const double distance = metric.Evaluate(data.col(i), centroids.col(j));
+      const double distance = metric.Evaluate(data.col(i), oldCentroids.col(j));
 
       if (distance < minDistance)
       {
@@ -106,7 +108,7 @@ void MaxVarianceNewCluster::Precalculate(const MatType& data,
 
     assignments[i] = closestCluster;
     variances[closestCluster] += std::pow(metric.Evaluate(data.col(i),
-        centroids.col(closestCluster)), 2.0);
+        oldCentroids.col(closestCluster)), 2.0);
   }
 
   // Divide by the number of points in the cluster to produce the variance,



More information about the mlpack-git mailing list