[mlpack-git] master: Refactor to precalculate variances. (35c0f04)

gitdub at big.cc.gt.atl.ga.us gitdub at big.cc.gt.atl.ga.us
Mon Jun 1 11:16:09 EDT 2015


Repository : https://github.com/mlpack/mlpack

On branch  : master
Link       : https://github.com/mlpack/mlpack/compare/803224f64f6b6490c3cb8aafd528b1db069ef2f1...7011dd89f6a9cf8cf7b66e1b42c9147b606551c2

>---------------------------------------------------------------

commit 35c0f041b0ade534eba7c60e82890d32fa491c2e
Author: Ryan Curtin <ryan at ratml.org>
Date:   Mon Jun 1 11:15:17 2015 -0400

    Refactor to precalculate variances.


>---------------------------------------------------------------

35c0f041b0ade534eba7c60e82890d32fa491c2e
 src/mlpack/methods/kmeans/allow_empty_clusters.hpp |  8 +-
 src/mlpack/methods/kmeans/kmeans_impl.hpp          |  5 +-
 .../methods/kmeans/max_variance_new_cluster.hpp    | 32 +++++--
 .../kmeans/max_variance_new_cluster_impl.hpp       | 98 +++++++++++++---------
 4 files changed, 90 insertions(+), 53 deletions(-)

diff --git a/src/mlpack/methods/kmeans/allow_empty_clusters.hpp b/src/mlpack/methods/kmeans/allow_empty_clusters.hpp
index a8b0422..e231a23 100644
--- a/src/mlpack/methods/kmeans/allow_empty_clusters.hpp
+++ b/src/mlpack/methods/kmeans/allow_empty_clusters.hpp
@@ -33,6 +33,7 @@ class AllowEmptyClusters
    * @param centroids Centroids of each cluster (one per column).
    * @param clusterCounts Number of points in each cluster.
    * @param assignments Cluster assignments of each point.
+   * @param iteration Number of iteration.
    *
    * @return Number of points changed (0).
    */
@@ -42,14 +43,15 @@ class AllowEmptyClusters
       const size_t /* emptyCluster */,
       const arma::mat& /* centroids */,
       arma::Col<size_t>& /* clusterCounts */,
-      MetricType& /* metric */)
+      MetricType& /* metric */,
+      const size_t /* iteration */)
   {
     // Empty clusters are okay!  Do nothing.
     return 0;
   }
 };
 
-}; // namespace kmeans
-}; // namespace mlpack
+} // namespace kmeans
+} // namespace mlpack
 
 #endif
diff --git a/src/mlpack/methods/kmeans/kmeans_impl.hpp b/src/mlpack/methods/kmeans/kmeans_impl.hpp
index 9bdf29c..862010f 100644
--- a/src/mlpack/methods/kmeans/kmeans_impl.hpp
+++ b/src/mlpack/methods/kmeans/kmeans_impl.hpp
@@ -158,9 +158,10 @@ Cluster(const MatType& data,
         Log::Info << "Cluster " << i << " is empty.\n";
         if (iteration % 2 == 0)
           emptyClusterAction.EmptyCluster(data, i, centroidsOther, counts,
-              metric);
+              metric, iteration);
         else
-          emptyClusterAction.EmptyCluster(data, i, centroids, counts, metric);
+          emptyClusterAction.EmptyCluster(data, i, centroids, counts, metric,
+              iteration);
       }
     }
 
diff --git a/src/mlpack/methods/kmeans/max_variance_new_cluster.hpp b/src/mlpack/methods/kmeans/max_variance_new_cluster.hpp
index 1bd7753..8a18496 100644
--- a/src/mlpack/methods/kmeans/max_variance_new_cluster.hpp
+++ b/src/mlpack/methods/kmeans/max_variance_new_cluster.hpp
@@ -22,7 +22,7 @@ class MaxVarianceNewCluster
 {
  public:
   //! Default constructor required by EmptyClusterPolicy.
-  MaxVarianceNewCluster() { }
+  MaxVarianceNewCluster() : iteration(size_t(-1)) { }
 
   /**
    * Take the point furthest from the centroid of the cluster with maximum
@@ -38,15 +38,31 @@ class MaxVarianceNewCluster
    * @return Number of points changed.
    */
   template<typename MetricType, typename MatType>
-  static size_t EmptyCluster(const MatType& data,
-                             const size_t emptyCluster,
-                             arma::mat& centroids,
-                             arma::Col<size_t>& clusterCounts,
-                             MetricType& metric);
+  size_t EmptyCluster(const MatType& data,
+                      const size_t emptyCluster,
+                      arma::mat& centroids,
+                      arma::Col<size_t>& clusterCounts,
+                      MetricType& metric,
+                      const size_t iteration);
+
+ private:
+  //! Index of iteration for which variance is cached.
+  size_t iteration;
+  //! Cached variances for each cluster.
+  arma::vec variances;
+  //! Cached assignments for each point.
+  arma::Col<size_t> assignments;
+
+  //! Called when we are on a new iteration.
+  template<typename MetricType, typename MatType>
+  void Precalculate(const MatType& data,
+                    arma::mat& centroids,
+                    arma::Col<size_t>& clusterCounts,
+                    MetricType& metric);
 };
 
-}; // namespace kmeans
-}; // namespace mlpack
+} // namespace kmeans
+} // namespace mlpack
 
 // Include implementation.
 #include "max_variance_new_cluster_impl.hpp"
diff --git a/src/mlpack/methods/kmeans/max_variance_new_cluster_impl.hpp b/src/mlpack/methods/kmeans/max_variance_new_cluster_impl.hpp
index 339574e..7bc4f76 100644
--- a/src/mlpack/methods/kmeans/max_variance_new_cluster_impl.hpp
+++ b/src/mlpack/methods/kmeans/max_variance_new_cluster_impl.hpp
@@ -21,45 +21,13 @@ size_t MaxVarianceNewCluster::EmptyCluster(const MatType& data,
                                            const size_t emptyCluster,
                                            arma::mat& centroids,
                                            arma::Col<size_t>& clusterCounts,
-                                           MetricType& metric)
+                                           MetricType& metric,
+                                           const size_t iteration)
 {
-  // First, we need to find the cluster with maximum variance (by which I mean
-  // the sum of the covariance matrices).
-  arma::vec variances;
-  variances.zeros(clusterCounts.n_elem); // Start with 0.
-  arma::Col<size_t> assignments(data.n_cols);
-
-  // Add the variance of each point's distance away from the cluster.  I think
-  // this is the sensible thing to do.
-  for (size_t i = 0; i < data.n_cols; ++i)
-  {
-    // Find the closest centroid to this point.
-    double minDistance = std::numeric_limits<double>::infinity();
-    size_t closestCluster = centroids.n_cols; // Invalid value.
-
-    for (size_t j = 0; j < centroids.n_cols; j++)
-    {
-      const double distance = metric.Evaluate(data.col(i), centroids.col(j));
-
-      if (distance < minDistance)
-      {
-        minDistance = distance;
-        closestCluster = j;
-      }
-    }
-
-    assignments[i] = closestCluster;
-    variances[closestCluster] += metric.Evaluate(data.col(i),
-        centroids.col(closestCluster));
-  }
-
-  // Divide by the number of points in the cluster to produce the variance.
-  // Although a -nan will occur here for the empty cluster(s), this doesn't
-  // matter because variances.max() won't pick it up.  If the number of points
-  // in the cluster is 1, we ensure that cluster is not selected by forcing the
-  // variance to 0.
-  for (size_t i = 0; i < clusterCounts.n_elem; ++i)
-    variances[i] /= (clusterCounts[i] == 1) ? DBL_MAX : clusterCounts[i];
+  // If necessary, calculate the variances and assignments.
+  if (iteration != this->iteration || assignments.n_elem != data.n_cols)
+    Precalculate(data, centroids, clusterCounts, metric);
+  this->iteration = iteration;
 
   // Now find the cluster with maximum variance.
   arma::uword maxVarCluster;
@@ -72,8 +40,8 @@ size_t MaxVarianceNewCluster::EmptyCluster(const MatType& data,
   {
     if (assignments[i] == maxVarCluster)
     {
-      const double distance = metric.Evaluate(data.col(i),
-          centroids.col(maxVarCluster));
+      const double distance = std::pow(metric.Evaluate(data.col(i),
+          centroids.col(maxVarCluster)), 2.0);
 
       if (distance > maxDistance)
       {
@@ -93,6 +61,11 @@ size_t MaxVarianceNewCluster::EmptyCluster(const MatType& data,
   centroids.col(emptyCluster) = arma::vec(data.col(furthestPoint));
   assignments[furthestPoint] = emptyCluster;
 
+  // Modify the variances, as necessary.
+  variances[emptyCluster] = 0;
+  variances[maxVarCluster] = (1.0 / (clusterCounts[maxVarCluster] - 1)) *
+      (variances[maxVarCluster] - maxDistance);
+
   // Output some debugging information.
   Log::Debug << "Point " << furthestPoint << " assigned to empty cluster " <<
       emptyCluster << ".\n";
@@ -100,6 +73,51 @@ size_t MaxVarianceNewCluster::EmptyCluster(const MatType& data,
   return 1; // We only changed one point.
 }
 
+template<typename MetricType, typename MatType>
+void MaxVarianceNewCluster::Precalculate(const MatType& data,
+                                         arma::mat& centroids,
+                                         arma::Col<size_t>& clusterCounts,
+                                         MetricType& metric)
+{
+  // We have to calculate the variances of each cluster and the assignments of
+  // each point.  This is most easily done by iterating through the entire
+  // dataset.
+  variances.zeros(centroids.n_cols);
+  assignments.set_size(data.n_cols);
+
+  // Add the variance of each point's distance away from the cluster.  I think
+  // this is the sensible thing to do.
+  for (size_t i = 0; i < data.n_cols; ++i)
+  {
+    // Find the closest centroid to this point.
+    double minDistance = std::numeric_limits<double>::infinity();
+    size_t closestCluster = centroids.n_cols; // Invalid value.
+
+    for (size_t j = 0; j < centroids.n_cols; j++)
+    {
+      const double distance = metric.Evaluate(data.col(i), centroids.col(j));
+
+      if (distance < minDistance)
+      {
+        minDistance = distance;
+        closestCluster = j;
+      }
+    }
+
+    assignments[i] = closestCluster;
+    variances[closestCluster] += std::pow(metric.Evaluate(data.col(i),
+        centroids.col(closestCluster)), 2.0);
+  }
+
+  // Divide by the number of points in the cluster to produce the variance.
+  // Although a -nan will occur here for the empty cluster(s), this doesn't
+  // matter because variances.max() won't pick it up.  If the number of points
+  // in the cluster is 1, we ensure that cluster is not selected by forcing the
+  // variance to 0.
+  for (size_t i = 0; i < clusterCounts.n_elem; ++i)
+    variances[i] /= (clusterCounts[i] == 1) ? DBL_MAX : clusterCounts[i];
+}
+
 }; // namespace kmeans
 }; // namespace mlpack
 



More information about the mlpack-git mailing list