[mlpack-git] master: Refactor RefinedStart to give centroids not assignments. (b8bb079)

Tue Apr 12 10:43:52 EDT 2016

Repository : https://github.com/mlpack/mlpack
On branch  : master
Link       : https://github.com/mlpack/mlpack/compare/eeba6bdc50ad4d785cb6880edbaba78173036ca6...8d77f4231046703d5c0c05ed4795458f98267968

>---------------------------------------------------------------

commit b8bb079a5feb7e8d95cdbd6094bc2fa28b5e1971
Author: Ryan Curtin <ryan at ratml.org>
Date:   Tue Apr 12 14:42:21 2016 +0000

    Refactor RefinedStart to give centroids not assignments.


>---------------------------------------------------------------

b8bb079a5feb7e8d95cdbd6094bc2fa28b5e1971
 src/mlpack/methods/kmeans/refined_start.hpp      | 18 +++++++++++++++-
 src/mlpack/methods/kmeans/refined_start_impl.hpp | 27 +++++++++++++++++-------
 2 files changed, 36 insertions(+), 9 deletions(-)

diff --git a/src/mlpack/methods/kmeans/refined_start.hpp b/src/mlpack/methods/kmeans/refined_start.hpp
index d06e463..93449f5 100644
--- a/src/mlpack/methods/kmeans/refined_start.hpp
+++ b/src/mlpack/methods/kmeans/refined_start.hpp
@@ -43,7 +43,23 @@ class RefinedStart
 
   /**
    * Partition the given dataset into the given number of clusters according to
-   * the random sampling scheme outlined in Bradley and Fayyad's paper.
+   * the random sampling scheme outlined in Bradley and Fayyad's paper, and
+   * return centroids.
+   *
+   * @tparam MatType Type of data (arma::mat or arma::sp_mat).
+   * @param data Dataset to partition.
+   * @param clusters Number of clusters to split dataset into.
+   * @param centroids Matrix to store centroids into.
+   */
+  template<typename MatType>
+  void Cluster(const MatType& data,
+               const size_t clusters,
+               arma::mat& centroids) const;
+
+  /**
+   * Partition the given dataset into the given number of clusters according to
+   * the random sampling scheme outlined in Bradley and Fayyad's paper, and
+   * return point assignments.
    *
    * @tparam MatType Type of data (arma::mat or arma::sp_mat).
    * @param data Dataset to partition.
diff --git a/src/mlpack/methods/kmeans/refined_start_impl.hpp b/src/mlpack/methods/kmeans/refined_start_impl.hpp
index 1b178b9..4379709 100644
--- a/src/mlpack/methods/kmeans/refined_start_impl.hpp
+++ b/src/mlpack/methods/kmeans/refined_start_impl.hpp
@@ -19,7 +19,7 @@ namespace kmeans {
 template<typename MatType>
 void RefinedStart::Cluster(const MatType& data,
                            const size_t clusters,
-                           arma::Row<size_t>& assignments) const
+                           arma::mat& centroids) const
 {
   // This will hold the sampled datasets.
   const size_t numPoints = size_t(percentage * data.n_cols);
@@ -28,10 +28,6 @@ void RefinedStart::Cluster(const MatType& data,
   std::vector<bool> pointsUsed(data.n_cols, false);
   arma::mat sampledCentroids(data.n_rows, samplings * clusters);
 
-  // We will use these objects repeatedly for clustering.
-  arma::Row<size_t> sampledAssignments;
-  arma::mat centroids;
-
   for (size_t i = 0; i < samplings; ++i)
   {
     // First, assemble the sampled dataset.
@@ -55,7 +51,7 @@ void RefinedStart::Cluster(const MatType& data,
     // the cluster with maximum variance.  This is not *exactly* what the paper
     // implements, but it is quite similar, and we'll call it "good enough".
     KMeans<> kmeans;
-    kmeans.Cluster(sampledData, clusters, sampledAssignments, centroids);
+    kmeans.Cluster(sampledData, clusters, centroids);
 
     // Store the sampled centroids.
     sampledCentroids.cols(i * clusters, (i + 1) * clusters - 1) = centroids;
@@ -65,7 +61,18 @@ void RefinedStart::Cluster(const MatType& data,
 
   // Now, we run k-means on the sampled centroids to get our final clusters.
   KMeans<> kmeans;
-  kmeans.Cluster(sampledCentroids, clusters, sampledAssignments, centroids);
+  kmeans.Cluster(sampledCentroids, clusters, centroids);
+}
+
+template<typename MatType>
+void RefinedStart::Cluster(const MatType& data,
+                           const size_t clusters,
+                           arma::Row<size_t>& assignments) const
+{
+  // Perform the Bradley-Fayyad refined start algorithm, and get initial
+  // centroids back.
+  arma::mat centroids;
+  Cluster(data, clusters, centroids);
 
   // Turn the final centroids into assignments.
   assignments.set_size(data.n_cols);
@@ -77,7 +84,11 @@ void RefinedStart::Cluster(const MatType& data,
 
     for (size_t j = 0; j < clusters; ++j)
     {
-      const double distance = kmeans.Metric().Evaluate(data.col(i),
+      // This is restricted to the L2 distance, and unfortunately it would take
+      // a lot of refactoring and redesign to make this more general... we would
+      // probably need to have KMeans take a template template parameter for the
+      // initial partition policy.  It's not clear how to best do this.
+      const double distance = metric::EuclideanDistance::Evaluate(data.col(i),
           centroids.col(j));
 
       if (distance < minDistance)