[mlpack-svn] r14919 - mlpack/trunk/src/mlpack/methods/kmeans

Thu Apr 18 19:10:09 EDT 2013

Author: rcurtin
Date: 2013-04-18 19:10:09 -0400 (Thu, 18 Apr 2013)
New Revision: 14919

Added:
   mlpack/trunk/src/mlpack/methods/kmeans/refined_start.hpp
   mlpack/trunk/src/mlpack/methods/kmeans/refined_start_impl.hpp
Modified:
   mlpack/trunk/src/mlpack/methods/kmeans/CMakeLists.txt
Log:
Add a new initialization method, which is an implementation of Bradley and
Fayyad's scheme for better k-means initialization points.  It seems to work
well.


Modified: mlpack/trunk/src/mlpack/methods/kmeans/CMakeLists.txt
===================================================================

--- mlpack/trunk/src/mlpack/methods/kmeans/CMakeLists.txt	2013-04-18 18:29:05 UTC (rev 14918)
+++ mlpack/trunk/src/mlpack/methods/kmeans/CMakeLists.txt	2013-04-18 23:10:09 UTC (rev 14919)
@@ -7,6 +7,8 @@
   max_variance_new_cluster.hpp
   max_variance_new_cluster_impl.hpp
   random_partition.hpp
+  refined_start.hpp
+  refined_start_impl.hpp
 )
 
 # Add directory name to sources.

Added: mlpack/trunk/src/mlpack/methods/kmeans/refined_start.hpp
===================================================================
--- mlpack/trunk/src/mlpack/methods/kmeans/refined_start.hpp	                        (rev 0)
+++ mlpack/trunk/src/mlpack/methods/kmeans/refined_start.hpp	2013-04-18 23:10:09 UTC (rev 14919)
@@ -0,0 +1,72 @@
+/**
+ * @file refined_start.hpp
+ * @author Ryan Curtin
+ *
+ * An implementation of Bradley and Fayyad's "Refining Initial Points for
+ * K-Means clustering".  This class is meant to provide better initial points
+ * for the k-means algorithm.
+ */
+#ifndef __MLPACK_METHODS_KMEANS_REFINED_START_HPP
+#define __MLPACK_METHODS_KMEANS_REFINED_START_HPP
+
+#include <mlpack/core.hpp>
+
+namespace mlpack {
+namespace kmeans {
+
+/**
+ * A refined approach for choosing initial points for k-means clustering.  This
+ * approach runs k-means several times on random subsets of the data, and then
+ * clusters those solutions to select refined initial cluster assignments.  It
+ * is an implementation of the following paper:
+ *
+ * @inproceedings{bradley1998refining,
+ *   title={Refining initial points for k-means clustering},
+ *   author={Bradley, Paul S and Fayyad, Usama M},
+ *   booktitle={Proceedings of the Fifteenth International Conference on Machine
+ *       Learning (ICML 1998)},
+ *   volume={66},
+ *   year={1998}
+ * }
+ */
+class RefinedStart
+{
+ public:
+  /**
+   * Create the RefinedStart object, optionally specifying parameters for the
+   * number of samplings to perform and the percentage of the dataset to use in
+   * each sampling.
+   */
+  RefinedStart(const size_t samplings = 100,
+               const double percentage = 0.02) :
+      samplings(samplings), percentage(percentage) { }
+
+  /**
+   * Partition the given dataset into the given number of clusters according to
+   * the random sampling scheme outlined in Bradley and Fayyad's paper.
+   *
+   * @tparam MatType Type of data (arma::mat or arma::sp_mat).
+   * @param data Dataset to partition.
+   * @param clusters Number of clusters to split dataset into.
+   * @param assignments Vector to store cluster assignments into.  Values will
+   *     be between 0 and (clusters - 1).
+   */
+  template<typename MatType>
+  void Cluster(const MatType& data,
+               const size_t clusters,
+               arma::Col<size_t>& assignments);
+
+ private:
+  //! The number of samplings to perform.
+  size_t samplings;
+  //! The percentage of the data to use for each subsampling.
+  double percentage;
+};
+
+}; // namespace kmeans
+}; // namespace mlpack
+
+// Include implementation.
+#include "refined_start_impl.hpp"
+
+#endif

Added: mlpack/trunk/src/mlpack/methods/kmeans/refined_start_impl.hpp
===================================================================
--- mlpack/trunk/src/mlpack/methods/kmeans/refined_start_impl.hpp	                        (rev 0)
+++ mlpack/trunk/src/mlpack/methods/kmeans/refined_start_impl.hpp	2013-04-18 23:10:09 UTC (rev 14919)
@@ -0,0 +1,99 @@
+/**
+ * @file refined_start_impl.hpp
+ * @author Ryan Curtin
+ *
+ * An implementation of Bradley and Fayyad's "Refining Initial Points for
+ * K-Means clustering".  This class is meant to provide better initial points
+ * for the k-means algorithm.
+ */
+#ifndef __MLPACK_METHODS_KMEANS_REFINED_START_IMPL_HPP
+#define __MLPACK_METHODS_KMEANS_REFINED_START_IMPL_HPP
+
+// In case it hasn't been included yet.
+#include "refined_start.hpp"
+
+namespace mlpack {
+namespace kmeans {
+
+//! Partition the given dataset according to Bradley and Fayyad's algorithm.
+template<typename MatType>
+void RefinedStart::Cluster(const MatType& data,
+                           const size_t clusters,
+                           arma::Col<size_t>& assignments)
+{
+  math::RandomSeed(std::time(NULL));
+
+  // This will hold the sampled datasets.
+  const size_t numPoints = size_t(percentage * data.n_cols);
+  MatType sampledData(data.n_rows, numPoints);
+  // vector<bool> is packed so each bool is 1 bit.
+  std::vector<bool> pointsUsed(data.n_cols, false);
+  arma::mat sampledCentroids(data.n_rows, samplings * clusters);
+
+  // We will use these objects repeatedly for clustering.
+  arma::Col<size_t> sampledAssignments;
+  arma::mat centroids;
+  KMeans<> kmeans;
+
+  for (size_t i = 0; i < samplings; ++i)
+  {
+    // First, assemble the sampled dataset.
+    size_t curSample = 0;
+    while (curSample < numPoints)
+    {
+      // Pick a random point in [0, numPoints).
+      size_t sample = (size_t) math::RandInt(data.n_cols);
+
+      if (!pointsUsed[sample])
+      {
+        // This point isn't used yet.  So we'll put it in our sample.
+        pointsUsed[sample] = true;
+        sampledData.col(curSample) = data.col(sample);
+        ++curSample;
+      }
+    }
+
+    // Now, using the sampled dataset, run k-means.  In the case of an empty
+    // cluster, we re-initialize that cluster as the point furthest away from
+    // the cluster with maximum variance.  This is not *exactly* what the paper
+    // implements, but it is quite similar, and we'll call it "good enough".
+    kmeans.Cluster(sampledData, clusters, sampledAssignments, centroids);
+
+    // Store the sampled centroids.
+    sampledCentroids.cols(i * clusters, (i + 1) * clusters - 1) = centroids;
+
+    pointsUsed.assign(data.n_cols, false);
+  }
+
+  // Now, we run k-means on the sampled centroids to get our final clusters.
+  kmeans.Cluster(sampledCentroids, clusters, sampledAssignments, centroids);
+
+  // Turn the final centroids into assignments.
+  assignments.set_size(data.n_cols);
+  for (size_t i = 0; i < data.n_cols; ++i)
+  {
+    // Find the closest centroid to this point.
+    double minDistance = std::numeric_limits<double>::infinity();
+    size_t closestCluster = clusters;
+
+    for (size_t j = 0; j < clusters; ++j)
+    {
+      const double distance = kmeans.Metric().Evaluate(data.col(i),
+          centroids.col(j));
+
+      if (distance < minDistance)
+      {
+        minDistance = distance;
+        closestCluster = j;
+      }
+    }
+
+    // Assign the point to its closest cluster.
+    assignments[i] = closestCluster;
+  }
+}
+
+}; // namespace kmeans
+}; // namespace mlpack
+
+#endif