[mlpack-svn] r14919 - mlpack/trunk/src/mlpack/methods/kmeans
fastlab-svn at coffeetalk-1.cc.gatech.edu
fastlab-svn at coffeetalk-1.cc.gatech.edu
Thu Apr 18 19:10:09 EDT 2013
Author: rcurtin
Date: 2013-04-18 19:10:09 -0400 (Thu, 18 Apr 2013)
New Revision: 14919
Added:
mlpack/trunk/src/mlpack/methods/kmeans/refined_start.hpp
mlpack/trunk/src/mlpack/methods/kmeans/refined_start_impl.hpp
Modified:
mlpack/trunk/src/mlpack/methods/kmeans/CMakeLists.txt
Log:
Add a new initialization method, which is an implementation of Bradley and
Fayyad's scheme for better k-means initialization points. It seems to work
well.
Modified: mlpack/trunk/src/mlpack/methods/kmeans/CMakeLists.txt
===================================================================
--- mlpack/trunk/src/mlpack/methods/kmeans/CMakeLists.txt 2013-04-18 18:29:05 UTC (rev 14918)
+++ mlpack/trunk/src/mlpack/methods/kmeans/CMakeLists.txt 2013-04-18 23:10:09 UTC (rev 14919)
@@ -7,6 +7,8 @@
max_variance_new_cluster.hpp
max_variance_new_cluster_impl.hpp
random_partition.hpp
+ refined_start.hpp
+ refined_start_impl.hpp
)
# Add directory name to sources.
Added: mlpack/trunk/src/mlpack/methods/kmeans/refined_start.hpp
===================================================================
--- mlpack/trunk/src/mlpack/methods/kmeans/refined_start.hpp (rev 0)
+++ mlpack/trunk/src/mlpack/methods/kmeans/refined_start.hpp 2013-04-18 23:10:09 UTC (rev 14919)
@@ -0,0 +1,72 @@
+/**
+ * @file refined_start.hpp
+ * @author Ryan Curtin
+ *
+ * An implementation of Bradley and Fayyad's "Refining Initial Points for
+ * K-Means clustering". This class is meant to provide better initial points
+ * for the k-means algorithm.
+ */
+#ifndef __MLPACK_METHODS_KMEANS_REFINED_START_HPP
+#define __MLPACK_METHODS_KMEANS_REFINED_START_HPP
+
+#include <mlpack/core.hpp>
+
+namespace mlpack {
+namespace kmeans {
+
+/**
+ * A refined approach for choosing initial points for k-means clustering. This
+ * approach runs k-means several times on random subsets of the data, and then
+ * clusters those solutions to select refined initial cluster assignments. It
+ * is an implementation of the following paper:
+ *
+ * @inproceedings{bradley1998refining,
+ * title={Refining initial points for k-means clustering},
+ * author={Bradley, Paul S and Fayyad, Usama M},
+ * booktitle={Proceedings of the Fifteenth International Conference on Machine
+ * Learning (ICML 1998)},
+ * volume={66},
+ * year={1998}
+ * }
+ */
+class RefinedStart
+{
+ public:
+ /**
+ * Create the RefinedStart object, optionally specifying parameters for the
+ * number of samplings to perform and the percentage of the dataset to use in
+ * each sampling.
+ */
+ RefinedStart(const size_t samplings = 100,
+ const double percentage = 0.02) :
+ samplings(samplings), percentage(percentage) { }
+
+ /**
+ * Partition the given dataset into the given number of clusters according to
+ * the random sampling scheme outlined in Bradley and Fayyad's paper.
+ *
+ * @tparam MatType Type of data (arma::mat or arma::sp_mat).
+ * @param data Dataset to partition.
+ * @param clusters Number of clusters to split dataset into.
+ * @param assignments Vector to store cluster assignments into. Values will
+ * be between 0 and (clusters - 1).
+ */
+ template<typename MatType>
+ void Cluster(const MatType& data,
+ const size_t clusters,
+ arma::Col<size_t>& assignments);
+
+ private:
+ //! The number of samplings to perform.
+ size_t samplings;
+ //! The percentage of the data to use for each subsampling.
+ double percentage;
+};
+
+}; // namespace kmeans
+}; // namespace mlpack
+
+// Include implementation.
+#include "refined_start_impl.hpp"
+
+#endif
Added: mlpack/trunk/src/mlpack/methods/kmeans/refined_start_impl.hpp
===================================================================
--- mlpack/trunk/src/mlpack/methods/kmeans/refined_start_impl.hpp (rev 0)
+++ mlpack/trunk/src/mlpack/methods/kmeans/refined_start_impl.hpp 2013-04-18 23:10:09 UTC (rev 14919)
@@ -0,0 +1,99 @@
+/**
+ * @file refined_start_impl.hpp
+ * @author Ryan Curtin
+ *
+ * An implementation of Bradley and Fayyad's "Refining Initial Points for
+ * K-Means clustering". This class is meant to provide better initial points
+ * for the k-means algorithm.
+ */
+#ifndef __MLPACK_METHODS_KMEANS_REFINED_START_IMPL_HPP
+#define __MLPACK_METHODS_KMEANS_REFINED_START_IMPL_HPP
+
+// In case it hasn't been included yet.
+#include "refined_start.hpp"
+
+namespace mlpack {
+namespace kmeans {
+
+//! Partition the given dataset according to Bradley and Fayyad's algorithm.
+template<typename MatType>
+void RefinedStart::Cluster(const MatType& data,
+ const size_t clusters,
+ arma::Col<size_t>& assignments)
+{
+ math::RandomSeed(std::time(NULL));
+
+ // This will hold the sampled datasets.
+ const size_t numPoints = size_t(percentage * data.n_cols);
+ MatType sampledData(data.n_rows, numPoints);
+ // vector<bool> is packed so each bool is 1 bit.
+ std::vector<bool> pointsUsed(data.n_cols, false);
+ arma::mat sampledCentroids(data.n_rows, samplings * clusters);
+
+ // We will use these objects repeatedly for clustering.
+ arma::Col<size_t> sampledAssignments;
+ arma::mat centroids;
+ KMeans<> kmeans;
+
+ for (size_t i = 0; i < samplings; ++i)
+ {
+ // First, assemble the sampled dataset.
+ size_t curSample = 0;
+ while (curSample < numPoints)
+ {
+ // Pick a random point in [0, numPoints).
+ size_t sample = (size_t) math::RandInt(data.n_cols);
+
+ if (!pointsUsed[sample])
+ {
+ // This point isn't used yet. So we'll put it in our sample.
+ pointsUsed[sample] = true;
+ sampledData.col(curSample) = data.col(sample);
+ ++curSample;
+ }
+ }
+
+ // Now, using the sampled dataset, run k-means. In the case of an empty
+ // cluster, we re-initialize that cluster as the point furthest away from
+ // the cluster with maximum variance. This is not *exactly* what the paper
+ // implements, but it is quite similar, and we'll call it "good enough".
+ kmeans.Cluster(sampledData, clusters, sampledAssignments, centroids);
+
+ // Store the sampled centroids.
+ sampledCentroids.cols(i * clusters, (i + 1) * clusters - 1) = centroids;
+
+ pointsUsed.assign(data.n_cols, false);
+ }
+
+ // Now, we run k-means on the sampled centroids to get our final clusters.
+ kmeans.Cluster(sampledCentroids, clusters, sampledAssignments, centroids);
+
+ // Turn the final centroids into assignments.
+ assignments.set_size(data.n_cols);
+ for (size_t i = 0; i < data.n_cols; ++i)
+ {
+ // Find the closest centroid to this point.
+ double minDistance = std::numeric_limits<double>::infinity();
+ size_t closestCluster = clusters;
+
+ for (size_t j = 0; j < clusters; ++j)
+ {
+ const double distance = kmeans.Metric().Evaluate(data.col(i),
+ centroids.col(j));
+
+ if (distance < minDistance)
+ {
+ minDistance = distance;
+ closestCluster = j;
+ }
+ }
+
+ // Assign the point to its closest cluster.
+ assignments[i] = closestCluster;
+ }
+}
+
+}; // namespace kmeans
+}; // namespace mlpack
+
+#endif
More information about the mlpack-svn
mailing list