[mlpack-git] master: Refactor RefinedStart to give centroids not assignments. (b8bb079)
gitdub at mlpack.org
gitdub at mlpack.org
Tue Apr 12 10:43:52 EDT 2016
Repository : https://github.com/mlpack/mlpack
On branch : master
Link : https://github.com/mlpack/mlpack/compare/eeba6bdc50ad4d785cb6880edbaba78173036ca6...8d77f4231046703d5c0c05ed4795458f98267968
>---------------------------------------------------------------
commit b8bb079a5feb7e8d95cdbd6094bc2fa28b5e1971
Author: Ryan Curtin <ryan at ratml.org>
Date: Tue Apr 12 14:42:21 2016 +0000
Refactor RefinedStart to give centroids not assignments.
>---------------------------------------------------------------
b8bb079a5feb7e8d95cdbd6094bc2fa28b5e1971
src/mlpack/methods/kmeans/refined_start.hpp | 18 +++++++++++++++-
src/mlpack/methods/kmeans/refined_start_impl.hpp | 27 +++++++++++++++++-------
2 files changed, 36 insertions(+), 9 deletions(-)
diff --git a/src/mlpack/methods/kmeans/refined_start.hpp b/src/mlpack/methods/kmeans/refined_start.hpp
index d06e463..93449f5 100644
--- a/src/mlpack/methods/kmeans/refined_start.hpp
+++ b/src/mlpack/methods/kmeans/refined_start.hpp
@@ -43,7 +43,23 @@ class RefinedStart
/**
* Partition the given dataset into the given number of clusters according to
- * the random sampling scheme outlined in Bradley and Fayyad's paper.
+ * the random sampling scheme outlined in Bradley and Fayyad's paper, and
+ * return centroids.
+ *
+ * @tparam MatType Type of data (arma::mat or arma::sp_mat).
+ * @param data Dataset to partition.
+ * @param clusters Number of clusters to split dataset into.
+ * @param centroids Matrix to store centroids into.
+ */
+ template<typename MatType>
+ void Cluster(const MatType& data,
+ const size_t clusters,
+ arma::mat& centroids) const;
+
+ /**
+ * Partition the given dataset into the given number of clusters according to
+ * the random sampling scheme outlined in Bradley and Fayyad's paper, and
+ * return point assignments.
*
* @tparam MatType Type of data (arma::mat or arma::sp_mat).
* @param data Dataset to partition.
diff --git a/src/mlpack/methods/kmeans/refined_start_impl.hpp b/src/mlpack/methods/kmeans/refined_start_impl.hpp
index 1b178b9..4379709 100644
--- a/src/mlpack/methods/kmeans/refined_start_impl.hpp
+++ b/src/mlpack/methods/kmeans/refined_start_impl.hpp
@@ -19,7 +19,7 @@ namespace kmeans {
template<typename MatType>
void RefinedStart::Cluster(const MatType& data,
const size_t clusters,
- arma::Row<size_t>& assignments) const
+ arma::mat& centroids) const
{
// This will hold the sampled datasets.
const size_t numPoints = size_t(percentage * data.n_cols);
@@ -28,10 +28,6 @@ void RefinedStart::Cluster(const MatType& data,
std::vector<bool> pointsUsed(data.n_cols, false);
arma::mat sampledCentroids(data.n_rows, samplings * clusters);
- // We will use these objects repeatedly for clustering.
- arma::Row<size_t> sampledAssignments;
- arma::mat centroids;
-
for (size_t i = 0; i < samplings; ++i)
{
// First, assemble the sampled dataset.
@@ -55,7 +51,7 @@ void RefinedStart::Cluster(const MatType& data,
// the cluster with maximum variance. This is not *exactly* what the paper
// implements, but it is quite similar, and we'll call it "good enough".
KMeans<> kmeans;
- kmeans.Cluster(sampledData, clusters, sampledAssignments, centroids);
+ kmeans.Cluster(sampledData, clusters, centroids);
// Store the sampled centroids.
sampledCentroids.cols(i * clusters, (i + 1) * clusters - 1) = centroids;
@@ -65,7 +61,18 @@ void RefinedStart::Cluster(const MatType& data,
// Now, we run k-means on the sampled centroids to get our final clusters.
KMeans<> kmeans;
- kmeans.Cluster(sampledCentroids, clusters, sampledAssignments, centroids);
+ kmeans.Cluster(sampledCentroids, clusters, centroids);
+}
+
+template<typename MatType>
+void RefinedStart::Cluster(const MatType& data,
+ const size_t clusters,
+ arma::Row<size_t>& assignments) const
+{
+ // Perform the Bradley-Fayyad refined start algorithm, and get initial
+ // centroids back.
+ arma::mat centroids;
+ Cluster(data, clusters, centroids);
// Turn the final centroids into assignments.
assignments.set_size(data.n_cols);
@@ -77,7 +84,11 @@ void RefinedStart::Cluster(const MatType& data,
for (size_t j = 0; j < clusters; ++j)
{
- const double distance = kmeans.Metric().Evaluate(data.col(i),
+ // This is restricted to the L2 distance, and unfortunately it would take
+ // a lot of refactoring and redesign to make this more general... we would
+ // probably need to have KMeans take a template template parameter for the
+ // initial partition policy. It's not clear how to best do this.
+ const double distance = metric::EuclideanDistance::Evaluate(data.col(i),
centroids.col(j));
if (distance < minDistance)
More information about the mlpack-git mailing list