[mlpack-svn] r14914 - mlpack/trunk/src/mlpack/methods/kmeans
fastlab-svn at coffeetalk-1.cc.gatech.edu
fastlab-svn at coffeetalk-1.cc.gatech.edu
Wed Apr 17 17:52:51 EDT 2013
Author: rcurtin
Date: 2013-04-17 17:52:50 -0400 (Wed, 17 Apr 2013)
New Revision: 14914
Modified:
mlpack/trunk/src/mlpack/methods/kmeans/kmeans.hpp
mlpack/trunk/src/mlpack/methods/kmeans/kmeans_impl.hpp
Log:
Add option to get the clusters back when the process completes.
Modified: mlpack/trunk/src/mlpack/methods/kmeans/kmeans.hpp
===================================================================
--- mlpack/trunk/src/mlpack/methods/kmeans/kmeans.hpp 2013-04-17 21:17:22 UTC (rev 14913)
+++ mlpack/trunk/src/mlpack/methods/kmeans/kmeans.hpp 2013-04-17 21:52:50 UTC (rev 14914)
@@ -88,23 +88,59 @@
/**
- * Perform K-Means clustering on the data, returning a list of cluster
+ * Perform k-means clustering on the data, returning a list of cluster
* assignments. Optionally, the vector of assignments can be set to an
- * initial guess of the cluster assignments; to do this, the number of
- * elements in the list of assignments must be equal to the number of points
- * (columns) in the dataset.
+ * initial guess of the cluster assignments; to do this, set initialGuess to
+ * true.
*
- * @tparam MatType Type of matrix (arma::mat or arma::spmat).
+ * @tparam MatType Type of matrix (arma::mat or arma::sp_mat).
* @param data Dataset to cluster.
* @param clusters Number of clusters to compute.
- * @param assignments Vector to store cluster assignments in. Can contain an
- * initial guess at cluster assignments.
+ * @param assignments Vector to store cluster assignments in.
+ * @param initialGuess If true, then it is assumed that assignments has a list
+ * of initial cluster assignments.
*/
template<typename MatType>
void Cluster(const MatType& data,
const size_t clusters,
- arma::Col<size_t>& assignments) const;
+ arma::Col<size_t>& assignments,
+ const bool initialGuess = false) const;
+
+ /**
+ * Perform k-means clustering on the data, returning a list of cluster
+ * assignments and also the centroids of each cluster. Optionally, the vector
+ * of assignments can be set to an initial guess of the cluster assignments;
+ * to do this, set initialAssignmentGuess to true. Another way to set initial
+ * cluster guesses is to fill the centroids matrix with the centroid guesses,
+ * and then set initialCentroidGuess to true. initialAssignmentGuess
+ * supersedes initialCentroidGuess, so if both are set to true, the
+ * assignments vector is used.
+ *
+ * Note that if the overclustering factor is greater than 1, the centroids
+ * matrix will be resized in the method. Regardless of the overclustering
+ * factor, the centroid guess matrix (if initialCentroidGuess is set to true)
+ * should have the same number of rows as the data matrix, and number of
+ * columns equal to 'clusters'.
+ *
+ * @tparam MatType Type of matrix (arma::mat or arma::sp_mat).
+ * @param data Dataset to cluster.
+ * @param clusters Number of clusters to compute.
+ * @param assignments Vector to store cluster assignments in.
+ * @param centroids Matrix in which centroids are stored.
+ * @param initialAssignmentGuess If true, then it is assumed that assignments
+ * has a list of initial cluster assignments.
+ * @param initialCentroidGuess If true, then it is assumed that centroids
+ * contains the initial centroids of each cluster.
+ */
template<typename MatType>
+ void Cluster(const MatType& data,
+ const size_t clusters,
+ arma::Col<size_t>& assignments,
+ MatType& centroids,
+ const bool initialAssignmentGuess = false,
+ const bool initialCentroidGuess = false) const;
+
+ template<typename MatType>
void FastCluster(MatType& data,
const size_t clusters,
arma::Col<size_t>& assignments) const;
Modified: mlpack/trunk/src/mlpack/methods/kmeans/kmeans_impl.hpp
===================================================================
--- mlpack/trunk/src/mlpack/methods/kmeans/kmeans_impl.hpp 2013-04-17 21:17:22 UTC (rev 14913)
+++ mlpack/trunk/src/mlpack/methods/kmeans/kmeans_impl.hpp 2013-04-17 21:52:50 UTC (rev 14914)
@@ -172,8 +172,7 @@
for (size_t i = mrkd.Begin(); i < mrkd.Count() + mrkd.Begin(); ++i)
{
// Initialize minDistance to be nonzero.
- double minDistance = metric::SquaredEuclideanDistance::Evaluate(
- data.col(i), centroids.col(0));
+ double minDistance = metric.Evaluate(data.col(i), centroids.col(0));
// Find the minimal distance centroid for this point.
for (size_t j = 1; j < centroids.n_cols; ++j)
@@ -186,8 +185,7 @@
}
++comps;
- double distance = metric::SquaredEuclideanDistance::Evaluate(
- data.col(i), centroids.col(j));
+ double distance = metric.Evaluate(data.col(i), centroids.col(j));
if (minDistance > distance)
{
minIndex = j;
@@ -277,8 +275,7 @@
p(j) = bound[j].Hi();
}
- distance = metric::SquaredEuclideanDistance::Evaluate(
- p.col(0), centroids.col(i));
+ distance = metric.Evaluate(p.col(0), centroids.col(i));
*/
for (size_t j = 0; j < dimensionality; ++j)
{
@@ -371,10 +368,9 @@
bound[k].Hi() : bound[k].Lo();
}
- double distancei = metric::SquaredEuclideanDistance::Evaluate(
- p.col(0), centroids.col(i));
- double distanceMin = metric::SquaredEuclideanDistance::Evaluate(
- p.col(0), centroids.col(minIndex));
+ double distancei = metric.Evaluate(p.col(0), centroids.col(i));
+ double distanceMin = metric.Evaluate(p.col(0),
+ centroids.col(minIndex));
*/
comps += 1;
@@ -487,20 +483,46 @@
}
/**
- * Perform K-Means clustering on the data, returning a list of cluster
- * assignments.
+ * Perform k-means clustering on the data, returning a list of cluster
+ * assignments. This just forward to the other function, which returns the
+ * centroids too. If this is properly inlined, there shouldn't be any
+ * performance penalty whatsoever.
*/
template<typename DistanceMetric,
typename InitialPartitionPolicy,
typename EmptyClusterPolicy>
template<typename MatType>
+inline void KMeans<
+ DistanceMetric,
+ InitialPartitionPolicy,
+ EmptyClusterPolicy>::
+Cluster(const MatType& data,
+ const size_t clusters,
+ arma::Col<size_t>& assignments,
+ const bool initialGuess) const
+{
+ MatType centroids(data.n_rows, clusters);
+ Cluster(data, clusters, assignments, centroids, initialGuess);
+}
+
+/**
+ * Perform k-means clustering on the data, returning a list of cluster
+ * assignments and the centroids of each cluster.
+ */
+template<typename DistanceMetric,
+ typename InitialPartitionPolicy,
+ typename EmptyClusterPolicy>
+template<typename MatType>
void KMeans<
DistanceMetric,
InitialPartitionPolicy,
EmptyClusterPolicy>::
Cluster(const MatType& data,
const size_t clusters,
- arma::Col<size_t>& assignments) const
+ arma::Col<size_t>& assignments,
+ MatType& centroids,
+ const bool initialAssignmentGuess,
+ const bool initialCentroidGuess) const
{
// Make sure we have more points than clusters.
if (clusters > data.n_cols)
@@ -517,18 +539,56 @@
}
// Now, the initial assignments. First determine if they are necessary.
- if (assignments.n_elem != data.n_cols)
+ if (initialAssignmentGuess && assignments.n_cols != data.n_cols)
{
+ Log::Fatal << "KMeans::Cluster(): initial cluster assignments not the same "
+ << "size as the dataset!" << std::endl;
+ }
+ else if (initialCentroidGuess && (centroids.n_rows != data.n_rows ||
+ centroids.n_cols != clusters))
+ {
+ Log::Fatal << "KMeans::Cluster(): wrong number of initial cluster centroids"
+ << " or wrong dimensionality!" << std::endl;
+ }
+ else
+ {
// Use the partitioner to come up with the partition assignments.
partitioner.Cluster(data, actualClusters, assignments);
}
- // Centroids of each cluster. Each column corresponds to a centroid.
- MatType centroids(data.n_rows, actualClusters);
// Counts of points in each cluster.
arma::Col<size_t> counts(actualClusters);
counts.zeros();
+ // If we received an initial cluster guess, assign the points for the first
+ // time. Note that initialAssignmentGuess supersedes initialCentroidGuess.
+ if (initialCentroidGuess && !initialAssignmentGuess)
+ {
+ for (size_t i = 0; i < data.n_cols; ++i)
+ {
+ // Find the closest centroid to this point.
+ double minDistance = std::numeric_limits<double>::infinity();
+ size_t closestCluster = clusters; // Invalid value.
+
+ for (size_t j = 0; j < clusters; j++)
+ {
+ double distance = metric.Evaluate(data.col(i), centroids.col(j));
+
+ if (distance < minDistance)
+ {
+ minDistance = distance;
+ closestCluster = j;
+ }
+ }
+
+ // Assign the point to the closest cluster that we found.
+ assignments[i] = closestCluster;
+ }
+ }
+
+ // Resize to correct size.
+ centroids.set_size(data.n_rows, actualClusters);
+
// Set counts correctly.
for (size_t i = 0; i < assignments.n_elem; i++)
counts[assignments[i]]++;
@@ -559,8 +619,7 @@
for (size_t j = 0; j < actualClusters; j++)
{
- double distance = metric::SquaredEuclideanDistance::Evaluate(
- data.col(i), centroids.col(j));
+ double distance = metric.Evaluate(data.col(i), centroids.col(j));
if (distance < minDistance)
{
@@ -592,8 +651,26 @@
} while (changedAssignments > 0 && iteration != maxIterations);
- Log::Debug << "Iterations: " << iteration << std::endl;
+ if (iteration != maxIterations)
+ {
+ Log::Debug << "KMeans::Cluster(): converged after " << iteration
+ << " iterations." << std::endl;
+ }
+ else
+ {
+ Log::Debug << "KMeans::Cluster(): terminated after limit of " << iteration
+ << " iterations." << std::endl;
+ // Recalculate final clusters.
+ centroids.zeros();
+
+ for (size_t i = 0; i < data.n_cols; i++)
+ centroids.col(assignments[i]) += data.col(i);
+
+ for (size_t i = 0; i < actualClusters; i++)
+ centroids.col(i) /= counts[i];
+ }
+
// If we have overclustered, we need to merge the nearest clusters.
if (actualClusters != clusters)
{
@@ -614,8 +691,8 @@
{
for (size_t second = first + 1; second < actualClusters; second++)
{
- distances(i) = metric::SquaredEuclideanDistance::Evaluate(
- centroids.col(first), centroids.col(second));
+ distances(i) = metric.Evaluate(centroids.col(first),
+ centroids.col(second));
firstCluster(i) = first;
secondCluster(i) = second;
i++;
@@ -658,8 +735,7 @@
{
// Make sure it isn't already DBL_MAX.
if (distances(offset + (first - cluster)) != DBL_MAX)
- distances(offset + (first - cluster)) =
- metric::SquaredEuclideanDistance::Evaluate(
+ distances(offset + (first - cluster)) = metric.Evaluate(
centroids.col(first), centroids.col(cluster));
}
@@ -674,8 +750,7 @@
// Make sure it isn't already DBL_MAX.
if (distances(offset + (cluster - first)) != DBL_MAX)
{
- distances(offset + (cluster - first)) =
- metric::SquaredEuclideanDistance::Evaluate(
+ distances(offset + (cluster - first)) = metric.Evaluate(
centroids.col(first), centroids.col(cluster));
}
}
More information about the mlpack-svn
mailing list