[mlpack-svn] r14914 - mlpack/trunk/src/mlpack/methods/kmeans

fastlab-svn at coffeetalk-1.cc.gatech.edu fastlab-svn at coffeetalk-1.cc.gatech.edu
Wed Apr 17 17:52:51 EDT 2013


Author: rcurtin
Date: 2013-04-17 17:52:50 -0400 (Wed, 17 Apr 2013)
New Revision: 14914

Modified:
   mlpack/trunk/src/mlpack/methods/kmeans/kmeans.hpp
   mlpack/trunk/src/mlpack/methods/kmeans/kmeans_impl.hpp
Log:
Add option to get the clusters back when the process completes.


Modified: mlpack/trunk/src/mlpack/methods/kmeans/kmeans.hpp
===================================================================
--- mlpack/trunk/src/mlpack/methods/kmeans/kmeans.hpp	2013-04-17 21:17:22 UTC (rev 14913)
+++ mlpack/trunk/src/mlpack/methods/kmeans/kmeans.hpp	2013-04-17 21:52:50 UTC (rev 14914)
@@ -88,23 +88,59 @@
 
 
   /**
-   * Perform K-Means clustering on the data, returning a list of cluster
+   * Perform k-means clustering on the data, returning a list of cluster
    * assignments.  Optionally, the vector of assignments can be set to an
-   * initial guess of the cluster assignments; to do this, the number of
-   * elements in the list of assignments must be equal to the number of points
-   * (columns) in the dataset.
+   * initial guess of the cluster assignments; to do this, set initialGuess to
+   * true.
    *
-   * @tparam MatType Type of matrix (arma::mat or arma::spmat).
+   * @tparam MatType Type of matrix (arma::mat or arma::sp_mat).
    * @param data Dataset to cluster.
    * @param clusters Number of clusters to compute.
-   * @param assignments Vector to store cluster assignments in.  Can contain an
-   *     initial guess at cluster assignments.
+   * @param assignments Vector to store cluster assignments in.
+   * @param initialGuess If true, then it is assumed that assignments has a list
+   *      of initial cluster assignments.
    */
   template<typename MatType>
   void Cluster(const MatType& data,
                const size_t clusters,
-               arma::Col<size_t>& assignments) const;
+               arma::Col<size_t>& assignments,
+               const bool initialGuess = false) const;
+
+  /**
+   * Perform k-means clustering on the data, returning a list of cluster
+   * assignments and also the centroids of each cluster.  Optionally, the vector
+   * of assignments can be set to an initial guess of the cluster assignments;
+   * to do this, set initialAssignmentGuess to true.  Another way to set initial
+   * cluster guesses is to fill the centroids matrix with the centroid guesses,
+   * and then set initialCentroidGuess to true.  initialAssignmentGuess
+   * supersedes initialCentroidGuess, so if both are set to true, the
+   * assignments vector is used.
+   *
+   * Note that if the overclustering factor is greater than 1, the centroids
+   * matrix will be resized in the method.  Regardless of the overclustering
+   * factor, the centroid guess matrix (if initialCentroidGuess is set to true)
+   * should have the same number of rows as the data matrix, and number of
+   * columns equal to 'clusters'.
+   *
+   * @tparam MatType Type of matrix (arma::mat or arma::sp_mat).
+   * @param data Dataset to cluster.
+   * @param clusters Number of clusters to compute.
+   * @param assignments Vector to store cluster assignments in.
+   * @param centroids Matrix in which centroids are stored.
+   * @param initialAssignmentGuess If true, then it is assumed that assignments
+   *      has a list of initial cluster assignments.
+   * @param initialCentroidGuess If true, then it is assumed that centroids
+   *      contains the initial centroids of each cluster.
+   */
   template<typename MatType>
+  void Cluster(const MatType& data,
+               const size_t clusters,
+               arma::Col<size_t>& assignments,
+               MatType& centroids,
+               const bool initialAssignmentGuess = false,
+               const bool initialCentroidGuess = false) const;
+
+  template<typename MatType>
   void FastCluster(MatType& data,
                const size_t clusters,
                arma::Col<size_t>& assignments) const;

Modified: mlpack/trunk/src/mlpack/methods/kmeans/kmeans_impl.hpp
===================================================================
--- mlpack/trunk/src/mlpack/methods/kmeans/kmeans_impl.hpp	2013-04-17 21:17:22 UTC (rev 14913)
+++ mlpack/trunk/src/mlpack/methods/kmeans/kmeans_impl.hpp	2013-04-17 21:52:50 UTC (rev 14914)
@@ -172,8 +172,7 @@
         for (size_t i = mrkd.Begin(); i < mrkd.Count() + mrkd.Begin(); ++i)
         {
           // Initialize minDistance to be nonzero.
-          double minDistance = metric::SquaredEuclideanDistance::Evaluate(
-              data.col(i), centroids.col(0));
+          double minDistance = metric.Evaluate(data.col(i), centroids.col(0));
 
           // Find the minimal distance centroid for this point.
           for (size_t j = 1; j < centroids.n_cols; ++j)
@@ -186,8 +185,7 @@
             }
 
             ++comps;
-            double distance = metric::SquaredEuclideanDistance::Evaluate(
-                data.col(i), centroids.col(j));
+            double distance = metric.Evaluate(data.col(i), centroids.col(j));
             if (minDistance > distance)
             {
               minIndex = j;
@@ -277,8 +275,7 @@
                 p(j) = bound[j].Hi();
             }
 
-            distance = metric::SquaredEuclideanDistance::Evaluate(
-                p.col(0), centroids.col(i));
+            distance = metric.Evaluate(p.col(0), centroids.col(i));
           */
           for (size_t j = 0; j < dimensionality; ++j)
           {
@@ -371,10 +368,9 @@
                   bound[k].Hi() : bound[k].Lo();
               }
 
-              double distancei = metric::SquaredEuclideanDistance::Evaluate(
-                  p.col(0), centroids.col(i));
-              double distanceMin = metric::SquaredEuclideanDistance::Evaluate(
-                  p.col(0), centroids.col(minIndex));
+              double distancei = metric.Evaluate(p.col(0), centroids.col(i));
+              double distanceMin = metric.Evaluate(p.col(0),
+                  centroids.col(minIndex));
             */
 
             comps += 1;
@@ -487,20 +483,46 @@
 }
 
 /**
- * Perform K-Means clustering on the data, returning a list of cluster
- * assignments.
+ * Perform k-means clustering on the data, returning a list of cluster
+ * assignments.  This just forward to the other function, which returns the
+ * centroids too.  If this is properly inlined, there shouldn't be any
+ * performance penalty whatsoever.
  */
 template<typename DistanceMetric,
          typename InitialPartitionPolicy,
          typename EmptyClusterPolicy>
 template<typename MatType>
+inline void KMeans<
+    DistanceMetric,
+    InitialPartitionPolicy,
+    EmptyClusterPolicy>::
+Cluster(const MatType& data,
+        const size_t clusters,
+        arma::Col<size_t>& assignments,
+        const bool initialGuess) const
+{
+  MatType centroids(data.n_rows, clusters);
+  Cluster(data, clusters, assignments, centroids, initialGuess);
+}
+
+/**
+ * Perform k-means clustering on the data, returning a list of cluster
+ * assignments and the centroids of each cluster.
+ */
+template<typename DistanceMetric,
+         typename InitialPartitionPolicy,
+         typename EmptyClusterPolicy>
+template<typename MatType>
 void KMeans<
     DistanceMetric,
     InitialPartitionPolicy,
     EmptyClusterPolicy>::
 Cluster(const MatType& data,
         const size_t clusters,
-        arma::Col<size_t>& assignments) const
+        arma::Col<size_t>& assignments,
+        MatType& centroids,
+        const bool initialAssignmentGuess,
+        const bool initialCentroidGuess) const
 {
   // Make sure we have more points than clusters.
   if (clusters > data.n_cols)
@@ -517,18 +539,56 @@
   }
 
   // Now, the initial assignments.  First determine if they are necessary.
-  if (assignments.n_elem != data.n_cols)
+  if (initialAssignmentGuess && assignments.n_cols != data.n_cols)
   {
+    Log::Fatal << "KMeans::Cluster(): initial cluster assignments not the same "
+        << "size as the dataset!" << std::endl;
+  }
+  else if (initialCentroidGuess && (centroids.n_rows != data.n_rows ||
+                                    centroids.n_cols != clusters))
+  {
+    Log::Fatal << "KMeans::Cluster(): wrong number of initial cluster centroids"
+        << " or wrong dimensionality!" << std::endl;
+  }
+  else
+  {
     // Use the partitioner to come up with the partition assignments.
     partitioner.Cluster(data, actualClusters, assignments);
   }
 
-  // Centroids of each cluster.  Each column corresponds to a centroid.
-  MatType centroids(data.n_rows, actualClusters);
   // Counts of points in each cluster.
   arma::Col<size_t> counts(actualClusters);
   counts.zeros();
 
+  // If we received an initial cluster guess, assign the points for the first
+  // time.  Note that initialAssignmentGuess supersedes initialCentroidGuess.
+  if (initialCentroidGuess && !initialAssignmentGuess)
+  {
+    for (size_t i = 0; i < data.n_cols; ++i)
+    {
+      // Find the closest centroid to this point.
+      double minDistance = std::numeric_limits<double>::infinity();
+      size_t closestCluster = clusters; // Invalid value.
+
+      for (size_t j = 0; j < clusters; j++)
+      {
+        double distance = metric.Evaluate(data.col(i), centroids.col(j));
+
+        if (distance < minDistance)
+        {
+          minDistance = distance;
+          closestCluster = j;
+        }
+      }
+
+      // Assign the point to the closest cluster that we found.
+      assignments[i] = closestCluster;
+    }
+  }
+
+  // Resize to correct size.
+  centroids.set_size(data.n_rows, actualClusters);
+
   // Set counts correctly.
   for (size_t i = 0; i < assignments.n_elem; i++)
     counts[assignments[i]]++;
@@ -559,8 +619,7 @@
 
       for (size_t j = 0; j < actualClusters; j++)
       {
-        double distance = metric::SquaredEuclideanDistance::Evaluate(
-            data.col(i), centroids.col(j));
+        double distance = metric.Evaluate(data.col(i), centroids.col(j));
 
         if (distance < minDistance)
         {
@@ -592,8 +651,26 @@
 
   } while (changedAssignments > 0 && iteration != maxIterations);
 
-  Log::Debug << "Iterations: " << iteration << std::endl;
+  if (iteration != maxIterations)
+  {
+    Log::Debug << "KMeans::Cluster(): converged after " << iteration
+        << " iterations." << std::endl;
+  }
+  else
+  {
+    Log::Debug << "KMeans::Cluster(): terminated after limit of " << iteration
+        << " iterations." << std::endl;
 
+    // Recalculate final clusters.
+    centroids.zeros();
+
+    for (size_t i = 0; i < data.n_cols; i++)
+      centroids.col(assignments[i]) += data.col(i);
+
+    for (size_t i = 0; i < actualClusters; i++)
+      centroids.col(i) /= counts[i];
+  }
+
   // If we have overclustered, we need to merge the nearest clusters.
   if (actualClusters != clusters)
   {
@@ -614,8 +691,8 @@
     {
       for (size_t second = first + 1; second < actualClusters; second++)
       {
-        distances(i) = metric::SquaredEuclideanDistance::Evaluate(
-            centroids.col(first), centroids.col(second));
+        distances(i) = metric.Evaluate(centroids.col(first),
+                                       centroids.col(second));
         firstCluster(i) = first;
         secondCluster(i) = second;
         i++;
@@ -658,8 +735,7 @@
         {
           // Make sure it isn't already DBL_MAX.
           if (distances(offset + (first - cluster)) != DBL_MAX)
-            distances(offset + (first - cluster)) =
-                metric::SquaredEuclideanDistance::Evaluate(
+            distances(offset + (first - cluster)) = metric.Evaluate(
                 centroids.col(first), centroids.col(cluster));
         }
 
@@ -674,8 +750,7 @@
         // Make sure it isn't already DBL_MAX.
         if (distances(offset + (cluster - first)) != DBL_MAX)
         {
-          distances(offset + (cluster - first)) =
-              metric::SquaredEuclideanDistance::Evaluate(
+          distances(offset + (cluster - first)) = metric.Evaluate(
               centroids.col(first), centroids.col(cluster));
         }
       }




More information about the mlpack-svn mailing list