[mlpack-svn] r10741 - mlpack/trunk/src/mlpack/methods/kmeans

fastlab-svn at coffeetalk-1.cc.gatech.edu fastlab-svn at coffeetalk-1.cc.gatech.edu
Tue Dec 13 04:33:27 EST 2011


Author: rcurtin
Date: 2011-12-13 04:33:27 -0500 (Tue, 13 Dec 2011)
New Revision: 10741

Added:
   mlpack/trunk/src/mlpack/methods/kmeans/max_variance_new_cluster_impl.hpp
Removed:
   mlpack/trunk/src/mlpack/methods/kmeans/max_variance_new_cluster.cpp
Modified:
   mlpack/trunk/src/mlpack/methods/kmeans/CMakeLists.txt
   mlpack/trunk/src/mlpack/methods/kmeans/allow_empty_clusters.hpp
   mlpack/trunk/src/mlpack/methods/kmeans/kmeans.hpp
   mlpack/trunk/src/mlpack/methods/kmeans/kmeans_impl.hpp
   mlpack/trunk/src/mlpack/methods/kmeans/max_variance_new_cluster.hpp
   mlpack/trunk/src/mlpack/methods/kmeans/random_partition.hpp
Log:
Generalize k-means by templatizing it on the matrix type (MatType) so it can use sparse matrices.


Modified: mlpack/trunk/src/mlpack/methods/kmeans/CMakeLists.txt
===================================================================
--- mlpack/trunk/src/mlpack/methods/kmeans/CMakeLists.txt	2011-12-13 09:20:14 UTC (rev 10740)
+++ mlpack/trunk/src/mlpack/methods/kmeans/CMakeLists.txt	2011-12-13 09:33:27 UTC (rev 10741)
@@ -7,7 +7,7 @@
   kmeans.hpp
   kmeans_impl.hpp
   max_variance_new_cluster.hpp
-  max_variance_new_cluster.cpp
+  max_variance_new_cluster_impl.hpp
   random_partition.hpp
 )
 

Modified: mlpack/trunk/src/mlpack/methods/kmeans/allow_empty_clusters.hpp
===================================================================
--- mlpack/trunk/src/mlpack/methods/kmeans/allow_empty_clusters.hpp	2011-12-13 09:20:14 UTC (rev 10740)
+++ mlpack/trunk/src/mlpack/methods/kmeans/allow_empty_clusters.hpp	2011-12-13 09:33:27 UTC (rev 10741)
@@ -27,6 +27,7 @@
    * This function does nothing.  It is called by K-Means when K-Means detects
    * an empty cluster.
    *
+   * @tparam MatType Type of data (arma::mat or arma::spmat).
    * @param data Dataset on which clustering is being performed.
    * @param emptyCluster Index of cluster which is empty.
    * @param centroids Centroids of each cluster (one per column).
@@ -35,7 +36,8 @@
    *
    * @return Number of points changed (0).
    */
-  static size_t EmptyCluster(const arma::mat& data,
+  template<typename MatType>
+  static size_t EmptyCluster(const MatType& data,
                              const size_t emptyCluster,
                              const arma::mat& centroids,
                              arma::Col<size_t>& clusterCounts,

Modified: mlpack/trunk/src/mlpack/methods/kmeans/kmeans.hpp
===================================================================
--- mlpack/trunk/src/mlpack/methods/kmeans/kmeans.hpp	2011-12-13 09:20:14 UTC (rev 10740)
+++ mlpack/trunk/src/mlpack/methods/kmeans/kmeans.hpp	2011-12-13 09:33:27 UTC (rev 10741)
@@ -77,12 +77,14 @@
    * elements in the list of assignments must be equal to the number of points
    * (columns) in the dataset.
    *
+   * @tparam MatType Type of matrix (arma::mat or arma::spmat).
    * @param data Dataset to cluster.
    * @param clusters Number of clusters to compute.
    * @param assignments Vector to store cluster assignments in.  Can contain an
    *     initial guess at cluster assignments.
    */
-  void Cluster(const arma::mat& data,
+  template<typename MatType>
+  void Cluster(const MatType& data,
                const size_t clusters,
                arma::Col<size_t>& assignments) const;
 

Modified: mlpack/trunk/src/mlpack/methods/kmeans/kmeans_impl.hpp
===================================================================
--- mlpack/trunk/src/mlpack/methods/kmeans/kmeans_impl.hpp	2011-12-13 09:20:14 UTC (rev 10740)
+++ mlpack/trunk/src/mlpack/methods/kmeans/kmeans_impl.hpp	2011-12-13 09:33:27 UTC (rev 10741)
@@ -52,11 +52,12 @@
 template<typename DistanceMetric,
          typename InitialPartitionPolicy,
          typename EmptyClusterPolicy>
+template<typename MatType>
 void KMeans<
     DistanceMetric,
     InitialPartitionPolicy,
     EmptyClusterPolicy>::
-Cluster(const arma::mat& data,
+Cluster(const MatType& data,
         const size_t clusters,
         arma::Col<size_t>& assignments) const
 {
@@ -118,7 +119,7 @@
       for (size_t j = 0; j < actualClusters; j++)
       {
         double distance = metric::SquaredEuclideanDistance::Evaluate(
-            data.unsafe_col(i), centroids.unsafe_col(j));
+            data.col(i), centroids.col(j));
 
         if (distance < minDistance)
         {
@@ -171,7 +172,7 @@
       for (size_t second = first + 1; second < actualClusters; second++)
       {
         distances(i) = metric::SquaredEuclideanDistance::Evaluate(
-            centroids.unsafe_col(first), centroids.unsafe_col(second));
+            centroids.col(first), centroids.col(second));
         firstCluster(i) = first;
         secondCluster(i) = second;
         i++;
@@ -216,7 +217,7 @@
           if (distances(offset + (first - cluster)) != DBL_MAX)
             distances(offset + (first - cluster)) =
                 metric::SquaredEuclideanDistance::Evaluate(
-                centroids.unsafe_col(first), centroids.unsafe_col(cluster));
+                centroids.col(first), centroids.col(cluster));
         }
 
         distances(offset + (second - cluster)) = DBL_MAX;
@@ -232,7 +233,7 @@
         {
           distances(offset + (cluster - first)) =
               metric::SquaredEuclideanDistance::Evaluate(
-              centroids.unsafe_col(first), centroids.unsafe_col(cluster));
+              centroids.col(first), centroids.col(cluster));
         }
       }
 

Deleted: mlpack/trunk/src/mlpack/methods/kmeans/max_variance_new_cluster.cpp
===================================================================
--- mlpack/trunk/src/mlpack/methods/kmeans/max_variance_new_cluster.cpp	2011-12-13 09:20:14 UTC (rev 10740)
+++ mlpack/trunk/src/mlpack/methods/kmeans/max_variance_new_cluster.cpp	2011-12-13 09:33:27 UTC (rev 10741)
@@ -1,66 +0,0 @@
-/**
- * @file max_variance_new_cluster.cpp
- * @author Ryan Curtin
- *
- * Implementation of MaxVarianceNewCluster class.
- */
-#include "max_variance_new_cluster.hpp"
-
-using namespace mlpack;
-using namespace kmeans;
-
-/**
- * Take action about an empty cluster.
- */
-size_t MaxVarianceNewCluster::EmptyCluster(const arma::mat& data,
-                                           const size_t emptyCluster,
-                                           const arma::mat& centroids,
-                                           arma::Col<size_t>& clusterCounts,
-                                           arma::Col<size_t>& assignments)
-{
-  // First, we need to find the cluster with maximum variance (by which I mean
-  // the sum of the covariance matrices).
-  arma::vec variances;
-  variances.zeros(clusterCounts.n_elem); // Start with 0.
-
-  // Add the variance of each point's distance away from the cluster.  I think
-  // this is the sensible thing to do.
-  for (size_t i = 0; i < data.n_cols; i++)
-  {
-    arma::vec diff = data.col(i) - centroids.col(assignments[i]);
-    variances[assignments[i]] += var(diff);
-  }
-
-  // Now find the cluster with maximum variance.
-  arma::u32 maxVarCluster;
-  variances.max(maxVarCluster);
-
-  // Now, inside this cluster, find the point which is furthest away.
-  size_t furthestPoint = data.n_cols;
-  double maxDistance = 0;
-  for (size_t i = 0; i < data.n_cols; i++)
-  {
-    if (assignments[i] == maxVarCluster)
-    {
-      arma::vec diff = data.col(i) - centroids.col(maxVarCluster);
-      double distance = var(diff);
-
-      if (distance > maxDistance)
-      {
-        maxDistance = distance;
-        furthestPoint = i;
-      }
-    }
-  }
-
-  // Take that point and add it to the empty cluster.
-  clusterCounts[maxVarCluster]--;
-  clusterCounts[emptyCluster]++;
-  assignments[furthestPoint] = emptyCluster;
-
-  // Output some debugging information.
-  Log::Debug << "Point " << furthestPoint << " assigned to empty cluster " <<
-      emptyCluster << ".\n";
-
-  return 1; // We only changed one point.
-}

Modified: mlpack/trunk/src/mlpack/methods/kmeans/max_variance_new_cluster.hpp
===================================================================
--- mlpack/trunk/src/mlpack/methods/kmeans/max_variance_new_cluster.hpp	2011-12-13 09:20:14 UTC (rev 10740)
+++ mlpack/trunk/src/mlpack/methods/kmeans/max_variance_new_cluster.hpp	2011-12-13 09:33:27 UTC (rev 10741)
@@ -28,6 +28,7 @@
    * Take the point furthest from the centroid of the cluster with maximum
    * variance to be a new cluster.
    *
+   * @tparam MatType Type of data (arma::mat or arma::spmat).
    * @param data Dataset on which clustering is being performed.
    * @param emptyCluster Index of cluster which is empty.
    * @param centroids Centroids of each cluster (one per column).
@@ -36,7 +37,8 @@
    *
    * @return Number of points changed.
    */
-  static size_t EmptyCluster(const arma::mat& data,
+  template<typename MatType>
+  static size_t EmptyCluster(const MatType& data,
                              const size_t emptyCluster,
                              const arma::mat& centroids,
                              arma::Col<size_t>& clusterCounts,
@@ -46,4 +48,7 @@
 }; // namespace kmeans
 }; // namespace mlpack
 
+// Include implementation.
+#include "max_variance_new_cluster_impl.hpp"
+
 #endif

Copied: mlpack/trunk/src/mlpack/methods/kmeans/max_variance_new_cluster_impl.hpp (from rev 10737, mlpack/trunk/src/mlpack/methods/kmeans/max_variance_new_cluster.cpp)
===================================================================
--- mlpack/trunk/src/mlpack/methods/kmeans/max_variance_new_cluster_impl.hpp	                        (rev 0)
+++ mlpack/trunk/src/mlpack/methods/kmeans/max_variance_new_cluster_impl.hpp	2011-12-13 09:33:27 UTC (rev 10741)
@@ -0,0 +1,76 @@
+/**
+ * @file max_variance_new_cluster_impl.hpp
+ * @author Ryan Curtin
+ *
+ * Implementation of MaxVarianceNewCluster class.
+ */
+#ifndef __MLPACK_METHODS_KMEANS_MAX_VARIANCE_NEW_CLUSTER_IMPL_HPP
+#define __MLPACK_METHODS_KMEANS_MAX_VARIANCE_NEW_CLUSTER_IMPL_HPP
+
+// Just in case it has not been included.
+#include "max_variance_new_cluster.hpp"
+
+namespace mlpack {
+namespace kmeans {
+
+/**
+ * Take action about an empty cluster.
+ */
+template<typename MatType>
+size_t MaxVarianceNewCluster::EmptyCluster(const MatType& data,
+                                           const size_t emptyCluster,
+                                           const arma::mat& centroids,
+                                           arma::Col<size_t>& clusterCounts,
+                                           arma::Col<size_t>& assignments)
+{
+  // First, we need to find the cluster with maximum variance (by which I mean
+  // the sum of the covariance matrices).
+  arma::vec variances;
+  variances.zeros(clusterCounts.n_elem); // Start with 0.
+
+  // Add the variance of each point's distance away from the cluster.  I think
+  // this is the sensible thing to do.
+  for (size_t i = 0; i < data.n_cols; i++)
+  {
+    arma::vec diff = data.col(i) - centroids.col(assignments[i]);
+    variances[assignments[i]] += var(diff);
+  }
+
+  // Now find the cluster with maximum variance.
+  arma::u32 maxVarCluster;
+  variances.max(maxVarCluster);
+
+  // Now, inside this cluster, find the point which is furthest away.
+  size_t furthestPoint = data.n_cols;
+  double maxDistance = 0;
+  for (size_t i = 0; i < data.n_cols; i++)
+  {
+    if (assignments[i] == maxVarCluster)
+    {
+      arma::vec diff = data.col(i) - centroids.col(maxVarCluster);
+      double distance = var(diff);
+
+      if (distance > maxDistance)
+      {
+        maxDistance = distance;
+        furthestPoint = i;
+      }
+    }
+  }
+
+  // Take that point and add it to the empty cluster.
+  clusterCounts[maxVarCluster]--;
+  clusterCounts[emptyCluster]++;
+  assignments[furthestPoint] = emptyCluster;
+
+  // Output some debugging information.
+  Log::Debug << "Point " << furthestPoint << " assigned to empty cluster " <<
+      emptyCluster << ".\n";
+
+  return 1; // We only changed one point.
+}
+
+}; // namespace kmeans
+}; // namespace mlpack
+
+#endif

Modified: mlpack/trunk/src/mlpack/methods/kmeans/random_partition.hpp
===================================================================
--- mlpack/trunk/src/mlpack/methods/kmeans/random_partition.hpp	2011-12-13 09:20:14 UTC (rev 10740)
+++ mlpack/trunk/src/mlpack/methods/kmeans/random_partition.hpp	2011-12-13 09:33:27 UTC (rev 10741)
@@ -29,12 +29,14 @@
    * are random, and the number of points in each cluster should be equal (or
    * approximately equal).
    *
+   * @tparam MatType Type of data (arma::mat or arma::spmat).
    * @param data Dataset to partition.
    * @param clusters Number of clusters to split dataset into.
    * @param assignments Vector to store cluster assignments into.  Values will
    *     be between 0 and (clusters - 1).
    */
-  inline static void Cluster(const arma::mat& data,
+  template<typename MatType>
+  inline static void Cluster(const MatType& data,
                              const size_t clusters,
                              arma::Col<size_t>& assignments)
   {




More information about the mlpack-svn mailing list