[mlpack-svn] r10397 - in mlpack/trunk/src/mlpack/methods: . gmm kmeans

Thu Nov 24 16:53:08 EST 2011

Author: rcurtin
Date: 2011-11-24 16:53:08 -0500 (Thu, 24 Nov 2011)
New Revision: 10397

Added:
   mlpack/trunk/src/mlpack/methods/kmeans/
   mlpack/trunk/src/mlpack/methods/kmeans/CMakeLists.txt
   mlpack/trunk/src/mlpack/methods/kmeans/kmeans.cpp
   mlpack/trunk/src/mlpack/methods/kmeans/kmeans.hpp
Removed:
   mlpack/trunk/src/mlpack/methods/gmm/kmeans.cpp
   mlpack/trunk/src/mlpack/methods/gmm/kmeans.hpp
Modified:
   mlpack/trunk/src/mlpack/methods/CMakeLists.txt
   mlpack/trunk/src/mlpack/methods/gmm/CMakeLists.txt
   mlpack/trunk/src/mlpack/methods/gmm/gmm.cpp
Log:
Move K-Means to be its own method.


Modified: mlpack/trunk/src/mlpack/methods/CMakeLists.txt
===================================================================

--- mlpack/trunk/src/mlpack/methods/CMakeLists.txt	2011-11-24 21:09:47 UTC (rev 10396)
+++ mlpack/trunk/src/mlpack/methods/CMakeLists.txt	2011-11-24 21:53:08 UTC (rev 10397)
@@ -7,6 +7,7 @@
   gmm
   hmm
   infomax_ica
+  kmeans
   linear_regression
   #mvu  # (currently known to not work)
   naive_bayes

Modified: mlpack/trunk/src/mlpack/methods/gmm/CMakeLists.txt
===================================================================
--- mlpack/trunk/src/mlpack/methods/gmm/CMakeLists.txt	2011-11-24 21:09:47 UTC (rev 10396)
+++ mlpack/trunk/src/mlpack/methods/gmm/CMakeLists.txt	2011-11-24 21:53:08 UTC (rev 10397)
@@ -3,8 +3,6 @@
 # Define the files we need to compile.
 # Anything not in this list will not be compiled into MLPACK.
 set(SOURCES
-  kmeans.hpp
-  kmeans.cpp
   gmm.hpp
   gmm.cpp
   phi.hpp

Modified: mlpack/trunk/src/mlpack/methods/gmm/gmm.cpp
===================================================================
--- mlpack/trunk/src/mlpack/methods/gmm/gmm.cpp	2011-11-24 21:09:47 UTC (rev 10396)
+++ mlpack/trunk/src/mlpack/methods/gmm/gmm.cpp	2011-11-24 21:53:08 UTC (rev 10397)
@@ -8,10 +8,12 @@
  */
 #include "gmm.hpp"
 #include "phi.hpp"
-#include "kmeans.hpp"
 
+#include <mlpack/methods/kmeans/kmeans.hpp>
+
 using namespace mlpack;
-using namespace gmm;
+using namespace mlpack::gmm;
+using namespace mlpack::kmeans;
 
 double GMM::Probability(const arma::vec& observation) const
 {

Deleted: mlpack/trunk/src/mlpack/methods/gmm/kmeans.cpp
===================================================================
--- mlpack/trunk/src/mlpack/methods/gmm/kmeans.cpp	2011-11-24 21:09:47 UTC (rev 10396)
+++ mlpack/trunk/src/mlpack/methods/gmm/kmeans.cpp	2011-11-24 21:53:08 UTC (rev 10397)
@@ -1,171 +0,0 @@
-/**
- * @author Parikshit Ram (pram at cc.gatech.edu)
- * @file kmeans.cpp
- *
- * Implementation for the K-means method for getting an initial point.
- *
- */
-#include "kmeans.hpp"
-
-#include <mlpack/core/metrics/lmetric.hpp>
-
-namespace mlpack {
-namespace gmm {
-
-void KMeans(const arma::mat& data,
-            const size_t value_of_k,
-            std::vector<arma::vec>& means,
-            std::vector<arma::mat>& covars,
-            arma::vec& weights) {
-  // Make sure we have more points than clusters.
-  if (value_of_k > data.n_cols)
-    Log::Warn << "k-means: more clusters requested than points given.  Empty"
-        << " clusters may result." << std::endl;
-
-  // Assignment of cluster of each point.
-  arma::Col<size_t> assignments(data.n_cols); // Col used so we have shuffle().
-  // Centroids of each cluster.  Each column corresponds to a centroid.
-  arma::mat centroids(data.n_rows, value_of_k);
-  // Counts of points in each cluster.
-  arma::Col<size_t> counts(value_of_k);
-
-  // First we must randomly partition the dataset.
-  assignments = arma::shuffle(arma::linspace<arma::Col<size_t> >(0,
-      value_of_k - 1, data.n_cols));
-
-  // Set counts correctly.
-  for (size_t i = 0; i < value_of_k; i++)
-    counts[i] = accu(assignments == i);
-
-  size_t changed_assignments = 0;
-  do
-  {
-    // Update step.
-    // Calculate centroids based on given assignments.
-    centroids.zeros();
-
-    for (size_t i = 0; i < data.n_cols; i++)
-      centroids.col(assignments[i]) += data.col(i);
-
-    for (size_t i = 0; i < value_of_k; i++)
-      centroids.col(i) /= counts[i];
-
-    // Assignment step.
-    // Find the closest centroid to each point.  We will keep track of how many
-    // assignments change.  When no assignments change, we are done.
-    changed_assignments = 0;
-    for (size_t i = 0; i < data.n_cols; i++)
-    {
-      // Find the closest centroid to this point.
-      double min_distance = std::numeric_limits<double>::infinity();
-      size_t closest_cluster = value_of_k; // Invalid value.
-
-      for (size_t j = 0; j < value_of_k; j++)
-      {
-        double distance = metric::SquaredEuclideanDistance::Evaluate(
-            data.unsafe_col(i), centroids.unsafe_col(j));
-
-        if (distance < min_distance)
-        {
-          min_distance = distance;
-          closest_cluster = j;
-        }
-      }
-
-      // Reassign this point to the closest cluster.
-      if (assignments[i] != closest_cluster)
-      {
-        // Update counts.
-        counts[assignments[i]]--;
-        counts[closest_cluster]++;
-        // Update assignment.
-        assignments[i] = closest_cluster;
-        changed_assignments++;
-      }
-    }
-
-    // Keep-bad-things-from-happening step.
-    // Ensure that no cluster is empty, and if so, take corrective action.
-    for (size_t i = 0; i < value_of_k; i++)
-    {
-      if (counts[i] == 0)
-      {
-        // Strategy: take the furthest point from the cluster with highest
-        // variance.  So, we need the variance of each cluster.
-        arma::vec variances;
-        variances.zeros(value_of_k);
-        for (size_t j = 0; j < data.n_cols; j++)
-          variances[assignments[j]] += var(data.col(j));
-
-        size_t cluster;
-        double max_var = 0;
-        for (size_t j = 0; j < value_of_k; j++)
-        {
-          if (variances[j] > max_var)
-          {
-            cluster = j;
-            max_var = variances[j];
-          }
-        }
-
-        // Now find the furthest point.
-        size_t point = data.n_cols; // Invalid.
-        double distance = 0;
-        for (size_t j = 0; j < data.n_cols; j++)
-        {
-          if (assignments[j] == cluster)
-          {
-            double d = metric::SquaredEuclideanDistance::Evaluate(
-                data.unsafe_col(j), centroids.unsafe_col(cluster));
-
-            if (d >= distance)
-            {
-              distance = d;
-              point = j;
-            }
-          }
-        }
-
-        // Take that point and add it to the empty cluster.
-        counts[cluster]--;
-        counts[i]++;
-        assignments[point] = i;
-        changed_assignments++;
-      }
-    }
-
-  } while (changed_assignments > 0);
-
-  // Now, with the centroids final, we need to find the covariance matrix of
-  // each cluster and then the a priori weight.  We also need to assign the
-  // means to be the centroids.  First, we must make sure the size of the
-  // vectors is correct.
-  means.resize(value_of_k);
-  covars.resize(value_of_k);
-  weights.set_size(value_of_k);
-  for (size_t i = 0; i < value_of_k; i++)
-  {
-    // Assign mean.
-    means[i] = centroids.col(i);
-
-    // Calculate covariance.
-    arma::mat data_subset(data.n_rows, accu(assignments == i));
-    size_t position = 0;
-    for (size_t j = 0; j < data.n_cols; j++)
-    {
-      if (assignments[j] == i)
-      {
-        data_subset.col(position) = data.col(j);
-        position++;
-      }
-    }
-
-    covars[i] = ccov(data_subset);
-
-    // Assign weight.
-    weights[i] = (double) accu(assignments == i) / (double) data.n_cols;
-  }
-}
-
-}; // namespace gmm
-}; // namespace mlpack

Deleted: mlpack/trunk/src/mlpack/methods/gmm/kmeans.hpp
===================================================================
--- mlpack/trunk/src/mlpack/methods/gmm/kmeans.hpp	2011-11-24 21:09:47 UTC (rev 10396)
+++ mlpack/trunk/src/mlpack/methods/gmm/kmeans.hpp	2011-11-24 21:53:08 UTC (rev 10397)
@@ -1,31 +0,0 @@
-/**
- * @author Parikshit Ram (pram at cc.gatech.edu)
- * @file kmeans.hpp
- *
- * Defines a Gaussian Mixture model and
- * estimates the parameters of the model
- */
-#ifndef __MLPACK_METHODS_MOG_KMEANS_HPP
-#define __MLPACK_METHODS_MOG_KMEANS_HPP
-
-#include <mlpack/core.hpp>
-
-namespace mlpack {
-namespace gmm {
-
-/**
- * This function computes the k-means of the data and stores the calculated
- * means and covariances in the std::vector of vectors and matrices passed to
- * it.  It sets the weights uniformly.
- *
- * This function is used to obtain a starting point for the optimization.
- */
-void KMeans(const arma::mat& data,
-            const size_t value_of_k,
-            std::vector<arma::vec>& means,
-            std::vector<arma::mat>& covars,
-            arma::vec& weights);
-}; // namespace gmm
-}; // namespace mlpack
-
-#endif // __MLPACK_METHODS_MOG_KMEANS_HPP

Copied: mlpack/trunk/src/mlpack/methods/kmeans/CMakeLists.txt (from rev 10392, mlpack/trunk/src/mlpack/methods/gmm/CMakeLists.txt)
===================================================================
--- mlpack/trunk/src/mlpack/methods/kmeans/CMakeLists.txt	                        (rev 0)
+++ mlpack/trunk/src/mlpack/methods/kmeans/CMakeLists.txt	2011-11-24 21:53:08 UTC (rev 10397)
@@ -0,0 +1,17 @@
+cmake_minimum_required(VERSION 2.8)
+
+# Define the files we need to compile.
+# Anything not in this list will not be compiled into MLPACK.
+set(SOURCES
+  kmeans.hpp
+  kmeans.cpp
+)
+
+# Add directory name to sources.
+set(DIR_SRCS)
+foreach(file ${SOURCES})
+  set(DIR_SRCS ${DIR_SRCS} ${CMAKE_CURRENT_SOURCE_DIR}/${file})
+endforeach()
+# Append sources (with directory name) to list of all MLPACK sources (used at
+# the parent scope).
+set(MLPACK_SRCS ${MLPACK_SRCS} ${DIR_SRCS} PARENT_SCOPE)

Copied: mlpack/trunk/src/mlpack/methods/kmeans/kmeans.cpp (from rev 10392, mlpack/trunk/src/mlpack/methods/gmm/kmeans.cpp)
===================================================================
--- mlpack/trunk/src/mlpack/methods/kmeans/kmeans.cpp	                        (rev 0)
+++ mlpack/trunk/src/mlpack/methods/kmeans/kmeans.cpp	2011-11-24 21:53:08 UTC (rev 10397)
@@ -0,0 +1,170 @@
+/**
+ * @file kmeans.cpp
+ * @author Parikshit Ram (pram at cc.gatech.edu)
+ *
+ * Implementation for the K-means method for getting an initial point.
+ */
+#include "kmeans.hpp"
+
+#include <mlpack/core/metrics/lmetric.hpp>
+
+namespace mlpack {
+namespace kmeans {
+
+void KMeans(const arma::mat& data,
+            const size_t value_of_k,
+            std::vector<arma::vec>& means,
+            std::vector<arma::mat>& covars,
+            arma::vec& weights) {
+  // Make sure we have more points than clusters.
+  if (value_of_k > data.n_cols)
+    Log::Warn << "k-means: more clusters requested than points given.  Empty"
+        << " clusters may result." << std::endl;
+
+  // Assignment of cluster of each point.
+  arma::Col<size_t> assignments(data.n_cols); // Col used so we have shuffle().
+  // Centroids of each cluster.  Each column corresponds to a centroid.
+  arma::mat centroids(data.n_rows, value_of_k);
+  // Counts of points in each cluster.
+  arma::Col<size_t> counts(value_of_k);
+
+  // First we must randomly partition the dataset.
+  assignments = arma::shuffle(arma::linspace<arma::Col<size_t> >(0,
+      value_of_k - 1, data.n_cols));
+
+  // Set counts correctly.
+  for (size_t i = 0; i < value_of_k; i++)
+    counts[i] = accu(assignments == i);
+
+  size_t changed_assignments = 0;
+  do
+  {
+    // Update step.
+    // Calculate centroids based on given assignments.
+    centroids.zeros();
+
+    for (size_t i = 0; i < data.n_cols; i++)
+      centroids.col(assignments[i]) += data.col(i);
+
+    for (size_t i = 0; i < value_of_k; i++)
+      centroids.col(i) /= counts[i];
+
+    // Assignment step.
+    // Find the closest centroid to each point.  We will keep track of how many
+    // assignments change.  When no assignments change, we are done.
+    changed_assignments = 0;
+    for (size_t i = 0; i < data.n_cols; i++)
+    {
+      // Find the closest centroid to this point.
+      double min_distance = std::numeric_limits<double>::infinity();
+      size_t closest_cluster = value_of_k; // Invalid value.
+
+      for (size_t j = 0; j < value_of_k; j++)
+      {
+        double distance = metric::SquaredEuclideanDistance::Evaluate(
+            data.unsafe_col(i), centroids.unsafe_col(j));
+
+        if (distance < min_distance)
+        {
+          min_distance = distance;
+          closest_cluster = j;
+        }
+      }
+
+      // Reassign this point to the closest cluster.
+      if (assignments[i] != closest_cluster)
+      {
+        // Update counts.
+        counts[assignments[i]]--;
+        counts[closest_cluster]++;
+        // Update assignment.
+        assignments[i] = closest_cluster;
+        changed_assignments++;
+      }
+    }
+
+    // Keep-bad-things-from-happening step.
+    // Ensure that no cluster is empty, and if so, take corrective action.
+    for (size_t i = 0; i < value_of_k; i++)
+    {
+      if (counts[i] == 0)
+      {
+        // Strategy: take the furthest point from the cluster with highest
+        // variance.  So, we need the variance of each cluster.
+        arma::vec variances;
+        variances.zeros(value_of_k);
+        for (size_t j = 0; j < data.n_cols; j++)
+          variances[assignments[j]] += var(data.col(j));
+
+        size_t cluster;
+        double max_var = 0;
+        for (size_t j = 0; j < value_of_k; j++)
+        {
+          if (variances[j] > max_var)
+          {
+            cluster = j;
+            max_var = variances[j];
+          }
+        }
+
+        // Now find the furthest point.
+        size_t point = data.n_cols; // Invalid.
+        double distance = 0;
+        for (size_t j = 0; j < data.n_cols; j++)
+        {
+          if (assignments[j] == cluster)
+          {
+            double d = metric::SquaredEuclideanDistance::Evaluate(
+                data.unsafe_col(j), centroids.unsafe_col(cluster));
+
+            if (d >= distance)
+            {
+              distance = d;
+              point = j;
+            }
+          }
+        }
+
+        // Take that point and add it to the empty cluster.
+        counts[cluster]--;
+        counts[i]++;
+        assignments[point] = i;
+        changed_assignments++;
+      }
+    }
+
+  } while (changed_assignments > 0);
+
+  // Now, with the centroids final, we need to find the covariance matrix of
+  // each cluster and then the a priori weight.  We also need to assign the
+  // means to be the centroids.  First, we must make sure the size of the
+  // vectors is correct.
+  means.resize(value_of_k);
+  covars.resize(value_of_k);
+  weights.set_size(value_of_k);
+  for (size_t i = 0; i < value_of_k; i++)
+  {
+    // Assign mean.
+    means[i] = centroids.col(i);
+
+    // Calculate covariance.
+    arma::mat data_subset(data.n_rows, accu(assignments == i));
+    size_t position = 0;
+    for (size_t j = 0; j < data.n_cols; j++)
+    {
+      if (assignments[j] == i)
+      {
+        data_subset.col(position) = data.col(j);
+        position++;
+      }
+    }
+
+    covars[i] = ccov(data_subset);
+
+    // Assign weight.
+    weights[i] = (double) accu(assignments == i) / (double) data.n_cols;
+  }
+}
+
+}; // namespace gmm
+}; // namespace mlpack

Copied: mlpack/trunk/src/mlpack/methods/kmeans/kmeans.hpp (from rev 10393, mlpack/trunk/src/mlpack/methods/gmm/kmeans.hpp)
===================================================================
--- mlpack/trunk/src/mlpack/methods/kmeans/kmeans.hpp	                        (rev 0)
+++ mlpack/trunk/src/mlpack/methods/kmeans/kmeans.hpp	2011-11-24 21:53:08 UTC (rev 10397)
@@ -0,0 +1,31 @@
+/**
+ * @file kmeans.hpp
+ * @author Parikshit Ram (pram at cc.gatech.edu)
+ *
+ * K-Means clustering.
+ */
+#ifndef __MLPACK_METHODS_KMEANS_KMEANS_HPP
+#define __MLPACK_METHODS_KMEANS_KMEANS_HPP
+
+#include <mlpack/core.hpp>
+
+namespace mlpack {
+namespace kmeans {
+
+/**
+ * This function computes the k-means of the data and stores the calculated
+ * means and covariances in the std::vector of vectors and matrices passed to
+ * it.  It sets the weights uniformly.
+ *
+ * This function is used to obtain a starting point for the optimization.
+ */
+void KMeans(const arma::mat& data,
+            const size_t value_of_k,
+            std::vector<arma::vec>& means,
+            std::vector<arma::mat>& covars,
+            arma::vec& weights);
+
+}; // namespace gmm
+}; // namespace mlpack
+
+#endif // __MLPACK_METHODS_MOG_KMEANS_HPP