[mlpack-svn] r10397 - in mlpack/trunk/src/mlpack/methods: . gmm kmeans
fastlab-svn at coffeetalk-1.cc.gatech.edu
fastlab-svn at coffeetalk-1.cc.gatech.edu
Thu Nov 24 16:53:08 EST 2011
Author: rcurtin
Date: 2011-11-24 16:53:08 -0500 (Thu, 24 Nov 2011)
New Revision: 10397
Added:
mlpack/trunk/src/mlpack/methods/kmeans/
mlpack/trunk/src/mlpack/methods/kmeans/CMakeLists.txt
mlpack/trunk/src/mlpack/methods/kmeans/kmeans.cpp
mlpack/trunk/src/mlpack/methods/kmeans/kmeans.hpp
Removed:
mlpack/trunk/src/mlpack/methods/gmm/kmeans.cpp
mlpack/trunk/src/mlpack/methods/gmm/kmeans.hpp
Modified:
mlpack/trunk/src/mlpack/methods/CMakeLists.txt
mlpack/trunk/src/mlpack/methods/gmm/CMakeLists.txt
mlpack/trunk/src/mlpack/methods/gmm/gmm.cpp
Log:
Move K-Means to be its own method.
Modified: mlpack/trunk/src/mlpack/methods/CMakeLists.txt
===================================================================
--- mlpack/trunk/src/mlpack/methods/CMakeLists.txt 2011-11-24 21:09:47 UTC (rev 10396)
+++ mlpack/trunk/src/mlpack/methods/CMakeLists.txt 2011-11-24 21:53:08 UTC (rev 10397)
@@ -7,6 +7,7 @@
gmm
hmm
infomax_ica
+ kmeans
linear_regression
#mvu # (currently known to not work)
naive_bayes
Modified: mlpack/trunk/src/mlpack/methods/gmm/CMakeLists.txt
===================================================================
--- mlpack/trunk/src/mlpack/methods/gmm/CMakeLists.txt 2011-11-24 21:09:47 UTC (rev 10396)
+++ mlpack/trunk/src/mlpack/methods/gmm/CMakeLists.txt 2011-11-24 21:53:08 UTC (rev 10397)
@@ -3,8 +3,6 @@
# Define the files we need to compile.
# Anything not in this list will not be compiled into MLPACK.
set(SOURCES
- kmeans.hpp
- kmeans.cpp
gmm.hpp
gmm.cpp
phi.hpp
Modified: mlpack/trunk/src/mlpack/methods/gmm/gmm.cpp
===================================================================
--- mlpack/trunk/src/mlpack/methods/gmm/gmm.cpp 2011-11-24 21:09:47 UTC (rev 10396)
+++ mlpack/trunk/src/mlpack/methods/gmm/gmm.cpp 2011-11-24 21:53:08 UTC (rev 10397)
@@ -8,10 +8,12 @@
*/
#include "gmm.hpp"
#include "phi.hpp"
-#include "kmeans.hpp"
+#include <mlpack/methods/kmeans/kmeans.hpp>
+
using namespace mlpack;
-using namespace gmm;
+using namespace mlpack::gmm;
+using namespace mlpack::kmeans;
double GMM::Probability(const arma::vec& observation) const
{
Deleted: mlpack/trunk/src/mlpack/methods/gmm/kmeans.cpp
===================================================================
--- mlpack/trunk/src/mlpack/methods/gmm/kmeans.cpp 2011-11-24 21:09:47 UTC (rev 10396)
+++ mlpack/trunk/src/mlpack/methods/gmm/kmeans.cpp 2011-11-24 21:53:08 UTC (rev 10397)
@@ -1,171 +0,0 @@
-/**
- * @author Parikshit Ram (pram at cc.gatech.edu)
- * @file kmeans.cpp
- *
- * Implementation for the K-means method for getting an initial point.
- *
- */
-#include "kmeans.hpp"
-
-#include <mlpack/core/metrics/lmetric.hpp>
-
-namespace mlpack {
-namespace gmm {
-
-void KMeans(const arma::mat& data,
- const size_t value_of_k,
- std::vector<arma::vec>& means,
- std::vector<arma::mat>& covars,
- arma::vec& weights) {
- // Make sure we have more points than clusters.
- if (value_of_k > data.n_cols)
- Log::Warn << "k-means: more clusters requested than points given. Empty"
- << " clusters may result." << std::endl;
-
- // Assignment of cluster of each point.
- arma::Col<size_t> assignments(data.n_cols); // Col used so we have shuffle().
- // Centroids of each cluster. Each column corresponds to a centroid.
- arma::mat centroids(data.n_rows, value_of_k);
- // Counts of points in each cluster.
- arma::Col<size_t> counts(value_of_k);
-
- // First we must randomly partition the dataset.
- assignments = arma::shuffle(arma::linspace<arma::Col<size_t> >(0,
- value_of_k - 1, data.n_cols));
-
- // Set counts correctly.
- for (size_t i = 0; i < value_of_k; i++)
- counts[i] = accu(assignments == i);
-
- size_t changed_assignments = 0;
- do
- {
- // Update step.
- // Calculate centroids based on given assignments.
- centroids.zeros();
-
- for (size_t i = 0; i < data.n_cols; i++)
- centroids.col(assignments[i]) += data.col(i);
-
- for (size_t i = 0; i < value_of_k; i++)
- centroids.col(i) /= counts[i];
-
- // Assignment step.
- // Find the closest centroid to each point. We will keep track of how many
- // assignments change. When no assignments change, we are done.
- changed_assignments = 0;
- for (size_t i = 0; i < data.n_cols; i++)
- {
- // Find the closest centroid to this point.
- double min_distance = std::numeric_limits<double>::infinity();
- size_t closest_cluster = value_of_k; // Invalid value.
-
- for (size_t j = 0; j < value_of_k; j++)
- {
- double distance = metric::SquaredEuclideanDistance::Evaluate(
- data.unsafe_col(i), centroids.unsafe_col(j));
-
- if (distance < min_distance)
- {
- min_distance = distance;
- closest_cluster = j;
- }
- }
-
- // Reassign this point to the closest cluster.
- if (assignments[i] != closest_cluster)
- {
- // Update counts.
- counts[assignments[i]]--;
- counts[closest_cluster]++;
- // Update assignment.
- assignments[i] = closest_cluster;
- changed_assignments++;
- }
- }
-
- // Keep-bad-things-from-happening step.
- // Ensure that no cluster is empty, and if so, take corrective action.
- for (size_t i = 0; i < value_of_k; i++)
- {
- if (counts[i] == 0)
- {
- // Strategy: take the furthest point from the cluster with highest
- // variance. So, we need the variance of each cluster.
- arma::vec variances;
- variances.zeros(value_of_k);
- for (size_t j = 0; j < data.n_cols; j++)
- variances[assignments[j]] += var(data.col(j));
-
- size_t cluster;
- double max_var = 0;
- for (size_t j = 0; j < value_of_k; j++)
- {
- if (variances[j] > max_var)
- {
- cluster = j;
- max_var = variances[j];
- }
- }
-
- // Now find the furthest point.
- size_t point = data.n_cols; // Invalid.
- double distance = 0;
- for (size_t j = 0; j < data.n_cols; j++)
- {
- if (assignments[j] == cluster)
- {
- double d = metric::SquaredEuclideanDistance::Evaluate(
- data.unsafe_col(j), centroids.unsafe_col(cluster));
-
- if (d >= distance)
- {
- distance = d;
- point = j;
- }
- }
- }
-
- // Take that point and add it to the empty cluster.
- counts[cluster]--;
- counts[i]++;
- assignments[point] = i;
- changed_assignments++;
- }
- }
-
- } while (changed_assignments > 0);
-
- // Now, with the centroids final, we need to find the covariance matrix of
- // each cluster and then the a priori weight. We also need to assign the
- // means to be the centroids. First, we must make sure the size of the
- // vectors is correct.
- means.resize(value_of_k);
- covars.resize(value_of_k);
- weights.set_size(value_of_k);
- for (size_t i = 0; i < value_of_k; i++)
- {
- // Assign mean.
- means[i] = centroids.col(i);
-
- // Calculate covariance.
- arma::mat data_subset(data.n_rows, accu(assignments == i));
- size_t position = 0;
- for (size_t j = 0; j < data.n_cols; j++)
- {
- if (assignments[j] == i)
- {
- data_subset.col(position) = data.col(j);
- position++;
- }
- }
-
- covars[i] = ccov(data_subset);
-
- // Assign weight.
- weights[i] = (double) accu(assignments == i) / (double) data.n_cols;
- }
-}
-
-}; // namespace gmm
-}; // namespace mlpack
Deleted: mlpack/trunk/src/mlpack/methods/gmm/kmeans.hpp
===================================================================
--- mlpack/trunk/src/mlpack/methods/gmm/kmeans.hpp 2011-11-24 21:09:47 UTC (rev 10396)
+++ mlpack/trunk/src/mlpack/methods/gmm/kmeans.hpp 2011-11-24 21:53:08 UTC (rev 10397)
@@ -1,31 +0,0 @@
-/**
- * @author Parikshit Ram (pram at cc.gatech.edu)
- * @file kmeans.hpp
- *
- * Defines a Gaussian Mixture model and
- * estimates the parameters of the model
- */
-#ifndef __MLPACK_METHODS_MOG_KMEANS_HPP
-#define __MLPACK_METHODS_MOG_KMEANS_HPP
-
-#include <mlpack/core.hpp>
-
-namespace mlpack {
-namespace gmm {
-
-/**
- * This function computes the k-means of the data and stores the calculated
- * means and covariances in the std::vector of vectors and matrices passed to
- * it. It sets the weights uniformly.
- *
- * This function is used to obtain a starting point for the optimization.
- */
-void KMeans(const arma::mat& data,
- const size_t value_of_k,
- std::vector<arma::vec>& means,
- std::vector<arma::mat>& covars,
- arma::vec& weights);
-}; // namespace gmm
-}; // namespace mlpack
-
-#endif // __MLPACK_METHODS_MOG_KMEANS_HPP
Copied: mlpack/trunk/src/mlpack/methods/kmeans/CMakeLists.txt (from rev 10392, mlpack/trunk/src/mlpack/methods/gmm/CMakeLists.txt)
===================================================================
--- mlpack/trunk/src/mlpack/methods/kmeans/CMakeLists.txt (rev 0)
+++ mlpack/trunk/src/mlpack/methods/kmeans/CMakeLists.txt 2011-11-24 21:53:08 UTC (rev 10397)
@@ -0,0 +1,17 @@
+cmake_minimum_required(VERSION 2.8)
+
+# Define the files we need to compile.
+# Anything not in this list will not be compiled into MLPACK.
+set(SOURCES
+ kmeans.hpp
+ kmeans.cpp
+)
+
+# Add directory name to sources.
+set(DIR_SRCS)
+foreach(file ${SOURCES})
+ set(DIR_SRCS ${DIR_SRCS} ${CMAKE_CURRENT_SOURCE_DIR}/${file})
+endforeach()
+# Append sources (with directory name) to list of all MLPACK sources (used at
+# the parent scope).
+set(MLPACK_SRCS ${MLPACK_SRCS} ${DIR_SRCS} PARENT_SCOPE)
Copied: mlpack/trunk/src/mlpack/methods/kmeans/kmeans.cpp (from rev 10392, mlpack/trunk/src/mlpack/methods/gmm/kmeans.cpp)
===================================================================
--- mlpack/trunk/src/mlpack/methods/kmeans/kmeans.cpp (rev 0)
+++ mlpack/trunk/src/mlpack/methods/kmeans/kmeans.cpp 2011-11-24 21:53:08 UTC (rev 10397)
@@ -0,0 +1,170 @@
+/**
+ * @file kmeans.cpp
+ * @author Parikshit Ram (pram at cc.gatech.edu)
+ *
+ * Implementation for the K-means method for getting an initial point.
+ */
+#include "kmeans.hpp"
+
+#include <mlpack/core/metrics/lmetric.hpp>
+
+namespace mlpack {
+namespace kmeans {
+
+void KMeans(const arma::mat& data,
+ const size_t value_of_k,
+ std::vector<arma::vec>& means,
+ std::vector<arma::mat>& covars,
+ arma::vec& weights) {
+ // Make sure we have more points than clusters.
+ if (value_of_k > data.n_cols)
+ Log::Warn << "k-means: more clusters requested than points given. Empty"
+ << " clusters may result." << std::endl;
+
+ // Assignment of cluster of each point.
+ arma::Col<size_t> assignments(data.n_cols); // Col used so we have shuffle().
+ // Centroids of each cluster. Each column corresponds to a centroid.
+ arma::mat centroids(data.n_rows, value_of_k);
+ // Counts of points in each cluster.
+ arma::Col<size_t> counts(value_of_k);
+
+ // First we must randomly partition the dataset.
+ assignments = arma::shuffle(arma::linspace<arma::Col<size_t> >(0,
+ value_of_k - 1, data.n_cols));
+
+ // Set counts correctly.
+ for (size_t i = 0; i < value_of_k; i++)
+ counts[i] = accu(assignments == i);
+
+ size_t changed_assignments = 0;
+ do
+ {
+ // Update step.
+ // Calculate centroids based on given assignments.
+ centroids.zeros();
+
+ for (size_t i = 0; i < data.n_cols; i++)
+ centroids.col(assignments[i]) += data.col(i);
+
+ for (size_t i = 0; i < value_of_k; i++)
+ centroids.col(i) /= counts[i];
+
+ // Assignment step.
+ // Find the closest centroid to each point. We will keep track of how many
+ // assignments change. When no assignments change, we are done.
+ changed_assignments = 0;
+ for (size_t i = 0; i < data.n_cols; i++)
+ {
+ // Find the closest centroid to this point.
+ double min_distance = std::numeric_limits<double>::infinity();
+ size_t closest_cluster = value_of_k; // Invalid value.
+
+ for (size_t j = 0; j < value_of_k; j++)
+ {
+ double distance = metric::SquaredEuclideanDistance::Evaluate(
+ data.unsafe_col(i), centroids.unsafe_col(j));
+
+ if (distance < min_distance)
+ {
+ min_distance = distance;
+ closest_cluster = j;
+ }
+ }
+
+ // Reassign this point to the closest cluster.
+ if (assignments[i] != closest_cluster)
+ {
+ // Update counts.
+ counts[assignments[i]]--;
+ counts[closest_cluster]++;
+ // Update assignment.
+ assignments[i] = closest_cluster;
+ changed_assignments++;
+ }
+ }
+
+ // Keep-bad-things-from-happening step.
+ // Ensure that no cluster is empty, and if so, take corrective action.
+ for (size_t i = 0; i < value_of_k; i++)
+ {
+ if (counts[i] == 0)
+ {
+ // Strategy: take the furthest point from the cluster with highest
+ // variance. So, we need the variance of each cluster.
+ arma::vec variances;
+ variances.zeros(value_of_k);
+ for (size_t j = 0; j < data.n_cols; j++)
+ variances[assignments[j]] += var(data.col(j));
+
+ size_t cluster;
+ double max_var = 0;
+ for (size_t j = 0; j < value_of_k; j++)
+ {
+ if (variances[j] > max_var)
+ {
+ cluster = j;
+ max_var = variances[j];
+ }
+ }
+
+ // Now find the furthest point.
+ size_t point = data.n_cols; // Invalid.
+ double distance = 0;
+ for (size_t j = 0; j < data.n_cols; j++)
+ {
+ if (assignments[j] == cluster)
+ {
+ double d = metric::SquaredEuclideanDistance::Evaluate(
+ data.unsafe_col(j), centroids.unsafe_col(cluster));
+
+ if (d >= distance)
+ {
+ distance = d;
+ point = j;
+ }
+ }
+ }
+
+ // Take that point and add it to the empty cluster.
+ counts[cluster]--;
+ counts[i]++;
+ assignments[point] = i;
+ changed_assignments++;
+ }
+ }
+
+ } while (changed_assignments > 0);
+
+ // Now, with the centroids final, we need to find the covariance matrix of
+ // each cluster and then the a priori weight. We also need to assign the
+ // means to be the centroids. First, we must make sure the size of the
+ // vectors is correct.
+ means.resize(value_of_k);
+ covars.resize(value_of_k);
+ weights.set_size(value_of_k);
+ for (size_t i = 0; i < value_of_k; i++)
+ {
+ // Assign mean.
+ means[i] = centroids.col(i);
+
+ // Calculate covariance.
+ arma::mat data_subset(data.n_rows, accu(assignments == i));
+ size_t position = 0;
+ for (size_t j = 0; j < data.n_cols; j++)
+ {
+ if (assignments[j] == i)
+ {
+ data_subset.col(position) = data.col(j);
+ position++;
+ }
+ }
+
+ covars[i] = ccov(data_subset);
+
+ // Assign weight.
+ weights[i] = (double) accu(assignments == i) / (double) data.n_cols;
+ }
+}
+
+}; // namespace gmm
+}; // namespace mlpack
Copied: mlpack/trunk/src/mlpack/methods/kmeans/kmeans.hpp (from rev 10393, mlpack/trunk/src/mlpack/methods/gmm/kmeans.hpp)
===================================================================
--- mlpack/trunk/src/mlpack/methods/kmeans/kmeans.hpp (rev 0)
+++ mlpack/trunk/src/mlpack/methods/kmeans/kmeans.hpp 2011-11-24 21:53:08 UTC (rev 10397)
@@ -0,0 +1,31 @@
+/**
+ * @file kmeans.hpp
+ * @author Parikshit Ram (pram at cc.gatech.edu)
+ *
+ * K-Means clustering.
+ */
+#ifndef __MLPACK_METHODS_KMEANS_KMEANS_HPP
+#define __MLPACK_METHODS_KMEANS_KMEANS_HPP
+
+#include <mlpack/core.hpp>
+
+namespace mlpack {
+namespace kmeans {
+
+/**
+ * This function computes the k-means of the data and stores the calculated
+ * means and covariances in the std::vector of vectors and matrices passed to
+ * it. It sets the weights uniformly.
+ *
+ * This function is used to obtain a starting point for the optimization.
+ */
+void KMeans(const arma::mat& data,
+ const size_t value_of_k,
+ std::vector<arma::vec>& means,
+ std::vector<arma::mat>& covars,
+ arma::vec& weights);
+
+}; // namespace gmm
+}; // namespace mlpack
+
+#endif // __MLPACK_METHODS_MOG_KMEANS_HPP
More information about the mlpack-svn
mailing list