[mlpack-svn] r10741 - mlpack/trunk/src/mlpack/methods/kmeans
fastlab-svn at coffeetalk-1.cc.gatech.edu
fastlab-svn at coffeetalk-1.cc.gatech.edu
Tue Dec 13 04:33:27 EST 2011
Author: rcurtin
Date: 2011-12-13 04:33:27 -0500 (Tue, 13 Dec 2011)
New Revision: 10741
Added:
mlpack/trunk/src/mlpack/methods/kmeans/max_variance_new_cluster_impl.hpp
Removed:
mlpack/trunk/src/mlpack/methods/kmeans/max_variance_new_cluster.cpp
Modified:
mlpack/trunk/src/mlpack/methods/kmeans/CMakeLists.txt
mlpack/trunk/src/mlpack/methods/kmeans/allow_empty_clusters.hpp
mlpack/trunk/src/mlpack/methods/kmeans/kmeans.hpp
mlpack/trunk/src/mlpack/methods/kmeans/kmeans_impl.hpp
mlpack/trunk/src/mlpack/methods/kmeans/max_variance_new_cluster.hpp
mlpack/trunk/src/mlpack/methods/kmeans/random_partition.hpp
Log:
Generalize k-means so it can use sparse matrices.
Modified: mlpack/trunk/src/mlpack/methods/kmeans/CMakeLists.txt
===================================================================
--- mlpack/trunk/src/mlpack/methods/kmeans/CMakeLists.txt 2011-12-13 09:20:14 UTC (rev 10740)
+++ mlpack/trunk/src/mlpack/methods/kmeans/CMakeLists.txt 2011-12-13 09:33:27 UTC (rev 10741)
@@ -7,7 +7,7 @@
kmeans.hpp
kmeans_impl.hpp
max_variance_new_cluster.hpp
- max_variance_new_cluster.cpp
+ max_variance_new_cluster_impl.hpp
random_partition.hpp
)
Modified: mlpack/trunk/src/mlpack/methods/kmeans/allow_empty_clusters.hpp
===================================================================
--- mlpack/trunk/src/mlpack/methods/kmeans/allow_empty_clusters.hpp 2011-12-13 09:20:14 UTC (rev 10740)
+++ mlpack/trunk/src/mlpack/methods/kmeans/allow_empty_clusters.hpp 2011-12-13 09:33:27 UTC (rev 10741)
@@ -27,6 +27,7 @@
* This function does nothing. It is called by K-Means when K-Means detects
* an empty cluster.
*
+ * @tparam MatType Type of data (arma::mat or arma::spmat).
* @param data Dataset on which clustering is being performed.
* @param emptyCluster Index of cluster which is empty.
* @param centroids Centroids of each cluster (one per column).
@@ -35,7 +36,8 @@
*
* @return Number of points changed (0).
*/
- static size_t EmptyCluster(const arma::mat& data,
+ template<typename MatType>
+ static size_t EmptyCluster(const MatType& data,
const size_t emptyCluster,
const arma::mat& centroids,
arma::Col<size_t>& clusterCounts,
Modified: mlpack/trunk/src/mlpack/methods/kmeans/kmeans.hpp
===================================================================
--- mlpack/trunk/src/mlpack/methods/kmeans/kmeans.hpp 2011-12-13 09:20:14 UTC (rev 10740)
+++ mlpack/trunk/src/mlpack/methods/kmeans/kmeans.hpp 2011-12-13 09:33:27 UTC (rev 10741)
@@ -77,12 +77,14 @@
* elements in the list of assignments must be equal to the number of points
* (columns) in the dataset.
*
+ * @tparam MatType Type of matrix (arma::mat or arma::spmat).
* @param data Dataset to cluster.
* @param clusters Number of clusters to compute.
* @param assignments Vector to store cluster assignments in. Can contain an
* initial guess at cluster assignments.
*/
- void Cluster(const arma::mat& data,
+ template<typename MatType>
+ void Cluster(const MatType& data,
const size_t clusters,
arma::Col<size_t>& assignments) const;
Modified: mlpack/trunk/src/mlpack/methods/kmeans/kmeans_impl.hpp
===================================================================
--- mlpack/trunk/src/mlpack/methods/kmeans/kmeans_impl.hpp 2011-12-13 09:20:14 UTC (rev 10740)
+++ mlpack/trunk/src/mlpack/methods/kmeans/kmeans_impl.hpp 2011-12-13 09:33:27 UTC (rev 10741)
@@ -52,11 +52,12 @@
template<typename DistanceMetric,
typename InitialPartitionPolicy,
typename EmptyClusterPolicy>
+template<typename MatType>
void KMeans<
DistanceMetric,
InitialPartitionPolicy,
EmptyClusterPolicy>::
-Cluster(const arma::mat& data,
+Cluster(const MatType& data,
const size_t clusters,
arma::Col<size_t>& assignments) const
{
@@ -118,7 +119,7 @@
for (size_t j = 0; j < actualClusters; j++)
{
double distance = metric::SquaredEuclideanDistance::Evaluate(
- data.unsafe_col(i), centroids.unsafe_col(j));
+ data.col(i), centroids.col(j));
if (distance < minDistance)
{
@@ -171,7 +172,7 @@
for (size_t second = first + 1; second < actualClusters; second++)
{
distances(i) = metric::SquaredEuclideanDistance::Evaluate(
- centroids.unsafe_col(first), centroids.unsafe_col(second));
+ centroids.col(first), centroids.col(second));
firstCluster(i) = first;
secondCluster(i) = second;
i++;
@@ -216,7 +217,7 @@
if (distances(offset + (first - cluster)) != DBL_MAX)
distances(offset + (first - cluster)) =
metric::SquaredEuclideanDistance::Evaluate(
- centroids.unsafe_col(first), centroids.unsafe_col(cluster));
+ centroids.col(first), centroids.col(cluster));
}
distances(offset + (second - cluster)) = DBL_MAX;
@@ -232,7 +233,7 @@
{
distances(offset + (cluster - first)) =
metric::SquaredEuclideanDistance::Evaluate(
- centroids.unsafe_col(first), centroids.unsafe_col(cluster));
+ centroids.col(first), centroids.col(cluster));
}
}
Deleted: mlpack/trunk/src/mlpack/methods/kmeans/max_variance_new_cluster.cpp
===================================================================
--- mlpack/trunk/src/mlpack/methods/kmeans/max_variance_new_cluster.cpp 2011-12-13 09:20:14 UTC (rev 10740)
+++ mlpack/trunk/src/mlpack/methods/kmeans/max_variance_new_cluster.cpp 2011-12-13 09:33:27 UTC (rev 10741)
@@ -1,66 +0,0 @@
-/**
- * @file max_variance_new_cluster.cpp
- * @author Ryan Curtin
- *
- * Implementation of MaxVarianceNewCluster class.
- */
-#include "max_variance_new_cluster.hpp"
-
-using namespace mlpack;
-using namespace kmeans;
-
-/**
- * Take action about an empty cluster.
- */
-size_t MaxVarianceNewCluster::EmptyCluster(const arma::mat& data,
- const size_t emptyCluster,
- const arma::mat& centroids,
- arma::Col<size_t>& clusterCounts,
- arma::Col<size_t>& assignments)
-{
- // First, we need to find the cluster with maximum variance (by which I mean
- // the sum of the covariance matrices).
- arma::vec variances;
- variances.zeros(clusterCounts.n_elem); // Start with 0.
-
- // Add the variance of each point's distance away from the cluster. I think
- // this is the sensible thing to do.
- for (size_t i = 0; i < data.n_cols; i++)
- {
- arma::vec diff = data.col(i) - centroids.col(assignments[i]);
- variances[assignments[i]] += var(diff);
- }
-
- // Now find the cluster with maximum variance.
- arma::u32 maxVarCluster;
- variances.max(maxVarCluster);
-
- // Now, inside this cluster, find the point which is furthest away.
- size_t furthestPoint = data.n_cols;
- double maxDistance = 0;
- for (size_t i = 0; i < data.n_cols; i++)
- {
- if (assignments[i] == maxVarCluster)
- {
- arma::vec diff = data.col(i) - centroids.col(maxVarCluster);
- double distance = var(diff);
-
- if (distance > maxDistance)
- {
- maxDistance = distance;
- furthestPoint = i;
- }
- }
- }
-
- // Take that point and add it to the empty cluster.
- clusterCounts[maxVarCluster]--;
- clusterCounts[emptyCluster]++;
- assignments[furthestPoint] = emptyCluster;
-
- // Output some debugging information.
- Log::Debug << "Point " << furthestPoint << " assigned to empty cluster " <<
- emptyCluster << ".\n";
-
- return 1; // We only changed one point.
-}
Modified: mlpack/trunk/src/mlpack/methods/kmeans/max_variance_new_cluster.hpp
===================================================================
--- mlpack/trunk/src/mlpack/methods/kmeans/max_variance_new_cluster.hpp 2011-12-13 09:20:14 UTC (rev 10740)
+++ mlpack/trunk/src/mlpack/methods/kmeans/max_variance_new_cluster.hpp 2011-12-13 09:33:27 UTC (rev 10741)
@@ -28,6 +28,7 @@
* Take the point furthest from the centroid of the cluster with maximum
* variance to be a new cluster.
*
+ * @tparam MatType Type of data (arma::mat or arma::spmat).
* @param data Dataset on which clustering is being performed.
* @param emptyCluster Index of cluster which is empty.
* @param centroids Centroids of each cluster (one per column).
@@ -36,7 +37,8 @@
*
* @return Number of points changed.
*/
- static size_t EmptyCluster(const arma::mat& data,
+ template<typename MatType>
+ static size_t EmptyCluster(const MatType& data,
const size_t emptyCluster,
const arma::mat& centroids,
arma::Col<size_t>& clusterCounts,
@@ -46,4 +48,7 @@
}; // namespace kmeans
}; // namespace mlpack
+// Include implementation.
+#include "max_variance_new_cluster_impl.hpp"
+
#endif
Copied: mlpack/trunk/src/mlpack/methods/kmeans/max_variance_new_cluster_impl.hpp (from rev 10737, mlpack/trunk/src/mlpack/methods/kmeans/max_variance_new_cluster.cpp)
===================================================================
--- mlpack/trunk/src/mlpack/methods/kmeans/max_variance_new_cluster_impl.hpp (rev 0)
+++ mlpack/trunk/src/mlpack/methods/kmeans/max_variance_new_cluster_impl.hpp 2011-12-13 09:33:27 UTC (rev 10741)
@@ -0,0 +1,76 @@
+/**
+ * @file max_variance_new_cluster_impl.hpp
+ * @author Ryan Curtin
+ *
+ * Implementation of MaxVarianceNewCluster class.
+ */
+#ifndef __MLPACK_METHODS_KMEANS_MAX_VARIANCE_NEW_CLUSTER_IMPL_HPP
+#define __MLPACK_METHODS_KMEANS_MAX_VARIANCE_NEW_CLUSTER_IMPL_HPP
+
+// Just in case it has not been included.
+#include "max_variance_new_cluster.hpp"
+
+namespace mlpack {
+namespace kmeans {
+
+/**
+ * Take action about an empty cluster.
+ */
+template<typename MatType>
+size_t MaxVarianceNewCluster::EmptyCluster(const MatType& data,
+ const size_t emptyCluster,
+ const arma::mat& centroids,
+ arma::Col<size_t>& clusterCounts,
+ arma::Col<size_t>& assignments)
+{
+ // First, we need to find the cluster with maximum variance (by which I mean
+ // the sum of the covariance matrices).
+ arma::vec variances;
+ variances.zeros(clusterCounts.n_elem); // Start with 0.
+
+ // Add the variance of each point's distance away from the cluster. I think
+ // this is the sensible thing to do.
+ for (size_t i = 0; i < data.n_cols; i++)
+ {
+ arma::vec diff = data.col(i) - centroids.col(assignments[i]);
+ variances[assignments[i]] += var(diff);
+ }
+
+ // Now find the cluster with maximum variance.
+ arma::u32 maxVarCluster;
+ variances.max(maxVarCluster);
+
+ // Now, inside this cluster, find the point which is furthest away.
+ size_t furthestPoint = data.n_cols;
+ double maxDistance = 0;
+ for (size_t i = 0; i < data.n_cols; i++)
+ {
+ if (assignments[i] == maxVarCluster)
+ {
+ arma::vec diff = data.col(i) - centroids.col(maxVarCluster);
+ double distance = var(diff);
+
+ if (distance > maxDistance)
+ {
+ maxDistance = distance;
+ furthestPoint = i;
+ }
+ }
+ }
+
+ // Take that point and add it to the empty cluster.
+ clusterCounts[maxVarCluster]--;
+ clusterCounts[emptyCluster]++;
+ assignments[furthestPoint] = emptyCluster;
+
+ // Output some debugging information.
+ Log::Debug << "Point " << furthestPoint << " assigned to empty cluster " <<
+ emptyCluster << ".\n";
+
+ return 1; // We only changed one point.
+}
+
+}; // namespace kmeans
+}; // namespace mlpack
+
+#endif
Modified: mlpack/trunk/src/mlpack/methods/kmeans/random_partition.hpp
===================================================================
--- mlpack/trunk/src/mlpack/methods/kmeans/random_partition.hpp 2011-12-13 09:20:14 UTC (rev 10740)
+++ mlpack/trunk/src/mlpack/methods/kmeans/random_partition.hpp 2011-12-13 09:33:27 UTC (rev 10741)
@@ -29,12 +29,14 @@
* are random, and the number of points in each cluster should be equal (or
* approximately equal).
*
+ * @tparam MatType Type of data (arma::mat or arma::spmat).
* @param data Dataset to partition.
* @param clusters Number of clusters to split dataset into.
* @param assignments Vector to store cluster assignments into. Values will
* be between 0 and (clusters - 1).
*/
- inline static void Cluster(const arma::mat& data,
+ template<typename MatType>
+ inline static void Cluster(const MatType& data,
const size_t clusters,
arma::Col<size_t>& assignments)
{
More information about the mlpack-svn mailing list