[mlpack-git] master: Refactor EmptyClusterPolicy to use both the old and the new clusters. (9264f75)
gitdub at big.cc.gt.atl.ga.us
gitdub at big.cc.gt.atl.ga.us
Tue Jun 16 14:40:29 EDT 2015
Repository : https://github.com/mlpack/mlpack
On branch : master
Link : https://github.com/mlpack/mlpack/compare/4156e3f42f988e383c984e799f84453b6ca68acc...9264f7544f7c4d93ff735f00f35b0f5287abf59d
>---------------------------------------------------------------
commit 9264f7544f7c4d93ff735f00f35b0f5287abf59d
Author: Ryan Curtin <ryan at ratml.org>
Date: Tue Jun 16 14:40:15 2015 -0400
Refactor EmptyClusterPolicy to use both the old and the new clusters.
>---------------------------------------------------------------
9264f7544f7c4d93ff735f00f35b0f5287abf59d
src/mlpack/methods/kmeans/allow_empty_clusters.hpp | 8 ++++--
src/mlpack/methods/kmeans/kmeans_impl.hpp | 8 +++---
.../methods/kmeans/max_variance_new_cluster.hpp | 10 +++++---
.../kmeans/max_variance_new_cluster_impl.hpp | 30 ++++++++++++----------
4 files changed, 33 insertions(+), 23 deletions(-)
diff --git a/src/mlpack/methods/kmeans/allow_empty_clusters.hpp b/src/mlpack/methods/kmeans/allow_empty_clusters.hpp
index e231a23..2fb4534 100644
--- a/src/mlpack/methods/kmeans/allow_empty_clusters.hpp
+++ b/src/mlpack/methods/kmeans/allow_empty_clusters.hpp
@@ -30,7 +30,10 @@ class AllowEmptyClusters
* @tparam MatType Type of data (arma::mat or arma::spmat).
* @param data Dataset on which clustering is being performed.
* @param emptyCluster Index of cluster which is empty.
- * @param centroids Centroids of each cluster (one per column).
+ * @param oldCentroids Centroids of each cluster (one per column) at the start
+ * of the iteration.
+ * @param newCentroids Centroids of each cluster (one per column) at the end
+ * of the iteration.
* @param clusterCounts Number of points in each cluster.
* @param assignments Cluster assignments of each point.
* @param iteration Number of iteration.
@@ -41,7 +44,8 @@ class AllowEmptyClusters
static inline force_inline size_t EmptyCluster(
const MatType& /* data */,
const size_t /* emptyCluster */,
- const arma::mat& /* centroids */,
+ const arma::mat& /* oldCentroids */,
+ arma::mat& /* newCentroids */,
arma::Col<size_t>& /* clusterCounts */,
MetricType& /* metric */,
const size_t /* iteration */)
diff --git a/src/mlpack/methods/kmeans/kmeans_impl.hpp b/src/mlpack/methods/kmeans/kmeans_impl.hpp
index 7123fb4..16ba7e5 100644
--- a/src/mlpack/methods/kmeans/kmeans_impl.hpp
+++ b/src/mlpack/methods/kmeans/kmeans_impl.hpp
@@ -157,11 +157,11 @@ Cluster(const MatType& data,
{
Log::Info << "Cluster " << i << " is empty.\n";
if (iteration % 2 == 0)
- emptyClusterAction.EmptyCluster(data, i, centroidsOther, counts,
- metric, iteration);
+ emptyClusterAction.EmptyCluster(data, i, centroids, centroidsOther,
+ counts, metric, iteration);
else
- emptyClusterAction.EmptyCluster(data, i, centroids, counts, metric,
- iteration);
+ emptyClusterAction.EmptyCluster(data, i, centroidsOther, centroids,
+ counts, metric, iteration);
}
}
diff --git a/src/mlpack/methods/kmeans/max_variance_new_cluster.hpp b/src/mlpack/methods/kmeans/max_variance_new_cluster.hpp
index 8a18496..ad962fc 100644
--- a/src/mlpack/methods/kmeans/max_variance_new_cluster.hpp
+++ b/src/mlpack/methods/kmeans/max_variance_new_cluster.hpp
@@ -31,7 +31,10 @@ class MaxVarianceNewCluster
* @tparam MatType Type of data (arma::mat or arma::sp_mat).
* @param data Dataset on which clustering is being performed.
* @param emptyCluster Index of cluster which is empty.
- * @param centroids Centroids of each cluster (one per column).
+ * @param oldCentroids Centroids of each cluster (one per column), at the
+ * start of the iteration.
+ * @param newCentroids Centroids of each cluster (one per column), at the end
+ * of the iteration. This will be modified!
* @param clusterCounts Number of points in each cluster.
* @param assignments Cluster assignments of each point.
*
@@ -40,7 +43,8 @@ class MaxVarianceNewCluster
template<typename MetricType, typename MatType>
size_t EmptyCluster(const MatType& data,
const size_t emptyCluster,
- arma::mat& centroids,
+ const arma::mat& oldCentroids,
+ arma::mat& newCentroids,
arma::Col<size_t>& clusterCounts,
MetricType& metric,
const size_t iteration);
@@ -56,7 +60,7 @@ class MaxVarianceNewCluster
//! Called when we are on a new iteration.
template<typename MetricType, typename MatType>
void Precalculate(const MatType& data,
- arma::mat& centroids,
+ const arma::mat& oldCentroids,
arma::Col<size_t>& clusterCounts,
MetricType& metric);
};
diff --git a/src/mlpack/methods/kmeans/max_variance_new_cluster_impl.hpp b/src/mlpack/methods/kmeans/max_variance_new_cluster_impl.hpp
index 2e6c117..c87200d 100644
--- a/src/mlpack/methods/kmeans/max_variance_new_cluster_impl.hpp
+++ b/src/mlpack/methods/kmeans/max_variance_new_cluster_impl.hpp
@@ -19,14 +19,15 @@ namespace kmeans {
template<typename MetricType, typename MatType>
size_t MaxVarianceNewCluster::EmptyCluster(const MatType& data,
const size_t emptyCluster,
- arma::mat& centroids,
+ const arma::mat& oldCentroids,
+ arma::mat& newCentroids,
arma::Col<size_t>& clusterCounts,
MetricType& metric,
const size_t iteration)
{
// If necessary, calculate the variances and assignments.
if (iteration != this->iteration || assignments.n_elem != data.n_cols)
- Precalculate(data, centroids, clusterCounts, metric);
+ Precalculate(data, oldCentroids, clusterCounts, metric);
this->iteration = iteration;
// Now find the cluster with maximum variance.
@@ -41,7 +42,7 @@ size_t MaxVarianceNewCluster::EmptyCluster(const MatType& data,
if (assignments[i] == maxVarCluster)
{
const double distance = std::pow(metric.Evaluate(data.col(i),
- centroids.col(maxVarCluster)), 2.0);
+ newCentroids.col(maxVarCluster)), 2.0);
if (distance > maxDistance)
{
@@ -52,19 +53,20 @@ size_t MaxVarianceNewCluster::EmptyCluster(const MatType& data,
}
// Take that point and add it to the empty cluster.
- centroids.col(maxVarCluster) *= (double(clusterCounts[maxVarCluster]) /
+ newCentroids.col(maxVarCluster) *= (double(clusterCounts[maxVarCluster]) /
double(clusterCounts[maxVarCluster] - 1));
- centroids.col(maxVarCluster) -= (1.0 / (clusterCounts[maxVarCluster] - 1.0)) *
+ newCentroids.col(maxVarCluster) -= (1.0 / (clusterCounts[maxVarCluster] - 1.0)) *
arma::vec(data.col(furthestPoint));
clusterCounts[maxVarCluster]--;
clusterCounts[emptyCluster]++;
- centroids.col(emptyCluster) = arma::vec(data.col(furthestPoint));
+ newCentroids.col(emptyCluster) = arma::vec(data.col(furthestPoint));
assignments[furthestPoint] = emptyCluster;
// Modify the variances, as necessary.
variances[emptyCluster] = 0;
- variances[maxVarCluster] = (1.0 / (clusterCounts[maxVarCluster] - 1)) *
- (variances[maxVarCluster] - maxDistance);
+ // One has already been subtracted from clusterCounts[maxVarCluster].
+ variances[maxVarCluster] = (1.0 / (clusterCounts[maxVarCluster])) *
+ ((clusterCounts[maxVarCluster] + 1) * variances[maxVarCluster] - maxDistance);
// Output some debugging information.
Log::Debug << "Point " << furthestPoint << " assigned to empty cluster " <<
@@ -75,14 +77,14 @@ size_t MaxVarianceNewCluster::EmptyCluster(const MatType& data,
template<typename MetricType, typename MatType>
void MaxVarianceNewCluster::Precalculate(const MatType& data,
- arma::mat& centroids,
+ const arma::mat& oldCentroids,
arma::Col<size_t>& clusterCounts,
MetricType& metric)
{
// We have to calculate the variances of each cluster and the assignments of
// each point. This is most easily done by iterating through the entire
// dataset.
- variances.zeros(centroids.n_cols);
+ variances.zeros(oldCentroids.n_cols);
assignments.set_size(data.n_cols);
// Add the variance of each point's distance away from the cluster. I think
@@ -91,11 +93,11 @@ void MaxVarianceNewCluster::Precalculate(const MatType& data,
{
// Find the closest centroid to this point.
double minDistance = std::numeric_limits<double>::infinity();
- size_t closestCluster = centroids.n_cols; // Invalid value.
+ size_t closestCluster = oldCentroids.n_cols; // Invalid value.
- for (size_t j = 0; j < centroids.n_cols; j++)
+ for (size_t j = 0; j < oldCentroids.n_cols; j++)
{
- const double distance = metric.Evaluate(data.col(i), centroids.col(j));
+ const double distance = metric.Evaluate(data.col(i), oldCentroids.col(j));
if (distance < minDistance)
{
@@ -106,7 +108,7 @@ void MaxVarianceNewCluster::Precalculate(const MatType& data,
assignments[i] = closestCluster;
variances[closestCluster] += std::pow(metric.Evaluate(data.col(i),
- centroids.col(closestCluster)), 2.0);
+ oldCentroids.col(closestCluster)), 2.0);
}
// Divide by the number of points in the cluster to produce the variance,
More information about the mlpack-git mailing list