[mlpack-git] master: Refactor to precalculate variances. (35c0f04)
gitdub at big.cc.gt.atl.ga.us
gitdub at big.cc.gt.atl.ga.us
Mon Jun 1 11:16:09 EDT 2015
Repository : https://github.com/mlpack/mlpack
On branch : master
Link : https://github.com/mlpack/mlpack/compare/803224f64f6b6490c3cb8aafd528b1db069ef2f1...7011dd89f6a9cf8cf7b66e1b42c9147b606551c2
>---------------------------------------------------------------
commit 35c0f041b0ade534eba7c60e82890d32fa491c2e
Author: Ryan Curtin <ryan at ratml.org>
Date: Mon Jun 1 11:15:17 2015 -0400
Refactor to precalculate variances.
>---------------------------------------------------------------
35c0f041b0ade534eba7c60e82890d32fa491c2e
src/mlpack/methods/kmeans/allow_empty_clusters.hpp | 8 +-
src/mlpack/methods/kmeans/kmeans_impl.hpp | 5 +-
.../methods/kmeans/max_variance_new_cluster.hpp | 32 +++++--
.../kmeans/max_variance_new_cluster_impl.hpp | 98 +++++++++++++---------
4 files changed, 90 insertions(+), 53 deletions(-)
diff --git a/src/mlpack/methods/kmeans/allow_empty_clusters.hpp b/src/mlpack/methods/kmeans/allow_empty_clusters.hpp
index a8b0422..e231a23 100644
--- a/src/mlpack/methods/kmeans/allow_empty_clusters.hpp
+++ b/src/mlpack/methods/kmeans/allow_empty_clusters.hpp
@@ -33,6 +33,7 @@ class AllowEmptyClusters
* @param centroids Centroids of each cluster (one per column).
* @param clusterCounts Number of points in each cluster.
* @param assignments Cluster assignments of each point.
+ * @param iteration Number of iteration.
*
* @return Number of points changed (0).
*/
@@ -42,14 +43,15 @@ class AllowEmptyClusters
const size_t /* emptyCluster */,
const arma::mat& /* centroids */,
arma::Col<size_t>& /* clusterCounts */,
- MetricType& /* metric */)
+ MetricType& /* metric */,
+ const size_t /* iteration */)
{
// Empty clusters are okay! Do nothing.
return 0;
}
};
-}; // namespace kmeans
-}; // namespace mlpack
+} // namespace kmeans
+} // namespace mlpack
#endif
diff --git a/src/mlpack/methods/kmeans/kmeans_impl.hpp b/src/mlpack/methods/kmeans/kmeans_impl.hpp
index 9bdf29c..862010f 100644
--- a/src/mlpack/methods/kmeans/kmeans_impl.hpp
+++ b/src/mlpack/methods/kmeans/kmeans_impl.hpp
@@ -158,9 +158,10 @@ Cluster(const MatType& data,
Log::Info << "Cluster " << i << " is empty.\n";
if (iteration % 2 == 0)
emptyClusterAction.EmptyCluster(data, i, centroidsOther, counts,
- metric);
+ metric, iteration);
else
- emptyClusterAction.EmptyCluster(data, i, centroids, counts, metric);
+ emptyClusterAction.EmptyCluster(data, i, centroids, counts, metric,
+ iteration);
}
}
diff --git a/src/mlpack/methods/kmeans/max_variance_new_cluster.hpp b/src/mlpack/methods/kmeans/max_variance_new_cluster.hpp
index 1bd7753..8a18496 100644
--- a/src/mlpack/methods/kmeans/max_variance_new_cluster.hpp
+++ b/src/mlpack/methods/kmeans/max_variance_new_cluster.hpp
@@ -22,7 +22,7 @@ class MaxVarianceNewCluster
{
public:
//! Default constructor required by EmptyClusterPolicy.
- MaxVarianceNewCluster() { }
+ MaxVarianceNewCluster() : iteration(size_t(-1)) { }
/**
* Take the point furthest from the centroid of the cluster with maximum
@@ -38,15 +38,31 @@ class MaxVarianceNewCluster
* @return Number of points changed.
*/
template<typename MetricType, typename MatType>
- static size_t EmptyCluster(const MatType& data,
- const size_t emptyCluster,
- arma::mat& centroids,
- arma::Col<size_t>& clusterCounts,
- MetricType& metric);
+ size_t EmptyCluster(const MatType& data,
+ const size_t emptyCluster,
+ arma::mat& centroids,
+ arma::Col<size_t>& clusterCounts,
+ MetricType& metric,
+ const size_t iteration);
+
+ private:
+ //! Index of iteration for which variance is cached.
+ size_t iteration;
+ //! Cached variances for each cluster.
+ arma::vec variances;
+ //! Cached assignments for each point.
+ arma::Col<size_t> assignments;
+
+ //! Called when we are on a new iteration.
+ template<typename MetricType, typename MatType>
+ void Precalculate(const MatType& data,
+ arma::mat& centroids,
+ arma::Col<size_t>& clusterCounts,
+ MetricType& metric);
};
-}; // namespace kmeans
-}; // namespace mlpack
+} // namespace kmeans
+} // namespace mlpack
// Include implementation.
#include "max_variance_new_cluster_impl.hpp"
diff --git a/src/mlpack/methods/kmeans/max_variance_new_cluster_impl.hpp b/src/mlpack/methods/kmeans/max_variance_new_cluster_impl.hpp
index 339574e..7bc4f76 100644
--- a/src/mlpack/methods/kmeans/max_variance_new_cluster_impl.hpp
+++ b/src/mlpack/methods/kmeans/max_variance_new_cluster_impl.hpp
@@ -21,45 +21,13 @@ size_t MaxVarianceNewCluster::EmptyCluster(const MatType& data,
const size_t emptyCluster,
arma::mat& centroids,
arma::Col<size_t>& clusterCounts,
- MetricType& metric)
+ MetricType& metric,
+ const size_t iteration)
{
- // First, we need to find the cluster with maximum variance (by which I mean
- // the sum of the covariance matrices).
- arma::vec variances;
- variances.zeros(clusterCounts.n_elem); // Start with 0.
- arma::Col<size_t> assignments(data.n_cols);
-
- // Add the variance of each point's distance away from the cluster. I think
- // this is the sensible thing to do.
- for (size_t i = 0; i < data.n_cols; ++i)
- {
- // Find the closest centroid to this point.
- double minDistance = std::numeric_limits<double>::infinity();
- size_t closestCluster = centroids.n_cols; // Invalid value.
-
- for (size_t j = 0; j < centroids.n_cols; j++)
- {
- const double distance = metric.Evaluate(data.col(i), centroids.col(j));
-
- if (distance < minDistance)
- {
- minDistance = distance;
- closestCluster = j;
- }
- }
-
- assignments[i] = closestCluster;
- variances[closestCluster] += metric.Evaluate(data.col(i),
- centroids.col(closestCluster));
- }
-
- // Divide by the number of points in the cluster to produce the variance.
- // Although a -nan will occur here for the empty cluster(s), this doesn't
- // matter because variances.max() won't pick it up. If the number of points
- // in the cluster is 1, we ensure that cluster is not selected by forcing the
- // variance to 0.
- for (size_t i = 0; i < clusterCounts.n_elem; ++i)
- variances[i] /= (clusterCounts[i] == 1) ? DBL_MAX : clusterCounts[i];
+ // If necessary, calculate the variances and assignments.
+ if (iteration != this->iteration || assignments.n_elem != data.n_cols)
+ Precalculate(data, centroids, clusterCounts, metric);
+ this->iteration = iteration;
// Now find the cluster with maximum variance.
arma::uword maxVarCluster;
@@ -72,8 +40,8 @@ size_t MaxVarianceNewCluster::EmptyCluster(const MatType& data,
{
if (assignments[i] == maxVarCluster)
{
- const double distance = metric.Evaluate(data.col(i),
- centroids.col(maxVarCluster));
+ const double distance = std::pow(metric.Evaluate(data.col(i),
+ centroids.col(maxVarCluster)), 2.0);
if (distance > maxDistance)
{
@@ -93,6 +61,11 @@ size_t MaxVarianceNewCluster::EmptyCluster(const MatType& data,
centroids.col(emptyCluster) = arma::vec(data.col(furthestPoint));
assignments[furthestPoint] = emptyCluster;
+ // Modify the variances, as necessary.
+ variances[emptyCluster] = 0;
+ variances[maxVarCluster] = (1.0 / (clusterCounts[maxVarCluster] - 1)) *
+ (variances[maxVarCluster] - maxDistance);
+
// Output some debugging information.
Log::Debug << "Point " << furthestPoint << " assigned to empty cluster " <<
emptyCluster << ".\n";
@@ -100,6 +73,51 @@ size_t MaxVarianceNewCluster::EmptyCluster(const MatType& data,
return 1; // We only changed one point.
}
+template<typename MetricType, typename MatType>
+void MaxVarianceNewCluster::Precalculate(const MatType& data,
+ arma::mat& centroids,
+ arma::Col<size_t>& clusterCounts,
+ MetricType& metric)
+{
+ // We have to calculate the variances of each cluster and the assignments of
+ // each point. This is most easily done by iterating through the entire
+ // dataset.
+ variances.zeros(centroids.n_cols);
+ assignments.set_size(data.n_cols);
+
+ // Add the variance of each point's distance away from the cluster. I think
+ // this is the sensible thing to do.
+ for (size_t i = 0; i < data.n_cols; ++i)
+ {
+ // Find the closest centroid to this point.
+ double minDistance = std::numeric_limits<double>::infinity();
+ size_t closestCluster = centroids.n_cols; // Invalid value.
+
+ for (size_t j = 0; j < centroids.n_cols; j++)
+ {
+ const double distance = metric.Evaluate(data.col(i), centroids.col(j));
+
+ if (distance < minDistance)
+ {
+ minDistance = distance;
+ closestCluster = j;
+ }
+ }
+
+ assignments[i] = closestCluster;
+ variances[closestCluster] += std::pow(metric.Evaluate(data.col(i),
+ centroids.col(closestCluster)), 2.0);
+ }
+
+ // Divide by the number of points in the cluster to produce the variance.
+ // Although a -nan will occur here for the empty cluster(s), this doesn't
+ // matter because variances.max() won't pick it up. If the number of points
+ // in the cluster is 1, we ensure that cluster is not selected by forcing the
+ // variance to 0.
+ for (size_t i = 0; i < clusterCounts.n_elem; ++i)
+ variances[i] /= (clusterCounts[i] == 1) ? DBL_MAX : clusterCounts[i];
+}
+
}; // namespace kmeans
}; // namespace mlpack
More information about the mlpack-git
mailing list