[mlpack-git] master: A prototype algorithm for k-means clustering, which probably works best when k is very large and so is N. (7fd57f1)

Thu Mar 5 22:01:37 EST 2015

Repository : https://github.com/mlpack/mlpack

On branch  : master
Link       : https://github.com/mlpack/mlpack/compare/904762495c039e345beba14c1142fd719b3bd50e...f94823c800ad6f7266995c700b1b630d5ffdcf40

>---------------------------------------------------------------

commit 7fd57f19e4994ad405c8ee867add506d0e11dd89
Author: Ryan Curtin <ryan at ratml.org>
Date:   Mon Oct 13 21:07:13 2014 +0000

    A prototype algorithm for k-means clustering, which probably works best when k
    is very large and so is N.


>---------------------------------------------------------------

7fd57f19e4994ad405c8ee867add506d0e11dd89
 src/mlpack/methods/kmeans/dtnn_kmeans.hpp      |  89 +++++++++++++++
 src/mlpack/methods/kmeans/dtnn_kmeans_impl.hpp | 144 +++++++++++++++++++++++++
 src/mlpack/methods/kmeans/kmeans_main.cpp      |   6 +-
 3 files changed, 238 insertions(+), 1 deletion(-)

diff --git a/src/mlpack/methods/kmeans/dtnn_kmeans.hpp b/src/mlpack/methods/kmeans/dtnn_kmeans.hpp
new file mode 100644
index 0000000..4dacd6e
--- /dev/null
+++ b/src/mlpack/methods/kmeans/dtnn_kmeans.hpp
@@ -0,0 +1,89 @@
+/**
+ * @file dtnn_kmeans.hpp
+ * @author Ryan Curtin
+ *
+ * An implementation of a Lloyd iteration which uses dual-tree nearest neighbor
+ * search as a black box.  The conditions under which this will perform best are
+ * probably limited to the case where k is close to the number of points in the
+ * dataset, and the number of iterations of the k-means algorithm will be few.
+ */
+#ifndef __MLPACK_METHODS_KMEANS_DTNN_KMEANS_HPP
+#define __MLPACK_METHODS_KMEANS_DTNN_KMEANS_HPP
+
+#include <mlpack/core/tree/binary_space_tree.hpp>
+#include <mlpack/methods/neighbor_search/neighbor_search.hpp>
+
+namespace mlpack {
+namespace kmeans {
+
+/**
+ * An algorithm for an exact Lloyd iteration which simply uses dual-tree
+ * nearest-neighbor search to find the nearest centroid for each point in the
+ * dataset.  The conditions under which this will perform best are probably
+ * limited to the case where k is close to the number of points in the dataset,
+ * and the number of iterations of the k-means algorithm will be few.
+ */
+template<
+    typename MetricType,
+    typename MatType,
+    typename TreeType = tree::BinarySpaceTree<bound::HRectBound<2>,
+        neighbor::NeighborSearchStat<neighbor::NearestNeighborSort> > >
+class DTNNKMeans
+{
+ public:
+  /**
+   * Construct the DTNNKMeans object, which will construct a tree on the points.
+   */
+  DTNNKMeans(const MatType& dataset, MetricType& metric);
+
+  /**
+   * Delete the tree constructed by the DTNNKMeans object.
+   */
+  ~DTNNKMeans();
+
+  /**
+   * Run a single iteration of the dual-tree nearest neighbor algorithm for
+   * k-means, updating the given centroids into the newCentroids matrix.
+   *
+   * @param centroids Current cluster centroids.
+   * @param newCentroids New cluster centroids.
+   * @param counts Current counts, to be overwritten with new counts.
+   */
+  double Iterate(const arma::mat& centroids,
+                 arma::mat& newCentroids,
+                 arma::Col<size_t>& counts);
+
+  //! Return the number of distance calculations.
+  size_t DistanceCalculations() const { return distanceCalculations; }
+  //! Modify the number of distance calculations.
+  size_t& DistanceCalculations() { return distanceCalculations; }
+
+ private:
+  //! The original dataset reference.
+  const MatType& datasetOrig; // Maybe not necessary.
+  //! The dataset we are using.
+  const MatType& dataset;
+  //! A copy of the dataset, if necessary.
+  MatType datasetCopy;
+  //! The metric.
+  MetricType metric;
+
+  //! The tree built on the points.
+  TreeType* tree;
+
+  //! Track distance calculations.
+  size_t distanceCalculations;
+
+  //! Update the bounds in the tree before the next iteration.
+  void UpdateTree(TreeType& node, const double tolerance);
+};
+
+template<typename MetricType, typename MatType>
+using DefaultDTNNKMeans = DTNNKMeans<MetricType, MatType>;
+
+} // namespace kmeans
+} // namespace mlpack
+
+#include "dtnn_kmeans_impl.hpp"
+
+#endif
diff --git a/src/mlpack/methods/kmeans/dtnn_kmeans_impl.hpp b/src/mlpack/methods/kmeans/dtnn_kmeans_impl.hpp
new file mode 100644
index 0000000..90625a7
--- /dev/null
+++ b/src/mlpack/methods/kmeans/dtnn_kmeans_impl.hpp
@@ -0,0 +1,144 @@
+/**
+ * @file dtnn_kmeans_impl.hpp
+ * @author Ryan Curtin
+ *
+ * An implementation of a Lloyd iteration which uses dual-tree nearest neighbor
+ * search as a black box.  The conditions under which this will perform best are
+ * probably limited to the case where k is close to the number of points in the
+ * dataset, and the number of iterations of the k-means algorithm will be few.
+ */
+#ifndef __MLPACK_METHODS_KMEANS_DTNN_KMEANS_IMPL_HPP
+#define __MLPACK_METHODS_KMEANS_DTNN_KMEANS_IMPL_HPP
+
+namespace mlpack {
+namespace kmeans {
+
+template<typename MetricType, typename MatType, typename TreeType>
+DTNNKMeans<MetricType, MatType, TreeType>::DTNNKMeans(const MatType& dataset,
+                                                      MetricType& metric) :
+    datasetOrig(dataset),
+    dataset(tree::TreeTraits<TreeType>::RearrangesDataset ? datasetCopy :
+        datasetOrig),
+    metric(metric),
+    distanceCalculations(0)
+{
+  Timer::Start("tree_building");
+
+  // Copy the dataset, if necessary.
+  if (tree::TreeTraits<TreeType>::RearrangesDataset)
+    datasetCopy = datasetOrig;
+
+  // Now build the tree.  We don't need any mappings.
+  tree = new TreeType(const_cast<typename TreeType::Mat&>(this->dataset));
+
+  Timer::Stop("tree_building");
+}
+
+template<typename MetricType, typename MatType, typename TreeType>
+DTNNKMeans<MetricType, MatType, TreeType>::~DTNNKMeans()
+{
+  if (tree)
+    delete tree;
+}
+
+// Run a single iteration.
+template<typename MetricType, typename MatType, typename TreeType>
+double DTNNKMeans<MetricType, MatType, TreeType>::Iterate(
+    const arma::mat& centroids,
+    arma::mat& newCentroids,
+    arma::Col<size_t>& counts)
+{
+  newCentroids.zeros(centroids.n_rows, centroids.n_cols);
+  counts.zeros(centroids.n_cols);
+  arma::mat centroidsCopy;
+
+  // Build a tree on the centroids.
+  std::vector<size_t> oldFromNewCentroids;
+  TreeType* centroidTree;
+  if (tree::TreeTraits<TreeType>::RearrangesDataset)
+  {
+    // Manually set leaf size of 2.  This may not always be appropriate.
+    centroidsCopy = centroids;
+    centroidTree = new TreeType(centroidsCopy, oldFromNewCentroids, 2);
+  }
+  else
+  {
+    centroidTree = new TreeType(centroidsCopy);
+  }
+
+  typedef neighbor::NeighborSearch<neighbor::NearestNeighborSort, MetricType,
+      TreeType> AllkNNType;
+  AllkNNType allknn(centroidTree, tree,
+      (tree::TreeTraits<TreeType>::RearrangesDataset) ? centroidsCopy :
+      centroids, dataset, false, metric);
+
+  // This is a lot of overhead.  We don't need the distances.
+  arma::mat distances;
+  arma::Mat<size_t> assignments;
+  allknn.Search(1, assignments, distances);
+  distanceCalculations += allknn.BaseCases() + allknn.Scores();
+
+  // From the assignments, calculate the new centroids and counts.
+  for (size_t i = 0; i < dataset.n_cols; ++i)
+  {
+    if (tree::TreeTraits<TreeType>::RearrangesDataset)
+    {
+      newCentroids.col(oldFromNewCentroids[assignments[i]]) += dataset.col(i);
+      ++counts(oldFromNewCentroids[assignments[i]]);
+    }
+    else
+    {
+      newCentroids.col(assignments[i]) += dataset.col(i);
+      ++counts(i);
+    }
+  }
+
+  // Now, calculate how far the clusters moved, after normalizing them.
+  double residual = 0.0;
+  double maxMovement = 0.0;
+  for (size_t c = 0; c < centroids.n_cols; ++c)
+  {
+    if (counts[c] == 0)
+    {
+      newCentroids.col(c).fill(DBL_MAX); // Should have happened anyway I think.
+    }
+    else
+    {
+      newCentroids.col(c) /= counts(c);
+      const double movement = metric.Evaluate(centroids.col(c),
+          newCentroids.col(c));
+      residual += std::pow(movement, 2.0);
+
+      if (movement > maxMovement)
+        maxMovement = movement;
+    }
+  }
+  distanceCalculations += centroids.n_cols;
+
+  UpdateTree(*tree, maxMovement);
+
+  delete centroidTree;
+
+  return std::sqrt(residual);
+}
+
+template<typename MetricType, typename MatType, typename TreeType>
+void DTNNKMeans<MetricType, MatType, TreeType>::UpdateTree(
+    TreeType& node,
+    const double tolerance)
+{
+  if (node.Stat().FirstBound() != DBL_MAX)
+    node.Stat().FirstBound() += tolerance;
+  if (node.Stat().SecondBound() != DBL_MAX)
+    node.Stat().SecondBound() += tolerance;
+  if (node.Stat().Bound() != DBL_MAX)
+    node.Stat().Bound() += tolerance;
+
+  for (size_t i = 0; i < node.NumChildren(); ++i)
+    UpdateTree(node.Child(i), tolerance);
+}
+
+} // namespace kmeans
+} // namespace mlpack
+
+#endif
diff --git a/src/mlpack/methods/kmeans/kmeans_main.cpp b/src/mlpack/methods/kmeans/kmeans_main.cpp
index 1a4dcd7..039925c 100644
--- a/src/mlpack/methods/kmeans/kmeans_main.cpp
+++ b/src/mlpack/methods/kmeans/kmeans_main.cpp
@@ -12,6 +12,7 @@
 #include "elkan_kmeans.hpp"
 #include "hamerly_kmeans.hpp"
 #include "pelleg_moore_kmeans.hpp"
+#include "dtnn_kmeans.hpp"
 
 using namespace mlpack;
 using namespace mlpack::kmeans;
@@ -75,7 +76,7 @@ PARAM_DOUBLE("percentage", "Percentage of dataset to use for each refined start"
     " sampling (use when --refined_start is specified).", "p", 0.02);
 
 PARAM_STRING("algorithm", "Algorithm to use for the Lloyd iteration ('naive', "
-    "'pelleg-moore', 'elkan', or 'hamerly').", "a", "naive");
+    "'pelleg-moore', 'elkan', 'hamerly', or 'dtnn').", "a", "naive");
 
 // Given the type of initial partition policy, figure out the empty cluster
 // policy and run k-means.
@@ -150,6 +151,9 @@ void FindLloydStepType(const InitialPartitionPolicy& ipp)
   else if (algorithm == "pelleg-moore")
     RunKMeans<InitialPartitionPolicy, EmptyClusterPolicy,
         PellegMooreKMeans>(ipp);
+  else if (algorithm == "dtnn")
+    RunKMeans<InitialPartitionPolicy, EmptyClusterPolicy,
+        DefaultDTNNKMeans>(ipp);
   else if (algorithm == "naive")
     RunKMeans<InitialPartitionPolicy, EmptyClusterPolicy, NaiveKMeans>(ipp);
   else