[mlpack-git] master: Add SampleInitialization and different type of initialization. (a38608b)

gitdub at mlpack.org gitdub at mlpack.org
Tue Apr 12 10:43:52 EDT 2016


Repository : https://github.com/mlpack/mlpack
On branch  : master
Link       : https://github.com/mlpack/mlpack/compare/eeba6bdc50ad4d785cb6880edbaba78173036ca6...8d77f4231046703d5c0c05ed4795458f98267968

>---------------------------------------------------------------

commit a38608bea22a065325291bbb3f2ec5fe8d40e7fc
Author: Ryan Curtin <ryan at ratml.org>
Date:   Tue Apr 12 14:42:05 2016 +0000

    Add SampleInitialization and different type of initialization.


>---------------------------------------------------------------

a38608bea22a065325291bbb3f2ec5fe8d40e7fc
 src/mlpack/methods/kmeans/CMakeLists.txt           |  1 +
 src/mlpack/methods/kmeans/kmeans.hpp               | 11 +--
 src/mlpack/methods/kmeans/kmeans_impl.hpp          | 98 ++++++++++++++++++----
 .../methods/kmeans/sample_initialization.hpp       | 49 +++++++++++
 4 files changed, 138 insertions(+), 21 deletions(-)

diff --git a/src/mlpack/methods/kmeans/CMakeLists.txt b/src/mlpack/methods/kmeans/CMakeLists.txt
index 754ed4f..168373c 100644
--- a/src/mlpack/methods/kmeans/CMakeLists.txt
+++ b/src/mlpack/methods/kmeans/CMakeLists.txt
@@ -25,6 +25,7 @@ set(SOURCES
   random_partition.hpp
   refined_start.hpp
   refined_start_impl.hpp
+  sample_initialization.hpp
 )
 
 # Add directory name to sources.
diff --git a/src/mlpack/methods/kmeans/kmeans.hpp b/src/mlpack/methods/kmeans/kmeans.hpp
index 8acc9f3..94a0a15 100644
--- a/src/mlpack/methods/kmeans/kmeans.hpp
+++ b/src/mlpack/methods/kmeans/kmeans.hpp
@@ -10,7 +10,7 @@
 #include <mlpack/core.hpp>
 
 #include <mlpack/core/metrics/lmetric.hpp>
-#include "random_partition.hpp"
+#include "sample_initialization.hpp"
 #include "max_variance_new_cluster.hpp"
 #include "naive_kmeans.hpp"
 
@@ -47,8 +47,9 @@ namespace kmeans /** K-Means clustering. */ {
  * @tparam MetricType The distance metric to use for this KMeans; see
  *     metric::LMetric for an example.
  * @tparam InitialPartitionPolicy Initial partitioning policy; must implement a
- *     default constructor and 'void Cluster(const arma::mat&, const size_t,
- *     arma::Row<size_t>&)'.
+ *     default constructor and either 'void Cluster(const arma::mat&, const
+ *     size_t, arma::Row<size_t>&)' or 'void Cluster(const arma::mat&, const
+ *     size_t, arma::mat&)'.
  * @tparam EmptyClusterPolicy Policy for what to do on an empty cluster; must
  *     implement a default constructor and 'void EmptyCluster(const arma::mat&
  *     data, const size_t emptyCluster, const arma::mat& oldCentroids,
@@ -56,11 +57,11 @@ namespace kmeans /** K-Means clustering. */ {
  *     const size_t iteration)'.
  * @tparam LloydStepType Implementation of single Lloyd step to use.
  *
- * @see RandomPartition, RefinedStart, AllowEmptyClusters,
+ * @see RandomPartition, SampleInitialization, RefinedStart, AllowEmptyClusters,
  *      MaxVarianceNewCluster, NaiveKMeans, ElkanKMeans
  */
 template<typename MetricType = metric::EuclideanDistance,
-         typename InitialPartitionPolicy = RandomPartition,
+         typename InitialPartitionPolicy = SampleInitialization,
          typename EmptyClusterPolicy = MaxVarianceNewCluster,
          template<class, class> class LloydStepType = NaiveKMeans,
          typename MatType = arma::mat>
diff --git a/src/mlpack/methods/kmeans/kmeans_impl.hpp b/src/mlpack/methods/kmeans/kmeans_impl.hpp
index 635133e..b7f14bd 100644
--- a/src/mlpack/methods/kmeans/kmeans_impl.hpp
+++ b/src/mlpack/methods/kmeans/kmeans_impl.hpp
@@ -8,11 +8,72 @@
 #include "kmeans.hpp"
 
 #include <mlpack/core/metrics/lmetric.hpp>
+#include <mlpack/core/util/sfinae_utility.hpp>
 
 namespace mlpack {
 namespace kmeans {
 
 /**
+ * This gives us a GivesCentroids object that we can use to tell whether or not
+ * an InitialPartitionPolicy returns centroids or point assignments.
+ */
+HAS_MEM_FUNC(Cluster, GivesCentroidsCheck);
+
+/**
+ * 'value' is true if the InitialPartitionPolicy class has a member
+ * Cluster(const arma::mat& data, const size_t clusters, arma::mat& centroids).
+ */
+template<typename InitialPartitionPolicy>
+struct GivesCentroids
+{
+  static const bool value =
+    // Non-static version.
+    GivesCentroidsCheck<InitialPartitionPolicy,
+        void(InitialPartitionPolicy::*)(const arma::mat&,
+                                        const size_t,
+                                        arma::mat&)>::value ||
+    // Static version.
+    GivesCentroidsCheck<InitialPartitionPolicy,
+        void(*)(const arma::mat&, const size_t, arma::mat&)>::value;
+};
+
+//! Call the initial partition policy, if it returns assignments.  This returns
+//! 'true' to indicate that assignments were given.
+template<typename MatType,
+         typename InitialPartitionPolicy>
+bool GetInitialAssignmentsOrCentroids(
+    InitialPartitionPolicy& ipp,
+    const MatType& data,
+    const size_t clusters,
+    arma::Row<size_t>& assignments,
+    arma::mat& /* centroids */,
+    const typename boost::disable_if_c<
+        GivesCentroids<InitialPartitionPolicy>::value == true>::type* = 0)
+{
+  ipp.Cluster(data, clusters, assignments);
+
+  return true;
+}
+
+//! Call the initial partition policy, if it returns centroids.  This returns
+//! 'false' to indicate that assignments were not given.
+template<typename MatType,
+         typename InitialPartitionPolicy>
+bool GetInitialAssignmentsOrCentroids(
+    InitialPartitionPolicy& ipp,
+    const MatType& data,
+    const size_t clusters,
+    arma::Row<size_t>& /* assignments */,
+    arma::mat& centroids,
+    const typename boost::enable_if_c<
+        GivesCentroids<InitialPartitionPolicy>::value == true>::type* = 0)
+{
+  ipp.Cluster(data, clusters, centroids);
+
+  return false;
+}
+
+/**
  * Construct the K-Means object.
  */
 template<typename MetricType,
@@ -110,25 +171,30 @@ Cluster(const MatType& data,
   // the initial centroids.
   if (!initialGuess)
   {
-    // The partitioner gives assignments, so we need to calculate centroids from
-    // those assignments.  This is probably not the most efficient way to do
-    // this, so maybe refactoring should be considered in the future.
+    // The GetInitialAssignmentsOrCentroids() function will call the appropriate
+    // function in the InitialPartitionPolicy to return either assignments or
+    // centroids.  We prefer centroids, but if assignments are returned, then we
+    // have to calculate the initial centroids for the first iteration.
     arma::Row<size_t> assignments;
-    partitioner.Cluster(data, clusters, assignments);
-
-    // Calculate initial centroids.
-    arma::Row<size_t> counts;
-    counts.zeros(clusters);
-    centroids.zeros(data.n_rows, clusters);
-    for (size_t i = 0; i < data.n_cols; ++i)
+    bool gotAssignments = GetInitialAssignmentsOrCentroids(partitioner, data,
+        clusters, assignments, centroids);
+    if (gotAssignments)
     {
-      centroids.col(assignments[i]) += arma::vec(data.col(i));
-      counts[assignments[i]]++;
-    }
+      // The partitioner gives assignments, so we need to calculate centroids
+      // from those assignments.
+      arma::Row<size_t> counts;
+      counts.zeros(clusters);
+      centroids.zeros(data.n_rows, clusters);
+      for (size_t i = 0; i < data.n_cols; ++i)
+      {
+        centroids.col(assignments[i]) += arma::vec(data.col(i));
+        counts[assignments[i]]++;
+      }
 
-    for (size_t i = 0; i < clusters; ++i)
-      if (counts[i] != 0)
-        centroids.col(i) /= counts[i];
+      for (size_t i = 0; i < clusters; ++i)
+        if (counts[i] != 0)
+          centroids.col(i) /= counts[i];
+    }
   }
 
   // Counts of points in each cluster.
diff --git a/src/mlpack/methods/kmeans/sample_initialization.hpp b/src/mlpack/methods/kmeans/sample_initialization.hpp
new file mode 100644
index 0000000..0d967b3
--- /dev/null
+++ b/src/mlpack/methods/kmeans/sample_initialization.hpp
@@ -0,0 +1,49 @@
+/**
+ * @file sample_initialization.hpp
+ * @author Ryan Curtin
+ *
+ * In order to construct initial centroids, randomly sample points from the
+ * dataset.  This tends to give better results than the RandomPartition
+ * strategy.
+ */
+#ifndef __MLPACK_METHODS_KMEANS_SAMPLE_INITIALIZATION_HPP
+#define __MLPACK_METHODS_KMEANS_SAMPLE_INITIALIZATION_HPP
+
+#include <mlpack/core.hpp>
+
+namespace mlpack {
+namespace kmeans {
+
+class SampleInitialization
+{
+ public:
+  //! Empty constructor, required by the InitialPartitionPolicy type definition.
+  SampleInitialization() { }
+
+  /**
+   * Initialize the centroids matrix by randomly sampling points from the data
+   * matrix.
+   *
+   * @param data Dataset.
+   * @param clusters Number of clusters.
+   * @param centroids Matrix to put initial centroids into.
+   */
+  template<typename MatType>
+  inline static void Cluster(const MatType& data,
+                             const size_t clusters,
+                             arma::mat& centroids)
+  {
+    centroids.set_size(data.n_rows, clusters);
+    for (size_t i = 0; i < clusters; ++i)
+    {
+      // Randomly sample a point.
+      const size_t index = math::RandInt(0, data.n_cols);
+      centroids.col(i) = data.col(index);
+    }
+  }
+};
+
+} // namespace kmeans
+} // namespace mlpack
+
+#endif




More information about the mlpack-git mailing list