[mlpack-git] master: Add SampleInitialization and different type of initialization. (a38608b)
gitdub at mlpack.org
gitdub at mlpack.org
Tue Apr 12 10:43:52 EDT 2016
Repository : https://github.com/mlpack/mlpack
On branch : master
Link : https://github.com/mlpack/mlpack/compare/eeba6bdc50ad4d785cb6880edbaba78173036ca6...8d77f4231046703d5c0c05ed4795458f98267968
>---------------------------------------------------------------
commit a38608bea22a065325291bbb3f2ec5fe8d40e7fc
Author: Ryan Curtin <ryan at ratml.org>
Date: Tue Apr 12 14:42:05 2016 +0000
Add SampleInitialization and different type of initialization.
>---------------------------------------------------------------
a38608bea22a065325291bbb3f2ec5fe8d40e7fc
src/mlpack/methods/kmeans/CMakeLists.txt | 1 +
src/mlpack/methods/kmeans/kmeans.hpp | 11 +--
src/mlpack/methods/kmeans/kmeans_impl.hpp | 98 ++++++++++++++++++----
.../methods/kmeans/sample_initialization.hpp | 49 +++++++++++
4 files changed, 138 insertions(+), 21 deletions(-)
diff --git a/src/mlpack/methods/kmeans/CMakeLists.txt b/src/mlpack/methods/kmeans/CMakeLists.txt
index 754ed4f..168373c 100644
--- a/src/mlpack/methods/kmeans/CMakeLists.txt
+++ b/src/mlpack/methods/kmeans/CMakeLists.txt
@@ -25,6 +25,7 @@ set(SOURCES
random_partition.hpp
refined_start.hpp
refined_start_impl.hpp
+ sample_initialization.hpp
)
# Add directory name to sources.
diff --git a/src/mlpack/methods/kmeans/kmeans.hpp b/src/mlpack/methods/kmeans/kmeans.hpp
index 8acc9f3..94a0a15 100644
--- a/src/mlpack/methods/kmeans/kmeans.hpp
+++ b/src/mlpack/methods/kmeans/kmeans.hpp
@@ -10,7 +10,7 @@
#include <mlpack/core.hpp>
#include <mlpack/core/metrics/lmetric.hpp>
-#include "random_partition.hpp"
+#include "sample_initialization.hpp"
#include "max_variance_new_cluster.hpp"
#include "naive_kmeans.hpp"
@@ -47,8 +47,9 @@ namespace kmeans /** K-Means clustering. */ {
* @tparam MetricType The distance metric to use for this KMeans; see
* metric::LMetric for an example.
* @tparam InitialPartitionPolicy Initial partitioning policy; must implement a
- * default constructor and 'void Cluster(const arma::mat&, const size_t,
- * arma::Row<size_t>&)'.
+ * default constructor and either 'void Cluster(const arma::mat&, const
+ * size_t, arma::Row<size_t>&)' or 'void Cluster(const arma::mat&, const
+ * size_t, arma::mat&)'.
* @tparam EmptyClusterPolicy Policy for what to do on an empty cluster; must
* implement a default constructor and 'void EmptyCluster(const arma::mat&
* data, const size_t emptyCluster, const arma::mat& oldCentroids,
@@ -56,11 +57,11 @@ namespace kmeans /** K-Means clustering. */ {
* const size_t iteration)'.
* @tparam LloydStepType Implementation of single Lloyd step to use.
*
- * @see RandomPartition, RefinedStart, AllowEmptyClusters,
+ * @see RandomPartition, SampleInitialization, RefinedStart, AllowEmptyClusters,
* MaxVarianceNewCluster, NaiveKMeans, ElkanKMeans
*/
template<typename MetricType = metric::EuclideanDistance,
- typename InitialPartitionPolicy = RandomPartition,
+ typename InitialPartitionPolicy = SampleInitialization,
typename EmptyClusterPolicy = MaxVarianceNewCluster,
template<class, class> class LloydStepType = NaiveKMeans,
typename MatType = arma::mat>
diff --git a/src/mlpack/methods/kmeans/kmeans_impl.hpp b/src/mlpack/methods/kmeans/kmeans_impl.hpp
index 635133e..b7f14bd 100644
--- a/src/mlpack/methods/kmeans/kmeans_impl.hpp
+++ b/src/mlpack/methods/kmeans/kmeans_impl.hpp
@@ -8,11 +8,72 @@
#include "kmeans.hpp"
#include <mlpack/core/metrics/lmetric.hpp>
+#include <mlpack/core/util/sfinae_utility.hpp>
namespace mlpack {
namespace kmeans {
/**
+ * This gives us a GivesCentroids object that we can use to tell whether or not
+ * an InitialPartitionPolicy returns centroids or point assignments.
+ */
+HAS_MEM_FUNC(Cluster, GivesCentroidsCheck);
+
+/**
+ * 'value' is true if the InitialPartitionPolicy class has a member
+ * Cluster(const arma::mat& data, const size_t clusters, arma::mat& centroids).
+ */
+template<typename InitialPartitionPolicy>
+struct GivesCentroids
+{
+ static const bool value =
+ // Non-static version.
+ GivesCentroidsCheck<InitialPartitionPolicy,
+ void(InitialPartitionPolicy::*)(const arma::mat&,
+ const size_t,
+ arma::mat&)>::value ||
+ // Static version.
+ GivesCentroidsCheck<InitialPartitionPolicy,
+ void(*)(const arma::mat&, const size_t, arma::mat&)>::value;
+};
+
+//! Call the initial partition policy, if it returns assignments. This returns
+//! 'true' to indicate that assignments were given.
+template<typename MatType,
+ typename InitialPartitionPolicy>
+bool GetInitialAssignmentsOrCentroids(
+ InitialPartitionPolicy& ipp,
+ const MatType& data,
+ const size_t clusters,
+ arma::Row<size_t>& assignments,
+ arma::mat& /* centroids */,
+ const typename boost::disable_if_c<
+ GivesCentroids<InitialPartitionPolicy>::value == true>::type* = 0)
+{
+ ipp.Cluster(data, clusters, assignments);
+
+ return true;
+}
+
+//! Call the initial partition policy, if it returns centroids. This returns
+//! 'false' to indicate that assignments were not given.
+template<typename MatType,
+ typename InitialPartitionPolicy>
+bool GetInitialAssignmentsOrCentroids(
+ InitialPartitionPolicy& ipp,
+ const MatType& data,
+ const size_t clusters,
+ arma::Row<size_t>& /* assignments */,
+ arma::mat& centroids,
+ const typename boost::enable_if_c<
+ GivesCentroids<InitialPartitionPolicy>::value == true>::type* = 0)
+{
+ ipp.Cluster(data, clusters, centroids);
+
+ return false;
+}
+
+/**
* Construct the K-Means object.
*/
template<typename MetricType,
@@ -110,25 +171,30 @@ Cluster(const MatType& data,
// the initial centroids.
if (!initialGuess)
{
- // The partitioner gives assignments, so we need to calculate centroids from
- // those assignments. This is probably not the most efficient way to do
- // this, so maybe refactoring should be considered in the future.
+ // The GetInitialAssignmentsOrCentroids() function will call the appropriate
+ // function in the InitialPartitionPolicy to return either assignments or
+ // centroids. We prefer centroids, but if assignments are returned, then we
+ // have to calculate the initial centroids for the first iteration.
arma::Row<size_t> assignments;
- partitioner.Cluster(data, clusters, assignments);
-
- // Calculate initial centroids.
- arma::Row<size_t> counts;
- counts.zeros(clusters);
- centroids.zeros(data.n_rows, clusters);
- for (size_t i = 0; i < data.n_cols; ++i)
+ bool gotAssignments = GetInitialAssignmentsOrCentroids(partitioner, data,
+ clusters, assignments, centroids);
+ if (gotAssignments)
{
- centroids.col(assignments[i]) += arma::vec(data.col(i));
- counts[assignments[i]]++;
- }
+ // The partitioner gives assignments, so we need to calculate centroids
+ // from those assignments.
+ arma::Row<size_t> counts;
+ counts.zeros(clusters);
+ centroids.zeros(data.n_rows, clusters);
+ for (size_t i = 0; i < data.n_cols; ++i)
+ {
+ centroids.col(assignments[i]) += arma::vec(data.col(i));
+ counts[assignments[i]]++;
+ }
- for (size_t i = 0; i < clusters; ++i)
- if (counts[i] != 0)
- centroids.col(i) /= counts[i];
+ for (size_t i = 0; i < clusters; ++i)
+ if (counts[i] != 0)
+ centroids.col(i) /= counts[i];
+ }
}
// Counts of points in each cluster.
diff --git a/src/mlpack/methods/kmeans/sample_initialization.hpp b/src/mlpack/methods/kmeans/sample_initialization.hpp
new file mode 100644
index 0000000..0d967b3
--- /dev/null
+++ b/src/mlpack/methods/kmeans/sample_initialization.hpp
@@ -0,0 +1,49 @@
+/**
+ * @file sample_initialization.hpp
+ * @author Ryan Curtin
+ *
+ * In order to construct initial centroids, randomly sample points from the
+ * dataset. This tends to give better results than the RandomPartition
+ * strategy.
+ */
+#ifndef __MLPACK_METHODS_KMEANS_SAMPLE_INITIALIZATION_HPP
+#define __MLPACK_METHODS_KMEANS_SAMPLE_INITIALIZATION_HPP
+
+#include <mlpack/core.hpp>
+
+namespace mlpack {
+namespace kmeans {
+
+class SampleInitialization
+{
+ public:
+ //! Empty constructor, required by the InitialPartitionPolicy type definition.
+ SampleInitialization() { }
+
+ /**
+ * Initialize the centroids matrix by randomly sampling points from the data
+ * matrix.
+ *
+ * @param data Dataset.
+ * @param clusters Number of clusters.
+ * @param centroids Matrix to put initial centroids into.
+ */
+ template<typename MatType>
+ inline static void Cluster(const MatType& data,
+ const size_t clusters,
+ arma::mat& centroids)
+ {
+ centroids.set_size(data.n_rows, clusters);
+ for (size_t i = 0; i < clusters; ++i)
+ {
+ // Randomly sample a point.
+ const size_t index = math::RandInt(0, data.n_cols);
+ centroids.col(i) = data.col(index);
+ }
+ }
+};
+
+} // namespace kmeans
+} // namespace mlpack
+
+#endif
More information about the mlpack-git
mailing list