[mlpack-svn] r14920 - mlpack/trunk/src/mlpack/tests

fastlab-svn at coffeetalk-1.cc.gatech.edu fastlab-svn at coffeetalk-1.cc.gatech.edu
Thu Apr 18 19:10:28 EDT 2013


Author: rcurtin
Date: 2013-04-18 19:10:28 -0400 (Thu, 18 Apr 2013)
New Revision: 14920

Modified:
   mlpack/trunk/src/mlpack/tests/kmeans_test.cpp
Log:
Add test for Bradley-Fayyad initialization (RefinedStart).


Modified: mlpack/trunk/src/mlpack/tests/kmeans_test.cpp
===================================================================
--- mlpack/trunk/src/mlpack/tests/kmeans_test.cpp	2013-04-18 23:10:09 UTC (rev 14919)
+++ mlpack/trunk/src/mlpack/tests/kmeans_test.cpp	2013-04-18 23:10:28 UTC (rev 14920)
@@ -6,6 +6,7 @@
 
 #include <mlpack/methods/kmeans/kmeans.hpp>
 #include <mlpack/methods/kmeans/allow_empty_clusters.hpp>
+#include <mlpack/methods/kmeans/refined_start.hpp>
 
 #include <boost/test/unit_test.hpp>
 #include "old_boost_test_definitions.hpp"
@@ -376,6 +377,73 @@
   BOOST_REQUIRE_GT(centroids(1, 1), 40.0);
 }
 
+/**
+ * Test that the refined starting policy returns decent initial cluster
+ * estimates, judged by the summed distance of points to their cluster mean.
+ */
+BOOST_AUTO_TEST_CASE(RefinedStartTest)
+{
+  // Our dataset will be five Gaussians of largely varying numbers of points and
+  // we expect that the refined starting policy should return good guesses at
+  // what these Gaussians are.
+  math::RandomSeed(std::time(NULL));
+  arma::mat data(3, 3000);
+  data.randn();
+
+  // First Gaussian: 1000 points (columns 0-999), centered at (0, 0, 0).
+  // Second Gaussian: 200 points (columns 1000-1199), centered at (5, 0, -2).
+  // Third Gaussian: 500 points (columns 1200-1699), centered at (-2, -2, -2).
+  // Fourth Gaussian: 100 points (columns 1700-1799), centered at (-6, 8, 8).
+  // Fifth Gaussian: 1200 points (columns 1800-2999), centered at (1, 6, 1).
+  arma::mat centroids(" 0  5 -2 -6  1;"
+                      " 0  0 -2  8  6;"
+                      " 0 -2 -2  8  1");
+
+  for (size_t i = 1000; i < 1200; ++i)
+    data.col(i) += centroids.col(1);
+  for (size_t i = 1200; i < 1700; ++i)
+    data.col(i) += centroids.col(2);
+  for (size_t i = 1700; i < 1800; ++i)
+    data.col(i) += centroids.col(3);
+  for (size_t i = 1800; i < 3000; ++i)
+    data.col(i) += centroids.col(4);
+
+  // Now run the RefinedStart algorithm and make sure it doesn't deviate too
+  // much from the actual solution.
+  RefinedStart rs;
+  arma::Col<size_t> assignments;
+  arma::mat resultingCentroids;
+  rs.Cluster(data, 5, assignments);
+
+  // Accumulate per-cluster coordinate sums and per-cluster point counts.
+  resultingCentroids.zeros(3, 5);
+  arma::Col<size_t> counts(5);
+  counts.zeros();
+  for (size_t i = 0; i < 3000; ++i)
+  {
+    resultingCentroids.col(assignments[i]) += data.col(i);
+    ++counts[assignments[i]];
+  }
+
+  // Turn each sum into a mean using only that cluster's own point count.
+  for (size_t i = 0; i < 5; ++i)
+    if (counts[i] != 0)
+      resultingCentroids.col(i) /= counts[i];
+
+  // Calculate sum of distances from centroid means.
+  double distortion = 0;
+  for (size_t i = 0; i < 3000; ++i)
+    distortion += metric::EuclideanDistance::Evaluate(data.col(i),
+        resultingCentroids.col(assignments[i]));
+
+  // Reference figures recorded for this dataset: refined start around 13500,
+  // regular k-means between 10000 and 30000, random initial starts around
+  // 22000; hence the bound of 14000.  NOTE(review): these figures appear to
+  // predate per-cluster normalization of the centroids, which should lower the
+  // distortion -- the bound is likely conservative; recalibrate on fresh runs.
+  BOOST_REQUIRE_LT(distortion, 14000.0);
+}
+
 #ifdef ARMA_HAS_SPMAT
 // Can't do this test on Armadillo 3.4; var(SpBase) is not implemented.
 #if !((ARMA_VERSION_MAJOR == 3) && (ARMA_VERSION_MINOR == 4))




More information about the mlpack-svn mailing list