[mlpack-svn] r14920 - mlpack/trunk/src/mlpack/tests
fastlab-svn at coffeetalk-1.cc.gatech.edu
fastlab-svn at coffeetalk-1.cc.gatech.edu
Thu Apr 18 19:10:28 EDT 2013
Author: rcurtin
Date: 2013-04-18 19:10:28 -0400 (Thu, 18 Apr 2013)
New Revision: 14920
Modified:
mlpack/trunk/src/mlpack/tests/kmeans_test.cpp
Log:
Add test for Bradley-Fayyad initialization (RefinedStart).
Modified: mlpack/trunk/src/mlpack/tests/kmeans_test.cpp
===================================================================
--- mlpack/trunk/src/mlpack/tests/kmeans_test.cpp 2013-04-18 23:10:09 UTC (rev 14919)
+++ mlpack/trunk/src/mlpack/tests/kmeans_test.cpp 2013-04-18 23:10:28 UTC (rev 14920)
@@ -6,6 +6,7 @@
#include <mlpack/methods/kmeans/kmeans.hpp>
#include <mlpack/methods/kmeans/allow_empty_clusters.hpp>
+#include <mlpack/methods/kmeans/refined_start.hpp>
#include <boost/test/unit_test.hpp>
#include "old_boost_test_definitions.hpp"
@@ -376,6 +377,73 @@
BOOST_REQUIRE_GT(centroids(1, 1), 40.0);
}
+/**
+ * Test that the refined starting policy returns decent initial cluster
+ * estimates.
+ */
+BOOST_AUTO_TEST_CASE(RefinedStartTest)
+{
+ // Our dataset will be five Gaussians of largely varying numbers of points and
+ // we expect that the refined starting policy should return good guesses at
+ // what these Gaussians are.
+ math::RandomSeed(std::time(NULL));
+ arma::mat data(3, 3000);
+ data.randn();
+
+ // First Gaussian: 10000 points, centered at (0, 0, 0).
+ // Second Gaussian: 2000 points, centered at (5, 0, -2).
+ // Third Gaussian: 5000 points, centered at (-2, -2, -2).
+ // Fourth Gaussian: 1000 points, centered at (-6, 8, 8).
+ // Fifth Gaussian: 12000 points, centered at (1, 6, 1).
+ arma::mat centroids(" 0 5 -2 -6 1;"
+ " 0 0 -2 8 6;"
+ " 0 -2 -2 8 1");
+
+ for (size_t i = 1000; i < 1200; ++i)
+ data.col(i) += centroids.col(1);
+ for (size_t i = 1200; i < 1700; ++i)
+ data.col(i) += centroids.col(2);
+ for (size_t i = 1700; i < 1800; ++i)
+ data.col(i) += centroids.col(3);
+ for (size_t i = 1800; i < 3000; ++i)
+ data.col(i) += centroids.col(4);
+
+ // Now run the RefinedStart algorithm and make sure it doesn't deviate too
+ // much from the actual solution.
+ RefinedStart rs;
+ arma::Col<size_t> assignments;
+ arma::mat resultingCentroids;
+ rs.Cluster(data, 5, assignments);
+
+ // Calculate resulting centroids.
+ resultingCentroids.zeros(3, 5);
+ arma::Col<size_t> counts(5);
+ counts.zeros();
+ for (size_t i = 0; i < 3000; ++i)
+ {
+ resultingCentroids.col(assignments[i]) += data.col(i);
+ ++counts[assignments[i]];
+ }
+
+ // Normalize centroids.
+ for (size_t i = 0; i < 5; ++i)
+ if (counts[i] != 0)
+ resultingCentroids /= counts[i];
+
+ // Calculate sum of distances from centroid means.
+ double distortion = 0;
+ for (size_t i = 0; i < 3000; ++i)
+ distortion += metric::EuclideanDistance::Evaluate(data.col(i),
+ resultingCentroids.col(assignments[i]));
+
+ // Using the refined start, the distance for this dataset is usually around
+ // 13500. Regular k-means is between 10000 and 30000 (I think the 10000
+ // figure is a corner case which actually does not give good clusters), and
+ // random initial starts give distortion around 22000. So we'll require that
+ // our distortion is less than 14000.
+ BOOST_REQUIRE_LT(distortion, 14000.0);
+}
+
#ifdef ARMA_HAS_SPMAT
// Can't do this test on Armadillo 3.4; var(SpBase) is not implemented.
#if !((ARMA_VERSION_MAJOR == 3) && (ARMA_VERSION_MINOR == 4))
More information about the mlpack-svn
mailing list