[mlpack-svn] r14918 - mlpack/trunk/src/mlpack/tests
fastlab-svn at coffeetalk-1.cc.gatech.edu
fastlab-svn at coffeetalk-1.cc.gatech.edu
Thu Apr 18 14:29:05 EDT 2013
Author: rcurtin
Date: 2013-04-18 14:29:05 -0400 (Thu, 18 Apr 2013)
New Revision: 14918
Modified:
mlpack/trunk/src/mlpack/tests/kmeans_test.cpp
Log:
Test the new functionality of k-means: initial assignments and initial
centroids. Also test that k-means fails on corner case datasets -- it should.
Modified: mlpack/trunk/src/mlpack/tests/kmeans_test.cpp
===================================================================
--- mlpack/trunk/src/mlpack/tests/kmeans_test.cpp 2013-04-18 18:28:41 UTC (rev 14917)
+++ mlpack/trunk/src/mlpack/tests/kmeans_test.cpp 2013-04-18 18:29:05 UTC (rev 14918)
@@ -206,6 +206,176 @@
BOOST_REQUIRE_LT(assignments[i], 18);
}
+/**
+ * Make sure that random initialization fails for a corner case dataset.
+ */
+BOOST_AUTO_TEST_CASE(RandomInitialAssignmentFailureTest)
+{
+ // This is a very synthetic dataset. It is one Gaussian with a huge number of
+ // points combined with one faraway Gaussian with very few points. Normally,
+ // k-means should not get the correct result -- which is one cluster at each
+ // Gaussian. This is because the random partitioning scheme has very low
+ // (virtually zero) likelihood of separating the two Gaussians properly, and
+ // then the algorithm will never converge to that result.
+ //
+ // So we will set the initial assignments appropriately. Remember, once the
+ // initial assignments are done, k-means is deterministic.
+ arma::mat dataset(2, 10002);
+ dataset.randn();
+ // Now move the second Gaussian far away.
+ for (size_t i = 0; i < 2; ++i)
+ dataset.col(10000 + i) += arma::vec("50 50");
+
+ // Ensure that k-means fails when run with random initialization. This isn't
+ // strictly a necessary test, but it does help let us know that this is a good
+ // test.
+ size_t successes = 0;
+ for (size_t run = 0; run < 15; ++run)
+ {
+ arma::mat centroids;
+ arma::Col<size_t> assignments;
+ KMeans<> kmeans;
+ kmeans.Cluster(dataset, 2, assignments, centroids);
+
+ // Inspect centroids. See if one is close to the second Gaussian.
+ if ((centroids(0, 0) >= 30.0 && centroids(1, 0) >= 30.0) ||
+ (centroids(0, 1) >= 30.0 && centroids(1, 1) >= 30.0))
+ ++successes;
+ }
+
+ // Only one success allowed. The probability of two successes should be
+ // infinitesimal.
+ BOOST_REQUIRE_LT(successes, 2);
+}
+
+/**
+ * Make sure that specifying initial assignments is successful for a corner case
+ * dataset which doesn't usually converge otherwise.
+ */
+BOOST_AUTO_TEST_CASE(InitialAssignmentTest)
+{
+ // For a better description of this dataset, see
+ // RandomInitialAssignmentFailureTest.
+ arma::mat dataset(2, 10002);
+ dataset.randn();
+ // Now move the second Gaussian far away.
+ for (size_t i = 0; i < 2; ++i)
+ dataset.col(10000 + i) += arma::vec("50 50");
+
+ // Now, if we specify initial assignments, the algorithm should converge (with
+ // zero iterations, actually, because this is the solution).
+ arma::Col<size_t> assignments(10002);
+ assignments.fill(0);
+ assignments[10000] = 1;
+ assignments[10001] = 1;
+
+ KMeans<> kmeans;
+ kmeans.Cluster(dataset, 2, assignments, true);
+
+ // Check results.
+ for (size_t i = 0; i < 10000; ++i)
+ BOOST_REQUIRE_EQUAL(assignments[i], 0);
+ for (size_t i = 10000; i < 10002; ++i)
+ BOOST_REQUIRE_EQUAL(assignments[i], 1);
+
+ // Now, slightly harder. Give it one incorrect assignment in each cluster.
+ // The wrong assignment should be quickly fixed.
+ assignments[9999] = 1;
+ assignments[10000] = 0;
+
+ kmeans.Cluster(dataset, 2, assignments, true);
+
+ // Check results.
+ for (size_t i = 0; i < 10000; ++i)
+ BOOST_REQUIRE_EQUAL(assignments[i], 0);
+ for (size_t i = 10000; i < 10002; ++i)
+ BOOST_REQUIRE_EQUAL(assignments[i], 1);
+}
+
+/**
+ * Make sure specifying initial centroids is successful for a corner case which
+ * doesn't usually converge otherwise.
+ */
+BOOST_AUTO_TEST_CASE(InitialCentroidTest)
+{
+ // For a better description of this dataset, see
+ // RandomInitialAssignmentFailureTest.
+ arma::mat dataset(2, 10002);
+ dataset.randn();
+ // Now move the second Gaussian far away.
+ for (size_t i = 0; i < 2; ++i)
+ dataset.col(10000 + i) += arma::vec("50 50");
+
+ arma::Col<size_t> assignments;
+ arma::mat centroids(2, 2);
+
+ centroids.col(0) = arma::vec("0 0");
+ centroids.col(1) = arma::vec("50 50");
+
+ // This should converge correctly.
+ KMeans<> k;
+ k.Cluster(dataset, 2, assignments, centroids, false, true);
+
+ // Check results.
+ for (size_t i = 0; i < 10000; ++i)
+ BOOST_REQUIRE_EQUAL(assignments[i], 0);
+ for (size_t i = 10000; i < 10002; ++i)
+ BOOST_REQUIRE_EQUAL(assignments[i], 1);
+
+ // Now add a little noise to the initial centroids.
+ centroids.col(0) = arma::vec("3 4");
+ centroids.col(1) = arma::vec("25 10");
+
+ k.Cluster(dataset, 2, assignments, centroids, false, true);
+
+ // Check results.
+ for (size_t i = 0; i < 10000; ++i)
+ BOOST_REQUIRE_EQUAL(assignments[i], 0);
+ for (size_t i = 10000; i < 10002; ++i)
+ BOOST_REQUIRE_EQUAL(assignments[i], 1);
+}
+
+/**
+ * Ensure that initial assignments override initial centroids.
+ */
+BOOST_AUTO_TEST_CASE(InitialAssignmentOverrideTest)
+{
+ // For a better description of this dataset, see
+ // RandomInitialAssignmentFailureTest.
+ arma::mat dataset(2, 10002);
+ dataset.randn();
+ // Now move the second Gaussian far away.
+ for (size_t i = 0; i < 2; ++i)
+ dataset.col(10000 + i) += arma::vec("50 50");
+
+ arma::Col<size_t> assignments(10002);
+ assignments.fill(0);
+ assignments[10000] = 1;
+ assignments[10001] = 1;
+
+ // Note that this initial centroid guess is the opposite of the assignments
+ // guess!
+ arma::mat centroids(2, 2);
+ centroids.col(0) = arma::vec("50 50");
+ centroids.col(1) = arma::vec("0 0");
+
+ KMeans<> k;
+ k.Cluster(dataset, 2, assignments, centroids, true, true);
+
+ // Because the initial assignments guess should take priority, we should get
+ // those same results back.
+ for (size_t i = 0; i < 10000; ++i)
+ BOOST_REQUIRE_EQUAL(assignments[i], 0);
+ for (size_t i = 10000; i < 10002; ++i)
+ BOOST_REQUIRE_EQUAL(assignments[i], 1);
+
+ // Make sure the centroids are about right too.
+ BOOST_REQUIRE_LT(centroids(0, 0), 10.0);
+ BOOST_REQUIRE_LT(centroids(1, 0), 10.0);
+ BOOST_REQUIRE_GT(centroids(0, 1), 40.0);
+ BOOST_REQUIRE_GT(centroids(1, 1), 40.0);
+}
+
#ifdef ARMA_HAS_SPMAT
// Can't do this test on Armadillo 3.4; var(SpBase) is not implemented.
#if !((ARMA_VERSION_MAJOR == 3) && (ARMA_VERSION_MINOR == 4))
@@ -252,6 +422,7 @@
BOOST_REQUIRE_EQUAL(assignments[10], clusterTwo);
BOOST_REQUIRE_EQUAL(assignments[11], clusterTwo);
}
+
#endif // Exclude Armadillo 3.4.
#endif // ARMA_HAS_SPMAT
More information about the mlpack-svn
mailing list