[mlpack-svn] r14923 - mlpack/trunk/src/mlpack/methods/kmeans

Thu Apr 18 19:38:18 EDT 2013

Author: rcurtin
Date: 2013-04-18 19:38:17 -0400 (Thu, 18 Apr 2013)
New Revision: 14923

Modified:
   mlpack/trunk/src/mlpack/methods/kmeans/kmeans_main.cpp
Log:
Remove Pelleg-Moore support (as per #251).  Allow the user to specify a file to
save centroids to, and also allow the user to ask for Bradley-Fayyad
initialization, and give options for the parameters to that initialization.


Modified: mlpack/trunk/src/mlpack/methods/kmeans/kmeans_main.cpp
===================================================================

--- mlpack/trunk/src/mlpack/methods/kmeans/kmeans_main.cpp	2013-04-18 23:37:33 UTC (rev 14922)
+++ mlpack/trunk/src/mlpack/methods/kmeans/kmeans_main.cpp	2013-04-18 23:38:17 UTC (rev 14923)
@@ -8,6 +8,7 @@
 
 #include "kmeans.hpp"
 #include "allow_empty_clusters.hpp"
+#include "refined_start.hpp"
 
 using namespace mlpack;
 using namespace mlpack::kmeans;
@@ -19,16 +20,35 @@
     "a column of labels in the file containing the input dataset or in a "
     "separate file.  Empty clusters are not allowed by default; when a cluster "
     "becomes empty, the point furthest from the centroid of the cluster with "
-    "maximum variance is taken to fill that cluster.");
+    "maximum variance is taken to fill that cluster."
+    "\n\n"
+    "Optionally, the Bradley and Fayyad approach (\"Refining initial points for"
+    " k-means clustering\", 1998) can be used to select initial points by "
+    "specifying the --refined_start (-r) option.  This approach works by taking"
+    " random samples of the dataset; to specify the number of samples, the "
+    "--samplings parameter is used, and to specify the percentage of the "
+    "dataset to be used in each sample, the --percentage parameter is used (it "
+    "should be a value between 0.0 and 1.0)."
+    "\n\n"
+    "If you want to specify your own initial cluster assignments or initial "
+    "cluster centroids, this functionality is available in the C++ interface. "
+    "Alternately, file a bug (well, a feature request) on the mlpack bug "
+    "tracker.");
 
+// Required options.
 PARAM_STRING_REQ("inputFile", "Input dataset to perform clustering on.", "i");
 PARAM_INT_REQ("clusters", "Number of clusters to find.", "c");
 
+// Output options.
 PARAM_FLAG("in_place", "If specified, a column of the learned cluster "
     "assignments will be added to the input dataset file.  In this case, "
     "--outputFile is not necessary.", "p");
-PARAM_STRING("outputFile", "File to write output labels or labeled data to.",
+PARAM_STRING("output_file", "File to write output labels or labeled data to.",
     "o", "output.csv");
+PARAM_STRING("centroid_file", "If specified, the centroids of each cluster will"
+    " be written to the given file.", "C", "");
+
+// k-means configuration options.
 PARAM_FLAG("allow_empty_clusters", "Allow empty clusters to be created.", "e");
 PARAM_FLAG("labels_only", "Only output labels into output file.", "l");
 PARAM_DOUBLE("overclustering", "Finds (overclustering * clusters) clusters, "
@@ -37,8 +57,20 @@
 PARAM_INT("max_iterations", "Maximum number of iterations before K-Means "
     "terminates.", "m", 1000);
 PARAM_INT("seed", "Random seed.  If 0, 'std::time(NULL)' is used.", "s", 0);
-PARAM_FLAG("fast_kmeans", "Use the experimental fast k-means algorithm by Pelleg and Moore", "f")
 
+// This is known to not work (#251).
+//PARAM_FLAG("fast_kmeans", "Use the experimental fast k-means algorithm by "
+//    "Pelleg and Moore.", "f");
+
+// Parameters for "refined start" k-means.
+PARAM_FLAG("refined_start", "Use the refined initial point strategy by Bradley "
+    "and Fayyad to choose initial points.", "r");
+PARAM_INT("samplings", "Number of samplings to perform for refined start (use "
+    "when --refined_start is specified).", "S", 100);
+PARAM_DOUBLE("percentage", "Percentage of dataset to use for each refined start"
+    " sampling (use when --refined_start is specified).", "P", 0.02);
+
+
 int main(int argc, char** argv)
 {
   CLI::ParseCommandLine(argc, argv);
@@ -86,29 +118,60 @@
   // Now create the KMeans object.  Because we could be using different types,
   // it gets a little weird...
   arma::Col<size_t> assignments;
+  arma::mat centroids;
 
   if (CLI::HasParam("allow_empty_clusters"))
   {
-    KMeans<metric::SquaredEuclideanDistance, RandomPartition,
-        AllowEmptyClusters> k(maxIterations, overclustering);
+    if (CLI::HasParam("refined_start"))
+    {
+      KMeans<metric::SquaredEuclideanDistance, RefinedStart, AllowEmptyClusters>
+          k(maxIterations, overclustering);
 
-    Timer::Start("clustering");
-    if (CLI::HasParam("fast_kmeans"))
-      k.FastCluster(dataset, clusters, assignments);
+      Timer::Start("clustering");
+      if (CLI::HasParam("fast_kmeans"))
+        k.FastCluster(dataset, clusters, assignments);
+      else
+        k.Cluster(dataset, clusters, assignments, centroids);
+      Timer::Stop("clustering");
+    }
     else
-      k.Cluster(dataset, clusters, assignments);
-    Timer::Stop("clustering");
+    {
+      KMeans<metric::SquaredEuclideanDistance, RandomPartition,
+          AllowEmptyClusters> k(maxIterations, overclustering);
+
+      Timer::Start("clustering");
+      if (CLI::HasParam("fast_kmeans"))
+        k.FastCluster(dataset, clusters, assignments);
+      else
+        k.Cluster(dataset, clusters, assignments, centroids);
+      Timer::Stop("clustering");
+    }
   }
   else
   {
-    KMeans<> k(maxIterations, overclustering);
+    if (CLI::HasParam("refined_start"))
+    {
+      KMeans<metric::SquaredEuclideanDistance, RefinedStart> k(maxIterations,
+          overclustering);
 
-    Timer::Start("clustering");
-    if (CLI::HasParam("fast_kmeans"))
-      k.FastCluster(dataset, clusters, assignments);
+      Timer::Start("clustering");
+      if (CLI::HasParam("fast_kmeans"))
+        k.FastCluster(dataset, clusters, assignments);
+      else
+        k.Cluster(dataset, clusters, assignments, centroids);
+      Timer::Stop("clustering");
+    }
     else
-      k.Cluster(dataset, clusters, assignments);
-    Timer::Stop("clustering");
+    {
+      KMeans<> k(maxIterations, overclustering);
+
+      Timer::Start("clustering");
+      if (CLI::HasParam("fast_kmeans"))
+        k.FastCluster(dataset, clusters, assignments);
+      else
+        k.Cluster(dataset, clusters, assignments, centroids);
+      Timer::Stop("clustering");
+    }
   }
 
   // Now figure out what to do with our results.
@@ -123,7 +186,7 @@
     dataset.insert_rows(dataset.n_rows, trans(converted));
 
     // Save the dataset.
-    data::Save(inputFile.c_str(), dataset);
+    data::Save(inputFile, dataset);
   }
   else
   {
@@ -145,8 +208,12 @@
 
       // Now save, in the different file.
       string outputFile = CLI::GetParam<string>("outputFile");
-      data::Save(outputFile.c_str(), dataset);
+      data::Save(outputFile, dataset);
     }
   }
+
+  // Should we write the centroids to a file?
+  if (CLI::HasParam("centroid_file"))
+    data::Save(CLI::GetParam<std::string>("centroid_file"), centroids);
 }