[mlpack-svn] r14923 - mlpack/trunk/src/mlpack/methods/kmeans
fastlab-svn at coffeetalk-1.cc.gatech.edu
fastlab-svn at coffeetalk-1.cc.gatech.edu
Thu Apr 18 19:38:18 EDT 2013
Author: rcurtin
Date: 2013-04-18 19:38:17 -0400 (Thu, 18 Apr 2013)
New Revision: 14923
Modified:
mlpack/trunk/src/mlpack/methods/kmeans/kmeans_main.cpp
Log:
Remove Pelleg-Moore support (as per #251). Allow the user to specify a file to
save centroids to, allow the user to request Bradley-Fayyad initialization, and
provide options for that initialization's parameters.
Modified: mlpack/trunk/src/mlpack/methods/kmeans/kmeans_main.cpp
===================================================================
--- mlpack/trunk/src/mlpack/methods/kmeans/kmeans_main.cpp 2013-04-18 23:37:33 UTC (rev 14922)
+++ mlpack/trunk/src/mlpack/methods/kmeans/kmeans_main.cpp 2013-04-18 23:38:17 UTC (rev 14923)
@@ -8,6 +8,7 @@
#include "kmeans.hpp"
#include "allow_empty_clusters.hpp"
+#include "refined_start.hpp"
using namespace mlpack;
using namespace mlpack::kmeans;
@@ -19,16 +20,35 @@
"a column of labels in the file containing the input dataset or in a "
"separate file. Empty clusters are not allowed by default; when a cluster "
"becomes empty, the point furthest from the centroid of the cluster with "
- "maximum variance is taken to fill that cluster.");
+ "maximum variance is taken to fill that cluster."
+ "\n\n"
+ "Optionally, the Bradley and Fayyad approach (\"Refining initial points for"
+ " k-means clustering\", 1998) can be used to select initial points by "
+ "specifying the --refined_start (-r) option. This approach works by taking"
+ " random samples of the dataset; to specify the number of samples, the "
+ "--samples parameter is used, and to specify the percentage of the dataset "
+ "to be used in each sample, the --percentage parameter is used (it should "
+ "be a value between 0.0 and 1.0)."
+ "\n\n"
+ "If you want to specify your own initial cluster assignments or initial "
+ "cluster centroids, this functionality is available in the C++ interface. "
+ "Alternately, file a bug (well, a feature request) on the mlpack bug "
+ "tracker.");
+// Required options.
PARAM_STRING_REQ("inputFile", "Input dataset to perform clustering on.", "i");
PARAM_INT_REQ("clusters", "Number of clusters to find.", "c");
+// Output options.
PARAM_FLAG("in_place", "If specified, a column of the learned cluster "
"assignments will be added to the input dataset file. In this case, "
"--outputFile is not necessary.", "p");
-PARAM_STRING("outputFile", "File to write output labels or labeled data to.",
+PARAM_STRING("output_file", "File to write output labels or labeled data to.",
"o", "output.csv");
+PARAM_STRING("centroid_file", "If specified, the centroids of each cluster will"
+ " be written to the given file.", "c", "");
+
+// k-means configuration options.
PARAM_FLAG("allow_empty_clusters", "Allow empty clusters to be created.", "e");
PARAM_FLAG("labels_only", "Only output labels into output file.", "l");
PARAM_DOUBLE("overclustering", "Finds (overclustering * clusters) clusters, "
@@ -37,8 +57,20 @@
PARAM_INT("max_iterations", "Maximum number of iterations before K-Means "
"terminates.", "m", 1000);
PARAM_INT("seed", "Random seed. If 0, 'std::time(NULL)' is used.", "s", 0);
-PARAM_FLAG("fast_kmeans", "Use the experimental fast k-means algorithm by Pelleg and Moore", "f")
+// This is known to not work (#251).
+//PARAM_FLAG("fast_kmeans", "Use the experimental fast k-means algorithm by "
+// "Pelleg and Moore.", "f");
+
+// Parameters for "refined start" k-means.
+PARAM_FLAG("refined_start", "Use the refined initial point strategy by Bradley "
+ "and Fayyad to choose initial points.", "r");
+PARAM_INT("samplings", "Number of samplings to perform for refined start (use "
+ "when --refined_start is specified).", "S", 100);
+PARAM_DOUBLE("percentage", "Percentage of dataset to use for each refined start"
+ " sampling (use when --refined_start is specified).", "p", 0.02);
+
+
int main(int argc, char** argv)
{
CLI::ParseCommandLine(argc, argv);
@@ -86,29 +118,60 @@
// Now create the KMeans object. Because we could be using different types,
// it gets a little weird...
arma::Col<size_t> assignments;
+ arma::mat centroids;
if (CLI::HasParam("allow_empty_clusters"))
{
- KMeans<metric::SquaredEuclideanDistance, RandomPartition,
- AllowEmptyClusters> k(maxIterations, overclustering);
+ if (CLI::HasParam("refined_start"))
+ {
+ KMeans<metric::SquaredEuclideanDistance, RefinedStart, AllowEmptyClusters>
+ k(maxIterations, overclustering);
- Timer::Start("clustering");
- if (CLI::HasParam("fast_kmeans"))
- k.FastCluster(dataset, clusters, assignments);
+ Timer::Start("clustering");
+ if (CLI::HasParam("fast_kmeans"))
+ k.FastCluster(dataset, clusters, assignments);
+ else
+ k.Cluster(dataset, clusters, assignments, centroids);
+ Timer::Stop("clustering");
+ }
else
- k.Cluster(dataset, clusters, assignments);
- Timer::Stop("clustering");
+ {
+ KMeans<metric::SquaredEuclideanDistance, RandomPartition,
+ AllowEmptyClusters> k(maxIterations, overclustering);
+
+ Timer::Start("clustering");
+ if (CLI::HasParam("fast_kmeans"))
+ k.FastCluster(dataset, clusters, assignments);
+ else
+ k.Cluster(dataset, clusters, assignments, centroids);
+ Timer::Stop("clustering");
+ }
}
else
{
- KMeans<> k(maxIterations, overclustering);
+ if (CLI::HasParam("refined_start"))
+ {
+ KMeans<metric::SquaredEuclideanDistance, RefinedStart> k(maxIterations,
+ overclustering);
- Timer::Start("clustering");
- if (CLI::HasParam("fast_kmeans"))
- k.FastCluster(dataset, clusters, assignments);
+ Timer::Start("clustering");
+ if (CLI::HasParam("fast_kmeans"))
+ k.FastCluster(dataset, clusters, assignments);
+ else
+ k.Cluster(dataset, clusters, assignments, centroids);
+ Timer::Stop("clustering");
+ }
else
- k.Cluster(dataset, clusters, assignments);
- Timer::Stop("clustering");
+ {
+ KMeans<> k(maxIterations, overclustering);
+
+ Timer::Start("clustering");
+ if (CLI::HasParam("fast_kmeans"))
+ k.FastCluster(dataset, clusters, assignments);
+ else
+ k.Cluster(dataset, clusters, assignments, centroids);
+ Timer::Stop("clustering");
+ }
}
// Now figure out what to do with our results.
@@ -123,7 +186,7 @@
dataset.insert_rows(dataset.n_rows, trans(converted));
// Save the dataset.
- data::Save(inputFile.c_str(), dataset);
+ data::Save(inputFile, dataset);
}
else
{
@@ -145,8 +208,12 @@
// Now save, in the different file.
string outputFile = CLI::GetParam<string>("outputFile");
- data::Save(outputFile.c_str(), dataset);
+ data::Save(outputFile, dataset);
}
}
+
+ // Should we write the centroids to a file?
+ if (CLI::HasParam("centroids_file"))
+ data::Save(CLI::GetParam<std::string>("centroids_file"), centroids);
}
More information about the mlpack-svn mailing list