[mlpack-svn] r10427 - mlpack/trunk/src/mlpack/methods/kmeans
fastlab-svn at coffeetalk-1.cc.gatech.edu
fastlab-svn at coffeetalk-1.cc.gatech.edu
Sun Nov 27 02:59:04 EST 2011
Author: rcurtin
Date: 2011-11-27 02:59:03 -0500 (Sun, 27 Nov 2011)
New Revision: 10427
Added:
mlpack/trunk/src/mlpack/methods/kmeans/kmeans_main.cpp
Modified:
mlpack/trunk/src/mlpack/methods/kmeans/CMakeLists.txt
Log:
Executable for K-Means. Celebrate!
Modified: mlpack/trunk/src/mlpack/methods/kmeans/CMakeLists.txt
===================================================================
--- mlpack/trunk/src/mlpack/methods/kmeans/CMakeLists.txt 2011-11-27 06:32:33 UTC (rev 10426)
+++ mlpack/trunk/src/mlpack/methods/kmeans/CMakeLists.txt 2011-11-27 07:59:03 UTC (rev 10427)
@@ -19,3 +19,11 @@
# Append sources (with directory name) to list of all MLPACK sources (used at
# the parent scope).
set(MLPACK_SRCS ${MLPACK_SRCS} ${DIR_SRCS} PARENT_SCOPE)
+
+# The main K-Means executable.
+add_executable(kmeans
+ kmeans_main.cpp
+)
+target_link_libraries(kmeans
+ mlpack
+)
Added: mlpack/trunk/src/mlpack/methods/kmeans/kmeans_main.cpp
===================================================================
--- mlpack/trunk/src/mlpack/methods/kmeans/kmeans_main.cpp (rev 0)
+++ mlpack/trunk/src/mlpack/methods/kmeans/kmeans_main.cpp 2011-11-27 07:59:03 UTC (rev 10427)
@@ -0,0 +1,134 @@
+/**
+ * @file kmeans_main.cpp
+ * @author Ryan Curtin
+ *
+ * Executable for running K-Means.
+ */
+#include <mlpack/core.hpp>
+
+#include "kmeans.hpp"
+#include "allow_empty_clusters.hpp"
+
+using namespace mlpack;
+using namespace mlpack::kmeans;
+using namespace std;
+
+// Program description and command-line options; main() reads them by name.
+PROGRAM_INFO("K-Means Clustering", "This program performs K-Means clustering "
+ "on the given dataset, storing the learned cluster assignments either as "
+ "a column of labels in the file containing the input dataset or in a "
+ "separate file. Empty clusters are not allowed by default; when a cluster "
+ "becomes empty, the point furthest from the centroid of the cluster with "
+ "maximum variance is taken to fill that cluster.", "");
+
+PARAM_STRING_REQ("input_file", "Input dataset to perform clustering on.", "");
+PARAM_INT_REQ("clusters", "Number of clusters to find.", ""); // main() rejects < 1.
+PARAM_FLAG("in_place", "If specified, a column of the learned cluster "
+ "assignments will be added to the input dataset file. In this case "
+ "--output_file is not necessary.", "");
+PARAM_STRING("output_file", "File to write output labels to.", "", ""); // main() requires this unless --in_place is set.
+PARAM_FLAG("allow_empty_clusters", "Allow empty clusters to be created.", "");
+PARAM_FLAG("labels_only", "Only output labels into output file.", "");
+PARAM_DOUBLE("overclustering", "Finds (overclustering * clusters) clusters, "
+ "then merges them together until only the desired number of clusters are "
+ "left.", "", 1.0); // main() rejects values < 1.
+PARAM_INT("max_iterations", "Maximum number of iterations before K-Means "
+ "terminates.", "", 1000); // main() rejects values < 0.
+
+/**
+ * Command-line driver for K-Means: validates the options, clusters the input
+ * dataset, and writes out the cluster assignments.
+ *
+ * The assignments are appended to the dataset as an extra row and saved over
+ * the input file (--in_place) or to --output_file; with --labels_only and no
+ * --in_place, only the assignments are written to --output_file.
+ */
+int main(int argc, char** argv)
+{
+  CLI::ParseCommandLine(argc, argv);
+
+  // Seed rand() from the current time.
+  srand(time(NULL));
+
+  // Now do validation of options.  NOTE(review): nothing returns after a
+  // Log::Fatal message, so this assumes Log::Fatal ends the program -- confirm.
+  const string inputFile = CLI::GetParam<string>("input_file");
+  const int clusters = CLI::GetParam<int>("clusters");
+  if (clusters < 1)
+  {
+    Log::Fatal << "Invalid number of clusters requested (" << clusters << ")! "
+        << "Must be greater than or equal to 1." << std::endl;
+  }
+
+  const int maxIterations = CLI::GetParam<int>("max_iterations");
+  if (maxIterations < 0)
+  {
+    Log::Fatal << "Invalid value for maximum iterations (" << maxIterations <<
+        ")! Must be greater than or equal to 0." << std::endl;
+  }
+
+  const double overclustering = CLI::GetParam<double>("overclustering");
+  if (overclustering < 1)
+  {
+    Log::Fatal << "Invalid value for overclustering (" << overclustering <<
+        ")! Must be greater than or equal to 1." << std::endl;
+  }
+
+  // Make sure we have an output file if we're not doing the work in-place.
+  const bool inPlace = CLI::HasParam("in_place");
+  if (!inPlace && !CLI::HasParam("output_file"))
+  {
+    Log::Fatal << "--output_file not specified (and --in_place not set)."
+        << std::endl;
+  }
+
+  // Load our dataset.
+  arma::mat dataset;
+  data::Load(inputFile.c_str(), dataset);
+
+  // The empty-cluster policy is a template parameter of KMeans, so each choice
+  // needs its own instantiation; both fill the same assignments vector.
+  arma::Col<size_t> assignments;
+  if (CLI::HasParam("allow_empty_clusters"))
+  {
+    KMeans<metric::SquaredEuclideanDistance, RandomPartition,
+        AllowEmptyClusters> k(maxIterations, overclustering);
+
+    k.Cluster(dataset, clusters, assignments);
+  }
+  else
+  {
+    KMeans<> k(maxIterations, overclustering);
+
+    k.Cluster(dataset, clusters, assignments);
+  }
+
+  // Labels-only output applies only when writing to a separate file.  The flag
+  // is registered as "labels_only", so that is the name that must be queried.
+  if (!inPlace && CLI::HasParam("labels_only"))
+  {
+    // Save only the labels.
+    const string outputFile = CLI::GetParam<string>("output_file");
+    arma::Mat<size_t> output = trans(assignments);
+    data::Save(outputFile.c_str(), output);
+    return 0;
+  }
+
+  // Otherwise the labels are appended to the dataset, which is a matrix of
+  // doubles, so convert them to type double first.
+  arma::vec converted(assignments.n_elem);
+  for (size_t i = 0; i < assignments.n_elem; i++)
+    converted(i) = static_cast<double>(assignments(i));
+
+  // insert_rows() needs a block with as many columns as the dataset, so the
+  // column vector of labels is transposed into a single row before appending.
+  dataset.insert_rows(dataset.n_rows, trans(converted));
+
+  // Overwrite the input file when working in place; else use --output_file.
+  const string saveFile =
+      inPlace ? inputFile : CLI::GetParam<string>("output_file");
+  data::Save(saveFile.c_str(), dataset);
+
+  return 0;
+}
+
More information about the mlpack-svn
mailing list