[mlpack-svn] r15493 - in mlpack/conf/jenkins-conf/benchmark/methods/shogun: . src
fastlab-svn at coffeetalk-1.cc.gatech.edu
fastlab-svn at coffeetalk-1.cc.gatech.edu
Thu Jul 18 11:53:34 EDT 2013
Author: marcus
Date: Thu Jul 18 11:53:34 2013
New Revision: 15493
Log:
Add shogun K-Means method src and benchmark script.
Added:
mlpack/conf/jenkins-conf/benchmark/methods/shogun/kmeans.py
mlpack/conf/jenkins-conf/benchmark/methods/shogun/src/
mlpack/conf/jenkins-conf/benchmark/methods/shogun/src/kmeans.cpp
Added: mlpack/conf/jenkins-conf/benchmark/methods/shogun/kmeans.py
==============================================================================
--- (empty file)
+++ mlpack/conf/jenkins-conf/benchmark/methods/shogun/kmeans.py Thu Jul 18 11:53:34 2013
@@ -0,0 +1,164 @@
+'''
+ @file kmeans.py
+ @author Marcus Edel
+
+ K-Means Clustering with shogun.
+'''
+
+import os
+import sys
+import inspect
+
+# Import the util path, this method even works if the path contains symlinks to
+# modules.
+cmd_subfolder = os.path.realpath(os.path.abspath(os.path.join(
+ os.path.split(inspect.getfile(inspect.currentframe()))[0], "../../util")))
+if cmd_subfolder not in sys.path:
+ sys.path.insert(0, cmd_subfolder)
+
+from log import *
+from timer import *
+
+'''
+This class implements the K-Means Clustering benchmark.
+'''
+class KMEANS(object):
+
+ '''
+ Create the K-Means Clustering benchmark instance.
+
+ @param dataset - Input dataset to perform K-Means Clustering on.
+ @param verbose - Display informational messages.
+ '''
+ def __init__(self, dataset, verbose=True):
+ self.verbose = verbose
+ self.dataset = dataset
+
+ '''
+ Destructor to clean up at the end.
+ '''
+ def __del__(self):
+ pass
+
+ '''
+ Use the shogun libary to implement K-Means Clustering.
+
+ @param options - Extra options for the method.
+ @return - Elapsed time in seconds or -1 if the method was not successful.
+ '''
+ def KMeansShogun(self, options):
+ totalTimer = Timer()
+
+ # Gather parameters.
+ clusters = re.search("-c (\d+)", options)
+ seed = re.search("-s (\d+)", options)
+ maxIterations = re.search("-m (\d+)", options)
+
+ # Now do validation of options.
+ if not clusters and len(self.dataset) != 2:
+ Log.Fatal("Required option: Number of clusters or cluster locations.")
+ return -1
+ elif (not clusters or clusters.group(1) < 1):
+ Log.Fatal("Invalid number of clusters requested! Must be greater than or "
+ + "equal to 1.")
+ return -1
+
+ maxIterations = 1000 if not maxIterations else int(maxIterations.group(1))
+
+ # Load input dataset.
+ # If the dataset contains two files then the second file is the centroids
+ # file. In this case we run the the kmeans executable.
+ Log.Info("Loading dataset", self.verbose)
+ if len(self.dataset) == 2:
+
+ # Run command with the nessecary arguments and return its output as a byte
+ # string. We have untrusted input so we disables all shell based features.
+ cmd = shlex.split(self.path + "methods/shogun/kmeans " + self.dataset[0]
+ + " " + self.dataset[1] + clusters.group(1) + str(maxIterations))
+ try:
+ s = subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=False)
+ except Exception, e:
+ Log.Fatal("Could not execute command: " + str(cmd))
+ return -1
+
+ # Return the elapsed time.
+ timer = self.parseTimer(s)
+ if not timer:
+ Log.Fatal("Can't parse the timer")
+ return -1
+ else:
+ time = self.GetTime(timer)
+ Log.Info(("total time: %fs" % (time)), self.verbose)
+
+ return time
+
+ else:
+ import numpy as np
+ from shogun.Distance import EuclideanDistance
+ from shogun.Features import RealFeatures
+ from shogun import Clustering
+ from shogun.Mathematics import Math_init_random
+
+ if seed:
+ Math_init_random(seed.group(1))
+
+ data = np.genfromtxt(self.dataset, delimiter=',')
+
+ dataFeat = RealFeatures(data.T)
+ distance = EuclideanDistance(dataFeat, dataFeat)
+
+ # Create the K-Means object and perform K-Means clustering.
+ with totalTimer:
+ model = Clustering.KMeans(int(clusters.group(1)), distance)
+ model.set_max_iter(maxIterations)
+ model.train()
+
+ labels = model.apply().get_labels()
+ centers = model.get_cluster_centers()
+
+ return totalTimer.ElapsedTime()
+
+ '''
+ Perform K-Means Clustering. If the method has been successfully
+ completed return the elapsed time in seconds.
+
+ @param options - Extra options for the method.
+ @return - Elapsed time in seconds or -1 if the method was not successful.
+ '''
+ def RunMethod(self, options):
+ Log.Info("Perform K-Means.", self.verbose)
+
+ return self.KMeansShogun(options)
+
+ '''
+ Parse the timer data form a given string.
+
+ @param data - String to parse timer data from.
+ @return - Namedtuple that contains the timer data.
+ '''
+ def parseTimer(self, data):
+ # Compile the regular expression pattern into a regular expression object to
+ # parse the timer data.
+ pattern = re.compile(r"""
+ .*?total_time: (?P<total_time>.*?)s.*?
+ """, re.VERBOSE|re.MULTILINE|re.DOTALL)
+
+ match = pattern.match(data)
+ if not match:
+ Log.Fatal("Can't parse the data: wrong format")
+ return -1
+ else:
+ # Create a namedtuple and return the timer data.
+ timer = collections.namedtuple("timer", ["total_time"])
+
+ return timer(float(match.group("total_time")))
+
+ '''
+ Return the elapsed time in seconds.
+
+ @param timer - Namedtuple that contains the timer data.
+ @return Elapsed time in seconds.
+ '''
+ def GetTime(self, timer):
+ time = timer.total_time
+ return time
Added: mlpack/conf/jenkins-conf/benchmark/methods/shogun/src/kmeans.cpp
==============================================================================
--- (empty file)
+++ mlpack/conf/jenkins-conf/benchmark/methods/shogun/src/kmeans.cpp Thu Jul 18 11:53:34 2013
@@ -0,0 +1,112 @@
+/**
+ * @file kmeans.hpp
+ * @author Ryan Curtin
+ *
+ * K-Means (with initial centroids) Clustering with shogun.
+ */
+#include <stdlib.h>
+#include <stdio.h>
+#include <iostream>
+#include <iomanip>
+#include <ctime>
+
+#include <shogun/base/init.h>
+
+#define private protected
+#include <shogun/clustering/KMeans.h>
+#undef private
+#include <shogun/distance/EuclideanDistance.h>
+#include <shogun/features/DenseFeatures.h>
+#include <shogun/features/RealFileFeatures.h>
+#include <shogun/io/AsciiFile.h>
+
+using namespace shogun;
+
+// Define a new KMeans class with the opportunity to set initial centroids.
+class KMeans : public CKMeans
+{
+public:
+ KMeans(int32_t k_, CDistance* d) : CKMeans(k_, d) { }
+
+ // Overload the train_machine function, to set initial centroids.
+ virtual bool train_machine(CFeatures* data, CDenseFeatures<float64_t>* centroids)
+ {
+ ASSERT(distance);
+
+ if (data)
+ distance->init(data, data);
+
+ ASSERT(distance->get_feature_type() == F_DREAL);
+
+ CDenseFeatures<float64_t>* lhs = (CDenseFeatures<float64_t>*)
+ distance->get_lhs();
+
+ ASSERT(lhs);
+ int32_t num = lhs->get_num_vectors();
+ SG_UNREF(lhs);
+
+ Weights = SGVector<float64_t>(num);
+ for (int32_t i = 0; i < num; ++i)
+ Weights.vector[i] = 1.0;
+
+ clustknb(true, centroids->get_feature_matrix().matrix);
+
+ return true;
+ }
+};
+
+int main(int argc, char** argv)
+{
+ init_shogun_with_defaults();
+
+ // Load input dataset.
+ const char* dataset = argv[1];
+ const char* centroids = argv[2];
+ int32_t clusters = atoi(argv[3]);
+ int32_t maxIterations = atoi(argv[4]);
+
+ CAsciiFile* dfile = new CAsciiFile(dataset);
+ SGMatrix<float64_t> dmat = SGMatrix<float64_t>();
+ dmat.load(dfile);
+ SG_UNREF(dfile);
+
+ CAsciiFile* cfile = new CAsciiFile(centroids);
+ SGMatrix<float64_t> cmat = SGMatrix<float64_t>();
+ cmat.load(cfile);
+ SG_UNREF(cfile);
+
+ CDenseFeatures<float64_t>* data = new CDenseFeatures<float64_t>(dmat);
+ SG_REF(data);
+
+ CDenseFeatures<float64_t>* cent = new CDenseFeatures<float64_t>(cmat);
+ SG_REF(cent);
+
+ CEuclideanDistance* dist = new CEuclideanDistance(data, data);
+
+ timeval start;
+ start.tv_sec = 0;
+ start.tv_usec = 0;
+
+ gettimeofday(&start, NULL);
+
+ // Perform K-Means clustering.
+ KMeans k(clusters, dist);
+ k.set_max_iter(maxIterations);
+ k.train_machine(data, cent);
+
+ timeval end;
+ gettimeofday(&end, NULL);
+
+ timeval delta;
+ timersub(&end, &start, &delta);
+
+ std::cout << "[INFO ] total_time:" << delta.tv_sec << "." << std::setw(6)
+ << std::setfill('0') << delta.tv_usec << "s" << std::endl;
+
+ SG_UNREF(data);
+ SG_UNREF(cent);
+
+ exit_shogun();
+
+ return 0;
+}
\ No newline at end of file
More information about the mlpack-svn
mailing list