[mlpack-svn] r15421 - in mlpack/conf/jenkins-conf/benchmark/methods: mlpack scikit
fastlab-svn at coffeetalk-1.cc.gatech.edu
fastlab-svn at coffeetalk-1.cc.gatech.edu
Fri Jul 5 13:03:36 EDT 2013
Author: marcus
Date: Fri Jul 5 13:03:36 2013
New Revision: 15421
Log:
Add scikit K-Means benchmark script.
Added:
mlpack/conf/jenkins-conf/benchmark/methods/scikit/kmeans.py
Modified:
mlpack/conf/jenkins-conf/benchmark/methods/mlpack/kmeans.py
Modified: mlpack/conf/jenkins-conf/benchmark/methods/mlpack/kmeans.py
==============================================================================
--- mlpack/conf/jenkins-conf/benchmark/methods/mlpack/kmeans.py (original)
+++ mlpack/conf/jenkins-conf/benchmark/methods/mlpack/kmeans.py Fri Jul 5 13:03:36 2013
@@ -115,9 +115,7 @@
# Compile the regular expression pattern into a regular expression object to
# parse the timer data.
pattern = re.compile(r"""
- .*?loading_data: (?P<loading_time>.*?)s.*?
- .*?saving_data: (?P<saving_time>.*?)s.*?
- .*?total_time: (?P<total_time>.*?)s.*?
+ .*?[INFO ] clustering: (?P<clustering>.*?)s.*?
""", re.VERBOSE|re.MULTILINE|re.DOTALL)
match = pattern.match(data)
@@ -126,12 +124,9 @@
return -1
else:
# Create a namedtuple and return the timer data.
- timer = collections.namedtuple("timer", ["loading_time", "saving_time",
- "total_time"])
+ timer = collections.namedtuple("timer", ["clustering"])
- return timer(float(match.group("loading_time")),
- float(match.group("saving_time")),
- float(match.group("total_time")))
+ return timer(float(match.group("clustering")))
'''
Return the elapsed time in seconds.
@@ -140,6 +135,6 @@
@return Elapsed time in seconds.
'''
def GetTime(self, timer):
- time = timer.total_time - timer.loading_time - timer.saving_time
+ time = timer.clustering
return time
\ No newline at end of file
Added: mlpack/conf/jenkins-conf/benchmark/methods/scikit/kmeans.py
==============================================================================
--- (empty file)
+++ mlpack/conf/jenkins-conf/benchmark/methods/scikit/kmeans.py Fri Jul 5 13:03:36 2013
@@ -0,0 +1,111 @@
+'''
+ @file kmeans.py
+ @author Marcus Edel
+
+ K-Means Clustering with scikit.
+'''
+
+import os
+import sys
+import inspect
+
+# Import the util path, this method even works if the path contains symlinks to
+# modules.
+cmd_subfolder = os.path.realpath(os.path.abspath(os.path.join(
+ os.path.split(inspect.getfile(inspect.currentframe()))[0], "../../util")))
+if cmd_subfolder not in sys.path:
+ sys.path.insert(0, cmd_subfolder)
+
+from log import *
+from timer import *
+
+import numpy as np
+from sklearn.cluster import KMeans
+
+'''
+This class implements the K-Means Clustering benchmark.
+'''
+class KMEANS(object):
+
+ '''
+ Create the K-Means Clustering benchmark instance.
+
+ @param dataset - Input dataset to perform K-Means on.
+ @param verbose - Display informational messages.
+ '''
+ def __init__(self, dataset, verbose=True):
+ self.verbose = verbose
+ self.dataset = dataset
+
+ '''
+ Destructor to clean up at the end.
+ '''
+ def __del__(self):
+ pass
+
+ '''
+ Use the scikit library to implement K-Means Clustering.
+
+ @param options - Extra options for the method.
+ @return - Elapsed time in seconds or -1 if the method was not successful.
+ '''
+ def KMeansScikit(self, options):
+ totalTimer = Timer()
+
+ # Load input dataset.
+ # If the dataset contains two files then the second file is the centroids
+ # file. In this case we use it as the initial cluster centers.
+ Log.Info("Loading dataset", self.verbose)
+ if len(self.dataset) == 2:
+ data = np.genfromtxt(self.dataset[0], delimiter=',')
+ centroids = np.genfromtxt(self.dataset[1], delimiter=',')
+ else:
+ data = np.genfromtxt(self.dataset, delimiter=',')
+
+ # Gather parameters.
+ clusters = re.search("-c (\d+)", options)
+ maxIterations = re.search("-m (\d+)", options)
+ seed = re.search("-s (\d+)", options)
+
+ # Now do validation of options.
+ if not clusters and len(self.dataset) != 2:
+ Log.Fatal("Required option: Number of clusters or cluster locations.")
+ return -1
+ elif (not clusters or clusters.group(1) < 1) and len(self.dataset) != 2:
+ Log.Fatal("Invalid number of clusters requested! Must be greater than or "
+ + "equal to 1.")
+ return -1
+
+ if not maxIterations:
+ m = 1000
+ else:
+ m = maxIterations.group(1)
+
+ # Create the KMeans object and perform K-Means clustering.
+ with totalTimer:
+ if len(self.dataset) == 2:
+ kmeans = KMeans(k=centroids.shape[1], init=centroids, n_init=1,
+ max_iter=m)
+ elif seed:
+ kmeans = KMeans(k=int(clusters.group(1)), init='random', n_init=1,
+ max_iter=m, random_state=int(seed.group(1)))
+ else:
+ kmeans = KMeans(k=int(clusters.group(1)), n_init=1, max_iter=m)
+
+ kmeans.fit(data)
+ labels = kmeans.labels_
+ centers = kmeans.cluster_centers_
+
+ return totalTimer.ElapsedTime()
+
+ '''
+ Perform K-Means Clustering. If the method has been successfully completed
+ return the elapsed time in seconds.
+
+ @param options - Extra options for the method.
+ @return - Elapsed time in seconds or -1 if the method was not successful.
+ '''
+ def RunMethod(self, options):
+ Log.Info("Perform K-Means.", self.verbose)
+
+ return self.KMeansScikit(options)
More information about the mlpack-svn
mailing list