[mlpack-svn] r15421 - in mlpack/conf/jenkins-conf/benchmark/methods: mlpack scikit

Fri Jul 5 13:03:36 EDT 2013

Author: marcus
Date: Fri Jul  5 13:03:36 2013
New Revision: 15421

Log:
Add scikit K-Means benchmark script.

Added:
   mlpack/conf/jenkins-conf/benchmark/methods/scikit/kmeans.py
Modified:
   mlpack/conf/jenkins-conf/benchmark/methods/mlpack/kmeans.py

Modified: mlpack/conf/jenkins-conf/benchmark/methods/mlpack/kmeans.py
==============================================================================

--- mlpack/conf/jenkins-conf/benchmark/methods/mlpack/kmeans.py	(original)
+++ mlpack/conf/jenkins-conf/benchmark/methods/mlpack/kmeans.py	Fri Jul  5 13:03:36 2013
@@ -115,9 +115,7 @@
 		# Compile the regular expression pattern into a regular expression object to
 		# parse the timer data.
 		pattern = re.compile(r"""
-				.*?loading_data: (?P<loading_time>.*?)s.*?
-				.*?saving_data: (?P<saving_time>.*?)s.*?
-				.*?total_time: (?P<total_time>.*?)s.*?
+				.*?[INFO ]   clustering: (?P<clustering>.*?)s.*?
 				""", re.VERBOSE|re.MULTILINE|re.DOTALL)
 		
 		match = pattern.match(data)
@@ -126,12 +124,9 @@
 			return -1
 		else:
 			# Create a namedtuple and return the timer data.
-			timer = collections.namedtuple("timer", ["loading_time", "saving_time", 
-					"total_time"])
+			timer = collections.namedtuple("timer", ["clustering"])
 
-			return timer(float(match.group("loading_time")),
-					float(match.group("saving_time")),
-					float(match.group("total_time")))
+			return timer(float(match.group("clustering")))
 
 	'''
 	Return the elapsed time in seconds.
@@ -140,6 +135,6 @@
 	@return Elapsed time in seconds.
 	'''
 	def GetTime(self, timer):
-		time = timer.total_time - timer.loading_time - timer.saving_time
+		time = timer.clustering
 		return time
 		
\ No newline at end of file

Added: mlpack/conf/jenkins-conf/benchmark/methods/scikit/kmeans.py
==============================================================================
--- (empty file)
+++ mlpack/conf/jenkins-conf/benchmark/methods/scikit/kmeans.py	Fri Jul  5 13:03:36 2013
@@ -0,0 +1,111 @@
+'''
+  @file kmeans.py
+  @author Marcus Edel
+
+  K-Means Clustering with scikit.
+'''
+
+import os
+import sys
+import inspect
+
+# Add the util directory to the import path; this approach even works if the
+# path contains symlinks to modules.
+cmd_subfolder = os.path.realpath(os.path.abspath(os.path.join(
+  os.path.split(inspect.getfile(inspect.currentframe()))[0], "../../util")))
+if cmd_subfolder not in sys.path:
+  sys.path.insert(0, cmd_subfolder)
+
+from log import *
+from timer import *
+
+import numpy as np
+from sklearn.cluster import KMeans
+
+'''
+This class implements the K-Means Clustering benchmark.
+'''
+class KMEANS(object):
+
+  ''' 
+  Create the K-Means Clustering benchmark instance.
+  
+  @param dataset - Input dataset to perform K-Means on.
+  @param verbose - Display informational messages.
+  '''
+  def __init__(self, dataset, verbose=True): 
+    self.verbose = verbose
+    self.dataset = dataset
+
+  '''
+  Destructor to clean up at the end.
+  '''
+  def __del__(self):
+    pass
+
+  '''
+  Use the scikit library to implement K-Means Clustering.
+
+  @param options - Extra options for the method.
+  @return - Elapsed time in seconds or -1 if the method was not successful.
+  '''
+  def KMeansScikit(self, options):
+    totalTimer = Timer()
+
+    # Load input dataset.
+    # If the dataset contains two files then the second file is the centroids
+    # file. In this case it is loaded and used as the initial centroids.
+    Log.Info("Loading dataset", self.verbose)
+    if len(self.dataset) == 2:
+      data = np.genfromtxt(self.dataset[0], delimiter=',')
+      centroids = np.genfromtxt(self.dataset[1], delimiter=',')
+    else:
+      data = np.genfromtxt(self.dataset, delimiter=',')
+
+    # Gather parameters.
+    clusters = re.search("-c (\d+)", options)
+    maxIterations = re.search("-m (\d+)", options)
+    seed = re.search("-s (\d+)", options)
+
+    # Now do validation of options.
+    if not clusters and len(self.dataset) != 2:
+      Log.Fatal("Required option: Number of clusters or cluster locations.")
+      return -1
+    elif (not clusters or clusters.group(1) < 1) and len(self.dataset) != 2:
+      Log.Fatal("Invalid number of clusters requested! Must be greater than or "
+          + "equal to 1.")
+      return -1
+
+    if not maxIterations:
+      m = 1000
+    else:
+      m = maxIterations.group(1)
+
+    # Create the KMeans object and perform K-Means clustering.
+    with totalTimer:
+      if len(self.dataset) == 2:
+        kmeans = KMeans(k=centroids.shape[1], init=centroids, n_init=1, 
+            max_iter=m)
+      elif seed:
+        kmeans = KMeans(k=int(clusters.group(1)), init='random', n_init=1, 
+            max_iter=m, random_state=int(seed.group(1)))
+      else:
+        kmeans = KMeans(k=int(clusters.group(1)), n_init=1, max_iter=m)      
+
+      kmeans.fit(data)
+      labels = kmeans.labels_
+      centers = kmeans.cluster_centers_
+
+    return totalTimer.ElapsedTime()
+
+  '''
+  Perform K-Means Clustering. If the method completes successfully, return
+  the elapsed time in seconds.
+
+  @param options - Extra options for the method.
+  @return - Elapsed time in seconds or -1 if the method was not successful.
+  '''
+  def RunMethod(self, options):
+    Log.Info("Perform K-Means.", self.verbose)
+
+    return self.KMeansScikit(options)