[mlpack-svn] r15500 - mlpack/conf/jenkins-conf/benchmark/methods/mlpack
fastlab-svn at coffeetalk-1.cc.gatech.edu
fastlab-svn at coffeetalk-1.cc.gatech.edu
Thu Jul 18 14:52:18 EDT 2013
Author: marcus
Date: Thu Jul 18 14:52:18 2013
New Revision: 15500
Log:
Add option to the mlpack K-Means benchmark script to deal with initial centroids.
Modified:
mlpack/conf/jenkins-conf/benchmark/methods/mlpack/kmeans.py
Modified: mlpack/conf/jenkins-conf/benchmark/methods/mlpack/kmeans.py
==============================================================================
--- mlpack/conf/jenkins-conf/benchmark/methods/mlpack/kmeans.py (original)
+++ mlpack/conf/jenkins-conf/benchmark/methods/mlpack/kmeans.py Thu Jul 18 14:52:18 2013
@@ -12,9 +12,9 @@
# Import the util path, this method even works if the path contains symlinks to
# modules.
cmd_subfolder = os.path.realpath(os.path.abspath(os.path.join(
- os.path.split(inspect.getfile(inspect.currentframe()))[0], "../../util")))
+ os.path.split(inspect.getfile(inspect.currentframe()))[0], "../../util")))
if cmd_subfolder not in sys.path:
- sys.path.insert(0, cmd_subfolder)
+ sys.path.insert(0, cmd_subfolder)
from log import *
@@ -28,113 +28,117 @@
'''
class KMEANS(object):
- '''
- Create the K-Means Clustering benchmark instance, show some informations and
- return the instance.
+ '''
+ Create the K-Means Clustering benchmark instance, show some informations and
+ return the instance.
- @param dataset - Input dataset to perform PCA on.
+ @param dataset - Input dataset to perform K-Means on.
@param path - Path to the mlpack executable.
@param verbose - Display informational messages.
- '''
- def __init__(self, dataset, path=os.environ["MLPACK_BIN"], verbose=True):
- self.verbose = verbose
- self.dataset = dataset
- self.path = path
-
- # Get description from executable.
- cmd = shlex.split(self.path + "kmeans -h")
- try:
- s = subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=False)
- except Exception, e:
- Log.Fatal("Could not execute command: " + str(cmd))
- return -1
-
- # Use regular expression pattern to get the description.
- pattern = re.compile(r"""(.*?)Required.*?options:""",
- re.VERBOSE|re.MULTILINE|re.DOTALL)
-
- match = pattern.match(s)
- if not match:
- Log.Warn("Can't parse description", self.verbose)
- description = ""
- else:
- description = match.group(1)
-
- self.description = description
-
- '''
- Destructor to clean up at the end. Use this method to remove created files.
- '''
- def __del__(self):
- Log.Info("Clean up.", self.verbose)
- filelist = ["gmon.out", "output.csv"]
- for f in filelist:
- if os.path.isfile(f):
- os.remove(f)
+ '''
+ def __init__(self, dataset, path=os.environ["MLPACK_BIN"], verbose=True):
+ self.verbose = verbose
+ self.dataset = dataset
+ self.path = path
+
+ # Get description from executable.
+ cmd = shlex.split(self.path + "kmeans -h")
+ try:
+ s = subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=False)
+ except Exception, e:
+ Log.Fatal("Could not execute command: " + str(cmd))
+ return -1
+
+ # Use regular expression pattern to get the description.
+ pattern = re.compile(r"""(.*?)Required.*?options:""",
+ re.VERBOSE|re.MULTILINE|re.DOTALL)
+
+ match = pattern.match(s)
+ if not match:
+ Log.Warn("Can't parse description", self.verbose)
+ description = ""
+ else:
+ description = match.group(1)
+
+ self.description = description
+
+ '''
+ Destructor to clean up at the end. Use this method to remove created files.
+ '''
+ def __del__(self):
+ Log.Info("Clean up.", self.verbose)
+ filelist = ["gmon.out", "output.csv"]
+ for f in filelist:
+ if os.path.isfile(f):
+ os.remove(f)
- '''
+ '''
Perform K-Means Clustering. If the method has been successfully completed
return the elapsed time in seconds.
@param options - Extra options for the method.
@return - Elapsed time in seconds or -1 if the method was not successful.
'''
- def RunMethod(self, options):
- Log.Info("Perform K-Means Clustering.", self.verbose)
+ def RunMethod(self, options):
+ Log.Info("Perform K-Means Clustering.", self.verbose)
+
+ # If the dataset contains two files then the second file is the centroids
+ # file. In this case we run the the kmeans executable.
+ if len(self.dataset) == 2:
+ cmd = shlex.split(self.path + "kmeans -i " + self.dataset[0] + " -I "
+ + self.dataset[1] + " -o output.csv -v " + options)
+ else:
+ cmd = shlex.split(self.path + "kmeans -i " + self.dataset +
+ " -o output.csv -v " + options)
+
+ # Run command with the nessecary arguments and return its output as a byte
+ # string. We have untrusted input so we disables all shell based features.
+ try:
+ s = subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=False)
+ except Exception, e:
+ Log.Fatal("Could not execute command: " + str(cmd))
+ return -1
+
+ # Return the elapsed time.
+ timer = self.parseTimer(s)
+ if not timer:
+ Log.Fatal("Can't parse the timer")
+ return -1
+ else:
+ time = self.GetTime(timer)
+ Log.Info(("total time: %fs" % (time)), self.verbose)
+
+ return time
+
+ '''
+ Parse the timer data form a given string.
+
+ @param data - String to parse timer data from.
+ @return - Namedtuple that contains the timer data.
+ '''
+ def parseTimer(self, data):
+ # Compile the regular expression pattern into a regular expression object to
+ # parse the timer data.
+ pattern = re.compile(r"""
+ .*?[INFO ] clustering: (?P<clustering>.*?)s.*?
+ """, re.VERBOSE|re.MULTILINE|re.DOTALL)
+
+ match = pattern.match(data)
+ if not match:
+ Log.Fatal("Can't parse the data: wrong format")
+ return -1
+ else:
+ # Create a namedtuple and return the timer data.
+ timer = collections.namedtuple("timer", ["clustering"])
- # Split the command using shell-like syntax.
- cmd = shlex.split(self.path + "kmeans -i " + self.dataset +
- " -o output.csv -v " + options)
-
- # Run command with the nessecary arguments and return its output as a byte
- # string. We have untrusted input so we disables all shell based features.
- try:
- s = subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=False)
- except Exception, e:
- Log.Fatal("Could not execute command: " + str(cmd))
- return -1
-
- # Return the elapsed time.
- timer = self.parseTimer(s)
- if not timer:
- Log.Fatal("Can't parse the timer")
- return -1
- else:
- time = self.GetTime(timer)
- Log.Info(("total time: %fs" % (time)), self.verbose)
-
- return time
-
- '''
- Parse the timer data form a given string.
-
- @param data - String to parse timer data from.
- @return - Namedtuple that contains the timer data.
- '''
- def parseTimer(self, data):
- # Compile the regular expression pattern into a regular expression object to
- # parse the timer data.
- pattern = re.compile(r"""
- .*?[INFO ] clustering: (?P<clustering>.*?)s.*?
- """, re.VERBOSE|re.MULTILINE|re.DOTALL)
-
- match = pattern.match(data)
- if not match:
- Log.Fatal("Can't parse the data: wrong format")
- return -1
- else:
- # Create a namedtuple and return the timer data.
- timer = collections.namedtuple("timer", ["clustering"])
-
- return timer(float(match.group("clustering")))
-
- '''
- Return the elapsed time in seconds.
-
- @param timer - Namedtuple that contains the timer data.
- @return Elapsed time in seconds.
- '''
- def GetTime(self, timer):
- time = timer.clustering
- return time
-
\ No newline at end of file
+ return timer(float(match.group("clustering")))
+
+ '''
+ Return the elapsed time in seconds.
+
+ @param timer - Namedtuple that contains the timer data.
+ @return Elapsed time in seconds.
+ '''
+ def GetTime(self, timer):
+ time = timer.clustering
+ return time
More information about the mlpack-svn
mailing list