[mlpack-svn] r15459 - mlpack/conf/jenkins-conf/benchmark/methods/weka
fastlab-svn at coffeetalk-1.cc.gatech.edu
fastlab-svn at coffeetalk-1.cc.gatech.edu
Fri Jul 12 09:38:58 EDT 2013
Author: marcus
Date: Fri Jul 12 09:38:58 2013
New Revision: 15459
Log:
Add weka benchmark script for allknn, kmeans, nbc and pca.
Added:
mlpack/conf/jenkins-conf/benchmark/methods/weka/allknn.py
mlpack/conf/jenkins-conf/benchmark/methods/weka/kmeans.py
mlpack/conf/jenkins-conf/benchmark/methods/weka/nbc.py
mlpack/conf/jenkins-conf/benchmark/methods/weka/pca.py
Added: mlpack/conf/jenkins-conf/benchmark/methods/weka/allknn.py
==============================================================================
--- (empty file)
+++ mlpack/conf/jenkins-conf/benchmark/methods/weka/allknn.py Fri Jul 12 09:38:58 2013
@@ -0,0 +1,124 @@
+'''
+ @file allknn.py
+ @author Marcus Edel
+
+ Class to benchmark the weka All K-Nearest-Neighbors method.
+'''
+
+import os
+import sys
+import inspect
+
+# Import the util path, this method even works if the path contains symlinks to
+# modules.
+cmd_subfolder = os.path.realpath(os.path.abspath(os.path.join(
+ os.path.split(inspect.getfile(inspect.currentframe()))[0], "../../util")))
+if cmd_subfolder not in sys.path:
+ sys.path.insert(0, cmd_subfolder)
+
+from log import *
+from profiler import *
+
+import shlex
+import subprocess
+import re
+import collections
+
+'''
+This class implements the All K-Nearest-Neighbors benchmark.
+'''
class ALLKNN(object):
    '''
    Benchmark wrapper for the weka All K-Nearest-Neighbors method.

    The method is executed as an external ``java ... AllKnn`` process and the
    elapsed time is read from the ``total_time:`` entry of its output.
    '''

    # Record type returned by parseTimer().
    _Timer = collections.namedtuple("timer", ["total_time"])

    # Finds "total_time:<value>s" anywhere in the output; the value is captured
    # lazily up to the first following "s".
    _TIMER_PATTERN = re.compile(r"total_time:(?P<total_time>.*?)s", re.DOTALL)

    def __init__(self, dataset, path=None, verbose=True):
        '''
        Create the All K-Nearest-Neighbors benchmark instance.

        @param dataset - Input dataset to perform ALLKNN on. Either a single file
        name, or a list/tuple whose first entry is the reference file and whose
        optional second entry is the query file.
        @param path - Java classpath used to start the method. Defaults to the
        value of the WEKA_CLASSPATH environment variable, which is read when the
        instance is created (KeyError if it is not set).
        @param verbose - Display informational messages.
        '''
        # Resolve the default lazily so that importing this module does not
        # require WEKA_CLASSPATH when the caller passes an explicit path.
        if path is None:
            path = os.environ["WEKA_CLASSPATH"]

        self.verbose = verbose
        self.dataset = dataset
        self.path = path

    def __del__(self):
        '''
        Destructor to clean up at the end. Nothing to release.
        '''
        pass

    def _BuildCommand(self, options):
        '''
        Build the argument list used to start the java process.

        @param options - Extra options for the method, as a single string.
        @return - List of command line arguments.
        '''
        if isinstance(self.dataset, (list, tuple)):
            # The first file is the reference set; if exactly two files are
            # given the second one is the query set.
            inputCmd = "-r " + self.dataset[0]
            if len(self.dataset) == 2:
                inputCmd += " -q " + self.dataset[1]
        else:
            inputCmd = "-r " + self.dataset

        # The options are appended exactly once. Split the command using
        # shell-like syntax.
        return shlex.split("java -classpath " + self.path + ":methods/weka" +
                           " AllKnn " + inputCmd + " " + options)

    def RunMethod(self, options):
        '''
        Run All K-Nearest-Neighbors. If the method has been successfully
        completed return the elapsed time in seconds.

        @param options - Extra options for the method.
        @return - Elapsed time in seconds or -1 if the method was not successful.
        '''
        Log.Info("Perform ALLKNN.", self.verbose)

        if not self.dataset:
            Log.Fatal("The method needs at least one dataset.")
            return -1

        cmd = self._BuildCommand(options)

        # Run the command with the necessary arguments and return its output as
        # a byte string. The input is untrusted, so all shell based features
        # are disabled.
        try:
            s = subprocess.check_output(cmd, stderr=subprocess.STDOUT,
                                        shell=False)
        except Exception:
            Log.Fatal("Could not execute command: " + str(cmd))
            return -1

        # parseTimer() signals failure with -1, which is truthy, so test for
        # the record type instead of relying on truthiness.
        timer = self.parseTimer(s)
        if not isinstance(timer, tuple):
            Log.Fatal("Can't parse the timer")
            return -1

        elapsed = self.GetTime(timer)
        Log.Info(("total time: %fs" % elapsed), self.verbose)
        return elapsed

    def parseTimer(self, data):
        '''
        Parse the timer data from a given string.

        @param data - String (or byte string) to parse timer data from.
        @return - Namedtuple that contains the timer data, or -1 if the data
        could not be parsed.
        '''
        # subprocess.check_output() returns bytes; the pattern works on text.
        if isinstance(data, bytes):
            data = data.decode("utf-8", "replace")

        match = self._TIMER_PATTERN.search(data)
        if not match:
            Log.Fatal("Can't parse the data: wrong format")
            return -1

        # Some locales print a decimal comma instead of a decimal point.
        try:
            total = float(match.group("total_time").replace(",", "."))
        except ValueError:
            Log.Fatal("Can't parse the data: wrong format")
            return -1

        return self._Timer(total)

    def GetTime(self, timer):
        '''
        Return the elapsed time in seconds.

        @param timer - Namedtuple that contains the timer data.
        @return Elapsed time in seconds.
        '''
        return timer.total_time
Added: mlpack/conf/jenkins-conf/benchmark/methods/weka/kmeans.py
==============================================================================
--- (empty file)
+++ mlpack/conf/jenkins-conf/benchmark/methods/weka/kmeans.py Fri Jul 12 09:38:58 2013
@@ -0,0 +1,117 @@
+'''
+ @file kmeans.py
+ @author Marcus Edel
+
+ Class to benchmark the weka K-Means Clustering method.
+'''
+
+import os
+import sys
+import inspect
+
+# Import the util path, this method even works if the path contains symlinks to
+# modules.
+cmd_subfolder = os.path.realpath(os.path.abspath(os.path.join(
+ os.path.split(inspect.getfile(inspect.currentframe()))[0], "../../util")))
+if cmd_subfolder not in sys.path:
+ sys.path.insert(0, cmd_subfolder)
+
+from log import *
+from profiler import *
+
+import shlex
+import subprocess
+import re
+import collections
+
+'''
+This class implements the K-Means Clustering benchmark.
+'''
class KMEANS(object):
    '''
    Benchmark wrapper for the weka K-Means Clustering method.

    The method is executed as an external ``java ... KMeans`` process and the
    elapsed time is read from the ``total_time:`` entry of its output.
    '''

    # Record type returned by parseTimer().
    _Timer = collections.namedtuple("timer", ["total_time"])

    # Finds "total_time:<value>s" anywhere in the output; the value is captured
    # lazily up to the first following "s".
    _TIMER_PATTERN = re.compile(r"total_time:(?P<total_time>.*?)s", re.DOTALL)

    def __init__(self, dataset, path=None, verbose=True):
        '''
        Create the K-Means Clustering benchmark instance.

        @param dataset - Input dataset to perform K-Means on.
        @param path - Java classpath used to start the method. Defaults to the
        value of the WEKA_CLASSPATH environment variable, which is read when the
        instance is created (KeyError if it is not set).
        @param verbose - Display informational messages.
        '''
        # Resolve the default lazily so that importing this module does not
        # require WEKA_CLASSPATH when the caller passes an explicit path.
        if path is None:
            path = os.environ["WEKA_CLASSPATH"]

        self.verbose = verbose
        self.dataset = dataset
        self.path = path

    def __del__(self):
        '''
        Destructor to clean up at the end. Nothing to release.
        '''
        pass

    def RunMethod(self, options):
        '''
        Run K-Means Clustering. If the method has been successfully completed
        return the elapsed time in seconds.

        @param options - Extra options for the method.
        @return - Elapsed time in seconds or -1 if the method was not successful.
        '''
        Log.Info("Perform K-Means.", self.verbose)

        # Split the command using shell-like syntax.
        cmd = shlex.split("java -classpath " + self.path + ":methods/weka" +
                          " KMeans -i " + self.dataset + " " + options)

        # Run the command with the necessary arguments and return its output as
        # a byte string. The input is untrusted, so all shell based features
        # are disabled.
        try:
            s = subprocess.check_output(cmd, stderr=subprocess.STDOUT,
                                        shell=False)
        except Exception:
            Log.Fatal("Could not execute command: " + str(cmd))
            return -1

        # parseTimer() signals failure with -1, which is truthy, so test for
        # the record type instead of relying on truthiness.
        timer = self.parseTimer(s)
        if not isinstance(timer, tuple):
            Log.Fatal("Can't parse the timer")
            return -1

        elapsed = self.GetTime(timer)
        Log.Info(("total time: %fs" % elapsed), self.verbose)
        return elapsed

    def parseTimer(self, data):
        '''
        Parse the timer data from a given string.

        @param data - String (or byte string) to parse timer data from.
        @return - Namedtuple that contains the timer data, or -1 if the data
        could not be parsed.
        '''
        # subprocess.check_output() returns bytes; the pattern works on text.
        if isinstance(data, bytes):
            data = data.decode("utf-8", "replace")

        match = self._TIMER_PATTERN.search(data)
        if not match:
            Log.Fatal("Can't parse the data: wrong format")
            return -1

        # Some locales print a decimal comma instead of a decimal point.
        try:
            total = float(match.group("total_time").replace(",", "."))
        except ValueError:
            Log.Fatal("Can't parse the data: wrong format")
            return -1

        return self._Timer(total)

    def GetTime(self, timer):
        '''
        Return the elapsed time in seconds.

        @param timer - Namedtuple that contains the timer data.
        @return Elapsed time in seconds.
        '''
        return timer.total_time
Added: mlpack/conf/jenkins-conf/benchmark/methods/weka/nbc.py
==============================================================================
--- (empty file)
+++ mlpack/conf/jenkins-conf/benchmark/methods/weka/nbc.py Fri Jul 12 09:38:58 2013
@@ -0,0 +1,121 @@
+'''
+ @file nbc.py
+ @author Marcus Edel
+
+ Class to benchmark the weka Naive Bayes Classifier method.
+'''
+
+import os
+import sys
+import inspect
+
+# Import the util path, this method even works if the path contains symlinks to
+# modules.
+cmd_subfolder = os.path.realpath(os.path.abspath(os.path.join(
+ os.path.split(inspect.getfile(inspect.currentframe()))[0], "../../util")))
+if cmd_subfolder not in sys.path:
+ sys.path.insert(0, cmd_subfolder)
+
+from log import *
+from profiler import *
+
+import shlex
+import subprocess
+import re
+import collections
+
+'''
+This class implements the Naive Bayes Classifier benchmark.
+'''
class NBC(object):
    '''
    Benchmark wrapper for the weka Naive Bayes Classifier method.

    The method is executed as an external ``java ... NBC`` process and the
    elapsed time is read from the ``total_time:`` entry of its output.
    '''

    # Record type returned by parseTimer().
    _Timer = collections.namedtuple("timer", ["total_time"])

    # Finds "total_time:<value>s" anywhere in the output; the value is captured
    # lazily up to the first following "s".
    _TIMER_PATTERN = re.compile(r"total_time:(?P<total_time>.*?)s", re.DOTALL)

    def __init__(self, dataset, path=None, verbose=True):
        '''
        Create the Naive Bayes Classifier benchmark instance.

        @param dataset - Input datasets to perform NBC on: a sequence whose first
        entry is passed as -t and whose second entry is passed as -T.
        @param path - Java classpath used to start the method. Defaults to the
        value of the WEKA_CLASSPATH environment variable, which is read when the
        instance is created (KeyError if it is not set).
        @param verbose - Display informational messages.
        '''
        # Resolve the default lazily so that importing this module does not
        # require WEKA_CLASSPATH when the caller passes an explicit path.
        if path is None:
            path = os.environ["WEKA_CLASSPATH"]

        self.verbose = verbose
        self.dataset = dataset
        self.path = path

    def __del__(self):
        '''
        Destructor to clean up at the end. Nothing to release.
        '''
        pass

    def RunMethod(self, options):
        '''
        Run the Naive Bayes Classifier. If the method has been successfully
        completed return the elapsed time in seconds.

        @param options - Extra options for the method.
        @return - Elapsed time in seconds or -1 if the method was not successful.
        '''
        Log.Info("Perform NBC.", self.verbose)

        # A plain string has a length too, but indexing it would yield single
        # characters instead of file names, so reject it explicitly.
        if isinstance(self.dataset, str) or len(self.dataset) < 2:
            Log.Fatal("The method need two datasets.")
            return -1

        # Split the command using shell-like syntax.
        cmd = shlex.split("java -classpath " + self.path + ":methods/weka" +
                          " NBC -t " + self.dataset[0] + " -T " +
                          self.dataset[1] + " " + options)

        # Run the command with the necessary arguments and return its output as
        # a byte string. The input is untrusted, so all shell based features
        # are disabled.
        try:
            s = subprocess.check_output(cmd, stderr=subprocess.STDOUT,
                                        shell=False)
        except Exception:
            Log.Fatal("Could not execute command: " + str(cmd))
            return -1

        # parseTimer() signals failure with -1, which is truthy, so test for
        # the record type instead of relying on truthiness.
        timer = self.parseTimer(s)
        if not isinstance(timer, tuple):
            Log.Fatal("Can't parse the timer")
            return -1

        elapsed = self.GetTime(timer)
        Log.Info(("total time: %fs" % elapsed), self.verbose)
        return elapsed

    def parseTimer(self, data):
        '''
        Parse the timer data from a given string.

        @param data - String (or byte string) to parse timer data from.
        @return - Namedtuple that contains the timer data, or -1 if the data
        could not be parsed.
        '''
        # subprocess.check_output() returns bytes; the pattern works on text.
        if isinstance(data, bytes):
            data = data.decode("utf-8", "replace")

        match = self._TIMER_PATTERN.search(data)
        if not match:
            Log.Fatal("Can't parse the data: wrong format")
            return -1

        # Some locales print a decimal comma instead of a decimal point.
        try:
            total = float(match.group("total_time").replace(",", "."))
        except ValueError:
            Log.Fatal("Can't parse the data: wrong format")
            return -1

        return self._Timer(total)

    def GetTime(self, timer):
        '''
        Return the elapsed time in seconds.

        @param timer - Namedtuple that contains the timer data.
        @return Elapsed time in seconds.
        '''
        return timer.total_time
Added: mlpack/conf/jenkins-conf/benchmark/methods/weka/pca.py
==============================================================================
--- (empty file)
+++ mlpack/conf/jenkins-conf/benchmark/methods/weka/pca.py Fri Jul 12 09:38:58 2013
@@ -0,0 +1,120 @@
+'''
+ @file pca.py
+ @author Marcus Edel
+
+ Class to benchmark the weka Principal Components Analysis method.
+'''
+
+import os
+import sys
+import inspect
+
+# Import the util path, this method even works if the path contains symlinks to
+# modules.
+cmd_subfolder = os.path.realpath(os.path.abspath(os.path.join(
+ os.path.split(inspect.getfile(inspect.currentframe()))[0], "../../util")))
+if cmd_subfolder not in sys.path:
+ sys.path.insert(0, cmd_subfolder)
+
+from log import *
+from profiler import *
+
+import shlex
+import subprocess
+import re
+import collections
+
+'''
+This class implements the Principal Components Analysis benchmark.
+'''
class PCA(object):
    '''
    Benchmark wrapper for the weka Principal Components Analysis method.

    The method is executed as an external ``java ... PCA`` process. The
    reported time is the ``total_time:`` entry of its output minus the
    ``loading_data:`` entry.
    '''

    # Record type returned by parseTimer().
    _Timer = collections.namedtuple("timer", ["loading_time", "total_time"])

    # Finds "loading_data:<value>s" followed later by "total_time:<value>s";
    # each value is captured lazily up to the first following "s".
    _TIMER_PATTERN = re.compile(
        r"loading_data:(?P<loading_time>.*?)s.*?"
        r"total_time:(?P<total_time>.*?)s", re.DOTALL)

    def __init__(self, dataset, path=None, verbose=True):
        '''
        Create the Principal Components Analysis benchmark instance.

        @param dataset - Input dataset to perform PCA on.
        @param path - Java classpath used to start the method. Defaults to the
        value of the WEKA_CLASSPATH environment variable, which is read when the
        instance is created (KeyError if it is not set).
        @param verbose - Display informational messages.
        '''
        # Resolve the default lazily so that importing this module does not
        # require WEKA_CLASSPATH when the caller passes an explicit path.
        if path is None:
            path = os.environ["WEKA_CLASSPATH"]

        self.verbose = verbose
        self.dataset = dataset
        self.path = path

    def __del__(self):
        '''
        Destructor to clean up at the end. Nothing to release.
        '''
        pass

    def RunMethod(self, options):
        '''
        Perform Principal Components Analysis. If the method has been
        successfully completed return the elapsed time in seconds.

        @param options - Extra options for the method.
        @return - Elapsed time in seconds or -1 if the method was not successful.
        '''
        Log.Info("Perform PCA.", self.verbose)

        # Split the command using shell-like syntax.
        cmd = shlex.split("java -classpath " + self.path + ":methods/weka" +
                          " PCA -i " + self.dataset + " " + options)

        # Run the command with the necessary arguments and return its output as
        # a byte string. The input is untrusted, so all shell based features
        # are disabled.
        try:
            s = subprocess.check_output(cmd, stderr=subprocess.STDOUT,
                                        shell=False)
        except Exception:
            Log.Fatal("Could not execute command: " + str(cmd))
            return -1

        # parseTimer() signals failure with -1, which is truthy, so test for
        # the record type instead of relying on truthiness.
        timer = self.parseTimer(s)
        if not isinstance(timer, tuple):
            Log.Fatal("Can't parse the timer")
            return -1

        elapsed = self.GetTime(timer)
        Log.Info(("total time: %fs" % elapsed), self.verbose)
        return elapsed

    def parseTimer(self, data):
        '''
        Parse the timer data from a given string.

        @param data - String (or byte string) to parse timer data from.
        @return - Namedtuple that contains the timer data, or -1 if the data
        could not be parsed.
        '''
        # subprocess.check_output() returns bytes; the pattern works on text.
        if isinstance(data, bytes):
            data = data.decode("utf-8", "replace")

        match = self._TIMER_PATTERN.search(data)
        if not match:
            Log.Fatal("Can't parse the data: wrong format")
            return -1

        # Some locales print a decimal comma instead of a decimal point. Each
        # value is normalised on its own rather than inferring the format of
        # both from the loading time.
        try:
            loading = float(match.group("loading_time").replace(",", "."))
            total = float(match.group("total_time").replace(",", "."))
        except ValueError:
            Log.Fatal("Can't parse the data: wrong format")
            return -1

        return self._Timer(loading, total)

    def GetTime(self, timer):
        '''
        Return the elapsed time in seconds, excluding the data loading time.

        @param timer - Namedtuple that contains the timer data.
        @return Elapsed time in seconds.
        '''
        return timer.total_time - timer.loading_time
More information about the mlpack-svn
mailing list