[mlpack-svn] r15339 - mlpack/conf/jenkins-conf/benchmark/methods/mlpack

Wed Jun 26 13:26:20 EDT 2013

Author: marcus
Date: Wed Jun 26 13:26:20 2013
New Revision: 15339

Log:
Add hmm_generate, hmm_loglik, hmm_train, hmm_viterbi and nca script.

Added:
   mlpack/conf/jenkins-conf/benchmark/methods/mlpack/hmm_generate.py
   mlpack/conf/jenkins-conf/benchmark/methods/mlpack/hmm_loglik.py
   mlpack/conf/jenkins-conf/benchmark/methods/mlpack/hmm_train.py
   mlpack/conf/jenkins-conf/benchmark/methods/mlpack/hmm_viterbi.py
   mlpack/conf/jenkins-conf/benchmark/methods/mlpack/nca.py

Added: mlpack/conf/jenkins-conf/benchmark/methods/mlpack/hmm_generate.py
==============================================================================

--- (empty file)
+++ mlpack/conf/jenkins-conf/benchmark/methods/mlpack/hmm_generate.py	Wed Jun 26 13:26:20 2013
@@ -0,0 +1,112 @@
+'''
+  @file hmm_generate.py
+  @author Marcus Edel
+
+  Class to benchmark the mlpack Hidden Markov Model Sequence Generator method.
+'''
+
+import os
+import sys
+import inspect
+
+# Import the util path, this method even works if the path contains
+# symlinks to modules.
+cmd_subfolder = os.path.realpath(os.path.abspath(os.path.join(
+	os.path.split(inspect.getfile(inspect.currentframe()))[0], '../../util')))
+if cmd_subfolder not in sys.path:
+	sys.path.insert(0, cmd_subfolder)
+
+from log import *
+
+import shlex
+import subprocess
+import re
+import collections
+
+class HMMGENERATE(object):
+	'''
+	Benchmark wrapper around the mlpack hmm_generate executable: runs it and
+	reports the run time without the time spent saving data.
+	'''
+
+	# Create the Hidden Markov Model Sequence Generator instance.
+	#
+	# dataset: file name handed to hmm_generate via -m.
+	# path:    prefix put directly in front of the executable name.
+	# verbose: verbosity flag forwarded to the Log calls.
+	def __init__(self, dataset, path='/usr/local/bin/', verbose=True):
+		self.verbose = verbose
+		self.dataset = dataset
+		self.path = path
+
+		# Run the executable with -h; check_output raises CalledProcessError
+		# when the exit status is non-zero.
+		cmd = shlex.split(self.path + "hmm_generate -h")
+		s = subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=False)
+
+		# The text in front of "Required ... options:" is the description. It
+		# is not displayed at the moment, so only a failed parse is reported.
+		pattern = re.compile(r"""(.*?)Required.*?options:""",
+				re.VERBOSE|re.MULTILINE|re.DOTALL)
+		if not pattern.match(s):
+			Log.Warn("Can't parse description", self.verbose)
+
+	# Remove the files a benchmark run leaves in the working directory.
+	def __del__(self):
+		Log.Info('Clean up.', self.verbose)
+		for f in ('gmon.out', 'output.csv'):
+			if os.path.isfile(f):
+				os.remove(f)
+
+	# Run hmm_generate with the given extra command line options and return
+	# the elapsed time in seconds (0 if the timers can't be parsed).
+	def RunMethod(self, options):
+		Log.Info('Perform HMM Generate.', self.verbose)
+
+		# The command line is split here and executed without a shell, so shell
+		# metacharacters in the dataset name or the options are not interpreted.
+		cmd = shlex.split(self.path + "hmm_generate -m " + self.dataset +
+			" -v  " + options)
+		s = subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=False)
+
+		# Return the elapsed time.
+		timer = self.parseTimer(s)
+		if not timer:
+			Log.Fatal("Can't parse the timer", self.verbose)
+			return 0
+
+		time = self.GetTime(timer)
+		Log.Info(('total time: %fs' % (time)), self.verbose)
+		return time
+
+	# Extract the timers from the verbose output of the executable.
+	#
+	# Returns a namedtuple (saving_data, total_time) of floats, or False when
+	# the output does not contain both timers in that order.
+	def parseTimer(self, data):
+		pattern = re.compile(r"""
+							.*?saving_data: (?P<saving_data>.*?)s.*?
+							.*?total_time: (?P<total_time>.*?)s.*?
+							""", re.VERBOSE|re.MULTILINE|re.DOTALL)
+
+		match = pattern.match(data)
+		if not match:
+			# Written as a call so the module also parses under Python 3; with
+			# a single argument the output on Python 2 is unchanged.
+			print("Can't parse the data: wrong format")
+			return False
+
+		fields = ('saving_data', 'total_time')
+		values = [match.group(name) for name in fields]
+		# A value with exactly one "." is parsed as is; otherwise "," is taken
+		# to be the decimal separator and replaced before parsing.
+		if values[0].count(".") != 1:
+			values = [value.replace(",", ".") for value in values]
+
+		timer = collections.namedtuple('timer', fields)
+		return timer(*[float(value) for value in values])
+
+	# Return the elapsed time: the total run time without the time spent on
+	# saving the data.
+	def GetTime(self, timer):
+		return timer.total_time - timer.saving_data
\ No newline at end of file

Added: mlpack/conf/jenkins-conf/benchmark/methods/mlpack/hmm_loglik.py
==============================================================================
--- (empty file)
+++ mlpack/conf/jenkins-conf/benchmark/methods/mlpack/hmm_loglik.py	Wed Jun 26 13:26:20 2013
@@ -0,0 +1,120 @@
+'''
+  @file hmm_loglik.py
+  @author Marcus Edel
+
+  Class to benchmark the mlpack Hidden Markov Model Sequence Log-Likelihood 
+  method.
+'''
+
+import os
+import sys
+import inspect
+
+# Import the util path, this method even works if the path contains
+# symlinks to modules.
+cmd_subfolder = os.path.realpath(os.path.abspath(os.path.join(
+	os.path.split(inspect.getfile(inspect.currentframe()))[0], '../../util')))
+if cmd_subfolder not in sys.path:
+	sys.path.insert(0, cmd_subfolder)
+
+from log import *
+
+import shlex
+import subprocess
+import re
+import collections
+
+class HMMLOGLIK(object):
+	'''
+	Benchmark wrapper around the mlpack hmm_loglik executable: runs it and
+	reports the run time without the time spent loading data.
+	'''
+
+	# Create the Hidden Markov Model Sequence Log-Likelihood instance.
+	#
+	# dataset: two-element sequence; the first entry is handed to hmm_loglik
+	#          via -i, the second via -m.
+	# path:    prefix put directly in front of the executable name.
+	# verbose: verbosity flag forwarded to the Log calls.
+	def __init__(self, dataset, path='/usr/local/bin/', verbose=True):
+		self.verbose = verbose
+		self.dataset = dataset
+		self.path = path
+
+		# Run the executable with -h; check_output raises CalledProcessError
+		# when the exit status is non-zero.
+		cmd = shlex.split(self.path + "hmm_loglik -h")
+		s = subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=False)
+
+		# The text in front of "Required ... options:" is the description. It
+		# is not displayed at the moment, so only a failed parse is reported.
+		pattern = re.compile(r"""(.*?)Required.*?options:""",
+				re.VERBOSE|re.MULTILINE|re.DOTALL)
+		if not pattern.match(s):
+			Log.Warn("Can't parse description", self.verbose)
+
+	# Remove the files a benchmark run leaves in the working directory.
+	def __del__(self):
+		Log.Info('Clean up.', self.verbose)
+		for f in ('gmon.out',):
+			if os.path.isfile(f):
+				os.remove(f)
+
+	# Run hmm_loglik with the given extra command line options and return the
+	# elapsed time in seconds. Returns False when the dataset does not have
+	# exactly two entries and 0 when the timers can't be parsed.
+	def RunMethod(self, options):
+		Log.Info('Perform HMM Log-Likelihood.', self.verbose)
+
+		# Both dataset entries are needed to build the command line.
+		if len(self.dataset) != 2:
+			Log.Fatal("Not enough input datasets.")
+			return False
+
+		# The command line is split here and executed without a shell, so shell
+		# metacharacters in the file names or the options are not interpreted.
+		cmd = shlex.split(self.path + "hmm_loglik -i " + self.dataset[0] +
+			" -m " + self.dataset[1] + " -v " + options)
+		s = subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=False)
+
+		# Return the elapsed time.
+		timer = self.parseTimer(s)
+		if not timer:
+			Log.Fatal("Can't parse the timer", self.verbose)
+			return 0
+
+		time = self.GetTime(timer)
+		Log.Info(('total time: %fs' % (time)), self.verbose)
+		return time
+
+	# Extract the timers from the verbose output of the executable.
+	#
+	# Returns a namedtuple (loading_data, total_time) of floats, or False when
+	# the output does not contain both timers in that order.
+	def parseTimer(self, data):
+		pattern = re.compile(r"""
+							.*?loading_data: (?P<loading_data>.*?)s.*?
+							.*?total_time: (?P<total_time>.*?)s.*?
+							""", re.VERBOSE|re.MULTILINE|re.DOTALL)
+
+		match = pattern.match(data)
+		if not match:
+			# Written as a call so the module also parses under Python 3; with
+			# a single argument the output on Python 2 is unchanged.
+			print("Can't parse the data: wrong format")
+			return False
+
+		fields = ('loading_data', 'total_time')
+		values = [match.group(name) for name in fields]
+		# A value with exactly one "." is parsed as is; otherwise "," is taken
+		# to be the decimal separator and replaced before parsing.
+		if values[0].count(".") != 1:
+			values = [value.replace(",", ".") for value in values]
+
+		timer = collections.namedtuple('timer', fields)
+		return timer(*[float(value) for value in values])
+
+	# Return the elapsed time: the total run time without the time spent on
+	# loading the data.
+	def GetTime(self, timer):
+		return timer.total_time - timer.loading_data
\ No newline at end of file

Added: mlpack/conf/jenkins-conf/benchmark/methods/mlpack/hmm_train.py
==============================================================================
--- (empty file)
+++ mlpack/conf/jenkins-conf/benchmark/methods/mlpack/hmm_train.py	Wed Jun 26 13:26:20 2013
@@ -0,0 +1,119 @@
+'''
+  @file hmm_train.py
+  @author Marcus Edel
+
+  Class to benchmark the mlpack Hidden Markov Model Training method.
+'''
+
+import os
+import sys
+import inspect
+
+# Import the util path, this method even works if the path contains
+# symlinks to modules.
+cmd_subfolder = os.path.realpath(os.path.abspath(os.path.join(
+	os.path.split(inspect.getfile(inspect.currentframe()))[0], '../../util')))
+if cmd_subfolder not in sys.path:
+	sys.path.insert(0, cmd_subfolder)
+
+from log import *
+
+import shlex
+import subprocess
+import re
+import collections
+
+class HMMTRAIN(object):
+	'''
+	Benchmark wrapper around the mlpack hmm_train executable: runs it and
+	reports the run time without the time spent loading data.
+	'''
+
+	# Create the Hidden Markov Model Training instance.
+	#
+	# dataset: either a single file name (handed to hmm_train via -i) or a
+	#          two-element sequence whose entries are handed over via -i and -l.
+	# path:    prefix put directly in front of the executable name.
+	# verbose: verbosity flag forwarded to the Log calls.
+	def __init__(self, dataset, path='/usr/local/bin/', verbose=True):
+		self.verbose = verbose
+		self.dataset = dataset
+		self.path = path
+
+		# Run the executable with -h; check_output raises CalledProcessError
+		# when the exit status is non-zero.
+		cmd = shlex.split(self.path + "hmm_train -h")
+		s = subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=False)
+
+		# The text in front of "Required ... options:" is the description. It
+		# is not displayed at the moment, so only a failed parse is reported.
+		pattern = re.compile(r"""(.*?)Required.*?options:""",
+				re.VERBOSE|re.MULTILINE|re.DOTALL)
+		if not pattern.match(s):
+			Log.Warn("Can't parse description", self.verbose)
+
+	# Remove the files a benchmark run leaves in the working directory.
+	def __del__(self):
+		Log.Info('Clean up.', self.verbose)
+		for f in ('gmon.out', 'output_hmm.xml'):
+			if os.path.isfile(f):
+				os.remove(f)
+
+	# Run hmm_train with the given extra command line options and return the
+	# elapsed time in seconds (0 if the timers can't be parsed).
+	def RunMethod(self, options):
+		Log.Info('Perform HMM Training.', self.verbose)
+
+		# If the dataset contains two files, the second one is passed with -l.
+		# The blank in front of "-l" is required: without it the option is glued
+		# to the first file name and never reaches hmm_train as an option.
+		if len(self.dataset) == 2:
+			arguments = self.dataset[0] + " -l " + self.dataset[1] + " -v "
+		else:
+			arguments = self.dataset + " -v  "
+
+		# The command line is split here and executed without a shell, so shell
+		# metacharacters in the file names or the options are not interpreted.
+		cmd = shlex.split(self.path + "hmm_train -i " + arguments + options)
+		s = subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=False)
+
+		timer = self.parseTimer(s)
+		if not timer:
+			Log.Fatal("Can't parse the timer", self.verbose)
+			return 0
+
+		time = self.GetTime(timer)
+		Log.Info(('total time: %fs' % (time)), self.verbose)
+		return time
+
+	# Extract the timers from the verbose output of the executable.
+	#
+	# Returns a namedtuple (loading_data, total_time) of floats, or False when
+	# the output does not contain both timers in that order.
+	def parseTimer(self, data):
+		pattern = re.compile(r"""
+							.*?loading_data: (?P<loading_data>.*?)s.*?
+							.*?total_time: (?P<total_time>.*?)s.*?
+							""", re.VERBOSE|re.MULTILINE|re.DOTALL)
+
+		match = pattern.match(data)
+		if not match:
+			# Written as a call so the module also parses under Python 3; with
+			# a single argument the output on Python 2 is unchanged.
+			print("Can't parse the data: wrong format")
+			return False
+
+		fields = ('loading_data', 'total_time')
+		values = [match.group(name) for name in fields]
+		# A value with exactly one "." is parsed as is; otherwise "," is taken
+		# to be the decimal separator and replaced before parsing.
+		if values[0].count(".") != 1:
+			values = [value.replace(",", ".") for value in values]
+
+		timer = collections.namedtuple('timer', fields)
+		return timer(*[float(value) for value in values])
+
+	# Return the elapsed time: the total run time without the time spent on
+	# loading the data.
+	def GetTime(self, timer):
+		return timer.total_time - timer.loading_data
\ No newline at end of file

Added: mlpack/conf/jenkins-conf/benchmark/methods/mlpack/hmm_viterbi.py
==============================================================================
--- (empty file)
+++ mlpack/conf/jenkins-conf/benchmark/methods/mlpack/hmm_viterbi.py	Wed Jun 26 13:26:20 2013
@@ -0,0 +1,123 @@
+'''
+  @file hmm_viterbi.py
+  @author Marcus Edel
+
+  Class to benchmark the mlpack Hidden Markov Model Viterbi State Prediction
+  method.
+'''
+
+import os
+import sys
+import inspect
+
+# Import the util path, this method even works if the path contains
+# symlinks to modules.
+cmd_subfolder = os.path.realpath(os.path.abspath(os.path.join(
+	os.path.split(inspect.getfile(inspect.currentframe()))[0], '../../util')))
+if cmd_subfolder not in sys.path:
+	sys.path.insert(0, cmd_subfolder)
+
+from log import *
+
+import shlex
+import subprocess
+import re
+import collections
+
+class HMMVITERBI(object):
+	'''
+	Benchmark wrapper around the mlpack hmm_viterbi executable: runs it and
+	reports the run time without the time spent loading and saving data.
+	'''
+
+	# Create the Hidden Markov Model Viterbi State Prediction instance.
+	#
+	# dataset: two-element sequence; the first entry is handed to hmm_viterbi
+	#          via -i, the second via -m.
+	# path:    prefix put directly in front of the executable name.
+	# verbose: verbosity flag forwarded to the Log calls.
+	def __init__(self, dataset, path='/usr/local/bin/', verbose=True):
+		self.verbose = verbose
+		self.dataset = dataset
+		self.path = path
+
+		# Run the executable with -h; check_output raises CalledProcessError
+		# when the exit status is non-zero.
+		cmd = shlex.split(self.path + "hmm_viterbi -h")
+		s = subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=False)
+
+		# The text in front of "Required ... options:" is the description. It
+		# is not displayed at the moment, so only a failed parse is reported.
+		pattern = re.compile(r"""(.*?)Required.*?options:""",
+				re.VERBOSE|re.MULTILINE|re.DOTALL)
+		if not pattern.match(s):
+			Log.Warn("Can't parse description", self.verbose)
+
+	# Remove the files a benchmark run leaves in the working directory.
+	def __del__(self):
+		Log.Info('Clean up.', self.verbose)
+		for f in ('gmon.out', 'output.csv'):
+			if os.path.isfile(f):
+				os.remove(f)
+
+	# Run hmm_viterbi with the given extra command line options and return the
+	# elapsed time in seconds. Returns False when the dataset does not have
+	# exactly two entries and 0 when the timers can't be parsed.
+	def RunMethod(self, options):
+		Log.Info('Perform HMM Viterbi State Prediction.', self.verbose)
+
+		# Both dataset entries are needed to build the command line.
+		if len(self.dataset) != 2:
+			Log.Fatal("Not enough input datasets.")
+			return False
+
+		# The command line is split here and executed without a shell, so shell
+		# metacharacters in the file names or the options are not interpreted.
+		cmd = shlex.split(self.path + "hmm_viterbi -i " + self.dataset[0] +
+			" -m " + self.dataset[1] + " -v " + options)
+		s = subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=False)
+
+		# Return the elapsed time.
+		timer = self.parseTimer(s)
+		if not timer:
+			Log.Fatal("Can't parse the timer", self.verbose)
+			return 0
+
+		time = self.GetTime(timer)
+		Log.Info(('total time: %fs' % (time)), self.verbose)
+		return time
+
+	# Extract the timers from the verbose output of the executable.
+	#
+	# Returns a namedtuple (loading_data, saving_data, total_time) of floats,
+	# or False when the output does not contain all three timers in that
+	# order.
+	def parseTimer(self, data):
+		pattern = re.compile(r"""
+							.*?loading_data: (?P<loading_data>.*?)s.*?
+							.*?saving_data: (?P<saving_data>.*?)s.*?
+							.*?total_time: (?P<total_time>.*?)s.*?
+							""", re.VERBOSE|re.MULTILINE|re.DOTALL)
+
+		match = pattern.match(data)
+		if not match:
+			# Written as a call so the module also parses under Python 3; with
+			# a single argument the output on Python 2 is unchanged.
+			print("Can't parse the data: wrong format")
+			return False
+
+		fields = ('loading_data', 'saving_data', 'total_time')
+		values = [match.group(name) for name in fields]
+
+		# A value with exactly one "." is parsed as is; otherwise "," is taken
+		# to be the decimal separator and replaced before parsing.
+		if values[0].count(".") != 1:
+			values = [value.replace(",", ".") for value in values]
+
+		timer = collections.namedtuple('timer', fields)
+		return timer(*[float(value) for value in values])
+
+	# Return the elapsed time: the total run time without the time spent on
+	# loading and saving the data.
+	def GetTime(self, timer):
+		return timer.total_time - timer.loading_data - timer.saving_data
\ No newline at end of file

Added: mlpack/conf/jenkins-conf/benchmark/methods/mlpack/nca.py
==============================================================================
--- (empty file)
+++ mlpack/conf/jenkins-conf/benchmark/methods/mlpack/nca.py	Wed Jun 26 13:26:20 2013
@@ -0,0 +1,123 @@
+'''
+  @file nca.py
+  @author Marcus Edel
+
+  Class to benchmark the mlpack Neighborhood Components Analysis method.
+'''
+
+import os
+import sys
+import inspect
+
+# Import the util path, this method even works if the path contains
+# symlinks to modules.
+cmd_subfolder = os.path.realpath(os.path.abspath(os.path.join(
+	os.path.split(inspect.getfile(inspect.currentframe()))[0], '../../util')))
+if cmd_subfolder not in sys.path:
+	sys.path.insert(0, cmd_subfolder)
+
+from log import *
+
+import shlex
+import subprocess
+import re
+import collections
+
+class NCA(object):
+	'''
+	Benchmark wrapper around the mlpack nca executable (Neighborhood
+	Components Analysis): runs it and reports the run time without the time
+	spent loading and saving data.
+	'''
+
+	# Create the Neighborhood Components Analysis instance.
+	#
+	# dataset: either a single file name (handed to nca via -i) or a
+	#          two-element sequence whose entries are handed over via -i and -l.
+	# path:    prefix put directly in front of the executable name.
+	# verbose: verbosity flag forwarded to the Log calls.
+	def __init__(self, dataset, path='/usr/local/bin/', verbose=True):
+		self.verbose = verbose
+		self.dataset = dataset
+		self.path = path
+
+		# Run the executable with -h; check_output raises CalledProcessError
+		# when the exit status is non-zero.
+		cmd = shlex.split(self.path + "nca -h")
+		s = subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=False)
+
+		# The text in front of "Required ... options:" is the description. It
+		# is not displayed at the moment, so only a failed parse is reported.
+		pattern = re.compile(r"""(.*?)Required.*?options:""",
+				re.VERBOSE|re.MULTILINE|re.DOTALL)
+		if not pattern.match(s):
+			Log.Warn("Can't parse description", self.verbose)
+
+	# Remove the files a benchmark run leaves in the working directory.
+	def __del__(self):
+		Log.Info('Clean up.', self.verbose)
+		for f in ('gmon.out', 'distance.csv'):
+			if os.path.isfile(f):
+				os.remove(f)
+
+	# Run nca with the given extra command line options and return the
+	# elapsed time in seconds (0 if the timers can't be parsed).
+	def RunMethod(self, options):
+		Log.Info('Perform NCA.', self.verbose)
+
+		# If the dataset contains two files then the second file is the labels
+		# file, which is added to the command line with -l.
+		if len(self.dataset) == 2:
+			inputs = self.dataset[0] + " -l " + self.dataset[1]
+		else:
+			inputs = self.dataset
+
+		# The command line is split here and executed without a shell, so shell
+		# metacharacters in the file names or the options are not interpreted.
+		cmd = shlex.split(self.path + "nca -i " + inputs +
+			" -v -o distance.csv " + options)
+		s = subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=False)
+
+		# Return the elapsed time.
+		timer = self.parseTimer(s)
+		if not timer:
+			Log.Fatal("Can't parse the timer", self.verbose)
+			return 0
+
+		time = self.GetTime(timer)
+		Log.Info(('total time: %fs' % (time)), self.verbose)
+		return time
+
+	# Extract the timers from the verbose output of the executable.
+	#
+	# Returns a namedtuple (loading_data, saving_data, total_time) of floats,
+	# or False when the output does not contain all three timers in that
+	# order.
+	def parseTimer(self, data):
+		pattern = re.compile(r"""
+							.*?loading_data: (?P<loading_data>.*?)s.*?
+							.*?saving_data: (?P<saving_data>.*?)s.*?
+							.*?total_time: (?P<total_time>.*?)s.*?
+							""", re.VERBOSE|re.MULTILINE|re.DOTALL)
+
+		match = pattern.match(data)
+		if not match:
+			# Report the problem; RunMethod turns the False into a time of 0.
+			Log.Fatal("Can't parse the data: wrong format")
+			return False
+
+		fields = ('loading_data', 'saving_data', 'total_time')
+		values = [match.group(name) for name in fields]
+
+		# A value with exactly one "." is parsed as is; otherwise "," is taken
+		# to be the decimal separator and replaced before parsing.
+		if values[0].count(".") != 1:
+			values = [value.replace(",", ".") for value in values]
+
+		timer = collections.namedtuple('timer', fields)
+		return timer(*[float(value) for value in values])
+
+	# Return the elapsed time: the total run time without the time spent on
+	# loading and saving the data.
+	def GetTime(self, timer):
+		return timer.total_time - timer.loading_data - timer.saving_data
\ No newline at end of file