[mlpack-svn] r15436 - mlpack/conf/jenkins-conf/benchmark/methods/matlab

fastlab-svn at coffeetalk-1.cc.gatech.edu fastlab-svn at coffeetalk-1.cc.gatech.edu
Tue Jul 9 15:13:32 EDT 2013


Author: marcus
Date: Tue Jul  9 15:13:32 2013
New Revision: 15436

Log:
Add matlab All K-Nearest-Neighbors method and benchmark script.

Added:
   mlpack/conf/jenkins-conf/benchmark/methods/matlab/ALLKNN.m
   mlpack/conf/jenkins-conf/benchmark/methods/matlab/allknn.py

Added: mlpack/conf/jenkins-conf/benchmark/methods/matlab/ALLKNN.m
==============================================================================
--- (empty file)
+++ mlpack/conf/jenkins-conf/benchmark/methods/matlab/ALLKNN.m	Tue Jul  9 15:13:32 2013
@@ -0,0 +1,80 @@
+% @file ALLKNN.m
+% @author Marcus Edel
+%
+% All K-Nearest-Neighbors with matlab.
+
+function allknn(cmd)
+% This program will calculate the all k-nearest-neighbors of a set of
+% points using kd-trees. You may specify a separate set of reference points
+% and query points, or just a reference set which will be used as both the
+% reference and query set.
+%
+% cmd is a single string holding the options below, separated by spaces.
+%
+% Required options:
+%     (-k) [int]       Number of nearest neighbors to find.
+%     (-r) [string]    A file containing the reference set.
+%
+% Options:
+%     (-l) [int]       Leaf size for tree building. Default value 20.
+%     (-N)             If true, O(n^2) naive mode is used for computation.
+%     (-q) [string]    File containing query points (optional).
+%                      Default value ''.
+
+% Load input dataset.
+referenceFile = regexp(cmd, '.*?-r ([^\s]+)', 'tokens', 'once');
+referenceData = csvread(referenceFile{:});
+
+% Get all the parameters.
+queryFile = regexp(cmd, '.*?-q ([^\s]+)', 'tokens', 'once');
+k = regexp(cmd,'.* -k (\d+)','tokens','once');
+leafSize = regexp(cmd,'.* -l (\d+)','tokens','once');
+
+% Without a query file the reference set is searched against itself.
+if ~isempty(queryFile)
+  disp('[INFO ] Load query data.');
+  queryData = csvread(queryFile{:});
+else
+  queryData = referenceData;
+end
+
+if ~isempty(k)
+  k = str2double(k);
+else
+  disp('[Fatal] Required options: Number of nearest neighbors to find.');
+  return;
+end
+
+total_time = tic;
+% Sanity check on k value: must be greater than 0, must be less than or
+% equal to the number of reference points. knnsearch treats every row as one
+% point, so the number of points is the row count.
+numPoints = size(referenceData, 1);
+if k < 1 || k > numPoints
+  msg = [...
+      '[Fatal] Invalid k: %i; must be greater than 0 and less '...
+      'than or equal to the number of reference points (%i)'...
+      ];
+  disp(sprintf(msg, k, numPoints))
+  return;
+end
+
+% Fall back to the default leaf size when -l is missing.
+if isempty(leafSize)
+  leafSize = 20;
+else
+  leafSize = str2double(leafSize);
+end
+
+% -N selects the exhaustive (naive) search, otherwise a kd-tree is used.
+if ~isempty(strfind(cmd, '-N'))
+  searchArgs = {'NSMethod', 'exhaustive'};
+else
+  searchArgs = {'NSMethod', 'kdtree', 'BucketSize', leafSize};
+end
+
+[IDX, D] = knnsearch(referenceData, queryData, 'K', k, ...
+  'distance', 'euclidean', searchArgs{:});
+
+disp(sprintf('[INFO ]   total_time: %fs', toc(total_time)))
+end

Added: mlpack/conf/jenkins-conf/benchmark/methods/matlab/allknn.py
==============================================================================
--- (empty file)
+++ mlpack/conf/jenkins-conf/benchmark/methods/matlab/allknn.py	Tue Jul  9 15:13:32 2013
@@ -0,0 +1,121 @@
+'''
+  @file allknn.py
+  @author Marcus Edel
+
+  Class to benchmark the matlab All K-Nearest-Neighbors method.
+'''
+
+import os
+import sys
+import inspect
+
+# Put the shared util directory on sys.path so "log" and "profiler" can be
+# imported; realpath keeps this working when the path contains symlinks.
+cmd_subfolder = os.path.realpath(os.path.abspath(os.path.join(
+	os.path.split(inspect.getfile(inspect.currentframe()))[0], "../../util")))
+if cmd_subfolder not in sys.path:
+	sys.path.insert(0, cmd_subfolder)
+
+from log import *
+from profiler import *
+
+import shlex
+import subprocess
+import re
+import collections
+
+'''
+This class implements the All K-Nearest-Neighbors benchmark.
+'''
+class ALLKNN(object):
+
+	''' 
+	Create the All K-Nearest-Neighbors benchmark instance.
+  
+  @param dataset - Input dataset to perform ALLKNN on.
+  @param path - Path to the mlpack executable.
+  @param verbose - Display informational messages.
+	'''
+	def __init__(self, dataset, path=os.environ["MATLAB_BIN"], verbose = True): 
+		self.verbose = verbose
+		self.dataset = dataset
+		self.path = path
+
+	'''
+	Destructor to clean up at the end.
+	'''
+	def __del__(self):		
+		pass	
+		
+	'''
+  All K-Nearest-Neighbors. If the method has been successfully completed return 
+  the elapsed time in seconds.
+
+  @param options - Extra options for the method.
+  @return - Elapsed time in seconds or -1 if the method was not successful.
+  '''
+	def RunMethod(self, options):
+		Log.Info("Perform ALLKNN.", self.verbose)
+
+		# If the dataset contains two files then the second file is the query file.
+		# In this case we add this to the command line.
+		if len(self.dataset) == 2:
+			inputCmd = "-r " + self.dataset[0] + " -q " + self.dataset[1] + " " + options
+		else:
+			inputCmd = "-r " + self.dataset + " " + options
+		
+		# Split the command using shell-like syntax.
+		cmd = shlex.split(self.path + "matlab -nodisplay -nosplash -r \"try, " +
+				"ALLKNN('"  + inputCmd + "'), catch, exit(1), end, exit(0)\"")
+		
+		# Run command with the nessecary arguments and return its output as a byte
+		# string. We have untrusted input so we disables all shell based features.
+		try:
+			s = subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=False)		
+		except Exception:
+			Log.Fatal("Could not execute command: " + str(cmd))
+			return -1
+
+		# Return the elapsed time.
+		timer = self.parseTimer(s)
+		if not timer:
+			Log.Fatal("Can't parse the timer")
+			return -1
+		else:
+			time = self.GetTime(timer)
+			Log.Info(("total time: %fs" % time), self.verbose)
+
+			return time
+
+	'''
+	Parse the timer data form a given string.
+
+	@param data - String to parse timer data from.
+	@return - Namedtuple that contains the timer data.
+	'''
+	def parseTimer(self, data):
+		# Compile the regular expression pattern into a regular expression object to
+		# parse the timer data.
+		pattern = re.compile(r"""
+				.*?total_time: (?P<total_time>.*?)s.*?
+				""", re.VERBOSE|re.MULTILINE|re.DOTALL)
+		
+		match = pattern.match(data)
+		if not match:
+			Log.Fatal("Can't parse the data: wrong format")
+			return -1
+		else:
+			# Create a namedtuple and return the timer data.
+			timer = collections.namedtuple("timer", ["total_time"])
+			
+			return timer(float(match.group("total_time")))
+
+	'''
+	Return the elapsed time in seconds.
+
+	@param timer - Namedtuple that contains the timer data.
+	@return Elapsed time in seconds.
+	'''
+	def GetTime(self, timer):
+		time = timer.total_time
+		return time



More information about the mlpack-svn mailing list