[mlpack-svn] r15415 - mlpack/conf/jenkins-conf/benchmark/methods/matlab
fastlab-svn at coffeetalk-1.cc.gatech.edu
fastlab-svn at coffeetalk-1.cc.gatech.edu
Fri Jul 5 07:27:18 EDT 2013
Author: marcus
Date: Fri Jul 5 07:27:18 2013
New Revision: 15415
Log:
Add matlab nmf method and nmf benchmark script.
Added:
mlpack/conf/jenkins-conf/benchmark/methods/matlab/NMF.m
mlpack/conf/jenkins-conf/benchmark/methods/matlab/nmf.py
Added: mlpack/conf/jenkins-conf/benchmark/methods/matlab/NMF.m
==============================================================================
--- (empty file)
+++ mlpack/conf/jenkins-conf/benchmark/methods/matlab/NMF.m Fri Jul 5 07:27:18 2013
@@ -0,0 +1,87 @@
+% @file NMF.m
+% @author Marcus Edel
+%
+% Non-negative Matrix Factorization with matlab.
+
+function nmf(cmd)
+% This program performs non-negative matrix factorization on the given
+% dataset, storing the resulting decomposed matrices in the specified
+% files. For an input dataset V, NMF decomposes V into two matrices W and H
+% such that
+%
+% V = W * H
+%
+% where all elements in W and H are non-negative.
+%
+% Required options:
+% (-i) [string] Input dataset to perform NMF on.
+% (-r) [int] Rank of the factorization.
+% Options:
+% (-m) [int] Maximum number of iterations before NMF terminates;
+% 0 runs until convergence. Default value 10000.
+% (-e) [double] The minimum root mean square residue allowed for
+% each iteration, below which the program terminates.
+% Default value 1e-05.
+% (-s) [int] Random seed.
+% (-u) [string] Update rules for each iteration; ( multdist | als ).
+% Default value 'multdist'.
+
+
+% Load input dataset.
+inputFile = regexp(cmd, '.*?-i ([^\s]+)', 'tokens', 'once');
+X = csvread(inputFile{:});
+
+total_time = tic;
+
+% Gather parameters.
+rank = str2double(regexp(cmd,'.* -r (\d+)','tokens','once'));
+seed = str2double(regexp(cmd,'.* -s (\d+)','tokens','once'));
+maxIterations = str2double(regexp(cmd,'.* -m (\d+)','tokens','once'));
+minResidue = str2double(regexp(cmd, '.*?-e ([^\s]+)', 'tokens', 'once'));
+updateRule = regexp(cmd, '.*?-u ([^\s]+)', 'tokens', 'once');
+
+% Validate parameters.
+if isempty(maxIterations)
+ m = 10000;
+else
+ if maxIterations == 0
+ m = inf;
+ else
+ m = maxIterations;
+ end
+end
+
+if isempty(minResidue)
+ e = 1e-05;
+else
+ e = minResidue;
+end
+
+if ~isempty(seed)
+ s = RandStream('mt19937ar','Seed', seed);
+ RandStream.setGlobalStream(s);
+end
+
+if isempty(rank) || rank < 1
+ disp('[Fatal] The rank of the factorization cannot be less than 1.')
+ return
+end
+
+if ~strcmp(updateRule, 'multdist') && ~strcmp(updateRule, 'als')
+ msg = [...
+ '[Fatal] Invalid update rules ("%s") must be "multdist" or "als"'];
+ disp(sprintf(msg, updateRule{:}))
+ return
+end
+
+% Perform NMF with the specified update rules and parameters.
+opt = statset('MaxIter', m, 'TolFun', e, 'TolX', e);
+if strcmp(updateRule, 'multdist') || ~strcmp(updateRule, 'als')
+ nnmf(X, rank, 'options', opt, 'algorithm', 'mult');
+ disp(sprintf('[INFO ] total_time: %fs', toc(total_time)))
+elseif strcmp(updateRule, 'als')
+ nnmf(X, rank, 'options', opt, 'algorithm', 'als');
+ disp(sprintf('[INFO ] total_time: %fs', toc(total_time)))
+end
+
+end
Added: mlpack/conf/jenkins-conf/benchmark/methods/matlab/nmf.py
==============================================================================
--- (empty file)
+++ mlpack/conf/jenkins-conf/benchmark/methods/matlab/nmf.py Fri Jul 5 07:27:18 2013
@@ -0,0 +1,115 @@
+'''
+ @file nmf.py
+ @author Marcus Edel
+
+ Class to benchmark the matlab Non-negative Matrix Factorization method.
+'''
+
+import os
+import sys
+import inspect
+
+# Add the util directory to the import path; this approach works even if the
+# path contains symlinks to modules.
+cmd_subfolder = os.path.realpath(os.path.abspath(os.path.join(
+ os.path.split(inspect.getfile(inspect.currentframe()))[0], "../../util")))
+if cmd_subfolder not in sys.path:
+ sys.path.insert(0, cmd_subfolder)
+
+from log import *
+from profiler import *
+
+import shlex
+import subprocess
+import re
+import collections
+
+'''
+This class implements the Non-negative Matrix Factorization benchmark.
+'''
+class NMF(object):
+
+ '''
+ Create the Non-negative Matrix Factorization benchmark instance.
+
+ @param dataset - Input dataset to perform NMF on.
+ @param path - Path to the directory containing the matlab executable.
+ @param verbose - Display informational messages.
+ '''
+ def __init__(self, dataset, path="/opt/matlab/bin/", verbose=True):
+ self.verbose = verbose
+ self.dataset = dataset
+ self.path = path
+
+ '''
+ Destructor to clean up at the end.
+ '''
+ def __del__(self):
+ pass
+
+ '''
+ Non-negative Matrix Factorization. If the method has been successfully
+ completed return the elapsed time in seconds.
+
+ @param options - Extra options for the method.
+ @return - Elapsed time in seconds or -1 if the method was not successful.
+ '''
+ def RunMethod(self, options):
+ Log.Info("Perform NMF.", self.verbose)
+
+ inputCmd = "-i " + self.dataset + " " + options
+ # Split the command using shell-like syntax.
+ cmd = shlex.split(self.path + "matlab -nodisplay -nosplash -r \"try, NMF('"
+ + inputCmd + "'), catch, exit(1), end, exit(0)\"")
+
+ # Run command with the necessary arguments and return its output as a byte
+ # string. We have untrusted input so we disable all shell-based features.
+ try:
+ s = subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=False)
+ except Exception:
+ Log.Fatal("Could not execute command: " + str(cmd))
+ return -1
+
+ # Return the elapsed time.
+ timer = self.parseTimer(s)
+ if not timer:
+ Log.Fatal("Can't parse the timer")
+ return -1
+ else:
+ time = self.GetTime(timer)
+ Log.Info(("total time: %fs" % time), self.verbose)
+
+ return time
+
+ '''
+ Parse the timer data from a given string.
+
+ @param data - String to parse timer data from.
+ @return - Namedtuple that contains the timer data, or -1 if parsing fails.
+ '''
+ def parseTimer(self, data):
+ # Compile the regular expression pattern into a regular expression object to
+ # parse the timer data.
+ pattern = re.compile(r"""
+ .*?total_time: (?P<total_time>.*?)s.*?
+ """, re.VERBOSE|re.MULTILINE|re.DOTALL)
+
+ match = pattern.match(data)
+ if not match:
+ Log.Fatal("Can't parse the data: wrong format")
+ return -1
+ else:
+ # Create a namedtuple and return the timer data.
+ timer = collections.namedtuple("timer", ["total_time"])
+
+ return timer(float(match.group("total_time")))
+
+ '''
+ Return the elapsed time in seconds.
+
+ @param timer - Namedtuple that contains the timer data.
+ @return Elapsed time in seconds.
+ '''
+ def GetTime(self, timer):
+ time = timer.total_time
+ return time
More information about the mlpack-svn
mailing list