[mlpack-svn] r16137 - in mlpack/conf/jenkins-conf/benchmark: . methods/ann methods/ann/src

fastlab-svn at coffeetalk-1.cc.gatech.edu fastlab-svn at coffeetalk-1.cc.gatech.edu
Sat Jan 11 07:50:27 EST 2014


Author: marcus
Date: Sat Jan 11 07:50:27 2014
New Revision: 16137

Log:
Add  ann allknn method src and benchmark script.

Added:
   mlpack/conf/jenkins-conf/benchmark/methods/ann/
   mlpack/conf/jenkins-conf/benchmark/methods/ann/allknn.py
   mlpack/conf/jenkins-conf/benchmark/methods/ann/src/
   mlpack/conf/jenkins-conf/benchmark/methods/ann/src/allknn.cpp
Modified:
   mlpack/conf/jenkins-conf/benchmark/Makefile

Modified: mlpack/conf/jenkins-conf/benchmark/Makefile
==============================================================================
--- mlpack/conf/jenkins-conf/benchmark/Makefile	(original)
+++ mlpack/conf/jenkins-conf/benchmark/Makefile	Sat Jan 11 07:50:27 2014
@@ -58,6 +58,7 @@
 export MS_PRINT_BIN=/usr/bin/ms_print
 export VALGRIND_BIN=/usr/bin/valgrind
 export FLANN_PATH=methods/flann/
+export ANN_PATH=methods/ann/
 
 # Color settings.
 NO_COLOR=\033[0m

Added: mlpack/conf/jenkins-conf/benchmark/methods/ann/allknn.py
==============================================================================
--- (empty file)
+++ mlpack/conf/jenkins-conf/benchmark/methods/ann/allknn.py	Sat Jan 11 07:50:27 2014
@@ -0,0 +1,124 @@
+'''
+  @file allknn.py
+  @author Marcus Edel
+
+  Class to benchmark the ann All K-Nearest-Neighbors method with kd-trees.
+'''
+
+import os
+import sys
+import inspect
+
+# Import the util path, this method even works if the path contains symlinks to
+# modules.
+cmd_subfolder = os.path.realpath(os.path.abspath(os.path.join(
+  os.path.split(inspect.getfile(inspect.currentframe()))[0], "../../util")))
+if cmd_subfolder not in sys.path:
+  sys.path.insert(0, cmd_subfolder)
+
+from log import *
+from profiler import *
+
+import shlex
+import subprocess
+import re
+import collections
+
+'''
+This class implements the All K-Nearest-Neighbor Search benchmark.
+'''
+class ALLKNN(object):
+
+  ''' 
+  Create the All K-Nearest-Neighbors benchmark instance, show some informations 
+  and return the instance.
+  
+  @param dataset - Input dataset to perform All K-Nearest-Neighbors on.
+  @param timeout - The time until the timeout. Default no timeout.
+  @param path - Path to the ann executable.
+  @param verbose - Display informational messages.
+  '''
+  def __init__(self, dataset, timeout=0, path=os.environ["ANN_PATH"], 
+        verbose = True):
+    self.verbose = verbose
+    self.dataset = dataset
+    self.path = path
+    self.timeout = timeout
+
+  '''
+  Perform All K-Nearest-Neighbors. If the method has been successfully completed
+  return the elapsed time in seconds.
+
+  @param options - Extra options for the method.
+  @return - Elapsed time in seconds or a negative value if the method was not 
+  successful.
+  '''
+  def RunMethod(self, options):
+    Log.Info("Perform ALLKNN.", self.verbose)
+
+    # If the dataset contains two files then the second file is the query file. 
+    # In this case we add this to the command line.
+    if len(self.dataset) == 2:
+      cmd = shlex.split(self.path + "allknn -r " + self.dataset[0] + " -q " + 
+          self.dataset[1] + " -v " + options)
+    else:
+      cmd = shlex.split(self.path + "allknn -r " + self.dataset + 
+          " -v " + options)   
+
+    # Run command with the nessecary arguments and return its output as a byte 
+    # string. We have untrusted input so we disable all shell based features.
+    try:
+      s = subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=False, 
+          timeout=self.timeout) 
+    except subprocess.TimeoutExpired as e:
+      Log.Warn(str(e))
+      return -2
+    except Exception as e:
+      Log.Fatal("Could not execute command: " + str(cmd))
+      return -1
+
+    # Return the elapsed time.
+    timer = self.parseTimer(s)
+    if not timer:
+      Log.Fatal("Can't parse the timer")
+      return -1
+    else:
+      time = self.GetTime(timer)
+      Log.Info(("total time: %fs" % (time)), self.verbose)
+
+      return time
+
+  '''
+  Parse the timer data form a given string.
+
+  @param data - String to parse timer data from.
+  @return - Namedtuple that contains the timer data or -1 in case of an error.
+  '''
+  def parseTimer(self, data):
+    # Compile the regular expression pattern into a regular expression object to
+    # parse the timer data.
+    pattern = re.compile(r"""
+        .*?knn_time: (?P<knn_time>.*?)s.*?
+        """, re.VERBOSE|re.MULTILINE|re.DOTALL)
+    
+    match = pattern.match(data.decode())
+    if not match:
+      Log.Fatal("Can't parse the data: wrong format")
+      return -1
+    else:
+      # Create a namedtuple and return the timer data.
+      timer = collections.namedtuple("timer", ["knn_time"])
+      
+      if match.group("knn_time").count(".") == 1:
+        return timer(float(match.group("knn_time")))
+      else:
+        return timer(float(match.group("knn_time").replace(",", ".")))
+
+  '''
+  Return the elapsed time in seconds.
+
+  @param timer - Namedtuple that contains the timer data.
+  @return Elapsed time in seconds.
+  '''
+  def GetTime(self, timer):
+    return timer.total_time

Added: mlpack/conf/jenkins-conf/benchmark/methods/ann/src/allknn.cpp
==============================================================================
--- (empty file)
+++ mlpack/conf/jenkins-conf/benchmark/methods/ann/src/allknn.cpp	Sat Jan 11 07:50:27 2014
@@ -0,0 +1,114 @@
+/**
+ * @file allknn.cpp
+ * @author Marcus Edel
+ *
+ * Code to benchmark the ann All K-Nearest-Neighbors method with kd-trees.
+ */
+
+#include <mlpack/core.hpp>
+#include <mlpack/core/util/timers.hpp>
+#include <ANN/ANN.h>
+
+using namespace mlpack;
+using namespace std;
+
+// Program name and description handed to mlpack's PROGRAM_INFO macro.
+PROGRAM_INFO("All K-Nearest-Neighbors",
+    "This program will calculate the all k-nearest-neighbors with the ann "
+    "library.");
+
+// Input parameters; allknn.py passes -r (and -q), k must come via its options.
+PARAM_STRING_REQ("reference_file", "File containing the reference dataset.",
+    "r");
+PARAM_INT_REQ("k", "Number of nearest neighbors to find.", "k");
+PARAM_STRING("query_file", "File containing query points (optional).", "q", "");
+PARAM_INT("leaf_size", "Leaf size for tree building.", "l", 20);
+
+
+// Load the data, build an ANN kd-tree over the reference points and run a
+// k-nearest-neighbor search for every query point. Tree building and search
+// are timed together as "knn_time".
+int main(int argc, char **argv)
+{
+  // Parse command line options.
+  CLI::ParseCommandLine(argc, argv);
+
+  // Get all the parameters.
+  const string referenceFile = CLI::GetParam<string>("reference_file");
+  const string queryFile = CLI::GetParam<string>("query_file");
+  const int lsInt = CLI::GetParam<int>("leaf_size");
+  const size_t k = CLI::GetParam<int>("k");
+
+  arma::mat referenceData;
+  arma::mat queryData; // Copy of the reference set when no query file is given.
+  data::Load(referenceFile, referenceData, true);
+
+  Log::Info << "Loaded reference data from '" << referenceFile << "' ("
+      << referenceData.n_rows << " x " << referenceData.n_cols << ")." << endl;
+
+  if (queryFile != "")
+  {
+    data::Load(queryFile, queryData, true);
+    Log::Info << "Loaded query data from '" << queryFile << "' ("
+        << queryData.n_rows << " x " << queryData.n_cols << ")." << endl;
+  }
+  else
+  {
+    queryData = referenceData;
+  }
+
+  // Sanity check on k value: must be greater than 0, must be less than or
+  // equal to the number of reference points.
+  if (k == 0 || k > referenceData.n_cols)
+  {
+    Log::Fatal << "Invalid k: " << k << "; must be greater than 0 and less ";
+    Log::Fatal << "than or equal to the number of reference points (";
+    Log::Fatal << referenceData.n_cols << ")." << endl;
+  }
+
+  // Sanity check on leaf size.
+  if (lsInt < 0)
+  {
+    Log::Fatal << "Invalid leaf size: " << lsInt << ".  Must be greater "
+        "than or equal to 0." << endl;
+  }
+
+  const size_t maxPts = referenceData.n_cols; // One ANN point per column.
+  const size_t dim = referenceData.n_rows;
+
+  ANNidxArray nnIdx = new ANNidx[k];
+  ANNdistArray dists = new ANNdist[k];
+  ANNpointArray dataPts = annAllocPts(maxPts, dim);
+
+  for (size_t i = 0; i < maxPts; ++i)
+  {
+    for (size_t j = 0; j < dim; ++j)
+    {
+      dataPts[i][j] = referenceData(j,i);
+    }
+  }
+
+  Timer::Start("knn_time");
+
+  ANNkd_tree*  kdTree = new ANNkd_tree(dataPts, maxPts, dim, lsInt);
+
+  arma::vec queryPoint;
+  for (size_t i = 0; i < queryData.n_cols; ++i)
+  {
+    queryPoint = queryData.col(i);
+    kdTree->annkSearch(queryPoint.memptr(), k, nnIdx,  dists, 0);
+
+    // ANN returns squared distances; convert them to Euclidean distances.
+    for (size_t j = 0; j < k; j++)
+      dists[j] = sqrt(dists[j]);
+  }
+
+  Timer::Stop("knn_time");
+
+  delete [] nnIdx;
+  delete [] dists;
+  delete kdTree;
+  annDeallocPts(dataPts);
+  annClose();
+  return 0;
+}



More information about the mlpack-svn mailing list