[mlpack-svn] r15611 - in mlpack/conf/jenkins-conf/benchmark: . benchmark methods/mlpack util

fastlab-svn at coffeetalk-1.cc.gatech.edu fastlab-svn at coffeetalk-1.cc.gatech.edu
Tue Aug 13 15:15:30 EDT 2013


Author: marcus
Date: Tue Aug 13 15:15:29 2013
New Revision: 15611

Log:
Add functions and a new table to create the memory reports.

Modified:
   mlpack/conf/jenkins-conf/benchmark/Makefile
   mlpack/conf/jenkins-conf/benchmark/benchmark/make_reports.py
   mlpack/conf/jenkins-conf/benchmark/benchmark/memory_benchmark.py
   mlpack/conf/jenkins-conf/benchmark/methods/mlpack/pca.py
   mlpack/conf/jenkins-conf/benchmark/util/database.py
   mlpack/conf/jenkins-conf/benchmark/util/misc.py
   mlpack/conf/jenkins-conf/benchmark/util/profiler.py
   mlpack/conf/jenkins-conf/benchmark/util/template.py

Modified: mlpack/conf/jenkins-conf/benchmark/Makefile
==============================================================================
--- mlpack/conf/jenkins-conf/benchmark/Makefile	(original)
+++ mlpack/conf/jenkins-conf/benchmark/Makefile	Tue Aug 13 15:15:29 2013
@@ -15,6 +15,8 @@
 export SHOGUN_PATH=/opt/shogun/shogun-2.1.0-mod
 export PYTHONPATH=/opt/scikit-learn/scikit-learn-0.13.1/lib/python3.3/site-packages/:/opt/mlpy/mlpy-3.5.0/lib/python3.3/site-packages/:/opt/shogun/shogun-2.1.0/lib/python3.3/dist-packages/
 export LD_LIBRARY_PATH=/opt/shogun/shogun-2.1.0/lib/
+export MS_PRINT_BIN=/usr/bin/ms_print
+export VALGRIND_BIN=/usr/bin/valgrind
 
 ifeq ($(PYTHON_VERSION), 0)
 	$(error Python version 2.7 required which was not found)

Modified: mlpack/conf/jenkins-conf/benchmark/benchmark/make_reports.py
==============================================================================
--- mlpack/conf/jenkins-conf/benchmark/benchmark/make_reports.py	(original)
+++ mlpack/conf/jenkins-conf/benchmark/benchmark/make_reports.py	Tue Aug 13 15:15:29 2013
@@ -20,6 +20,7 @@
 from database import *
 from template import *
 from misc import *
+from profiler import *
 
 import argparse
 import glob
@@ -95,6 +96,23 @@
   return datasetTable
 
 '''
+Create the content for the memory section.
+
+ at param results - This data structure contains the results.
+ at return A string that contains the content for the memory section.
+'''
+def CreateMemoryContent(results):
+  memoryContent = ""
+  if results:
+    for result in results:
+      memoryValues = {}
+      memoryValues["name"] = result[7]
+      memoryValues["content"] = Profiler.MassifMemoryUsageReport(str(result[5]))
+      memoryContent += memoryTemplate % memoryValues
+
+  return memoryContent
+
+'''
 Create the method container with the informations from the database.
 
 @param db - The database object.
@@ -124,6 +142,15 @@
       # Generate a "unique" hash for the chart names.
       chartHash = str(hash(str(method[1:]) + str(buildIds)))
 
+      # Create the memory content.
+      memoryContent = ""
+      mlpackMemoryId = db.GetLibrary("mlpack_memory")
+      if mlpackMemoryId:
+        mlpackMemoryBuilId = db.GetLatestBuildFromLibary(mlpackMemoryId[0][0])
+        if mlpackMemoryBuilId:
+          memoryResults = db.GetMemoryResults(mlpackMemoryBuilId, mlpackMemoryId[0][0], method[0])
+          memoryContent = CreateMemoryContent(memoryResults)     
+
       # Generate a "unique" name for the line chart.
       lineChartName = "img/line_" + chartHash + ".png"
 
@@ -141,7 +168,7 @@
 
       # Create the timing table.
       header, timingTable = CreateTimingTable(timingData, methodLibararies)
-      datasetTable = CreateDatasetTable(methodResults)
+      datasetTable = CreateDatasetTable(methodResults)      
 
       # Create the container.
       reportValues = {}
@@ -163,6 +190,7 @@
       reportValues["timingHeader"] = header
       reportValues["timingTable"] = timingTable
       reportValues["datasetTable"] = datasetTable
+      reportValues["memoryContent"] = memoryContent
 
       methodsPage += methodTemplate % reportValues
 

Modified: mlpack/conf/jenkins-conf/benchmark/benchmark/memory_benchmark.py
==============================================================================
--- mlpack/conf/jenkins-conf/benchmark/benchmark/memory_benchmark.py	(original)
+++ mlpack/conf/jenkins-conf/benchmark/benchmark/memory_benchmark.py	Tue Aug 13 15:15:29 2013
@@ -5,7 +5,6 @@
   Perform the memory benchmark.
 '''
 
-
 import os, sys, inspect
 
 # Import the util path, this method even works if the path contains
@@ -16,85 +15,171 @@
   sys.path.insert(0, cmd_subfolder)
 
 from log import *
-from system import *
 from loader import * 
 from parser import *
+from convert import *
+from misc import *
+from database import *
 
 import argparse
+import datetime
 
-'''
-Show system informations. Are there no data available, the value is 'N/A'.
-'''
-def SystemInformation():
-  
-  Log.Info('CPU Model: ' + SystemInfo.GetCPUModel())
-  Log.Info('Distribution: ' + SystemInfo.GetDistribution())
-  Log.Info('Platform: ' + SystemInfo.GetPlatform())
-  Log.Info('Memory: ' + SystemInfo.GetMemory())
-  Log.Info('CPU Cores: ' + SystemInfo.GetCPUCores())
 
 '''
-Start the main benchmark routine. The method shows some DEBUG information and 
-prints a table with the runtime information.
+Return a list with modified dataset.
+
+ at param dataset - Datasets to be modified.
+ at param format - List of file formats to be converted to.
+ at return List of modified datasets.
 '''
-def Main(configfile):
+def GetDataset(dataset, format):
+  # Check if the given dataset is a list or a single dataset.
+  if not isinstance(dataset, str):
+    datasetList = []
+    modifiedList = []
+
+    for data in dataset:  
+      mdata = CheckFileExtension(data, format)
+
+      # Check if the dataset is available.
+      if os.path.isfile(mdata):
+        datasetList.append(mdata)
+      else:
+        # Check if the dataset is available.
+        convert = Convert(data, format[0])
+        datasetList.append(convert.modifiedDataset)
+        modifiedList.append(convert.modifiedDataset)
+  else:
+    datasetList = ""
+    modifiedList = ""
+
+    mdataset = CheckFileExtension(dataset, format)
+
+    # Check if the dataset is available.
+    if os.path.isfile(mdataset):
+      datasetList = mdataset
+    else:
+      # Convert the Dataset.
+      convert = Convert(dataset, format[0])
+      datasetList = convert.modifiedDataset
+      modifiedList = convert.modifiedDataset
+
+  return (datasetList, modifiedList)
+
+def Main(configfile, blocks, log):
+  # Benchmark settings.
+  timeout = 23000
+  database = "reports/benchmark.db"
+
   # Read Config.
   config = Parser(configfile, verbose=False)
+  streamData = config.StreamMerge()
 
-  # Iterate through all libraries.
-  libraryMapping = config.GetConfigLibraryMethods()
-  while libraryMapping: 
+  # Read the general block and set the attributes.
+  if "general" in streamData:
+    for key, value in streamData["general"]:
+      if key == "timeout":
+        timeout = value
+      if key == "database":
+        database = value
+
+  # Temporary datastructures for the current build.
+  build = {}
+
+  # Open logfile if the user asked for.
+  if log:
+    db = Database(database)
+    db.CreateTables()
+
+  # Transform the blocks string to a list.
+  if blocks:
+    blocks = blocks.split(",")
 
-    if libraryMapping.libraryName != "mlpack":
+  # Iterate through all libraries.
+  for method, sets in streamData.items():
+    if method == "general":
       continue
+    Log.Info("Method: " + method)    
+    for options, libraries in sets.items():
+      Log.Info('Options: ' + (options if options != '' else 'None'))
+
+      if log:
+        methodId = db.GetMethod(method, options)
+        methodId = methodId[0][0] if methodId else db.NewMethod(method, options)
+
+      for libary in libraries:
+        name = libary[0]
+        datsets = libary[1]
+        script = libary[3]
+        format = libary[4]
+        
+        if not blocks or name in blocks:
+          Log.Info("Libary: " + name)
+
+          # Logging: create a new library record for this library.
+          if log and name not in build:
+            libaryId = db.GetLibrary(name + "_memory")
+            libaryId = libaryId[0][0] if libaryId else db.NewLibrary(name + "_memory")
+
+            build[name] = (db.NewBuild(libaryId), libaryId)
+
+          # Load script.
+          try:
+            module = Loader.ImportModuleFromPath(script)
+            methodCall = getattr(module, method)
+          except Exception as e:
+            Log.Fatal("Could not load the script: " + script)
+            Log.Fatal("Exception: " + str(e))
+          else:
+
+            for dataset in datsets:  
+              datasetName = NormalizeDatasetName(dataset)
+
+              # Logging: Create a new dataset record fot this dataset.
+              if log:
+                datasetId = db.GetDataset(datasetName)
+                datasetId = datasetId[0][0] if datasetId else db.NewDataset(*DatasetInfo(dataset))
+
+              Log.Info("Dataset: " + datasetName)
+              modifiedDataset = GetDataset(dataset, format)
+
+              try:
+                instance = methodCall(modifiedDataset[0], timeout=timeout, 
+                  verbose=False)
+              except Exception as e:
+                Log.Fatal("Could not call the constructor: " + script)
+                Log.Fatal("Exception: " + str(e))
+                continue
+
+              # Generate a "unique" name for the memory output file.
+              outputName = "reports/etc/" + str(hash(datetime.datetime.now())) + ".mout"
+
+              try:
+                instance.RunMemoryProfiling(options, outputName);
+              except Exception as e:
+                Log.Fatal("Exception: " + str(e))
+                continue
+
+              # Save results in the logfile if the user asked for.
+              if log:
+                buildId, libaryId = build[name]
+                db.NewMemory(buildId, libaryId, methodId, datasetId, outputName)
 
-    # Iterate through all methods.
-    methodMapping = config.GetConfigMethod(libraryMapping.methods)      
-    while methodMapping and libraryMapping:
-
-      if methodMapping.run:
-
-        Log.Info('Method: ' + methodMapping.methodName)
-
-        # Load script.
-        module = Loader.ImportModuleFromPath(methodMapping.script)
-        methodCall = getattr(module, methodMapping.methodName)
-
-        for dataset in methodMapping.datasets:
-
-          Log.Info('Options: ' + (dataset["options"] if dataset["options"] != '' 
-            else 'None'))
-
-          for files in dataset["files"]:
-
-            # Get dataset name.
-            if  not isinstance(files, basestring):
-              name = os.path.splitext(os.path.basename(files[0]))[0]  
-            else:
-              name = os.path.splitext(os.path.basename(files))[0]
-
-            if name.count('_') != 0:
-              name = name.split("_")[0]
-
-            Log.Info('Dataset: ' + name)
-
-            instance = methodCall(files, verbose=True)
-            instance.RunMemoryProfiling(dataset["options"]);
-
-            # Call the destructor.
-            del instance
-
-      methodMapping = config.GetConfigMethod(libraryMapping.methods)
-    libraryMapping = config.GetConfigLibraryMethods()
+              # Remove temporary datasets.
+              RemoveDataset(modifiedDataset[1])
 
 if __name__ == '__main__':
   parser = argparse.ArgumentParser(description="""Perform the benchmark with the
       given config.""")
   parser.add_argument('-c','--config', help='Configuration file name.', 
       required=True)
+  parser.add_argument('-b','--blocks', help='Run only the specified blocks.', 
+      required=False)
+  parser.add_argument('-l','--log', help='Save the results in the logfile.', 
+      required=False)
 
   args = parser.parse_args()
 
   if args:
-    SystemInformation()
-    Main(args.config)
\ No newline at end of file
+    log = True if args.log == "True" else False
+    Main(args.config, args.blocks, log)
\ No newline at end of file

Modified: mlpack/conf/jenkins-conf/benchmark/methods/mlpack/pca.py
==============================================================================
--- mlpack/conf/jenkins-conf/benchmark/methods/mlpack/pca.py	(original)
+++ mlpack/conf/jenkins-conf/benchmark/methods/mlpack/pca.py	Tue Aug 13 15:15:29 2013
@@ -71,7 +71,7 @@
   '''
   def __del__(self):    
     Log.Info("Clean up.", self.verbose)
-    filelist = ["gmon.out", "output.csv", "PCA.mout"]
+    filelist = ["gmon.out", "output.csv"]
     for f in filelist:
       if os.path.isfile(f):
         os.remove(f)    
@@ -85,14 +85,14 @@
   @return Returns False if the method was not successful, if the method was 
   successful save the report file in the specified file.
   '''
-  def RunMemoryProfiling(self, methodOptions, massifOptions = "--depth=3"):
+  def RunMemoryProfiling(self, methodOptions, fileName, massifOptions="--depth=2"):
     Log.Info("Perform PCA Memory Profiling.", self.verbose)
 
     # Split the command using shell-like syntax.
     cmd = shlex.split(self.path + "pca -i " + self.dataset + 
         " -o output.csv -v " + methodOptions)
 
-    return Profiler.MassifMemoryUsage(cmd, "PCA.mout", options = massifOptions)
+    return Profiler.MassifMemoryUsage(cmd, fileName, options=massifOptions)
     
   '''
   Perform Principal Components Analysis. If the method has been successfully 

Modified: mlpack/conf/jenkins-conf/benchmark/util/database.py
==============================================================================
--- mlpack/conf/jenkins-conf/benchmark/util/database.py	(original)
+++ mlpack/conf/jenkins-conf/benchmark/util/database.py	Tue Aug 13 15:15:29 2013
@@ -100,6 +100,26 @@
         """)
 
   '''
+  Create a new memory table.
+  '''
+  def CreateMemoryTable(self):
+    self.con.executescript("""
+        CREATE TABLE IF NOT EXISTS memory (
+          id INTEGER PRIMARY KEY AUTOINCREMENT,
+          build_id INTEGER NOT NULL,
+          libary_id INTEGER NOT NULL,
+          method_id INTEGER NOT NULL,
+          dataset_id INTEGER NOT NULL,
+          memory_info TEXT NOT NULL,
+
+          FOREIGN KEY(build_id) REFERENCES builds(id) ON DELETE CASCADE,
+          FOREIGN KEY(libary_id) REFERENCES libraries(id) ON DELETE CASCADE,
+          FOREIGN KEY(dataset_id) REFERENCES datasets(id) ON DELETE CASCADE,
+          FOREIGN KEY(method_id) REFERENCES methods(id) ON DELETE CASCADE
+        );
+        """)
+
+  '''
   Create a new build, libraries, datasets and results table.
   '''
   def CreateTables(self):
@@ -108,6 +128,7 @@
     self.CreateDatasetsTable()
     self.CreateMethodsTable()
     self.CreateResultsTable()
+    self.CreateMemoryTable()
 
   '''
   Add a new build record to the builds table.
@@ -189,6 +210,7 @@
   @param time - The mesured time of the build.
   @param var - The variance of the build.
   @param datasetId - The id of the dataset.
+  @param methodId - The id of the method.
   '''
   def NewResult(self, buildId, libaryId, time, var, datasetId, methodId):
      with self.con:
@@ -304,3 +326,21 @@
            str(buildId[0]) + " AND method_id=" + str(methodId))
         timeSummed.append(self.cur.fetchall()[0][0])
     return (buildId[0], timeSummed)
+
+  '''
+  Add a new memory record to the memory table.
+
+  @param libaryId - The id of the library.
+  @param methodId - The id of the method.
+  @param datasetId - The id of the dataset.
+  @param memoryInfo - The text for the memory value.
+  '''
+  def NewMemory(self, buildId, libaryId, methodId, datasetId, memoryInfo):
+     with self.con:
+      self.cur.execute("INSERT INTO memory VALUES (NULL,?,?,?,?,?)", 
+          (buildId, libaryId, methodId, datasetId, memoryInfo))
+
+  def GetMemoryResults(self, buildId, libaryId, methodId):
+    with self.con:
+      self.cur.execute("SELECT * FROM memory JOIN datasets ON memory.dataset_id = datasets.id WHERE libary_id=" + str(libaryId) + " AND build_id="+ str(buildId) + " AND method_id=" + str(methodId))
+      return self.cur.fetchall()

Modified: mlpack/conf/jenkins-conf/benchmark/util/misc.py
==============================================================================
--- mlpack/conf/jenkins-conf/benchmark/util/misc.py	(original)
+++ mlpack/conf/jenkins-conf/benchmark/util/misc.py	Tue Aug 13 15:15:29 2013
@@ -110,3 +110,30 @@
   datasetType = "real"
 
   return (name, size, attributes, instances, datasetType)
+
+'''
+This function removes a given file or list of files.
+
+ at param dataset - File or list of file which should be deleted.
+'''
+def RemoveDataset(dataset):
+  if isinstance(dataset, str):
+    dataset = [dataset]
+
+  for f in dataset:
+    if os.path.isfile(f):
+      os.remove(f)
+
+'''
+Check if the file is available in one of the given formats.
+
+ at param dataset - Datsets which should be checked.
+ at param formats - List of supported file formats.
+ at return Orginal dataset or dataset with new file format.
+'''
+def CheckFileExtension(dataset, formats):
+  dataExtension = os.path.splitext(dataset)[1][1:]
+  if dataExtension in formats:
+    return dataset
+  else:
+    return dataset[0:len(dataset) - len(dataExtension)] + formats[0]

Modified: mlpack/conf/jenkins-conf/benchmark/util/profiler.py
==============================================================================
--- mlpack/conf/jenkins-conf/benchmark/util/profiler.py	(original)
+++ mlpack/conf/jenkins-conf/benchmark/util/profiler.py	Tue Aug 13 15:15:29 2013
@@ -1,8 +1,8 @@
 '''
-	@file profiler.py
-	@author Marcus Edel
+  @file profiler.py
+  @author Marcus Edel
 
-	Contains functions to get profiling informations.
+  Contains functions to get profiling informations.
 '''
 
 import os
@@ -12,9 +12,9 @@
 # import the util path, this method even works if the path contains
 # symlinks to modules.
 cmd_subfolder = os.path.realpath(os.path.abspath(os.path.join(
-	os.path.split(inspect.getfile(inspect.currentframe()))[0], '')))
+  os.path.split(inspect.getfile(inspect.currentframe()))[0], '')))
 if cmd_subfolder not in sys.path:
-	sys.path.insert(0, cmd_subfolder)
+  sys.path.insert(0, cmd_subfolder)
 
 from log import *
 
@@ -23,60 +23,79 @@
 '''
 class Profiler(object):
 
-	'''
-	Use valgrind massif to get memory profiling information and the save the ouput
-	in the specified file.
-
-	@param cmd - Method command line to profile.
-	@param output - Save the report at the output path with the specified name.
-	@param valgrind - Path to the valgrind binary.
-	@param options - Specified massif options.
-	@ return Returns False if the method was not successful, if the method was 
-	successful save the report file in the specified file. 
-	'''
-	@staticmethod
-	def MassifMemoryUsage(cmd, output, valgrind = "valgrind", options = ""):
-		import shlex, subprocess
-
-		cmd = shlex.split(("%s --tool=massif --massif-out-file=%s %s") % 
-				(valgrind, output, options)) + cmd
-		try:
-			s = subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=False)		
-		except Exception:
-			Log.Fatal("Could not execute command: " + str(cmd))
-			return -1
-
-	'''
-	Returns the memory used by a process and his children. We don't know when the 
-	process is done so we have to poll to get the memory. To avoid memory overflow
-	we use  a ringbuffer to limit the size of the memory values.
-
-	@param process - Popen instance.
-	@param Buffersize - Memory value count.
-	@return List of memory values.
-	'''
-	@staticmethod
-	def SubprocessMemoryUsage(process, Buffersize = 200):
-		import psutil, time, collections
-
-		# Create the process list with the main process and his childrens.
-		p = psutil.Process(process.pid)
-		children = list(p.get_children(recursive=True)) + [p]
-
-		memoryTable = collections.deque(maxlen = Buffersize)
-
-		# We have to poll to get the memory values.
-		while process.poll() == None:
-			try:
-				for p in children:
-					memoryTable.append(int(p.get_memory_info()[0]))
-			# Sometimes a subprocess has terminated in the time between we measure the
-			# memory. In this case, we continue.
-			except psutil.NoSuchProcess: 
-				continue				
-			except psutil.AccessDenied: 
-				continue
+  '''
+  Use valgrind massif to get memory profiling information and the save the ouput
+  in the specified file.
+
+  @param cmd - Method command line to profile.
+  @param output - Save the report at the output path with the specified name.
+  @param valgrind - Path to the valgrind binary.
+  @param options - Specified massif options.
+  @ return Returns False if the method was not successful, if the method was 
+  successful save the report file in the specified file. 
+  '''
+  @staticmethod
+  def MassifMemoryUsage(command, output, valgrind=os.environ["VALGRIND_BIN"], options=""):
+    import shlex, subprocess
+
+    cmd = shlex.split(("%s --tool=massif --massif-out-file=%s %s ") % 
+        (valgrind, output, options)) + command
+    try:
+      s = subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=False)
+    except Exception:
+      Log.Fatal("Could not execute command: " + str(cmd))
+      return -1
+
+  '''
+  Use the valgrind ms_print script to generate the massif output.
+
+  @param fileName - The filename of the valgrind massif log file.
+  @param valgrind - The path to the ms_print script.
+  @return The ms_print output.
+  '''
+  @staticmethod
+  def MassifMemoryUsageReport(fileName, valgrind=os.environ["MS_PRINT_BIN"]):
+    import shlex, subprocess
+
+    cmd = shlex.split(valgrind + " " + fileName)
+    try:
+      s = subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=False)
+      return s
+    except Exception:
+      Log.Fatal("Could not execute command: " + str(cmd))
+      return -1
+
+  '''
+  Returns the memory used by a process and his children. We don't know when the 
+  process is done so we have to poll to get the memory. To avoid memory overflow
+  we use  a ringbuffer to limit the size of the memory values.
+
+  @param process - Popen instance.
+  @param Buffersize - Memory value count.
+  @return List of memory values.
+  '''
+  @staticmethod
+  def SubprocessMemoryUsage(process, Buffersize=200):
+    import psutil, time, collections
+
+    # Create the process list with the main process and his childrens.
+    p = psutil.Process(process.pid)
+    children = list(p.get_children(recursive=True)) + [p]
+
+    memoryTable = collections.deque(maxlen=Buffersize)
+
+    # We have to poll to get the memory values.
+    while process.poll() == None:
+      try:
+        for p in children:
+          memoryTable.append(int(p.get_memory_info()[0]))
+      # Sometimes a subprocess has terminated in the time between we measure the
+      # memory. In this case, we continue.
+      except psutil.NoSuchProcess: 
+        continue        
+      except psutil.AccessDenied: 
+        continue
 
-			time.sleep(0.01)
+      time.sleep(0.01)
 
-		return memoryTable
+    return memoryTable

Modified: mlpack/conf/jenkins-conf/benchmark/util/template.py
==============================================================================
--- mlpack/conf/jenkins-conf/benchmark/util/template.py	(original)
+++ mlpack/conf/jenkins-conf/benchmark/util/template.py	Tue Aug 13 15:15:29 2013
@@ -129,9 +129,7 @@
             <div>
               <div class="panel">
                 <div class="panel-heading">Massif Log</div>
-                  <div class="row">
-                    
-                  </div>
+                  %(memoryContent)s
               </div>
 
             </div>
@@ -150,4 +148,15 @@
           </div>
         </div>
 
+"""
+
+memoryTemplate = """
+<div class="accordion-group">
+<div class="accordion-heading"><a class="accordion-toggle" data-toggle="collapse" data-parent="#accordion2" href="#%(name)s">%(name)s</a></div>
+  <div id="%(name)s" class="accordion-body collapse">
+    <div class="accordion-inner">
+      %(content)s
+    </div>
+  </div>
+</div>
 """
\ No newline at end of file



More information about the mlpack-svn mailing list