[mlpack-svn] r15509 - in mlpack/conf/jenkins-conf/benchmark: benchmark util

fastlab-svn at coffeetalk-1.cc.gatech.edu fastlab-svn at coffeetalk-1.cc.gatech.edu
Fri Jul 19 14:59:28 EDT 2013


Author: marcus
Date: Fri Jul 19 14:59:27 2013
New Revision: 15509

Log:
Add function to modify datasets (e.g. add arff header).

Added:
   mlpack/conf/jenkins-conf/benchmark/util/convert.py
Modified:
   mlpack/conf/jenkins-conf/benchmark/benchmark/run_benchmark.py
   mlpack/conf/jenkins-conf/benchmark/util/parser.py

Modified: mlpack/conf/jenkins-conf/benchmark/benchmark/run_benchmark.py
==============================================================================
--- mlpack/conf/jenkins-conf/benchmark/benchmark/run_benchmark.py	(original)
+++ mlpack/conf/jenkins-conf/benchmark/benchmark/run_benchmark.py	Fri Jul 19 14:59:27 2013
@@ -18,6 +18,7 @@
 from system import *
 from loader import * 
 from parser import *
+from convert import *
 
 import argparse
 
@@ -40,12 +41,80 @@
 @return Normalized dataset name.
 '''
 def NormalizeDatasetName(dataset):
-  if  not isinstance(dataset, basestring):
+  if not isinstance(dataset, basestring):
     return os.path.splitext(os.path.basename(dataset[0]))[0].split('_')[0]
   else:
     return os.path.splitext(os.path.basename(dataset))[0].split('_')[0]
 
 '''
+Check if the file is available in one of the given formats.
+
+@para dataset - Dataset which should be checked.
+@para formats - List of supported file formats.
+@return Original dataset or dataset with new file format.
+'''
+def CheckFileExtension(dataset, formats):
+  dataExtension = os.path.splitext(dataset)[1][1:]
+  if dataExtension in formats:
+    return dataset
+  else:
+    return dataset[0:len(dataset) - len(dataExtension)] + formats[0]
+
+'''
+Return a list with modified dataset.
+
+@para dataset - Datasets to be modified.
+@para format - List of file formats to be converted to.
+@return List of modified datasets.
+'''
+def GetDataset(dataset, format):
+  # Check if the given dataset is a list or a single dataset.
+  if not isinstance(dataset, basestring):
+    datasetList = []
+    modifiedList = []
+
+    for data in dataset:  
+      mdata = CheckFileExtension(data, format)
+
+      # Check if the dataset is available.
+      if os.path.isfile(mdata):
+        datasetList.append(mdata)
+      else:
+        # Convert the dataset.
+        convert = Convert(data, format[0])
+        datasetList.append(convert.modifiedDataset)
+        modifiedList.append(convert.modifiedDataset)
+  else:
+    datasetList = ""
+    modifiedList = ""
+
+    mdataset = CheckFileExtension(dataset, format)
+
+    # Check if the dataset is available.
+    if os.path.isfile(mdataset):
+      datasetList = mdataset
+    else:
+      # Convert the Dataset.
+      convert = Convert(dataset, format[0])
+      datasetList = convert.modifiedDataset
+      modifiedList = convert.modifiedDataset
+
+  return (datasetList, modifiedList)
+
+'''
+Remove a given file or list of files.
+
+@para dataset - File or list of files which should be deleted.
+'''
+def RemoveDataset(dataset):
+  if isinstance(dataset, basestring):
+    dataset = [dataset]
+
+  for f in dataset:
+    if os.path.isfile(f):
+      os.remove(f)  
+
+'''
 Add all rows from a given matrix to a given table.
 
 @para matrix - 2D array contains the row.
@@ -122,29 +191,35 @@
         datsets = libary[1]
         trials = libary[2]
         script = libary[3]
+        format = libary[4]
 
         Log.Info("Libary: " + name)
         header.append(name)
 
         # Load script.
         module = Loader.ImportModuleFromPath(script)
-        methodCall = getattr(module, method)       
+        methodCall = getattr(module, method)            
 
         for dataset in datsets:  
           datasetName = NormalizeDatasetName(dataset)          
           row = FindRightRow(dataMatrix, datasetName, datasetCount)      
 
           dataMatrix[row][0] = NormalizeDatasetName(dataset)
-          Log.Info("Dataset: " + dataMatrix[row][0])        
+          Log.Info("Dataset: " + dataMatrix[row][0])    
+
+          modifiedDataset = GetDataset(dataset, format)
 
           time = 0
           for trial in range(trials + 1):
-            instance = methodCall(dataset, verbose=False)
+            instance = methodCall(modifiedDataset[0], verbose=False)
             if trial > 0:
               time += instance.RunMethod(options);
 
           # Set time.
           dataMatrix[row][col] = "{0:.6f}".format(time / trials)
+
+          # Remove temporary datasets.
+          RemoveDataset(modifiedDataset[1])
           row += 1
         col += 1
 

Added: mlpack/conf/jenkins-conf/benchmark/util/convert.py
==============================================================================
--- (empty file)
+++ mlpack/conf/jenkins-conf/benchmark/util/convert.py	Fri Jul 19 14:59:27 2013
@@ -0,0 +1,91 @@
+'''
+  @file convert.py
+  @author Marcus Edel
+
+  Implementation of the Convert class.
+'''
+
+import os, sys, inspect
+
+# Import the util path, this method even works if the path contains
+# symlinks to modules.
+cmd_subfolder = os.path.realpath(os.path.abspath(os.path.join(
+  os.path.split(inspect.getfile(inspect.currentframe()))[0], "")))
+if cmd_subfolder not in sys.path:
+  sys.path.insert(0, cmd_subfolder)
+
+from log import *
+
+import os.path
+
+'''
+This class implements functions to convert files.
+'''
+class Convert(object):
+  '''
+  Convert dataset to a file with the given extension.
+
+  @para dataset - Convert this dataset.
+  @para extension - Convert dataset to a new file with this extension.
+  '''
+  def __init__(self, dataset, extension):
+    self.extension = extension
+    self.modifiedDataset = ""
+
+    self.ModifyDataset(dataset, extension)
+
+  '''
+  Decide which method we have to call to modify the dataset.
+
+  @para dataset - Convert this dataset.
+  @para extension - Convert dataset to a new file with this extension.
+  '''
+  def ModifyDataset(self, dataset, extension):
+    dataExtension = os.path.splitext(dataset)[1][1:]
+    newDataset = dataset[0:len(dataset) - len(dataExtension)] + extension
+
+    if extension == "arff" and (dataExtension == "csv" or dataExtension == "txt"):
+      self.AddArffHeader(dataset, newDataset)
+    else:
+      Log.Fatal("No conversion possible.")
+      pass
+
+  '''
+  Add a header to the dataset file.
+
+  @para data - This dataset contains the information.
+  @para newData - This dataset contains the information and the header.
+  '''
+  def AddArffHeader(self, data, newData):
+    # Extract the dataset name.
+    relationName = os.path.splitext(os.path.basename(data))[0].split('_')[0]
+
+    # Read the first line to get the attribute count.
+    fid = open(data)
+    head = [fid.next() for x in xrange(1)]
+    fid.close()
+    
+    # We can convert files with ' ' and ',' as separator.
+    count = max(head[0].count(","), head[0].count(" ")) + 1
+
+    # Write the header to the new file.
+    nfid = open(newData, "a")
+    nfid.write("@relation " + relationName + "\n\n")
+    for i in range(count):
+      nfid.write("@attribute " + data + "_dim" + str(i) + " NUMERIC\n")
+    nfid.write("\n@data\n")
+
+    # Append the data to the new file.
+    fid = open(data, "r")
+    while True:
+      line = fid.read(65536)
+      if line:
+        nfid.write(line)
+      else:
+        break
+
+    fid.close()
+    nfid.close()
+
+    # Add the modified datasetname to the list.
+    self.modifiedDataset = newData

Modified: mlpack/conf/jenkins-conf/benchmark/util/parser.py
==============================================================================
--- mlpack/conf/jenkins-conf/benchmark/util/parser.py	(original)
+++ mlpack/conf/jenkins-conf/benchmark/util/parser.py	Fri Jul 19 14:59:27 2013
@@ -310,7 +310,6 @@
       # Iterate through all methods.
       methodMapping = self.GetConfigMethod(libraryMapping.methods)      
       while methodMapping and libraryMapping:
-
         # Collect data only from method with run value = true.
         if methodMapping.run:
           for dataset in methodMapping.datasets:     
@@ -320,16 +319,19 @@
 
               if dataset["options"] in tempDict:              
                 t = (libraryMapping.libraryName, dataset["files"], 
-                  methodMapping.iteration, methodMapping.script)  
+                  methodMapping.iteration, methodMapping.script, 
+                  methodMapping.format)  
                 tempDict[dataset["options"]].append(t)          
               else:
                 t = (libraryMapping.libraryName, dataset["files"], 
-                  methodMapping.iteration, methodMapping.script)            
+                  methodMapping.iteration, methodMapping.script, 
+                  methodMapping.format)            
                 tempDict[dataset["options"]] = [t]
             else:
               d = {}
               t = (libraryMapping.libraryName, dataset["files"], 
-                methodMapping.iteration, methodMapping.script)            
+                methodMapping.iteration, methodMapping.script, 
+                methodMapping.format)            
               d[dataset["options"]] = [t]
               streamData[methodMapping.methodName] = d          
 



More information about the mlpack-svn mailing list