[mlpack-svn] r15588 - in mlpack/conf/jenkins-conf/benchmark: benchmark util

Tue Aug 6 16:43:30 EDT 2013

Author: marcus
Date: Tue Aug  6 16:43:30 2013
New Revision: 15588

Log:
Add function to get dataset information.

Modified:
   mlpack/conf/jenkins-conf/benchmark/benchmark/run_benchmark.py
   mlpack/conf/jenkins-conf/benchmark/util/misc.py

Modified: mlpack/conf/jenkins-conf/benchmark/benchmark/run_benchmark.py
==============================================================================

--- mlpack/conf/jenkins-conf/benchmark/benchmark/run_benchmark.py	(original)
+++ mlpack/conf/jenkins-conf/benchmark/benchmark/run_benchmark.py	Tue Aug  6 16:43:30 2013
@@ -20,6 +20,7 @@
 from parser import *
 from convert import *
 from misc import *
+from database import *
 
 import argparse
 import datetime
@@ -36,19 +37,6 @@
   Log.Info('CPU Cores: ' + SystemInfo.GetCPUCores())
 
 '''
-Normalize the dataset name. If the dataset is a list of datasets, take the first
-dataset as name. If necessary remove characters like '.', '_'.
-
-@param dataset - Dataset file or a list of datasets files.
-@return Normalized dataset name.
-'''
-def NormalizeDatasetName(dataset):
-  if not isinstance(dataset, str):
-    return os.path.splitext(os.path.basename(dataset[0]))[0].split('_')[0]
-  else:
-    return os.path.splitext(os.path.basename(dataset))[0].split('_')[0]
-
-'''
 Check if the file is available in one of the given formats.
 
 @param dataset - Datsets which should be checked.

Modified: mlpack/conf/jenkins-conf/benchmark/util/misc.py
==============================================================================
--- mlpack/conf/jenkins-conf/benchmark/util/misc.py	(original)
+++ mlpack/conf/jenkins-conf/benchmark/util/misc.py	Tue Aug  6 16:43:30 2013
@@ -5,6 +5,8 @@
   Supporting functions.
 '''
 
+import os
+
 '''
 This function determinate if the given number is a float.
 
@@ -47,6 +49,19 @@
   return table
 
 '''
+Normalize the dataset name. If the dataset is a list of datasets, take the first
+dataset as name. If necessary remove characters like '.', '_'.
+
+@param dataset - Dataset file or a list of datasets files.
+@return Normalized dataset name.
+'''
+def NormalizeDatasetName(dataset):
+  if not isinstance(dataset, str):
+    return os.path.splitext(os.path.basename(dataset[0]))[0].split('_')[0]
+  else:
+    return os.path.splitext(os.path.basename(dataset))[0].split('_')[0]
+
+'''
 Search the correct row to insert the new data. We look at the left column for
 a free place or for the matching name.
 
@@ -58,3 +73,27 @@
   for row in range(datasetCount):
     if (dataMatrix[row][0] == datasetName) or (dataMatrix[row][0] == "-"):
       return row
+
+'''
+Collect information for the given dataset.
+
+@param path - Path to the dataset.
+@return Tuple which contains the information about the given dataset.
+'''
+def DatasetInfo(path):
+  instances = 0
+  with open(path, "r") as fid:
+    for line in fid:
+      instances += 1
+
+  attributes = 0
+  with open(path, "r") as fid:
+    for line in fid:
+      attributes = line.count(",") + 1
+      break
+
+  name = NormalizeDatasetName(path)
+  size = os.path.getsize(path) / (1 << 20)
+  datasetType = "real"
+
+  return (name, size, attributes, instances, datasetType)