[mlpack-git] master: Add a primitive ARFF reader. (1c9ad21)

Wed Dec 23 11:44:47 EST 2015

Repository : https://github.com/mlpack/mlpack

On branch  : master
Link       : https://github.com/mlpack/mlpack/compare/de9cc4b05069e1fa4793d9355f2f595af5ff45d2...6070527af14296cd99739de6c62666cc5d2a2125

>---------------------------------------------------------------

commit 1c9ad215f88cc69e72e51073cc78c12787867607
Author: ryan <ryan at ratml.org>
Date:   Tue Oct 20 17:50:39 2015 -0400

    Add a primitive ARFF reader.


>---------------------------------------------------------------

1c9ad215f88cc69e72e51073cc78c12787867607
 src/mlpack/core/data/CMakeLists.txt     |   2 +
 src/mlpack/core/data/load_arff.hpp      |  39 +++++++
 src/mlpack/core/data/load_arff_impl.hpp | 193 ++++++++++++++++++++++++++++++++
 src/mlpack/core/data/load_impl.hpp      |  20 ++++
 4 files changed, 254 insertions(+)

diff --git a/src/mlpack/core/data/CMakeLists.txt b/src/mlpack/core/data/CMakeLists.txt
index 6fd0aef..87626b0 100644
--- a/src/mlpack/core/data/CMakeLists.txt
+++ b/src/mlpack/core/data/CMakeLists.txt
@@ -7,6 +7,8 @@ set(SOURCES
   format.hpp
   load.hpp
   load_impl.hpp
+  load_arff.hpp
+  load_arff_impl.hpp
   normalize_labels.hpp
   normalize_labels_impl.hpp
   save.hpp
diff --git a/src/mlpack/core/data/load_arff.hpp b/src/mlpack/core/data/load_arff.hpp
new file mode 100644
index 0000000..5401e4d
--- /dev/null
+++ b/src/mlpack/core/data/load_arff.hpp
@@ -0,0 +1,39 @@
+/**
+ * @file load_arff.hpp
+ * @author Ryan Curtin
+ *
+ * Load an ARFF dataset.
+ */
+#ifndef __MLPACK_CORE_DATA_LOAD_ARFF_HPP
+#define __MLPACK_CORE_DATA_LOAD_ARFF_HPP
+
+#include <mlpack/prereqs.hpp>
+
+namespace mlpack {
+namespace data {
+
+/**
+ * Load an ARFF dataset as numeric features (an Armadillo matrix without any
+ * modification); throws if any feature is non-numeric.  NOTE(review): this
+ * overload has no definition in load_arff_impl.hpp -- confirm it is provided.
+ */
+template<typename eT>
+void LoadARFF(const std::string& filename, arma::Mat<eT>& matrix);
+
+/**
+ * Load an ARFF dataset as numeric and categorical features.  The matrix gets
+ * one column per data line; string attributes are mapped through `info`.  An
+ * exception is thrown upon failure (e.g. sparse data or no @data section).
+ */
+template<typename eT>
+void LoadARFF(const std::string& filename,
+              arma::Mat<eT>& matrix,
+              DatasetInfo& info);
+
+} // namespace data
+} // namespace mlpack
+
+// Include implementation.
+#include "load_arff_impl.hpp"
+
+#endif
diff --git a/src/mlpack/core/data/load_arff_impl.hpp b/src/mlpack/core/data/load_arff_impl.hpp
new file mode 100644
index 0000000..32651f4
--- /dev/null
+++ b/src/mlpack/core/data/load_arff_impl.hpp
@@ -0,0 +1,193 @@
+/**
+ * @file load_arff_impl.hpp
+ * @author Ryan Curtin
+ *
+ * Load an ARFF dataset.
+ */
+#ifndef __MLPACK_CORE_DATA_LOAD_ARFF_IMPL_HPP
+#define __MLPACK_CORE_DATA_LOAD_ARFF_IMPL_HPP
+
+// In case it hasn't been included yet.
+#include "load_arff.hpp"
+
+namespace mlpack {
+namespace data {
+
+/**
+ * Load an ARFF file into a matrix, recording each attribute's type in info.
+ * The matrix is sized (attributes x data lines), so every data line becomes
+ * one column; string attributes are stored as the values returned by
+ * info.MapString().
+ *
+ * @param filename Path of the ARFF file to read.
+ * @param matrix Matrix to fill; it is resized to fit the data.
+ * @param info Overwritten with a DatasetInfo holding each attribute's type.
+ * @throws std::runtime_error if the file cannot be opened or parsed.
+ * @throws std::logic_error if an attribute is declared as a nominal list.
+ */
+template<typename eT>
+void LoadARFF(const std::string& filename,
+              arma::Mat<eT>& matrix,
+              DatasetInfo& info)
+{
+  // First, open the file.  A failed open never sets eofbit, so it has to be
+  // detected here rather than by the read loops below.
+  std::ifstream ifs(filename);
+  if (!ifs.is_open())
+    throw std::runtime_error("cannot open ARFF file '" + filename + "'");
+
+  typedef boost::tokenizer<boost::escaped_list_separator<char>> Tokenizer;
+
+  std::string line;
+  std::vector<bool> types; // One entry per attribute; true means categorical.
+  size_t lineNumber = 0; // 1-based number of the line most recently read.
+  bool foundData = false;
+  while (!foundData && std::getline(ifs, line))
+  {
+    ++lineNumber;
+    boost::trim(line);
+
+    // Skip blank lines, comments, and anything that is not an annotation.
+    if (line.empty() || line[0] != '@')
+      continue;
+
+    // Split the annotation on whitespace.  '{' must not be a quote character:
+    // the tokenizer drops quote characters, which would hide the brace that
+    // identifies a nominal list.  Runs of whitespace yield empty tokens, so
+    // only the non-empty ones are kept.
+    boost::escaped_list_separator<char> sep("\\", " \t", "\"");
+    Tokenizer tok(line, sep);
+    std::vector<std::string> fields;
+    for (Tokenizer::iterator it = tok.begin(); it != tok.end(); ++it)
+    {
+      if (!it->empty())
+        fields.push_back(*it);
+    }
+
+    // ARFF keywords are case-insensitive, so compare in lowercase.
+    std::string annotation = fields[0];
+    std::transform(annotation.begin(), annotation.end(), annotation.begin(),
+        ::tolower);
+
+    // @relation only names the dataset; @data ends the header.
+    foundData = (annotation == "@data");
+    if (foundData || annotation == "@relation")
+      continue;
+
+    if (annotation != "@attribute")
+      throw std::runtime_error("unknown ARFF annotation '" + fields[0] + "'");
+
+    // An attribute needs a name (fields[1], unused) and a type (fields[2]).
+    if (fields.size() < 3)
+      throw std::runtime_error("malformed ARFF attribute '" + line + "'");
+
+    std::string dimType = fields[2];
+    std::transform(dimType.begin(), dimType.end(), dimType.begin(),
+        ::tolower);
+
+    if (dimType[0] == '{')
+      throw std::logic_error("list of ARFF values not yet supported");
+
+    // Reject other types (e.g. date) instead of silently dropping the
+    // attribute, which would misalign every later column.
+    const bool numeric = (dimType == "numeric" || dimType == "integer" ||
+        dimType == "real");
+    if (!numeric && dimType != "string")
+      throw std::runtime_error("unsupported ARFF attribute type '" +
+          fields[2] + "'");
+
+    types.push_back(!numeric); // true marks a categorical (string) feature.
+  }
+
+  if (!foundData)
+    throw std::runtime_error("no @data section found");
+
+  const size_t dimensionality = types.size();
+  info = DatasetInfo(dimensionality);
+  for (size_t i = 0; i < dimensionality; ++i)
+    info.Type(i) = types[i] ? Datatype::categorical : Datatype::numeric;
+
+  // We need to find out how many lines of data are in the file.  Blank lines
+  // and comments are not counted, so they cannot leave unset columns behind.
+  const std::streampos dataStart = ifs.tellg();
+  const size_t dataStartLine = lineNumber;
+  size_t numPoints = 0;
+  while (std::getline(ifs, line))
+  {
+    boost::trim(line);
+    if (!line.empty() && line[0] != '%')
+      ++numPoints;
+  }
+
+  // Since we've hit the EOF, we have to call clear() so we can seek again.
+  ifs.clear();
+  ifs.seekg(dataStart);
+  lineNumber = dataStartLine;
+
+  // Now, set the size of the matrix.
+  matrix.set_size(dimensionality, numPoints);
+
+  // Second pass: each data line is a CSV record.  A numeric value that does
+  // not parse, including the '?' missing-value marker, is an error.
+  boost::escaped_list_separator<char> csvSep("\\", ",", "\"");
+  std::stringstream token;
+  size_t row = 0;
+  while (std::getline(ifs, line))
+  {
+    ++lineNumber;
+    boost::trim(line);
+    if (line.empty() || line[0] == '%')
+      continue; // Not a data line; it was not counted above either.
+
+    // If the first character is {, it is sparse data, and we can just say this
+    // is not handled for now...
+    if (line[0] == '{')
+      throw std::runtime_error("cannot yet parse sparse ARFF data");
+
+    Tokenizer tok(line, csvSep);
+    size_t col = 0;
+    for (Tokenizer::iterator it = tok.begin(); it != tok.end(); ++it, ++col)
+    {
+      if (col >= dimensionality)
+        continue; // Surplus value: only count it; reported after the loop.
+
+      // Whitespace around a value is not significant.
+      const std::string value = boost::trim_copy(*it);
+      if (info.Type(col) == Datatype::categorical)
+      {
+        matrix(col, row) = info.MapString(value, col); // We load transposed.
+        continue;
+      }
+
+      // Attempt to read as numeric, falling back to NaN/inf spellings.
+      token.clear();
+      token.str(value);
+      eT val = eT(0);
+      token >> val;
+      if (token.fail() && !arma::diskio::convert_naninf(val, value))
+      {
+        // If it's '?', we issue a specific error, otherwise a general one.
+        std::stringstream error;
+        error << (value == "?" ? "missing values ('?') not supported " :
+            "parse error ") << "at line " << lineNumber << " token " << col;
+        throw std::runtime_error(error.str());
+      }
+
+      matrix(col, row) = val; // We load transposed.
+    }
+
+    if (col != dimensionality)
+    {
+      std::stringstream error;
+      error << "expected " << dimensionality << " values but found " << col
+          << " at line " << lineNumber;
+      throw std::runtime_error(error.str());
+    }
+    ++row;
+  }
+}
+
+} // namespace data
+} // namespace mlpack
+
+#endif
diff --git a/src/mlpack/core/data/load_impl.hpp b/src/mlpack/core/data/load_impl.hpp
index 5ce19d0..cd9bedf 100644
--- a/src/mlpack/core/data/load_impl.hpp
+++ b/src/mlpack/core/data/load_impl.hpp
@@ -23,6 +23,8 @@
 
 #include "serialization_shim.hpp"
 
+#include "load_arff.hpp"
+
 namespace mlpack {
 namespace data {
 
@@ -432,6 +434,24 @@ bool Load(const std::string& filename,
       ++row;
     }
   }
+  else if (extension == "arff")
+  {
+    try
+    {
+      LoadARFF(filename, matrix, info);
+
+      // We transpose by default.  So, un-transpose if necessary...
+      if (!transpose)
+        inplace_transpose(matrix);
+    }
+    catch (std::exception& e)
+    {
+      if (fatal)
+        Log::Fatal << e.what() << std::endl;
+      else
+        Log::Warn << e.what() << std::endl;
+    }
+  }
   else
   {
     // The type is unknown.