[mlpack-git] master: Add a primitive ARFF reader. (1c9ad21)
gitdub at big.cc.gt.atl.ga.us
gitdub at big.cc.gt.atl.ga.us
Wed Dec 23 11:44:47 EST 2015
Repository : https://github.com/mlpack/mlpack
On branch : master
Link : https://github.com/mlpack/mlpack/compare/de9cc4b05069e1fa4793d9355f2f595af5ff45d2...6070527af14296cd99739de6c62666cc5d2a2125
>---------------------------------------------------------------
commit 1c9ad215f88cc69e72e51073cc78c12787867607
Author: ryan <ryan at ratml.org>
Date: Tue Oct 20 17:50:39 2015 -0400
Add a primitive ARFF reader.
>---------------------------------------------------------------
1c9ad215f88cc69e72e51073cc78c12787867607
src/mlpack/core/data/CMakeLists.txt | 2 +
src/mlpack/core/data/load_arff.hpp | 39 +++++++
src/mlpack/core/data/load_arff_impl.hpp | 193 ++++++++++++++++++++++++++++++++
src/mlpack/core/data/load_impl.hpp | 20 ++++
4 files changed, 254 insertions(+)
diff --git a/src/mlpack/core/data/CMakeLists.txt b/src/mlpack/core/data/CMakeLists.txt
index 6fd0aef..87626b0 100644
--- a/src/mlpack/core/data/CMakeLists.txt
+++ b/src/mlpack/core/data/CMakeLists.txt
@@ -7,6 +7,8 @@ set(SOURCES
format.hpp
load.hpp
load_impl.hpp
+ load_arff.hpp
+ load_arff_impl.hpp
normalize_labels.hpp
normalize_labels_impl.hpp
save.hpp
diff --git a/src/mlpack/core/data/load_arff.hpp b/src/mlpack/core/data/load_arff.hpp
new file mode 100644
index 0000000..5401e4d
--- /dev/null
+++ b/src/mlpack/core/data/load_arff.hpp
@@ -0,0 +1,39 @@
+/**
+ * @file load_arff.hpp
+ * @author Ryan Curtin
+ *
+ * Load an ARFF dataset.
+ */
+#ifndef __MLPACK_CORE_DATA_LOAD_ARFF_HPP
+#define __MLPACK_CORE_DATA_LOAD_ARFF_HPP
+
+#include <mlpack/prereqs.hpp>
+
+namespace mlpack {
+namespace data {
+
+/**
+ * A utility function to load an ARFF dataset as numeric features (that is, as
+ * an Armadillo matrix without any modification). An exception will be thrown
+ * if any features are non-numeric.
+ *
+ * NOTE(review): load_arff_impl.hpp, as added alongside this header, only
+ * defines the three-argument overload taking a DatasetInfo; confirm that a
+ * definition of this overload exists before relying on it.
+ *
+ * @param filename Name of the ARFF file to load.
+ * @param matrix Matrix that receives the loaded data.
+ */
+template<typename eT>
+void LoadARFF(const std::string& filename, arma::Mat<eT>& matrix);
+
+/**
+ * A utility function to load an ARFF dataset as numeric and categorical
+ * features, using the DatasetInfo structure for mapping. An exception will be
+ * thrown upon failure.
+ *
+ * @param filename Name of the ARFF file to load.
+ * @param matrix Matrix that receives the data; it is sized with one row per
+ *     attribute and one column per data line (that is, loaded transposed).
+ * @param info Replaced with a DatasetInfo holding the type (numeric or
+ *     categorical) of every attribute; string values are mapped through it.
+ */
+template<typename eT>
+void LoadARFF(const std::string& filename,
+ arma::Mat<eT>& matrix,
+ DatasetInfo& info);
+
+} // namespace data
+} // namespace mlpack
+
+// Include implementation.
+#include "load_arff_impl.hpp"
+
+#endif
diff --git a/src/mlpack/core/data/load_arff_impl.hpp b/src/mlpack/core/data/load_arff_impl.hpp
new file mode 100644
index 0000000..32651f4
--- /dev/null
+++ b/src/mlpack/core/data/load_arff_impl.hpp
@@ -0,0 +1,193 @@
+/**
+ * @file load_arff_impl.hpp
+ * @author Ryan Curtin
+ *
+ * Load an ARFF dataset.
+ */
+#ifndef __MLPACK_CORE_DATA_LOAD_ARFF_IMPL_HPP
+#define __MLPACK_CORE_DATA_LOAD_ARFF_IMPL_HPP
+
+// In case it hasn't been included yet.
+#include "load_arff.hpp"
+
+namespace mlpack {
+namespace data {
+
+/**
+ * Load an ARFF file into a matrix, recording each dimension's type in info.
+ * Each data row is stored as one column of the matrix (we load transposed).
+ *
+ * Only numeric/integer/real and string attributes are handled; nominal value
+ * lists, sparse rows and missing values ('?') are rejected with an exception.
+ *
+ * @param filename Path of the ARFF file to read.
+ * @param matrix Output matrix, resized to (attributes x data rows).
+ * @param info Output mapping, overwritten with one entry per attribute.
+ * @throws std::runtime_error if the file cannot be opened or is malformed.
+ * @throws std::logic_error if a nominal value list is detected.
+ */
+template<typename eT>
+void LoadARFF(const std::string& filename,
+              arma::Mat<eT>& matrix,
+              DatasetInfo& info)
+{
+  // A stream that failed to open never reports EOF, so reject it up front
+  // instead of letting the read loops below spin on it.
+  std::ifstream ifs(filename);
+  if (!ifs.is_open())
+    throw std::runtime_error("cannot open ARFF file '" + filename + "'");
+
+  typedef boost::tokenizer<boost::escaped_list_separator<char>> Tokenizer;
+
+  std::string line;
+  size_t dimensionality = 0;
+  std::vector<bool> types; // true: categorical, false: numeric.
+  size_t lineNumber = 0; // 1-based number of the line most recently read.
+  bool foundData = false;
+  while (!foundData && std::getline(ifs, line, '\n'))
+  {
+    // Strip whitespace from either side of the line.
+    boost::trim(line);
+    ++lineNumber;
+
+    // Comments, blank lines and anything that is not an annotation (@relation,
+    // @attribute or @data) are ignored in the header.
+    if (line.empty() || line[0] != '@')
+      continue;
+
+    // Split on spaces and tabs.  Runs of separators produce empty tokens, so
+    // only the non-empty ones are kept.
+    boost::escaped_list_separator<char> sep("\\", " \t", "\"{");
+    Tokenizer tok(line, sep);
+    std::vector<std::string> fields;
+    for (Tokenizer::iterator it = tok.begin(); it != tok.end(); ++it)
+    {
+      if (!(*it).empty())
+        fields.push_back(*it);
+    }
+    if (fields.empty())
+      continue;
+
+    // ARFF keywords are case-insensitive, so compare in lower case.
+    std::string annotation = fields[0];
+    std::transform(annotation.begin(), annotation.end(), annotation.begin(),
+        ::tolower);
+
+    if (annotation == "@relation")
+    {
+      // We don't actually have anything to do with the name of the dataset.
+      continue;
+    }
+    else if (annotation == "@attribute")
+    {
+      // fields[1] is the attribute name and fields[2] is its type.
+      if (fields.size() < 3)
+      {
+        throw std::runtime_error("malformed ARFF attribute declaration '" +
+            line + "'");
+      }
+
+      std::string dimType = fields[2];
+      std::transform(dimType.begin(), dimType.end(), dimType.begin(),
+          ::tolower);
+
+      if (dimType == "numeric" || dimType == "integer" || dimType == "real")
+      {
+        types.push_back(false); // The feature is numeric.
+      }
+      else if (dimType == "string")
+      {
+        types.push_back(true); // The feature is categorical.
+      }
+      else if (dimType[0] == '{')
+      {
+        // NOTE(review): '{' is also passed as a quote character to the
+        // separator above, which may strip it before this test -- confirm.
+        throw std::logic_error("list of ARFF values not yet supported");
+      }
+      else
+      {
+        // Accepting an unknown type would leave 'types' shorter than
+        // 'dimensionality' and misalign every later dimension.
+        throw std::runtime_error("unsupported ARFF attribute type '" +
+            fields[2] + "'");
+      }
+
+      ++dimensionality;
+    }
+    else if (annotation == "@data")
+    {
+      // We are in the data section, so the header loop can stop.
+      foundData = true;
+    }
+    else
+    {
+      throw std::runtime_error("unknown ARFF annotation '" + fields[0] + "'");
+    }
+  }
+
+  if (!foundData)
+    throw std::runtime_error("no @data section found");
+
+  info = DatasetInfo(dimensionality);
+  for (size_t i = 0; i < types.size(); ++i)
+    info.Type(i) = types[i] ? Datatype::categorical : Datatype::numeric;
+
+  // Count the data rows so the matrix can be sized before parsing.  Blank
+  // lines and comments are skipped here exactly as in the parsing loop below,
+  // so the count does not depend on whether the file ends with a newline.
+  const std::streampos pos = ifs.tellg();
+  size_t rows = 0;
+  while (std::getline(ifs, line, '\n'))
+  {
+    boost::trim(line);
+    if (!line.empty() && line[0] != '%')
+      ++rows;
+  }
+
+  // Since we've hit the EOF, we have to call clear() so we can seek again.
+  ifs.clear();
+  ifs.seekg(pos);
+
+  // Now, set the size of the matrix.
+  matrix.set_size(dimensionality, rows);
+
+  // Each line of the @data section must be a CSV row (sparse rows are not
+  // handled yet) holding exactly one value per attribute.
+  size_t row = 0;
+  std::stringstream token;
+  while (std::getline(ifs, line, '\n'))
+  {
+    boost::trim(line);
+    ++lineNumber;
+    if (line.empty() || line[0] == '%')
+      continue;
+
+    // If the first character is {, it is sparse data, and we can just say this
+    // is not handled for now...
+    if (line[0] == '{')
+      throw std::runtime_error("cannot yet parse sparse ARFF data");
+
+    // Tokenize the line.
+    boost::escaped_list_separator<char> sep("\\", ",", "\"");
+    Tokenizer tok(line, sep);
+
+    size_t col = 0;
+    for (Tokenizer::iterator it = tok.begin(); it != tok.end(); ++it, ++col)
+    {
+      // Never index past the declared attributes.
+      if (col >= dimensionality)
+      {
+        std::stringstream error;
+        error << "too many values at line " << lineNumber << " (expected "
+            << dimensionality << ")";
+        throw std::runtime_error(error.str());
+      }
+
+      // Values may be padded with whitespace around the commas.
+      std::string field = *it;
+      boost::trim(field);
+
+      if (info.Type(col) == Datatype::categorical)
+      {
+        matrix(col, row) = info.MapString(field, col); // We load transposed.
+      }
+      else if (info.Type(col) == Datatype::numeric)
+      {
+        // Attempt to read as numeric.
+        token.clear();
+        token.str(field);
+
+        eT val = eT(0);
+        token >> val;
+
+        // On failure, check for NaN or inf before giving up.
+        if (token.fail() && !arma::diskio::convert_naninf(val, field))
+        {
+          // If it's '?', we issue a specific error, otherwise a general one.
+          std::stringstream error;
+          if (field == "?")
+            error << "missing values ('?') not supported ";
+          else
+            error << "parse error ";
+          error << "at line " << lineNumber << " token " << col;
+          throw std::runtime_error(error.str());
+        }
+
+        // If we made it to here, we have a value.
+        matrix(col, row) = val; // We load transposed.
+      }
+    }
+
+    // A short row would otherwise leave uninitialized entries in the matrix.
+    if (col != dimensionality)
+    {
+      std::stringstream error;
+      error << "too few values at line " << lineNumber << " (expected "
+          << dimensionality << ", found " << col << ")";
+      throw std::runtime_error(error.str());
+    }
+
+    ++row;
+  }
+}
+
+} // namespace data
+} // namespace mlpack
+
+#endif
diff --git a/src/mlpack/core/data/load_impl.hpp b/src/mlpack/core/data/load_impl.hpp
index 5ce19d0..cd9bedf 100644
--- a/src/mlpack/core/data/load_impl.hpp
+++ b/src/mlpack/core/data/load_impl.hpp
@@ -23,6 +23,8 @@
#include "serialization_shim.hpp"
+#include "load_arff.hpp"
+
namespace mlpack {
namespace data {
@@ -432,6 +434,24 @@ bool Load(const std::string& filename,
++row;
}
}
+ else if (extension == "arff")
+ {
+ try
+ {
+ LoadARFF(filename, matrix, info);
+
+ // We transpose by default. So, un-transpose if necessary...
+ if (!transpose)
+ inplace_transpose(matrix);
+ }
+ catch (std::exception& e)
+ {
+ if (fatal)
+ Log::Fatal << e.what() << std::endl;
+ else
+ Log::Warn << e.what() << std::endl;
+ }
+ }
else
{
// The type is unknown.
More information about the mlpack-git
mailing list