[mlpack-git] master: Handle non-new DatasetInfo objects better. (06eeee1)

gitdub at big.cc.gt.atl.ga.us gitdub at big.cc.gt.atl.ga.us
Wed Dec 23 11:46:26 EST 2015


Repository : https://github.com/mlpack/mlpack

On branch  : master
Link       : https://github.com/mlpack/mlpack/compare/de9cc4b05069e1fa4793d9355f2f595af5ff45d2...6070527af14296cd99739de6c62666cc5d2a2125

>---------------------------------------------------------------

commit 06eeee1ea7155813c654cd9bc7c853a0d7e2ebfc
Author: Ryan Curtin <ryan at ratml.org>
Date:   Wed Nov 18 15:26:32 2015 -0800

    Handle non-new DatasetInfo objects better.


>---------------------------------------------------------------

06eeee1ea7155813c654cd9bc7c853a0d7e2ebfc
 src/mlpack/core/data/load_arff.hpp      | 17 +++++++++++++++++
 src/mlpack/core/data/load_arff_impl.hpp | 15 ++++++++++++++-
 src/mlpack/tests/load_save_test.cpp     | 33 +++++++++++++++++++++++++++++++++
 3 files changed, 64 insertions(+), 1 deletion(-)

diff --git a/src/mlpack/core/data/load_arff.hpp b/src/mlpack/core/data/load_arff.hpp
index 5401e4d..f11bda0 100644
--- a/src/mlpack/core/data/load_arff.hpp
+++ b/src/mlpack/core/data/load_arff.hpp
@@ -24,6 +24,23 @@ void LoadARFF(const std::string& filename, arma::Mat<eT>& matrix);
  * A utility function to load an ARFF dataset as numeric and categorical
  * features, using the DatasetInfo structure for mapping.  An exception will be
  * thrown upon failure.
+ *
+ * A pre-existing DatasetInfo object can be passed in, but if the dimensionality
+ * of the given DatasetInfo object (info.Dimensionality()) does not match the
+ * dimensionality of the data, a std::invalid_argument exception will be thrown.
+ * If an empty DatasetInfo object is given (constructed with the default
+ * constructor or otherwise, so that info.Dimensionality() is 0), it will be set
+ * to the right dimensionality.
+ *
+ * This ability to pass in pre-existing DatasetInfo objects is very necessary
+ * when, e.g., loading a test set after training.  If the same DatasetInfo from
+ * loading the training set is not used, then the test set may be loaded with
+ * different mappings---which can cause horrible problems!
+ *
+ * @param filename Name of ARFF file to load.
+ * @param matrix Matrix to load data into.
+ * @param info DatasetInfo object; can be default-constructed or pre-existing
+ *     from another call to LoadARFF().
  */
 template<typename eT>
 void LoadARFF(const std::string& filename,
diff --git a/src/mlpack/core/data/load_arff_impl.hpp b/src/mlpack/core/data/load_arff_impl.hpp
index 1e6dc73..d870c84 100644
--- a/src/mlpack/core/data/load_arff_impl.hpp
+++ b/src/mlpack/core/data/load_arff_impl.hpp
@@ -93,7 +93,20 @@ void LoadARFF(const std::string& filename,
   if (ifs.eof())
     throw std::runtime_error("no @data section found");
 
-  info = DatasetInfo(dimensionality);
+  // Reset the DatasetInfo object, if needed.
+  if (info.Dimensionality() == 0)
+  {
+    info = DatasetInfo(dimensionality);
+  }
+  else if (info.Dimensionality() != dimensionality)
+  {
+    std::ostringstream oss;
+    oss << "data::LoadARFF(): given DatasetInfo has dimensionality "
+        << info.Dimensionality() << ", but data has dimensionality "
+        << dimensionality;
+    throw std::invalid_argument(oss.str());
+  }
+
   for (size_t i = 0; i < types.size(); ++i)
   {
     if (types[i])
diff --git a/src/mlpack/tests/load_save_test.cpp b/src/mlpack/tests/load_save_test.cpp
index b533204..f0662e3 100644
--- a/src/mlpack/tests/load_save_test.cpp
+++ b/src/mlpack/tests/load_save_test.cpp
@@ -1253,4 +1253,37 @@ BOOST_AUTO_TEST_CASE(HarderARFFTest)
   remove("test.arff");
 }
 
+/**
+ * If we pass a bad DatasetInfo, it should throw.
+ */
+BOOST_AUTO_TEST_CASE(BadDatasetInfoARFFTest)
+{
+  fstream f;
+  f.open("test.arff", fstream::out);
+  f << "@relation    \t test" << endl;
+  f << endl;
+  f << endl;
+  f << "@attribute @@@@flfl numeric" << endl;
+  f << endl;
+  f << "\% comment" << endl;
+  f << "@attribute \"hello world\" string" << endl;
+  f << "@attribute 12345 integer" << endl;
+  f << "@attribute real real" << endl;
+  f << "@attribute \"blah blah blah     \t \" numeric \% comment" << endl;
+  f << "\% comment" << endl;
+  f << "@data" << endl;
+  f << "1, one, 3, 4.5, 6" << endl;
+  f << "2, two, 4, 5.5, 7 \% comment" << endl;
+  f << "3, \"three five, six\", 5, 6.5, 8" << endl;
+  f.close();
+
+  arma::mat dataset;
+  DatasetInfo info(6);
+
+  BOOST_REQUIRE_THROW(data::LoadARFF("test.arff", dataset, info),
+      std::invalid_argument);
+
+  remove("test.arff");
+}
+
 BOOST_AUTO_TEST_SUITE_END();



More information about the mlpack-git mailing list