[mlpack-git] master: Handle non-new DatasetInfo objects better. (06eeee1)
gitdub at big.cc.gt.atl.ga.us
gitdub at big.cc.gt.atl.ga.us
Wed Dec 23 11:46:26 EST 2015
Repository : https://github.com/mlpack/mlpack
On branch : master
Link : https://github.com/mlpack/mlpack/compare/de9cc4b05069e1fa4793d9355f2f595af5ff45d2...6070527af14296cd99739de6c62666cc5d2a2125
>---------------------------------------------------------------
commit 06eeee1ea7155813c654cd9bc7c853a0d7e2ebfc
Author: Ryan Curtin <ryan at ratml.org>
Date: Wed Nov 18 15:26:32 2015 -0800
Handle non-new DatasetInfo objects better.
>---------------------------------------------------------------
06eeee1ea7155813c654cd9bc7c853a0d7e2ebfc
src/mlpack/core/data/load_arff.hpp | 17 +++++++++++++++++
src/mlpack/core/data/load_arff_impl.hpp | 15 ++++++++++++++-
src/mlpack/tests/load_save_test.cpp | 33 +++++++++++++++++++++++++++++++++
3 files changed, 64 insertions(+), 1 deletion(-)
diff --git a/src/mlpack/core/data/load_arff.hpp b/src/mlpack/core/data/load_arff.hpp
index 5401e4d..f11bda0 100644
--- a/src/mlpack/core/data/load_arff.hpp
+++ b/src/mlpack/core/data/load_arff.hpp
@@ -24,6 +24,23 @@ void LoadARFF(const std::string& filename, arma::Mat<eT>& matrix);
* A utility function to load an ARFF dataset as numeric and categorical
* features, using the DatasetInfo structure for mapping. An exception will be
* thrown upon failure.
+ *
+ * A pre-existing DatasetInfo object can be passed in, but if the dimensionality
+ * of the given DatasetInfo object (info.Dimensionality()) does not match the
+ * dimensionality of the data, a std::invalid_argument exception will be thrown.
+ * If an empty DatasetInfo object is given (constructed with the default
+ * constructor or otherwise, so that info.Dimensionality() is 0), it will be set
+ * to the right dimensionality.
+ *
+ * This ability to pass in pre-existing DatasetInfo objects is very necessary
+ * when, e.g., loading a test set after training. If the same DatasetInfo from
+ * loading the training set is not used, then the test set may be loaded with
+ * different mappings---which can cause horrible problems!
+ *
+ * @param filename Name of ARFF file to load.
+ * @param matrix Matrix to load data into.
+ * @param info DatasetInfo object; can be default-constructed or pre-existing
+ * from another call to LoadARFF().
*/
template<typename eT>
void LoadARFF(const std::string& filename,
diff --git a/src/mlpack/core/data/load_arff_impl.hpp b/src/mlpack/core/data/load_arff_impl.hpp
index 1e6dc73..d870c84 100644
--- a/src/mlpack/core/data/load_arff_impl.hpp
+++ b/src/mlpack/core/data/load_arff_impl.hpp
@@ -93,7 +93,20 @@ void LoadARFF(const std::string& filename,
if (ifs.eof())
throw std::runtime_error("no @data section found");
- info = DatasetInfo(dimensionality);
+ // Reset the DatasetInfo object, if needed.
+ if (info.Dimensionality() == 0)
+ {
+ info = DatasetInfo(dimensionality);
+ }
+ else if (info.Dimensionality() != dimensionality)
+ {
+ std::ostringstream oss;
+ oss << "data::LoadARFF(): given DatasetInfo has dimensionality "
+ << info.Dimensionality() << ", but data has dimensionality "
+ << dimensionality;
+ throw std::invalid_argument(oss.str());
+ }
+
for (size_t i = 0; i < types.size(); ++i)
{
if (types[i])
diff --git a/src/mlpack/tests/load_save_test.cpp b/src/mlpack/tests/load_save_test.cpp
index b533204..f0662e3 100644
--- a/src/mlpack/tests/load_save_test.cpp
+++ b/src/mlpack/tests/load_save_test.cpp
@@ -1253,4 +1253,37 @@ BOOST_AUTO_TEST_CASE(HarderARFFTest)
remove("test.arff");
}
+/**
+ * If we pass a bad DatasetInfo, it should throw.
+ */
+BOOST_AUTO_TEST_CASE(BadDatasetInfoARFFTest)
+{
+ fstream f;
+ f.open("test.arff", fstream::out);
+ f << "@relation \t test" << endl;
+ f << endl;
+ f << endl;
+ f << "@attribute @@@@flfl numeric" << endl;
+ f << endl;
+ f << "\% comment" << endl;
+ f << "@attribute \"hello world\" string" << endl;
+ f << "@attribute 12345 integer" << endl;
+ f << "@attribute real real" << endl;
+ f << "@attribute \"blah blah blah \t \" numeric \% comment" << endl;
+ f << "\% comment" << endl;
+ f << "@data" << endl;
+ f << "1, one, 3, 4.5, 6" << endl;
+ f << "2, two, 4, 5.5, 7 \% comment" << endl;
+ f << "3, \"three five, six\", 5, 6.5, 8" << endl;
+ f.close();
+
+ arma::mat dataset;
+ DatasetInfo info(6);
+
+ BOOST_REQUIRE_THROW(data::LoadARFF("test.arff", dataset, info),
+ std::invalid_argument);
+
+ remove("test.arff");
+}
+
BOOST_AUTO_TEST_SUITE_END();
More information about the mlpack-git mailing list