[mlpack-git] master: update data::load to accept different policies (de0b2db)
gitdub at mlpack.org
gitdub at mlpack.org
Mon Jul 25 12:18:48 EDT 2016
Repository : https://github.com/mlpack/mlpack
On branch : master
Link : https://github.com/mlpack/mlpack/compare/ecbfd24defe31d9f39708c0b4c6ad352cd46ed5c...7eec0609aa21cb12aeed3cbcaa1e411dad0359f2
>---------------------------------------------------------------
commit de0b2dbba2422296d801e7aa60dc2ed06091ae1a
Author: Keon Kim <kwk236 at gmail.com>
Date: Fri Jul 1 23:47:31 2016 +0900
update data::load to accept different policies
>---------------------------------------------------------------
de0b2dbba2422296d801e7aa60dc2ed06091ae1a
src/mlpack/core/data/dataset_info.hpp | 14 +-
src/mlpack/core/data/dataset_info_impl.hpp | 44 ++---
src/mlpack/core/data/imputer.hpp | 1 +
src/mlpack/core/data/load.hpp | 12 +-
src/mlpack/core/data/load_arff.hpp | 4 +-
src/mlpack/core/data/load_arff_impl.hpp | 6 +-
src/mlpack/core/data/load_impl.hpp | 182 ++++++++++++++++++++-
.../core/data/map_policies/missing_policy.hpp | 23 ++-
.../methods/preprocess/preprocess_imputer_main.cpp | 18 +-
src/mlpack/tests/imputation_test.cpp | 8 +-
10 files changed, 250 insertions(+), 62 deletions(-)
diff --git a/src/mlpack/core/data/dataset_info.hpp b/src/mlpack/core/data/dataset_info.hpp
index bfd5b70..eaa6882 100644
--- a/src/mlpack/core/data/dataset_info.hpp
+++ b/src/mlpack/core/data/dataset_info.hpp
@@ -24,9 +24,9 @@ namespace data {
* (Datatype::numeric or Datatype::categorical) as well as mappings from strings
* to unsigned integers and vice versa.
*
- * @tparam MapPolicy Mapping policy used to specify MapString();
+ * @tparam PolicyType Mapping policy used to specify MapString();
*/
-template <typename MapPolicy>
+template <typename PolicyType>
class DatasetMapper
{
public:
@@ -37,7 +37,7 @@ class DatasetMapper
*/
DatasetMapper(const size_t dimensionality = 0);
- DatasetMapper(MapPolicy policy, const size_t dimensionality = 0);
+ DatasetMapper(PolicyType& policy, const size_t dimensionality = 0);
/**
* Given the string and the dimension to which it belongs, return its numeric
* mapping. If no mapping yet exists, the string is added to the list of
@@ -47,7 +47,7 @@ class DatasetMapper
* @param string String to find/create mapping for.
* @param dimension Index of the dimension of the string.
*/
- typename MapPolicy::mapped_type MapString(const std::string& string,
+ typename PolicyType::mapped_type MapString(const std::string& string,
const size_t dimension);
/**
@@ -69,7 +69,7 @@ class DatasetMapper
* @param string Mapped string for value.
* @param dimension Dimension to unmap string from.
*/
- typename MapPolicy::mapped_type UnmapValue(const std::string& string,
+ typename PolicyType::mapped_type UnmapValue(const std::string& string,
const size_t dimension);
//! Return the type of a given dimension (numeric or categorical).
@@ -106,7 +106,7 @@ class DatasetMapper
std::vector<Datatype> types;
// BiMapType definition
- using BiMapType = boost::bimap<std::string, typename MapPolicy::mapped_type>;
+ using BiMapType = boost::bimap<std::string, typename PolicyType::mapped_type>;
// Mappings from strings to integers.
// Map entries will only exist for dimensions that are categorical.
@@ -114,7 +114,7 @@ class DatasetMapper
MapType maps;
- MapPolicy policy;
+ PolicyType policy;
};
// Use typedef to provide backward compatibility
diff --git a/src/mlpack/core/data/dataset_info_impl.hpp b/src/mlpack/core/data/dataset_info_impl.hpp
index de543ab..93e2a13 100644
--- a/src/mlpack/core/data/dataset_info_impl.hpp
+++ b/src/mlpack/core/data/dataset_info_impl.hpp
@@ -2,7 +2,7 @@
* @file dataset_info_impl.hpp
* @author Ryan Curtin
*
- * An implementation of the DatasetMapper<MapPolicy> class.
+ * An implementation of the DatasetMapper<PolicyType> class.
*/
#ifndef MLPACK_CORE_DATA_DATASET_INFO_IMPL_HPP
#define MLPACK_CORE_DATA_DATASET_INFO_IMPL_HPP
@@ -14,26 +14,28 @@ namespace mlpack {
namespace data {
// Default constructor.
-template<typename MapPolicy>
-inline DatasetMapper<MapPolicy>::DatasetMapper(const size_t dimensionality) :
+template<typename PolicyType>
+inline DatasetMapper<PolicyType>::DatasetMapper(const size_t dimensionality) :
types(dimensionality, Datatype::numeric)
{
+ Log::Debug << "DatasetMapper(dimensionality)" << std::endl;
// Nothing to initialize here.
}
-template<typename MapPolicy>
-inline DatasetMapper<MapPolicy>::DatasetMapper(MapPolicy policy,
+template<typename PolicyType>
+inline DatasetMapper<PolicyType>::DatasetMapper(PolicyType& policy,
const size_t dimensionality) :
types(dimensionality, Datatype::numeric),
policy(std::move(policy))
{
+ Log::Debug << "DatasetMapper(policy, dimensionality)" << std::endl;
// Nothing to initialize here.
}
// When we want to insert value into the map,
// we could use the policy to map the string
-template<typename MapPolicy>
-inline typename MapPolicy::mapped_type DatasetMapper<MapPolicy>::MapString(
+template<typename PolicyType>
+inline typename PolicyType::mapped_type DatasetMapper<PolicyType>::MapString(
const std::string& string,
const size_t dimension)
{
@@ -41,8 +43,8 @@ inline typename MapPolicy::mapped_type DatasetMapper<MapPolicy>::MapString(
}
// Return the string corresponding to a value in a given dimension.
-template<typename MapPolicy>
-inline const std::string& DatasetMapper<MapPolicy>::UnmapString(
+template<typename PolicyType>
+inline const std::string& DatasetMapper<PolicyType>::UnmapString(
const size_t value,
const size_t dimension)
{
@@ -50,7 +52,7 @@ inline const std::string& DatasetMapper<MapPolicy>::UnmapString(
if (maps[dimension].first.right.count(value) == 0)
{
std::ostringstream oss;
- oss << "DatasetMapper<MapPolicy>::UnmapString(): value '" << value
+ oss << "DatasetMapper<PolicyType>::UnmapString(): value '" << value
<< "' unknown for dimension " << dimension;
throw std::invalid_argument(oss.str());
}
@@ -59,8 +61,8 @@ inline const std::string& DatasetMapper<MapPolicy>::UnmapString(
}
// Return the value corresponding to a string in a given dimension.
-template<typename MapPolicy>
-inline typename MapPolicy::mapped_type DatasetMapper<MapPolicy>::UnmapValue(
+template<typename PolicyType>
+inline typename PolicyType::mapped_type DatasetMapper<PolicyType>::UnmapValue(
const std::string& string,
const size_t dimension)
{
@@ -68,7 +70,7 @@ inline typename MapPolicy::mapped_type DatasetMapper<MapPolicy>::UnmapValue(
if (maps[dimension].first.left.count(string) == 0)
{
std::ostringstream oss;
- oss << "DatasetMapper<MapPolicy>::UnmapValue(): string '" << string
+ oss << "DatasetMapper<PolicyType>::UnmapValue(): string '" << string
<< "' unknown for dimension " << dimension;
throw std::invalid_argument(oss.str());
}
@@ -77,8 +79,8 @@ inline typename MapPolicy::mapped_type DatasetMapper<MapPolicy>::UnmapValue(
}
// Get the type of a particular dimension.
-template<typename MapPolicy>
-inline Datatype DatasetMapper<MapPolicy>::Type(const size_t dimension) const
+template<typename PolicyType>
+inline Datatype DatasetMapper<PolicyType>::Type(const size_t dimension) const
{
if (dimension >= types.size())
{
@@ -91,8 +93,8 @@ inline Datatype DatasetMapper<MapPolicy>::Type(const size_t dimension) const
return types[dimension];
}
-template<typename MapPolicy>
-inline Datatype& DatasetMapper<MapPolicy>::Type(const size_t dimension)
+template<typename PolicyType>
+inline Datatype& DatasetMapper<PolicyType>::Type(const size_t dimension)
{
if (dimension >= types.size())
types.resize(dimension + 1, Datatype::numeric);
@@ -100,15 +102,15 @@ inline Datatype& DatasetMapper<MapPolicy>::Type(const size_t dimension)
return types[dimension];
}
-template<typename MapPolicy>
+template<typename PolicyType>
inline
-size_t DatasetMapper<MapPolicy>::NumMappings(const size_t dimension) const
+size_t DatasetMapper<PolicyType>::NumMappings(const size_t dimension) const
{
return (maps.count(dimension) == 0) ? 0 : maps.at(dimension).second;
}
-template<typename MapPolicy>
-inline size_t DatasetMapper<MapPolicy>::Dimensionality() const
+template<typename PolicyType>
+inline size_t DatasetMapper<PolicyType>::Dimensionality() const
{
return types.size();
}
diff --git a/src/mlpack/core/data/imputer.hpp b/src/mlpack/core/data/imputer.hpp
index b7f4bee..4596639 100644
--- a/src/mlpack/core/data/imputer.hpp
+++ b/src/mlpack/core/data/imputer.hpp
@@ -9,6 +9,7 @@
#define MLPACK_CORE_DATA_IMPUTER_HPP
#include <mlpack/core.hpp>
+#include "dataset_info.hpp"
namespace mlpack {
namespace data {
diff --git a/src/mlpack/core/data/load.hpp b/src/mlpack/core/data/load.hpp
index b2009d8..476c3ad 100644
--- a/src/mlpack/core/data/load.hpp
+++ b/src/mlpack/core/data/load.hpp
@@ -91,10 +91,18 @@ bool Load(const std::string& filename,
* @param transpose If true, transpose the matrix after loading.
* @return Boolean value indicating success or failure of load.
*/
-template<typename eT, typename MapperType>
+template<typename eT, typename PolicyType>
bool Load(const std::string& filename,
arma::Mat<eT>& matrix,
- MapperType& info,
+ DatasetMapper<PolicyType>& info,
+ const bool fatal = false,
+ const bool transpose = true);
+
+template<typename eT, typename PolicyType>
+bool Load(const std::string& filename,
+ arma::Mat<eT>& matrix,
+ DatasetMapper<PolicyType>& info,
+ PolicyType& policy,
const bool fatal = false,
const bool transpose = true);
diff --git a/src/mlpack/core/data/load_arff.hpp b/src/mlpack/core/data/load_arff.hpp
index 60579ca..ff6c431 100644
--- a/src/mlpack/core/data/load_arff.hpp
+++ b/src/mlpack/core/data/load_arff.hpp
@@ -42,10 +42,10 @@ void LoadARFF(const std::string& filename, arma::Mat<eT>& matrix);
* @param info DatasetInfo object; can be default-constructed or pre-existing
* from another call to LoadARFF().
*/
-template<typename eT, typename MapperType>
+template<typename eT, typename PolicyType>
void LoadARFF(const std::string& filename,
arma::Mat<eT>& matrix,
- MapperType& info);
+ DatasetMapper<PolicyType>& info);
} // namespace data
} // namespace mlpack
diff --git a/src/mlpack/core/data/load_arff_impl.hpp b/src/mlpack/core/data/load_arff_impl.hpp
index edb9057..71ccea6 100644
--- a/src/mlpack/core/data/load_arff_impl.hpp
+++ b/src/mlpack/core/data/load_arff_impl.hpp
@@ -15,10 +15,10 @@
namespace mlpack {
namespace data {
-template<typename eT, typename MapperType>
+template<typename eT, typename PolicyType>
void LoadARFF(const std::string& filename,
arma::Mat<eT>& matrix,
- MapperType& info)
+ DatasetMapper<PolicyType>& info)
{
// First, open the file.
std::ifstream ifs;
@@ -98,7 +98,7 @@ void LoadARFF(const std::string& filename,
// Reset the DatasetInfo object, if needed.
if (info.Dimensionality() == 0)
{
- info = MapperType(dimensionality);
+ info = DatasetMapper<PolicyType>(dimensionality);
}
else if (info.Dimensionality() != dimensionality)
{
diff --git a/src/mlpack/core/data/load_impl.hpp b/src/mlpack/core/data/load_impl.hpp
index 8349f4c..f1e7651 100644
--- a/src/mlpack/core/data/load_impl.hpp
+++ b/src/mlpack/core/data/load_impl.hpp
@@ -59,10 +59,10 @@ void TransPoseTokens(std::vector<std::vector<std::string>> const &input,
}
}
-template<typename eT, typename MapperType>
+template<typename eT, typename PolicyType>
void MapToNumerical(const std::vector<std::string>& tokens,
size_t& row,
- MapperType& info,
+ DatasetMapper<PolicyType>& info,
arma::Mat<eT>& matrix)
{
auto notNumber = [](const std::string& str)
@@ -370,10 +370,180 @@ bool Load(const std::string& filename,
}
// Load with mappings. Unfortunately we have to implement this ourselves.
-template<typename eT, typename MapperType>
+template<typename eT, typename PolicyType>
bool Load(const std::string& filename,
arma::Mat<eT>& matrix,
- MapperType& info,
+ DatasetMapper<PolicyType>& info,
+ PolicyType& policy,
+ const bool fatal,
+ const bool transpose)
+{
+ // Get the extension and load as necessary.
+ Timer::Start("loading_data");
+ Log::Debug << "Load with Policy" << std::endl;
+ // Get the extension.
+ std::string extension = Extension(filename);
+
+ // Catch nonexistent files by opening the stream ourselves.
+ std::fstream stream;
+ stream.open(filename.c_str(), std::fstream::in);
+
+ if (!stream.is_open())
+ {
+ Timer::Stop("loading_data");
+ if (fatal)
+ Log::Fatal << "Cannot open file '" << filename << "'. " << std::endl;
+ else
+ Log::Warn << "Cannot open file '" << filename << "'; load failed."
+ << std::endl;
+
+ return false;
+ }
+
+ if (extension == "csv" || extension == "tsv" || extension == "txt")
+ {
+ // True if we're looking for commas; if false, we're looking for spaces.
+ bool commas = (extension == "csv");
+
+ std::string type;
+ if (extension == "csv")
+ type = "CSV data";
+ else
+ type = "raw ASCII-formatted data";
+
+ Log::Info << "Loading '" << filename << "' as " << type << ". "
+ << std::flush;
+ std::string separators;
+ if (commas)
+ separators = ",";
+ else
+ separators = " \t";
+
+ // We'll load this as CSV (or CSV with spaces or tabs) according to
+ // RFC4180. So the first thing to do is determine the size of the matrix.
+ std::string buffer;
+ size_t cols = 0;
+
+ std::getline(stream, buffer, '\n');
+ // Count commas and whitespace in the line, ignoring anything inside
+ // quotes.
+ typedef boost::tokenizer<boost::escaped_list_separator<char>> Tokenizer;
+ boost::escaped_list_separator<char> sep("\\", separators, "\"");
+ Tokenizer tok(buffer, sep);
+ for (Tokenizer::iterator i = tok.begin(); i != tok.end(); ++i)
+ ++cols;
+
+ // Now count the number of lines in the file. We've already counted the
+ // first one.
+ size_t rows = 1;
+ while (!stream.eof() && !stream.bad() && !stream.fail())
+ {
+ std::getline(stream, buffer, '\n');
+ if (!stream.fail())
+ ++rows;
+ }
+
+ // Now we have the size. So resize our matrix.
+ if (transpose)
+ {
+ matrix.set_size(cols, rows);
+ Log::Debug << "initialize datasetmapper with policy" << std::endl;
+ info = DatasetMapper<PolicyType>(policy, cols);
+ }
+ else
+ {
+ matrix.set_size(rows, cols);
+ Log::Debug << "initialize datasetmapper with policy" << std::endl;
+ info = DatasetMapper<PolicyType>(policy, rows);
+ }
+
+ stream.close();
+ stream.open(filename, std::fstream::in);
+
+ if(transpose)
+ {
+ std::vector<std::vector<std::string>> tokensArray;
+ std::vector<std::string> tokens;
+ while (!stream.bad() && !stream.fail() && !stream.eof())
+ {
+ // Extract line by line.
+ std::getline(stream, buffer, '\n');
+ Tokenizer lineTok(buffer, sep);
+ tokens = details::ToTokens(lineTok);
+ if(tokens.size() == cols)
+ {
+ tokensArray.emplace_back(std::move(tokens));
+ }
+ }
+ for(size_t i = 0; i != cols; ++i)
+ {
+ details::TransPoseTokens(tokensArray, tokens, i);
+ details::MapToNumerical(tokens, i,
+ info, matrix);
+ }
+ }
+ else
+ {
+ size_t row = 0;
+ while (!stream.bad() && !stream.fail() && !stream.eof())
+ {
+ // Extract line by line.
+ std::getline(stream, buffer, '\n');
+ Tokenizer lineTok(buffer, sep);
+ details::MapToNumerical(details::ToTokens(lineTok), row,
+ info, matrix);
+ ++row;
+ }
+ }
+ }
+ else if (extension == "arff")
+ {
+ Log::Info << "Loading '" << filename << "' as ARFF dataset. "
+ << std::flush;
+ try
+ {
+ LoadARFF(filename, matrix, info);
+
+ // We transpose by default. So, un-transpose if necessary...
+ if (!transpose)
+ inplace_transpose(matrix);
+ }
+ catch (std::exception& e)
+ {
+ if (fatal)
+ Log::Fatal << e.what() << std::endl;
+ else
+ Log::Warn << e.what() << std::endl;
+ }
+ }
+ else
+ {
+ // The type is unknown.
+ Timer::Stop("loading_data");
+ if (fatal)
+ Log::Fatal << "Unable to detect type of '" << filename << "'; "
+ << "incorrect extension?" << std::endl;
+ else
+ Log::Warn << "Unable to detect type of '" << filename << "'; load failed."
+ << " Incorrect extension?" << std::endl;
+
+ return false;
+ }
+
+ Log::Info << "Size is " << (transpose ? matrix.n_cols : matrix.n_rows)
+ << " x " << (transpose ? matrix.n_rows : matrix.n_cols) << ".\n";
+
+ Timer::Stop("loading_data");
+
+ return true;
+}
+
+
+// Load with mappings. Unfortunately we have to implement this ourselves.
+template<typename eT, typename PolicyType>
+bool Load(const std::string& filename,
+ arma::Mat<eT>& matrix,
+ DatasetMapper<PolicyType>& info,
const bool fatal,
const bool transpose)
{
@@ -446,12 +616,12 @@ bool Load(const std::string& filename,
if (transpose)
{
matrix.set_size(cols, rows);
- info = MapperType(cols);
+ info = DatasetMapper<PolicyType>(cols);
}
else
{
matrix.set_size(rows, cols);
- info = MapperType(rows);
+ info = DatasetMapper<PolicyType>(rows);
}
stream.close();
diff --git a/src/mlpack/core/data/map_policies/missing_policy.hpp b/src/mlpack/core/data/map_policies/missing_policy.hpp
index 4cc8a96..2611e17 100644
--- a/src/mlpack/core/data/map_policies/missing_policy.hpp
+++ b/src/mlpack/core/data/map_policies/missing_policy.hpp
@@ -26,11 +26,18 @@ class MissingPolicy
public:
typedef size_t mapped_type;
- //explicit MissingPolicy(std::set<std::string> specificString) :
- //specificString(std::move(specificString))
- //{
- //// Nothing to initialize here.
- //}
+ MissingPolicy()
+ {
+ Log::Debug << "MissingPolicy()" << std::endl;
+ missingSet.insert("a");
+ }
+
+ explicit MissingPolicy(std::set<std::string> missingSet) :
+ missingSet(std::move(missingSet))
+ {
+ Log::Debug << "MissingPolicy()" << std::endl;
+ // Nothing to initialize here.
+ }
template <typename MapType>
@@ -42,11 +49,11 @@ class MissingPolicy
// If this condition is true, either we have no mapping for the given string
// or we have no mappings for the given dimension at all. In either case,
// we create a mapping.
- if (//specificString.count(string) != 0 &&
+ if (missingSet.count(string) != 0 &&
maps.count(dimension) == 0 ||
maps[dimension].first.left.count(string) == 0)
{
- // This string does not exist yet.
+ // This string does not exist yet.
size_t& numMappings = maps[dimension].second;
typedef boost::bimap<std::string, size_t>::value_type PairType;
@@ -60,7 +67,7 @@ class MissingPolicy
}
}
private:
- //std::set<std::string> specificString;
+ std::set<std::string> missingSet;
}; // class MissingPolicy
} // namespace data
diff --git a/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp b/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp
index 6a290b9..687e78e 100644
--- a/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp
+++ b/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp
@@ -84,10 +84,12 @@ int main(int argc, char** argv)
// DatasetInfo holds how the DatasetMapper should map the values.
// can be specified by passing map_policy classes as template parameters
// ex) DatasetMapper<IncrementPolicy> info;
- using MapperType = DatasetMapper<MissingPolicy>;
- MapperType info;
+ std::set<std::string> missingSet;
+ missingSet.insert(missingValue);
+ MissingPolicy policy(missingSet);
+ DatasetMapper<MissingPolicy> info(policy);
- Load<double, MapperType>(inputFile, input, info, true, true);
+ Load<double, MissingPolicy>(inputFile, input, info, policy, true, true);
// for testing purpose
Log::Info << input << endl;
@@ -109,7 +111,7 @@ int main(int argc, char** argv)
{
Log::Info << "Replacing all '" << missingValue << "' with '" << customValue
<< "'." << endl;
- Imputer<double, MapperType, CustomImputation<double>> impu(info);
+ Imputer<double, MissingPolicy, CustomImputation<double>> impu(info);
impu.Impute(input, output, missingValue, customValue, dimension);
}
else
@@ -119,17 +121,17 @@ int main(int argc, char** argv)
if (strategy == "mean")
{
- Imputer<double, MapperType, MeanImputation<double>> impu(info);
+ Imputer<double, MissingPolicy, MeanImputation<double>> impu(info);
impu.Impute(input, output, missingValue, dimension);
}
else if (strategy == "median")
{
- Imputer<double, MapperType, MedianImputation<double>> impu(info);
+ Imputer<double, MissingPolicy, MedianImputation<double>> impu(info);
impu.Impute(input, output, missingValue, dimension);
}
else if (strategy == "listwise")
{
- Imputer<double, MapperType, ListwiseDeletion<double>> impu(info);
+ Imputer<double, MissingPolicy, ListwiseDeletion<double>> impu(info);
impu.Impute(input, output, missingValue, dimension);
}
else
@@ -139,8 +141,6 @@ int main(int argc, char** argv)
}
// for testing purpose
- Log::Info << "input::" << endl;
- Log::Info << input << endl;
Log::Info << "output::" << endl;
Log::Info << output << endl;
diff --git a/src/mlpack/tests/imputation_test.cpp b/src/mlpack/tests/imputation_test.cpp
index ed776c7..8c759bd 100644
--- a/src/mlpack/tests/imputation_test.cpp
+++ b/src/mlpack/tests/imputation_test.cpp
@@ -51,10 +51,10 @@ BOOST_AUTO_TEST_CASE(DatasetMapperImputerTest)
/* TODO: Connect Load with the new DatasetMapper instead of DatasetInfo*/
- Imputer<double,
- DatasetInfo,
- CustomImputation<double>> impu(info);
- impu.Impute(input, output, missingValue, customValue, feature);
+ //Imputer<double,
+ //DatasetInfo,
+ //CustomImputation<double>> impu(info);
+ //impu.Impute(input, output, missingValue, customValue, feature);
// Remove the file.
remove("test_file.csv");
}
More information about the mlpack-git
mailing list