[mlpack-git] master: modify MapToNumerical to work with MissingPolicy (bace8b2)
gitdub at mlpack.org
gitdub at mlpack.org
Mon Jul 25 12:19:10 EDT 2016
Repository : https://github.com/mlpack/mlpack
On branch : master
Link : https://github.com/mlpack/mlpack/compare/ecbfd24defe31d9f39708c0b4c6ad352cd46ed5c...7eec0609aa21cb12aeed3cbcaa1e411dad0359f2
>---------------------------------------------------------------
commit bace8b25ba703878a1348782e9e4feb210062a47
Author: Keon Kim <kwk236 at gmail.com>
Date: Mon Jul 4 09:21:30 2016 +0900
modify MapToNumerical to work with MissingPolicy
>---------------------------------------------------------------
bace8b25ba703878a1348782e9e4feb210062a47
src/mlpack/core/data/dataset_info.hpp | 3 ++
src/mlpack/core/data/dataset_info_impl.hpp | 9 ++++--
src/mlpack/core/data/load_impl.hpp | 33 ++++++----------------
.../core/data/map_policies/missing_policy.hpp | 5 +---
.../methods/preprocess/preprocess_imputer_main.cpp | 5 ----
src/mlpack/tests/imputation_test.cpp | 16 ++++++-----
6 files changed, 28 insertions(+), 43 deletions(-)
diff --git a/src/mlpack/core/data/dataset_info.hpp b/src/mlpack/core/data/dataset_info.hpp
index da9f3ce..d87e027 100644
--- a/src/mlpack/core/data/dataset_info.hpp
+++ b/src/mlpack/core/data/dataset_info.hpp
@@ -113,6 +113,9 @@ class DatasetMapper
//! Modify the policy of the mapper (be careful!).
PolicyType& Policy();
+ //! Modify (Replace) the policy of the mapper with a new policy
+ void Policy(PolicyType& policy);
+
private:
//! Types of each dimension.
std::vector<Datatype> types;
diff --git a/src/mlpack/core/data/dataset_info_impl.hpp b/src/mlpack/core/data/dataset_info_impl.hpp
index 4eed4a9..1c35097 100644
--- a/src/mlpack/core/data/dataset_info_impl.hpp
+++ b/src/mlpack/core/data/dataset_info_impl.hpp
@@ -18,7 +18,6 @@ template<typename PolicyType>
inline DatasetMapper<PolicyType>::DatasetMapper(const size_t dimensionality) :
types(dimensionality, Datatype::numeric)
{
- Log::Debug << "DatasetMapper(dimensionality)" << std::endl;
// Nothing to initialize here.
}
@@ -28,7 +27,6 @@ inline DatasetMapper<PolicyType>::DatasetMapper(PolicyType& policy,
types(dimensionality, Datatype::numeric),
policy(std::move(policy))
{
- Log::Debug << "DatasetMapper(policy, dimensionality)" << std::endl;
// Nothing to initialize here.
}
@@ -127,6 +125,13 @@ inline PolicyType& DatasetMapper<PolicyType>::Policy()
return this->policy;
}
+template<typename PolicyType>
+inline void DatasetMapper<PolicyType>::Policy(PolicyType& policy)
+{
+ this->policy = std::move(policy);
+}
+
+
} // namespace data
} // namespace mlpack
diff --git a/src/mlpack/core/data/load_impl.hpp b/src/mlpack/core/data/load_impl.hpp
index 419b090..f521be4 100644
--- a/src/mlpack/core/data/load_impl.hpp
+++ b/src/mlpack/core/data/load_impl.hpp
@@ -65,34 +65,17 @@ void MapToNumerical(const std::vector<std::string>& tokens,
DatasetMapper<PolicyType>& info,
arma::Mat<eT>& matrix)
{
- auto notNumber = [](const std::string& str)
+ std::stringstream token;
+ for (size_t i = 0; i != tokens.size(); ++i)
{
- eT val(0);
- std::stringstream token;
- token.str(str);
- token>>val;
- return token.fail();
- };
-
- const bool notNumeric = std::any_of(std::begin(tokens),
- std::end(tokens), notNumber);
- if(notNumeric)
- {
- for(size_t i = 0; i != tokens.size(); ++i)
+ token.str(tokens[i]);
+ token>>matrix.at(row, i);
+ if (token.fail()) // if not number, map it to datasetmapper
{
const eT val = static_cast<eT>(info.MapString(tokens[i], row));
matrix.at(row, i) = val;
}
- }
- else
- {
- std::stringstream token;
- for(size_t i = 0; i != tokens.size(); ++i)
- {
- token.str(tokens[i]);
- token>>matrix.at(row, i);
- token.clear();
- }
+ token.clear();
}
}
@@ -411,7 +394,7 @@ bool Load(const std::string& filename,
type = "raw ASCII-formatted data";
Log::Info << "Loading '" << filename << "' as " << type << ". "
- << std::endl;
+ << std::flush;
std::string separators;
if (commas)
separators = ",";
@@ -496,7 +479,7 @@ bool Load(const std::string& filename,
else if (extension == "arff")
{
Log::Info << "Loading '" << filename << "' as ARFF dataset. "
- << std::endl;
+ << std::flush;
try
{
LoadARFF(filename, matrix, info);
diff --git a/src/mlpack/core/data/map_policies/missing_policy.hpp b/src/mlpack/core/data/map_policies/missing_policy.hpp
index 6c3d1d1..c5df023 100644
--- a/src/mlpack/core/data/map_policies/missing_policy.hpp
+++ b/src/mlpack/core/data/map_policies/missing_policy.hpp
@@ -29,13 +29,12 @@ class MissingPolicy
MissingPolicy()
{
- Log::Debug << "MissingPolicy()" << std::endl;
+ // Nothing to initialize here.
}
explicit MissingPolicy(std::set<std::string> missingSet) :
missingSet(std::move(missingSet))
{
- Log::Debug << "MissingPolicy(missingSet)" << std::endl;
// Nothing to initialize here.
}
@@ -49,7 +48,6 @@ class MissingPolicy
// If this condition is true, either we have no mapping for the given string
// or we have no mappings for the given dimension at all. In either case,
// we create a mapping.
- Log::Debug << "missingSet has: " << missingSet.count(string) << std::endl;
if (missingSet.count(string) != 0 &&
(maps.count(dimension) == 0 ||
maps[dimension].first.left.count(string) == 0))
@@ -64,7 +62,6 @@ class MissingPolicy
else
{
// This string already exists in the mapping.
- Log::Debug << "string already exists in the mapping" << std::endl;
return maps[dimension].first.left.at(string);
}
}
diff --git a/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp b/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp
index 6857352..a0b0a13 100644
--- a/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp
+++ b/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp
@@ -92,10 +92,8 @@ int main(int argc, char** argv)
// Policy tells how the DatasetMapper should map the values.
std::set<std::string> missingSet;
missingSet.insert(missingValue);
- Log::Debug << "initalize MissingPolicy(missingSet)" << endl;
MissingPolicy policy(missingSet);
using MapperType = DatasetMapper<MissingPolicy>;
- Log::Debug << "initalize info(policy)" << endl;
DatasetMapper<MissingPolicy> info(policy);
Load(inputFile, input, info, true, true);
@@ -149,9 +147,6 @@ int main(int argc, char** argv)
}
}
- // for testing purpose
- Log::Info << "output::" << endl;
- Log::Info << output << endl;
if (!outputFile.empty())
{
diff --git a/src/mlpack/tests/imputation_test.cpp b/src/mlpack/tests/imputation_test.cpp
index 8746945..c7d6d4c 100644
--- a/src/mlpack/tests/imputation_test.cpp
+++ b/src/mlpack/tests/imputation_test.cpp
@@ -33,7 +33,7 @@ BOOST_AUTO_TEST_CASE(DatasetMapperImputerTest)
fstream f;
f.open("test_file.csv", fstream::out);
f << "a, 2, 3" << endl;
- f << "5, 6, 7" << endl;
+ f << "5, 6, b" << endl;
f << "8, 9, 10" << endl;
f.close();
@@ -43,6 +43,7 @@ BOOST_AUTO_TEST_CASE(DatasetMapperImputerTest)
std::set<string> mset;
mset.insert("a");
+ mset.insert("b");
MissingPolicy miss(mset);
DatasetMapper<MissingPolicy> info(miss);
BOOST_REQUIRE(data::Load("test_file.csv", input, info) == true);
@@ -56,15 +57,16 @@ BOOST_AUTO_TEST_CASE(DatasetMapperImputerTest)
DatasetMapper<MissingPolicy>,
CustomImputation<double>> imputer(info);
imputer.Impute(input, output, "a", 99, dimension); // convert a -> 99
+ imputer.Impute(input, output, "b", 99, dimension); // convert b -> 99
BOOST_REQUIRE_CLOSE(output(0, 0), 99.0, 1e-5);
- BOOST_REQUIRE_CLOSE(output(0, 1), 2.0, 1e-5);
- BOOST_REQUIRE_CLOSE(output(0, 2), 3.0, 1e-5);
- BOOST_REQUIRE_CLOSE(output(1, 0), 5.0, 1e-5);
+ BOOST_REQUIRE_CLOSE(output(0, 1), 5.0, 1e-5);
+ BOOST_REQUIRE_CLOSE(output(0, 2), 8.0, 1e-5);
+ BOOST_REQUIRE_CLOSE(output(1, 0), 2.0, 1e-5);
BOOST_REQUIRE_CLOSE(output(1, 1), 6.0, 1e-5);
- BOOST_REQUIRE_CLOSE(output(1, 2), 7.0, 1e-5);
- BOOST_REQUIRE_CLOSE(output(2, 0), 8.0, 1e-5);
- BOOST_REQUIRE_CLOSE(output(2, 1), 9.0, 1e-5);
+ BOOST_REQUIRE_CLOSE(output(1, 2), 9.0, 1e-5);
+ BOOST_REQUIRE_CLOSE(output(2, 0), 3.0, 1e-5);
+ BOOST_REQUIRE_CLOSE(output(2, 1), 99.0, 1e-5);
BOOST_REQUIRE_CLOSE(output(2, 2), 10.0, 1e-5);
// Remove the file.
More information about the mlpack-git mailing list