[mlpack-git] master: modify MapToNumerical to work with MissingPolicy (bace8b2)

gitdub at mlpack.org gitdub at mlpack.org
Mon Jul 25 12:19:10 EDT 2016


Repository : https://github.com/mlpack/mlpack
On branch  : master
Link       : https://github.com/mlpack/mlpack/compare/ecbfd24defe31d9f39708c0b4c6ad352cd46ed5c...7eec0609aa21cb12aeed3cbcaa1e411dad0359f2

>---------------------------------------------------------------

commit bace8b25ba703878a1348782e9e4feb210062a47
Author: Keon Kim <kwk236 at gmail.com>
Date:   Mon Jul 4 09:21:30 2016 +0900

    modify MapToNumerical to work with MissingPolicy


>---------------------------------------------------------------

bace8b25ba703878a1348782e9e4feb210062a47
 src/mlpack/core/data/dataset_info.hpp              |  3 ++
 src/mlpack/core/data/dataset_info_impl.hpp         |  9 ++++--
 src/mlpack/core/data/load_impl.hpp                 | 33 ++++++----------------
 .../core/data/map_policies/missing_policy.hpp      |  5 +---
 .../methods/preprocess/preprocess_imputer_main.cpp |  5 ----
 src/mlpack/tests/imputation_test.cpp               | 16 ++++++-----
 6 files changed, 28 insertions(+), 43 deletions(-)

diff --git a/src/mlpack/core/data/dataset_info.hpp b/src/mlpack/core/data/dataset_info.hpp
index da9f3ce..d87e027 100644
--- a/src/mlpack/core/data/dataset_info.hpp
+++ b/src/mlpack/core/data/dataset_info.hpp
@@ -113,6 +113,9 @@ class DatasetMapper
   //! Modify the policy of the mapper (be careful!).
   PolicyType& Policy();
 
+  //! Modify (Replace) the policy of the mapper with a new policy
+  void Policy(PolicyType& policy);
+
  private:
   //! Types of each dimension.
   std::vector<Datatype> types;
diff --git a/src/mlpack/core/data/dataset_info_impl.hpp b/src/mlpack/core/data/dataset_info_impl.hpp
index 4eed4a9..1c35097 100644
--- a/src/mlpack/core/data/dataset_info_impl.hpp
+++ b/src/mlpack/core/data/dataset_info_impl.hpp
@@ -18,7 +18,6 @@ template<typename PolicyType>
 inline DatasetMapper<PolicyType>::DatasetMapper(const size_t dimensionality) :
     types(dimensionality, Datatype::numeric)
 {
-    Log::Debug << "DatasetMapper(dimensionality)" << std::endl;
   // Nothing to initialize here.
 }
 
@@ -28,7 +27,6 @@ inline DatasetMapper<PolicyType>::DatasetMapper(PolicyType& policy,
     types(dimensionality, Datatype::numeric),
     policy(std::move(policy))
 {
-    Log::Debug << "DatasetMapper(policy, dimensionality)" << std::endl;
   // Nothing to initialize here.
 }
 
@@ -127,6 +125,13 @@ inline PolicyType& DatasetMapper<PolicyType>::Policy()
   return this->policy;
 }
 
+template<typename PolicyType>
+inline void DatasetMapper<PolicyType>::Policy(PolicyType& policy)
+{
+  this->policy = std::move(policy);
+}
+
+
 
 } // namespace data
 } // namespace mlpack
diff --git a/src/mlpack/core/data/load_impl.hpp b/src/mlpack/core/data/load_impl.hpp
index 419b090..f521be4 100644
--- a/src/mlpack/core/data/load_impl.hpp
+++ b/src/mlpack/core/data/load_impl.hpp
@@ -65,34 +65,17 @@ void MapToNumerical(const std::vector<std::string>& tokens,
                     DatasetMapper<PolicyType>& info,
                     arma::Mat<eT>& matrix)
 {
-  auto notNumber = [](const std::string& str)
+  std::stringstream token;
+  for (size_t i = 0; i != tokens.size(); ++i)
   {
-    eT val(0);
-    std::stringstream token;
-    token.str(str);
-    token>>val;
-    return token.fail();
-  };
-
-  const bool notNumeric = std::any_of(std::begin(tokens),
-                                      std::end(tokens), notNumber);
-  if(notNumeric)
-  {
-    for(size_t i = 0; i != tokens.size(); ++i)
+    token.str(tokens[i]);
+    token>>matrix.at(row, i);
+    if (token.fail()) // if not number, map it to datasetmapper
     {
       const eT val = static_cast<eT>(info.MapString(tokens[i], row));
       matrix.at(row, i) = val;
     }
-  }
-  else
-  {
-    std::stringstream token;
-    for(size_t i = 0; i != tokens.size(); ++i)
-    {
-      token.str(tokens[i]);
-      token>>matrix.at(row, i);
-      token.clear();
-    }
+    token.clear();
   }
 }
 
@@ -411,7 +394,7 @@ bool Load(const std::string& filename,
       type = "raw ASCII-formatted data";
 
     Log::Info << "Loading '" << filename << "' as " << type << ".  "
-        << std::endl;
+        << std::flush;
     std::string separators;
     if (commas)
       separators = ",";
@@ -496,7 +479,7 @@ bool Load(const std::string& filename,
   else if (extension == "arff")
   {
     Log::Info << "Loading '" << filename << "' as ARFF dataset.  "
-        << std::endl;
+        << std::flush;
     try
     {
       LoadARFF(filename, matrix, info);
diff --git a/src/mlpack/core/data/map_policies/missing_policy.hpp b/src/mlpack/core/data/map_policies/missing_policy.hpp
index 6c3d1d1..c5df023 100644
--- a/src/mlpack/core/data/map_policies/missing_policy.hpp
+++ b/src/mlpack/core/data/map_policies/missing_policy.hpp
@@ -29,13 +29,12 @@ class MissingPolicy
 
   MissingPolicy()
   {
-    Log::Debug << "MissingPolicy()" << std::endl;
+    // Nothing to initialize here.
   }
 
   explicit MissingPolicy(std::set<std::string> missingSet) :
     missingSet(std::move(missingSet))
   {
-    Log::Debug << "MissingPolicy(missingSet)" << std::endl;
     // Nothing to initialize here.
   }
 
@@ -49,7 +48,6 @@ class MissingPolicy
     // If this condition is true, either we have no mapping for the given string
     // or we have no mappings for the given dimension at all.  In either case,
     // we create a mapping.
-    Log::Debug << "missingSet has: " << missingSet.count(string) << std::endl;
     if (missingSet.count(string) != 0 &&
         (maps.count(dimension) == 0 ||
          maps[dimension].first.left.count(string) == 0))
@@ -64,7 +62,6 @@ class MissingPolicy
     else
     {
       // This string already exists in the mapping.
-      Log::Debug << "string already exists in the mapping" << std::endl;
       return maps[dimension].first.left.at(string);
     }
   }
diff --git a/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp b/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp
index 6857352..a0b0a13 100644
--- a/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp
+++ b/src/mlpack/methods/preprocess/preprocess_imputer_main.cpp
@@ -92,10 +92,8 @@ int main(int argc, char** argv)
   // Policy tells how the DatasetMapper should map the values.
   std::set<std::string> missingSet;
   missingSet.insert(missingValue);
-  Log::Debug << "initalize MissingPolicy(missingSet)" << endl;
   MissingPolicy policy(missingSet);
   using MapperType = DatasetMapper<MissingPolicy>;
-  Log::Debug << "initalize info(policy)" << endl;
   DatasetMapper<MissingPolicy> info(policy);
 
   Load(inputFile, input, info, true, true);
@@ -149,9 +147,6 @@ int main(int argc, char** argv)
     }
   }
 
-  // for testing purpose
-  Log::Info << "output::" << endl;
-  Log::Info << output << endl;
 
   if (!outputFile.empty())
   {
diff --git a/src/mlpack/tests/imputation_test.cpp b/src/mlpack/tests/imputation_test.cpp
index 8746945..c7d6d4c 100644
--- a/src/mlpack/tests/imputation_test.cpp
+++ b/src/mlpack/tests/imputation_test.cpp
@@ -33,7 +33,7 @@ BOOST_AUTO_TEST_CASE(DatasetMapperImputerTest)
   fstream f;
   f.open("test_file.csv", fstream::out);
   f << "a, 2, 3"  << endl;
-  f << "5, 6, 7"  << endl;
+  f << "5, 6, b"  << endl;
   f << "8, 9, 10" << endl;
   f.close();
 
@@ -43,6 +43,7 @@ BOOST_AUTO_TEST_CASE(DatasetMapperImputerTest)
 
   std::set<string> mset;
   mset.insert("a");
+  mset.insert("b");
   MissingPolicy miss(mset);
   DatasetMapper<MissingPolicy> info(miss);
   BOOST_REQUIRE(data::Load("test_file.csv", input, info) == true);
@@ -56,15 +57,16 @@ BOOST_AUTO_TEST_CASE(DatasetMapperImputerTest)
           DatasetMapper<MissingPolicy>,
           CustomImputation<double>> imputer(info);
   imputer.Impute(input, output, "a", 99, dimension); // convert a -> 99
+  imputer.Impute(input, output, "b", 99, dimension); // convert b -> 99
 
   BOOST_REQUIRE_CLOSE(output(0, 0), 99.0, 1e-5);
-  BOOST_REQUIRE_CLOSE(output(0, 1), 2.0, 1e-5);
-  BOOST_REQUIRE_CLOSE(output(0, 2), 3.0, 1e-5);
-  BOOST_REQUIRE_CLOSE(output(1, 0), 5.0, 1e-5);
+  BOOST_REQUIRE_CLOSE(output(0, 1), 5.0, 1e-5);
+  BOOST_REQUIRE_CLOSE(output(0, 2), 8.0, 1e-5);
+  BOOST_REQUIRE_CLOSE(output(1, 0), 2.0, 1e-5);
   BOOST_REQUIRE_CLOSE(output(1, 1), 6.0, 1e-5);
-  BOOST_REQUIRE_CLOSE(output(1, 2), 7.0, 1e-5);
-  BOOST_REQUIRE_CLOSE(output(2, 0), 8.0, 1e-5);
-  BOOST_REQUIRE_CLOSE(output(2, 1), 9.0, 1e-5);
+  BOOST_REQUIRE_CLOSE(output(1, 2), 9.0, 1e-5);
+  BOOST_REQUIRE_CLOSE(output(2, 0), 3.0, 1e-5);
+  BOOST_REQUIRE_CLOSE(output(2, 1), 99.0, 1e-5);
   BOOST_REQUIRE_CLOSE(output(2, 2), 10.0, 1e-5);
 
   // Remove the file.




More information about the mlpack-git mailing list