[mlpack-git] master: remove duplicate code in load function (21d94c0)

Mon Jul 25 12:19:06 EDT 2016

Repository : https://github.com/mlpack/mlpack
On branch  : master
Link       : https://github.com/mlpack/mlpack/compare/ecbfd24defe31d9f39708c0b4c6ad352cd46ed5c...7eec0609aa21cb12aeed3cbcaa1e411dad0359f2

>---------------------------------------------------------------

commit 21d94c04652e8faadd5e8991103a3b73b4c81033
Author: Keon Kim <kwk236 at gmail.com>
Date:   Sun Jul 3 03:01:54 2016 +0900

    remove duplicate code in load function


>---------------------------------------------------------------

21d94c04652e8faadd5e8991103a3b73b4c81033
 src/mlpack/core/data/dataset_info.hpp      |   6 +-
 src/mlpack/core/data/dataset_info_impl.hpp |   6 ++
 src/mlpack/core/data/load.hpp              |   6 +-
 src/mlpack/core/data/load_impl.hpp         | 167 -----------------------------
 4 files changed, 15 insertions(+), 170 deletions(-)

diff --git a/src/mlpack/core/data/dataset_info.hpp b/src/mlpack/core/data/dataset_info.hpp
index eaa6882..91e150b 100644
--- a/src/mlpack/core/data/dataset_info.hpp
+++ b/src/mlpack/core/data/dataset_info.hpp
@@ -35,9 +35,9 @@ class DatasetMapper
    * the dimensionality cannot be changed later; you will have to create a new
    * DatasetMapper object.
    */
-  DatasetMapper(const size_t dimensionality = 0);
+  explicit DatasetMapper(const size_t dimensionality = 0);
 
-  DatasetMapper(PolicyType& policy, const size_t dimensionality = 0);
+  explicit DatasetMapper(PolicyType& policy, const size_t dimensionality = 0);
   /**
    * Given the string and the dimension to which it belongs, return its numeric
    * mapping.  If no mapping yet exists, the string is added to the list of
@@ -101,6 +101,8 @@ class DatasetMapper
     ar & data::CreateNVP(maps, "maps");
   }
 
+  PolicyType& Policy() const;
+
  private:
   //! Types of each dimension.
   std::vector<Datatype> types;
diff --git a/src/mlpack/core/data/dataset_info_impl.hpp b/src/mlpack/core/data/dataset_info_impl.hpp
index 93e2a13..c95fa1a 100644
--- a/src/mlpack/core/data/dataset_info_impl.hpp
+++ b/src/mlpack/core/data/dataset_info_impl.hpp
@@ -115,6 +115,12 @@ inline size_t DatasetMapper<PolicyType>::Dimensionality() const
   return types.size();
 }
 
+template<typename PolicyType>
+inline PolicyType& DatasetMapper<PolicyType>::Policy() const
+{
+  return this->policy;
+}
+
 } // namespace data
 } // namespace mlpack
 
diff --git a/src/mlpack/core/data/load.hpp b/src/mlpack/core/data/load.hpp
index da770b4..8694cc2 100644
--- a/src/mlpack/core/data/load.hpp
+++ b/src/mlpack/core/data/load.hpp
@@ -96,7 +96,11 @@ bool Load(const std::string& filename,
           arma::Mat<eT>& matrix,
           DatasetMapper<PolicyType>& info,
           const bool fatal = false,
-          const bool transpose = true);
+          const bool transpose = true)
+{
+  PolicyType policy;
+  return Load(filename, matrix, info, policy, fatal, transpose);
+}
 
 /**
  * Loads a matrix from a file, guessing the filetype from the extension and
diff --git a/src/mlpack/core/data/load_impl.hpp b/src/mlpack/core/data/load_impl.hpp
index 4f03221..c44b77f 100644
--- a/src/mlpack/core/data/load_impl.hpp
+++ b/src/mlpack/core/data/load_impl.hpp
@@ -538,173 +538,6 @@ bool Load(const std::string& filename,
   return true;
 }
 
-
-// Load with mappings.  Unfortunately we have to implement this ourselves.
-template<typename eT, typename PolicyType>
-bool Load(const std::string& filename,
-          arma::Mat<eT>& matrix,
-          DatasetMapper<PolicyType>& info,
-          const bool fatal,
-          const bool transpose)
-{
-  // Get the extension and load as necessary.
-  Timer::Start("loading_data");
-
-  // Get the extension.
-  std::string extension = Extension(filename);
-
-  // Catch nonexistent files by opening the stream ourselves.
-  std::fstream stream;
-  stream.open(filename.c_str(), std::fstream::in);
-
-  if (!stream.is_open())
-  {
-    Timer::Stop("loading_data");
-    if (fatal)
-      Log::Fatal << "Cannot open file '" << filename << "'. " << std::endl;
-    else
-      Log::Warn << "Cannot open file '" << filename << "'; load failed."
-          << std::endl;
-
-    return false;
-  }
-
-  if (extension == "csv" || extension == "tsv" || extension == "txt")
-  {
-    // True if we're looking for commas; if false, we're looking for spaces.
-    bool commas = (extension == "csv");
-
-    std::string type;
-    if (extension == "csv")
-      type = "CSV data";
-    else
-      type = "raw ASCII-formatted data";
-
-    Log::Info << "Loading '" << filename << "' as " << type << ".  "
-        << std::flush;
-    std::string separators;
-    if (commas)
-      separators = ",";
-    else
-      separators = " \t";
-
-    // We'll load this as CSV (or CSV with spaces or tabs) according to
-    // RFC4180.  So the first thing to do is determine the size of the matrix.
-    std::string buffer;
-    size_t cols = 0;
-
-    std::getline(stream, buffer, '\n');
-    // Count commas and whitespace in the line, ignoring anything inside
-    // quotes.
-    typedef boost::tokenizer<boost::escaped_list_separator<char>> Tokenizer;
-    boost::escaped_list_separator<char> sep("\\", separators, "\"");
-    Tokenizer tok(buffer, sep);
-    for (Tokenizer::iterator i = tok.begin(); i != tok.end(); ++i)
-      ++cols;
-
-    // Now count the number of lines in the file.  We've already counted the
-    // first one.
-    size_t rows = 1;
-    while (!stream.eof() && !stream.bad() && !stream.fail())
-    {
-      std::getline(stream, buffer, '\n');
-      if (!stream.fail())
-        ++rows;
-    }
-
-    // Now we have the size.  So resize our matrix.
-    if (transpose)
-    {
-      matrix.set_size(cols, rows);
-      info = DatasetMapper<PolicyType>(cols);
-    }
-    else
-    {
-      matrix.set_size(rows, cols);
-      info = DatasetMapper<PolicyType>(rows);
-    }
-
-    stream.close();
-    stream.open(filename, std::fstream::in);
-
-    if(transpose)
-    {
-      std::vector<std::vector<std::string>> tokensArray;
-      std::vector<std::string> tokens;
-      while (!stream.bad() && !stream.fail() && !stream.eof())
-      {
-        // Extract line by line.
-        std::getline(stream, buffer, '\n');
-        Tokenizer lineTok(buffer, sep);
-        tokens = details::ToTokens(lineTok);
-        if(tokens.size() == cols)
-        {
-          tokensArray.emplace_back(std::move(tokens));
-        }
-      }
-      for(size_t i = 0; i != cols; ++i)
-      {
-        details::TransPoseTokens(tokensArray, tokens, i);
-        details::MapToNumerical(tokens, i,
-                                info, matrix);
-      }
-    }
-    else
-    {
-      size_t row = 0;
-      while (!stream.bad() && !stream.fail() && !stream.eof())
-      {
-        // Extract line by line.
-        std::getline(stream, buffer, '\n');
-        Tokenizer lineTok(buffer, sep);
-        details::MapToNumerical(details::ToTokens(lineTok), row,
-                                info, matrix);
-        ++row;
-      }
-    }
-  }
-  else if (extension == "arff")
-  {
-    Log::Info << "Loading '" << filename << "' as ARFF dataset.  "
-        << std::flush;
-    try
-    {
-      LoadARFF(filename, matrix, info);
-
-      // We transpose by default.  So, un-transpose if necessary...
-      if (!transpose)
-        inplace_transpose(matrix);
-    }
-    catch (std::exception& e)
-    {
-      if (fatal)
-        Log::Fatal << e.what() << std::endl;
-      else
-        Log::Warn << e.what() << std::endl;
-    }
-  }
-  else
-  {
-    // The type is unknown.
-    Timer::Stop("loading_data");
-    if (fatal)
-      Log::Fatal << "Unable to detect type of '" << filename << "'; "
-          << "incorrect extension?" << std::endl;
-    else
-      Log::Warn << "Unable to detect type of '" << filename << "'; load failed."
-          << " Incorrect extension?" << std::endl;
-
-    return false;
-  }
-
-  Log::Info << "Size is " << (transpose ? matrix.n_cols : matrix.n_rows)
-      << " x " << (transpose ? matrix.n_rows : matrix.n_cols) << ".\n";
-
-  Timer::Stop("loading_data");
-
-  return true;
-}
-
 // Load a model from file.
 template<typename T>
 bool Load(const std::string& filename,