[mlpack-git] master: Count lines by hand; and trim whitespace from tokens. (da12bda)

gitdub at big.cc.gt.atl.ga.us gitdub at big.cc.gt.atl.ga.us
Wed Dec 23 11:41:45 EST 2015


Repository : https://github.com/mlpack/mlpack

On branch  : master
Link       : https://github.com/mlpack/mlpack/compare/de9cc4b05069e1fa4793d9355f2f595af5ff45d2...6070527af14296cd99739de6c62666cc5d2a2125

>---------------------------------------------------------------

commit da12bdabe41e2cf154635e8680c0fabf7d8b7a10
Author: ryan <ryan at ratml.org>
Date:   Thu Sep 10 17:07:35 2015 -0400

    Count lines by hand; and trim whitespace from tokens.
    
    The std::count() idea for counting lines is elegant, but it stumbles when
    presented with files that don't end with a newline.  So instead, just count
    the lines with getline(), which works fine.


>---------------------------------------------------------------

da12bdabe41e2cf154635e8680c0fabf7d8b7a10
 src/mlpack/core/data/load_impl.hpp | 28 +++++++++-------------------
 1 file changed, 9 insertions(+), 19 deletions(-)

diff --git a/src/mlpack/core/data/load_impl.hpp b/src/mlpack/core/data/load_impl.hpp
index 85badd4..4f953b5 100644
--- a/src/mlpack/core/data/load_impl.hpp
+++ b/src/mlpack/core/data/load_impl.hpp
@@ -19,6 +19,7 @@
 #include <boost/archive/text_iarchive.hpp>
 #include <boost/archive/binary_iarchive.hpp>
 #include <boost/tokenizer.hpp>
+#include <boost/algorithm/string.hpp>
 
 #include "serialization_shim.hpp"
 
@@ -330,21 +331,11 @@ bool Load(const std::string& filename,
     // Now count the number of lines in the file.  We've already counted the
     // first one.
     size_t rows = 1;
-    stream.unsetf(std::ios_base::skipws);
-    rows += std::count(std::istream_iterator<char>(stream),
-        std::istream_iterator<char>(), '\n');
-
-    // Back up to see if the last character in the file is an empty line.
-    stream.unget();
-    std::cout << "last character is " << int(stream.peek()) << ".\n";
-    while (isspace(stream.peek()))
+    while (!stream.eof() && !stream.bad() && !stream.fail())
     {
-      if (stream.peek() == '\n')
-      {
-        --rows;
-        break;
-      }
-      stream.unget();
+      std::getline(stream, buffer, '\n');
+      if (!stream.fail())
+        ++rows;
     }
 
     // Now we have the size.  So resize our matrix.
@@ -380,7 +371,6 @@ bool Load(const std::string& filename,
 
         if (token.fail())
         {
-          std::cout << "conversion failed\n";
           // Conversion failed; but it may be a NaN or inf.  Armadillo has
           // convenient functions to check.
           if (!arma::diskio::convert_naninf(val, token.str()))
@@ -418,7 +408,10 @@ bool Load(const std::string& filename,
               }
             }
 
-            val = info.MapString(token.str(), dim);
+            // Strip whitespace from either side of the string.
+            std::string trimmedToken(token.str());
+            boost::trim(trimmedToken);
+            val = info.MapString(trimmedToken, dim);
           }
         }
 
@@ -432,9 +425,6 @@ bool Load(const std::string& filename,
 
       ++row;
     }
-
-    if (stream.bad() || stream.fail())
-      Log::Warn << "Failure reading file '" << filename << "'." << std::endl;
   }
   else
   {



More information about the mlpack-git mailing list