[mlpack-git] master: - Rollback to faster sparse iteration. (50fa931)

Wed Oct 19 17:36:02 EDT 2016

Repository : https://github.com/mlpack/mlpack
On branch  : master
Link       : https://github.com/mlpack/mlpack/compare/94d14187222231ca29e4f6419c5999c660db4f8a...981ffa2d67d8fe38df6c699589005835fef710ea

>---------------------------------------------------------------

commit 50fa9312175a6f44a1f9bb211c8dc80ccb8a0492
Author: theJonan <ivan at jonan.info>
Date:   Thu Oct 20 00:36:02 2016 +0300

    - Rollback to faster sparse iteration.


>---------------------------------------------------------------

50fa9312175a6f44a1f9bb211c8dc80ccb8a0492
 src/mlpack/methods/det/dtree_impl.hpp | 50 +++++++++++------------------------
 1 file changed, 16 insertions(+), 34 deletions(-)

diff --git a/src/mlpack/methods/det/dtree_impl.hpp b/src/mlpack/methods/det/dtree_impl.hpp
index 4c1b9f3..1d2cb76 100644
--- a/src/mlpack/methods/det/dtree_impl.hpp
+++ b/src/mlpack/methods/det/dtree_impl.hpp
@@ -23,12 +23,12 @@ namespace details
    * in a vector, that can easily be iterated afterwards.
    */
   template <typename MatType>
-  std::vector<std::pair<typename MatType::elem_type, size_t>>
-  ExtractSplits(const MatType& data,
-                size_t dim,
-                size_t start,
-                size_t end,
-                size_t minLeafSize)
+  void ExtractSplits(std::vector<std::pair<typename MatType::elem_type, size_t>>& splitVec,
+                     const MatType& data,
+                     size_t dim,
+                     size_t start,
+                     size_t end,
+                     size_t minLeafSize)
   {
     typedef typename MatType::elem_type ElemType;
     typedef std::pair<ElemType, size_t> SplitItem;
@@ -37,9 +37,6 @@ namespace details
     // We sort these, in-place (it's a copy of the data, anyways).
     std::sort(dimVec.begin(), dimVec.end());
     
-    // We're going to collect results here.
-    std::vector<SplitItem>  splitVec;
-    
     // Ensure the minimum leaf size on both sides. We need to figure out why
     // there are spikes if this minLeafSize is enforced here...
     for (size_t i = minLeafSize - 1; i < dimVec.n_elem - minLeafSize; ++i)
@@ -52,37 +49,27 @@ namespace details
       if (split != dimVec[i])
         splitVec.push_back(SplitItem(split, i));
     }
-    
-    return splitVec;
-    
   }
   
   // This is the custom, sparse-optimized implementation of the same routine.
   template <typename ElemType>
-  std::vector<std::pair<ElemType, size_t>>
-  ExtractSplits(const arma::SpMat<ElemType>& data,
-                size_t dim,
-                size_t start,
-                size_t end,
-                size_t minLeafSize)
+  void ExtractSplits(std::vector<std::pair<ElemType, size_t>>& splitVec,
+                     const arma::SpMat<ElemType>& data,
+                     size_t dim,
+                     size_t start,
+                     size_t end,
+                     size_t minLeafSize)
   {
-    typedef typename arma::SpMat<ElemType>::const_row_iterator  RowIterator;
     typedef std::pair<ElemType, size_t> SplitItem;
     const size_t n_elem = end - start;
     
     // Construct a vector of values.
-    std::vector<ElemType> valsVec;
-    valsVec.reserve(n_elem);
-    
-    for (RowIterator j(data, dim, start);j.row() == dim && j.col() < end; ++j)
-      valsVec.push_back(*j);
+    const arma::SpRow<ElemType> row = data(dim, arma::span(start, end - 1));
+    std::vector<ElemType> valsVec(row.begin(), row.end());
     
     // ... and sort it!
     std::sort(valsVec.begin(), valsVec.end());
 
-    // We're going to collect our splits here.
-    std::vector<SplitItem>  splitVec;
-
     // Now iterate over the values, accounting for the jump over the zeroes,
     // and construct the splits vector.
     ElemType lastVal = -std::numeric_limits<ElemType>::max();
@@ -116,8 +103,6 @@ namespace details
       
       lastVal = newVal;
     }
-    
-    return splitVec;
   }
 };
 
@@ -319,11 +304,8 @@ bool DTree<MatType, TagType>::FindSplit(const MatType& data,
     // could be quite inefficient for sparse matrices, due to copy operations (3).
     // This one has custom implementation for dense and sparse matrices.
 
-    std::vector<SplitItem> splitVec = details::ExtractSplits(data,
-                                                             dim,
-                                                             start,
-                                                             end,
-                                                             minLeafSize);
+    std::vector<SplitItem> splitVec;
+    details::ExtractSplits(splitVec, data, dim, start, end, minLeafSize);
     
     // Iterate on all the splits for this dimension
     for (typename std::vector<SplitItem>::iterator i = splitVec.begin();