[mlpack-git] master: - First successfull SpMat build. (3547fff)

Tue Nov 1 15:22:50 EDT 2016

Repository : https://github.com/mlpack/mlpack
On branch  : master
Link       : https://github.com/mlpack/mlpack/compare/94d14187222231ca29e4f6419c5999c660db4f8a...981ffa2d67d8fe38df6c699589005835fef710ea

>---------------------------------------------------------------

commit 3547fffc27f1f0a38b7813e902c4143de6b8a5f8
Author: theJonan <ivan at jonan.info>
Date:   Mon Oct 17 17:03:18 2016 +0300

    - First successfull SpMat build.


>---------------------------------------------------------------

3547fffc27f1f0a38b7813e902c4143de6b8a5f8
 src/mlpack/methods/det/dtree_impl.hpp | 104 ++++++++++++++++++++--------------
 1 file changed, 62 insertions(+), 42 deletions(-)

diff --git a/src/mlpack/methods/det/dtree_impl.hpp b/src/mlpack/methods/det/dtree_impl.hpp
index b683ffd..16092b8 100644
--- a/src/mlpack/methods/det/dtree_impl.hpp
+++ b/src/mlpack/methods/det/dtree_impl.hpp
@@ -8,13 +8,52 @@
  */
 #include "dtree.hpp"
 #include <stack>
-#include <vector>
 #include <algorithm>
+#include <vector>
 #include <mlpack/core/tree/perform_split.hpp>
 
 using namespace mlpack;
 using namespace det;
 
+namespace arma
+{
+  template <typename ElemType>
+  SpMat<ElemType> sort(const SpMat<ElemType>& data)
+  {
+    typedef std::vector<ElemType> ElemVectorType;
+    
+    // Construct the vector of values / locations...
+    ElemVectorType indexVec(data.begin(), data.end());
+    
+    // ... and sort it!
+    std::sort(indexVec.begin(), indexVec.end());
+    
+    // Now prepare the structures for the batch construction of the
+    // sorted sparse row.
+    arma::umat locations(2, data.n_nonzero);
+    arma::Col<ElemType> vals(data.n_nonzero);
+    ElemType lastVal = -std::numeric_limits<ElemType>::max();
+    size_t padding = 0;
+    
+    for (size_t ii = 0; ii < indexVec.size(); ++ii)
+    {
+      const ElemType newVal = indexVec[ii];
+      if (lastVal < ElemType(0) && newVal > ElemType(0))
+      {
+        assert(padding == 0); // we should arrive here once!
+        padding = data.n_elem - data.n_nonzero;
+      }
+      
+      locations.at(0, ii) = (ii + padding) % data.n_rows;
+      locations.at(1, ii) = (ii + padding) / data.n_rows;
+      vals.at(ii) = lastVal = newVal;
+    }
+    
+    return SpMat<ElemType>(locations, vals, data.n_rows, data.n_cols, false, false);
+  };
+  
+};
+
 namespace detail
 {
   template <typename ElemType>
@@ -57,44 +96,21 @@ namespace detail
     return dimVec;
   }
   
-//  template <typename ElemType>
-//  std::vector<SortedItem<ElemType> > ExtractSortedRow(const arma::SpMat<ElemType>& data,
-//                                                      size_t dim,
-//                                                      size_t start,
-//                                                      size_t end,
-//                                                      size_t padding)
-//  {
-//    typedef SortedItem<ElemType>        SortedType;
-//    
-//    assert(padding > 0);
-//    assert(start < end);
-//    
-//    arma::SpRow<ElemType> dimVec = data(dim, arma::span(start, end - 1));
-//    typename arma::SpRow<ElemType>::iterator dimVecEnd = dimVec.end();
-//    
-//    // Build the vector to be sorted with values in ascending order.
-//    std::vector<SortedType> sortedDim = std::vector<SortedType>();
-//    sortedDim.reserve(dimVec.n_elem / 2);
-//    
-//    // Prepare these for the iteration.
-//    ElemType lastVal = 0;
-//    --padding;
-//    
-//    // Iterate over the row and put only different values, also skipping
-//    // `padding` number of elements from both sides of the row.
-//    for (typename MatType::row_col_iterator di = dimVec.begin_row_col(); di != dimVecEnd; ++di)
-//    {
-//      if (di.col() < padding || di.col() >= dimVec.n_elem - padding)
-//        continue;
-//      //      if (*di == lastVal && sortedDim.size() > 0)
-//      //        continue;
-//      
-//      sortedDim.push_back(lastVal = *di);
-//    }
-//    
-//    std::sort(sortedDim.begin(), sortedDim.end());
-//    return sortedDim;
-//  }
+  /**
+   * We need a custom implementation for sparse matrix, in order to save sorting of
+   * all these zeroes.
+   */
+  template <typename ElemType>
+  arma::SpRow<ElemType> ExtractSortedRow(const arma::SpMat<ElemType>& data,
+                                         size_t dim,
+                                         size_t start,
+                                         size_t end)
+  {
+    assert(start < end);
+    
+    arma::SpRow<ElemType> dimVec = data(dim, arma::span(start, end - 1));
+    return arma::sort(dimVec);
+  }
   
 };
 
@@ -314,20 +330,24 @@ bool DTree<MatType, TagType>::FindSplit(const MatType& data,
     typename MatType::row_col_iterator dimVecEnd = dimVec.end_row_col();
     typename MatType::row_col_iterator dI = dimVec.begin_row_col();
 
-    // Find the best split for this dimension.  We need to figure out why
-    // there are spikes if this minLeafSize is enforced here...
+    // Find the best split for this dimension.
     for (;;)
     {
       const size_t position = dI.col();
       
+      // Ensure the minimum leaf size on both sides. We need to figure out why
+      // there are spikes if this minLeafSize is enforced here...
       if (position >= dimVec.n_cols - minLeafSize)
         break;
       if (position < minLeafSize - 1)
         continue;
       
       ElemType split = *dI;
+
+      // In case of sparse matrices and all present values being negative
+      // this can happen - last many elements being 0, so we need to check.
       if (++dI == dimVecEnd)
-        break; // This means we have same values till the end => No split.
+        break;
       
       // This makes sense for real continuous data.  This kinda corrupts the
       // data and estimation if the data is ordinal.