[mlpack-git] master: - Sparse DTree finally working. (dc8d9b1)

Tue Oct 18 05:43:36 EDT 2016

Repository : https://github.com/mlpack/mlpack
On branch  : master
Link       : https://github.com/mlpack/mlpack/compare/94d14187222231ca29e4f6419c5999c660db4f8a...981ffa2d67d8fe38df6c699589005835fef710ea

>---------------------------------------------------------------

commit dc8d9b1c7170af2f891538a8585d0494381430c0
Author: theJonan <ivan at jonan.info>
Date:   Mon Oct 17 22:55:41 2016 +0300

    - Sparse DTree finally working.


>---------------------------------------------------------------

dc8d9b1c7170af2f891538a8585d0494381430c0
 src/mlpack/methods/det/dtree.hpp      |  25 ++--
 src/mlpack/methods/det/dtree_impl.hpp | 244 ++++++++++++++--------------------
 2 files changed, 111 insertions(+), 158 deletions(-)

diff --git a/src/mlpack/methods/det/dtree.hpp b/src/mlpack/methods/det/dtree.hpp
index 9a2b175..70c8cc5 100644
--- a/src/mlpack/methods/det/dtree.hpp
+++ b/src/mlpack/methods/det/dtree.hpp
@@ -44,8 +44,9 @@ class DTree
   /**
    * The actual, underlying type we're working with
    */
-  typedef typename MatType::elem_type ElemType;
-  typedef typename MatType::vec_type  VecType;
+  typedef typename MatType::elem_type     ElemType;
+  typedef typename MatType::vec_type      VecType;
+  typedef typename arma::Col<ElemType>  StatType;
   
   /**
    * Create an empty density estimation tree.
@@ -60,8 +61,8 @@ class DTree
    * @param minVals Minimum values of the bounding box.
    * @param totalPoints Total number of points in the dataset.
    */
-  DTree(const VecType& maxVals,
-        const VecType& minVals,
+  DTree(const StatType& maxVals,
+        const StatType& minVals,
         const size_t totalPoints);
 
   /**
@@ -86,8 +87,8 @@ class DTree
    * @param end End of points represented by this node in the data matrix.
    * @param error log-negative error of this node.
    */
-  DTree(const VecType& maxVals,
-        const VecType& minVals,
+  DTree(const StatType& maxVals,
+        const StatType& minVals,
         const size_t start,
         const size_t end,
         const double logNegError);
@@ -103,8 +104,8 @@ class DTree
    * @param start Start of points represented by this node in the data matrix.
    * @param end End of points represented by this node in the data matrix.
    */
-  DTree(const VecType& maxVals,
-        const VecType& minVals,
+  DTree(const StatType& maxVals,
+        const StatType& minVals,
         const size_t totalPoints,
         const size_t start,
         const size_t end);
@@ -199,9 +200,9 @@ class DTree
   size_t end;
 
   //! Upper half of bounding box for this node.
-  VecType maxVals;
+  StatType maxVals;
   //! Lower half of bounding box for this node.
-  VecType minVals;
+  StatType minVals;
 
   //! The splitting dimension for this node.
   size_t splitDim;
@@ -270,10 +271,10 @@ class DTree
   TagType BucketTag() const { return subtreeLeaves == 1 ? bucketTag : -1; }
 
   //! Return the maximum values.
-  const VecType& MaxVals() const { return maxVals; }
+  const StatType& MaxVals() const { return maxVals; }
 
   //! Return the minimum values.
-  const VecType& MinVals() const { return minVals; }
+  const StatType& MinVals() const { return minVals; }
 
   /**
    * Serialize the density estimation tree.
diff --git a/src/mlpack/methods/det/dtree_impl.hpp b/src/mlpack/methods/det/dtree_impl.hpp
index 16092b8..ad54039 100644
--- a/src/mlpack/methods/det/dtree_impl.hpp
+++ b/src/mlpack/methods/det/dtree_impl.hpp
@@ -5,113 +5,98 @@
  * Implementations of some declared functions in
  * the Density Estimation Tree class.
  *
+ * Sparsification and optimizations.
+ * @author Ivan Georgiev (ivan at jonan.info)
+ *
  */
 #include "dtree.hpp"
 #include <stack>
-#include <algorithm>
 #include <vector>
-#include <mlpack/core/tree/perform_split.hpp>
 
 using namespace mlpack;
 using namespace det;
 
-namespace arma
+namespace details
 {
-  template <typename ElemType>
-  SpMat<ElemType> sort(const SpMat<ElemType>& data)
+  /**
+   * This one sorts and scand the given per-dimension extract and puts all splits
+   * in a vector, that can easily be iterated afterwards.
+   */
+  template <typename ElemType, typename RowType>
+  std::vector<std::pair<ElemType, size_t>> ExtractSplits(RowType& row, size_t minLeafSize)
   {
-    typedef std::vector<ElemType> ElemVectorType;
+    typedef std::pair<ElemType, size_t> SplitItem;
+    std::vector<SplitItem>  splitVec;
     
-    // Construct the vector of values / locations...
-    ElemVectorType indexVec(data.begin(), data.end());
+    // We want to do that inplace for optimization purposes;
+    std::sort(row.begin(), row.end());
     
-    // ... and sort it!
-    std::sort(indexVec.begin(), indexVec.end());
-    
-    // Now prepare the structures for the batch construction of the
-    // sorted sparse row.
-    arma::umat locations(2, data.n_nonzero);
-    arma::Col<ElemType> vals(data.n_nonzero);
-    ElemType lastVal = -std::numeric_limits<ElemType>::max();
-    size_t padding = 0;
-    
-    for (size_t ii = 0; ii < indexVec.size(); ++ii)
+    // Ensure the minimum leaf size on both sides. We need to figure out why
+    // there are spikes if this minLeafSize is enforced here...
+    for (size_t i = minLeafSize - 1; i < row.n_elem - minLeafSize; ++i)
     {
-      const ElemType newVal = indexVec[ii];
-      if (lastVal < ElemType(0) && newVal > ElemType(0))
-      {
-        assert(padding == 0); // we should arrive here once!
-        padding = data.n_elem - data.n_nonzero;
-      }
+      // This makes sense for real continuous data.  This kinda corrupts the
+      // data and estimation if the data is ordinal.
+      const ElemType split = (row[i] + row[i + 1]) / 2.0;
       
-      locations.at(0, ii) = (ii + padding) % data.n_rows;
-      locations.at(1, ii) = (ii + padding) / data.n_rows;
-      vals.at(ii) = lastVal = newVal;
+      // Check if we can split here (two points are different)
+      if (split != row[i])
+        splitVec.push_back(SplitItem(split, i));
     }
     
-    return SpMat<ElemType>(locations, vals, data.n_rows, data.n_cols, false, false);
-  };
+    return splitVec;
     
-};
+  }
   
-namespace detail
-{
+  // This the custom, sparse optimized implementation of the same routine.
   template <typename ElemType>
-  class DTreeSplit
+  std::vector<std::pair<ElemType, size_t>> ExtractSplits(arma::SpRow<ElemType>& row, size_t minLeafSize)
   {
-  public:
-    typedef DTreeSplit<ElemType>    SplitInfo;
+    typedef std::pair<ElemType, size_t> SplitItem;
+    std::vector<SplitItem>  splitVec;
     
-    template<typename VecType>
-    static bool AssignToLeftNode(const VecType& point,
-                                 const SplitInfo& splitInfo)
-    {
-      return point[splitInfo.splitDimension] < splitInfo.splitVal;
-    }
+    // Construct a vector of values.
+    std::vector<ElemType> valsVec(row.begin(), row.end());
     
-  private:
-    ElemType    splitVal;
-    size_t      splitDimension;
-  };
+    // ... and sort it!
+    std::sort(valsVec.begin(), valsVec.end());
     
-  /**
-   * Get the values for the dimension and sort them. The old implementation:
-   *   dimVec = data.row(dim).subvec(start, end - 1);
-   *   dimVec = arma::sort(dimVec);
-   * was quite inefficient, due to many (3) vector copy operations. This could be a
-   * problem especially for sparse matrices. That's why they have custom implementation.
-   */
-  template <typename MatType>
-  typename MatType::row_type ExtractSortedRow(const MatType& data,
-                                              size_t dim,
-                                              size_t start,
-                                              size_t end)
-  {
-    typedef typename MatType::elem_type ElemType;
+    // Now iterate over the values, taking account for the over-the-zeroes
+    // jump and construct the splits vector.
+    ElemType lastVal = -std::numeric_limits<ElemType>::max();
+    size_t padding = 0, zeroes = row.n_elem - row.n_nonzero;
 
-    assert(start < end);
+    for (size_t i = 0; i < valsVec.size(); ++i)
+    {
+      const ElemType newVal = valsVec[i];
+      if (lastVal < ElemType(0) && newVal > ElemType(0) && zeroes > 0)
+      {
+        Log::Assert(padding == 0); // we should arrive here once!
+        if (lastVal >= valsVec[0] && // i.e. we're not in the beginning
+            i >= minLeafSize &&
+            i <= row.n_elem - minLeafSize)
+          splitVec.push_back(SplitItem(lastVal / 2.0, i - 1));
+
+        padding = zeroes;
+        lastVal = ElemType(0);
+      }
       
-    typename MatType::row_type dimVec = data(dim, arma::span(start, end - 1));
-    std::sort(dimVec.begin(), dimVec.end());
-    return dimVec;
-  }
+      if (i + padding >= minLeafSize && i + padding <= row.n_elem - minLeafSize)// the normal case
+      {
+        // This makes sense for real continuous data.  This kinda corrupts the
+        // data and estimation if the data is ordinal.
+        const ElemType split = (lastVal + newVal) / 2.0;
         
-  /**
-   * We need a custom implementation for sparse matrix, in order to save sorting of
-   * all these zeroes.
-   */
-  template <typename ElemType>
-  arma::SpRow<ElemType> ExtractSortedRow(const arma::SpMat<ElemType>& data,
-                                         size_t dim,
-                                         size_t start,
-                                         size_t end)
-  {
-    assert(start < end);
+        // Check if we can split here (two points are different)
+        if (split != newVal)
+          splitVec.push_back(SplitItem(split, i + padding - 1));
+      }
       
-    arma::SpRow<ElemType> dimVec = data(dim, arma::span(start, end - 1));
-    return arma::sort(dimVec);
-  }
+      lastVal = newVal;
+    }
     
+    return splitVec;
+  }
 };
 
 template <typename MatType, typename TagType>
@@ -136,8 +121,8 @@ DTree<MatType, TagType>::DTree() :
 // Root node initializers
 
 template <typename MatType, typename TagType>
-DTree<MatType, TagType>::DTree(const VecType& maxVals,
-                               const VecType& minVals,
+DTree<MatType, TagType>::DTree(const StatType& maxVals,
+                               const StatType& minVals,
                                const size_t totalPoints) :
     start(0),
     end(totalPoints),
@@ -161,6 +146,8 @@ template <typename MatType, typename TagType>
 DTree<MatType, TagType>::DTree(MatType & data) :
     start(0),
     end(data.n_cols),
+    minVals(arma::min(data, 1)),
+    maxVals(arma::max(data, 1)),
     splitDim(size_t(-1)),
     splitValue(std::numeric_limits<ElemType>::max()),
     subtreeLeavesLogNegError(-DBL_MAX),
@@ -173,28 +160,13 @@ DTree<MatType, TagType>::DTree(MatType & data) :
     left(NULL),
     right(NULL)
 {
-  maxVals = data.col(0);
-  minVals = data.col(0);
-  
-  typename MatType::row_col_iterator dataEnd = data.end_row_col();
-  
-  // Loop over data to extract maximum and minimum values in each dimension.
-  for (typename MatType::row_col_iterator i = data.begin_row_col(); i != dataEnd; ++i)
-  {
-    size_t j = i.row();
-    if (*i > maxVals[j])
-      maxVals[j] = *i;
-    else if (*i < minVals[j])
-      minVals[j] = *i;
-  }
-
   logNegError = LogNegativeError(data.n_cols);
 }
 
 // Non-root node initializers
 template <typename MatType, typename TagType>
-DTree<MatType, TagType>::DTree(const VecType& maxVals,
-                               const VecType& minVals,
+DTree<MatType, TagType>::DTree(const StatType& maxVals,
+                               const StatType& minVals,
                                const size_t start,
                                const size_t end,
                                const double logNegError) :
@@ -217,8 +189,8 @@ DTree<MatType, TagType>::DTree(const VecType& maxVals,
 { /* Nothing to do. */ }
 
 template <typename MatType, typename TagType>
-DTree<MatType, TagType>::DTree(const VecType& maxVals,
-                               const VecType& minVals,
+DTree<MatType, TagType>::DTree(const StatType& maxVals,
+                               const StatType& minVals,
                                const size_t totalPoints,
                                const size_t start,
                                const size_t end) :
@@ -256,13 +228,12 @@ double DTree<MatType, TagType>::LogNegativeError(const size_t totalPoints) const
   double err = 2 * std::log((double) (end - start)) -
                2 * std::log((double) totalPoints);
 
-  VecType valDiffs = maxVals - minVals;
-  typename VecType::iterator valEnd = valDiffs.end();
-  for (typename VecType::iterator i = valDiffs.begin(); i != valEnd; ++i)
+  StatType valDiffs = maxVals - minVals;
+  for (size_t i = 0;i < valDiffs.n_elem; ++i)
   {
     // Ignore very small dimensions to prevent overflow.
-    if (*i > 1e-50)
-      err -= std::log(*i);
+    if (valDiffs[i] > 1e-50)
+      err -= std::log(valDiffs[i]);
   }
 
   return err;
@@ -279,10 +250,12 @@ bool DTree<MatType, TagType>::FindSplit(const MatType& data,
                                         double& rightError,
                                         const size_t minLeafSize) const
 {
+  typedef std::pair<ElemType, size_t>   SplitItem;
+  
   // Ensure the dimensionality of the data is the same as the dimensionality of
   // the bounding rectangle.
-  assert(data.n_rows == maxVals.n_elem);
-  assert(data.n_rows == minVals.n_elem);
+  Log::Assert(data.n_rows == maxVals.n_elem);
+  Log::Assert(data.n_rows == minVals.n_elem);
 
   const size_t points = end - start;
 
@@ -320,42 +293,22 @@ bool DTree<MatType, TagType>::FindSplit(const MatType& data,
     // Find the log volume of all the other dimensions.
     double volumeWithoutDim = logVolume - std::log(max - min);
 
-    // Get a sorted version of the dimension in interest,
-    // from the given samples range. This is the most expensive step.
-    typename MatType::row_type dimVec = detail::ExtractSortedRow(data,
-                                                                 dim,
-                                                                 start,
-                                                                 end);
+    // Get the values for splitting. The old implementation:
+    //   dimVec = data.row(dim).subvec(start, end - 1);
+    //   dimVec = arma::sort(dimVec);
+    // could be quite inefficient for sparse matrices, due to copy operations (3).
+    // This one has custom implementation for dense and sparse matrices.
 
-    typename MatType::row_col_iterator dimVecEnd = dimVec.end_row_col();
-    typename MatType::row_col_iterator dI = dimVec.begin_row_col();
+    typename MatType::row_type dimVec = data(dim, arma::span(start, end - 1));
+    std::vector<SplitItem> splitVec = details::ExtractSplits<ElemType>(dimVec, minLeafSize);
     
-    // Find the best split for this dimension.
-    for (;;)
+    // Iterate on all the splits for this dimension
+    for (typename std::vector<SplitItem>::iterator i = splitVec.begin();
+         i != splitVec.end();
+         ++i)
     {
-      const size_t position = dI.col();
-      
-      // Ensure the minimum leaf size on both sides. We need to figure out why
-      // there are spikes if this minLeafSize is enforced here...
-      if (position >= dimVec.n_cols - minLeafSize)
-        break;
-      if (position < minLeafSize - 1)
-        continue;
-      
-      ElemType split = *dI;
-
-      // In case of sparse matrices and all present values being negative
-      // this can happen - last many elements being 0, so we need to check.
-      if (++dI == dimVecEnd)
-        break;
-      
-      // This makes sense for real continuous data.  This kinda corrupts the
-      // data and estimation if the data is ordinal.
-      split += *dI;
-      split /= 2.0;
-      
-      if (split == *dI)
-        continue; // We can't split here (two points are the same).
+      const ElemType split = i->first;
+      const size_t position = i->second;
       
       // Another way of picking split is using this:
       //   split = leftsplit;
@@ -475,14 +428,13 @@ double DTree<MatType, TagType>::Grow(MatType& data,
     {
       // Move the data around for the children to have points in a node lie
       // contiguously (to increase efficiency during the training).
-//      const size_t splitIndex = splt::PerformSplit(data, start, end - start, )
       const size_t splitIndex = SplitData(data, dim, splitValueTmp, oldFromNew);
 
       // Make max and min vals for the children.
-      arma::vec maxValsL(maxVals);
-      arma::vec maxValsR(maxVals);
-      arma::vec minValsL(minVals);
-      arma::vec minValsR(minVals);
+      StatType maxValsL(maxVals);
+      StatType maxValsR(maxVals);
+      StatType minValsL(minVals);
+      StatType minValsR(minVals);
 
       maxValsL[dim] = splitValueTmp;
       minValsR[dim] = splitValueTmp;
@@ -524,7 +476,7 @@ double DTree<MatType, TagType>::Grow(MatType& data,
   else
   {
     // We can make this a leaf node.
-    assert((size_t) (end - start) >= minLeafSize);
+    Log::Assert((size_t) (end - start) >= minLeafSize);
     subtreeLeaves = 1;
     subtreeLeavesLogNegError = logNegError;
   }