[mlpack-git] master: - Fixes, based on PR's comments. - The fastest sparse-matrix utilization so far. (a60ae8a)

Wed Oct 19 18:07:35 EDT 2016

Repository : https://github.com/mlpack/mlpack
On branch  : master
Link       : https://github.com/mlpack/mlpack/compare/94d14187222231ca29e4f6419c5999c660db4f8a...981ffa2d67d8fe38df6c699589005835fef710ea

>---------------------------------------------------------------

commit a60ae8a4022ce1154c40b4f321bb7cab28a4663e
Author: theJonan <ivan at jonan.info>
Date:   Thu Oct 20 01:07:35 2016 +0300

    - Fixes, based on PR's comments.
    - The fastest sparse-matrix utilization so far.


>---------------------------------------------------------------

a60ae8a4022ce1154c40b4f321bb7cab28a4663e
 src/mlpack/methods/det/CMakeLists.txt    |  4 +++
 src/mlpack/methods/det/dt_utils_impl.hpp | 18 +++++++---
 src/mlpack/methods/det/dtree_impl.hpp    | 62 +++++++++++++++++++-------------
 3 files changed, 55 insertions(+), 29 deletions(-)

diff --git a/src/mlpack/methods/det/CMakeLists.txt b/src/mlpack/methods/det/CMakeLists.txt
index 4dd3bc3..ced69a9 100644
--- a/src/mlpack/methods/det/CMakeLists.txt
+++ b/src/mlpack/methods/det/CMakeLists.txt
@@ -5,6 +5,10 @@ set(SOURCES
   # the DET class
   dtree.hpp
   dtree_impl.hpp
+
+  # Utility files
+  dt_utils.hpp
+  dt_utils_impl.hpp
 )
 
 # add directory name to sources
diff --git a/src/mlpack/methods/det/dt_utils_impl.hpp b/src/mlpack/methods/det/dt_utils_impl.hpp
index 6a798c4..191bce1 100644
--- a/src/mlpack/methods/det/dt_utils_impl.hpp
+++ b/src/mlpack/methods/det/dt_utils_impl.hpp
@@ -158,7 +158,8 @@ DTree<MatType, TagType>* mlpack::det::Trainer(MatType& dataset,
     // Some sanity checks.  It seems that on some datasets, the error does not
     // increase as the tree is pruned but instead stays the same---hence the
     // "<=" in the final assert.
-    Log::Assert((alpha < std::numeric_limits<double>::max()) || (dtree.SubtreeLeaves() == 1));
+    Log::Assert((alpha < std::numeric_limits<double>::max())
+                || (dtree.SubtreeLeaves() == 1));
     Log::Assert(alpha > oldAlpha);
     Log::Assert(dtree.SubtreeLeavesLogNegError() <= treeSeq.second);
   }
@@ -191,7 +192,8 @@ DTree<MatType, TagType>* mlpack::det::Trainer(MatType& dataset,
   {
     // Break up data into train and test sets.
     const size_t start = fold * testSize;
-    const size_t end = std::min((size_t) (fold + 1) * testSize, (size_t) cvData.n_cols);
+    const size_t end = std::min((size_t) (fold + 1)
+                                * testSize, (size_t) cvData.n_cols);
 
     MatType test = cvData.cols(start, end - 1);
     MatType train(cvData.n_rows, cvData.n_cols - test.n_cols);
@@ -242,7 +244,8 @@ DTree<MatType, TagType>* mlpack::det::Trainer(MatType& dataset,
       cvRegularizationConstants[i] += 2.0 * cvVal / (double) cvData.n_cols;
 
       // Determine the new alpha value and prune accordingly.
-      double cvOldAlpha = 0.5 * (prunedSequence[i + 1].first + prunedSequence[i + 2].first);
+      double cvOldAlpha = 0.5 * (prunedSequence[i + 1].first
+                                 + prunedSequence[i + 2].first);
       cvDTree.PruneAndUpdate(cvOldAlpha, train.n_cols, useVolumeReg);
     }
 
@@ -255,7 +258,8 @@ DTree<MatType, TagType>* mlpack::det::Trainer(MatType& dataset,
     }
 
     if (prunedSequence.size() > 2)
-      cvRegularizationConstants[prunedSequence.size() - 2] += 2.0 * cvVal / (double) cvData.n_cols;
+      cvRegularizationConstants[prunedSequence.size() - 2] += 2.0 * cvVal
+        / (double) cvData.n_cols;
 
     #pragma omp critical (DTreeCVUpdate)
     regularizationConstants += cvRegularizationConstants;
@@ -293,7 +297,11 @@ DTree<MatType, TagType>* mlpack::det::Trainer(MatType& dataset,
 
   // Grow the tree.
   oldAlpha = -DBL_MAX;
-  alpha = dtreeOpt->Grow(newDataset, oldFromNew, useVolumeReg, maxLeafSize, minLeafSize);
+  alpha = dtreeOpt->Grow(newDataset,
+                         oldFromNew,
+                         useVolumeReg,
+                         maxLeafSize,
+                         minLeafSize);
 
   // Prune with optimal alpha.
   while ((oldAlpha < optimalAlpha) && (dtreeOpt->SubtreeLeaves() > 1))
diff --git a/src/mlpack/methods/det/dtree_impl.hpp b/src/mlpack/methods/det/dtree_impl.hpp
index 1d2cb76..7c86899 100644
--- a/src/mlpack/methods/det/dtree_impl.hpp
+++ b/src/mlpack/methods/det/dtree_impl.hpp
@@ -23,7 +23,8 @@ namespace details
    * in a vector, that can easily be iterated afterwards.
    */
   template <typename MatType>
-  void ExtractSplits(std::vector<std::pair<typename MatType::elem_type, size_t>>& splitVec,
+  void ExtractSplits(std::vector<
+                      std::pair<typename MatType::elem_type, size_t>>& splitVec,
                      const MatType& data,
                      size_t dim,
                      size_t start,
@@ -90,7 +91,8 @@ namespace details
         lastVal = ElemType(0);
       }
       
-      if (i + padding >= minLeafSize && i + padding <= n_elem - minLeafSize)// the normal case
+      // the normal case
+      if (i + padding >= minLeafSize && i + padding <= n_elem - minLeafSize)
       {
         // This makes sense for real continuous data.  This kinda corrupts the
         // data and estimation if the data is ordinal.
@@ -278,8 +280,6 @@ bool DTree<MatType, TagType>::FindSplit(const MatType& data,
   for (size_t dim = 0; dim < maxVals.n_elem; ++dim)
 #endif
   {
-    // Have to deal with REAL, INTEGER, NOMINAL data differently, so we have to
-    // think of how to do that...
     const ElemType min = minVals[dim];
     const ElemType max = maxVals[dim];
 
@@ -329,8 +329,10 @@ bool DTree<MatType, TagType>::FindSplit(const MatType& data,
         // and because the volume is only dependent on the dimension we are
         // splitting, we can assume V_l is just the range of the left and V_r is
         // just the range of the right.
-        double negLeftError = std::pow(position + 1, 2.0) / (split - min);
-        double negRightError = std::pow(points - position - 1, 2.0) / (max - split);
+        double negLeftError = std::pow(position + 1, 2.0)
+          / (split - min);
+        double negRightError = std::pow(points - position - 1, 2.0)
+          / (max - split);
 
         // If this is better, take it.
         if ((negLeftError + negRightError) >= minDimError)
@@ -344,21 +346,23 @@ bool DTree<MatType, TagType>::FindSplit(const MatType& data,
       }
     }
 
-    double actualMinDimError = std::log(minDimError) - 2 * std::log((double) data.n_cols) - volumeWithoutDim;
+    double actualMinDimError = std::log(minDimError)
+      - 2 * std::log((double) data.n_cols)
+      - volumeWithoutDim;
 
 #pragma omp critical (DTreeFindUpdate)
     if ((actualMinDimError > minError) && dimSplitFound)
     {
-      {
-        // Calculate actual error (in logspace) by adding terms back to our
-        // estimate.
-        minError = actualMinDimError;
-        splitDim = dim;
-        splitValue = dimSplitValue;
-        leftError = std::log(dimLeftError) - 2 * std::log((double) data.n_cols) - volumeWithoutDim;
-        rightError = std::log(dimRightError) - 2 * std::log((double) data.n_cols) - volumeWithoutDim;
-        splitFound = true;
-      }
+      // Calculate actual error (in logspace) by adding terms back to our
+      // estimate.
+      minError = actualMinDimError;
+      splitDim = dim;
+      splitValue = dimSplitValue;
+      leftError = std::log(dimLeftError) - 2 * std::log((double) data.n_cols)
+        - volumeWithoutDim;
+      rightError = std::log(dimRightError) - 2 * std::log((double) data.n_cols)
+        - volumeWithoutDim;
+      splitFound = true;
     } // end if better split found in this dimension.
   }
 
@@ -451,8 +455,10 @@ double DTree<MatType, TagType>::Grow(MatType& data,
       left = new DTree(maxValsL, minValsL, start, splitIndex, leftError);
       right = new DTree(maxValsR, minValsR, splitIndex, end, rightError);
 
-      leftG = left->Grow(data, oldFromNew, useVolReg, maxLeafSize, minLeafSize);
-      rightG = right->Grow(data, oldFromNew, useVolReg, maxLeafSize, minLeafSize);
+      leftG = left->Grow(data, oldFromNew, useVolReg, maxLeafSize,
+                         minLeafSize);
+      rightG = right->Grow(data, oldFromNew, useVolReg, maxLeafSize,
+                           minLeafSize);
 
       // Store values of R(T~) and |T~|.
       subtreeLeaves = left->SubtreeLeaves() + right->SubtreeLeaves();
@@ -517,12 +523,15 @@ double DTree<MatType, TagType>::Grow(MatType& data,
 
     if (right->SubtreeLeaves() > 1)
     {
-      const double exponent = 2 * std::log((double) data.n_cols) + logVolume + right->AlphaUpper();
+      const double exponent = 2 * std::log((double) data.n_cols)
+        + logVolume
+        + right->AlphaUpper();
 
       tmpAlphaSum += std::exp(exponent);
     }
 
-    alphaUpper = std::log(tmpAlphaSum) - 2 * std::log((double) data.n_cols) - logVolume;
+    alphaUpper = std::log(tmpAlphaSum) - 2 * std::log((double) data.n_cols)
+      - logVolume;
 
     double gT;
     if (useVolReg)
@@ -689,7 +698,9 @@ double DTree<MatType, TagType>::ComputeValue(const VecType& query) const
   else
   {
     // Return either of the two children - left or right, depending on the splitValue
-    return (query[splitDim] <= splitValue) ? left->ComputeValue(query) : right->ComputeValue(query);
+    return (query[splitDim] <= splitValue) ?
+      left->ComputeValue(query) :
+      right->ComputeValue(query);
   }
 
   return 0.0;
@@ -725,12 +736,15 @@ TagType DTree<MatType, TagType>::FindBucket(const VecType& query) const
   else
   {
     // Return the tag from either of the two children - left or right.
-    return (query[splitDim] <= splitValue) ? left->FindBucket(query) : right->FindBucket(query);
+    return (query[splitDim] <= splitValue) ?
+      left->FindBucket(query) :
+      right->FindBucket(query);
   }
 }
 
 template <typename MatType, typename TagType>
-void DTree<MatType, TagType>::ComputeVariableImportance(arma::vec& importances) const
+void
+DTree<MatType, TagType>::ComputeVariableImportance(arma::vec& importances) const
 {
   // Clear and set to right size.
   importances.zeros(maxVals.n_elem);