[mlpack-git] master: - All DET sparsification works. (4083784)

gitdub at mlpack.org gitdub at mlpack.org
Tue Oct 18 04:54:50 EDT 2016


Repository : https://github.com/mlpack/mlpack
On branch  : master
Link       : https://github.com/mlpack/mlpack/compare/94d14187222231ca29e4f6419c5999c660db4f8a...981ffa2d67d8fe38df6c699589005835fef710ea

>---------------------------------------------------------------

commit 4083784eb8b7ac54fe182aa4565f7f2f25e19cde
Author: theJonan <ivan at jonan.info>
Date:   Tue Oct 18 11:54:50 2016 +0300

    - All DET sparsification works.


>---------------------------------------------------------------

4083784eb8b7ac54fe182aa4565f7f2f25e19cde
 src/mlpack/methods/det/dt_utils_impl.hpp |  29 ++++---
 src/mlpack/methods/det/dtree.hpp         |   2 +-
 src/mlpack/methods/det/dtree_impl.hpp    |   9 +-
 src/mlpack/tests/det_test.cpp            | 137 +++++++++++++++++++++++++++++++
 4 files changed, 156 insertions(+), 21 deletions(-)

diff --git a/src/mlpack/methods/det/dt_utils_impl.hpp b/src/mlpack/methods/det/dt_utils_impl.hpp
index cad5289..fcb6b0c 100644
--- a/src/mlpack/methods/det/dt_utils_impl.hpp
+++ b/src/mlpack/methods/det/dt_utils_impl.hpp
@@ -99,11 +99,11 @@ void mlpack::det::PrintVariableImportance(const DTree<MatType, TagType>* dtree,
 // folds.
 template <typename MatType, typename TagType>
 DTree<MatType, TagType>* mlpack::det::Trainer(MatType& dataset,
-                                                       const size_t folds,
-                                                       const bool useVolumeReg,
-                                                       const size_t maxLeafSize,
-                                                       const size_t minLeafSize,
-                                                       const std::string unprunedTreeOutput)
+                                              const size_t folds,
+                                              const bool useVolumeReg,
+                                              const size_t maxLeafSize,
+                                              const size_t minLeafSize,
+                                              const std::string unprunedTreeOutput)
 {
   // Initialize the tree.
   DTree<MatType, TagType> dtree(dataset);
@@ -170,7 +170,7 @@ DTree<MatType, TagType>* mlpack::det::Trainer(MatType& dataset,
       << " " << oldAlpha << "." << std::endl;
 
   MatType cvData(dataset);
-  size_t testSize = dataset.n_cols / folds;
+  const size_t testSize = dataset.n_cols / folds;
 
   arma::vec regularizationConstants(prunedSequence.size());
   regularizationConstants.fill(0.0);
@@ -181,17 +181,17 @@ DTree<MatType, TagType>* mlpack::det::Trainer(MatType& dataset,
   // implementation.
 #ifdef _WIN32
   #pragma omp parallel for default(none) \
-      shared(testSize, cvData, prunedSequence, regularizationConstants, dataset)
+      shared(testSize, cvData, prunedSequence, regularizationConstants)
   for (intmax_t fold = 0; fold < (intmax_t) folds; fold++)
 #else
   #pragma omp parallel for default(none) \
-      shared(testSize, cvData, prunedSequence, regularizationConstants, dataset)
+      shared(testSize, cvData, prunedSequence, regularizationConstants)
   for (size_t fold = 0; fold < folds; fold++)
 #endif
   {
     // Break up data into train and test sets.
-    size_t start = fold * testSize;
-    size_t end = std::min((size_t) (fold + 1) * testSize, (size_t) cvData.n_cols);
+    const size_t start = fold * testSize;
+    const size_t end = std::min((size_t) (fold + 1) * testSize, (size_t) cvData.n_cols);
 
     MatType test = cvData.cols(start, end - 1);
     MatType train(cvData.n_rows, cvData.n_cols - test.n_cols);
@@ -239,11 +239,10 @@ DTree<MatType, TagType>* mlpack::det::Trainer(MatType& dataset,
       }
 
       // Update the cv regularization constant.
-      cvRegularizationConstants[i] += 2.0 * cvVal / (double) dataset.n_cols;
+      cvRegularizationConstants[i] += 2.0 * cvVal / (double) cvData.n_cols;
 
       // Determine the new alpha value and prune accordingly.
-      double cvOldAlpha = 0.5 * (prunedSequence[i + 1].first +
-          prunedSequence[i + 2].first);
+      double cvOldAlpha = 0.5 * (prunedSequence[i + 1].first + prunedSequence[i + 2].first);
       cvDTree.PruneAndUpdate(cvOldAlpha, train.n_cols, useVolumeReg);
     }
 
@@ -256,9 +255,9 @@ DTree<MatType, TagType>* mlpack::det::Trainer(MatType& dataset,
     }
 
     if (prunedSequence.size() > 2)
-      cvRegularizationConstants[prunedSequence.size() - 2] += 2.0 * cvVal / (double) dataset.n_cols;
+      cvRegularizationConstants[prunedSequence.size() - 2] += 2.0 * cvVal / (double) cvData.n_cols;
 
-    #pragma omp critical
+    #pragma omp critical (DTreeCVUpdate)
     regularizationConstants += cvRegularizationConstants;
   }
   Timer::Stop("cross_validation");
diff --git a/src/mlpack/methods/det/dtree.hpp b/src/mlpack/methods/det/dtree.hpp
index 70c8cc5..a4ec63a 100644
--- a/src/mlpack/methods/det/dtree.hpp
+++ b/src/mlpack/methods/det/dtree.hpp
@@ -46,7 +46,7 @@ class DTree
    */
   typedef typename MatType::elem_type     ElemType;
   typedef typename MatType::vec_type      VecType;
-  typedef typename arma::Col<ElemType>  StatType;
+  typedef typename arma::Col<ElemType>    StatType;
   
   /**
    * Create an empty density estimation tree.
diff --git a/src/mlpack/methods/det/dtree_impl.hpp b/src/mlpack/methods/det/dtree_impl.hpp
index ad54039..be64f5c 100644
--- a/src/mlpack/methods/det/dtree_impl.hpp
+++ b/src/mlpack/methods/det/dtree_impl.hpp
@@ -146,8 +146,8 @@ template <typename MatType, typename TagType>
 DTree<MatType, TagType>::DTree(MatType & data) :
     start(0),
     end(data.n_cols),
-    minVals(arma::min(data, 1)),
     maxVals(arma::max(data, 1)),
+    minVals(arma::min(data, 1)),
     splitDim(size_t(-1)),
     splitValue(std::numeric_limits<ElemType>::max()),
     subtreeLeavesLogNegError(-DBL_MAX),
@@ -264,11 +264,11 @@ bool DTree<MatType, TagType>::FindSplit(const MatType& data,
 
   // Loop through each dimension.
 #ifdef _WIN32
-  #pragma omp parallel for default(shared) \
+  #pragma omp parallel for default(none) \
     shared(splitValue, splitDim, data)
   for (intmax_t dim = 0; dim < (intmax_t) maxVals.n_elem; ++dim)
 #else
-  #pragma omp parallel for default(shared) \
+  #pragma omp parallel for default(none) \
     shared(splitValue, splitDim, data)
   for (size_t dim = 0; dim < maxVals.n_elem; ++dim)
 #endif
@@ -341,10 +341,9 @@ bool DTree<MatType, TagType>::FindSplit(const MatType& data,
 
     double actualMinDimError = std::log(minDimError) - 2 * std::log((double) data.n_cols) - volumeWithoutDim;
 
-#pragma omp atomic
+#pragma omp critical (DTreeFindUpdate)
     if ((actualMinDimError > minError) && dimSplitFound)
     {
-#pragma omp critical DTreeFindUpdate
       {
         // Calculate actual error (in logspace) by adding terms back to our
         // estimate.
diff --git a/src/mlpack/tests/det_test.cpp b/src/mlpack/tests/det_test.cpp
index 3365984..09b9691 100644
--- a/src/mlpack/tests/det_test.cpp
+++ b/src/mlpack/tests/det_test.cpp
@@ -141,6 +141,70 @@ BOOST_AUTO_TEST_CASE(TestSplitData)
   BOOST_REQUIRE_EQUAL(oTest[3], 2);
   BOOST_REQUIRE_EQUAL(oTest[4], 5);
 }
+
+BOOST_AUTO_TEST_CASE(TestSparseFindSplit)
+{
+  arma::mat realData(4,7);
+  
+  realData << .0 << 4 << 5 << 7 << 0 << 5 << 0 << arma::endr
+           << .0 << 5 << 0 << 0 << 1 << 7 << 1 << arma::endr
+           << .0 << 5 << 6 << 7 << 1 << 0 << 8 << arma::endr
+           << -1 << 2 << 5 << 0 << 0 << 0 << 0 << arma::endr;
+  
+  arma::sp_mat testData(realData);
+  
+  DTree<arma::sp_mat> testDTree(testData);
+  
+  size_t obDim, trueDim;
+  double trueLeftError, obLeftError, trueRightError, obRightError, obSplit, trueSplit;
+  
+  trueDim = 1;
+  trueSplit = .5;
+  trueLeftError = 2 * log(3.0 / 7.0) - (log(7.0) + log(0.5) + log(8.0) + log(6.0));
+  trueRightError = 2 * log(4.0 / 7.0) - (log(7.0) + log(6.5) + log(8.0) + log(6.0));
+  
+  testDTree.logVolume = log(7.0) + log(7.0) + log(8.0) + log(6.0);
+  BOOST_REQUIRE(testDTree.FindSplit(testData, obDim, obSplit, obLeftError, obRightError, 1));
+  
+  BOOST_REQUIRE(trueDim == obDim);
+  BOOST_REQUIRE_CLOSE(trueSplit, obSplit, 1e-10);
+  
+  BOOST_REQUIRE_CLOSE(trueLeftError, obLeftError, 1e-10);
+  BOOST_REQUIRE_CLOSE(trueRightError, obRightError, 1e-10);
+}
+
+BOOST_AUTO_TEST_CASE(TestSparseSplitData)
+{
+  arma::mat realData(4,7);
+  
+  realData << .0 << 4 << 5 << 7 << 0 << 5 << 0 << arma::endr
+           << .0 << 5 << 0 << 0 << 1 << 7 << 1 << arma::endr
+           << .0 << 5 << 6 << 7 << 1 << 0 << 8 << arma::endr
+           << -1 << 2 << 5 << 0 << 0 << 0 << 0 << arma::endr;
+  
+  arma::sp_mat testData(realData);
+  
+  DTree<arma::sp_mat> testDTree(testData);
+  
+  arma::Col<size_t> oTest(7);
+  oTest << 1 << 2 << 3 << 4 << 5 << 6 << 7;
+  
+  size_t splitDim = 1;
+  double trueSplitVal = .5;
+  
+  size_t splitInd = testDTree.SplitData(testData, splitDim, trueSplitVal, oTest);
+  
+  BOOST_REQUIRE_EQUAL(splitInd, 3); // 3 points on left side.
+  
+  BOOST_REQUIRE_EQUAL(oTest[0], 1);
+  BOOST_REQUIRE_EQUAL(oTest[1], 4);
+  BOOST_REQUIRE_EQUAL(oTest[2], 3);
+  BOOST_REQUIRE_EQUAL(oTest[3], 2);
+  BOOST_REQUIRE_EQUAL(oTest[4], 5);
+  BOOST_REQUIRE_EQUAL(oTest[5], 6);
+  BOOST_REQUIRE_EQUAL(oTest[6], 7);
+}
+
 #endif
 
 // Tests for the public functions.
@@ -307,6 +371,79 @@ BOOST_AUTO_TEST_CASE(TestVariableImportance)
   BOOST_REQUIRE_CLOSE((double) (rootError - (lError + rError)), imps[2], 1e-10);
 }
 
+BOOST_AUTO_TEST_CASE(TestSparsePruneAndUpdate)
+{
+  arma::mat realData(3, 5);
+  
+  realData << 4 << 5 << 7 << 3 << 5 << arma::endr
+           << 5 << 0 << 1 << 7 << 1 << arma::endr
+           << 5 << 6 << 7 << 1 << 8 << arma::endr;
+  
+  arma::sp_mat testData(realData);
+  
+  arma::Col<size_t> oTest(5);
+  oTest << 0 << 1 << 2 << 3 << 4;
+  
+  DTree<arma::sp_mat> testDTree(testData);
+  double alpha = testDTree.Grow(testData, oTest, false, 2, 1);
+  alpha = testDTree.PruneAndUpdate(alpha, testData.n_cols, false);
+  
+  BOOST_REQUIRE_CLOSE(alpha, numeric_limits<double>::max(), 1e-10);
+  BOOST_REQUIRE(testDTree.SubtreeLeaves() == 1);
+  
+  double rootError = -log(4.0) - log(7.0) - log(7.0);
+  
+  BOOST_REQUIRE_CLOSE(testDTree.LogNegError(), rootError, 1e-10);
+  BOOST_REQUIRE_CLOSE(testDTree.SubtreeLeavesLogNegError(), rootError, 1e-10);
+  BOOST_REQUIRE(testDTree.Left() == NULL);
+  BOOST_REQUIRE(testDTree.Right() == NULL);
+}
+
+BOOST_AUTO_TEST_CASE(TestSparseComputeValue)
+{
+  arma::mat realData(3, 5);
+  
+  Log::Info << "OMP threads: " << omp_get_thread_num() << std::endl;
+  
+  realData << 4 << 5 << 7 << 3 << 5 << arma::endr
+           << 5 << 0 << 1 << 7 << 1 << arma::endr
+           << 5 << 6 << 7 << 1 << 8 << arma::endr;
+  
+  arma::vec _q1(3), _q2(3), _q3(3), _q4(3);
+  
+  _q1 << 4 << 2 << 2;
+  _q2 << 5 << 0.25 << 6;
+  _q3 << 5 << 3 << 7;
+  _q4 << 2 << 3 << 3;
+  
+  arma::sp_mat testData(realData);
+  arma::sp_vec q1(_q1), q2(_q2), q3(_q3), q4(_q4);
+  
+  arma::Col<size_t> oTest(5);
+  oTest << 0 << 1 << 2 << 3 << 4;
+  
+  DTree<arma::sp_mat> testDTree(testData);
+  double alpha = testDTree.Grow(testData, oTest, false, 2, 1);
+  
+  double d1 = (2.0 / 5.0) / exp(log(4.0) + log(7.0) + log(4.5));
+  double d2 = (1.0 / 5.0) / exp(log(4.0) + log(0.5) + log(2.5));
+  double d3 = (2.0 / 5.0) / exp(log(4.0) + log(6.5) + log(2.5));
+  
+  BOOST_REQUIRE_CLOSE(d1, testDTree.ComputeValue(q1), 1e-10);
+  BOOST_REQUIRE_CLOSE(d2, testDTree.ComputeValue(q2), 1e-10);
+  BOOST_REQUIRE_CLOSE(d3, testDTree.ComputeValue(q3), 1e-10);
+  BOOST_REQUIRE_CLOSE(0.0, testDTree.ComputeValue(q4), 1e-10);
+  
+  alpha = testDTree.PruneAndUpdate(alpha, testData.n_cols, false);
+  
+  double d = 1.0 / exp(log(4.0) + log(7.0) + log(7.0));
+  
+  BOOST_REQUIRE_CLOSE(d, testDTree.ComputeValue(q1), 1e-10);
+  BOOST_REQUIRE_CLOSE(d, testDTree.ComputeValue(q2), 1e-10);
+  BOOST_REQUIRE_CLOSE(d, testDTree.ComputeValue(q3), 1e-10);
+  BOOST_REQUIRE_CLOSE(0.0, testDTree.ComputeValue(q4), 1e-10);
+}
+
 /**
  * These are not yet implemented.
  *




More information about the mlpack-git mailing list