[mlpack-git] master: Allow large speedups by not requiring split checks every training point. (f03ae5b)

gitdub at big.cc.gt.atl.ga.us gitdub at big.cc.gt.atl.ga.us
Wed Dec 23 11:44:41 EST 2015


Repository : https://github.com/mlpack/mlpack

On branch  : master
Link       : https://github.com/mlpack/mlpack/compare/de9cc4b05069e1fa4793d9355f2f595af5ff45d2...6070527af14296cd99739de6c62666cc5d2a2125

>---------------------------------------------------------------

commit f03ae5b69184b0fab7037481c88bcc8ab243a753
Author: Ryan Curtin <ryan at ratml.org>
Date:   Sun Oct 18 07:26:04 2015 -0400

    Allow large speedups by not requiring split checks every training point.


>---------------------------------------------------------------

f03ae5b69184b0fab7037481c88bcc8ab243a753
 src/mlpack/methods/hoeffding_trees/hoeffding_split.hpp           | 2 ++
 src/mlpack/methods/hoeffding_trees/hoeffding_split_impl.hpp      | 9 ++++++++-
 src/mlpack/methods/hoeffding_trees/streaming_decision_tree.hpp   | 6 +++++-
 .../methods/hoeffding_trees/streaming_decision_tree_impl.hpp     | 9 ++++++---
 4 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/src/mlpack/methods/hoeffding_trees/hoeffding_split.hpp b/src/mlpack/methods/hoeffding_trees/hoeffding_split.hpp
index 2f062fa..86ae07d 100644
--- a/src/mlpack/methods/hoeffding_trees/hoeffding_split.hpp
+++ b/src/mlpack/methods/hoeffding_trees/hoeffding_split.hpp
@@ -30,6 +30,7 @@ class HoeffdingSplit
                  const data::DatasetInfo& datasetInfo,
                  const double successProbability,
                  const size_t maxSamples,
+                 const size_t checkInterval,
                  std::unordered_map<size_t, std::pair<size_t, size_t>>*
                      dimensionMappings = NULL);
 
@@ -81,6 +82,7 @@ class HoeffdingSplit
   size_t numSamples;
   size_t numClasses;
   size_t maxSamples;
+  size_t checkInterval;
   data::DatasetInfo* datasetInfo;
   double successProbability;
 
diff --git a/src/mlpack/methods/hoeffding_trees/hoeffding_split_impl.hpp b/src/mlpack/methods/hoeffding_trees/hoeffding_split_impl.hpp
index 5f1e29c..c109ed1 100644
--- a/src/mlpack/methods/hoeffding_trees/hoeffding_split_impl.hpp
+++ b/src/mlpack/methods/hoeffding_trees/hoeffding_split_impl.hpp
@@ -22,6 +22,7 @@ HoeffdingSplit<
                   const data::DatasetInfo& datasetInfo,
                   const double successProbability,
                   const size_t maxSamples,
+                  const size_t checkInterval,
                   std::unordered_map<size_t, std::pair<size_t, size_t>>*
                       dimensionMappingsIn) :
     dimensionMappings((dimensionMappingsIn != NULL) ? dimensionMappingsIn :
@@ -30,6 +31,7 @@ HoeffdingSplit<
     numSamples(0),
     numClasses(numClasses),
     maxSamples(maxSamples),
+    checkInterval(checkInterval),
     datasetInfo(const_cast<data::DatasetInfo*>(&datasetInfo)),
     successProbability(successProbability),
     splitDimension(size_t(-1)),
@@ -134,6 +136,10 @@ size_t HoeffdingSplit<
     CategoricalSplitType
 >::SplitCheck()
 {
+  // If we have not seen enough samples to check, don't check.
+  if (numSamples % checkInterval != 0)
+    return 0;
+
   // Do nothing if we've already split.
   if (splitDimension != size_t(-1))
     return 0;
@@ -314,7 +320,8 @@ void HoeffdingSplit<
   for (size_t i = 0; i < childMajorities.n_elem; ++i)
   {
     children.push_back(StreamingDecisionTreeType(*datasetInfo, dimensionality,
-        numClasses, successProbability, maxSamples, dimensionMappings));
+        numClasses, successProbability, maxSamples, checkInterval,
+        dimensionMappings));
     children[i].MajorityClass() = childMajorities[i];
   }
 
diff --git a/src/mlpack/methods/hoeffding_trees/streaming_decision_tree.hpp b/src/mlpack/methods/hoeffding_trees/streaming_decision_tree.hpp
index a05d9b2..11d99fe 100644
--- a/src/mlpack/methods/hoeffding_trees/streaming_decision_tree.hpp
+++ b/src/mlpack/methods/hoeffding_trees/streaming_decision_tree.hpp
@@ -24,13 +24,15 @@ class StreamingDecisionTree
                         const arma::Row<size_t>& labels,
                         const size_t numClasses,
                         const double confidence = 0.95,
-                        const size_t numSamples = 5000);
+                        const size_t numSamples = 5000,
+                        const size_t checkInterval = 100);
 
   StreamingDecisionTree(const data::DatasetInfo& datasetInfo,
                         const size_t dimensionality,
                         const size_t numClasses,
                         const double confidence = 0.95,
                         const size_t numSamples = 5000,
+                        const size_t checkInterval = 100,
                         std::unordered_map<size_t, std::pair<size_t, size_t>>*
                             dimensionMappings = NULL);
 
@@ -71,6 +73,7 @@ class StreamingDecisionTree
   void Serialize(Archive& ar, const unsigned int /* version */)
   {
     ar & data::CreateNVP(split, "split");
+    ar & data::CreateNVP(checkInterval, "checkInterval");
 
     size_t numChildren;
     if (Archive::is_saving::value)
@@ -90,6 +93,7 @@ class StreamingDecisionTree
 
  private:
   std::vector<StreamingDecisionTree> children;
+  size_t checkInterval;
 
   SplitType split;
 };
diff --git a/src/mlpack/methods/hoeffding_trees/streaming_decision_tree_impl.hpp b/src/mlpack/methods/hoeffding_trees/streaming_decision_tree_impl.hpp
index 8697a05..dda3d2a 100644
--- a/src/mlpack/methods/hoeffding_trees/streaming_decision_tree_impl.hpp
+++ b/src/mlpack/methods/hoeffding_trees/streaming_decision_tree_impl.hpp
@@ -20,8 +20,10 @@ StreamingDecisionTree<SplitType, MatType>::StreamingDecisionTree(
     const arma::Row<size_t>& labels,
     const size_t numClasses,
     const double confidence,
-    const size_t numSamples) :
-    split(data.n_rows, numClasses, datasetInfo, confidence, numSamples)
+    const size_t numSamples,
+    const size_t checkInterval) :
+    split(data.n_rows, numClasses, datasetInfo, confidence, numSamples,
+        checkInterval)
 {
   Train(data, labels);
 }
@@ -33,9 +35,10 @@ StreamingDecisionTree<SplitType, MatType>::StreamingDecisionTree(
     const size_t numClasses,
     const double confidence,
     const size_t numSamples,
+    const size_t checkInterval,
     std::unordered_map<size_t, std::pair<size_t, size_t>>* dimensionMappings) :
     split(dimensionality, numClasses, datasetInfo, confidence, numSamples,
-        dimensionMappings)
+        checkInterval, dimensionMappings)
 {
   // No training.  Anything else to do...?
 }



More information about the mlpack-git mailing list