[mlpack-git] master: Add minSamples for splitting. (91a5ff3)

gitdub at big.cc.gt.atl.ga.us gitdub at big.cc.gt.atl.ga.us
Wed Dec 23 11:46:05 EST 2015


Repository : https://github.com/mlpack/mlpack

On branch  : master
Link       : https://github.com/mlpack/mlpack/compare/de9cc4b05069e1fa4793d9355f2f595af5ff45d2...6070527af14296cd99739de6c62666cc5d2a2125

>---------------------------------------------------------------

commit 91a5ff39a85c1cbcf3ff17f82c2633d4437cf86d
Author: Ryan Curtin <ryan at ratml.org>
Date:   Wed Nov 11 12:07:03 2015 -0800

    Add minSamples for splitting.


>---------------------------------------------------------------

91a5ff39a85c1cbcf3ff17f82c2633d4437cf86d
 src/mlpack/methods/hoeffding_trees/hoeffding_tree.hpp      | 10 +++++++++-
 src/mlpack/methods/hoeffding_trees/hoeffding_tree_impl.hpp | 14 ++++++++++++--
 2 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/src/mlpack/methods/hoeffding_trees/hoeffding_tree.hpp b/src/mlpack/methods/hoeffding_trees/hoeffding_tree.hpp
index a97f1e5..55941bc 100644
--- a/src/mlpack/methods/hoeffding_trees/hoeffding_tree.hpp
+++ b/src/mlpack/methods/hoeffding_trees/hoeffding_tree.hpp
@@ -74,6 +74,8 @@ class HoeffdingTree
    *      never forces a split); ignored in batch training mode.
    * @param checkInterval Number of samples required before each split; ignored
    *      in batch training mode.
+   * @param minSamples If the node has seen this many points or fewer, no split
+   *      will be allowed.
    */
   template<typename MatType>
   HoeffdingTree(const MatType& data,
@@ -83,7 +85,8 @@ class HoeffdingTree
                 const bool batchTraining = true,
                 const double successProbability = 0.95,
                 const size_t maxSamples = 0,
-                const size_t checkInterval = 100);
+                const size_t checkInterval = 100,
+                const size_t minSamples = 100);
 
   /**
    * Construct the Hoeffding tree with the given parameters, but training on no
@@ -98,6 +101,8 @@ class HoeffdingTree
    *      bound before a split can happen.
    * @param maxSamples Maximum number of samples before a split is forced.
    * @param checkInterval Number of samples required before each split check.
+   * @param minSamples If the node has seen this many points or fewer, no split
+   *      will be allowed.
    * @param dimensionMappings Mappings from dimension indices to positions in
    *      numeric and categorical split vectors.  If left NULL, a new one will
    *      be created.
@@ -107,6 +112,7 @@ class HoeffdingTree
                 const double successProbability = 0.95,
                 const size_t maxSamples = 0,
                 const size_t checkInterval = 100,
+                const size_t minSamples = 100,
                 std::unordered_map<size_t, std::pair<size_t, size_t>>*
                     dimensionMappings = NULL);
 
@@ -264,6 +270,8 @@ class HoeffdingTree
   size_t maxSamples;
   //! The number of samples that should be seen before checking for a split.
   size_t checkInterval;
+  //! A node must have seen more than this many samples before it may split.
+  size_t minSamples;
   //! The dataset information.
   const data::DatasetInfo* datasetInfo;
   //! Whether or not we own the dataset information.
diff --git a/src/mlpack/methods/hoeffding_trees/hoeffding_tree_impl.hpp b/src/mlpack/methods/hoeffding_trees/hoeffding_tree_impl.hpp
index 7b279af..c30f48f 100644
--- a/src/mlpack/methods/hoeffding_trees/hoeffding_tree_impl.hpp
+++ b/src/mlpack/methods/hoeffding_trees/hoeffding_tree_impl.hpp
@@ -28,7 +28,8 @@ HoeffdingTree<
                  const bool batchTraining,
                  const double successProbability,
                  const size_t maxSamples,
-                 const size_t checkInterval) :
+                 const size_t checkInterval,
+                 const size_t minSamples) :
     dimensionMappings(new std::unordered_map<size_t,
         std::pair<size_t, size_t>>()),
     ownsMappings(true),
@@ -36,6 +37,7 @@ HoeffdingTree<
     numClasses(numClasses),
     maxSamples((maxSamples == 0) ? size_t(-1) : maxSamples),
     checkInterval(checkInterval),
+    minSamples(minSamples),
     datasetInfo(&datasetInfo),
     ownsInfo(false),
     successProbability(successProbability),
@@ -77,6 +79,7 @@ HoeffdingTree<
                  const double successProbability,
                  const size_t maxSamples,
                  const size_t checkInterval,
+                 const size_t minSamples,
                  std::unordered_map<size_t, std::pair<size_t, size_t>>*
                      dimensionMappingsIn) :
     dimensionMappings((dimensionMappingsIn != NULL) ? dimensionMappingsIn :
@@ -86,6 +89,7 @@ HoeffdingTree<
     numClasses(numClasses),
     maxSamples((maxSamples == 0) ? size_t(-1) : maxSamples),
     checkInterval(checkInterval),
+    minSamples(minSamples),
     datasetInfo(&datasetInfo),
     ownsInfo(false),
     successProbability(successProbability),
@@ -144,6 +148,7 @@ HoeffdingTree<FitnessFunction, NumericSplitType, CategoricalSplitType>::
     numSamples(other.numSamples),
     numClasses(other.numClasses),
     maxSamples(other.maxSamples),
+    minSamples(other.minSamples),
     checkInterval(other.checkInterval),
     datasetInfo(new data::DatasetInfo(*other.datasetInfo)),
     ownsInfo(true),
@@ -318,6 +323,10 @@ size_t HoeffdingTree<
   if (splitDimension != size_t(-1))
     return 0;
 
+  // If not enough points have been seen, we cannot split.
+  if (numSamples <= minSamples)
+    return 0;
+
   // Check the fitness of each dimension.  Then we'll use a Hoeffding bound
   // somehow.
 
@@ -524,7 +533,8 @@ void HoeffdingTree<
   for (size_t i = 0; i < childMajorities.n_elem; ++i)
   {
     children.push_back(HoeffdingTree(*datasetInfo, numClasses,
-        successProbability, maxSamples, checkInterval, dimensionMappings));
+        successProbability, maxSamples, checkInterval, minSamples,
+        dimensionMappings));
     children[i].MajorityClass() = childMajorities[i];
   }
 



More information about the mlpack-git mailing list