[mlpack-git] master: Add minSamples for splitting. (91a5ff3)
gitdub at big.cc.gt.atl.ga.us
gitdub at big.cc.gt.atl.ga.us
Wed Dec 23 11:46:05 EST 2015
Repository : https://github.com/mlpack/mlpack
On branch : master
Link : https://github.com/mlpack/mlpack/compare/de9cc4b05069e1fa4793d9355f2f595af5ff45d2...6070527af14296cd99739de6c62666cc5d2a2125
>---------------------------------------------------------------
commit 91a5ff39a85c1cbcf3ff17f82c2633d4437cf86d
Author: Ryan Curtin <ryan at ratml.org>
Date: Wed Nov 11 12:07:03 2015 -0800
Add minSamples for splitting.
>---------------------------------------------------------------
91a5ff39a85c1cbcf3ff17f82c2633d4437cf86d
src/mlpack/methods/hoeffding_trees/hoeffding_tree.hpp | 10 +++++++++-
src/mlpack/methods/hoeffding_trees/hoeffding_tree_impl.hpp | 14 ++++++++++++--
2 files changed, 21 insertions(+), 3 deletions(-)
diff --git a/src/mlpack/methods/hoeffding_trees/hoeffding_tree.hpp b/src/mlpack/methods/hoeffding_trees/hoeffding_tree.hpp
index a97f1e5..55941bc 100644
--- a/src/mlpack/methods/hoeffding_trees/hoeffding_tree.hpp
+++ b/src/mlpack/methods/hoeffding_trees/hoeffding_tree.hpp
@@ -74,6 +74,8 @@ class HoeffdingTree
* never forces a split); ignored in batch training mode.
* @param checkInterval Number of samples required before each split; ignored
* in batch training mode.
+ * @param minSamples If the node has seen this many points or fewer, no split
+ * will be allowed.
*/
template<typename MatType>
HoeffdingTree(const MatType& data,
@@ -83,7 +85,8 @@ class HoeffdingTree
const bool batchTraining = true,
const double successProbability = 0.95,
const size_t maxSamples = 0,
- const size_t checkInterval = 100);
+ const size_t checkInterval = 100,
+ const size_t minSamples = 100);
/**
* Construct the Hoeffding tree with the given parameters, but training on no
@@ -98,6 +101,8 @@ class HoeffdingTree
* bound before a split can happen.
* @param maxSamples Maximum number of samples before a split is forced.
* @param checkInterval Number of samples required before each split check.
+ * @param minSamples If the node has seen this many points or fewer, no split
+ * will be allowed.
* @param dimensionMappings Mappings from dimension indices to positions in
* numeric and categorical split vectors. If left NULL, a new one will
* be created.
@@ -107,6 +112,7 @@ class HoeffdingTree
const double successProbability = 0.95,
const size_t maxSamples = 0,
const size_t checkInterval = 100,
+ const size_t minSamples = 100,
std::unordered_map<size_t, std::pair<size_t, size_t>>*
dimensionMappings = NULL);
@@ -264,6 +270,8 @@ class HoeffdingTree
size_t maxSamples;
//! The number of samples that should be seen before checking for a split.
size_t checkInterval;
+ //! A split is only allowed once more than this many samples have been seen.
+ size_t minSamples;
//! The dataset information.
const data::DatasetInfo* datasetInfo;
//! Whether or not we own the dataset information.
diff --git a/src/mlpack/methods/hoeffding_trees/hoeffding_tree_impl.hpp b/src/mlpack/methods/hoeffding_trees/hoeffding_tree_impl.hpp
index 7b279af..c30f48f 100644
--- a/src/mlpack/methods/hoeffding_trees/hoeffding_tree_impl.hpp
+++ b/src/mlpack/methods/hoeffding_trees/hoeffding_tree_impl.hpp
@@ -28,7 +28,8 @@ HoeffdingTree<
const bool batchTraining,
const double successProbability,
const size_t maxSamples,
- const size_t checkInterval) :
+ const size_t checkInterval,
+ const size_t minSamples) :
dimensionMappings(new std::unordered_map<size_t,
std::pair<size_t, size_t>>()),
ownsMappings(true),
@@ -36,6 +37,7 @@ HoeffdingTree<
numClasses(numClasses),
maxSamples((maxSamples == 0) ? size_t(-1) : maxSamples),
checkInterval(checkInterval),
+ minSamples(minSamples),
datasetInfo(&datasetInfo),
ownsInfo(false),
successProbability(successProbability),
@@ -77,6 +79,7 @@ HoeffdingTree<
const double successProbability,
const size_t maxSamples,
const size_t checkInterval,
+ const size_t minSamples,
std::unordered_map<size_t, std::pair<size_t, size_t>>*
dimensionMappingsIn) :
dimensionMappings((dimensionMappingsIn != NULL) ? dimensionMappingsIn :
@@ -86,6 +89,7 @@ HoeffdingTree<
numClasses(numClasses),
maxSamples((maxSamples == 0) ? size_t(-1) : maxSamples),
checkInterval(checkInterval),
+ minSamples(minSamples),
datasetInfo(&datasetInfo),
ownsInfo(false),
successProbability(successProbability),
@@ -144,6 +148,7 @@ HoeffdingTree<FitnessFunction, NumericSplitType, CategoricalSplitType>::
numSamples(other.numSamples),
numClasses(other.numClasses),
maxSamples(other.maxSamples),
+ minSamples(other.minSamples),
checkInterval(other.checkInterval),
datasetInfo(new data::DatasetInfo(*other.datasetInfo)),
ownsInfo(true),
@@ -318,6 +323,10 @@ size_t HoeffdingTree<
if (splitDimension != size_t(-1))
return 0;
+ // If not enough points have been seen, we cannot split.
+ if (numSamples <= minSamples)
+ return 0;
+
// Check the fitness of each dimension. Then we'll use a Hoeffding bound
// somehow.
@@ -524,7 +533,8 @@ void HoeffdingTree<
for (size_t i = 0; i < childMajorities.n_elem; ++i)
{
children.push_back(HoeffdingTree(*datasetInfo, numClasses,
- successProbability, maxSamples, checkInterval, dimensionMappings));
+ successProbability, maxSamples, checkInterval, minSamples,
+ dimensionMappings));
children[i].MajorityClass() = childMajorities[i];
}
More information about the mlpack-git mailing list