[mlpack-git] master: Allow large speedups by not requiring split checks every training point. (f03ae5b)
gitdub at big.cc.gt.atl.ga.us
gitdub at big.cc.gt.atl.ga.us
Wed Dec 23 11:44:41 EST 2015
Repository : https://github.com/mlpack/mlpack
On branch : master
Link : https://github.com/mlpack/mlpack/compare/de9cc4b05069e1fa4793d9355f2f595af5ff45d2...6070527af14296cd99739de6c62666cc5d2a2125
>---------------------------------------------------------------
commit f03ae5b69184b0fab7037481c88bcc8ab243a753
Author: Ryan Curtin <ryan at ratml.org>
Date: Sun Oct 18 07:26:04 2015 -0400
Allow large speedups by not requiring split checks every training point.
>---------------------------------------------------------------
f03ae5b69184b0fab7037481c88bcc8ab243a753
src/mlpack/methods/hoeffding_trees/hoeffding_split.hpp | 2 ++
src/mlpack/methods/hoeffding_trees/hoeffding_split_impl.hpp | 9 ++++++++-
src/mlpack/methods/hoeffding_trees/streaming_decision_tree.hpp | 6 +++++-
.../methods/hoeffding_trees/streaming_decision_tree_impl.hpp | 9 ++++++---
4 files changed, 21 insertions(+), 5 deletions(-)
diff --git a/src/mlpack/methods/hoeffding_trees/hoeffding_split.hpp b/src/mlpack/methods/hoeffding_trees/hoeffding_split.hpp
index 2f062fa..86ae07d 100644
--- a/src/mlpack/methods/hoeffding_trees/hoeffding_split.hpp
+++ b/src/mlpack/methods/hoeffding_trees/hoeffding_split.hpp
@@ -30,6 +30,7 @@ class HoeffdingSplit
const data::DatasetInfo& datasetInfo,
const double successProbability,
const size_t maxSamples,
+ const size_t checkInterval,
std::unordered_map<size_t, std::pair<size_t, size_t>>*
dimensionMappings = NULL);
@@ -81,6 +82,7 @@ class HoeffdingSplit
size_t numSamples;
size_t numClasses;
size_t maxSamples;
+ size_t checkInterval;
data::DatasetInfo* datasetInfo;
double successProbability;
diff --git a/src/mlpack/methods/hoeffding_trees/hoeffding_split_impl.hpp b/src/mlpack/methods/hoeffding_trees/hoeffding_split_impl.hpp
index 5f1e29c..c109ed1 100644
--- a/src/mlpack/methods/hoeffding_trees/hoeffding_split_impl.hpp
+++ b/src/mlpack/methods/hoeffding_trees/hoeffding_split_impl.hpp
@@ -22,6 +22,7 @@ HoeffdingSplit<
const data::DatasetInfo& datasetInfo,
const double successProbability,
const size_t maxSamples,
+ const size_t checkInterval,
std::unordered_map<size_t, std::pair<size_t, size_t>>*
dimensionMappingsIn) :
dimensionMappings((dimensionMappingsIn != NULL) ? dimensionMappingsIn :
@@ -30,6 +31,7 @@ HoeffdingSplit<
numSamples(0),
numClasses(numClasses),
maxSamples(maxSamples),
+ checkInterval(checkInterval),
datasetInfo(const_cast<data::DatasetInfo*>(&datasetInfo)),
successProbability(successProbability),
splitDimension(size_t(-1)),
@@ -134,6 +136,10 @@ size_t HoeffdingSplit<
CategoricalSplitType
>::SplitCheck()
{
+ // If we have not seen enough samples to check, don't check.
+ if (numSamples % checkInterval != 0)
+ return 0;
+
// Do nothing if we've already split.
if (splitDimension != size_t(-1))
return 0;
@@ -314,7 +320,8 @@ void HoeffdingSplit<
for (size_t i = 0; i < childMajorities.n_elem; ++i)
{
children.push_back(StreamingDecisionTreeType(*datasetInfo, dimensionality,
- numClasses, successProbability, maxSamples, dimensionMappings));
+ numClasses, successProbability, maxSamples, checkInterval,
+ dimensionMappings));
children[i].MajorityClass() = childMajorities[i];
}
diff --git a/src/mlpack/methods/hoeffding_trees/streaming_decision_tree.hpp b/src/mlpack/methods/hoeffding_trees/streaming_decision_tree.hpp
index a05d9b2..11d99fe 100644
--- a/src/mlpack/methods/hoeffding_trees/streaming_decision_tree.hpp
+++ b/src/mlpack/methods/hoeffding_trees/streaming_decision_tree.hpp
@@ -24,13 +24,15 @@ class StreamingDecisionTree
const arma::Row<size_t>& labels,
const size_t numClasses,
const double confidence = 0.95,
- const size_t numSamples = 5000);
+ const size_t numSamples = 5000,
+ const size_t checkInterval = 100);
StreamingDecisionTree(const data::DatasetInfo& datasetInfo,
const size_t dimensionality,
const size_t numClasses,
const double confidence = 0.95,
const size_t numSamples = 5000,
+ const size_t checkInterval = 100,
std::unordered_map<size_t, std::pair<size_t, size_t>>*
dimensionMappings = NULL);
@@ -71,6 +73,7 @@ class StreamingDecisionTree
void Serialize(Archive& ar, const unsigned int /* version */)
{
ar & data::CreateNVP(split, "split");
+ ar & data::CreateNVP(checkInterval, "checkInterval");
size_t numChildren;
if (Archive::is_saving::value)
@@ -90,6 +93,7 @@ class StreamingDecisionTree
private:
std::vector<StreamingDecisionTree> children;
+ size_t checkInterval;
SplitType split;
};
diff --git a/src/mlpack/methods/hoeffding_trees/streaming_decision_tree_impl.hpp b/src/mlpack/methods/hoeffding_trees/streaming_decision_tree_impl.hpp
index 8697a05..dda3d2a 100644
--- a/src/mlpack/methods/hoeffding_trees/streaming_decision_tree_impl.hpp
+++ b/src/mlpack/methods/hoeffding_trees/streaming_decision_tree_impl.hpp
@@ -20,8 +20,10 @@ StreamingDecisionTree<SplitType, MatType>::StreamingDecisionTree(
const arma::Row<size_t>& labels,
const size_t numClasses,
const double confidence,
- const size_t numSamples) :
- split(data.n_rows, numClasses, datasetInfo, confidence, numSamples)
+ const size_t numSamples,
+ const size_t checkInterval) :
+ split(data.n_rows, numClasses, datasetInfo, confidence, numSamples,
+ checkInterval)
{
Train(data, labels);
}
@@ -33,9 +35,10 @@ StreamingDecisionTree<SplitType, MatType>::StreamingDecisionTree(
const size_t numClasses,
const double confidence,
const size_t numSamples,
+ const size_t checkInterval,
std::unordered_map<size_t, std::pair<size_t, size_t>>* dimensionMappings) :
split(dimensionality, numClasses, datasetInfo, confidence, numSamples,
- dimensionMappings)
+ checkInterval, dimensionMappings)
{
// No training. Anything else to do...?
}
More information about the mlpack-git mailing list