[mlpack-git] master: Avoid copies of dimensionMappings. (a4415f3)
gitdub at big.cc.gt.atl.ga.us
gitdub at big.cc.gt.atl.ga.us
Wed Dec 23 11:43:13 EST 2015
Repository : https://github.com/mlpack/mlpack
On branch : master
Link : https://github.com/mlpack/mlpack/compare/de9cc4b05069e1fa4793d9355f2f595af5ff45d2...6070527af14296cd99739de6c62666cc5d2a2125
>---------------------------------------------------------------
commit a4415f3815eb29bcf58a6edcb8857f30acadf6b4
Author: Ryan Curtin <ryan at ratml.org>
Date: Wed Sep 30 15:37:58 2015 -0400
Avoid copies of dimensionMappings.
This currently fails, but I need to move it to another system to use gdb.
>---------------------------------------------------------------
a4415f3815eb29bcf58a6edcb8857f30acadf6b4
.../methods/hoeffding_trees/hoeffding_split.hpp | 11 +++-
.../hoeffding_trees/hoeffding_split_impl.hpp | 76 +++++++++++++++++-----
.../hoeffding_trees/streaming_decision_tree.hpp | 4 +-
.../streaming_decision_tree_impl.hpp | 6 +-
4 files changed, 74 insertions(+), 23 deletions(-)
diff --git a/src/mlpack/methods/hoeffding_trees/hoeffding_split.hpp b/src/mlpack/methods/hoeffding_trees/hoeffding_split.hpp
index 4d18b75..31d352a 100644
--- a/src/mlpack/methods/hoeffding_trees/hoeffding_split.hpp
+++ b/src/mlpack/methods/hoeffding_trees/hoeffding_split.hpp
@@ -27,7 +27,11 @@ class HoeffdingSplit
const size_t numClasses,
const data::DatasetInfo& datasetInfo,
const double successProbability,
- const size_t maxSamples);
+ const size_t maxSamples,
+ std::unordered_map<size_t, std::pair<size_t, size_t>>*
+ dimensionMappings = NULL);
+
+ ~HoeffdingSplit();
template<typename VecType>
void Train(const VecType& point, const size_t label);
@@ -59,7 +63,10 @@ class HoeffdingSplit
std::vector<NumericSplitType> numericSplits;
std::vector<CategoricalSplitType> categoricalSplits;
- std::unordered_map<size_t, std::pair<size_t, size_t>> dimensionMappings;
+ // This structure is owned by this node only if it is the root of the tree.
+ std::unordered_map<size_t, std::pair<size_t, size_t>>* dimensionMappings;
+ // Indicates whether or not we own the mappings.
+ bool ownsMappings;
size_t numSamples;
size_t numClasses;
diff --git a/src/mlpack/methods/hoeffding_trees/hoeffding_split_impl.hpp b/src/mlpack/methods/hoeffding_trees/hoeffding_split_impl.hpp
index b43a426..558dfb6 100644
--- a/src/mlpack/methods/hoeffding_trees/hoeffding_split_impl.hpp
+++ b/src/mlpack/methods/hoeffding_trees/hoeffding_split_impl.hpp
@@ -21,7 +21,12 @@ HoeffdingSplit<
const size_t numClasses,
const data::DatasetInfo& datasetInfo,
const double successProbability,
- const size_t maxSamples) :
+ const size_t maxSamples,
+ std::unordered_map<size_t, std::pair<size_t, size_t>>*
+ dimensionMappings) :
+ dimensionMappings((dimensionMappings != NULL) ? dimensionMappings :
+ new std::unordered_map<size_t, std::pair<size_t, size_t>>()),
+ ownsMappings(dimensionMappings == NULL),
numSamples(0),
numClasses(numClasses),
maxSamples(maxSamples),
@@ -31,20 +36,39 @@ HoeffdingSplit<
categoricalSplit(0),
numericSplit()
{
- for (size_t i = 0; i < dimensionality; ++i)
+ // Do we need to generate the mappings too?
+ if (ownsMappings)
{
- if (datasetInfo.Type(i) == data::Datatype::categorical)
+ for (size_t i = 0; i < dimensionality; ++i)
{
- categoricalSplits.push_back(
- CategoricalSplitType(datasetInfo.NumMappings(i), numClasses));
- dimensionMappings[i] = std::make_pair(data::Datatype::categorical,
- categoricalSplits.size() - 1);
+ if (datasetInfo.Type(i) == data::Datatype::categorical)
+ {
+ categoricalSplits.push_back(
+ CategoricalSplitType(datasetInfo.NumMappings(i), numClasses));
+ dimensionMappings->at(i) = std::make_pair(data::Datatype::categorical,
+ categoricalSplits.size() - 1);
+ }
+ else
+ {
+ numericSplits.push_back(NumericSplitType(numClasses));
+ dimensionMappings->at(i) = std::make_pair(data::Datatype::numeric,
+ numericSplits.size() - 1);
+ }
}
- else
+ }
+ else
+ {
+ for (size_t i = 0; i < dimensionality; ++i)
{
- numericSplits.push_back(NumericSplitType(numClasses));
- dimensionMappings[i] = std::make_pair(data::Datatype::numeric,
- numericSplits.size() - 1);
+ if (datasetInfo.Type(i) == data::Datatype::categorical)
+ {
+ categoricalSplits.push_back(
+ CategoricalSplitType(datasetInfo.NumMappings(i), numClasses));
+ }
+ else
+ {
+ numericSplits.push_back(NumericSplitType(numClasses));
+ }
}
}
}
@@ -52,6 +76,16 @@ HoeffdingSplit<
template<typename FitnessFunction,
typename NumericSplitType,
typename CategoricalSplitType>
+HoeffdingSplit<FitnessFunction, NumericSplitType, CategoricalSplitType>::
+ ~HoeffdingSplit()
+{
+ if (ownsMappings)
+ delete dimensionMappings;
+}
+
+template<typename FitnessFunction,
+ typename NumericSplitType,
+ typename CategoricalSplitType>
template<typename VecType>
void HoeffdingSplit<
FitnessFunction,
@@ -103,8 +137,8 @@ size_t HoeffdingSplit<
arma::vec gains(categoricalSplits.size() + numericSplits.size());
for (size_t i = 0; i < gains.n_elem; ++i)
{
- size_t type = dimensionMappings[i].first;
- size_t index = dimensionMappings[i].second;
+ size_t type = dimensionMappings->at(i).first;
+ size_t index = dimensionMappings->at(i).second;
if (type == data::Datatype::categorical)
gains[i] = categoricalSplits[index].EvaluateFitnessFunction();
else if (type == data::Datatype::numeric)
@@ -243,14 +277,16 @@ void HoeffdingSplit<
{
// Create the children.
arma::Col<size_t> childMajorities;
- if (dimensionMappings[splitDimension].first == data::Datatype::categorical)
+ if (dimensionMappings->at(splitDimension).first ==
+ data::Datatype::categorical)
{
- categoricalSplits[dimensionMappings[splitDimension].second].Split(
+ categoricalSplits[dimensionMappings->at(splitDimension).second].Split(
childMajorities, categoricalSplit);
}
- else if (dimensionMappings[splitDimension].first == data::Datatype::numeric)
+ else if (dimensionMappings->at(splitDimension).first ==
+ data::Datatype::numeric)
{
- numericSplits[dimensionMappings[splitDimension].second].Split(
+ numericSplits[dimensionMappings->at(splitDimension).second].Split(
childMajorities, numericSplit);
}
@@ -259,9 +295,13 @@ void HoeffdingSplit<
for (size_t i = 0; i < childMajorities.n_elem; ++i)
{
children.push_back(StreamingDecisionTreeType(datasetInfo, dimensionality,
- numClasses, successProbability, numSamples));
+ numClasses, successProbability, numSamples, dimensionMappings));
children[i].MajorityClass() = childMajorities[i];
}
+
+ // Eliminate now-unnecessary split information.
+ numericSplits.clear();
+ categoricalSplits.clear();
}
} // namespace tree
diff --git a/src/mlpack/methods/hoeffding_trees/streaming_decision_tree.hpp b/src/mlpack/methods/hoeffding_trees/streaming_decision_tree.hpp
index 71b4315..bcfb2c2 100644
--- a/src/mlpack/methods/hoeffding_trees/streaming_decision_tree.hpp
+++ b/src/mlpack/methods/hoeffding_trees/streaming_decision_tree.hpp
@@ -30,7 +30,9 @@ class StreamingDecisionTree
const size_t dimensionality,
const size_t numClasses,
const double confidence = 0.95,
- const size_t numSamples = 5000);
+ const size_t numSamples = 5000,
+ std::unordered_map<size_t, std::pair<size_t, size_t>>*
+ dimensionMappings = NULL);
StreamingDecisionTree(const StreamingDecisionTree& other);
diff --git a/src/mlpack/methods/hoeffding_trees/streaming_decision_tree_impl.hpp b/src/mlpack/methods/hoeffding_trees/streaming_decision_tree_impl.hpp
index 0c0a3b5..31e9857 100644
--- a/src/mlpack/methods/hoeffding_trees/streaming_decision_tree_impl.hpp
+++ b/src/mlpack/methods/hoeffding_trees/streaming_decision_tree_impl.hpp
@@ -32,8 +32,10 @@ StreamingDecisionTree<SplitType, MatType>::StreamingDecisionTree(
const size_t dimensionality,
const size_t numClasses,
const double confidence,
- const size_t numSamples) :
- split(dimensionality, numClasses, datasetInfo, confidence, numSamples)
+ const size_t numSamples,
+ std::unordered_map<size_t, std::pair<size_t, size_t>>* dimensionMappings) :
+ split(dimensionality, numClasses, datasetInfo, confidence, numSamples,
+ dimensionMappings)
{
// No training. Anything else to do...?
}
More information about the mlpack-git mailing list