[mlpack-git] master: Handle numeric and categorical attributes simultaneously. (55dfc55)
gitdub at big.cc.gt.atl.ga.us
gitdub at big.cc.gt.atl.ga.us
Wed Dec 23 11:42:59 EST 2015
Repository : https://github.com/mlpack/mlpack
On branch : master
Link : https://github.com/mlpack/mlpack/compare/de9cc4b05069e1fa4793d9355f2f595af5ff45d2...6070527af14296cd99739de6c62666cc5d2a2125
>---------------------------------------------------------------
commit 55dfc552834bc7c9f2f84b8b636320c4ae207905
Author: ryan <ryan at ratml.org>
Date: Tue Sep 29 12:21:41 2015 -0400
Handle numeric and categorical attributes simultaneously.
>---------------------------------------------------------------
55dfc552834bc7c9f2f84b8b636320c4ae207905
.../hoeffding_trees/hoeffding_numeric_split.hpp | 2 ++
.../methods/hoeffding_trees/hoeffding_split.hpp | 2 ++
.../hoeffding_trees/hoeffding_split_impl.hpp | 23 ++++++++++++++++++----
.../streaming_decision_tree_main.cpp | 23 ++++++++++++++++++++--
4 files changed, 44 insertions(+), 6 deletions(-)
diff --git a/src/mlpack/methods/hoeffding_trees/hoeffding_numeric_split.hpp b/src/mlpack/methods/hoeffding_trees/hoeffding_numeric_split.hpp
index 5b3df29..567fd76 100644
--- a/src/mlpack/methods/hoeffding_trees/hoeffding_numeric_split.hpp
+++ b/src/mlpack/methods/hoeffding_trees/hoeffding_numeric_split.hpp
@@ -64,6 +64,8 @@ class HoeffdingNumericSplit
size_t MajorityClass() const;
+ size_t Bins() const { return bins; }
+
private:
// Cache the values of the points seen before we make bins.
arma::Col<ObservationType> observations;
diff --git a/src/mlpack/methods/hoeffding_trees/hoeffding_split.hpp b/src/mlpack/methods/hoeffding_trees/hoeffding_split.hpp
index bc96870..3c1e9ff 100644
--- a/src/mlpack/methods/hoeffding_trees/hoeffding_split.hpp
+++ b/src/mlpack/methods/hoeffding_trees/hoeffding_split.hpp
@@ -53,6 +53,8 @@ class HoeffdingSplit
std::vector<NumericSplitType> numericSplits;
std::vector<CategoricalSplitType> categoricalSplits;
+ std::unordered_map<size_t, std::pair<size_t, size_t>> dimensionMappings;
+
size_t numSamples;
size_t numClasses;
arma::Col<size_t> classCounts;
diff --git a/src/mlpack/methods/hoeffding_trees/hoeffding_split_impl.hpp b/src/mlpack/methods/hoeffding_trees/hoeffding_split_impl.hpp
index 5dffd01..765a197 100644
--- a/src/mlpack/methods/hoeffding_trees/hoeffding_split_impl.hpp
+++ b/src/mlpack/methods/hoeffding_trees/hoeffding_split_impl.hpp
@@ -33,10 +33,18 @@ HoeffdingSplit<
for (size_t i = 0; i < dimensionality; ++i)
{
if (datasetInfo.Type(i) == data::Datatype::categorical)
+ {
categoricalSplits.push_back(
CategoricalSplitType(datasetInfo.NumMappings(i), numClasses));
+ dimensionMappings[i] = std::make_pair(data::Datatype::categorical,
+ categoricalSplits.size() - 1);
+ }
else
+ {
numericSplits.push_back(NumericSplitType(numClasses));
+ dimensionMappings[i] = std::make_pair(data::Datatype::numeric,
+ numericSplits.size() - 1);
+ }
}
}
@@ -98,8 +106,15 @@ size_t HoeffdingSplit<
std::log(1.0 / (1.0 - successProbability)) / (2 * numSamples));
arma::vec gains(categoricalSplits.size() + numericSplits.size());
- for (size_t i = 0; i < categoricalSplits.size(); ++i)
- gains[i] = categoricalSplits[i].EvaluateFitnessFunction();
+ for (size_t i = 0; i < gains.n_elem; ++i)
+ {
+ size_t type = dimensionMappings[i].first;
+ size_t index = dimensionMappings[i].second;
+ if (type == data::Datatype::categorical)
+ gains[i] = categoricalSplits[index].EvaluateFitnessFunction();
+ else if (type == data::Datatype::numeric)
+ gains[i] = numericSplits[index].EvaluateFitnessFunction();
+ }
// Now find the largest and second-largest.
double largest = -DBL_MAX;
@@ -132,8 +147,8 @@ size_t HoeffdingSplit<
}
else
{
- majorityClass = 0;
- return 0; // I have no idea what to do yet.
+ majorityClass = numericSplits[largestIndex].MajorityClass();
+ return numericSplits[largestIndex].Bins();
}
}
else
diff --git a/src/mlpack/methods/hoeffding_trees/streaming_decision_tree_main.cpp b/src/mlpack/methods/hoeffding_trees/streaming_decision_tree_main.cpp
index 073ea1d..9d6226c 100644
--- a/src/mlpack/methods/hoeffding_trees/streaming_decision_tree_main.cpp
+++ b/src/mlpack/methods/hoeffding_trees/streaming_decision_tree_main.cpp
@@ -7,6 +7,7 @@
#include <mlpack/core.hpp>
#include <mlpack/methods/hoeffding_trees/streaming_decision_tree.hpp>
#include <mlpack/methods/hoeffding_trees/hoeffding_split.hpp>
+#include <stack>
using namespace std;
using namespace mlpack;
@@ -27,12 +28,30 @@ int main(int argc, char** argv)
DatasetInfo datasetInfo;
data::Load(trainingFile, trainingSet, datasetInfo, true);
- arma::Row<size_t> labels;
- data::Load(labelsFile, labels, true);
+ arma::Col<size_t> labelsIn;
+ data::Load(labelsFile, labelsIn, true, false);
+ arma::Row<size_t> labels = labelsIn.t();
// Now create the decision tree.
StreamingDecisionTree<HoeffdingSplit<>> tree(trainingSet, datasetInfo, labels,
max(labels) + 1);
// Great. Good job team.
+ std::stack<StreamingDecisionTree<HoeffdingSplit<>>*> stack;
+ stack.push(&tree);
+ while (!stack.empty())
+ {
+ StreamingDecisionTree<HoeffdingSplit<>>* node = stack.top();
+ stack.pop();
+
+ Log::Info << "Node:\n";
+ Log::Info << " split dimension " << node->Split().SplitDimension()
+ << ".\n";
+ Log::Info << " majority class " << node->Split().Classify(arma::vec())
+ << ".\n";
+ Log::Info << " children " << node->NumChildren() << ".\n";
+
+ for (size_t i = 0; i < node->NumChildren(); ++i)
+ stack.push(&node->Child(i));
+ }
}
More information about the mlpack-git mailing list