[mlpack-git] master: Get HoeffdingSplit and StreamingDecisionTree to compile. (a77b9f3)
gitdub at big.cc.gt.atl.ga.us
gitdub at big.cc.gt.atl.ga.us
Wed Dec 23 11:42:32 EST 2015
Repository : https://github.com/mlpack/mlpack
On branch : master
Link : https://github.com/mlpack/mlpack/compare/de9cc4b05069e1fa4793d9355f2f595af5ff45d2...6070527af14296cd99739de6c62666cc5d2a2125
>---------------------------------------------------------------
commit a77b9f3bc478d2aae1315beff4ba98095663481f
Author: ryan <ryan at ratml.org>
Date: Tue Sep 22 23:22:43 2015 -0400
Get HoeffdingSplit and StreamingDecisionTree to compile.
>---------------------------------------------------------------
a77b9f3bc478d2aae1315beff4ba98095663481f
src/mlpack/methods/hoeffding_trees/CMakeLists.txt | 1 +
.../hoeffding_categorical_split.hpp | 2 +-
.../hoeffding_categorical_split_impl.hpp | 5 +-
.../hoeffding_trees/hoeffding_numeric_split.hpp | 13 +++-
.../hoeffding_trees/hoeffding_split_impl.hpp | 5 +-
.../hoeffding_trees/streaming_decision_tree.hpp | 9 ++-
.../streaming_decision_tree_impl.hpp | 11 ++--
src/mlpack/tests/hoeffding_tree_test.cpp | 72 ++++++++++++++++++++++
8 files changed, 106 insertions(+), 12 deletions(-)
diff --git a/src/mlpack/methods/hoeffding_trees/CMakeLists.txt b/src/mlpack/methods/hoeffding_trees/CMakeLists.txt
index 0d06de2..1e583f3 100644
--- a/src/mlpack/methods/hoeffding_trees/CMakeLists.txt
+++ b/src/mlpack/methods/hoeffding_trees/CMakeLists.txt
@@ -8,6 +8,7 @@ set(SOURCES
hoeffding_numeric_split.hpp
hoeffding_split.hpp
hoeffding_split_impl.hpp
+ numeric_split_info.hpp
streaming_decision_tree.hpp
streaming_decision_tree_impl.hpp
)
diff --git a/src/mlpack/methods/hoeffding_trees/hoeffding_categorical_split.hpp b/src/mlpack/methods/hoeffding_trees/hoeffding_categorical_split.hpp
index ce053cf..d41374d 100644
--- a/src/mlpack/methods/hoeffding_trees/hoeffding_categorical_split.hpp
+++ b/src/mlpack/methods/hoeffding_trees/hoeffding_categorical_split.hpp
@@ -49,7 +49,7 @@ class HoeffdingCategoricalSplit
template<typename StreamingDecisionTreeType>
void CreateChildren(std::vector<StreamingDecisionTreeType>& children,
- data::DatasetInfo& datasetInfo,
+ const data::DatasetInfo& datasetInfo,
SplitInfo& splitInfo);
size_t MajorityClass() const;
diff --git a/src/mlpack/methods/hoeffding_trees/hoeffding_categorical_split_impl.hpp b/src/mlpack/methods/hoeffding_trees/hoeffding_categorical_split_impl.hpp
index 599dcf0..b05f8df 100644
--- a/src/mlpack/methods/hoeffding_trees/hoeffding_categorical_split_impl.hpp
+++ b/src/mlpack/methods/hoeffding_trees/hoeffding_categorical_split_impl.hpp
@@ -43,12 +43,13 @@ template<typename FitnessFunction>
template<typename StreamingDecisionTreeType>
void HoeffdingCategoricalSplit<FitnessFunction>::CreateChildren(
std::vector<StreamingDecisionTreeType>& children,
- data::DatasetInfo& datasetInfo,
+ const data::DatasetInfo& datasetInfo,
SplitInfo& splitInfo)
{
// We'll make one child for each category.
for (size_t i = 0; i < sufficientStatistics.n_cols; ++i)
- children.push_back(StreamingDecisionTreeType(datasetInfo));
+ children.push_back(StreamingDecisionTreeType(datasetInfo, 3,
+ sufficientStatistics.n_rows));
// Create the according SplitInfo object.
splitInfo = SplitInfo(sufficientStatistics.n_cols);
diff --git a/src/mlpack/methods/hoeffding_trees/hoeffding_numeric_split.hpp b/src/mlpack/methods/hoeffding_trees/hoeffding_numeric_split.hpp
index 172d7bc..78a46d9 100644
--- a/src/mlpack/methods/hoeffding_trees/hoeffding_numeric_split.hpp
+++ b/src/mlpack/methods/hoeffding_trees/hoeffding_numeric_split.hpp
@@ -7,6 +7,9 @@
#ifndef __MLPACK_METHODS_HOEFFDING_TREES_HOEFFDING_NUMERIC_SPLIT_HPP
#define __MLPACK_METHODS_HOEFFDING_TREES_HOEFFDING_NUMERIC_SPLIT_HPP
+#include <mlpack/core.hpp>
+#include "numeric_split_info.hpp"
+
namespace mlpack {
namespace tree {
@@ -14,7 +17,7 @@ template<typename FitnessFunction>
class HoeffdingNumericSplit
{
public:
- typedef size_t SplitInfo;
+ typedef NumericSplitInfo SplitInfo;
HoeffdingNumericSplit();
@@ -22,6 +25,14 @@ class HoeffdingNumericSplit
void Train(eT /* value */, const size_t /* label */) { }
double EvaluateFitnessFunction() const { return 0.0; }
+
+ // Does nothing for now.
+ template<typename StreamingDecisionTreeType>
+ void CreateChildren(std::vector<StreamingDecisionTreeType>& children,
+ const data::DatasetInfo& datasetInfo,
+ SplitInfo& splitInfo) { } // Nothing to do.
+
+ size_t MajorityClass() const { return 0; } // Nothing yet.
};
} // namespace tree
diff --git a/src/mlpack/methods/hoeffding_trees/hoeffding_split_impl.hpp b/src/mlpack/methods/hoeffding_trees/hoeffding_split_impl.hpp
index f294adc..609971b 100644
--- a/src/mlpack/methods/hoeffding_trees/hoeffding_split_impl.hpp
+++ b/src/mlpack/methods/hoeffding_trees/hoeffding_split_impl.hpp
@@ -194,12 +194,13 @@ void HoeffdingSplit<
if (datasetInfo.Type(splitDimension) == data::Datatype::numeric)
{
- numericSplits[numericSplitIndex + 1].CreateChildren(children, numericSplit);
+ numericSplits[numericSplitIndex + 1].CreateChildren(children, datasetInfo,
+ numericSplit);
}
else if (datasetInfo.Type(splitDimension) == data::Datatype::categorical)
{
categoricalSplits[categoricalSplitIndex + 1].CreateChildren(children,
- categoricalSplit);
+ datasetInfo, categoricalSplit);
}
}
diff --git a/src/mlpack/methods/hoeffding_trees/streaming_decision_tree.hpp b/src/mlpack/methods/hoeffding_trees/streaming_decision_tree.hpp
index dc4a214..3d34738 100644
--- a/src/mlpack/methods/hoeffding_trees/streaming_decision_tree.hpp
+++ b/src/mlpack/methods/hoeffding_trees/streaming_decision_tree.hpp
@@ -21,9 +21,12 @@ class StreamingDecisionTree
public:
StreamingDecisionTree(const MatType& data,
const data::DatasetInfo& datasetInfo,
- const arma::Row<size_t>& labels);
+ const arma::Row<size_t>& labels,
+ const size_t numClasses);
- StreamingDecisionTree(const data::DatasetInfo& datasetInfo);
+ StreamingDecisionTree(const data::DatasetInfo& datasetInfo,
+ const size_t dimensionality,
+ const size_t numClasses);
StreamingDecisionTree(const StreamingDecisionTree& other);
@@ -32,6 +35,8 @@ class StreamingDecisionTree
const StreamingDecisionTree& Child(const size_t i) const { return children[i];
}
+ const SplitType& Split() const { return split; }
+
template<typename VecType>
void Train(const VecType& data, const size_t label);
diff --git a/src/mlpack/methods/hoeffding_trees/streaming_decision_tree_impl.hpp b/src/mlpack/methods/hoeffding_trees/streaming_decision_tree_impl.hpp
index f97a4f5..61e3f59 100644
--- a/src/mlpack/methods/hoeffding_trees/streaming_decision_tree_impl.hpp
+++ b/src/mlpack/methods/hoeffding_trees/streaming_decision_tree_impl.hpp
@@ -17,16 +17,19 @@ template<typename SplitType, typename MatType>
StreamingDecisionTree<SplitType, MatType>::StreamingDecisionTree(
const MatType& data,
const data::DatasetInfo& datasetInfo,
- const arma::Row<size_t>& labels) :
- split(0, 0, datasetInfo, 0)
+ const arma::Row<size_t>& labels,
+ const size_t numClasses) :
+ split(data.n_rows, numClasses, datasetInfo, 0.95)
{
Train(data, labels);
}
template<typename SplitType, typename MatType>
StreamingDecisionTree<SplitType, MatType>::StreamingDecisionTree(
- const data::DatasetInfo& datasetInfo) :
- split(0, 0, datasetInfo, 0)
+ const data::DatasetInfo& datasetInfo,
+ const size_t dimensionality,
+ const size_t numClasses) :
+ split(dimensionality, numClasses, datasetInfo, 0.95)
{
// No training. Anything else to do...?
}
diff --git a/src/mlpack/tests/hoeffding_tree_test.cpp b/src/mlpack/tests/hoeffding_tree_test.cpp
index f24c2dc..cfa9e51 100644
--- a/src/mlpack/tests/hoeffding_tree_test.cpp
+++ b/src/mlpack/tests/hoeffding_tree_test.cpp
@@ -347,4 +347,76 @@ BOOST_AUTO_TEST_CASE(HoeffdingSplitAlmostPerfectSplit)
BOOST_REQUIRE_EQUAL(split.SplitDimension(), 1);
}
+/**
+ * Build a decision tree on a dataset where only dimension 1 determines the
+ * label (dimensions 0 and 2 are random noise), and ensure that both batch and
+ * streaming training classify every training point correctly.
+ */
+BOOST_AUTO_TEST_CASE(StreamingDecisionTreeSimpleDatasetTest)
+{
+  DatasetInfo info;
+  info.MapString("cat0", 0);
+  info.MapString("cat1", 0);
+  info.MapString("cat2", 0);
+  info.MapString("cat3", 0);
+  info.MapString("cat4", 0);
+  info.MapString("cat5", 0);
+  info.MapString("cat6", 0);
+  info.MapString("cat0", 1);
+  info.MapString("cat1", 1);
+  info.MapString("cat2", 1);
+  info.MapString("cat0", 2);
+  info.MapString("cat1", 2);
+
+  // Generate points in triples; dimension 1 values 0/2/1 give labels 0/1/2.
+  arma::Mat<size_t> dataset(3, 9000);
+  arma::Row<size_t> labels(9000);
+  for (size_t i = 0; i < 9000; i += 3)
+  {
+    dataset(0, i) = mlpack::math::RandInt(7);
+    dataset(1, i) = 0;
+    dataset(2, i) = mlpack::math::RandInt(2);
+    labels(i) = 0;
+
+    dataset(0, i + 1) = mlpack::math::RandInt(7);
+    dataset(1, i + 1) = 2;
+    dataset(2, i + 1) = mlpack::math::RandInt(2);
+    labels(i + 1) = 1;
+
+    dataset(0, i + 2) = mlpack::math::RandInt(7);
+    dataset(1, i + 2) = 1;
+    dataset(2, i + 2) = mlpack::math::RandInt(2);
+    labels(i + 2) = 2;
+  }
+
+  // Now train two streaming decision trees; one on the whole dataset, and one
+  // on streaming data.  There are three classes (labels 0, 1, and 2).
+  StreamingDecisionTree<HoeffdingSplit<>, arma::Mat<size_t>>
+      batchTree(dataset, info, labels, 3);
+  StreamingDecisionTree<HoeffdingSplit<>, arma::Mat<size_t>>
+      streamTree(info, 3, 3);
+  for (size_t i = 0; i < 9000; ++i)
+    streamTree.Train(dataset.col(i), labels[i]);
+
+  // Each tree should have a single split, on dimension 1, with three children.
+  BOOST_REQUIRE_EQUAL(batchTree.NumChildren(), 3);
+  BOOST_REQUIRE_EQUAL(streamTree.NumChildren(), 3);
+  BOOST_REQUIRE_EQUAL(batchTree.Split().SplitDimension(), 1);
+  BOOST_REQUIRE_EQUAL(streamTree.Split().SplitDimension(), 1);
+
+  // Now, classify all the points in the dataset with each tree.
+  arma::Row<size_t> batchLabels(9000);
+  arma::Row<size_t> streamLabels(9000);
+
+  streamTree.Classify(dataset, streamLabels);
+  for (size_t i = 0; i < 9000; ++i)
+    batchLabels[i] = batchTree.Classify(dataset.col(i));
+
+  for (size_t i = 0; i < 9000; ++i)
+  {
+    BOOST_REQUIRE_EQUAL(labels[i], streamLabels[i]);
+    BOOST_REQUIRE_EQUAL(labels[i], batchLabels[i]);
+  }
+}
+
BOOST_AUTO_TEST_SUITE_END();
More information about the mlpack-git mailing list