[mlpack-git] master: Refactor: HoeffdingSplit -> HoeffdingTree. (2bff3d7)
gitdub at big.cc.gt.atl.ga.us
gitdub at big.cc.gt.atl.ga.us
Wed Dec 23 11:45:18 EST 2015
Repository : https://github.com/mlpack/mlpack
On branch : master
Link : https://github.com/mlpack/mlpack/compare/de9cc4b05069e1fa4793d9355f2f595af5ff45d2...6070527af14296cd99739de6c62666cc5d2a2125
>---------------------------------------------------------------
commit 2bff3d725602b858931f3f4ebe5390b0b1ea9d01
Author: Ryan Curtin <ryan at ratml.org>
Date: Fri Oct 30 16:40:58 2015 +0000
Refactor: HoeffdingSplit -> HoeffdingTree.
In preparation for removal of StreamingDecisionTree altogether.
>---------------------------------------------------------------
2bff3d725602b858931f3f4ebe5390b0b1ea9d01
src/mlpack/methods/hoeffding_trees/CMakeLists.txt | 4 +-
.../{hoeffding_split.hpp => hoeffding_tree.hpp} | 33 ++++++++++-----
...ding_split_impl.hpp => hoeffding_tree_impl.hpp} | 49 ++++++++++++----------
.../streaming_decision_tree_main.cpp | 4 +-
src/mlpack/tests/hoeffding_tree_test.cpp | 42 +++++++++----------
5 files changed, 73 insertions(+), 59 deletions(-)
diff --git a/src/mlpack/methods/hoeffding_trees/CMakeLists.txt b/src/mlpack/methods/hoeffding_trees/CMakeLists.txt
index 9b79011..1e7417b 100644
--- a/src/mlpack/methods/hoeffding_trees/CMakeLists.txt
+++ b/src/mlpack/methods/hoeffding_trees/CMakeLists.txt
@@ -9,8 +9,8 @@ set(SOURCES
hoeffding_categorical_split_impl.hpp
hoeffding_numeric_split.hpp
hoeffding_numeric_split_impl.hpp
- hoeffding_split.hpp
- hoeffding_split_impl.hpp
+ hoeffding_tree.hpp
+ hoeffding_tree_impl.hpp
numeric_split_info.hpp
streaming_decision_tree.hpp
streaming_decision_tree_impl.hpp
diff --git a/src/mlpack/methods/hoeffding_trees/hoeffding_split.hpp b/src/mlpack/methods/hoeffding_trees/hoeffding_tree.hpp
similarity index 86%
rename from src/mlpack/methods/hoeffding_trees/hoeffding_split.hpp
rename to src/mlpack/methods/hoeffding_trees/hoeffding_tree.hpp
index 04e24e5..69c58c3 100644
--- a/src/mlpack/methods/hoeffding_trees/hoeffding_split.hpp
+++ b/src/mlpack/methods/hoeffding_trees/hoeffding_tree.hpp
@@ -2,11 +2,11 @@
* @file hoeffding_split.hpp
* @author Ryan Curtin
*
- * An implementation of the standard Hoeffding bound split by Pedro Domingos and
- * Geoff Hulten in ``Mining High-Speed Data Streams''.
+ * An implementation of the standard Hoeffding tree by Pedro Domingos and Geoff
+ * Hulten in ``Mining High-Speed Data Streams''.
*/
-#ifndef __MLPACK_METHODS_HOEFFDING_TREES_HOEFFDING_SPLIT_HPP
-#define __MLPACK_METHODS_HOEFFDING_TREES_HOEFFDING_SPLIT_HPP
+#ifndef __MLPACK_METHODS_HOEFFDING_TREES_HOEFFDING_TREE_HPP
+#define __MLPACK_METHODS_HOEFFDING_TREES_HOEFFDING_TREE_HPP
#include <mlpack/core.hpp>
#include "gini_impurity.hpp"
@@ -17,11 +17,22 @@ namespace mlpack {
namespace tree {
/**
- * The HoeffdingSplit object represents all of the necessary information for a
- * Hoeffding-bound-based decision tree. (Perhaps, the whole thing should be
- * refactored so this class is called 'HoeffdingTree'.) This class is able to
- * train on samples in streaming settings and batch settings, and perform splits
- * based on the Hoeffding bound.
+ * The HoeffdingTree object represents all of the necessary information for a
+ * Hoeffding-bound-based decision tree. This class is able to train on samples
+ * in streaming settings and batch settings, and perform splits based on the
+ * Hoeffding bound. The Hoeffding tree (also known as the "very fast decision
+ * tree" -- VFDT) is described in the following paper:
+ *
+ * @code
+ * @inproceedings{domingos2000mining,
+ * title={{Mining High-Speed Data Streams}},
+ * author={Domingos, P. and Hulten, G.},
+ * year={2000},
+ * booktitle={Proceedings of the Sixth ACM SIGKDD International Conference
+ * on Knowledge Discovery and Data Mining (KDD '00)},
+ * pages={71--80}
+ * }
+ * @endcode
*
* The class is modular, and takes three template parameters. The first,
* FitnessFunction, is the fitness function that should be used to determine
@@ -41,7 +52,7 @@ template<typename FitnessFunction = GiniImpurity,
template<typename> class CategoricalSplitType =
HoeffdingCategoricalSplit
>
-class HoeffdingSplit
+class HoeffdingTree
{
public:
/**
@@ -188,6 +199,6 @@ class HoeffdingSplit
} // namespace tree
} // namespace mlpack
-#include "hoeffding_split_impl.hpp"
+#include "hoeffding_tree_impl.hpp"
#endif
diff --git a/src/mlpack/methods/hoeffding_trees/hoeffding_split_impl.hpp b/src/mlpack/methods/hoeffding_trees/hoeffding_tree_impl.hpp
similarity index 92%
rename from src/mlpack/methods/hoeffding_trees/hoeffding_split_impl.hpp
rename to src/mlpack/methods/hoeffding_trees/hoeffding_tree_impl.hpp
index b0898e7..5aa2093 100644
--- a/src/mlpack/methods/hoeffding_trees/hoeffding_split_impl.hpp
+++ b/src/mlpack/methods/hoeffding_trees/hoeffding_tree_impl.hpp
@@ -2,10 +2,13 @@
* @file hoeffding_split_impl.hpp
* @author Ryan Curtin
*
- * Implementation of the HoeffdingSplit class.
+ * Implementation of the HoeffdingTree class.
*/
-#ifndef __MLPACK_METHODS_HOEFFDING_TREES_HOEFFDING_SPLIT_IMPL_HPP
-#define __MLPACK_METHODS_HOEFFDING_TREES_HOEFFDING_SPLIT_IMPL_HPP
+#ifndef __MLPACK_METHODS_HOEFFDING_TREES_HOEFFDING_TREE_IMPL_HPP
+#define __MLPACK_METHODS_HOEFFDING_TREES_HOEFFDING_TREE_IMPL_HPP
+
+// In case it hasn't been included yet.
+#include "hoeffding_tree.hpp"
namespace mlpack {
namespace tree {
@@ -13,18 +16,18 @@ namespace tree {
template<typename FitnessFunction,
template<typename> class NumericSplitType,
template<typename> class CategoricalSplitType>
-HoeffdingSplit<
+HoeffdingTree<
FitnessFunction,
NumericSplitType,
CategoricalSplitType
->::HoeffdingSplit(const size_t dimensionality,
- const size_t numClasses,
- const data::DatasetInfo& datasetInfo,
- const double successProbability,
- const size_t maxSamples,
- const size_t checkInterval,
- std::unordered_map<size_t, std::pair<size_t, size_t>>*
- dimensionMappingsIn) :
+>::HoeffdingTree(const size_t dimensionality,
+ const size_t numClasses,
+ const data::DatasetInfo& datasetInfo,
+ const double successProbability,
+ const size_t maxSamples,
+ const size_t checkInterval,
+ std::unordered_map<size_t, std::pair<size_t, size_t>>*
+ dimensionMappingsIn) :
dimensionMappings((dimensionMappingsIn != NULL) ? dimensionMappingsIn :
new std::unordered_map<size_t, std::pair<size_t, size_t>>()),
ownsMappings(dimensionMappingsIn == NULL),
@@ -78,8 +81,8 @@ HoeffdingSplit<
template<typename FitnessFunction,
template<typename> class NumericSplitType,
template<typename> class CategoricalSplitType>
-HoeffdingSplit<FitnessFunction, NumericSplitType, CategoricalSplitType>::
- ~HoeffdingSplit()
+HoeffdingTree<FitnessFunction, NumericSplitType, CategoricalSplitType>::
+ ~HoeffdingTree()
{
if (ownsMappings)
delete dimensionMappings;
@@ -89,7 +92,7 @@ template<typename FitnessFunction,
template<typename> class NumericSplitType,
template<typename> class CategoricalSplitType>
template<typename VecType>
-void HoeffdingSplit<
+void HoeffdingTree<
FitnessFunction,
NumericSplitType,
CategoricalSplitType
@@ -130,7 +133,7 @@ void HoeffdingSplit<
template<typename FitnessFunction,
template<typename> class NumericSplitType,
template<typename> class CategoricalSplitType>
-size_t HoeffdingSplit<
+size_t HoeffdingTree<
FitnessFunction,
NumericSplitType,
CategoricalSplitType
@@ -211,7 +214,7 @@ template<
template<typename> class NumericSplitType,
template<typename> class CategoricalSplitType
>
-size_t HoeffdingSplit<
+size_t HoeffdingTree<
FitnessFunction,
NumericSplitType,
CategoricalSplitType
@@ -225,7 +228,7 @@ template<
template<typename> class NumericSplitType,
template<typename> class CategoricalSplitType
>
-size_t& HoeffdingSplit<
+size_t& HoeffdingTree<
FitnessFunction,
NumericSplitType,
CategoricalSplitType
@@ -240,7 +243,7 @@ template<
template<typename> class CategoricalSplitType
>
template<typename VecType>
-size_t HoeffdingSplit<
+size_t HoeffdingTree<
FitnessFunction,
NumericSplitType,
CategoricalSplitType
@@ -261,7 +264,7 @@ template<
template<typename> class CategoricalSplitType
>
template<typename VecType>
-size_t HoeffdingSplit<
+size_t HoeffdingTree<
FitnessFunction,
NumericSplitType,
CategoricalSplitType
@@ -278,7 +281,7 @@ template<
template<typename> class CategoricalSplitType
>
template<typename VecType>
-void HoeffdingSplit<
+void HoeffdingTree<
FitnessFunction,
NumericSplitType,
CategoricalSplitType
@@ -296,7 +299,7 @@ template<
template<typename> class CategoricalSplitType
>
template<typename StreamingDecisionTreeType>
-void HoeffdingSplit<
+void HoeffdingTree<
FitnessFunction,
NumericSplitType,
CategoricalSplitType
@@ -338,7 +341,7 @@ template<
template<typename> class CategoricalSplitType
>
template<typename Archive>
-void HoeffdingSplit<
+void HoeffdingTree<
FitnessFunction,
NumericSplitType,
CategoricalSplitType
diff --git a/src/mlpack/methods/hoeffding_trees/streaming_decision_tree_main.cpp b/src/mlpack/methods/hoeffding_trees/streaming_decision_tree_main.cpp
index ef17cd7..87653b2 100644
--- a/src/mlpack/methods/hoeffding_trees/streaming_decision_tree_main.cpp
+++ b/src/mlpack/methods/hoeffding_trees/streaming_decision_tree_main.cpp
@@ -66,10 +66,10 @@ int main(int argc, char** argv)
<< "specified too!" << endl;
if (numericSplitStrategy == "domingos")
- PerformActions<StreamingDecisionTree<HoeffdingSplit<GiniImpurity,
+ PerformActions<StreamingDecisionTree<HoeffdingTree<GiniImpurity,
HoeffdingDoubleNumericSplit, HoeffdingCategoricalSplit>>>();
else if (numericSplitStrategy == "binary")
- PerformActions<StreamingDecisionTree<HoeffdingSplit<GiniImpurity,
+ PerformActions<StreamingDecisionTree<HoeffdingTree<GiniImpurity,
BinaryDoubleNumericSplit, HoeffdingCategoricalSplit>>>();
else
Log::Fatal << "Unrecognized numeric split strategy ("
diff --git a/src/mlpack/tests/hoeffding_tree_test.cpp b/src/mlpack/tests/hoeffding_tree_test.cpp
index 97183e3..4d58f8e 100644
--- a/src/mlpack/tests/hoeffding_tree_test.cpp
+++ b/src/mlpack/tests/hoeffding_tree_test.cpp
@@ -207,7 +207,7 @@ BOOST_AUTO_TEST_CASE(HoeffdingCategoricalSplitSplitTest)
HoeffdingCategoricalSplit<GiniImpurity> split(3, 3); // 3 categories.
// No training is necessary because we can just call CreateChildren().
- std::vector<StreamingDecisionTree<HoeffdingSplit<>>> children;
+ std::vector<StreamingDecisionTree<HoeffdingTree<>>> children;
data::DatasetInfo info(3);
info.MapString("hello", 0); // Make dimension 0 categorical.
HoeffdingCategoricalSplit<GiniImpurity>::SplitInfo splitInfo(3);
@@ -223,10 +223,10 @@ BOOST_AUTO_TEST_CASE(HoeffdingCategoricalSplitSplitTest)
}
/**
- * If we feed the HoeffdingSplit a ton of points of the same class, it should
+ * If we feed the HoeffdingTree a ton of points of the same class, it should
* not suggest that we split.
*/
-BOOST_AUTO_TEST_CASE(HoeffdingSplitNoSplitTest)
+BOOST_AUTO_TEST_CASE(HoeffdingTreeNoSplitTest)
{
// Make all dimensions categorical.
data::DatasetInfo info(3);
@@ -240,7 +240,7 @@ BOOST_AUTO_TEST_CASE(HoeffdingSplitNoSplitTest)
info.MapString("cat1", 2);
info.MapString("cat2", 2);
- HoeffdingSplit<> split(3, 2, info, 0.95, 5000, 1);
+ HoeffdingTree<> split(3, 2, info, 0.95, 5000, 1);
// Feed it samples.
for (size_t i = 0; i < 1000; ++i)
@@ -257,10 +257,10 @@ BOOST_AUTO_TEST_CASE(HoeffdingSplitNoSplitTest)
}
/**
- * If we feed the HoeffdingSplit a ton of points of two different classes, it
+ * If we feed the HoeffdingTree a ton of points of two different classes, it
* should very clearly suggest that we split (eventually).
*/
-BOOST_AUTO_TEST_CASE(HoeffdingSplitEasySplitTest)
+BOOST_AUTO_TEST_CASE(HoeffdingTreeEasySplitTest)
{
// It'll be a two-dimensional dataset with two categories each. In the first
// dimension, category 0 will only receive points with class 0, and category 1
@@ -271,7 +271,7 @@ BOOST_AUTO_TEST_CASE(HoeffdingSplitEasySplitTest)
info.MapString("cat1", 0);
info.MapString("cat0", 1);
- HoeffdingSplit<> split(2, 2, info, 0.95, 5000, 1);
+ HoeffdingTree<> split(2, 2, info, 0.95, 5000, 1);
// Feed samples from each class.
for (size_t i = 0; i < 500; ++i)
@@ -288,7 +288,7 @@ BOOST_AUTO_TEST_CASE(HoeffdingSplitEasySplitTest)
/**
* If we force a success probability of 1, it should never split.
*/
-BOOST_AUTO_TEST_CASE(HoeffdingSplitProbability1SplitTest)
+BOOST_AUTO_TEST_CASE(HoeffdingTreeProbability1SplitTest)
{
// It'll be a two-dimensional dataset with two categories each. In the first
// dimension, category 0 will only receive points with class 0, and category 1
@@ -299,7 +299,7 @@ BOOST_AUTO_TEST_CASE(HoeffdingSplitProbability1SplitTest)
info.MapString("cat1", 0);
info.MapString("cat0", 1);
- HoeffdingSplit<> split(2, 2, info, 1.0, 12000, 1);
+ HoeffdingTree<> split(2, 2, info, 1.0, 12000, 1);
// Feed samples from each class.
for (size_t i = 0; i < 5000; ++i)
@@ -318,7 +318,7 @@ BOOST_AUTO_TEST_CASE(HoeffdingSplitProbability1SplitTest)
* perfect classification, another gives almost perfect classification (with 10%
* error). Splits should occur after many samples.
*/
-BOOST_AUTO_TEST_CASE(HoeffdingSplitAlmostPerfectSplit)
+BOOST_AUTO_TEST_CASE(HoeffdingTreeAlmostPerfectSplit)
{
// Two categories and two dimensions.
data::DatasetInfo info(2);
@@ -327,7 +327,7 @@ BOOST_AUTO_TEST_CASE(HoeffdingSplitAlmostPerfectSplit)
info.MapString("cat0", 1);
info.MapString("cat1", 1);
- HoeffdingSplit<> split(2, 2, info, 0.95, 5000, 1);
+ HoeffdingTree<> split(2, 2, info, 0.95, 5000, 1);
// Feed samples.
for (size_t i = 0; i < 500; ++i)
@@ -350,10 +350,10 @@ BOOST_AUTO_TEST_CASE(HoeffdingSplitAlmostPerfectSplit)
}
/**
- * Test that the HoeffdingSplit class will not split if the two features are
+ * Test that the HoeffdingTree class will not split if the two features are
* equally good.
*/
-BOOST_AUTO_TEST_CASE(HoeffdingSplitEqualSplitTest)
+BOOST_AUTO_TEST_CASE(HoeffdingTreeEqualSplitTest)
{
// Two categories and two dimensions.
data::DatasetInfo info(2);
@@ -362,7 +362,7 @@ BOOST_AUTO_TEST_CASE(HoeffdingSplitEqualSplitTest)
info.MapString("cat0", 1);
info.MapString("cat1", 1);
- HoeffdingSplit<> split(2, 2, info, 0.95, 5000, 1);
+ HoeffdingTree<> split(2, 2, info, 0.95, 5000, 1);
// Feed samples.
for (size_t i = 0; i < 500; ++i)
@@ -419,10 +419,10 @@ BOOST_AUTO_TEST_CASE(StreamingDecisionTreeSimpleDatasetTest)
// Now train two streaming decision trees; one on the whole dataset, and one
// on streaming data.
- StreamingDecisionTree<HoeffdingSplit<GiniImpurity,
+ StreamingDecisionTree<HoeffdingTree<GiniImpurity,
HoeffdingDoubleNumericSplit>, arma::Mat<size_t>>
batchTree(dataset, info, labels, 3);
- StreamingDecisionTree<HoeffdingSplit<GiniImpurity,
+ StreamingDecisionTree<HoeffdingTree<GiniImpurity,
HoeffdingDoubleNumericSplit>, arma::Mat<size_t>>
streamTree(info, 3, 3);
for (size_t i = 0; i < 9000; ++i)
@@ -622,10 +622,10 @@ BOOST_AUTO_TEST_CASE(NumericHoeffdingTreeTest)
// Now train two streaming decision trees; one on the whole dataset, and one
// on streaming data.
- StreamingDecisionTree<HoeffdingSplit<GiniImpurity,
+ StreamingDecisionTree<HoeffdingTree<GiniImpurity,
HoeffdingDoubleNumericSplit>, arma::mat> batchTree(dataset, info, labels,
3);
- StreamingDecisionTree<HoeffdingSplit<GiniImpurity,
+ StreamingDecisionTree<HoeffdingTree<GiniImpurity,
HoeffdingDoubleNumericSplit>, arma::mat> streamTree(info, 3, 3);
for (size_t i = 0; i < 9000; ++i)
streamTree.Train(dataset.col(i), labels[i]);
@@ -693,9 +693,9 @@ BOOST_AUTO_TEST_CASE(BinaryNumericHoeffdingTreeTest)
// Now train two streaming decision trees; one on the whole dataset, and one
// on streaming data.
- StreamingDecisionTree<HoeffdingSplit<GiniImpurity, BinaryDoubleNumericSplit>,
+ StreamingDecisionTree<HoeffdingTree<GiniImpurity, BinaryDoubleNumericSplit>,
arma::mat> batchTree(dataset, info, labels, 3);
- StreamingDecisionTree<HoeffdingSplit<GiniImpurity, BinaryDoubleNumericSplit>,
+ StreamingDecisionTree<HoeffdingTree<GiniImpurity, BinaryDoubleNumericSplit>,
arma::mat> streamTree(info, 4, 3);
for (size_t i = 0; i < 9000; ++i)
streamTree.Train(dataset.col(i), labels[i]);
@@ -735,7 +735,7 @@ BOOST_AUTO_TEST_CASE(BinaryNumericHoeffdingTreeTest)
BOOST_AUTO_TEST_CASE(MajorityProbabilityTest)
{
data::DatasetInfo info(1);
- StreamingDecisionTree<HoeffdingSplit<>> tree(info, 1, 3);
+ StreamingDecisionTree<HoeffdingTree<>> tree(info, 1, 3);
// Feed the tree a few samples.
tree.Train(arma::vec("1"), 0);
More information about the mlpack-git
mailing list