[mlpack-git] master: Refactor: HoeffdingSplit -> HoeffdingTree. (2bff3d7)

gitdub at big.cc.gt.atl.ga.us gitdub at big.cc.gt.atl.ga.us
Wed Dec 23 11:45:18 EST 2015


Repository : https://github.com/mlpack/mlpack

On branch  : master
Link       : https://github.com/mlpack/mlpack/compare/de9cc4b05069e1fa4793d9355f2f595af5ff45d2...6070527af14296cd99739de6c62666cc5d2a2125

>---------------------------------------------------------------

commit 2bff3d725602b858931f3f4ebe5390b0b1ea9d01
Author: Ryan Curtin <ryan at ratml.org>
Date:   Fri Oct 30 16:40:58 2015 +0000

    Refactor: HoeffdingSplit -> HoeffdingTree.
    
    In preparation for removal of StreamingDecisionTree altogether.


>---------------------------------------------------------------

2bff3d725602b858931f3f4ebe5390b0b1ea9d01
 src/mlpack/methods/hoeffding_trees/CMakeLists.txt  |  4 +-
 .../{hoeffding_split.hpp => hoeffding_tree.hpp}    | 33 ++++++++++-----
 ...ding_split_impl.hpp => hoeffding_tree_impl.hpp} | 49 ++++++++++++----------
 .../streaming_decision_tree_main.cpp               |  4 +-
 src/mlpack/tests/hoeffding_tree_test.cpp           | 42 +++++++++----------
 5 files changed, 73 insertions(+), 59 deletions(-)

diff --git a/src/mlpack/methods/hoeffding_trees/CMakeLists.txt b/src/mlpack/methods/hoeffding_trees/CMakeLists.txt
index 9b79011..1e7417b 100644
--- a/src/mlpack/methods/hoeffding_trees/CMakeLists.txt
+++ b/src/mlpack/methods/hoeffding_trees/CMakeLists.txt
@@ -9,8 +9,8 @@ set(SOURCES
   hoeffding_categorical_split_impl.hpp
   hoeffding_numeric_split.hpp
   hoeffding_numeric_split_impl.hpp
-  hoeffding_split.hpp
-  hoeffding_split_impl.hpp
+  hoeffding_tree.hpp
+  hoeffding_tree_impl.hpp
   numeric_split_info.hpp
   streaming_decision_tree.hpp
   streaming_decision_tree_impl.hpp
diff --git a/src/mlpack/methods/hoeffding_trees/hoeffding_split.hpp b/src/mlpack/methods/hoeffding_trees/hoeffding_tree.hpp
similarity index 86%
rename from src/mlpack/methods/hoeffding_trees/hoeffding_split.hpp
rename to src/mlpack/methods/hoeffding_trees/hoeffding_tree.hpp
index 04e24e5..69c58c3 100644
--- a/src/mlpack/methods/hoeffding_trees/hoeffding_split.hpp
+++ b/src/mlpack/methods/hoeffding_trees/hoeffding_tree.hpp
@@ -2,11 +2,11 @@
  * @file hoeffding_split.hpp
  * @author Ryan Curtin
  *
- * An implementation of the standard Hoeffding bound split by Pedro Domingos and
- * Geoff Hulten in ``Mining High-Speed Data Streams''.
+ * An implementation of the standard Hoeffding tree by Pedro Domingos and Geoff
+ * Hulten in ``Mining High-Speed Data Streams''.
  */
-#ifndef __MLPACK_METHODS_HOEFFDING_TREES_HOEFFDING_SPLIT_HPP
-#define __MLPACK_METHODS_HOEFFDING_TREES_HOEFFDING_SPLIT_HPP
+#ifndef __MLPACK_METHODS_HOEFFDING_TREES_HOEFFDING_TREE_HPP
+#define __MLPACK_METHODS_HOEFFDING_TREES_HOEFFDING_TREE_HPP
 
 #include <mlpack/core.hpp>
 #include "gini_impurity.hpp"
@@ -17,11 +17,22 @@ namespace mlpack {
 namespace tree {
 
 /**
- * The HoeffdingSplit object represents all of the necessary information for a
- * Hoeffding-bound-based decision tree.  (Perhaps, the whole thing should be
- * refactored so this class is called 'HoeffdingTree'.)  This class is able to
- * train on samples in streaming settings and batch settings, and perform splits
- * based on the Hoeffding bound.
+ * The HoeffdingTree object represents all of the necessary information for a
+ * Hoeffding-bound-based decision tree.  This class is able to train on samples
+ * in streaming settings and batch settings, and perform splits based on the
+ * Hoeffding bound.  The Hoeffding tree (also known as the "very fast decision
+ * tree" -- VFDT) is described in the following paper:
+ *
+ * @code
+ * @inproceedings{domingos2000mining,
+ *     title={{Mining High-Speed Data Streams}},
+ *     author={Domingos, P. and Hulten, G.},
+ *     year={2000},
+ *     booktitle={Proceedings of the Sixth ACM SIGKDD International Conference
+ *         on Knowledge Discovery and Data Mining (KDD '00)},
+ *     pages={71--80}
+ * }
+ * @endcode
  *
  * The class is modular, and takes three template parameters.  The first,
  * FitnessFunction, is the fitness function that should be used to determine
@@ -41,7 +52,7 @@ template<typename FitnessFunction = GiniImpurity,
          template<typename> class CategoricalSplitType =
              HoeffdingCategoricalSplit
 >
-class HoeffdingSplit
+class HoeffdingTree
 {
  public:
   /**
@@ -188,6 +199,6 @@ class HoeffdingSplit
 } // namespace tree
 } // namespace mlpack
 
-#include "hoeffding_split_impl.hpp"
+#include "hoeffding_tree_impl.hpp"
 
 #endif
diff --git a/src/mlpack/methods/hoeffding_trees/hoeffding_split_impl.hpp b/src/mlpack/methods/hoeffding_trees/hoeffding_tree_impl.hpp
similarity index 92%
rename from src/mlpack/methods/hoeffding_trees/hoeffding_split_impl.hpp
rename to src/mlpack/methods/hoeffding_trees/hoeffding_tree_impl.hpp
index b0898e7..5aa2093 100644
--- a/src/mlpack/methods/hoeffding_trees/hoeffding_split_impl.hpp
+++ b/src/mlpack/methods/hoeffding_trees/hoeffding_tree_impl.hpp
@@ -2,10 +2,13 @@
  * @file hoeffding_split_impl.hpp
  * @author Ryan Curtin
  *
- * Implementation of the HoeffdingSplit class.
+ * Implementation of the HoeffdingTree class.
  */
-#ifndef __MLPACK_METHODS_HOEFFDING_TREES_HOEFFDING_SPLIT_IMPL_HPP
-#define __MLPACK_METHODS_HOEFFDING_TREES_HOEFFDING_SPLIT_IMPL_HPP
+#ifndef __MLPACK_METHODS_HOEFFDING_TREES_HOEFFDING_TREE_IMPL_HPP
+#define __MLPACK_METHODS_HOEFFDING_TREES_HOEFFDING_TREE_IMPL_HPP
+
+// In case it hasn't been included yet.
+#include "hoeffding_tree.hpp"
 
 namespace mlpack {
 namespace tree {
@@ -13,18 +16,18 @@ namespace tree {
 template<typename FitnessFunction,
          template<typename> class NumericSplitType,
          template<typename> class CategoricalSplitType>
-HoeffdingSplit<
+HoeffdingTree<
     FitnessFunction,
     NumericSplitType,
     CategoricalSplitType
->::HoeffdingSplit(const size_t dimensionality,
-                  const size_t numClasses,
-                  const data::DatasetInfo& datasetInfo,
-                  const double successProbability,
-                  const size_t maxSamples,
-                  const size_t checkInterval,
-                  std::unordered_map<size_t, std::pair<size_t, size_t>>*
-                      dimensionMappingsIn) :
+>::HoeffdingTree(const size_t dimensionality,
+                 const size_t numClasses,
+                 const data::DatasetInfo& datasetInfo,
+                 const double successProbability,
+                 const size_t maxSamples,
+                 const size_t checkInterval,
+                 std::unordered_map<size_t, std::pair<size_t, size_t>>*
+                     dimensionMappingsIn) :
     dimensionMappings((dimensionMappingsIn != NULL) ? dimensionMappingsIn :
         new std::unordered_map<size_t, std::pair<size_t, size_t>>()),
     ownsMappings(dimensionMappingsIn == NULL),
@@ -78,8 +81,8 @@ HoeffdingSplit<
 template<typename FitnessFunction,
          template<typename> class NumericSplitType,
          template<typename> class CategoricalSplitType>
-HoeffdingSplit<FitnessFunction, NumericSplitType, CategoricalSplitType>::
-    ~HoeffdingSplit()
+HoeffdingTree<FitnessFunction, NumericSplitType, CategoricalSplitType>::
+    ~HoeffdingTree()
 {
   if (ownsMappings)
     delete dimensionMappings;
@@ -89,7 +92,7 @@ template<typename FitnessFunction,
          template<typename> class NumericSplitType,
          template<typename> class CategoricalSplitType>
 template<typename VecType>
-void HoeffdingSplit<
+void HoeffdingTree<
     FitnessFunction,
     NumericSplitType,
     CategoricalSplitType
@@ -130,7 +133,7 @@ void HoeffdingSplit<
 template<typename FitnessFunction,
          template<typename> class NumericSplitType,
          template<typename> class CategoricalSplitType>
-size_t HoeffdingSplit<
+size_t HoeffdingTree<
     FitnessFunction,
     NumericSplitType,
     CategoricalSplitType
@@ -211,7 +214,7 @@ template<
     template<typename> class NumericSplitType,
     template<typename> class CategoricalSplitType
 >
-size_t HoeffdingSplit<
+size_t HoeffdingTree<
     FitnessFunction,
     NumericSplitType,
     CategoricalSplitType
@@ -225,7 +228,7 @@ template<
     template<typename> class NumericSplitType,
     template<typename> class CategoricalSplitType
 >
-size_t& HoeffdingSplit<
+size_t& HoeffdingTree<
     FitnessFunction,
     NumericSplitType,
     CategoricalSplitType
@@ -240,7 +243,7 @@ template<
     template<typename> class CategoricalSplitType
 >
 template<typename VecType>
-size_t HoeffdingSplit<
+size_t HoeffdingTree<
     FitnessFunction,
     NumericSplitType,
     CategoricalSplitType
@@ -261,7 +264,7 @@ template<
     template<typename> class CategoricalSplitType
 >
 template<typename VecType>
-size_t HoeffdingSplit<
+size_t HoeffdingTree<
     FitnessFunction,
     NumericSplitType,
     CategoricalSplitType
@@ -278,7 +281,7 @@ template<
     template<typename> class CategoricalSplitType
 >
 template<typename VecType>
-void HoeffdingSplit<
+void HoeffdingTree<
     FitnessFunction,
     NumericSplitType,
     CategoricalSplitType
@@ -296,7 +299,7 @@ template<
     template<typename> class CategoricalSplitType
 >
 template<typename StreamingDecisionTreeType>
-void HoeffdingSplit<
+void HoeffdingTree<
     FitnessFunction,
     NumericSplitType,
     CategoricalSplitType
@@ -338,7 +341,7 @@ template<
     template<typename> class CategoricalSplitType
 >
 template<typename Archive>
-void HoeffdingSplit<
+void HoeffdingTree<
     FitnessFunction,
     NumericSplitType,
     CategoricalSplitType
diff --git a/src/mlpack/methods/hoeffding_trees/streaming_decision_tree_main.cpp b/src/mlpack/methods/hoeffding_trees/streaming_decision_tree_main.cpp
index ef17cd7..87653b2 100644
--- a/src/mlpack/methods/hoeffding_trees/streaming_decision_tree_main.cpp
+++ b/src/mlpack/methods/hoeffding_trees/streaming_decision_tree_main.cpp
@@ -66,10 +66,10 @@ int main(int argc, char** argv)
         << "specified too!" << endl;
 
   if (numericSplitStrategy == "domingos")
-    PerformActions<StreamingDecisionTree<HoeffdingSplit<GiniImpurity,
+    PerformActions<StreamingDecisionTree<HoeffdingTree<GiniImpurity,
         HoeffdingDoubleNumericSplit, HoeffdingCategoricalSplit>>>();
   else if (numericSplitStrategy == "binary")
-    PerformActions<StreamingDecisionTree<HoeffdingSplit<GiniImpurity,
+    PerformActions<StreamingDecisionTree<HoeffdingTree<GiniImpurity,
         BinaryDoubleNumericSplit, HoeffdingCategoricalSplit>>>();
   else
     Log::Fatal << "Unrecognized numeric split strategy ("
diff --git a/src/mlpack/tests/hoeffding_tree_test.cpp b/src/mlpack/tests/hoeffding_tree_test.cpp
index 97183e3..4d58f8e 100644
--- a/src/mlpack/tests/hoeffding_tree_test.cpp
+++ b/src/mlpack/tests/hoeffding_tree_test.cpp
@@ -207,7 +207,7 @@ BOOST_AUTO_TEST_CASE(HoeffdingCategoricalSplitSplitTest)
   HoeffdingCategoricalSplit<GiniImpurity> split(3, 3); // 3 categories.
 
   // No training is necessary because we can just call CreateChildren().
-  std::vector<StreamingDecisionTree<HoeffdingSplit<>>> children;
+  std::vector<StreamingDecisionTree<HoeffdingTree<>>> children;
   data::DatasetInfo info(3);
   info.MapString("hello", 0); // Make dimension 0 categorical.
   HoeffdingCategoricalSplit<GiniImpurity>::SplitInfo splitInfo(3);
@@ -223,10 +223,10 @@ BOOST_AUTO_TEST_CASE(HoeffdingCategoricalSplitSplitTest)
 }
 
 /**
- * If we feed the HoeffdingSplit a ton of points of the same class, it should
+ * If we feed the HoeffdingTree a ton of points of the same class, it should
  * not suggest that we split.
  */
-BOOST_AUTO_TEST_CASE(HoeffdingSplitNoSplitTest)
+BOOST_AUTO_TEST_CASE(HoeffdingTreeNoSplitTest)
 {
   // Make all dimensions categorical.
   data::DatasetInfo info(3);
@@ -240,7 +240,7 @@ BOOST_AUTO_TEST_CASE(HoeffdingSplitNoSplitTest)
   info.MapString("cat1", 2);
   info.MapString("cat2", 2);
 
-  HoeffdingSplit<> split(3, 2, info, 0.95, 5000, 1);
+  HoeffdingTree<> split(3, 2, info, 0.95, 5000, 1);
 
   // Feed it samples.
   for (size_t i = 0; i < 1000; ++i)
@@ -257,10 +257,10 @@ BOOST_AUTO_TEST_CASE(HoeffdingSplitNoSplitTest)
 }
 
 /**
- * If we feed the HoeffdingSplit a ton of points of two different classes, it
+ * If we feed the HoeffdingTree a ton of points of two different classes, it
  * should very clearly suggest that we split (eventually).
  */
-BOOST_AUTO_TEST_CASE(HoeffdingSplitEasySplitTest)
+BOOST_AUTO_TEST_CASE(HoeffdingTreeEasySplitTest)
 {
   // It'll be a two-dimensional dataset with two categories each.  In the first
   // dimension, category 0 will only receive points with class 0, and category 1
@@ -271,7 +271,7 @@ BOOST_AUTO_TEST_CASE(HoeffdingSplitEasySplitTest)
   info.MapString("cat1", 0);
   info.MapString("cat0", 1);
 
-  HoeffdingSplit<> split(2, 2, info, 0.95, 5000, 1);
+  HoeffdingTree<> split(2, 2, info, 0.95, 5000, 1);
 
   // Feed samples from each class.
   for (size_t i = 0; i < 500; ++i)
@@ -288,7 +288,7 @@ BOOST_AUTO_TEST_CASE(HoeffdingSplitEasySplitTest)
 /**
  * If we force a success probability of 1, it should never split.
  */
-BOOST_AUTO_TEST_CASE(HoeffdingSplitProbability1SplitTest)
+BOOST_AUTO_TEST_CASE(HoeffdingTreeProbability1SplitTest)
 {
   // It'll be a two-dimensional dataset with two categories each.  In the first
   // dimension, category 0 will only receive points with class 0, and category 1
@@ -299,7 +299,7 @@ BOOST_AUTO_TEST_CASE(HoeffdingSplitProbability1SplitTest)
   info.MapString("cat1", 0);
   info.MapString("cat0", 1);
 
-  HoeffdingSplit<> split(2, 2, info, 1.0, 12000, 1);
+  HoeffdingTree<> split(2, 2, info, 1.0, 12000, 1);
 
   // Feed samples from each class.
   for (size_t i = 0; i < 5000; ++i)
@@ -318,7 +318,7 @@ BOOST_AUTO_TEST_CASE(HoeffdingSplitProbability1SplitTest)
  * perfect classification, another gives almost perfect classification (with 10%
  * error).  Splits should occur after many samples.
  */
-BOOST_AUTO_TEST_CASE(HoeffdingSplitAlmostPerfectSplit)
+BOOST_AUTO_TEST_CASE(HoeffdingTreeAlmostPerfectSplit)
 {
   // Two categories and two dimensions.
   data::DatasetInfo info(2);
@@ -327,7 +327,7 @@ BOOST_AUTO_TEST_CASE(HoeffdingSplitAlmostPerfectSplit)
   info.MapString("cat0", 1);
   info.MapString("cat1", 1);
 
-  HoeffdingSplit<> split(2, 2, info, 0.95, 5000, 1);
+  HoeffdingTree<> split(2, 2, info, 0.95, 5000, 1);
 
   // Feed samples.
   for (size_t i = 0; i < 500; ++i)
@@ -350,10 +350,10 @@ BOOST_AUTO_TEST_CASE(HoeffdingSplitAlmostPerfectSplit)
 }
 
 /**
- * Test that the HoeffdingSplit class will not split if the two features are
+ * Test that the HoeffdingTree class will not split if the two features are
  * equally good.
  */
-BOOST_AUTO_TEST_CASE(HoeffdingSplitEqualSplitTest)
+BOOST_AUTO_TEST_CASE(HoeffdingTreeEqualSplitTest)
 {
   // Two categories and two dimensions.
   data::DatasetInfo info(2);
@@ -362,7 +362,7 @@ BOOST_AUTO_TEST_CASE(HoeffdingSplitEqualSplitTest)
   info.MapString("cat0", 1);
   info.MapString("cat1", 1);
 
-  HoeffdingSplit<> split(2, 2, info, 0.95, 5000, 1);
+  HoeffdingTree<> split(2, 2, info, 0.95, 5000, 1);
 
   // Feed samples.
   for (size_t i = 0; i < 500; ++i)
@@ -419,10 +419,10 @@ BOOST_AUTO_TEST_CASE(StreamingDecisionTreeSimpleDatasetTest)
 
   // Now train two streaming decision trees; one on the whole dataset, and one
   // on streaming data.
-  StreamingDecisionTree<HoeffdingSplit<GiniImpurity,
+  StreamingDecisionTree<HoeffdingTree<GiniImpurity,
       HoeffdingDoubleNumericSplit>, arma::Mat<size_t>>
       batchTree(dataset, info, labels, 3);
-  StreamingDecisionTree<HoeffdingSplit<GiniImpurity,
+  StreamingDecisionTree<HoeffdingTree<GiniImpurity,
       HoeffdingDoubleNumericSplit>, arma::Mat<size_t>>
       streamTree(info, 3, 3);
   for (size_t i = 0; i < 9000; ++i)
@@ -622,10 +622,10 @@ BOOST_AUTO_TEST_CASE(NumericHoeffdingTreeTest)
 
   // Now train two streaming decision trees; one on the whole dataset, and one
   // on streaming data.
-  StreamingDecisionTree<HoeffdingSplit<GiniImpurity,
+  StreamingDecisionTree<HoeffdingTree<GiniImpurity,
       HoeffdingDoubleNumericSplit>, arma::mat> batchTree(dataset, info, labels,
       3);
-  StreamingDecisionTree<HoeffdingSplit<GiniImpurity,
+  StreamingDecisionTree<HoeffdingTree<GiniImpurity,
       HoeffdingDoubleNumericSplit>, arma::mat> streamTree(info, 3, 3);
   for (size_t i = 0; i < 9000; ++i)
     streamTree.Train(dataset.col(i), labels[i]);
@@ -693,9 +693,9 @@ BOOST_AUTO_TEST_CASE(BinaryNumericHoeffdingTreeTest)
 
   // Now train two streaming decision trees; one on the whole dataset, and one
   // on streaming data.
-  StreamingDecisionTree<HoeffdingSplit<GiniImpurity, BinaryDoubleNumericSplit>,
+  StreamingDecisionTree<HoeffdingTree<GiniImpurity, BinaryDoubleNumericSplit>,
       arma::mat> batchTree(dataset, info, labels, 3);
-  StreamingDecisionTree<HoeffdingSplit<GiniImpurity, BinaryDoubleNumericSplit>,
+  StreamingDecisionTree<HoeffdingTree<GiniImpurity, BinaryDoubleNumericSplit>,
       arma::mat> streamTree(info, 4, 3);
   for (size_t i = 0; i < 9000; ++i)
     streamTree.Train(dataset.col(i), labels[i]);
@@ -735,7 +735,7 @@ BOOST_AUTO_TEST_CASE(BinaryNumericHoeffdingTreeTest)
 BOOST_AUTO_TEST_CASE(MajorityProbabilityTest)
 {
   data::DatasetInfo info(1);
-  StreamingDecisionTree<HoeffdingSplit<>> tree(info, 1, 3);
+  StreamingDecisionTree<HoeffdingTree<>> tree(info, 1, 3);
 
   // Feed the tree a few samples.
   tree.Train(arma::vec("1"), 0);



More information about the mlpack-git mailing list