[mlpack-git] master: Get HoeffdingSplit and StreamingDecisionTree to compile. (a77b9f3)

gitdub at big.cc.gt.atl.ga.us gitdub at big.cc.gt.atl.ga.us
Wed Dec 23 11:42:32 EST 2015


Repository : https://github.com/mlpack/mlpack

On branch  : master
Link       : https://github.com/mlpack/mlpack/compare/de9cc4b05069e1fa4793d9355f2f595af5ff45d2...6070527af14296cd99739de6c62666cc5d2a2125

>---------------------------------------------------------------

commit a77b9f3bc478d2aae1315beff4ba98095663481f
Author: ryan <ryan at ratml.org>
Date:   Tue Sep 22 23:22:43 2015 -0400

    Get HoeffdingSplit and StreamingDecisionTree to compile.


>---------------------------------------------------------------

a77b9f3bc478d2aae1315beff4ba98095663481f
 src/mlpack/methods/hoeffding_trees/CMakeLists.txt  |  1 +
 .../hoeffding_categorical_split.hpp                |  2 +-
 .../hoeffding_categorical_split_impl.hpp           |  5 +-
 .../hoeffding_trees/hoeffding_numeric_split.hpp    | 13 +++-
 .../hoeffding_trees/hoeffding_split_impl.hpp       |  5 +-
 .../hoeffding_trees/streaming_decision_tree.hpp    |  9 ++-
 .../streaming_decision_tree_impl.hpp               | 11 ++--
 src/mlpack/tests/hoeffding_tree_test.cpp           | 72 ++++++++++++++++++++++
 8 files changed, 106 insertions(+), 12 deletions(-)

diff --git a/src/mlpack/methods/hoeffding_trees/CMakeLists.txt b/src/mlpack/methods/hoeffding_trees/CMakeLists.txt
index 0d06de2..1e583f3 100644
--- a/src/mlpack/methods/hoeffding_trees/CMakeLists.txt
+++ b/src/mlpack/methods/hoeffding_trees/CMakeLists.txt
@@ -8,6 +8,7 @@ set(SOURCES
   hoeffding_numeric_split.hpp
   hoeffding_split.hpp
   hoeffding_split_impl.hpp
+  numeric_split_info.hpp
   streaming_decision_tree.hpp
   streaming_decision_tree_impl.hpp
 )
diff --git a/src/mlpack/methods/hoeffding_trees/hoeffding_categorical_split.hpp b/src/mlpack/methods/hoeffding_trees/hoeffding_categorical_split.hpp
index ce053cf..d41374d 100644
--- a/src/mlpack/methods/hoeffding_trees/hoeffding_categorical_split.hpp
+++ b/src/mlpack/methods/hoeffding_trees/hoeffding_categorical_split.hpp
@@ -49,7 +49,7 @@ class HoeffdingCategoricalSplit
 
   template<typename StreamingDecisionTreeType>
   void CreateChildren(std::vector<StreamingDecisionTreeType>& children,
-                      data::DatasetInfo& datasetInfo,
+                      const data::DatasetInfo& datasetInfo,
                       SplitInfo& splitInfo);
 
   size_t MajorityClass() const;
diff --git a/src/mlpack/methods/hoeffding_trees/hoeffding_categorical_split_impl.hpp b/src/mlpack/methods/hoeffding_trees/hoeffding_categorical_split_impl.hpp
index 599dcf0..b05f8df 100644
--- a/src/mlpack/methods/hoeffding_trees/hoeffding_categorical_split_impl.hpp
+++ b/src/mlpack/methods/hoeffding_trees/hoeffding_categorical_split_impl.hpp
@@ -43,12 +43,13 @@ template<typename FitnessFunction>
 template<typename StreamingDecisionTreeType>
 void HoeffdingCategoricalSplit<FitnessFunction>::CreateChildren(
     std::vector<StreamingDecisionTreeType>& children,
-    data::DatasetInfo& datasetInfo,
+    const data::DatasetInfo& datasetInfo,
     SplitInfo& splitInfo)
 {
   // We'll make one child for each category.
   for (size_t i = 0; i < sufficientStatistics.n_cols; ++i)
-    children.push_back(StreamingDecisionTreeType(datasetInfo));
+    children.push_back(StreamingDecisionTreeType(datasetInfo, 3,
+        sufficientStatistics.n_rows));
 
   // Create the according SplitInfo object.
   splitInfo = SplitInfo(sufficientStatistics.n_cols);
diff --git a/src/mlpack/methods/hoeffding_trees/hoeffding_numeric_split.hpp b/src/mlpack/methods/hoeffding_trees/hoeffding_numeric_split.hpp
index 172d7bc..78a46d9 100644
--- a/src/mlpack/methods/hoeffding_trees/hoeffding_numeric_split.hpp
+++ b/src/mlpack/methods/hoeffding_trees/hoeffding_numeric_split.hpp
@@ -7,6 +7,9 @@
 #ifndef __MLPACK_METHODS_HOEFFDING_TREES_HOEFFDING_NUMERIC_SPLIT_HPP
 #define __MLPACK_METHODS_HOEFFDING_TREES_HOEFFDING_NUMERIC_SPLIT_HPP
 
+#include <mlpack/core.hpp>
+#include "numeric_split_info.hpp"
+
 namespace mlpack {
 namespace tree {
 
@@ -14,7 +17,7 @@ template<typename FitnessFunction>
 class HoeffdingNumericSplit
 {
  public:
-  typedef size_t SplitInfo;
+  typedef NumericSplitInfo SplitInfo;
 
   HoeffdingNumericSplit();
 
@@ -22,6 +25,14 @@ class HoeffdingNumericSplit
   void Train(eT /* value */, const size_t /* label */) { }
 
   double EvaluateFitnessFunction() const { return 0.0; }
+
+  // Does nothing for now.
+  template<typename StreamingDecisionTreeType>
+  void CreateChildren(std::vector<StreamingDecisionTreeType>& children,
+                      const data::DatasetInfo& datasetInfo,
+                      SplitInfo& splitInfo) { } // Nothing to do.
+
+  size_t MajorityClass() const { return 0; } // Nothing yet.
 };
 
 } // namespace tree
diff --git a/src/mlpack/methods/hoeffding_trees/hoeffding_split_impl.hpp b/src/mlpack/methods/hoeffding_trees/hoeffding_split_impl.hpp
index f294adc..609971b 100644
--- a/src/mlpack/methods/hoeffding_trees/hoeffding_split_impl.hpp
+++ b/src/mlpack/methods/hoeffding_trees/hoeffding_split_impl.hpp
@@ -194,12 +194,13 @@ void HoeffdingSplit<
 
   if (datasetInfo.Type(splitDimension) == data::Datatype::numeric)
   {
-    numericSplits[numericSplitIndex + 1].CreateChildren(children, numericSplit);
+    numericSplits[numericSplitIndex + 1].CreateChildren(children, datasetInfo,
+        numericSplit);
   }
   else if (datasetInfo.Type(splitDimension) == data::Datatype::categorical)
   {
     categoricalSplits[categoricalSplitIndex + 1].CreateChildren(children,
-        categoricalSplit);
+        datasetInfo, categoricalSplit);
   }
 }
 
diff --git a/src/mlpack/methods/hoeffding_trees/streaming_decision_tree.hpp b/src/mlpack/methods/hoeffding_trees/streaming_decision_tree.hpp
index dc4a214..3d34738 100644
--- a/src/mlpack/methods/hoeffding_trees/streaming_decision_tree.hpp
+++ b/src/mlpack/methods/hoeffding_trees/streaming_decision_tree.hpp
@@ -21,9 +21,12 @@ class StreamingDecisionTree
  public:
   StreamingDecisionTree(const MatType& data,
                         const data::DatasetInfo& datasetInfo,
-                        const arma::Row<size_t>& labels);
+                        const arma::Row<size_t>& labels,
+                        const size_t numClasses);
 
-  StreamingDecisionTree(const data::DatasetInfo& datasetInfo);
+  StreamingDecisionTree(const data::DatasetInfo& datasetInfo,
+                        const size_t dimensionality,
+                        const size_t numClasses);
 
   StreamingDecisionTree(const StreamingDecisionTree& other);
 
@@ -32,6 +35,8 @@ class StreamingDecisionTree
   const StreamingDecisionTree& Child(const size_t i) const { return children[i];
 }
 
+  const SplitType& Split() const { return split; }
+
   template<typename VecType>
   void Train(const VecType& data, const size_t label);
 
diff --git a/src/mlpack/methods/hoeffding_trees/streaming_decision_tree_impl.hpp b/src/mlpack/methods/hoeffding_trees/streaming_decision_tree_impl.hpp
index f97a4f5..61e3f59 100644
--- a/src/mlpack/methods/hoeffding_trees/streaming_decision_tree_impl.hpp
+++ b/src/mlpack/methods/hoeffding_trees/streaming_decision_tree_impl.hpp
@@ -17,16 +17,19 @@ template<typename SplitType, typename MatType>
 StreamingDecisionTree<SplitType, MatType>::StreamingDecisionTree(
     const MatType& data,
     const data::DatasetInfo& datasetInfo,
-    const arma::Row<size_t>& labels) :
-    split(0, 0, datasetInfo, 0)
+    const arma::Row<size_t>& labels,
+    const size_t numClasses) :
+    split(data.n_rows, numClasses, datasetInfo, 0.95)
 {
   Train(data, labels);
 }
 
 template<typename SplitType, typename MatType>
 StreamingDecisionTree<SplitType, MatType>::StreamingDecisionTree(
-    const data::DatasetInfo& datasetInfo) :
-    split(0, 0, datasetInfo, 0)
+    const data::DatasetInfo& datasetInfo,
+    const size_t dimensionality,
+    const size_t numClasses) :
+    split(dimensionality, numClasses, datasetInfo, 0.95)
 {
   // No training.  Anything else to do...?
 }
diff --git a/src/mlpack/tests/hoeffding_tree_test.cpp b/src/mlpack/tests/hoeffding_tree_test.cpp
index f24c2dc..cfa9e51 100644
--- a/src/mlpack/tests/hoeffding_tree_test.cpp
+++ b/src/mlpack/tests/hoeffding_tree_test.cpp
@@ -347,4 +347,76 @@ BOOST_AUTO_TEST_CASE(HoeffdingSplitAlmostPerfectSplit)
   BOOST_REQUIRE_EQUAL(split.SplitDimension(), 1);
 }
 
+/**
+ * Build a decision tree on a three-dimensional categorical dataset in which
+ * only dimension 1 determines the class (dimensions 0 and 2 are random), and
+ * ensure that it can properly classify all of the training points.
+ */
+BOOST_AUTO_TEST_CASE(StreamingDecisionTreeSimpleDatasetTest)
+{
+  DatasetInfo info;
+  info.MapString("cat0", 0);
+  info.MapString("cat1", 0);
+  info.MapString("cat2", 0);
+  info.MapString("cat3", 0);
+  info.MapString("cat4", 0);
+  info.MapString("cat5", 0);
+  info.MapString("cat6", 0);
+  info.MapString("cat0", 1);
+  info.MapString("cat1", 1);
+  info.MapString("cat2", 1);
+  info.MapString("cat0", 2);
+  info.MapString("cat1", 2);
+
+  // Generate data; the category in dimension 1 alone determines the label.
+  arma::Mat<size_t> dataset(3, 9000);
+  arma::Row<size_t> labels(9000);
+  for (size_t i = 0; i < 9000; i += 3)
+  {
+    dataset(0, i) = mlpack::math::RandInt(7);
+    dataset(1, i) = 0;
+    dataset(2, i) = mlpack::math::RandInt(2);
+    labels(i) = 0;
+
+    dataset(0, i + 1) = mlpack::math::RandInt(7);
+    dataset(1, i + 1) = 2;
+    dataset(2, i + 1) = mlpack::math::RandInt(2);
+    labels(i + 1) = 1;
+
+    dataset(0, i + 2) = mlpack::math::RandInt(7);
+    dataset(1, i + 2) = 1;
+    dataset(2, i + 2) = mlpack::math::RandInt(2);
+    labels(i + 2) = 2;
+  }
+
+  // Now train two streaming decision trees on the three classes; one on the
+  // whole dataset, and one on streaming data.
+  StreamingDecisionTree<HoeffdingSplit<>, arma::Mat<size_t>>
+      batchTree(dataset, info, labels, 3);
+  StreamingDecisionTree<HoeffdingSplit<>, arma::Mat<size_t>>
+      streamTree(info, 3, 3);
+  for (size_t i = 0; i < 9000; ++i)
+    streamTree.Train(dataset.col(i), labels[i]);
+
+  // Each tree should have a single split.
+  BOOST_REQUIRE_EQUAL(batchTree.NumChildren(), 3);
+  BOOST_REQUIRE_EQUAL(streamTree.NumChildren(), 3);
+  BOOST_REQUIRE_EQUAL(batchTree.Split().SplitDimension(), 1);
+  BOOST_REQUIRE_EQUAL(streamTree.Split().SplitDimension(), 1);
+
+  // Now, classify all the points in the dataset.
+  arma::Row<size_t> batchLabels(9000);
+  arma::Row<size_t> streamLabels(9000);
+
+  streamTree.Classify(dataset, streamLabels);
+  for (size_t i = 0; i < 9000; ++i)
+    batchLabels[i] = batchTree.Classify(dataset.col(i));
+
+  for (size_t i = 0; i < 9000; ++i)
+  {
+    BOOST_REQUIRE_EQUAL(labels[i], streamLabels[i]);
+    BOOST_REQUIRE_EQUAL(labels[i], batchLabels[i]);
+  }
+}
+
 BOOST_AUTO_TEST_SUITE_END();



More information about the mlpack-git mailing list