[mlpack-git] master: Implement numeric feature support (though stupidly). (1f936fd)

gitdub at big.cc.gt.atl.ga.us gitdub at big.cc.gt.atl.ga.us
Wed Dec 23 11:42:48 EST 2015


Repository : https://github.com/mlpack/mlpack

On branch  : master
Link       : https://github.com/mlpack/mlpack/compare/de9cc4b05069e1fa4793d9355f2f595af5ff45d2...6070527af14296cd99739de6c62666cc5d2a2125

>---------------------------------------------------------------

commit 1f936fd9a0c35cfca31de53842d950281ed335e9
Author: ryan <ryan at ratml.org>
Date:   Thu Sep 24 16:07:27 2015 -0400

    Implement numeric feature support (though stupidly).
    
    Testing is not yet completed.


>---------------------------------------------------------------

1f936fd9a0c35cfca31de53842d950281ed335e9
 src/mlpack/methods/hoeffding_trees/CMakeLists.txt  |   1 +
 .../hoeffding_trees/hoeffding_numeric_split.hpp    |  62 +++++++--
 .../hoeffding_numeric_split_impl.hpp               | 141 +++++++++++++++++++++
 .../hoeffding_trees/hoeffding_split_impl.hpp       |   1 +
 .../methods/hoeffding_trees/numeric_split_info.hpp |  19 ++-
 src/mlpack/tests/hoeffding_tree_test.cpp           |  37 +++++-
 6 files changed, 247 insertions(+), 14 deletions(-)

diff --git a/src/mlpack/methods/hoeffding_trees/CMakeLists.txt b/src/mlpack/methods/hoeffding_trees/CMakeLists.txt
index 1e583f3..b6e6258 100644
--- a/src/mlpack/methods/hoeffding_trees/CMakeLists.txt
+++ b/src/mlpack/methods/hoeffding_trees/CMakeLists.txt
@@ -6,6 +6,7 @@ set(SOURCES
   hoeffding_categorical_split.hpp
   hoeffding_categorical_split_impl.hpp
   hoeffding_numeric_split.hpp
+  hoeffding_numeric_split_impl.hpp
   hoeffding_split.hpp
   hoeffding_split_impl.hpp
   numeric_split_info.hpp
diff --git a/src/mlpack/methods/hoeffding_trees/hoeffding_numeric_split.hpp b/src/mlpack/methods/hoeffding_trees/hoeffding_numeric_split.hpp
index bda7744..5b3df29 100644
--- a/src/mlpack/methods/hoeffding_trees/hoeffding_numeric_split.hpp
+++ b/src/mlpack/methods/hoeffding_trees/hoeffding_numeric_split.hpp
@@ -2,7 +2,9 @@
  * @file hoeffding_numeric_split.hpp
  * @author Ryan Curtin
  *
- * A numeric feature split for Hoeffding trees.  At the moment it does nothing.
+ * A numeric feature split for Hoeffding trees.  This is a very simple
+ * implementation based on a minor note in the paper "Mining High-Speed Data
+ * Streams" by Pedro Domingos and Geoff Hulten.
  */
 #ifndef __MLPACK_METHODS_HOEFFDING_TREES_HOEFFDING_NUMERIC_SPLIT_HPP
 #define __MLPACK_METHODS_HOEFFDING_TREES_HOEFFDING_NUMERIC_SPLIT_HPP
@@ -13,30 +15,72 @@
 namespace mlpack {
 namespace tree {
 
-template<typename FitnessFunction>
+/**
+ * The HoeffdingNumericSplit class implements the numeric feature splitting
+ * strategy alluded to by Domingos and Hulten in the following paper:
+ *
+ * @code
+ * @inproceedings{domingos2000mining,
+ *   title={{Mining High-Speed Data Streams}},
+ *   author={Domingos, P. and Hulten, G.},
+ *   year={2000},
+ *   booktitle={Proceedings of the Sixth ACM SIGKDD International Conference on
+ *       Knowledge Discovery and Data Mining (KDD '00)},
+ *   pages={71--80}
+ * }
+ * @endcode
+ *
+ * The strategy alluded to is very simple: we discretize the numeric features
+ * that we see.  But in this case, we don't know how many bins we have, which
+ * makes things a little difficult.  This class only makes binary splits, and
+ * has a maximum number of bins.  The binning strategy is simple: the split
+ * caches the minimum and maximum value of points seen so far, and when the
+ * number of points hits a predefined threshold, the cached minimum-maximum
+ * range is equally split into bins, and splitting proceeds in the same way as
+ * with the categorical splits.  This is a simple and stupid strategy, so don't
+ * expect it to be the best possible thing you can do.
+ */
+template<typename FitnessFunction,
+         typename ObservationType = double>
 class HoeffdingNumericSplit
 {
  public:
-  typedef NumericSplitInfo SplitInfo;
+  typedef NumericSplitInfo<ObservationType> SplitInfo;
 
-  HoeffdingNumericSplit();
+  HoeffdingNumericSplit(const size_t numClasses,
+                        const size_t bins = 10,
+                        const size_t observationsBeforeBinning = 100);
 
-  template<typename eT>
-  void Train(eT /* value */, const size_t /* label */) { }
+  void Train(ObservationType value, const size_t label);
 
-  double EvaluateFitnessFunction() const { return 0.0; }
+  double EvaluateFitnessFunction() const;
 
   // Does nothing for now.
   template<typename StreamingDecisionTreeType>
   void CreateChildren(std::vector<StreamingDecisionTreeType>& children,
                       const data::DatasetInfo& datasetInfo,
                       const size_t dimensionality,
-                      SplitInfo& splitInfo) { } // Nothing to do.
+                      SplitInfo& splitInfo);
 
-  size_t MajorityClass() const { return 0; } // Nothing yet.
+  size_t MajorityClass() const;
+
+ private:
+  // Cache the values of the points seen before we make bins.
+  arma::Col<ObservationType> observations;
+  arma::Col<size_t> labels;
+
+  arma::Col<ObservationType> splitPoints;
+  size_t bins;
+  size_t observationsBeforeBinning;
+  size_t samplesSeen;
+
+  arma::Mat<size_t> sufficientStatistics;
 };
 
 } // namespace tree
 } // namespace mlpack
 
+// Include implementation.
+#include "hoeffding_numeric_split_impl.hpp"
+
 #endif
diff --git a/src/mlpack/methods/hoeffding_trees/hoeffding_numeric_split_impl.hpp b/src/mlpack/methods/hoeffding_trees/hoeffding_numeric_split_impl.hpp
new file mode 100644
index 0000000..7d6de50
--- /dev/null
+++ b/src/mlpack/methods/hoeffding_trees/hoeffding_numeric_split_impl.hpp
@@ -0,0 +1,141 @@
+/**
+ * @file hoeffding_numeric_split_impl.hpp
+ * @author Ryan Curtin
+ *
+ * An implementation of the simple HoeffdingNumericSplit class.
+ */
+#ifndef __MLPACK_METHODS_HOEFFDING_TREES_HOEFFDING_NUMERIC_SPLIT_IMPL_HPP
+#define __MLPACK_METHODS_HOEFFDING_TREES_HOEFFDING_NUMERIC_SPLIT_IMPL_HPP
+
+#include "hoeffding_numeric_split.hpp"
+
+namespace mlpack {
+namespace tree {
+
+template<typename FitnessFunction, typename ObservationType>
+HoeffdingNumericSplit<FitnessFunction, ObservationType>::HoeffdingNumericSplit(
+    const size_t numClasses,
+    const size_t bins,
+    const size_t observationsBeforeBinning) :
+    observations(observationsBeforeBinning - 1),
+    labels(observationsBeforeBinning - 1),
+    splitPoints(bins),
+    bins(bins),
+    observationsBeforeBinning(observationsBeforeBinning),
+    samplesSeen(0),
+    sufficientStatistics(arma::zeros<arma::Mat<size_t>>(numClasses, bins))
+{
+  // Nothing to do.
+}
+
+template<typename FitnessFunction, typename ObservationType>
+void HoeffdingNumericSplit<FitnessFunction, ObservationType>::Train(
+    ObservationType value,
+    const size_t label)
+{
+  if (samplesSeen < observationsBeforeBinning - 1)
+  {
+    // Add this to the samples we have seen.
+    observations[samplesSeen] = value;
+    labels[samplesSeen] = label;
+    ++samplesSeen;
+    return;
+  }
+  else if (samplesSeen == observationsBeforeBinning - 1)
+  {
+    // Now we need to make the bins.
+    ObservationType min = value;
+    ObservationType max = value;
+    for (size_t i = 0; i < observationsBeforeBinning - 1; ++i)
+    {
+      if (observations[i] < min)
+        min = observations[i];
+      else if (observations[i] > max)
+        max = observations[i];
+    }
+
+    // Now split these.
+    splitPoints = arma::linspace<arma::Col<ObservationType>>(min, max, bins);
+    ++samplesSeen;
+
+    // Now, add all of the points we've seen to the sufficient statistics.
+    for (size_t i = 0; i < observationsBeforeBinning - 1; ++i)
+    {
+      // What bin does the point fall into?
+      size_t bin = 0;
+      while (observations[i] > splitPoints[bin] && bin < bins - 1)
+        ++bin;
+
+      sufficientStatistics(labels[i], bin)++;
+    }
+  }
+
+  // If we've gotten to here, then we need to add the point to the sufficient
+  // statistics.  What bin does the point fall into?
+  size_t bin = 0;
+  while (value > splitPoints[bin] && bin < bins - 1)
+    ++bin;
+
+  sufficientStatistics(label, bin)++;
+}
+
+template<typename FitnessFunction, typename ObservationType>
+double HoeffdingNumericSplit<FitnessFunction, ObservationType>::
+    EvaluateFitnessFunction() const
+{
+  if (samplesSeen < observationsBeforeBinning)
+    return 0.0;
+  else
+    return FitnessFunction::Evaluate(sufficientStatistics);
+}
+
+template<typename FitnessFunction, typename ObservationType>
+template<typename StreamingDecisionTreeType>
+void HoeffdingNumericSplit<FitnessFunction, ObservationType>::CreateChildren(
+    std::vector<StreamingDecisionTreeType>& children,
+    const data::DatasetInfo& datasetInfo,
+    const size_t dimensionality,
+    SplitInfo& splitInfo)
+{
+  // We'll make one child for each bin.
+  for (size_t i = 0; i < sufficientStatistics.n_cols; ++i)
+    children.push_back(StreamingDecisionTreeType(datasetInfo, dimensionality,
+        sufficientStatistics.n_rows));
+
+  // Create the SplitInfo object.
+  splitInfo = SplitInfo(splitPoints);
+}
+
+template<typename FitnessFunction, typename ObservationType>
+size_t HoeffdingNumericSplit<FitnessFunction, ObservationType>::
+    MajorityClass() const
+{
+  // If we haven't yet determined the bins, we must calculate this by hand.
+  if (samplesSeen < observationsBeforeBinning)
+  {
+    arma::Col<size_t> classes(sufficientStatistics.n_rows);
+    classes.zeros();
+
+    for (size_t i = 0; i < samplesSeen; ++i)
+      classes[labels[i]]++;
+
+    arma::uword majorityClass;
+    classes.max(majorityClass);
+    return size_t(majorityClass);
+  }
+  else
+  {
+    // We've calculated the bins, so we can just sum over the sufficient
+    // statistics.
+    arma::Col<size_t> classCounts = sum(sufficientStatistics, 1);
+
+    arma::uword maxIndex;
+    classCounts.max(maxIndex);
+    return size_t(maxIndex);
+  }
+}
+
+} // namespace tree
+} // namespace mlpack
+
+#endif
diff --git a/src/mlpack/methods/hoeffding_trees/hoeffding_split_impl.hpp b/src/mlpack/methods/hoeffding_trees/hoeffding_split_impl.hpp
index 6e98760..2b536f3 100644
--- a/src/mlpack/methods/hoeffding_trees/hoeffding_split_impl.hpp
+++ b/src/mlpack/methods/hoeffding_trees/hoeffding_split_impl.hpp
@@ -27,6 +27,7 @@ HoeffdingSplit<
     datasetInfo(datasetInfo),
     successProbability(successProbability),
     splitDimension(size_t(-1)),
+    numericSplit(),
     categoricalSplit(0)
 {
   for (size_t i = 0; i < dimensionality; ++i)
diff --git a/src/mlpack/methods/hoeffding_trees/numeric_split_info.hpp b/src/mlpack/methods/hoeffding_trees/numeric_split_info.hpp
index b048b4b..c7f2c12 100644
--- a/src/mlpack/methods/hoeffding_trees/numeric_split_info.hpp
+++ b/src/mlpack/methods/hoeffding_trees/numeric_split_info.hpp
@@ -12,14 +12,27 @@
 namespace mlpack {
 namespace tree {
 
-// This doesn't do anything yet.
+template<typename ObservationType = double>
 class NumericSplitInfo
 {
  public:
-  NumericSplitInfo() { }
+  NumericSplitInfo() { /* Nothing to do. */ }
+  NumericSplitInfo(arma::Col<ObservationType>& splitPoints) :
+      splitPoints(splitPoints) { /* Nothing to do. */ }
 
   template<typename eT>
-  static size_t CalculateDirection(const eT& /* value */) { return 0; }
+  size_t CalculateDirection(const eT& value) const
+  {
+    // What bin does the point fall into?
+    size_t bin = 0;
+    while (value > splitPoints[bin] && bin < splitPoints.n_elem - 1)
+      ++bin;
+
+    return bin;
+  }
+
+ private:
+  arma::Col<size_t> splitPoints;
 };
 
 } // namespace tree
diff --git a/src/mlpack/tests/hoeffding_tree_test.cpp b/src/mlpack/tests/hoeffding_tree_test.cpp
index 0bab98d..5cbb2d0 100644
--- a/src/mlpack/tests/hoeffding_tree_test.cpp
+++ b/src/mlpack/tests/hoeffding_tree_test.cpp
@@ -417,9 +417,11 @@ BOOST_AUTO_TEST_CASE(StreamingDecisionTreeSimpleDatasetTest)
 
   // Now train two streaming decision trees; one on the whole dataset, and one
   // on streaming data.
-  StreamingDecisionTree<HoeffdingSplit<>, arma::Mat<size_t>>
+  StreamingDecisionTree<HoeffdingSplit<GiniImpurity,
+      HoeffdingNumericSplit<GiniImpurity, size_t>>, arma::Mat<size_t>>
       batchTree(dataset, info, labels, 3);
-  StreamingDecisionTree<HoeffdingSplit<>, arma::Mat<size_t>>
+  StreamingDecisionTree<HoeffdingSplit<GiniImpurity,
+      HoeffdingNumericSplit<GiniImpurity, size_t>>, arma::Mat<size_t>>
       streamTree(info, 3, 3);
   for (size_t i = 0; i < 9000; ++i)
     streamTree.Train(dataset.col(i), labels[i]);
@@ -445,4 +447,35 @@ BOOST_AUTO_TEST_CASE(StreamingDecisionTreeSimpleDatasetTest)
   }
 }
 
+/**
+ * Test that the HoeffdingNumericSplit class has a fitness function value of 0
+ * before it's seen enough points.
+ */
+BOOST_AUTO_TEST_CASE(HoeffdingNumericSplitFitnessFunctionTest)
+{
+  HoeffdingNumericSplit<GiniImpurity> split(5, 10, 100);
+
+  // The first 99 iterations should not calculate anything.  The 100th is where
+  // the counting starts.
+  for (size_t i = 0; i < 99; ++i)
+  {
+    split.Train(mlpack::math::Random(), mlpack::math::RandInt(5));
+    BOOST_REQUIRE_SMALL(split.EvaluateFitnessFunction(), 1e-10);
+  }
+}
+
+/**
+ * Make sure the majority class is correct in the samples before binning.
+ */
+BOOST_AUTO_TEST_CASE(HoeffdingNumericSplitPreBinningMajorityClassTest)
+{
+  HoeffdingNumericSplit<GiniImpurity> split(3, 10, 100);
+
+  for (size_t i = 0; i < 100; ++i)
+  {
+    split.Train(mlpack::math::Random(), 1);
+    BOOST_REQUIRE_EQUAL(split.MajorityClass(), 1);
+  }
+}
+
 BOOST_AUTO_TEST_SUITE_END();



More information about the mlpack-git mailing list