[mlpack-git] master: A pass at VFDT implementation. (0a6b3a2)

Wed Dec 23 11:42:01 EST 2015

Repository : https://github.com/mlpack/mlpack

On branch  : master
Link       : https://github.com/mlpack/mlpack/compare/de9cc4b05069e1fa4793d9355f2f595af5ff45d2...6070527af14296cd99739de6c62666cc5d2a2125

>---------------------------------------------------------------

commit 0a6b3a210ccc997a5d44ba91888faec2d0d8190d
Author: Ryan Curtin <ryan at ratml.org>
Date:   Mon Sep 21 12:30:59 2015 +0000

    A pass at VFDT implementation.


>---------------------------------------------------------------

0a6b3a210ccc997a5d44ba91888faec2d0d8190d
 .../CMakeLists.txt                                 |  15 +-
 .../hoeffding_categorical_split.hpp                |  18 +-
 .../hoeffding_categorical_split_impl.hpp           |  70 ++++++++
 .../hoeffding_trees/hoeffding_numeric_split.hpp    |  28 +++
 .../methods/hoeffding_trees/hoeffding_split.hpp    |  30 +++-
 .../hoeffding_trees/hoeffding_split_impl.hpp       | 200 +++++++++++++++++++++
 .../hoeffding_trees/streaming_decision_tree.hpp    |  34 +---
 .../streaming_decision_tree_impl.hpp               | 100 +++++++++++
 8 files changed, 457 insertions(+), 38 deletions(-)

diff --git a/src/mlpack/methods/amf/termination_policies/CMakeLists.txt b/src/mlpack/methods/hoeffding_trees/CMakeLists.txt
similarity index 61%
copy from src/mlpack/methods/amf/termination_policies/CMakeLists.txt
copy to src/mlpack/methods/hoeffding_trees/CMakeLists.txt
index e9aca7a..0d06de2 100644
--- a/src/mlpack/methods/amf/termination_policies/CMakeLists.txt
+++ b/src/mlpack/methods/hoeffding_trees/CMakeLists.txt
@@ -1,12 +1,15 @@
 # Define the files we need to compile
 # Anything not in this list will not be compiled into MLPACK.
 set(SOURCES
-  simple_residue_termination.hpp
-  simple_tolerance_termination.hpp
-  validation_rmse_termination.hpp
-  incomplete_incremental_termination.hpp
-  complete_incremental_termination.hpp
-  max_iteration_termination.hpp
+  categorical_split_info.hpp
+  gini_impurity.hpp
+  hoeffding_categorical_split.hpp
+  hoeffding_categorical_split_impl.hpp
+  hoeffding_numeric_split.hpp
+  hoeffding_split.hpp
+  hoeffding_split_impl.hpp
+  streaming_decision_tree.hpp
+  streaming_decision_tree_impl.hpp
 )
 
 # Add directory name to sources.
diff --git a/src/mlpack/methods/hoeffding_trees/hoeffding_categorical_split.hpp b/src/mlpack/methods/hoeffding_trees/hoeffding_categorical_split.hpp
index e870445..14da600 100644
--- a/src/mlpack/methods/hoeffding_trees/hoeffding_categorical_split.hpp
+++ b/src/mlpack/methods/hoeffding_trees/hoeffding_categorical_split.hpp
@@ -8,6 +8,9 @@
 #ifndef __MLPACK_METHODS_HOEFFDING_TREES_HOEFFDING_CATEGORICAL_SPLIT_HPP
 #define __MLPACK_METHODS_HOEFFDING_TREES_HOEFFDING_CATEGORICAL_SPLIT_HPP
 
+#include <mlpack/core.hpp>
+#include "categorical_split_info.hpp"
+
 namespace mlpack {
 namespace tree {
 
@@ -34,12 +37,22 @@ template<typename FitnessFunction>
 class HoeffdingCategoricalSplit
 {
  public:
-  HoeffdingCategoricalSplit(const size_t numCategories, const size_t numClasses);
+  typedef CategoricalSplitInfo SplitInfo;
+
+  HoeffdingCategoricalSplit(const size_t numCategories,
+                            const size_t numClasses);
 
   template<typename eT>
   void Train(eT value, const size_t label);
 
   double EvaluateFitnessFunction() const;
+
+  template<typename StreamingDecisionTreeType>
+  void CreateChildren(std::vector<StreamingDecisionTreeType*>& children,
+                      SplitInfo& splitInfo);
+
+  size_t MajorityClass() const;
+
  private:
   arma::Mat<size_t> sufficientStatistics;
 };
@@ -47,4 +60,7 @@ class HoeffdingCategoricalSplit
 } // namespace tree
 } // namespace mlpack
 
+// Include implementation.
+#include "hoeffding_categorical_split_impl.hpp"
+
 #endif
diff --git a/src/mlpack/methods/hoeffding_trees/hoeffding_categorical_split_impl.hpp b/src/mlpack/methods/hoeffding_trees/hoeffding_categorical_split_impl.hpp
new file mode 100644
index 0000000..4faf88a
--- /dev/null
+++ b/src/mlpack/methods/hoeffding_trees/hoeffding_categorical_split_impl.hpp
@@ -0,0 +1,70 @@
+/**
+ * @file hoeffding_categorical_split_impl.hpp
+ * @author Ryan Curtin
+ *
+ * Implemental of the HoeffdingCategoricalSplit class.
+ */
+#ifndef __MLPACK_METHODS_HOEFFDING_TREES_HOEFFDING_CATEGORICAL_SPLIT_IMPL_HPP
+#define __MLPACK_METHODS_HOEFFDING_TREES_HOEFFDING_CATEGORICAL_SPLIT_IMPL_HPP
+
+// In case it hasn't been included yet.
+#include "hoeffding_categorical_split.hpp"
+
+namespace mlpack {
+namespace tree {
+
+template<typename FitnessFunction>
+HoeffdingCategoricalSplit<FitnessFunction>::HoeffdingCategoricalSplit(
+    const size_t numCategories,
+    const size_t numClasses) :
+    sufficientStatistics(numClasses, numCategories)
+{
+  sufficientStatistics.zeros();
+}
+
+template<typename eT>
+template<typename FitnessFunction>
+void HoeffdingCategoricalSplit<FitnessFunction>::Train(eT value,
+                                                       const size_t label)
+{
+  // Add this to our counts.
+  // 'value' should be categorical, so we should be able to cast to size_t...
+  sufficientStatistics(label, size_t(value))++;
+}
+
+template<typename FitnessFunction>
+double HoeffdingCategoricalSplit<FitnessFunction>::EvaluateFitnessFunction()
+    const
+{
+  return FitnessFunction::Evaluate(sufficientStatistics);
+}
+
+template<typename StreamingDecisionTreeType>
+template<typename FitnessFunction>
+void HoeffdingCategoricalSplit<FitnessFunction>::CreateChildren(
+    std::vector<StreamingDecisionTreeType*>& children,
+    SplitInfo& splitInfo)
+{
+  // We'll make one child for each category.
+  children.push_back(StreamingDecisionTree(datasetInfo));
+  // Create the according SplitInfo object.
+  splitInfo = SplitInfo(sufficientStatistics.n_cols);
+}
+
+template<typename StreamingDecisionTreeType>
+template<typename FitnessFunction>
+size_t HoeffdingCategoricalSplit<FitnessFunction>::MajorityClass() const
+{
+  // Calculate the class that we have seen the most of.
+  arma::Row<size_t> classCounts = sum(sufficientStatistics);
+
+  arma::uword maxIndex;
+  classCounts.max(maxIndex);
+
+  return size_t(maxIndex);
+}
+
+} // namespace tree
+} // namespace mlpack
+
+#endif
diff --git a/src/mlpack/methods/hoeffding_trees/hoeffding_numeric_split.hpp b/src/mlpack/methods/hoeffding_trees/hoeffding_numeric_split.hpp
new file mode 100644
index 0000000..ce53128
--- /dev/null
+++ b/src/mlpack/methods/hoeffding_trees/hoeffding_numeric_split.hpp
@@ -0,0 +1,28 @@
+/**
+ * @file hoeffding_numeric_split.hpp
+ * @author Ryan Curtin
+ *
+ * A numeric feature split for Hoeffding trees.  At the moment it does nothing.
+ */
+#ifndef __MLPACK_METHODS_HOEFFDING_TREES_HOEFFDING_NUMERIC_SPLIT_HPP
+#define __MLPACK_METHODS_HOEFFDING_TREES_HOEFFDING_NUMERIC_SPLIT_HPP
+
+namespace mlpack {
+namespace tree {
+
+template<typename FitnessFunction>
+class HoeffdingNumericSplit
+{
+ public:
+  HoeffdingNumericSplit();
+
+  template<typename eT>
+  void Train(eT /* value */, const size_t /* label */) { }
+
+  double EvaluateFitnessFunction() const { return 0.0; }
+};
+
+} // namespace tree
+} // namespace mlpack
+
+#endif
diff --git a/src/mlpack/methods/hoeffding_trees/hoeffding_split.hpp b/src/mlpack/methods/hoeffding_trees/hoeffding_split.hpp
index fa18d09..1db2847 100644
--- a/src/mlpack/methods/hoeffding_trees/hoeffding_split.hpp
+++ b/src/mlpack/methods/hoeffding_trees/hoeffding_split.hpp
@@ -8,28 +8,41 @@
 #ifndef __MLPACK_METHODS_HOEFFDING_TREES_HOEFFDING_SPLIT_HPP
 #define __MLPACK_METHODS_HOEFFDING_TREES_HOEFFDING_SPLIT_HPP
 
+#include <mlpack/core.hpp>
+#include "gini_impurity.hpp"
+#include "hoeffding_numeric_split.hpp"
+#include "hoeffding_categorical_split.hpp"
+
 namespace mlpack {
 namespace tree {
 
-template<typename FitnessFunction,
-         typename NumericSplitType,
-         typename CategoricalSplitType>
+template<typename FitnessFunction = GiniImpurity,
+         typename NumericSplitType = HoeffdingNumericSplit,
+         typename CategoricalSplitType = HoeffdingCategoricalSplit>
 class HoeffdingSplit
 {
  public:
   HoeffdingSplit(const size_t dimensionality,
                  const size_t numClasses,
-                 const DatasetInfo& datasetInfo);
+                 const DatasetInfo& datasetInfo,
+                 const double successProbability);
 
   template<typename VecType>
-  void Train(VecType& point, const size_t label);
+  void Train(const VecType& point, const size_t label);
 
   // 0 if split should not happen; number of splits otherwise.
   size_t SplitCheck() const;
 
   // Return index that we should go towards.
   template<typename VecType>
-  size_t CalculateDirection(VecType& point) const;
+  size_t CalculateDirection(const VecType& point) const;
+
+  // Classify the point according to the statistics in this node.
+  template<typename VecType>
+  size_t Classify(const VecType& point) const;
+
+  template<typename StreamingDecisionTreeType>
+  void CreateChildren(std::vector<StreamingDecisionTreeType>& children);
 
  private:
   // We need to keep some information for before we have split.
@@ -37,9 +50,12 @@ class HoeffdingSplit
   std::vector<CategoricalSplitType> categoricalSplits;
 
   const DatasetInfo& datasetInfo;
+  double successProbability;
+  size_t numSamples;
 
   // And we need to keep some information for after we have split.
   size_t splitDimension;
+  size_t majorityClass;
   typename CategoricalSplitType::SplitInfo categoricalSplit; // In case it's categorical.
   typename NumericSplitType::SplitInfo numericSplit; // In case it's numeric.
 };
@@ -47,4 +63,6 @@ class HoeffdingSplit
 } // namespace tree
 } // namespace mlpack
 
+#include "hoeffding_split_impl.hpp"
+
 #endif
diff --git a/src/mlpack/methods/hoeffding_trees/hoeffding_split_impl.hpp b/src/mlpack/methods/hoeffding_trees/hoeffding_split_impl.hpp
new file mode 100644
index 0000000..62499c29
--- /dev/null
+++ b/src/mlpack/methods/hoeffding_trees/hoeffding_split_impl.hpp
@@ -0,0 +1,200 @@
+/**
+ * @file hoeffding_split_impl.hpp
+ * @author Ryan Curtin
+ *
+ * Implementation of the HoeffdingSplit class.
+ */
+#ifndef __MLPACK_METHODS_HOEFFDING_TREES_HOEFFDING_SPLIT_IMPL_HPP
+#define __MLPACK_METHODS_HOEFFDING_TREES_HOEFFDING_SPLIT_IMPL_HPP
+
+namespace mlpack {
+namespace tree {
+
+template<typename FitnessFunction,
+         typename NumericSplitType,
+         typename CategoricalSplitType>
+HoeffdingSplit<
+    FitnessFunction,
+    NumericSplitType,
+    CategoricalSplitType
+>::HoeffdingSplit(const size_t dimensionality,
+                  const size_t numClasses,
+                  const DatasetInfo& datasetInfo)
+{
+  for (size_t i = 0; i < dimensionality; ++i)
+  {
+    if (datasetInfo.Type(i) == Datatype.categorical)
+      categoricalSplits.push_back(
+          CategoricalSplitType(datasetInfo.NumMappings(), numClasses));
+    // else, numeric splits (not yet!)
+  }
+}
+
+template<typename VecType>
+template<typename FitnessFunction,
+         typename NumericSplitType,
+         typename CategoricalSplitType>
+void HoeffdingSplit<
+    FitnessFunction,
+    NumericSplitType,
+    CategoricalSplitType
+>::Train(VecType& point, const size_t label)
+{
+  if (splitDimension == size_t(-1))
+  {
+    ++numSamples;
+    size_t numericIndex = 0;
+    size_t categoricalIndex = 0;
+    for (size_t i = 0; i < point.n_rows; ++i)
+    {
+      if (datasetInfo.Type(i) == Datatype.categorical)
+        categoricalSplits[categoricalIndex++].Train(point[i], label);
+      else if (datasetInfo.Type(i) == Datatype.numeric)
+        numericSplits[numericIndex++].Train(point[i], label);
+    }
+  }
+  else
+  {
+    // Already split.
+    // But we should probably pass it down anyway.
+  }
+}
+
+template<typename FitnessFunction,
+         typename NumericSplitType,
+         typename CategoricalSplitType>
+size_t HoeffdingSplit<
+    FitnessFunction,
+    NumericSplitType,
+    CategoricalSplitType
+>::SplitCheck() const
+{
+  // Do nothing if we've already split.
+  if (splitDimension == size_t(-1))
+    return 0;
+
+  // Check the fitness of each dimension.  Then we'll use a Hoeffding bound
+  // somehow.
+
+  // Calculate epsilon, the value we need things to be greater than.
+  const double rSquared = std::pow(FitnessFunction::Range(numClasses), 2.0);
+  const double epsilon = std::sqrt(rSquared *
+      std::log(1.0 / successProbability) / (2 * numSamples));
+
+  arma::vec gains(categoricalSplits.size() + numericSplits.size());
+  for (size_t i = 0; i < categoricalSplits.size(); ++i)
+    gains[i] = categoricalSplits[i].EvaluateFitnessFunction();
+
+  // Now find the largest and second-largest.
+  double largest = -DBL_MAX;
+  size_t largestIndex = 0;
+  double secondLargest = -DBL_MAX;
+  size_t secondLargestIndex = 0;
+  for (size_t i = 0; i < gains.n_elem; ++i)
+  {
+    if (gains[i] > largest)
+    {
+      secondLargest = largest;
+      secondLargestIndex = largestIndex;
+      largest = gains[i];
+      largestIndex = i;
+    }
+    else if (gains[i] > secondLargest)
+    {
+      secondLargest = gains[i];
+      secondLargestIndex = i;
+    }
+  }
+
+  // Are these far enough apart to split?
+  if (largest - secondLargest > epsilon)
+  {
+    // Split!
+    splitDimension = largestIndex;
+    if (datasetInfo[largestIndex].Type == Datatype.categorical)
+    {
+      // I don't know if this should be here.
+      majorityClass = categoricalSplit[largestIndex].MajorityClass();
+      return datasetInfo[largestIndex].NumMappings();
+    }
+    else
+    {
+      majorityClass = 0;
+      return 0; // I have no idea what to do yet.
+    }
+  }
+}
+
+template<typename VecType>
+template<
+    typename FitnessFunction,
+    typename NumericSplitType,
+    typename CategoricalSplitType
+>
+size_t HoeffdingSplit<
+    FitnessFunction,
+    NumericSplitType,
+    CategoricalSplitType
+>::CalculateDirection(VecType& point) const
+{
+  // Don't call this before the node is split...
+  if (datasetInfo.Type(splitDimension) == Datatype::numeric)
+    return numericSplit.CalculateDirection(point[splitDimension]);
+  else if (datasetInfo.Type(splitDimension) == Datatype::categorical)
+    return categoricalSplit.CalculateDirection(point[splitDimension]);
+}
+
+template<typename VecType>
+template<
+    typename FitnessFunction,
+    typename NumericSplitType,
+    typename CategoricalSplitType
+>
+void HoeffdingSplit<
+    FitnessFunction,
+    NumericSplitType,
+    CategoricalSplitType
+>::Classify(const VecType& point) const
+{
+  // We're a leaf (or being considered a leaf), so classify based on what we
+  // know.
+  return majorityClass;
+}
+
+template<
+    typename FitnessFunction,
+    typename NumericSplitType,
+    typename CategoricalSplitType
+>
+void HoeffdingSplit<
+    FitnessFunction,
+    NumericSplitType,
+    CategoricalSplitType
+>::CreateChildren(std::vector<StreamingDecisionTreeType>& children)
+{
+  // We already know what the splitDimension will be.
+  size_t numericSplitIndex = 0;
+  size_t categoricalSplitIndex = 0;
+  for (size_t i = 0; i < splitDimension; ++i)
+  {
+    if (datasetInfo.Type(i) == Datatype::numeric)
+      ++numericSplitIndex;
+    if (datasetInfo.Type(i) == Datatype::categorical)
+      ++categoricalSplitIndex;
+  }
+
+  if (datasetInfo.Type(splitDimension) == Datatype::numeric)
+  {
+    numericSplits[numericSplitIndex + 1].CreateChildren(children, numericSplit);
+  }
+  else if (datasetInfo.Type(splitDimension) == Datatype::categorical)
+  {
+    categoricalSplits[categoricalSplitIndex + 1].CreateChildren(children,
+        categoricalSplit);
+  }
+}
+
+} // namespace tree
+} // namespace mlpack
+
+#endif
diff --git a/src/mlpack/methods/hoeffding_trees/streaming_decision_tree.hpp b/src/mlpack/methods/hoeffding_trees/streaming_decision_tree.hpp
index fe8bc8e..af2a160 100644
--- a/src/mlpack/methods/hoeffding_trees/streaming_decision_tree.hpp
+++ b/src/mlpack/methods/hoeffding_trees/streaming_decision_tree.hpp
@@ -19,17 +19,17 @@ template<
 class StreamingDecisionTree
 {
  public:
-  StreamingDecisionTree(const MatType& data, const arma::Row<size_t>& labels);
+  StreamingDecisionTree(const MatType& data,
+                        const data::DatasetInfo& datasetInfo,
+                        const arma::Row<size_t>& labels);
 
-  StreamingDecisionTree();
+  StreamingDecisionTree(const data::DatasetInfo& datasetInfo);
 
   StreamingDecisionTree(const StreamingDecisionTree& other);
 
-  ~StreamingDecisionTree();
-
   size_t NumChildren() const { return children.size(); }
-  StreamingDecisionTree* Child(const size_t i) { return children[i]; }
-  const StreamingDecisionTree* Child(const size_t i) const { return children[i];
+  StreamingDecisionTree& Child(const size_t i) { return children[i]; }
+  const StreamingDecisionTree& Child(const size_t i) const { return children[i];
 }
 
   template<typename VecType>
@@ -47,25 +47,9 @@ class StreamingDecisionTree
   // that's just a split dimension and a rule (categorical or numeric)
 
  private:
-  std::vector<StreamingDecisionTree*> children;
-
-  DatasetInfo info;
-  size_t splitDimension;
-  NumericSplitType* numericSplit;
-  CategoricalSplitType* categoricalSplit;
-
-  SplitType split; // hide it in the split?
-  // split must provide Dimension() and
-  //
-  // template<typename VecType>
-  // StreamingDecisionTree* MakeDecision(const VecType& point);
-  //
-  // template<typename VecType>
-  // void Train(const VecType& data, const size_t label);
-  //
-  // Datatype SplitType() const;
-  //
-  // 
+  std::vector<StreamingDecisionTree> children;
+
+  SplitType split;
 };
 
 } // namespace tree
diff --git a/src/mlpack/methods/hoeffding_trees/streaming_decision_tree_impl.hpp b/src/mlpack/methods/hoeffding_trees/streaming_decision_tree_impl.hpp
new file mode 100644
index 0000000..b40a502
--- /dev/null
+++ b/src/mlpack/methods/hoeffding_trees/streaming_decision_tree_impl.hpp
@@ -0,0 +1,100 @@
+/**
+ * @file streaming_decision_tree_impl.hpp
+ * @author Ryan Curtin
+ *
+ * Implementation of a streaming decision tree.
+ */
+#ifndef __MLPACK_METHODS_HOEFFDING_TREES_STREAMING_DECISION_TREE_IMPL_HPP
+#define __MLPACK_METHODS_HOEFFDING_TREES_STREAMING_DECISION_TREE_IMPL_HPP
+
+// In case it hasn't been included yet.
+#include "streaming_decision_tree.hpp"
+
+namespace mlpack {
+namespace tree {
+
+template<typename SplitType, typename MatType>
+StreamingDecisionTree<SplitType, MatType>::StreamingDecisionTree(
+    const MatType& data,
+    const data::DatasetInfo& datasetInfo,
+    const arma::Row<size_t>& labels) :
+    split(datasetInfo)
+{
+  Train(data, labels);
+}
+
+template<typename SplitType, typename MatType>
+StreamingDecisionTree<SplitType, MatType>::StreamingDecisionTree(
+    const data::DatasetInfo& datasetInfo) :
+    split(datasetInfo)
+{
+  // No training.  Anything else to do...?
+}
+
+template<typename SplitType, typename MatType>
+StreamingDecisionTree<SplitType, MatType>::StreamingDecisionTree(
+    const StreamingDecisionTree& other)
+{
+  // Copy the children of the other tree.
+}
+
+template<typename VecType>
+template<typename SplitType, typename MatType>
+void StreamingDecisionTree<SplitType, MatType>::Train(const VecType& data,
+                                                      const size_t label)
+{
+  split.Train(data, label);
+
+  const size_t numChildren = split.SplitCheck();
+  if (numChildren > 0)
+  {
+    // We need to add a bunch of children.
+    // Delete children, if we have them.
+    if (children.size() > 0)
+      children.clear();
+
+    // The split knows how to add the children.
+    SplitType.CreateChildren(children);
+  }
+}
+
+template<typename SplitType, typename MatType>
+void StreamingDecisionTree<SplitType, MatType>::Train(
+    const MatType& data,
+    const arma::Row<size_t>& labels)
+{
+  // Train on each point sequentially.
+  for (size_t i = 0; i < data.n_cols; ++i)
+    Train(data.col(i), labels[i]);
+}
+
+template<typename VecType>
+template<typename SplitType, typename MatType>
+size_t StreamingDecisionTree<SplitType, MatType>::Classify(const VecType& data)
+{
+  // Get the direction we need to go, and continue classification.
+  // If we're at a leaf, we don't need to go any deeper.
+  if (children.size() == 0)
+  {
+    return split.Classify(data);
+  }
+  else
+  {
+    const size_t direction = split.CalculateDirection(data);
+    return children[direction].Classify(data);
+  }
+}
+
+template<typename SplitType, typename MatType>
+void StreamingDecisionTree<SplitType, MatType>::Classify(
+    const MatType& data,
+    arma::Row<size_t>& predictions)
+{
+  for (size_t i = 0; i < data.n_cols; ++i)
+    predictions[i] = Classify(data.col(i));
+}
+
+} // namespace tree
+} // namespace mlpack
+
+#endif