[mlpack-git] master: Refactor HoeffdingTree to replace StreamingDecisionTree. (f0abc60)

gitdub at big.cc.gt.atl.ga.us
Wed Dec 23 11:45:41 EST 2015


Repository : https://github.com/mlpack/mlpack

On branch  : master
Link       : https://github.com/mlpack/mlpack/compare/de9cc4b05069e1fa4793d9355f2f595af5ff45d2...6070527af14296cd99739de6c62666cc5d2a2125

>---------------------------------------------------------------

commit f0abc6003d77e8ca9f100aebe9392ae3d97b3612
Author: Ryan Curtin <ryan at ratml.org>
Date:   Sun Nov 1 17:44:09 2015 +0000

    Refactor HoeffdingTree to replace StreamingDecisionTree.


>---------------------------------------------------------------

f0abc6003d77e8ca9f100aebe9392ae3d97b3612
 .../methods/hoeffding_trees/hoeffding_tree.hpp     |  65 ++++++-
 .../hoeffding_trees/hoeffding_tree_impl.hpp        | 199 +++++++++++++++++----
 2 files changed, 217 insertions(+), 47 deletions(-)
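
For orientation, a rough usage sketch of the refactored interface (not part of
the commit; it assumes the class's default template parameters, uses empty
placeholder data, and passes batchTraining = false because batch mode still
throws at this point in the series):

#include <mlpack/core.hpp>
#include <mlpack/methods/hoeffding_trees/hoeffding_tree.hpp>

using namespace mlpack;

int main()
{
  arma::mat dataset;                      // One point per column (placeholder).
  arma::Row<size_t> labels;               // One label per point (placeholder).
  data::DatasetInfo info(dataset.n_rows); // Type of each dimension.

  // Note the new argument order: datasetInfo now comes before labels.
  tree::HoeffdingTree<> tree(dataset, info, labels, 3 /* numClasses */,
      false /* batchTraining */);

  // Batch classification with per-point probability estimates.
  arma::Row<size_t> predictions;
  arma::rowvec probabilities;
  tree.Classify(dataset, predictions, probabilities);
}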

diff --git a/src/mlpack/methods/hoeffding_trees/hoeffding_tree.hpp b/src/mlpack/methods/hoeffding_trees/hoeffding_tree.hpp
index 2b780f5..27101a5 100644
--- a/src/mlpack/methods/hoeffding_trees/hoeffding_tree.hpp
+++ b/src/mlpack/methods/hoeffding_trees/hoeffding_tree.hpp
@@ -65,6 +65,7 @@ class HoeffdingTree
    *
    * @param data Dataset to train on.
    * @param datasetInfo Information on the dataset (types of each feature).
+   * @param labels Labels of each point in the dataset.
    * @param numClasses Number of classes in the dataset.
    * @param batchTraining Whether or not to train in batch.
    * @param successProbability Probability of success required in Hoeffding
@@ -76,8 +77,8 @@ class HoeffdingTree
    */
   template<typename MatType>
   HoeffdingTree(const MatType& data,
-                const arma::Row<size_t>& labels,
                 const data::DatasetInfo& datasetInfo,
+                const arma::Row<size_t>& labels,
                 const size_t numClasses,
                 const bool batchTraining = true,
                 const double successProbability = 0.95,
@@ -103,13 +104,21 @@ class HoeffdingTree
    */
   HoeffdingTree(const data::DatasetInfo& datasetInfo,
                 const size_t numClasses,
-                const double successProbability,
-                const size_t maxSamples,
-                const size_t checkInterval,
+                const double successProbability = 0.95,
+                const size_t maxSamples = 0,
+                const size_t checkInterval = 100,
                 std::unordered_map<size_t, std::pair<size_t, size_t>>*
                     dimensionMappings = NULL);
 
   /**
+   * Copy another tree (warning: this will duplicate the tree entirely, and may
+   * use a lot of memory.  Make sure it's what you want before you do it).
+   *
+   * @param other Tree to copy.
+   */
+  HoeffdingTree(const HoeffdingTree& other);
+
+  /**
    * Clean up memory.
    */
   ~HoeffdingTree();
@@ -147,9 +156,22 @@ class HoeffdingTree
   size_t SplitDimension() const { return splitDimension; }
 
   //! Get the majority class.
-  size_t MajorityClass() const;
+  size_t MajorityClass() const { return majorityClass; }
   //! Modify the majority class.
-  size_t& MajorityClass();
+  size_t& MajorityClass() { return majorityClass; }
+
+  //! Get the probability of the majority class (based on training samples).
+  double MajorityProbability() const { return majorityProbability; }
+  //! Modify the probability of the majority class.
+  double& MajorityProbability() { return majorityProbability; }
+
+  //! Get the number of children.
+  size_t NumChildren() const { return children.size(); }
+
+  //! Get a child.
+  const HoeffdingTree& Child(const size_t i) const { return children[i]; }
+  //! Modify a child.
+  HoeffdingTree& Child(const size_t i) { return children[i]; }
 
   /**
    * Given a point and that this node is not a leaf, calculate the index of the
@@ -187,10 +209,35 @@ class HoeffdingTree
       const;
 
   /**
+   * Classify the given points, using this node and the entire (sub)tree beneath
+   * it.  The predicted labels for each point are returned.
+   *
+   * @param data Points to classify.
+   * @param predictions Predicted labels for each point.
+   */
+  template<typename MatType>
+  void Classify(const MatType& data, arma::Row<size_t>& predictions) const;
+
+  /**
+   * Classify the given points, using this node and the entire (sub)tree beneath
+   * it.  The predicted labels for each point are returned, as well as an
+   * estimate of the probability that the prediction is correct for each point.
+   * This estimate is simply the MajorityProbability() for the leaf that each
+   * point bins to.
+   *
+   * @param data Points to classify.
+   * @param predictions Predicted labels for each point.
+   * @param probabilities Probability estimates for each predicted label.
+   */
+  template<typename MatType>
+  void Classify(const MatType& data,
+                arma::Row<size_t>& predictions,
+                arma::rowvec& probabilities) const;
+
+  /**
    * Given that this node should split, create the children.
    */
-  template<typename StreamingDecisionTreeType>
-  void CreateChildren(std::vector<StreamingDecisionTreeType>& children);
+  void CreateChildren();
 
   //! Serialize the split.
   template<typename Archive>
@@ -235,6 +282,8 @@ class HoeffdingTree
   typename CategoricalSplitType<FitnessFunction>::SplitInfo categoricalSplit;
   //! If the split is numeric, this holds the splitting information.
   typename NumericSplitType<FitnessFunction>::SplitInfo numericSplit;
+  //! If the split has occurred, these are the children.
+  std::vector<HoeffdingTree> children;
 };
 
 } // namespace tree
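
The NumChildren()/Child() accessors added above expose the split structure, so
a trained tree can be walked from the outside; a small sketch (a hypothetical
helper, not part of the commit):

#include <cstddef>

// Count the nodes of a (sub)tree via the new accessors.
template<typename TreeType>
std::size_t CountNodes(const TreeType& node)
{
  std::size_t count = 1; // This node.
  for (std::size_t i = 0; i < node.NumChildren(); ++i)
    count += CountNodes(node.Child(i));
  return count;
}
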
diff --git a/src/mlpack/methods/hoeffding_trees/hoeffding_tree_impl.hpp b/src/mlpack/methods/hoeffding_trees/hoeffding_tree_impl.hpp
index e7c5300..bed0237 100644
--- a/src/mlpack/methods/hoeffding_trees/hoeffding_tree_impl.hpp
+++ b/src/mlpack/methods/hoeffding_trees/hoeffding_tree_impl.hpp
@@ -22,15 +22,46 @@ HoeffdingTree<
     NumericSplitType,
     CategoricalSplitType
 >::HoeffdingTree(const MatType& data,
-                 const arma::Row<size_t>& labels,
                  const data::DatasetInfo& datasetInfo,
+                 const arma::Row<size_t>& labels,
                  const size_t numClasses,
                  const bool batchTraining,
                  const double successProbability,
                  const size_t maxSamples,
-                 const size_t checkInterval)
+                 const size_t checkInterval) :
+    dimensionMappings(new std::unordered_map<size_t,
+        std::pair<size_t, size_t>>()),
+    ownsMappings(true),
+    numSamples(0),
+    numClasses(numClasses),
+    maxSamples(maxSamples),
+    checkInterval(checkInterval),
+    datasetInfo(&datasetInfo),
+    successProbability(successProbability),
+    splitDimension(size_t(-1)),
+    categoricalSplit(0),
+    numericSplit()
 {
-  // Not yet implemented.
+  // Generate dimension mappings and create split objects.
+  for (size_t i = 0; i < datasetInfo.Dimensionality(); ++i)
+  {
+    if (datasetInfo.Type(i) == data::Datatype::categorical)
+    {
+      categoricalSplits.push_back(CategoricalSplitType<FitnessFunction>(
+          datasetInfo.NumMappings(i), numClasses));
+      (*dimensionMappings)[i] = std::make_pair(data::Datatype::categorical,
+          categoricalSplits.size() - 1);
+    }
+    else
+    {
+      numericSplits.push_back(NumericSplitType<FitnessFunction>(numClasses));
+      (*dimensionMappings)[i] = std::make_pair(data::Datatype::numeric,
+          numericSplits.size() - 1);
+    }
+  }
+
+  // Now train.
+  Train(data, labels, batchTraining);
 }
 
 template<typename FitnessFunction,
@@ -97,6 +128,33 @@ HoeffdingTree<
   }
 }
 
+// Copy constructor.
+template<typename FitnessFunction,
+         template<typename> class NumericSplitType,
+         template<typename> class CategoricalSplitType>
+HoeffdingTree<FitnessFunction, NumericSplitType, CategoricalSplitType>::
+    HoeffdingTree(const HoeffdingTree& other) :
+    numericSplits(other.numericSplits),
+    categoricalSplits(other.categoricalSplits),
+    dimensionMappings(new std::unordered_map<size_t,
+        std::pair<size_t, size_t>>(*other.dimensionMappings)),
+    ownsMappings(true),
+    numSamples(other.numSamples),
+    numClasses(other.numClasses),
+    maxSamples(other.maxSamples),
+    checkInterval(other.checkInterval),
+    datasetInfo(new data::DatasetInfo(*other.datasetInfo)),
+    successProbability(other.successProbability),
+    splitDimension(other.splitDimension),
+    majorityClass(other.majorityClass),
+    majorityProbability(other.majorityProbability),
+    categoricalSplit(other.categoricalSplit),
+    numericSplit(other.numericSplit),
+    children(other.children)
+{
+  // Nothing left to copy.
+}
+
 template<typename FitnessFunction,
          template<typename> class NumericSplitType,
          template<typename> class CategoricalSplitType>
@@ -121,6 +179,16 @@ void HoeffdingTree<
          const bool batchTraining)
 {
   // Not yet implemented.
+  if (batchTraining)
+  {
+    throw std::invalid_argument("batch training not yet implemented");
+  }
+  else
+  {
+    // We aren't training in batch mode; loop through the points.
+    for (size_t i = 0; i < data.n_cols; ++i)
+      Train(data.col(i), labels[i]);
+  }
 }
 
 //! Train on one point.
@@ -158,11 +226,25 @@ void HoeffdingTree<
       majorityClass = numericSplits[0].MajorityClass();
       majorityProbability = numericSplits[0].MajorityProbability();
     }
+
+    // Check for a split, if we should.
+    if (numSamples % checkInterval == 0)
+    {
+      const size_t numChildren = SplitCheck();
+      if (numChildren > 0)
+      {
+        // We need to add a bunch of children.
+        // Delete children, if we have them.
+        children.clear();
+        CreateChildren();
+      }
+    }
   }
   else
   {
-    // Already split.
-    // But we should probably pass it down anyway.
+    // Already split.  Pass the training point to the relevant child.
+    size_t direction = CalculateDirection(point);
+    children[direction].Train(point, label);
   }
 }
 
@@ -175,10 +257,6 @@ size_t HoeffdingTree<
     CategoricalSplitType
 >::SplitCheck()
 {
-  // If we have not seen enough samples to check, don't check.
-  if (numSamples % checkInterval != 0)
-    return 0;
-
   // Do nothing if we've already split.
   if (splitDimension != size_t(-1))
     return 0;
@@ -250,13 +328,20 @@ template<
     template<typename> class NumericSplitType,
     template<typename> class CategoricalSplitType
 >
+template<typename VecType>
 size_t HoeffdingTree<
     FitnessFunction,
     NumericSplitType,
     CategoricalSplitType
->::MajorityClass() const
+>::CalculateDirection(const VecType& point) const
 {
-  return majorityClass;
+  // Don't call this before the node is split...
+  if (datasetInfo->Type(splitDimension) == data::Datatype::numeric)
+    return numericSplit.CalculateDirection(point[splitDimension]);
+  else if (datasetInfo->Type(splitDimension) == data::Datatype::categorical)
+    return categoricalSplit.CalculateDirection(point[splitDimension]);
+  else
+    return 0; // Not sure what to do here...
 }
 
 template<
@@ -264,13 +349,24 @@ template<
     template<typename> class NumericSplitType,
     template<typename> class CategoricalSplitType
 >
-size_t& HoeffdingTree<
+template<typename VecType>
+size_t HoeffdingTree<
     FitnessFunction,
     NumericSplitType,
     CategoricalSplitType
->::MajorityClass()
+>::Classify(const VecType& point) const
 {
-  return majorityClass;
+  if (children.size() == 0)
+  {
+    // If we're a leaf (or being considered a leaf), classify based on what we
+    // know.
+    return majorityClass;
+  }
+  else
+  {
+    // Otherwise, pass to the right child and let them classify.
+    return children[CalculateDirection(point)].Classify(point);
+  }
 }
 
 template<
@@ -279,54 +375,65 @@ template<
     template<typename> class CategoricalSplitType
 >
 template<typename VecType>
-size_t HoeffdingTree<
+void HoeffdingTree<
     FitnessFunction,
     NumericSplitType,
     CategoricalSplitType
->::CalculateDirection(const VecType& point) const
+>::Classify(const VecType& point,
+            size_t& prediction,
+            double& probability) const
 {
-  // Don't call this before the node is split...
-  if (datasetInfo->Type(splitDimension) == data::Datatype::numeric)
-    return numericSplit.CalculateDirection(point[splitDimension]);
-  else if (datasetInfo->Type(splitDimension) == data::Datatype::categorical)
-    return categoricalSplit.CalculateDirection(point[splitDimension]);
+  if (children.size() == 0)
+  {
+    // We are a leaf, so classify accordingly.
+    prediction = majorityClass;
+    probability = majorityProbability;
+  }
   else
-    return 0; // Not sure what to do here...
+  {
+    // Pass to the right child and let them do the classification.
+    children[CalculateDirection(point)].Classify(point, prediction,
+        probability);
+  }
 }
 
+//! Batch classification.
 template<
     typename FitnessFunction,
     template<typename> class NumericSplitType,
     template<typename> class CategoricalSplitType
 >
-template<typename VecType>
-size_t HoeffdingTree<
+template<typename MatType>
+void HoeffdingTree<
     FitnessFunction,
     NumericSplitType,
     CategoricalSplitType
->::Classify(const VecType& /* point */) const
+>::Classify(const MatType& data, arma::Row<size_t>& predictions) const
 {
-  // We're a leaf (or being considered a leaf), so classify based on what we
-  // know.
-  return majorityClass;
+  predictions.set_size(data.n_cols);
+  for (size_t i = 0; i < data.n_cols; ++i)
+    predictions[i] = Classify(data.col(i));
 }
 
+//! Batch classification with probabilities.
 template<
     typename FitnessFunction,
     template<typename> class NumericSplitType,
     template<typename> class CategoricalSplitType
 >
-template<typename VecType>
+template<typename MatType>
 void HoeffdingTree<
     FitnessFunction,
     NumericSplitType,
     CategoricalSplitType
->::Classify(const VecType& /* point */,
-            size_t& prediction,
-            double& probability) const
+>::Classify(const MatType& data,
+            arma::Row<size_t>& predictions,
+            arma::rowvec& probabilities) const
 {
-  prediction = majorityClass;
-  probability = majorityProbability;
+  predictions.set_size(data.n_cols);
+  probabilities.set_size(data.n_cols);
+  for (size_t i = 0; i < data.n_cols; ++i)
+    Classify(data.col(i), predictions[i], probabilities[i]);
 }
 
 template<
@@ -334,12 +441,11 @@ template<
     template<typename> class NumericSplitType,
     template<typename> class CategoricalSplitType
 >
-template<typename StreamingDecisionTreeType>
 void HoeffdingTree<
     FitnessFunction,
     NumericSplitType,
     CategoricalSplitType
->::CreateChildren(std::vector<StreamingDecisionTreeType>& children)
+>::CreateChildren()
 {
   // Create the children.
   arma::Col<size_t> childMajorities;
@@ -359,8 +465,8 @@ void HoeffdingTree<
   // We already know what the splitDimension will be.
   for (size_t i = 0; i < childMajorities.n_elem; ++i)
   {
-    children.push_back(StreamingDecisionTreeType(*datasetInfo, numClasses,
-        successProbability, maxSamples, checkInterval, dimensionMappings));
+    children.push_back(HoeffdingTree(*datasetInfo, numClasses,
+        successProbability, maxSamples, checkInterval, dimensionMappings));
     children[i].MajorityClass() = childMajorities[i];
   }
 
@@ -454,12 +560,27 @@ void HoeffdingTree<
   }
   else
   {
-    // We have split, so we only need to save the split.
+    // We have split, so we only need to save the split and the children.
     if (datasetInfo->Type(splitDimension) == data::Datatype::categorical)
       ar & CreateNVP(categoricalSplit, "categoricalSplit");
     else
       ar & CreateNVP(numericSplit, "numericSplit");
 
+    // Serialize the children, because we have split.
+    size_t numChildren;
+    if (Archive::is_saving::value)
+      numChildren = children.size();
+    ar & CreateNVP(numChildren, "numChildren");
+    if (Archive::is_loading::value) // If needed, allocate space.
+      children.resize(numChildren, HoeffdingTree(data::DatasetInfo(0), 0));
+
+    for (size_t i = 0; i < numChildren; ++i)
+    {
+      std::ostringstream name;
+      name << "child" << i;
+      ar & data::CreateNVP(children[i], name.str());
+    }
+
     if (Archive::is_loading::value)
     {
       numericSplits.clear();

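
Since the unbatched Train() overload now recurses into the children, a tree can
be updated one point at a time after construction; a sketch of streaming-style
use (not part of the commit; TreeType stands for any HoeffdingTree<...>
instantiation):

#include <mlpack/core.hpp>

template<typename TreeType>
void TrainOnStream(TreeType& tree,
                   const arma::mat& points,          // One point per column.
                   const arma::Row<size_t>& labels)  // One label per point.
{
  for (size_t i = 0; i < points.n_cols; ++i)
    tree.Train(points.col(i), labels[i]);  // Incremental update; may split.

  // Single-point classification, also reporting the leaf's majority probability.
  if (points.n_cols > 0)
  {
    size_t prediction;
    double probability;
    tree.Classify(points.col(points.n_cols - 1), prediction, probability);
  }
}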

