[mlpack-git] master: Comment the HoeffdingSplit class. (eca9af0)

Wed Dec 23 11:45:14 EST 2015

Repository : https://github.com/mlpack/mlpack

On branch  : master
Link       : https://github.com/mlpack/mlpack/compare/de9cc4b05069e1fa4793d9355f2f595af5ff45d2...6070527af14296cd99739de6c62666cc5d2a2125

>---------------------------------------------------------------

commit eca9af0aa5e1f1565162d57f408c8febc2c423ed
Author: Ryan Curtin <ryan at ratml.org>
Date:   Fri Oct 30 16:08:46 2015 +0000

    Comment the HoeffdingSplit class.


>---------------------------------------------------------------

eca9af0aa5e1f1565162d57f408c8febc2c423ed
 .../methods/hoeffding_trees/hoeffding_split.hpp    | 103 +++++++++++++++++++--
 1 file changed, 96 insertions(+), 7 deletions(-)

diff --git a/src/mlpack/methods/hoeffding_trees/hoeffding_split.hpp b/src/mlpack/methods/hoeffding_trees/hoeffding_split.hpp
index ebaee47..04e24e5 100644
--- a/src/mlpack/methods/hoeffding_trees/hoeffding_split.hpp
+++ b/src/mlpack/methods/hoeffding_trees/hoeffding_split.hpp
@@ -16,6 +16,25 @@
 namespace mlpack {
 namespace tree {
 
+/**
+ * The HoeffdingSplit object represents all of the necessary information for a
+ * Hoeffding-bound-based decision tree.  (Perhaps, the whole thing should be
+ * refactored so this class is called 'HoeffdingTree'.)  This class is able to
+ * train on samples in streaming settings and batch settings, and perform splits
+ * based on the Hoeffding bound.
+ *
+ * The class is modular, and takes three template parameters.  The first,
+ * FitnessFunction, is the fitness function that should be used to determine
+ * whether a split is beneficial; examples might be GiniImpurity or
+ * InformationGain.  The NumericSplitType determines how numeric attributes are
+ * handled, and the CategoricalSplitType determines how categorical attributes
+ * are handled.  As far as the actual splitting goes, the meat of the splitting
+ * procedure will be contained in those two classes.
+ *
+ * @tparam FitnessFunction Fitness function to use.
+ * @tparam NumericSplitType Technique for splitting numeric features.
+ * @tparam CategoricalSplitType Technique for splitting categorical features.
+ */
 template<typename FitnessFunction = GiniImpurity,
          template<typename> class NumericSplitType =
              HoeffdingDoubleNumericSplit,
@@ -25,6 +44,23 @@ template<typename FitnessFunction = GiniImpurity,
 class HoeffdingSplit
 {
  public:
+  /**
+   * Construct the Hoeffding split object with the given parameters.  The
+   * dimensionMappings parameter is only used if it is desired that this node
+   * does not create its own dimensionMappings object (for instance, if this is
+   * a child of another node in the tree).
+   *
+   * @param dimensionality Dimensionality of the dataset.
+   * @param numClasses Number of classes in the dataset.
+   * @param datasetInfo Information on the dataset (types of each feature).
+   * @param successProbability Probability of success required in Hoeffding
+   *      bound before a split can happen.
+   * @param maxSamples Maximum number of samples before a split is forced.
+   * @param checkInterval Number of samples required before each split check.
+   * @param dimensionMappings Mappings from dimension indices to positions in
+   *      numeric and categorical split vectors.  If left NULL, a new one will
+   *      be created.
+   */
   HoeffdingSplit(const size_t dimensionality,
                  const size_t numClasses,
                  const data::DatasetInfo& datasetInfo,
@@ -34,12 +70,25 @@ class HoeffdingSplit
                  std::unordered_map<size_t, std::pair<size_t, size_t>>*
                      dimensionMappings = NULL);
 
+  /**
+   * Clean up memory.
+   */
   ~HoeffdingSplit();
 
+  /**
+   * Train on a single point in streaming mode, with the given label.
+   *
+   * @param point Point to train on.
+   * @param label Label of point to train on.
+   */
   template<typename VecType>
   void Train(const VecType& point, const size_t label);
 
-  // 0 if split should not happen; number of splits otherwise.
+  /**
+   * Check if a split would satisfy the conditions of the Hoeffding bound with
+   * the node's specified success probability.  If so, the number of children
+   * that would be created is returned.  If not, 0 is returned.
+   */
   size_t SplitCheck();
 
   //! Get the splitting dimension (size_t(-1) if no split).
@@ -50,18 +99,44 @@ class HoeffdingSplit
   //! Modify the majority class.
   size_t& MajorityClass();
 
-  // Return index that we should go towards.
+  /**
+   * Given a point and that this node is not a leaf, calculate the index of the
+   * child node this point would go towards.  This method is primarily used by
+   * the Classify() function, but it can be used in a standalone sense too.
+   *
+   * @param point Point to classify.
+   */
   template<typename VecType>
   size_t CalculateDirection(const VecType& point) const;
 
-  // Classify the point according to the statistics in this node.
+  /**
+   * Classify the given point, using this node and the entire (sub)tree beneath
+   * it.  The predicted label is returned.
+   *
+   * @param point Point to classify.
+   * @return Predicted label of point.
+   */
   template<typename VecType>
   size_t Classify(const VecType& point) const;
 
+  /**
+   * Classify the given point and also return an estimate of the probability
+   * that the prediction is correct.  (This estimate is simply the probability
+   * that a training point was from the majority class in the leaf that this
+   * point binned to.)
+   *
+   * @param point Point to classify.
+   * @param prediction Predicted label of point.
+   * @param probability An estimate of the probability that the prediction is
+   *      correct.
+   */
   template<typename VecType>
   void Classify(const VecType& point, size_t& prediction, double& probability)
       const;
 
+  /**
+   * Given that this node should split, create the children.
+   */
   template<typename StreamingDecisionTreeType>
   void CreateChildren(std::vector<StreamingDecisionTreeType>& children);
 
@@ -71,28 +146,42 @@ class HoeffdingSplit
 
  private:
   // We need to keep some information for before we have split.
+
+  //! Information for splitting of numeric features (used before split).
   std::vector<NumericSplitType<FitnessFunction>> numericSplits;
+  //! Information for splitting of categorical features (used before split).
   std::vector<CategoricalSplitType<FitnessFunction>> categoricalSplits;
 
-  // This structure is owned by this node only if it is the root of the tree.
+  //! This structure is owned by this node only if it is the root of the tree.
   std::unordered_map<size_t, std::pair<size_t, size_t>>* dimensionMappings;
-  // Indicates whether or not we own the mappings.
+  //! Indicates whether or not we own the mappings.
   bool ownsMappings;
 
+  //! The number of samples seen so far by this node.
   size_t numSamples;
+  //! The number of classes this node is trained on.
   size_t numClasses;
+  //! The maximum number of samples we can see before splitting.
   size_t maxSamples;
+  //! The number of samples that should be seen before checking for a split.
   size_t checkInterval;
+  //! The dataset information.  (We don't own this.)
   const data::DatasetInfo* datasetInfo;
+  //! The required probability of success for a split to be performed.
   double successProbability;
 
   // And we need to keep some information for after we have split.
+
+  //! The dimension that this node has split on.
   size_t splitDimension;
+  //! The majority class of this node.
   size_t majorityClass;
+  //! The empirical probability of a point this node saw having the majority
+  //! class.
   double majorityProbability;
-  // In case it's categorical.
+  //! If the split is categorical, this holds the splitting information.
   typename CategoricalSplitType<FitnessFunction>::SplitInfo categoricalSplit;
-  // In case it's numeric.
+  //! If the split is numeric, this holds the splitting information.
   typename NumericSplitType<FitnessFunction>::SplitInfo numericSplit;
 };