[mlpack-git] master: Add some documentation (finally). (70a8e70)

Wed Dec 23 11:45:16 EST 2015

Repository : https://github.com/mlpack/mlpack

On branch  : master
Link       : https://github.com/mlpack/mlpack/compare/de9cc4b05069e1fa4793d9355f2f595af5ff45d2...6070527af14296cd99739de6c62666cc5d2a2125

>---------------------------------------------------------------

commit 70a8e70f969e2db529b19b1231f6b790957d434d
Author: Ryan Curtin <ryan at ratml.org>
Date:   Fri Oct 30 16:30:11 2015 +0000

    Add some documentation (finally).


>---------------------------------------------------------------

70a8e70f969e2db529b19b1231f6b790957d434d
 .../hoeffding_trees/binary_numeric_split.hpp       | 46 ++++++++++++++++---
 .../hoeffding_categorical_split.hpp                | 37 ++++++++++++++-
 .../hoeffding_trees/hoeffding_numeric_split.hpp    | 53 +++++++++++++++++++---
 3 files changed, 121 insertions(+), 15 deletions(-)

diff --git a/src/mlpack/methods/hoeffding_trees/binary_numeric_split.hpp b/src/mlpack/methods/hoeffding_trees/binary_numeric_split.hpp
index 755efeb..07f8d7b 100644
--- a/src/mlpack/methods/hoeffding_trees/binary_numeric_split.hpp
+++ b/src/mlpack/methods/hoeffding_trees/binary_numeric_split.hpp
@@ -31,39 +31,73 @@ namespace tree {
  * time, where n is the number of samples seen so far.  Every split with this
  * split type returns only two splits (greater than or equal to the split point,
  * and less than the split point).  The Train() function should take O(1) time.
+ *
+ * @tparam FitnessFunction Fitness function to use for calculating gain.
+ * @tparam ObservationType Type of observation used by this dimension.
  */
 template<typename FitnessFunction,
          typename ObservationType = double>
 class BinaryNumericSplit
 {
  public:
+  //! The splitting information required by the BinaryNumericSplit.
   typedef NumericSplitInfo<ObservationType> SplitInfo;
 
+  /**
+   * Create the BinaryNumericSplit object with the given number of classes.
+   *
+   * @param numClasses Number of classes in dataset.
+   */
   BinaryNumericSplit(const size_t numClasses);
 
+  /**
+   * Train on the given value with the given label.
+   *
+   * @param value The value to train on.
+   * @param label The label to train on.
+   */
   void Train(ObservationType value, const size_t label);
 
+  /**
+   * Given the points seen so far, evaluate the fitness function, returning the
+   * best possible gain of a binary split.  Note that this takes O(n) time,
+   * where n is the number of points seen so far.  So this may not exactly be
+   * fast...
+   */
   double EvaluateFitnessFunction();
 
+  //! Return the number of children if this node were to split on this feature.
+  size_t NumChildren() const { return 2; }
+
+  /**
+   * Given that a split should happen, return the majority classes of the (two)
+   * children and an initialized SplitInfo object.
+   *
+   * @param childMajorities Majority classes of the children after the split.
+   * @param splitInfo Split information.
+   */
   void Split(arma::Col<size_t>& childMajorities, SplitInfo& splitInfo);
 
+  //! The majority class of the points seen so far.
   size_t MajorityClass() const;
+  //! The probability of the majority class given the points seen so far.
   double MajorityProbability() const;
 
+  //! Serialize the object.
   template<typename Archive>
   void Serialize(Archive& ar, const unsigned int /* version */);
 
-  // Return the number of children if this node were to split on this feature.
-  size_t NumChildren() const { return 2; }
-
  private:
-  // All we need is ordered access.
+  //! The elements seen so far, in sorted order.
   std::multimap<ObservationType, size_t> sortedElements;
-
+  //! The classes we have seen so far (for majority calculations).
   arma::Col<size_t> classCounts;
 
-  bool isAccurate;
+  //! A cached best split point.
   ObservationType bestSplit;
+  //! If true, the cached best split point is accurate (that is, we have not
+  //! seen any more samples since we calculated it).
+  bool isAccurate;
 };
 
 // Convenience typedef.
diff --git a/src/mlpack/methods/hoeffding_trees/hoeffding_categorical_split.hpp b/src/mlpack/methods/hoeffding_trees/hoeffding_categorical_split.hpp
index 9b1c46a..1f8f720 100644
--- a/src/mlpack/methods/hoeffding_trees/hoeffding_categorical_split.hpp
+++ b/src/mlpack/methods/hoeffding_trees/hoeffding_categorical_split.hpp
@@ -32,28 +32,58 @@ namespace tree {
  * This class will track the sufficient statistics of the training points it has
  * seen.  The HoeffdingSplit class (and other related classes) can use this
  * class to track categorical features and split decision tree nodes.
+ *
+ * @tparam FitnessFunction Fitness function to use for calculating gain.
  */
 template<typename FitnessFunction>
 class HoeffdingCategoricalSplit
 {
  public:
+  //! The type of split information required by the HoeffdingCategoricalSplit.
   typedef CategoricalSplitInfo SplitInfo;
 
+  /**
+   * Create the HoeffdingCategoricalSplit given a number of categories for this
+   * dimension and a number of classes.
+   *
+   * @param numCategories Number of categories in this dimension.
+   * @param numClasses Number of classes in the dataset.
+   */
   HoeffdingCategoricalSplit(const size_t numCategories,
                             const size_t numClasses);
 
+  /**
+   * Train on the given value with the given label.
+   *
+   * @param value Value to train on.
+   * @param label Label to train on.
+   */
   template<typename eT>
   void Train(eT value, const size_t label);
 
+  /**
+   * Given the points seen so far, evaluate the fitness function, returning the
+   * gain if a split was to be made.
+   */
   double EvaluateFitnessFunction() const;
 
+  //! Return the number of children, if the node were to split.
+  size_t NumChildren() const { return sufficientStatistics.n_cols; }
+
+  /**
+   * Gather the information for a split: get the labels of the child majorities,
+   * and initialize the SplitInfo object.
+   *
+   * @param childMajorities Majorities of child nodes to be created.
+   * @param splitInfo Information for splitting.
+   */
   void Split(arma::Col<size_t>& childMajorities, SplitInfo& splitInfo);
 
+  //! Get the majority class seen so far.
   size_t MajorityClass() const;
+  //! Get the probability of the majority class given the points seen so far.
   double MajorityProbability() const;
 
-  size_t NumChildren() const { return sufficientStatistics.n_cols; }
-
   //! Serialize the categorical split.
   template<typename Archive>
   void Serialize(Archive& ar, const unsigned int /* version */)
@@ -62,6 +92,9 @@ class HoeffdingCategoricalSplit
   }
 
  private:
+  //! The sufficient statistics for all points seen so far.  Each column
+  //! corresponds to a category, and contains a count of each of the classes
+  //! seen for points in that category.
   arma::Mat<size_t> sufficientStatistics;
 };
 
diff --git a/src/mlpack/methods/hoeffding_trees/hoeffding_numeric_split.hpp b/src/mlpack/methods/hoeffding_trees/hoeffding_numeric_split.hpp
index 2fa40c8..38bbc35 100644
--- a/src/mlpack/methods/hoeffding_trees/hoeffding_numeric_split.hpp
+++ b/src/mlpack/methods/hoeffding_trees/hoeffding_numeric_split.hpp
@@ -39,51 +39,90 @@ namespace tree {
  * range is equally split into bins, and splitting proceeds in the same way as
  * with the categorical splits.  This is a simple and stupid strategy, so don't
  * expect it to be the best possible thing you can do.
+ *
+ * @tparam FitnessFunction Fitness function to use for calculating gain.
+ * @tparam ObservationType Type of observations in this dimension.
  */
 template<typename FitnessFunction,
          typename ObservationType = double>
 class HoeffdingNumericSplit
 {
  public:
+  //! The splitting information type required by the HoeffdingNumericSplit.
   typedef NumericSplitInfo<ObservationType> SplitInfo;
 
+  /**
+   * Create the HoeffdingNumericSplit class, and specify some basic parameters
+   * about how the binning should take place.
+   *
+   * @param numClasses Number of classes.
+   * @param bins Number of bins.
+   * @param observationsBeforeBinning Number of points to see before binning is
+   *      performed.
+   */
   HoeffdingNumericSplit(const size_t numClasses,
                         const size_t bins = 10,
                         const size_t observationsBeforeBinning = 100);
 
+  /**
+   * Train the HoeffdingNumericSplit on the given observed value (remember that
+   * this object only cares about the information for a single feature, not an
+   * entire point).
+   *
+   * @param value Value in the dimension that this HoeffdingNumericSplit refers
+   *      to.
+   * @param label Label of the given point.
+   */
   void Train(ObservationType value, const size_t label);
 
+  /**
+   * Evaluate the fitness function given what has been calculated so far.  In
+   * this case, if binning has not yet been performed, 0 will be returned (i.e.,
+   * no gain).
+   */
   double EvaluateFitnessFunction() const;
 
-  // Return the majority class of each child to be created, if a split on this
-  // dimension was performed.  Also create the split object.
+  //! Return the number of children if this node splits on this feature.
+  size_t NumChildren() const { return bins; }
+
+  /**
+   * Return the majority class of each child to be created, if a split on this
+   * dimension was performed.  Also create the split object.
+   */
   void Split(arma::Col<size_t>& childMajorities, SplitInfo& splitInfo) const;
 
+  //! Return the majority class.
   size_t MajorityClass() const;
+  //! Return the probability of the majority class.
   double MajorityProbability() const;
 
+  //! Return the number of bins.
   size_t Bins() const { return bins; }
 
-  // Return the number of children if this node splits on this feature.
-  size_t NumChildren() const { return bins; }
-
+  //! Serialize the object.
   template<typename Archive>
   void Serialize(Archive& ar, const unsigned int /* version */);
 
  private:
-  // Cache the values of the points seen before we make bins.
+  //! Before binning, this holds the points we have seen so far.
   arma::Col<ObservationType> observations;
+  //! This holds the labels of the points before binning.
   arma::Col<size_t> labels;
 
+  //! The split points for the binning (length bins - 1).
   arma::Col<ObservationType> splitPoints;
+  //! The number of bins.
   size_t bins;
+  //! The number of observations we must see before binning.
   size_t observationsBeforeBinning;
+  //! The number of samples we have seen so far.
   size_t samplesSeen;
 
+  //! After binning, this contains the sufficient statistics.
   arma::Mat<size_t> sufficientStatistics;
 };
 
-// Convenience typedef.
+//! Convenience typedef.
 template<typename FitnessFunction>
 using HoeffdingDoubleNumericSplit = HoeffdingNumericSplit<FitnessFunction,
     double>;