[mlpack-git] master: Add some documentation (finally). (70a8e70)
gitdub at big.cc.gt.atl.ga.us
gitdub at big.cc.gt.atl.ga.us
Wed Dec 23 11:45:16 EST 2015
Repository : https://github.com/mlpack/mlpack
On branch : master
Link : https://github.com/mlpack/mlpack/compare/de9cc4b05069e1fa4793d9355f2f595af5ff45d2...6070527af14296cd99739de6c62666cc5d2a2125
>---------------------------------------------------------------
commit 70a8e70f969e2db529b19b1231f6b790957d434d
Author: Ryan Curtin <ryan at ratml.org>
Date: Fri Oct 30 16:30:11 2015 +0000
Add some documentation (finally).
>---------------------------------------------------------------
70a8e70f969e2db529b19b1231f6b790957d434d
.../hoeffding_trees/binary_numeric_split.hpp | 46 ++++++++++++++++---
.../hoeffding_categorical_split.hpp | 37 ++++++++++++++-
.../hoeffding_trees/hoeffding_numeric_split.hpp | 53 +++++++++++++++++++---
3 files changed, 121 insertions(+), 15 deletions(-)
diff --git a/src/mlpack/methods/hoeffding_trees/binary_numeric_split.hpp b/src/mlpack/methods/hoeffding_trees/binary_numeric_split.hpp
index 755efeb..07f8d7b 100644
--- a/src/mlpack/methods/hoeffding_trees/binary_numeric_split.hpp
+++ b/src/mlpack/methods/hoeffding_trees/binary_numeric_split.hpp
@@ -31,39 +31,73 @@ namespace tree {
* time, where n is the number of samples seen so far. Every split with this
* split type returns only two splits (greater than or equal to the split point,
* and less than the split point). The Train() function should take O(1) time.
+ *
+ * @tparam FitnessFunction Fitness function to use for calculating gain.
+ * @tparam ObservationType Type of observation used by this dimension.
*/
template<typename FitnessFunction,
typename ObservationType = double>
class BinaryNumericSplit
{
public:
+ //! The splitting information required by the BinaryNumericSplit.
typedef NumericSplitInfo<ObservationType> SplitInfo;
+ /**
+ * Create the BinaryNumericSplit object with the given number of classes.
+ *
+ * @param numClasses Number of classes in dataset.
+ */
BinaryNumericSplit(const size_t numClasses);
+ /**
+ * Train on the given value with the given label.
+ *
+ * @param value The value to train on.
+ * @param label The label to train on.
+ */
void Train(ObservationType value, const size_t label);
+ /**
+ * Given the points seen so far, evaluate the fitness function, returning the
+ * best possible gain of a binary split. Note that this takes O(n) time,
+ * where n is the number of points seen so far. So this may not exactly be
+ * fast...
+ */
double EvaluateFitnessFunction();
+ //! Return the number of children if this node were to split on this feature.
+ size_t NumChildren() const { return 2; }
+
+ /**
+ * Given that a split should happen, return the majority classes of the (two)
+ * children and an initialized SplitInfo object.
+ *
+ * @param childMajorities Majority classes of the children after the split.
+ * @param splitInfo Split information.
+ */
void Split(arma::Col<size_t>& childMajorities, SplitInfo& splitInfo);
+ //! The majority class of the points seen so far.
size_t MajorityClass() const;
+ //! The probability of the majority class given the points seen so far.
double MajorityProbability() const;
+ //! Serialize the object.
template<typename Archive>
void Serialize(Archive& ar, const unsigned int /* version */);
- // Return the number of children if this node were to split on this feature.
- size_t NumChildren() const { return 2; }
-
private:
- // All we need is ordered access.
+ //! The elements seen so far, in sorted order.
std::multimap<ObservationType, size_t> sortedElements;
-
+ //! The classes we have seen so far (for majority calculations).
arma::Col<size_t> classCounts;
- bool isAccurate;
+ //! A cached best split point.
ObservationType bestSplit;
+ //! If true, the cached best split point is accurate (that is, we have not
+ //! seen any more samples since we calculated it).
+ bool isAccurate;
};
// Convenience typedef.
diff --git a/src/mlpack/methods/hoeffding_trees/hoeffding_categorical_split.hpp b/src/mlpack/methods/hoeffding_trees/hoeffding_categorical_split.hpp
index 9b1c46a..1f8f720 100644
--- a/src/mlpack/methods/hoeffding_trees/hoeffding_categorical_split.hpp
+++ b/src/mlpack/methods/hoeffding_trees/hoeffding_categorical_split.hpp
@@ -32,28 +32,58 @@ namespace tree {
* This class will track the sufficient statistics of the training points it has
* seen. The HoeffdingSplit class (and other related classes) can use this
* class to track categorical features and split decision tree nodes.
+ *
+ * @tparam FitnessFunction Fitness function to use for calculating gain.
*/
template<typename FitnessFunction>
class HoeffdingCategoricalSplit
{
public:
+ //! The type of split information required by the HoeffdingCategoricalSplit.
typedef CategoricalSplitInfo SplitInfo;
+ /**
+ * Create the HoeffdingCategoricalSplit given a number of categories for this
+ * dimension and a number of classes.
+ *
+ * @param numCategories Number of categories in this dimension.
+ * @param numClasses Number of classes in the dataset.
+ */
HoeffdingCategoricalSplit(const size_t numCategories,
const size_t numClasses);
+ /**
+ * Train on the given value with the given label.
+ *
+ * @param value Value to train on.
+ * @param label Label to train on.
+ */
template<typename eT>
void Train(eT value, const size_t label);
+ /**
+ * Given the points seen so far, evaluate the fitness function, returning the
+ * gain if a split were to be made.
+ */
double EvaluateFitnessFunction() const;
+ //! Return the number of children, if the node were to split.
+ size_t NumChildren() const { return sufficientStatistics.n_cols; }
+
+ /**
+ * Gather the information for a split: get the labels of the child majorities,
+ * and initialize the SplitInfo object.
+ *
+ * @param childMajorities Majorities of child nodes to be created.
+ * @param splitInfo Information for splitting.
+ */
void Split(arma::Col<size_t>& childMajorities, SplitInfo& splitInfo);
+ //! Get the majority class seen so far.
size_t MajorityClass() const;
+ //! Get the probability of the majority class given the points seen so far.
double MajorityProbability() const;
- size_t NumChildren() const { return sufficientStatistics.n_cols; }
-
//! Serialize the categorical split.
template<typename Archive>
void Serialize(Archive& ar, const unsigned int /* version */)
@@ -62,6 +92,9 @@ class HoeffdingCategoricalSplit
}
private:
+ //! The sufficient statistics for all points seen so far. Each column
+ //! corresponds to a category, and contains a count of each of the classes
+ //! seen for points in that category.
arma::Mat<size_t> sufficientStatistics;
};
diff --git a/src/mlpack/methods/hoeffding_trees/hoeffding_numeric_split.hpp b/src/mlpack/methods/hoeffding_trees/hoeffding_numeric_split.hpp
index 2fa40c8..38bbc35 100644
--- a/src/mlpack/methods/hoeffding_trees/hoeffding_numeric_split.hpp
+++ b/src/mlpack/methods/hoeffding_trees/hoeffding_numeric_split.hpp
@@ -39,51 +39,90 @@ namespace tree {
* range is equally split into bins, and splitting proceeds in the same way as
* with the categorical splits. This is a simple and stupid strategy, so don't
* expect it to be the best possible thing you can do.
+ *
+ * @tparam FitnessFunction Fitness function to use for calculating gain.
+ * @tparam ObservationType Type of observations in this dimension.
*/
template<typename FitnessFunction,
typename ObservationType = double>
class HoeffdingNumericSplit
{
public:
+ //! The splitting information type required by the HoeffdingNumericSplit.
typedef NumericSplitInfo<ObservationType> SplitInfo;
+ /**
+ * Create the HoeffdingNumericSplit class, and specify some basic parameters
+ * about how the binning should take place.
+ *
+ * @param numClasses Number of classes.
+ * @param bins Number of bins.
+ * @param observationsBeforeBinning Number of points to see before binning is
+ * performed.
+ */
HoeffdingNumericSplit(const size_t numClasses,
const size_t bins = 10,
const size_t observationsBeforeBinning = 100);
+ /**
+ * Train the HoeffdingNumericSplit on the given observed value (remember that
+ * this object only cares about the information for a single feature, not an
+ * entire point).
+ *
+ * @param value Value in the dimension that this HoeffdingNumericSplit refers
+ * to.
+ * @param label Label of the given point.
+ */
void Train(ObservationType value, const size_t label);
+ /**
+ * Evaluate the fitness function given what has been calculated so far. In
+ * this case, if binning has not yet been performed, 0 will be returned (i.e.,
+ * no gain).
+ */
double EvaluateFitnessFunction() const;
- // Return the majority class of each child to be created, if a split on this
- // dimension was performed. Also create the split object.
+ //! Return the number of children if this node splits on this feature.
+ size_t NumChildren() const { return bins; }
+
+ /**
+ * Return the majority class of each child to be created, if a split on this
+ * dimension were performed. Also create the split object.
+ */
void Split(arma::Col<size_t>& childMajorities, SplitInfo& splitInfo) const;
+ //! Return the majority class.
size_t MajorityClass() const;
+ //! Return the probability of the majority class.
double MajorityProbability() const;
+ //! Return the number of bins.
size_t Bins() const { return bins; }
- // Return the number of children if this node splits on this feature.
- size_t NumChildren() const { return bins; }
-
+ //! Serialize the object.
template<typename Archive>
void Serialize(Archive& ar, const unsigned int /* version */);
private:
- // Cache the values of the points seen before we make bins.
+ //! Before binning, this holds the points we have seen so far.
arma::Col<ObservationType> observations;
+ //! This holds the labels of the points before binning.
arma::Col<size_t> labels;
+ //! The split points for the binning (length bins - 1).
arma::Col<ObservationType> splitPoints;
+ //! The number of bins.
size_t bins;
+ //! The number of observations we must see before binning.
size_t observationsBeforeBinning;
+ //! The number of samples we have seen so far.
size_t samplesSeen;
+ //! After binning, this contains the sufficient statistics.
arma::Mat<size_t> sufficientStatistics;
};
-// Convenience typedef.
+//! Convenience typedef.
template<typename FitnessFunction>
using HoeffdingDoubleNumericSplit = HoeffdingNumericSplit<FitnessFunction,
double>;
More information about the mlpack-git
mailing list