[mlpack-git] master: Comment the HoeffdingSplit class. (eca9af0)
gitdub at big.cc.gt.atl.ga.us
gitdub at big.cc.gt.atl.ga.us
Wed Dec 23 11:45:14 EST 2015
Repository : https://github.com/mlpack/mlpack
On branch : master
Link : https://github.com/mlpack/mlpack/compare/de9cc4b05069e1fa4793d9355f2f595af5ff45d2...6070527af14296cd99739de6c62666cc5d2a2125
>---------------------------------------------------------------
commit eca9af0aa5e1f1565162d57f408c8febc2c423ed
Author: Ryan Curtin <ryan at ratml.org>
Date: Fri Oct 30 16:08:46 2015 +0000
Comment the HoeffdingSplit class.
>---------------------------------------------------------------
eca9af0aa5e1f1565162d57f408c8febc2c423ed
.../methods/hoeffding_trees/hoeffding_split.hpp | 103 +++++++++++++++++++--
1 file changed, 96 insertions(+), 7 deletions(-)
diff --git a/src/mlpack/methods/hoeffding_trees/hoeffding_split.hpp b/src/mlpack/methods/hoeffding_trees/hoeffding_split.hpp
index ebaee47..04e24e5 100644
--- a/src/mlpack/methods/hoeffding_trees/hoeffding_split.hpp
+++ b/src/mlpack/methods/hoeffding_trees/hoeffding_split.hpp
@@ -16,6 +16,25 @@
namespace mlpack {
namespace tree {
+/**
+ * The HoeffdingSplit object represents all of the necessary information for a
+ * Hoeffding-bound-based decision tree. (Perhaps, the whole thing should be
+ * refactored so this class is called 'HoeffdingTree'.) This class is able to
+ * train on samples in streaming settings and batch settings, and perform splits
+ * based on the Hoeffding bound.
+ *
+ * The class is modular, and takes three template parameters. The first,
+ * FitnessFunction, is the fitness function that should be used to determine
+ * whether a split is beneficial; examples might be GiniImpurity or
+ * InformationGain. The NumericSplitType determines how numeric attributes are
+ * handled, and the CategoricalSplitType determines how categorical attributes
+ * are handled. As far as the actual splitting goes, the meat of the splitting
+ * procedure will be contained in those two classes.
+ *
+ * @tparam FitnessFunction Fitness function to use.
+ * @tparam NumericSplitType Technique for splitting numeric features.
+ * @tparam CategoricalSplitType Technique for splitting categorical features.
+ */
template<typename FitnessFunction = GiniImpurity,
template<typename> class NumericSplitType =
HoeffdingDoubleNumericSplit,
@@ -25,6 +44,23 @@ template<typename FitnessFunction = GiniImpurity,
class HoeffdingSplit
{
public:
+ /**
+ * Construct the Hoeffding split object with the given parameters. The
+ * dimensionMappings parameter is only used if it is desired that this node
+ * does not create its own dimensionMappings object (for instance, if this is
+ * a child of another node in the tree).
+ *
+ * @param dimensionality Dimensionality of the dataset.
+ * @param numClasses Number of classes in the dataset.
+ * @param datasetInfo Information on the dataset (types of each feature).
+ * @param successProbability Probability of success required in Hoeffding
+ * bound before a split can happen.
+ * @param maxSamples Maximum number of samples before a split is forced.
+ * @param checkInterval Number of samples required before each split check.
+ * @param dimensionMappings Mappings from dimension indices to positions in
+ * numeric and categorical split vectors. If left NULL, a new one will
+ * be created.
+ */
HoeffdingSplit(const size_t dimensionality,
const size_t numClasses,
const data::DatasetInfo& datasetInfo,
@@ -34,12 +70,25 @@ class HoeffdingSplit
std::unordered_map<size_t, std::pair<size_t, size_t>>*
dimensionMappings = NULL);
+ /**
+ * Clean up memory.
+ */
~HoeffdingSplit();
+ /**
+ * Train on a single point in streaming mode, with the given label.
+ *
+ * @param point Point to train on.
+ * @param label Label of point to train on.
+ */
template<typename VecType>
void Train(const VecType& point, const size_t label);
- // 0 if split should not happen; number of splits otherwise.
+ /**
+ * Check if a split would satisfy the conditions of the Hoeffding bound with
+ * the node's specified success probability. If so, the number of children
+ * that would be created is returned. If not, 0 is returned.
+ */
size_t SplitCheck();
//! Get the splitting dimension (size_t(-1) if no split).
@@ -50,18 +99,44 @@ class HoeffdingSplit
//! Modify the majority class.
size_t& MajorityClass();
- // Return index that we should go towards.
+ /**
+ * Given a point and that this node is not a leaf, calculate the index of the
+ * child node this point would go towards. This method is primarily used by
+ * the Classify() function, but it can be used in a standalone sense too.
+ *
+ * @param point Point to classify.
+ */
template<typename VecType>
size_t CalculateDirection(const VecType& point) const;
- // Classify the point according to the statistics in this node.
+ /**
+ * Classify the given point, using this node and the entire (sub)tree beneath
+ * it. The predicted label is returned.
+ *
+ * @param point Point to classify.
+ * @return Predicted label of point.
+ */
template<typename VecType>
size_t Classify(const VecType& point) const;
+ /**
+ * Classify the given point and also return an estimate of the probability
+ * that the prediction is correct. (This estimate is simply the probability
+ * that a training point was from the majority class in the leaf that this
+ * point binned to.)
+ *
+ * @param point Point to classify.
+ * @param prediction Predicted label of point.
+ * @param probability An estimate of the probability that the prediction is
+ * correct.
+ */
template<typename VecType>
void Classify(const VecType& point, size_t& prediction, double& probability)
const;
+ /**
+ * Given that this node should split, create the children.
+ */
template<typename StreamingDecisionTreeType>
void CreateChildren(std::vector<StreamingDecisionTreeType>& children);
@@ -71,28 +146,42 @@ class HoeffdingSplit
private:
// We need to keep some information for before we have split.
+
+ //! Information for splitting of numeric features (used before split).
std::vector<NumericSplitType<FitnessFunction>> numericSplits;
+ //! Information for splitting of categorical features (used before split).
std::vector<CategoricalSplitType<FitnessFunction>> categoricalSplits;
- // This structure is owned by this node only if it is the root of the tree.
+ //! This structure is owned by this node only if it is the root of the tree.
std::unordered_map<size_t, std::pair<size_t, size_t>>* dimensionMappings;
- // Indicates whether or not we own the mappings.
+ //! Indicates whether or not we own the mappings.
bool ownsMappings;
+ //! The number of samples seen so far by this node.
size_t numSamples;
+ //! The number of classes this node is trained on.
size_t numClasses;
+ //! The maximum number of samples we can see before splitting.
size_t maxSamples;
+ //! The number of samples that should be seen before checking for a split.
size_t checkInterval;
+ //! The dataset information. (We don't own this.)
const data::DatasetInfo* datasetInfo;
+ //! The required probability of success for a split to be performed.
double successProbability;
// And we need to keep some information for after we have split.
+
+ //! The dimension that this node has split on.
size_t splitDimension;
+ //! The majority class of this node.
size_t majorityClass;
+ //! The empirical probability of a point this node saw having the majority
+ //! class.
double majorityProbability;
- // In case it's categorical.
+ //! If the split is categorical, this holds the splitting information.
typename CategoricalSplitType<FitnessFunction>::SplitInfo categoricalSplit;
- // In case it's numeric.
+ //! If the split is numeric, this holds the splitting information.
typename NumericSplitType<FitnessFunction>::SplitInfo numericSplit;
};
More information about the mlpack-git
mailing list