[mlpack-git] master: A first pass at the abstractions for VFDT. (f8be407)
gitdub at big.cc.gt.atl.ga.us
gitdub at big.cc.gt.atl.ga.us
Wed Dec 23 11:41:51 EST 2015
Repository : https://github.com/mlpack/mlpack
On branch : master
Link : https://github.com/mlpack/mlpack/compare/de9cc4b05069e1fa4793d9355f2f595af5ff45d2...6070527af14296cd99739de6c62666cc5d2a2125
>---------------------------------------------------------------
commit f8be40775e638698390547f99d740190e3f93367
Author: ryan <ryan at ratml.org>
Date: Tue Sep 15 18:29:15 2015 -0400
A first pass at the abstractions for VFDT.
>---------------------------------------------------------------
f8be40775e638698390547f99d740190e3f93367
.../hoeffding_categorical_split.hpp | 31 +++++++++
.../methods/hoeffding_trees/hoeffding_split.hpp | 50 +++++++++++++++
.../hoeffding_trees/streaming_decision_tree.hpp | 74 ++++++++++++++++++++++
3 files changed, 155 insertions(+)
diff --git a/src/mlpack/methods/hoeffding_trees/hoeffding_categorical_split.hpp b/src/mlpack/methods/hoeffding_trees/hoeffding_categorical_split.hpp
new file mode 100644
index 0000000..6d6f9e7
--- /dev/null
+++ b/src/mlpack/methods/hoeffding_trees/hoeffding_categorical_split.hpp
@@ -0,0 +1,31 @@
+/**
+ * @file hoeffding_categorical_split.hpp
+ * @author Ryan Curtin
+ *
+ * A class that contains the information necessary to perform a categorical
+ * split for Hoeffding trees.
+ */
+#ifndef __MLPACK_METHODS_HOEFFDING_TREES_HOEFFDING_CATEGORICAL_SPLIT_HPP
+#define __MLPACK_METHODS_HOEFFDING_TREES_HOEFFDING_CATEGORICAL_SPLIT_HPP
+
+namespace mlpack {
+namespace tree {
+
+template<typename FitnessFunction>
+class HoeffdingCategoricalSplit // NOTE(review): this header has no #include.
+{
+ public:
+ HoeffdingCategoricalSplit(const size_t numCategories, const size_t numClasses);
+ // Train() takes one value of type eT and its label; declared only here.
+ template<typename eT>
+ void Train(eT value, const size_t label);
+ // Returns a double; how FitnessFunction is applied is not shown in this patch.
+ double EvaluateFitnessFunction() const;
+ private:
+ arma::Mat<size_t> sufficientStatistics; // Presumably per-category, per-class counts; confirm.
+};
+
+} // namespace tree
+} // namespace mlpack
+
+#endif
diff --git a/src/mlpack/methods/hoeffding_trees/hoeffding_split.hpp b/src/mlpack/methods/hoeffding_trees/hoeffding_split.hpp
new file mode 100644
index 0000000..fa18d09
--- /dev/null
+++ b/src/mlpack/methods/hoeffding_trees/hoeffding_split.hpp
@@ -0,0 +1,50 @@
+/**
+ * @file hoeffding_split.hpp
+ * @author Ryan Curtin
+ *
+ * An implementation of the standard Hoeffding bound split by Pedro Domingos and
+ * Geoff Hulten in ``Mining High-Speed Data Streams''.
+ */
+#ifndef __MLPACK_METHODS_HOEFFDING_TREES_HOEFFDING_SPLIT_HPP
+#define __MLPACK_METHODS_HOEFFDING_TREES_HOEFFDING_SPLIT_HPP
+
+namespace mlpack {
+namespace tree {
+
+template<typename FitnessFunction,
+ typename NumericSplitType,
+ typename CategoricalSplitType>
+class HoeffdingSplit // Declarations only; no member is defined in this patch.
+{
+ public:
+ HoeffdingSplit(const size_t dimensionality,
+ const size_t numClasses,
+ const DatasetInfo& datasetInfo);
+ // NOTE(review): point is a non-const reference; confirm Train() must modify it.
+ template<typename VecType>
+ void Train(VecType& point, const size_t label);
+
+ // Returns 0 if a split should not happen; the number of splits otherwise.
+ size_t SplitCheck() const;
+
+ // Returns the index that the given point should go towards.
+ template<typename VecType>
+ size_t CalculateDirection(VecType& point) const;
+
+ private:
+ // Information kept before the split has happened.
+ std::vector<NumericSplitType> numericSplits;
+ std::vector<CategoricalSplitType> categoricalSplits;
+ // Reference member: the referenced DatasetInfo must outlive this object.
+ const DatasetInfo& datasetInfo;
+
+ // Information kept after the split; both split types must expose SplitInfo.
+ size_t splitDimension;
+ typename CategoricalSplitType::SplitInfo categoricalSplit; // In case it's categorical.
+ typename NumericSplitType::SplitInfo numericSplit; // In case it's numeric.
+};
+
+} // namespace tree
+} // namespace mlpack
+
+#endif
diff --git a/src/mlpack/methods/hoeffding_trees/streaming_decision_tree.hpp b/src/mlpack/methods/hoeffding_trees/streaming_decision_tree.hpp
new file mode 100644
index 0000000..fe8bc8e
--- /dev/null
+++ b/src/mlpack/methods/hoeffding_trees/streaming_decision_tree.hpp
@@ -0,0 +1,74 @@
+/**
+ * @file streaming_decision_tree.hpp
+ * @author Ryan Curtin
+ *
+ * The core class for a streaming decision tree.
+ */
+#ifndef __MLPACK_METHODS_HOEFFDING_TREES_STREAMING_DECISION_TREE_HPP
+#define __MLPACK_METHODS_HOEFFDING_TREES_STREAMING_DECISION_TREE_HPP
+
+#include <mlpack/core.hpp>
+
+namespace mlpack {
+namespace tree {
+
+template<
+ typename SplitType,
+ typename MatType = arma::mat
+>
+class StreamingDecisionTree
+{
+ public:
+ StreamingDecisionTree(const MatType& data, const arma::Row<size_t>& labels);
+
+ StreamingDecisionTree();
+ // NOTE(review): copy constructor and destructor declared, but no operator=.
+ StreamingDecisionTree(const StreamingDecisionTree& other);
+
+ ~StreamingDecisionTree();
+ // Child() indexes children with operator[]; i is not range-checked.
+ size_t NumChildren() const { return children.size(); }
+ StreamingDecisionTree* Child(const size_t i) { return children[i]; }
+ const StreamingDecisionTree* Child(const size_t i) const { return children[i];
+}
+
+ template<typename VecType>
+ void Train(const VecType& data, const size_t label);
+
+ void Train(const MatType& data, const arma::Row<size_t>& labels);
+
+ template<typename VecType>
+ size_t Predict(const VecType& data);
+
+ void Predict(const MatType& data, arma::Row<size_t>& predictions);
+
+ // Open design question: how do we encode the actual split itself?
+
+ // Possibly just a split dimension and a rule (categorical or numeric).
+
+ private:
+ std::vector<StreamingDecisionTree*> children; // Raw pointers; ownership not shown.
+ // NOTE(review): the *SplitType names below are not template parameters here.
+ DatasetInfo info;
+ size_t splitDimension;
+ NumericSplitType* numericSplit;
+ CategoricalSplitType* categoricalSplit;
+
+ SplitType split; // Open question: hide it in the split?
+ // Sketch of what split must provide: Dimension() and
+ //
+ // template<typename VecType>
+ // StreamingDecisionTree* MakeDecision(const VecType& point);
+ //
+ // template<typename VecType>
+ // void Train(const VecType& data, const size_t label);
+ //
+ // Datatype SplitType() const;
+ //
+ //
+};
+
+} // namespace tree
+} // namespace mlpack
+
+#endif
More information about the mlpack-git mailing list