[mlpack-git] master: A first pass at the abstractions for VFDT. (f8be407)

gitdub at big.cc.gt.atl.ga.us gitdub at big.cc.gt.atl.ga.us
Wed Dec 23 11:41:51 EST 2015


Repository : https://github.com/mlpack/mlpack

On branch  : master
Link       : https://github.com/mlpack/mlpack/compare/de9cc4b05069e1fa4793d9355f2f595af5ff45d2...6070527af14296cd99739de6c62666cc5d2a2125

>---------------------------------------------------------------

commit f8be40775e638698390547f99d740190e3f93367
Author: ryan <ryan at ratml.org>
Date:   Tue Sep 15 18:29:15 2015 -0400

    A first pass at the abstractions for VFDT.


>---------------------------------------------------------------

f8be40775e638698390547f99d740190e3f93367
 .../hoeffding_categorical_split.hpp                | 31 +++++++++
 .../methods/hoeffding_trees/hoeffding_split.hpp    | 50 +++++++++++++++
 .../hoeffding_trees/streaming_decision_tree.hpp    | 74 ++++++++++++++++++++++
 3 files changed, 155 insertions(+)

diff --git a/src/mlpack/methods/hoeffding_trees/hoeffding_categorical_split.hpp b/src/mlpack/methods/hoeffding_trees/hoeffding_categorical_split.hpp
new file mode 100644
index 0000000..6d6f9e7
--- /dev/null
+++ b/src/mlpack/methods/hoeffding_trees/hoeffding_categorical_split.hpp
@@ -0,0 +1,31 @@
+/**
+ * @file hoeffding_categorical_split.hpp
+ * @author Ryan Curtin
+ *
+ * A class that contains the information necessary to perform a categorical
+ * split for Hoeffding trees.
+ */
+#ifndef __MLPACK_METHODS_HOEFFDING_TREES_HOEFFDING_CATEGORICAL_SPLIT_HPP
+#define __MLPACK_METHODS_HOEFFDING_TREES_HOEFFDING_CATEGORICAL_SPLIT_HPP
+
+namespace mlpack {
+namespace tree {
+
+template<typename FitnessFunction> // Not referenced by the declarations below.
+class HoeffdingCategoricalSplit // Information needed for a categorical split.
+{
+ public:
+  HoeffdingCategoricalSplit(const size_t numCategories, const size_t numClasses);
+
+  template<typename eT> // eT is the type of the value handed to Train().
+  void Train(eT value, const size_t label); // Takes one value and its label.
+
+  double EvaluateFitnessFunction() const; // Declared only; not defined here.
+ private:
+  arma::Mat<size_t> sufficientStatistics; // Presumably per-class, per-category counts; TODO confirm.
+};
+
+} // namespace tree
+} // namespace mlpack
+
+#endif
diff --git a/src/mlpack/methods/hoeffding_trees/hoeffding_split.hpp b/src/mlpack/methods/hoeffding_trees/hoeffding_split.hpp
new file mode 100644
index 0000000..fa18d09
--- /dev/null
+++ b/src/mlpack/methods/hoeffding_trees/hoeffding_split.hpp
@@ -0,0 +1,50 @@
+/**
+ * @file hoeffding_split.hpp
+ * @author Ryan Curtin
+ *
+ * An implementation of the standard Hoeffding bound split described by Pedro
+ * Domingos and Geoff Hulten in "Mining High-Speed Data Streams".
+ */
+#ifndef __MLPACK_METHODS_HOEFFDING_TREES_HOEFFDING_SPLIT_HPP
+#define __MLPACK_METHODS_HOEFFDING_TREES_HOEFFDING_SPLIT_HPP
+
+namespace mlpack {
+namespace tree {
+
+template<typename FitnessFunction,
+         typename NumericSplitType, // Must provide a nested SplitInfo type.
+         typename CategoricalSplitType> // Must provide a nested SplitInfo type.
+class HoeffdingSplit // Declarations only; no member is defined in this class body.
+{
+ public:
+  HoeffdingSplit(const size_t dimensionality,
+                 const size_t numClasses,
+                 const DatasetInfo& datasetInfo);
+
+  template<typename VecType>
+  void Train(VecType& point, const size_t label); // point is taken by non-const reference.
+
+  // Returns 0 if a split should not happen; otherwise the number of splits.
+  size_t SplitCheck() const;
+
+  // Returns the index that we should go towards for the given point.
+  template<typename VecType>
+  size_t CalculateDirection(VecType& point) const;
+
+ private:
+  // Information we need to keep for use before we have split.
+  std::vector<NumericSplitType> numericSplits;
+  std::vector<CategoricalSplitType> categoricalSplits;
+
+  const DatasetInfo& datasetInfo; // NOTE(review): reference member; the referent must outlive this object.
+
+  // Information we need to keep for use after we have split.
+  size_t splitDimension;
+  typename CategoricalSplitType::SplitInfo categoricalSplit; // Used in case the split is categorical.
+  typename NumericSplitType::SplitInfo numericSplit; // Used in case the split is numeric.
+};
+
+} // namespace tree
+} // namespace mlpack
+
+#endif
diff --git a/src/mlpack/methods/hoeffding_trees/streaming_decision_tree.hpp b/src/mlpack/methods/hoeffding_trees/streaming_decision_tree.hpp
new file mode 100644
index 0000000..fe8bc8e
--- /dev/null
+++ b/src/mlpack/methods/hoeffding_trees/streaming_decision_tree.hpp
@@ -0,0 +1,74 @@
+/**
+ * @file streaming_decision_tree.hpp
+ * @author Ryan Curtin
+ *
+ * The core class for a streaming decision tree.
+ */
+#ifndef __MLPACK_METHODS_HOEFFDING_TREES_STREAMING_DECISION_TREE_HPP
+#define __MLPACK_METHODS_HOEFFDING_TREES_STREAMING_DECISION_TREE_HPP
+
+#include <mlpack/core.hpp>
+
+namespace mlpack {
+namespace tree {
+
+template<
+  typename SplitType, // Type of the `split` member; see the requirements listed at the end of the class.
+  typename MatType = arma::mat
+>
+class StreamingDecisionTree // The core class for a streaming decision tree.
+{
+ public:
+  StreamingDecisionTree(const MatType& data, const arma::Row<size_t>& labels);
+
+  StreamingDecisionTree();
+
+  StreamingDecisionTree(const StreamingDecisionTree& other); // NOTE(review): no copy assignment operator is declared alongside this.
+
+  ~StreamingDecisionTree();
+
+  size_t NumChildren() const { return children.size(); } // Number of stored child pointers.
+  StreamingDecisionTree* Child(const size_t i) { return children[i]; } // No bounds check on i.
+  const StreamingDecisionTree* Child(const size_t i) const { return children[i];
+} // Closes the const Child() overload that starts on the previous line.
+
+  template<typename VecType>
+  void Train(const VecType& data, const size_t label); // Takes a single point and its label.
+
+  void Train(const MatType& data, const arma::Row<size_t>& labels); // Takes a data matrix and a row of labels.
+
+  template<typename VecType>
+  size_t Predict(const VecType& data); // NOTE(review): not declared const.
+
+  void Predict(const MatType& data, arma::Row<size_t>& predictions); // Results go to the `predictions` out-parameter.
+
+  // Open design question: how do we encode the actual split itself?
+
+  // So far, that is just a split dimension and a rule (categorical or numeric).
+
+ private:
+  std::vector<StreamingDecisionTree*> children; // Raw pointers; ownership is not visible from this declaration.
+
+  DatasetInfo info;
+  size_t splitDimension;
+  NumericSplitType* numericSplit; // NOTE(review): NumericSplitType is not a template parameter of this class.
+  CategoricalSplitType* categoricalSplit; // NOTE(review): CategoricalSplitType is not a template parameter of this class.
+
+  SplitType split; // Open question: hide it in the split?
+  // The split must provide Dimension() and the following members:
+  //
+  // template<typename VecType>
+  // StreamingDecisionTree* MakeDecision(const VecType& point);
+  //
+  // template<typename VecType>
+  // void Train(const VecType& data, const size_t label);
+  //
+  // Datatype SplitType() const;
+  //
+  // (The list of requirements is unfinished.)
+};
+
+} // namespace tree
+} // namespace mlpack
+
+#endif



More information about the mlpack-git mailing list