[mlpack-git] master: A pass at VFDT implementation. (0a6b3a2)
gitdub at big.cc.gt.atl.ga.us
gitdub at big.cc.gt.atl.ga.us
Wed Dec 23 11:42:01 EST 2015
Repository : https://github.com/mlpack/mlpack
On branch : master
Link : https://github.com/mlpack/mlpack/compare/de9cc4b05069e1fa4793d9355f2f595af5ff45d2...6070527af14296cd99739de6c62666cc5d2a2125
>---------------------------------------------------------------
commit 0a6b3a210ccc997a5d44ba91888faec2d0d8190d
Author: Ryan Curtin <ryan at ratml.org>
Date: Mon Sep 21 12:30:59 2015 +0000
A pass at VFDT implementation.
>---------------------------------------------------------------
0a6b3a210ccc997a5d44ba91888faec2d0d8190d
.../CMakeLists.txt | 15 +-
.../hoeffding_categorical_split.hpp | 18 +-
.../hoeffding_categorical_split_impl.hpp | 70 ++++++++
.../hoeffding_trees/hoeffding_numeric_split.hpp | 28 +++
.../methods/hoeffding_trees/hoeffding_split.hpp | 30 +++-
.../hoeffding_trees/hoeffding_split_impl.hpp | 200 +++++++++++++++++++++
.../hoeffding_trees/streaming_decision_tree.hpp | 34 +---
.../streaming_decision_tree_impl.hpp | 100 +++++++++++
8 files changed, 457 insertions(+), 38 deletions(-)
diff --git a/src/mlpack/methods/amf/termination_policies/CMakeLists.txt b/src/mlpack/methods/hoeffding_trees/CMakeLists.txt
similarity index 61%
copy from src/mlpack/methods/amf/termination_policies/CMakeLists.txt
copy to src/mlpack/methods/hoeffding_trees/CMakeLists.txt
index e9aca7a..0d06de2 100644
--- a/src/mlpack/methods/amf/termination_policies/CMakeLists.txt
+++ b/src/mlpack/methods/hoeffding_trees/CMakeLists.txt
@@ -1,12 +1,15 @@
# Define the files we need to compile
# Anything not in this list will not be compiled into MLPACK.
set(SOURCES
- simple_residue_termination.hpp
- simple_tolerance_termination.hpp
- validation_rmse_termination.hpp
- incomplete_incremental_termination.hpp
- complete_incremental_termination.hpp
- max_iteration_termination.hpp
+ categorical_split_info.hpp
+ gini_impurity.hpp
+ hoeffding_categorical_split.hpp
+ hoeffding_categorical_split_impl.hpp
+ hoeffding_numeric_split.hpp
+ hoeffding_split.hpp
+ hoeffding_split_impl.hpp
+ streaming_decision_tree.hpp
+ streaming_decision_tree_impl.hpp
)
# Add directory name to sources.
diff --git a/src/mlpack/methods/hoeffding_trees/hoeffding_categorical_split.hpp b/src/mlpack/methods/hoeffding_trees/hoeffding_categorical_split.hpp
index e870445..14da600 100644
--- a/src/mlpack/methods/hoeffding_trees/hoeffding_categorical_split.hpp
+++ b/src/mlpack/methods/hoeffding_trees/hoeffding_categorical_split.hpp
@@ -8,6 +8,9 @@
#ifndef __MLPACK_METHODS_HOEFFDING_TREES_HOEFFDING_CATEGORICAL_SPLIT_HPP
#define __MLPACK_METHODS_HOEFFDING_TREES_HOEFFDING_CATEGORICAL_SPLIT_HPP
+#include <mlpack/core.hpp>
+#include "categorical_split_info.hpp"
+
namespace mlpack {
namespace tree {
@@ -34,12 +37,22 @@ template<typename FitnessFunction>
class HoeffdingCategoricalSplit
{
public:
- HoeffdingCategoricalSplit(const size_t numCategories, const size_t numClasses);
+ typedef CategoricalSplitInfo SplitInfo;
+
+ HoeffdingCategoricalSplit(const size_t numCategories,
+ const size_t numClasses);
template<typename eT>
void Train(eT value, const size_t label);
double EvaluateFitnessFunction() const;
+
+ template<typename StreamingDecisionTreeType>
+ void CreateChildren(std::vector<StreamingDecisionTreeType*>& children,
+ SplitInfo& splitInfo);
+
+ size_t MajorityClass() const;
+
private:
arma::Mat<size_t> sufficientStatistics;
};
@@ -47,4 +60,7 @@ class HoeffdingCategoricalSplit
} // namespace tree
} // namespace mlpack
+// Include implementation.
+#include "hoeffding_categorical_split_impl.hpp"
+
#endif
diff --git a/src/mlpack/methods/hoeffding_trees/hoeffding_categorical_split_impl.hpp b/src/mlpack/methods/hoeffding_trees/hoeffding_categorical_split_impl.hpp
new file mode 100644
index 0000000..4faf88a
--- /dev/null
+++ b/src/mlpack/methods/hoeffding_trees/hoeffding_categorical_split_impl.hpp
@@ -0,0 +1,70 @@
+/**
+ * @file hoeffding_categorical_split_impl.hpp
+ * @author Ryan Curtin
+ *
+ * Implemental of the HoeffdingCategoricalSplit class.
+ */
+#ifndef __MLPACK_METHODS_HOEFFDING_TREES_HOEFFDING_CATEGORICAL_SPLIT_IMPL_HPP
+#define __MLPACK_METHODS_HOEFFDING_TREES_HOEFFDING_CATEGORICAL_SPLIT_IMPL_HPP
+
+// In case it hasn't been included yet.
+#include "hoeffding_categorical_split.hpp"
+
+namespace mlpack {
+namespace tree {
+
+template<typename FitnessFunction>
+HoeffdingCategoricalSplit<FitnessFunction>::HoeffdingCategoricalSplit(
+ const size_t numCategories,
+ const size_t numClasses) :
+ sufficientStatistics(numClasses, numCategories)
+{
+ sufficientStatistics.zeros();
+}
+
+template<typename eT>
+template<typename FitnessFunction>
+void HoeffdingCategoricalSplit<FitnessFunction>::Train(eT value,
+ const size_t label)
+{
+ // Add this to our counts.
+ // 'value' should be categorical, so we should be able to cast to size_t...
+ sufficientStatistics(label, size_t(value))++;
+}
+
+template<typename FitnessFunction>
+double HoeffdingCategoricalSplit<FitnessFunction>::EvaluateFitnessFunction()
+ const
+{
+ return FitnessFunction::Evaluate(sufficientStatistics);
+}
+
+template<typename StreamingDecisionTreeType>
+template<typename FitnessFunction>
+void HoeffdingCategoricalSplit<FitnessFunction>::CreateChildren(
+ std::vector<StreamingDecisionTreeType*>& children,
+ SplitInfo& splitInfo)
+{
+ // We'll make one child for each category.
+ children.push_back(StreamingDecisionTree(datasetInfo));
+ // Create the according SplitInfo object.
+ splitInfo = SplitInfo(sufficientStatistics.n_cols);
+}
+
+template<typename StreamingDecisionTreeType>
+template<typename FitnessFunction>
+size_t HoeffdingCategoricalSplit<FitnessFunction>::MajorityClass() const
+{
+ // Calculate the class that we have seen the most of.
+ arma::Row<size_t> classCounts = sum(sufficientStatistics);
+
+ arma::uword maxIndex;
+ classCounts.max(maxIndex);
+
+ return size_t(maxIndex);
+}
+
+} // namespace tree
+} // namespace mlpack
+
+#endif
diff --git a/src/mlpack/methods/hoeffding_trees/hoeffding_numeric_split.hpp b/src/mlpack/methods/hoeffding_trees/hoeffding_numeric_split.hpp
new file mode 100644
index 0000000..ce53128
--- /dev/null
+++ b/src/mlpack/methods/hoeffding_trees/hoeffding_numeric_split.hpp
@@ -0,0 +1,28 @@
+/**
+ * @file hoeffding_numeric_split.hpp
+ * @author Ryan Curtin
+ *
+ * A numeric feature split for Hoeffding trees. At the moment it does nothing.
+ */
+#ifndef __MLPACK_METHODS_HOEFFDING_TREES_HOEFFDING_NUMERIC_SPLIT_HPP
+#define __MLPACK_METHODS_HOEFFDING_TREES_HOEFFDING_NUMERIC_SPLIT_HPP
+
+namespace mlpack {
+namespace tree {
+
+template<typename FitnessFunction>
+class HoeffdingNumericSplit
+{
+ public:
+ HoeffdingNumericSplit();
+
+ template<typename eT>
+ void Train(eT /* value */, const size_t /* label */) { }
+
+ double EvaluateFitnessFunction() const { return 0.0; }
+};
+
+} // namespace tree
+} // namespace mlpack
+
+#endif
diff --git a/src/mlpack/methods/hoeffding_trees/hoeffding_split.hpp b/src/mlpack/methods/hoeffding_trees/hoeffding_split.hpp
index fa18d09..1db2847 100644
--- a/src/mlpack/methods/hoeffding_trees/hoeffding_split.hpp
+++ b/src/mlpack/methods/hoeffding_trees/hoeffding_split.hpp
@@ -8,28 +8,41 @@
#ifndef __MLPACK_METHODS_HOEFFDING_TREES_HOEFFDING_SPLIT_HPP
#define __MLPACK_METHODS_HOEFFDING_TREES_HOEFFDING_SPLIT_HPP
+#include <mlpack/core.hpp>
+#include "gini_impurity.hpp"
+#include "hoeffding_numeric_split.hpp"
+#include "hoeffding_categorical_split.hpp"
+
namespace mlpack {
namespace tree {
-template<typename FitnessFunction,
- typename NumericSplitType,
- typename CategoricalSplitType>
+template<typename FitnessFunction = GiniImpurity,
+ typename NumericSplitType = HoeffdingNumericSplit,
+ typename CategoricalSplitType = HoeffdingCategoricalSplit>
class HoeffdingSplit
{
public:
HoeffdingSplit(const size_t dimensionality,
const size_t numClasses,
- const DatasetInfo& datasetInfo);
+ const DatasetInfo& datasetInfo,
+ const double successProbability);
template<typename VecType>
- void Train(VecType& point, const size_t label);
+ void Train(const VecType& point, const size_t label);
// 0 if split should not happen; number of splits otherwise.
size_t SplitCheck() const;
// Return index that we should go towards.
template<typename VecType>
- size_t CalculateDirection(VecType& point) const;
+ size_t CalculateDirection(const VecType& point) const;
+
+ // Classify the point according to the statistics in this node.
+ template<typename VecType>
+ size_t Classify(const VecType& point) const;
+
+ template<typename StreamingDecisionTreeType>
+ void CreateChildren(std::vector<StreamingDecisionTreeType>& children);
private:
// We need to keep some information for before we have split.
@@ -37,9 +50,12 @@ class HoeffdingSplit
std::vector<CategoricalSplitType> categoricalSplits;
const DatasetInfo& datasetInfo;
+ double successProbability;
+ size_t numSamples;
// And we need to keep some information for after we have split.
size_t splitDimension;
+ size_t majorityClass;
typename CategoricalSplitType::SplitInfo categoricalSplit; // In case it's categorical.
typename NumericSplitType::SplitInfo numericSplit; // In case it's numeric.
};
@@ -47,4 +63,6 @@ class HoeffdingSplit
} // namespace tree
} // namespace mlpack
+#include "hoeffding_split_impl.hpp"
+
#endif
diff --git a/src/mlpack/methods/hoeffding_trees/hoeffding_split_impl.hpp b/src/mlpack/methods/hoeffding_trees/hoeffding_split_impl.hpp
new file mode 100644
index 0000000..62499c29
--- /dev/null
+++ b/src/mlpack/methods/hoeffding_trees/hoeffding_split_impl.hpp
@@ -0,0 +1,200 @@
+/**
+ * @file hoeffding_split_impl.hpp
+ * @author Ryan Curtin
+ *
+ * Implementation of the HoeffdingSplit class.
+ */
+#ifndef __MLPACK_METHODS_HOEFFDING_TREES_HOEFFDING_SPLIT_IMPL_HPP
+#define __MLPACK_METHODS_HOEFFDING_TREES_HOEFFDING_SPLIT_IMPL_HPP
+
+namespace mlpack {
+namespace tree {
+
+template<typename FitnessFunction,
+ typename NumericSplitType,
+ typename CategoricalSplitType>
+HoeffdingSplit<
+ FitnessFunction,
+ NumericSplitType,
+ CategoricalSplitType
+>::HoeffdingSplit(const size_t dimensionality,
+ const size_t numClasses,
+ const DatasetInfo& datasetInfo)
+{
+ for (size_t i = 0; i < dimensionality; ++i)
+ {
+ if (datasetInfo.Type(i) == Datatype.categorical)
+ categoricalSplits.push_back(
+ CategoricalSplitType(datasetInfo.NumMappings(), numClasses));
+ // else, numeric splits (not yet!)
+ }
+}
+
+template<typename VecType>
+template<typename FitnessFunction,
+ typename NumericSplitType,
+ typename CategoricalSplitType>
+void HoeffdingSplit<
+ FitnessFunction,
+ NumericSplitType,
+ CategoricalSplitType
+>::Train(VecType& point, const size_t label)
+{
+ if (splitDimension == size_t(-1))
+ {
+ ++numSamples;
+ size_t numericIndex = 0;
+ size_t categoricalIndex = 0;
+ for (size_t i = 0; i < point.n_rows; ++i)
+ {
+ if (datasetInfo.Type(i) == Datatype.categorical)
+ categoricalSplits[categoricalIndex++].Train(point[i], label);
+ else if (datasetInfo.Type(i) == Datatype.numeric)
+ numericSplits[numericIndex++].Train(point[i], label);
+ }
+ }
+ else
+ {
+ // Already split.
+ // But we should probably pass it down anyway.
+ }
+}
+
+template<typename FitnessFunction,
+ typename NumericSplitType,
+ typename CategoricalSplitType>
+size_t HoeffdingSplit<
+ FitnessFunction,
+ NumericSplitType,
+ CategoricalSplitType
+>::SplitCheck() const
+{
+ // Do nothing if we've already split.
+ if (splitDimension == size_t(-1))
+ return 0;
+
+ // Check the fitness of each dimension. Then we'll use a Hoeffding bound
+ // somehow.
+
+ // Calculate epsilon, the value we need things to be greater than.
+ const double rSquared = std::pow(FitnessFunction::Range(numClasses), 2.0);
+ const double epsilon = std::sqrt(rSquared *
+ std::log(1.0 / successProbability) / (2 * numSamples));
+
+ arma::vec gains(categoricalSplits.size() + numericSplits.size());
+ for (size_t i = 0; i < categoricalSplits.size(); ++i)
+ gains[i] = categoricalSplits[i].EvaluateFitnessFunction();
+
+ // Now find the largest and second-largest.
+ double largest = -DBL_MAX;
+ size_t largestIndex = 0;
+ double secondLargest = -DBL_MAX;
+ size_t secondLargestIndex = 0;
+ for (size_t i = 0; i < gains.n_elem; ++i)
+ {
+ if (gains[i] > largest)
+ {
+ secondLargest = largest;
+ secondLargestIndex = largestIndex;
+ largest = gains[i];
+ largestIndex = i;
+ }
+ else if (gains[i] > secondLargest)
+ {
+ secondLargest = gains[i];
+ secondLargestIndex = i;
+ }
+ }
+
+ // Are these far enough apart to split?
+ if (largest - secondLargest > epsilon)
+ {
+ // Split!
+ splitDimension = largestIndex;
+ if (datasetInfo[largestIndex].Type == Datatype.categorical)
+ {
+ // I don't know if this should be here.
+ majorityClass = categoricalSplit[largestIndex].MajorityClass();
+ return datasetInfo[largestIndex].NumMappings();
+ }
+ else
+ {
+ majorityClass = 0;
+ return 0; // I have no idea what to do yet.
+ }
+ }
+}
+
+template<typename VecType>
+template<
+ typename FitnessFunction,
+ typename NumericSplitType,
+ typename CategoricalSplitType
+>
+size_t HoeffdingSplit<
+ FitnessFunction,
+ NumericSplitType,
+ CategoricalSplitType
+>::CalculateDirection(VecType& point) const
+{
+ // Don't call this before the node is split...
+ if (datasetInfo.Type(splitDimension) == Datatype::numeric)
+ return numericSplit.CalculateDirection(point[splitDimension]);
+ else if (datasetInfo.Type(splitDimension) == Datatype::categorical)
+ return categoricalSplit.CalculateDirection(point[splitDimension]);
+}
+
+template<typename VecType>
+template<
+ typename FitnessFunction,
+ typename NumericSplitType,
+ typename CategoricalSplitType
+>
+void HoeffdingSplit<
+ FitnessFunction,
+ NumericSplitType,
+ CategoricalSplitType
+>::Classify(const VecType& point) const
+{
+ // We're a leaf (or being considered a leaf), so classify based on what we
+ // know.
+ return majorityClass;
+}
+
+template<
+ typename FitnessFunction,
+ typename NumericSplitType,
+ typename CategoricalSplitType
+>
+void HoeffdingSplit<
+ FitnessFunction,
+ NumericSplitType,
+ CategoricalSplitType
+>::CreateChildren(std::vector<StreamingDecisionTreeType>& children)
+{
+ // We already know what the splitDimension will be.
+ size_t numericSplitIndex = 0;
+ size_t categoricalSplitIndex = 0;
+ for (size_t i = 0; i < splitDimension; ++i)
+ {
+ if (datasetInfo.Type(i) == Datatype::numeric)
+ ++numericSplitIndex;
+ if (datasetInfo.Type(i) == Datatype::categorical)
+ ++categoricalSplitIndex;
+ }
+
+ if (datasetInfo.Type(splitDimension) == Datatype::numeric)
+ {
+ numericSplits[numericSplitIndex + 1].CreateChildren(children, numericSplit);
+ }
+ else if (datasetInfo.Type(splitDimension) == Datatype::categorical)
+ {
+ categoricalSplits[categoricalSplitIndex + 1].CreateChildren(children,
+ categoricalSplit);
+ }
+}
+
+} // namespace tree
+} // namespace mlpack
+
+#endif
diff --git a/src/mlpack/methods/hoeffding_trees/streaming_decision_tree.hpp b/src/mlpack/methods/hoeffding_trees/streaming_decision_tree.hpp
index fe8bc8e..af2a160 100644
--- a/src/mlpack/methods/hoeffding_trees/streaming_decision_tree.hpp
+++ b/src/mlpack/methods/hoeffding_trees/streaming_decision_tree.hpp
@@ -19,17 +19,17 @@ template<
class StreamingDecisionTree
{
public:
- StreamingDecisionTree(const MatType& data, const arma::Row<size_t>& labels);
+ StreamingDecisionTree(const MatType& data,
+ const data::DatasetInfo& datasetInfo,
+ const arma::Row<size_t>& labels);
- StreamingDecisionTree();
+ StreamingDecisionTree(const data::DatasetInfo& datasetInfo);
StreamingDecisionTree(const StreamingDecisionTree& other);
- ~StreamingDecisionTree();
-
size_t NumChildren() const { return children.size(); }
- StreamingDecisionTree* Child(const size_t i) { return children[i]; }
- const StreamingDecisionTree* Child(const size_t i) const { return children[i];
+ StreamingDecisionTree& Child(const size_t i) { return children[i]; }
+ const StreamingDecisionTree& Child(const size_t i) const { return children[i];
}
template<typename VecType>
@@ -47,25 +47,9 @@ class StreamingDecisionTree
// that's just a split dimension and a rule (categorical or numeric)
private:
- std::vector<StreamingDecisionTree*> children;
-
- DatasetInfo info;
- size_t splitDimension;
- NumericSplitType* numericSplit;
- CategoricalSplitType* categoricalSplit;
-
- SplitType split; // hide it in the split?
- // split must provide Dimension() and
- //
- // template<typename VecType>
- // StreamingDecisionTree* MakeDecision(const VecType& point);
- //
- // template<typename VecType>
- // void Train(const VecType& data, const size_t label);
- //
- // Datatype SplitType() const;
- //
- //
+ std::vector<StreamingDecisionTree> children;
+
+ SplitType split;
};
} // namespace tree
diff --git a/src/mlpack/methods/hoeffding_trees/streaming_decision_tree_impl.hpp b/src/mlpack/methods/hoeffding_trees/streaming_decision_tree_impl.hpp
new file mode 100644
index 0000000..b40a502
--- /dev/null
+++ b/src/mlpack/methods/hoeffding_trees/streaming_decision_tree_impl.hpp
@@ -0,0 +1,100 @@
+/**
+ * @file streaming_decision_tree_impl.hpp
+ * @author Ryan Curtin
+ *
+ * Implementation of a streaming decision tree.
+ */
+#ifndef __MLPACK_METHODS_HOEFFDING_TREES_STREAMING_DECISION_TREE_IMPL_HPP
+#define __MLPACK_METHODS_HOEFFDING_TREES_STREAMING_DECISION_TREE_IMPL_HPP
+
+// In case it hasn't been included yet.
+#include "streaming_decision_tree.hpp"
+
+namespace mlpack {
+namespace tree {
+
+template<typename SplitType, typename MatType>
+StreamingDecisionTree<SplitType, MatType>::StreamingDecisionTree(
+ const MatType& data,
+ const data::DatasetInfo& datasetInfo,
+ const arma::Row<size_t>& labels) :
+ split(datasetInfo)
+{
+ Train(data, labels);
+}
+
+template<typename SplitType, typename MatType>
+StreamingDecisionTree<SplitType, MatType>::StreamingDecisionTree(
+ const data::DatasetInfo& datasetInfo) :
+ split(datasetInfo)
+{
+ // No training. Anything else to do...?
+}
+
+template<typename SplitType, typename MatType>
+StreamingDecisionTree<SplitType, MatType>::StreamingDecisionTree(
+ const StreamingDecisionTree& other)
+{
+ // Copy the children of the other tree.
+}
+
+template<typename VecType>
+template<typename SplitType, typename MatType>
+void StreamingDecisionTree<SplitType, MatType>::Train(const VecType& data,
+ const size_t label)
+{
+ split.Train(data, label);
+
+ const size_t numChildren = split.SplitCheck();
+ if (numChildren > 0)
+ {
+ // We need to add a bunch of children.
+ // Delete children, if we have them.
+ if (children.size() > 0)
+ children.clear();
+
+ // The split knows how to add the children.
+ SplitType.CreateChildren(children);
+ }
+}
+
+template<typename SplitType, typename MatType>
+void StreamingDecisionTree<SplitType, MatType>::Train(
+ const MatType& data,
+ const arma::Row<size_t>& labels)
+{
+ // Train on each point sequentially.
+ for (size_t i = 0; i < data.n_cols; ++i)
+ Train(data.col(i), labels[i]);
+}
+
+template<typename VecType>
+template<typename SplitType, typename MatType>
+size_t StreamingDecisionTree<SplitType, MatType>::Classify(const VecType& data)
+{
+ // Get the direction we need to go, and continue classification.
+ // If we're at a leaf, we don't need to go any deeper.
+ if (children.size() == 0)
+ {
+ return split.Classify(data);
+ }
+ else
+ {
+ const size_t direction = split.CalculateDirection(data);
+ return children[direction].Classify(data);
+ }
+}
+
+template<typename SplitType, typename MatType>
+void StreamingDecisionTree<SplitType, MatType>::Classify(
+ const MatType& data,
+ arma::Row<size_t>& predictions)
+{
+ for (size_t i = 0; i < data.n_cols; ++i)
+ predictions[i] = Classify(data.col(i));
+}
+
+} // namespace tree
+} // namespace mlpack
+
+#endif
More information about the mlpack-git
mailing list