[mlpack-git] master: Implement numeric feature support (though stupidly). (1f936fd)
gitdub at big.cc.gt.atl.ga.us
gitdub at big.cc.gt.atl.ga.us
Wed Dec 23 11:42:48 EST 2015
Repository : https://github.com/mlpack/mlpack
On branch : master
Link : https://github.com/mlpack/mlpack/compare/de9cc4b05069e1fa4793d9355f2f595af5ff45d2...6070527af14296cd99739de6c62666cc5d2a2125
>---------------------------------------------------------------
commit 1f936fd9a0c35cfca31de53842d950281ed335e9
Author: ryan <ryan at ratml.org>
Date: Thu Sep 24 16:07:27 2015 -0400
Implement numeric feature support (though stupidly).
Testing is not yet completed.
>---------------------------------------------------------------
1f936fd9a0c35cfca31de53842d950281ed335e9
src/mlpack/methods/hoeffding_trees/CMakeLists.txt | 1 +
.../hoeffding_trees/hoeffding_numeric_split.hpp | 62 +++++++--
.../hoeffding_numeric_split_impl.hpp | 141 +++++++++++++++++++++
.../hoeffding_trees/hoeffding_split_impl.hpp | 1 +
.../methods/hoeffding_trees/numeric_split_info.hpp | 19 ++-
src/mlpack/tests/hoeffding_tree_test.cpp | 37 +++++-
6 files changed, 247 insertions(+), 14 deletions(-)
diff --git a/src/mlpack/methods/hoeffding_trees/CMakeLists.txt b/src/mlpack/methods/hoeffding_trees/CMakeLists.txt
index 1e583f3..b6e6258 100644
--- a/src/mlpack/methods/hoeffding_trees/CMakeLists.txt
+++ b/src/mlpack/methods/hoeffding_trees/CMakeLists.txt
@@ -6,6 +6,7 @@ set(SOURCES
hoeffding_categorical_split.hpp
hoeffding_categorical_split_impl.hpp
hoeffding_numeric_split.hpp
+ hoeffding_numeric_split_impl.hpp
hoeffding_split.hpp
hoeffding_split_impl.hpp
numeric_split_info.hpp
diff --git a/src/mlpack/methods/hoeffding_trees/hoeffding_numeric_split.hpp b/src/mlpack/methods/hoeffding_trees/hoeffding_numeric_split.hpp
index bda7744..5b3df29 100644
--- a/src/mlpack/methods/hoeffding_trees/hoeffding_numeric_split.hpp
+++ b/src/mlpack/methods/hoeffding_trees/hoeffding_numeric_split.hpp
@@ -2,7 +2,9 @@
* @file hoeffding_numeric_split.hpp
* @author Ryan Curtin
*
- * A numeric feature split for Hoeffding trees. At the moment it does nothing.
+ * A numeric feature split for Hoeffding trees. This is a very simple
+ * implementation based on a minor note in the paper "Mining High-Speed Data
+ * Streams" by Pedro Domingos and Geoff Hulten.
*/
#ifndef __MLPACK_METHODS_HOEFFDING_TREES_HOEFFDING_NUMERIC_SPLIT_HPP
#define __MLPACK_METHODS_HOEFFDING_TREES_HOEFFDING_NUMERIC_SPLIT_HPP
@@ -13,30 +15,72 @@
namespace mlpack {
namespace tree {
-template<typename FitnessFunction>
+/**
+ * The HoeffdingNumericSplit class implements the numeric feature splitting
+ * strategy alluded to by Domingos and Hulten in the following paper:
+ *
+ * @code
+ * @inproceedings{domingos2000mining,
+ * title={{Mining High-Speed Data Streams}},
+ * author={Domingos, P. and Hulten, G.},
+ * year={2000},
+ * booktitle={Proceedings of the Sixth ACM SIGKDD International Conference on
+ * Knowledge Discovery and Data Mining (KDD '00)},
+ * pages={71--80}
+ * }
+ * @endcode
+ *
+ * The strategy alluded to is very simple: we discretize the numeric features
+ * that we see. But in this case, we don't know how many bins we have, which
+ * makes things a little difficult. This class only makes binary splits, and
+ * has a maximum number of bins. The binning strategy is simple: the split
+ * caches the minimum and maximum value of points seen so far, and when the
+ * number of points hits a predefined threshold, the cached minimum-maximum
+ * range is equally split into bins, and splitting proceeds in the same way as
+ * with the categorical splits. This is a simple and stupid strategy, so don't
+ * expect it to be the best possible thing you can do.
+ */
+template<typename FitnessFunction,
+ typename ObservationType = double>
class HoeffdingNumericSplit
{
public:
- typedef NumericSplitInfo SplitInfo;
+ typedef NumericSplitInfo<ObservationType> SplitInfo;
- HoeffdingNumericSplit();
+ HoeffdingNumericSplit(const size_t numClasses,
+ const size_t bins = 10,
+ const size_t observationsBeforeBinning = 100);
- template<typename eT>
- void Train(eT /* value */, const size_t /* label */) { }
+ void Train(ObservationType value, const size_t label);
- double EvaluateFitnessFunction() const { return 0.0; }
+ double EvaluateFitnessFunction() const;
// Does nothing for now.
template<typename StreamingDecisionTreeType>
void CreateChildren(std::vector<StreamingDecisionTreeType>& children,
const data::DatasetInfo& datasetInfo,
const size_t dimensionality,
- SplitInfo& splitInfo) { } // Nothing to do.
+ SplitInfo& splitInfo);
- size_t MajorityClass() const { return 0; } // Nothing yet.
+ size_t MajorityClass() const;
+
+ private:
+ // Cache the values of the points seen before we make bins.
+ arma::Col<ObservationType> observations;
+ arma::Col<size_t> labels;
+
+ arma::Col<ObservationType> splitPoints;
+ size_t bins;
+ size_t observationsBeforeBinning;
+ size_t samplesSeen;
+
+ arma::Mat<size_t> sufficientStatistics;
};
} // namespace tree
} // namespace mlpack
+// Include implementation.
+#include "hoeffding_numeric_split_impl.hpp"
+
#endif
diff --git a/src/mlpack/methods/hoeffding_trees/hoeffding_numeric_split_impl.hpp b/src/mlpack/methods/hoeffding_trees/hoeffding_numeric_split_impl.hpp
new file mode 100644
index 0000000..7d6de50
--- /dev/null
+++ b/src/mlpack/methods/hoeffding_trees/hoeffding_numeric_split_impl.hpp
@@ -0,0 +1,141 @@
+/**
+ * @file hoeffding_numeric_split_impl.hpp
+ * @author Ryan Curtin
+ *
+ * An implementation of the simple HoeffdingNumericSplit class.
+ */
+#ifndef __MLPACK_METHODS_HOEFFDING_TREES_HOEFFDING_NUMERIC_SPLIT_IMPL_HPP
+#define __MLPACK_METHODS_HOEFFDING_TREES_HOEFFDING_NUMERIC_SPLIT_IMPL_HPP
+
+#include "hoeffding_numeric_split.hpp"
+
+namespace mlpack {
+namespace tree {
+
+template<typename FitnessFunction, typename ObservationType>
+HoeffdingNumericSplit<FitnessFunction, ObservationType>::HoeffdingNumericSplit(
+ const size_t numClasses,
+ const size_t bins,
+ const size_t observationsBeforeBinning) :
+ observations(observationsBeforeBinning - 1),
+ labels(observationsBeforeBinning - 1),
+ splitPoints(bins),
+ bins(bins),
+ observationsBeforeBinning(observationsBeforeBinning),
+ samplesSeen(0),
+ sufficientStatistics(arma::zeros<arma::Mat<size_t>>(numClasses, bins))
+{
+ // Nothing to do.
+}
+
+template<typename FitnessFunction, typename ObservationType>
+void HoeffdingNumericSplit<FitnessFunction, ObservationType>::Train(
+ ObservationType value,
+ const size_t label)
+{
+ if (samplesSeen < observationsBeforeBinning - 1)
+ {
+ // Add this to the samples we have seen.
+ observations[samplesSeen] = value;
+ labels[samplesSeen] = label;
+ ++samplesSeen;
+ return;
+ }
+ else if (samplesSeen == observationsBeforeBinning - 1)
+ {
+ // Now we need to make the bins.
+ ObservationType min = value;
+ ObservationType max = value;
+ for (size_t i = 0; i < observationsBeforeBinning - 1; ++i)
+ {
+ if (observations[i] < min)
+ min = observations[i];
+ else if (observations[i] > max)
+ max = observations[i];
+ }
+
+ // Now split these.
+ splitPoints = arma::linspace<arma::Col<ObservationType>>(min, max, bins);
+ ++samplesSeen;
+
+ // Now, add all of the points we've seen to the sufficient statistics.
+ for (size_t i = 0; i < observationsBeforeBinning - 1; ++i)
+ {
+ // What bin does the point fall into?
+ size_t bin = 0;
+ while (observations[i] > splitPoints[bin] && bin < bins - 1)
+ ++bin;
+
+ sufficientStatistics(labels[i], bin)++;
+ }
+ }
+
+ // If we've gotten to here, then we need to add the point to the sufficient
+ // statistics. What bin does the point fall into?
+ size_t bin = 0;
+ while (value > splitPoints[bin] && bin < bins - 1)
+ ++bin;
+
+ sufficientStatistics(label, bin)++;
+}
+
+template<typename FitnessFunction, typename ObservationType>
+double HoeffdingNumericSplit<FitnessFunction, ObservationType>::
+ EvaluateFitnessFunction() const
+{
+ if (samplesSeen < observationsBeforeBinning)
+ return 0.0;
+ else
+ return FitnessFunction::Evaluate(sufficientStatistics);
+}
+
+template<typename FitnessFunction, typename ObservationType>
+template<typename StreamingDecisionTreeType>
+void HoeffdingNumericSplit<FitnessFunction, ObservationType>::CreateChildren(
+ std::vector<StreamingDecisionTreeType>& children,
+ const data::DatasetInfo& datasetInfo,
+ const size_t dimensionality,
+ SplitInfo& splitInfo)
+{
+ // We'll make one child for each bin.
+ for (size_t i = 0; i < sufficientStatistics.n_cols; ++i)
+ children.push_back(StreamingDecisionTreeType(datasetInfo, dimensionality,
+ sufficientStatistics.n_rows));
+
+ // Create the SplitInfo object.
+ splitInfo = SplitInfo(splitPoints);
+}
+
+template<typename FitnessFunction, typename ObservationType>
+size_t HoeffdingNumericSplit<FitnessFunction, ObservationType>::
+ MajorityClass() const
+{
+ // If we haven't yet determined the bins, we must calculate this by hand.
+ if (samplesSeen < observationsBeforeBinning)
+ {
+ arma::Col<size_t> classes(sufficientStatistics.n_rows);
+ classes.zeros();
+
+ for (size_t i = 0; i < samplesSeen; ++i)
+ classes[labels[i]]++;
+
+ arma::uword majorityClass;
+ classes.max(majorityClass);
+ return size_t(majorityClass);
+ }
+ else
+ {
+ // We've calculated the bins, so we can just sum over the sufficient
+ // statistics.
+ arma::Col<size_t> classCounts = sum(sufficientStatistics, 1);
+
+ arma::uword maxIndex;
+ classCounts.max(maxIndex);
+ return size_t(maxIndex);
+ }
+}
+
+} // namespace tree
+} // namespace mlpack
+
+#endif
diff --git a/src/mlpack/methods/hoeffding_trees/hoeffding_split_impl.hpp b/src/mlpack/methods/hoeffding_trees/hoeffding_split_impl.hpp
index 6e98760..2b536f3 100644
--- a/src/mlpack/methods/hoeffding_trees/hoeffding_split_impl.hpp
+++ b/src/mlpack/methods/hoeffding_trees/hoeffding_split_impl.hpp
@@ -27,6 +27,7 @@ HoeffdingSplit<
datasetInfo(datasetInfo),
successProbability(successProbability),
splitDimension(size_t(-1)),
+ numericSplit(),
categoricalSplit(0)
{
for (size_t i = 0; i < dimensionality; ++i)
diff --git a/src/mlpack/methods/hoeffding_trees/numeric_split_info.hpp b/src/mlpack/methods/hoeffding_trees/numeric_split_info.hpp
index b048b4b..c7f2c12 100644
--- a/src/mlpack/methods/hoeffding_trees/numeric_split_info.hpp
+++ b/src/mlpack/methods/hoeffding_trees/numeric_split_info.hpp
@@ -12,14 +12,27 @@
namespace mlpack {
namespace tree {
-// This doesn't do anything yet.
+template<typename ObservationType = double>
class NumericSplitInfo
{
public:
- NumericSplitInfo() { }
+ NumericSplitInfo() { /* Nothing to do. */ }
+ NumericSplitInfo(arma::Col<ObservationType>& splitPoints) :
+ splitPoints(splitPoints) { /* Nothing to do. */ }
template<typename eT>
- static size_t CalculateDirection(const eT& /* value */) { return 0; }
+ size_t CalculateDirection(const eT& value) const
+ {
+ // What bin does the point fall into?
+ size_t bin = 0;
+ while (value > splitPoints[bin] && bin < splitPoints.n_elem - 1)
+ ++bin;
+
+ return bin;
+ }
+
+ private:
+ arma::Col<size_t> splitPoints;
};
} // namespace tree
diff --git a/src/mlpack/tests/hoeffding_tree_test.cpp b/src/mlpack/tests/hoeffding_tree_test.cpp
index 0bab98d..5cbb2d0 100644
--- a/src/mlpack/tests/hoeffding_tree_test.cpp
+++ b/src/mlpack/tests/hoeffding_tree_test.cpp
@@ -417,9 +417,11 @@ BOOST_AUTO_TEST_CASE(StreamingDecisionTreeSimpleDatasetTest)
// Now train two streaming decision trees; one on the whole dataset, and one
// on streaming data.
- StreamingDecisionTree<HoeffdingSplit<>, arma::Mat<size_t>>
+ StreamingDecisionTree<HoeffdingSplit<GiniImpurity,
+ HoeffdingNumericSplit<GiniImpurity, size_t>>, arma::Mat<size_t>>
batchTree(dataset, info, labels, 3);
- StreamingDecisionTree<HoeffdingSplit<>, arma::Mat<size_t>>
+ StreamingDecisionTree<HoeffdingSplit<GiniImpurity,
+ HoeffdingNumericSplit<GiniImpurity, size_t>>, arma::Mat<size_t>>
streamTree(info, 3, 3);
for (size_t i = 0; i < 9000; ++i)
streamTree.Train(dataset.col(i), labels[i]);
@@ -445,4 +447,35 @@ BOOST_AUTO_TEST_CASE(StreamingDecisionTreeSimpleDatasetTest)
}
}
+/**
+ * Test that the HoeffdingNumericSplit class has a fitness function value of 0
+ * before it's seen enough points.
+ */
+BOOST_AUTO_TEST_CASE(HoeffdingNumericSplitFitnessFunctionTest)
+{
+ HoeffdingNumericSplit<GiniImpurity> split(5, 10, 100);
+
+ // The first 99 iterations should not calculate anything. The 100th is where
+ // the counting starts.
+ for (size_t i = 0; i < 99; ++i)
+ {
+ split.Train(mlpack::math::Random(), mlpack::math::RandInt(5));
+ BOOST_REQUIRE_SMALL(split.EvaluateFitnessFunction(), 1e-10);
+ }
+}
+
+/**
+ * Make sure the majority class is correct in the samples before binning.
+ */
+BOOST_AUTO_TEST_CASE(HoeffdingNumericSplitPreBinningMajorityClassTest)
+{
+ HoeffdingNumericSplit<GiniImpurity> split(3, 10, 100);
+
+ for (size_t i = 0; i < 100; ++i)
+ {
+ split.Train(mlpack::math::Random(), 1);
+ BOOST_REQUIRE_EQUAL(split.MajorityClass(), 1);
+ }
+}
+
BOOST_AUTO_TEST_SUITE_END();
More information about the mlpack-git
mailing list