[mlpack-git] master: Force splitting after enough samples. (d48fd3f)
gitdub at big.cc.gt.atl.ga.us
gitdub at big.cc.gt.atl.ga.us
Wed Dec 23 11:43:01 EST 2015
Repository : https://github.com/mlpack/mlpack
On branch : master
Link : https://github.com/mlpack/mlpack/compare/de9cc4b05069e1fa4793d9355f2f595af5ff45d2...6070527af14296cd99739de6c62666cc5d2a2125
>---------------------------------------------------------------
commit d48fd3fd9f185bbb1e596dae1f48b4fcb6093cd9
Author: ryan <ryan at ratml.org>
Date: Tue Sep 29 22:21:19 2015 -0400
Force splitting after enough samples.
>---------------------------------------------------------------
d48fd3fd9f185bbb1e596dae1f48b4fcb6093cd9
src/mlpack/core/data/load_impl.hpp | 4 ++--
.../hoeffding_categorical_split_impl.hpp | 1 +
.../hoeffding_numeric_split_impl.hpp | 1 +
.../methods/hoeffding_trees/hoeffding_split.hpp | 4 +++-
.../hoeffding_trees/hoeffding_split_impl.hpp | 24 ++++++++++++++--------
.../streaming_decision_tree_impl.hpp | 5 +++--
.../streaming_decision_tree_main.cpp | 18 ++++++++++++++++
.../naive_bayes/naive_bayes_classifier_impl.hpp | 4 ++++
src/mlpack/tests/hoeffding_tree_test.cpp | 10 ++++-----
9 files changed, 52 insertions(+), 19 deletions(-)
diff --git a/src/mlpack/core/data/load_impl.hpp b/src/mlpack/core/data/load_impl.hpp
index 4f953b5..5497063 100644
--- a/src/mlpack/core/data/load_impl.hpp
+++ b/src/mlpack/core/data/load_impl.hpp
@@ -369,11 +369,11 @@ bool Load(const std::string& filename,
eT val = eT(0);
token >> val;
- if (token.fail())
+// if (token.fail())
{
// Conversion failed; but it may be a NaN or inf. Armadillo has
// convenient functions to check.
- if (!arma::diskio::convert_naninf(val, token.str()))
+// if (!arma::diskio::convert_naninf(val, token.str()))
{
// We need to perform a mapping.
const size_t dim = (transpose) ? col : row;
diff --git a/src/mlpack/methods/hoeffding_trees/hoeffding_categorical_split_impl.hpp b/src/mlpack/methods/hoeffding_trees/hoeffding_categorical_split_impl.hpp
index 32d2375..e86428b 100644
--- a/src/mlpack/methods/hoeffding_trees/hoeffding_categorical_split_impl.hpp
+++ b/src/mlpack/methods/hoeffding_trees/hoeffding_categorical_split_impl.hpp
@@ -36,6 +36,7 @@ template<typename FitnessFunction>
double HoeffdingCategoricalSplit<FitnessFunction>::EvaluateFitnessFunction()
const
{
+ Log::Debug << sufficientStatistics.t();
return FitnessFunction::Evaluate(sufficientStatistics);
}
diff --git a/src/mlpack/methods/hoeffding_trees/hoeffding_numeric_split_impl.hpp b/src/mlpack/methods/hoeffding_trees/hoeffding_numeric_split_impl.hpp
index 186de09..d55e99b 100644
--- a/src/mlpack/methods/hoeffding_trees/hoeffding_numeric_split_impl.hpp
+++ b/src/mlpack/methods/hoeffding_trees/hoeffding_numeric_split_impl.hpp
@@ -86,6 +86,7 @@ template<typename FitnessFunction, typename ObservationType>
double HoeffdingNumericSplit<FitnessFunction, ObservationType>::
EvaluateFitnessFunction() const
{
+ Log::Debug << sufficientStatistics.t();
if (samplesSeen < observationsBeforeBinning)
return 0.0;
else
diff --git a/src/mlpack/methods/hoeffding_trees/hoeffding_split.hpp b/src/mlpack/methods/hoeffding_trees/hoeffding_split.hpp
index 3c1e9ff..83a6d14 100644
--- a/src/mlpack/methods/hoeffding_trees/hoeffding_split.hpp
+++ b/src/mlpack/methods/hoeffding_trees/hoeffding_split.hpp
@@ -26,7 +26,8 @@ class HoeffdingSplit
HoeffdingSplit(const size_t dimensionality,
const size_t numClasses,
const data::DatasetInfo& datasetInfo,
- const double successProbability);
+ const double successProbability,
+ const size_t maxSamples);
template<typename VecType>
void Train(const VecType& point, const size_t label);
@@ -57,6 +58,7 @@ class HoeffdingSplit
size_t numSamples;
size_t numClasses;
+ size_t maxSamples;
arma::Col<size_t> classCounts;
const data::DatasetInfo& datasetInfo;
double successProbability;
diff --git a/src/mlpack/methods/hoeffding_trees/hoeffding_split_impl.hpp b/src/mlpack/methods/hoeffding_trees/hoeffding_split_impl.hpp
index 765a197..cb007b1 100644
--- a/src/mlpack/methods/hoeffding_trees/hoeffding_split_impl.hpp
+++ b/src/mlpack/methods/hoeffding_trees/hoeffding_split_impl.hpp
@@ -20,9 +20,11 @@ HoeffdingSplit<
>::HoeffdingSplit(const size_t dimensionality,
const size_t numClasses,
const data::DatasetInfo& datasetInfo,
- const double successProbability) :
+ const double successProbability,
+ const size_t maxSamples) :
numSamples(0),
numClasses(numClasses),
+ maxSamples(maxSamples),
classCounts(arma::zeros<arma::Col<size_t>>(numClasses)),
datasetInfo(datasetInfo),
successProbability(successProbability),
@@ -110,6 +112,7 @@ size_t HoeffdingSplit<
{
size_t type = dimensionMappings[i].first;
size_t index = dimensionMappings[i].second;
+ Log::Warn << "Evaluate fitness function for dimension " << i << ".\n";
if (type == data::Datatype::categorical)
gains[i] = categoricalSplits[index].EvaluateFitnessFunction();
else if (type == data::Datatype::numeric)
@@ -134,8 +137,10 @@ size_t HoeffdingSplit<
}
}
+ Log::Warn << "Split check (" << numSamples << "): largest " << largest << ", "
+ << "second largest " << secondLargest << ", epsilon " << epsilon << ".\n";
// Are these far enough apart to split?
- if (largest - secondLargest > epsilon)
+ if (largest - secondLargest > epsilon || numSamples > maxSamples)
{
// Split!
splitDimension = largestIndex;
@@ -219,16 +224,17 @@ void HoeffdingSplit<
}
// Create the children.
- if (datasetInfo.Type(splitDimension) == data::Datatype::numeric)
+ if (dimensionMappings[splitDimension].first == data::Datatype::categorical)
{
- numericSplits[numericSplitIndex].CreateChildren(children, datasetInfo,
- numericSplits.size() + categoricalSplits.size(), numericSplit);
+ categoricalSplits[dimensionMappings[splitDimension].second].CreateChildren(
+ children, datasetInfo, numericSplits.size() + categoricalSplits.size(),
+ categoricalSplit);
}
- else if (datasetInfo.Type(splitDimension) == data::Datatype::categorical)
+ else if (dimensionMappings[splitDimension].first == data::Datatype::numeric)
{
- categoricalSplits[categoricalSplitIndex].CreateChildren(children,
- datasetInfo, numericSplits.size() + categoricalSplits.size(),
- categoricalSplit);
+ numericSplits[dimensionMappings[splitDimension].second].CreateChildren(
+ children, datasetInfo, numericSplits.size() + categoricalSplits.size(),
+ numericSplit);
}
}
diff --git a/src/mlpack/methods/hoeffding_trees/streaming_decision_tree_impl.hpp b/src/mlpack/methods/hoeffding_trees/streaming_decision_tree_impl.hpp
index 295c237..99c2d8e 100644
--- a/src/mlpack/methods/hoeffding_trees/streaming_decision_tree_impl.hpp
+++ b/src/mlpack/methods/hoeffding_trees/streaming_decision_tree_impl.hpp
@@ -19,7 +19,7 @@ StreamingDecisionTree<SplitType, MatType>::StreamingDecisionTree(
const data::DatasetInfo& datasetInfo,
const arma::Row<size_t>& labels,
const size_t numClasses) :
- split(data.n_rows, numClasses, datasetInfo, 0.95)
+ split(data.n_rows, numClasses, datasetInfo, 0.95, 5000)
{
Train(data, labels);
}
@@ -29,7 +29,7 @@ StreamingDecisionTree<SplitType, MatType>::StreamingDecisionTree(
const data::DatasetInfo& datasetInfo,
const size_t dimensionality,
const size_t numClasses) :
- split(dimensionality, numClasses, datasetInfo, 0.95)
+ split(dimensionality, numClasses, datasetInfo, 0.95, 5000)
{
// No training. Anything else to do...?
}
@@ -103,6 +103,7 @@ void StreamingDecisionTree<SplitType, MatType>::Classify(
const MatType& data,
arma::Row<size_t>& predictions)
{
+ predictions.set_size(data.n_cols);
for (size_t i = 0; i < data.n_cols; ++i)
predictions[i] = Classify(data.col(i));
}
diff --git a/src/mlpack/methods/hoeffding_trees/streaming_decision_tree_main.cpp b/src/mlpack/methods/hoeffding_trees/streaming_decision_tree_main.cpp
index 9d6226c..c541218 100644
--- a/src/mlpack/methods/hoeffding_trees/streaming_decision_tree_main.cpp
+++ b/src/mlpack/methods/hoeffding_trees/streaming_decision_tree_main.cpp
@@ -17,6 +17,10 @@ using namespace mlpack::data;
PARAM_STRING_REQ("training_file", "Training dataset file.", "t");
PARAM_STRING("labels_file", "Labels for training dataset.", "l", "");
+PARAM_DOUBLE("confidence", "Confidence before splitting (between 0 and 1).",
+ "c", 0.95);
+
+
int main(int argc, char** argv)
{
CLI::ParseCommandLine(argc, argv);
@@ -27,6 +31,9 @@ int main(int argc, char** argv)
arma::mat trainingSet;
DatasetInfo datasetInfo;
data::Load(trainingFile, trainingSet, datasetInfo, true);
+ for (size_t i = 0; i < trainingSet.n_rows; ++i)
+ Log::Info << datasetInfo.NumMappings(i) << " mappings in dimension " << i <<
+".\n";
arma::Col<size_t> labelsIn;
data::Load(labelsFile, labelsIn, true, false);
@@ -54,4 +61,15 @@ int main(int argc, char** argv)
for (size_t i = 0; i < node->NumChildren(); ++i)
stack.push(&node->Child(i));
}
+
+ // Check the accuracy on the training set.
+ arma::Row<size_t> predictedLabels;
+ tree.Classify(trainingSet, predictedLabels);
+
+ size_t correct = 0;
+ for (size_t i = 0; i < predictedLabels.n_elem; ++i)
+ if (labels[i] == predictedLabels[i])
+ ++correct;
+
+ Log::Info << correct << " correct out of " << predictedLabels.n_elem << ".\n";
}
diff --git a/src/mlpack/methods/naive_bayes/naive_bayes_classifier_impl.hpp b/src/mlpack/methods/naive_bayes/naive_bayes_classifier_impl.hpp
index 5cd4fb9..119d073 100644
--- a/src/mlpack/methods/naive_bayes/naive_bayes_classifier_impl.hpp
+++ b/src/mlpack/methods/naive_bayes/naive_bayes_classifier_impl.hpp
@@ -98,6 +98,10 @@ NaiveBayesClassifier<MatType>::NaiveBayesClassifier(
variances[i] = 1e-50;
probabilities /= data.n_cols;
+
+ Log::Info << "probabilities:\n" << probabilities.t();
+ Log::Info << "means:\n" << means.t();
+ Log::Info << "variances:\n" << variances.t();
}
template<typename MatType>
diff --git a/src/mlpack/tests/hoeffding_tree_test.cpp b/src/mlpack/tests/hoeffding_tree_test.cpp
index 8ad9b40..4bc7a0b 100644
--- a/src/mlpack/tests/hoeffding_tree_test.cpp
+++ b/src/mlpack/tests/hoeffding_tree_test.cpp
@@ -238,7 +238,7 @@ BOOST_AUTO_TEST_CASE(HoeffdingSplitNoSplitTest)
info.MapString("cat1", 2);
info.MapString("cat2", 2);
- HoeffdingSplit<> split(3, 2, info, 0.95);
+ HoeffdingSplit<> split(3, 2, info, 0.95, 5000);
// Feed it samples.
for (size_t i = 0; i < 1000; ++i)
@@ -269,7 +269,7 @@ BOOST_AUTO_TEST_CASE(HoeffdingSplitEasySplitTest)
info.MapString("cat1", 0);
info.MapString("cat0", 1);
- HoeffdingSplit<> split(2, 2, info, 0.95);
+ HoeffdingSplit<> split(2, 2, info, 0.95, 5000);
// Feed samples from each class.
for (size_t i = 0; i < 500; ++i)
@@ -297,7 +297,7 @@ BOOST_AUTO_TEST_CASE(HoeffdingSplitProbability1SplitTest)
info.MapString("cat1", 0);
info.MapString("cat0", 1);
- HoeffdingSplit<> split(2, 2, info, 1.0);
+ HoeffdingSplit<> split(2, 2, info, 1.0, 5000);
// Feed samples from each class.
for (size_t i = 0; i < 5000; ++i)
@@ -325,7 +325,7 @@ BOOST_AUTO_TEST_CASE(HoeffdingSplitAlmostPerfectSplit)
info.MapString("cat0", 1);
info.MapString("cat1", 1);
- HoeffdingSplit<> split(2, 2, info, 0.95);
+ HoeffdingSplit<> split(2, 2, info, 0.95, 5000);
// Feed samples.
for (size_t i = 0; i < 500; ++i)
@@ -360,7 +360,7 @@ BOOST_AUTO_TEST_CASE(HoeffdingSplitEqualSplitTest)
info.MapString("cat0", 1);
info.MapString("cat1", 1);
- HoeffdingSplit<> split(2, 2, info, 0.95);
+ HoeffdingSplit<> split(2, 2, info, 0.95, 5000);
// Feed samples.
for (size_t i = 0; i < 500; ++i)
More information about the mlpack-git mailing list