[mlpack-git] master: - All DET sparsification works. (4083784)
gitdub at mlpack.org
gitdub at mlpack.org
Tue Oct 18 04:54:50 EDT 2016
Repository : https://github.com/mlpack/mlpack
On branch : master
Link : https://github.com/mlpack/mlpack/compare/94d14187222231ca29e4f6419c5999c660db4f8a...981ffa2d67d8fe38df6c699589005835fef710ea
>---------------------------------------------------------------
commit 4083784eb8b7ac54fe182aa4565f7f2f25e19cde
Author: theJonan <ivan at jonan.info>
Date: Tue Oct 18 11:54:50 2016 +0300
- All DET sparsification works.
>---------------------------------------------------------------
4083784eb8b7ac54fe182aa4565f7f2f25e19cde
src/mlpack/methods/det/dt_utils_impl.hpp | 29 ++++---
src/mlpack/methods/det/dtree.hpp | 2 +-
src/mlpack/methods/det/dtree_impl.hpp | 9 +-
src/mlpack/tests/det_test.cpp | 137 +++++++++++++++++++++++++++++++
4 files changed, 156 insertions(+), 21 deletions(-)
diff --git a/src/mlpack/methods/det/dt_utils_impl.hpp b/src/mlpack/methods/det/dt_utils_impl.hpp
index cad5289..fcb6b0c 100644
--- a/src/mlpack/methods/det/dt_utils_impl.hpp
+++ b/src/mlpack/methods/det/dt_utils_impl.hpp
@@ -99,11 +99,11 @@ void mlpack::det::PrintVariableImportance(const DTree<MatType, TagType>* dtree,
// folds.
template <typename MatType, typename TagType>
DTree<MatType, TagType>* mlpack::det::Trainer(MatType& dataset,
- const size_t folds,
- const bool useVolumeReg,
- const size_t maxLeafSize,
- const size_t minLeafSize,
- const std::string unprunedTreeOutput)
+ const size_t folds,
+ const bool useVolumeReg,
+ const size_t maxLeafSize,
+ const size_t minLeafSize,
+ const std::string unprunedTreeOutput)
{
// Initialize the tree.
DTree<MatType, TagType> dtree(dataset);
@@ -170,7 +170,7 @@ DTree<MatType, TagType>* mlpack::det::Trainer(MatType& dataset,
<< " " << oldAlpha << "." << std::endl;
MatType cvData(dataset);
- size_t testSize = dataset.n_cols / folds;
+ const size_t testSize = dataset.n_cols / folds;
arma::vec regularizationConstants(prunedSequence.size());
regularizationConstants.fill(0.0);
@@ -181,17 +181,17 @@ DTree<MatType, TagType>* mlpack::det::Trainer(MatType& dataset,
// implementation.
#ifdef _WIN32
#pragma omp parallel for default(none) \
- shared(testSize, cvData, prunedSequence, regularizationConstants, dataset)
+ shared(testSize, cvData, prunedSequence, regularizationConstants)
for (intmax_t fold = 0; fold < (intmax_t) folds; fold++)
#else
#pragma omp parallel for default(none) \
- shared(testSize, cvData, prunedSequence, regularizationConstants, dataset)
+ shared(testSize, cvData, prunedSequence, regularizationConstants)
for (size_t fold = 0; fold < folds; fold++)
#endif
{
// Break up data into train and test sets.
- size_t start = fold * testSize;
- size_t end = std::min((size_t) (fold + 1) * testSize, (size_t) cvData.n_cols);
+ const size_t start = fold * testSize;
+ const size_t end = std::min((size_t) (fold + 1) * testSize, (size_t) cvData.n_cols);
MatType test = cvData.cols(start, end - 1);
MatType train(cvData.n_rows, cvData.n_cols - test.n_cols);
@@ -239,11 +239,10 @@ DTree<MatType, TagType>* mlpack::det::Trainer(MatType& dataset,
}
// Update the cv regularization constant.
- cvRegularizationConstants[i] += 2.0 * cvVal / (double) dataset.n_cols;
+ cvRegularizationConstants[i] += 2.0 * cvVal / (double) cvData.n_cols;
// Determine the new alpha value and prune accordingly.
- double cvOldAlpha = 0.5 * (prunedSequence[i + 1].first +
- prunedSequence[i + 2].first);
+ double cvOldAlpha = 0.5 * (prunedSequence[i + 1].first + prunedSequence[i + 2].first);
cvDTree.PruneAndUpdate(cvOldAlpha, train.n_cols, useVolumeReg);
}
@@ -256,9 +255,9 @@ DTree<MatType, TagType>* mlpack::det::Trainer(MatType& dataset,
}
if (prunedSequence.size() > 2)
- cvRegularizationConstants[prunedSequence.size() - 2] += 2.0 * cvVal / (double) dataset.n_cols;
+ cvRegularizationConstants[prunedSequence.size() - 2] += 2.0 * cvVal / (double) cvData.n_cols;
- #pragma omp critical
+ #pragma omp critical (DTreeCVUpdate)
regularizationConstants += cvRegularizationConstants;
}
Timer::Stop("cross_validation");
diff --git a/src/mlpack/methods/det/dtree.hpp b/src/mlpack/methods/det/dtree.hpp
index 70c8cc5..a4ec63a 100644
--- a/src/mlpack/methods/det/dtree.hpp
+++ b/src/mlpack/methods/det/dtree.hpp
@@ -46,7 +46,7 @@ class DTree
*/
typedef typename MatType::elem_type ElemType;
typedef typename MatType::vec_type VecType;
- typedef typename arma::Col<ElemType> StatType;
+ typedef typename arma::Col<ElemType> StatType;
/**
* Create an empty density estimation tree.
diff --git a/src/mlpack/methods/det/dtree_impl.hpp b/src/mlpack/methods/det/dtree_impl.hpp
index ad54039..be64f5c 100644
--- a/src/mlpack/methods/det/dtree_impl.hpp
+++ b/src/mlpack/methods/det/dtree_impl.hpp
@@ -146,8 +146,8 @@ template <typename MatType, typename TagType>
DTree<MatType, TagType>::DTree(MatType & data) :
start(0),
end(data.n_cols),
- minVals(arma::min(data, 1)),
maxVals(arma::max(data, 1)),
+ minVals(arma::min(data, 1)),
splitDim(size_t(-1)),
splitValue(std::numeric_limits<ElemType>::max()),
subtreeLeavesLogNegError(-DBL_MAX),
@@ -264,11 +264,11 @@ bool DTree<MatType, TagType>::FindSplit(const MatType& data,
// Loop through each dimension.
#ifdef _WIN32
- #pragma omp parallel for default(shared) \
+ #pragma omp parallel for default(none) \
shared(splitValue, splitDim, data)
for (intmax_t dim = 0; dim < (intmax_t) maxVals.n_elem; ++dim)
#else
- #pragma omp parallel for default(shared) \
+ #pragma omp parallel for default(none) \
shared(splitValue, splitDim, data)
for (size_t dim = 0; dim < maxVals.n_elem; ++dim)
#endif
@@ -341,10 +341,9 @@ bool DTree<MatType, TagType>::FindSplit(const MatType& data,
double actualMinDimError = std::log(minDimError) - 2 * std::log((double) data.n_cols) - volumeWithoutDim;
-#pragma omp atomic
+#pragma omp critical (DTreeFindUpdate)
if ((actualMinDimError > minError) && dimSplitFound)
{
-#pragma omp critical DTreeFindUpdate
{
// Calculate actual error (in logspace) by adding terms back to our
// estimate.
diff --git a/src/mlpack/tests/det_test.cpp b/src/mlpack/tests/det_test.cpp
index 3365984..09b9691 100644
--- a/src/mlpack/tests/det_test.cpp
+++ b/src/mlpack/tests/det_test.cpp
@@ -141,6 +141,70 @@ BOOST_AUTO_TEST_CASE(TestSplitData)
BOOST_REQUIRE_EQUAL(oTest[3], 2);
BOOST_REQUIRE_EQUAL(oTest[4], 5);
}
+
+BOOST_AUTO_TEST_CASE(TestSparseFindSplit)
+{
+ arma::mat realData(4,7);
+
+ realData << .0 << 4 << 5 << 7 << 0 << 5 << 0 << arma::endr
+ << .0 << 5 << 0 << 0 << 1 << 7 << 1 << arma::endr
+ << .0 << 5 << 6 << 7 << 1 << 0 << 8 << arma::endr
+ << -1 << 2 << 5 << 0 << 0 << 0 << 0 << arma::endr;
+
+ arma::sp_mat testData(realData);
+
+ DTree<arma::sp_mat> testDTree(testData);
+
+ size_t obDim, trueDim;
+ double trueLeftError, obLeftError, trueRightError, obRightError, obSplit, trueSplit;
+
+ trueDim = 1;
+ trueSplit = .5;
+ trueLeftError = 2 * log(3.0 / 7.0) - (log(7.0) + log(0.5) + log(8.0) + log(6.0));
+ trueRightError = 2 * log(4.0 / 7.0) - (log(7.0) + log(6.5) + log(8.0) + log(6.0));
+
+ testDTree.logVolume = log(7.0) + log(7.0) + log(8.0) + log(6.0);
+ BOOST_REQUIRE(testDTree.FindSplit(testData, obDim, obSplit, obLeftError, obRightError, 1));
+
+ BOOST_REQUIRE(trueDim == obDim);
+ BOOST_REQUIRE_CLOSE(trueSplit, obSplit, 1e-10);
+
+ BOOST_REQUIRE_CLOSE(trueLeftError, obLeftError, 1e-10);
+ BOOST_REQUIRE_CLOSE(trueRightError, obRightError, 1e-10);
+}
+
+BOOST_AUTO_TEST_CASE(TestSparseSplitData)
+{
+ arma::mat realData(4,7);
+
+ realData << .0 << 4 << 5 << 7 << 0 << 5 << 0 << arma::endr
+ << .0 << 5 << 0 << 0 << 1 << 7 << 1 << arma::endr
+ << .0 << 5 << 6 << 7 << 1 << 0 << 8 << arma::endr
+ << -1 << 2 << 5 << 0 << 0 << 0 << 0 << arma::endr;
+
+ arma::sp_mat testData(realData);
+
+ DTree<arma::sp_mat> testDTree(testData);
+
+ arma::Col<size_t> oTest(7);
+ oTest << 1 << 2 << 3 << 4 << 5 << 6 << 7;
+
+ size_t splitDim = 1;
+ double trueSplitVal = .5;
+
+ size_t splitInd = testDTree.SplitData(testData, splitDim, trueSplitVal, oTest);
+
+ BOOST_REQUIRE_EQUAL(splitInd, 3); // 3 points on left side.
+
+ BOOST_REQUIRE_EQUAL(oTest[0], 1);
+ BOOST_REQUIRE_EQUAL(oTest[1], 4);
+ BOOST_REQUIRE_EQUAL(oTest[2], 3);
+ BOOST_REQUIRE_EQUAL(oTest[3], 2);
+ BOOST_REQUIRE_EQUAL(oTest[4], 5);
+ BOOST_REQUIRE_EQUAL(oTest[5], 6);
+ BOOST_REQUIRE_EQUAL(oTest[6], 7);
+}
+
#endif
// Tests for the public functions.
@@ -307,6 +371,79 @@ BOOST_AUTO_TEST_CASE(TestVariableImportance)
BOOST_REQUIRE_CLOSE((double) (rootError - (lError + rError)), imps[2], 1e-10);
}
+BOOST_AUTO_TEST_CASE(TestSparsePruneAndUpdate)
+{
+ arma::mat realData(3, 5);
+
+ realData << 4 << 5 << 7 << 3 << 5 << arma::endr
+ << 5 << 0 << 1 << 7 << 1 << arma::endr
+ << 5 << 6 << 7 << 1 << 8 << arma::endr;
+
+ arma::sp_mat testData(realData);
+
+ arma::Col<size_t> oTest(5);
+ oTest << 0 << 1 << 2 << 3 << 4;
+
+ DTree<arma::sp_mat> testDTree(testData);
+ double alpha = testDTree.Grow(testData, oTest, false, 2, 1);
+ alpha = testDTree.PruneAndUpdate(alpha, testData.n_cols, false);
+
+ BOOST_REQUIRE_CLOSE(alpha, numeric_limits<double>::max(), 1e-10);
+ BOOST_REQUIRE(testDTree.SubtreeLeaves() == 1);
+
+ double rootError = -log(4.0) - log(7.0) - log(7.0);
+
+ BOOST_REQUIRE_CLOSE(testDTree.LogNegError(), rootError, 1e-10);
+ BOOST_REQUIRE_CLOSE(testDTree.SubtreeLeavesLogNegError(), rootError, 1e-10);
+ BOOST_REQUIRE(testDTree.Left() == NULL);
+ BOOST_REQUIRE(testDTree.Right() == NULL);
+}
+
+BOOST_AUTO_TEST_CASE(TestSparseComputeValue)
+{
+ arma::mat realData(3, 5);
+
+ Log::Info << "OMP threads: " << omp_get_thread_num() << std::endl;
+
+ realData << 4 << 5 << 7 << 3 << 5 << arma::endr
+ << 5 << 0 << 1 << 7 << 1 << arma::endr
+ << 5 << 6 << 7 << 1 << 8 << arma::endr;
+
+ arma::vec _q1(3), _q2(3), _q3(3), _q4(3);
+
+ _q1 << 4 << 2 << 2;
+ _q2 << 5 << 0.25 << 6;
+ _q3 << 5 << 3 << 7;
+ _q4 << 2 << 3 << 3;
+
+ arma::sp_mat testData(realData);
+ arma::sp_vec q1(_q1), q2(_q2), q3(_q3), q4(_q4);
+
+ arma::Col<size_t> oTest(5);
+ oTest << 0 << 1 << 2 << 3 << 4;
+
+ DTree<arma::sp_mat> testDTree(testData);
+ double alpha = testDTree.Grow(testData, oTest, false, 2, 1);
+
+ double d1 = (2.0 / 5.0) / exp(log(4.0) + log(7.0) + log(4.5));
+ double d2 = (1.0 / 5.0) / exp(log(4.0) + log(0.5) + log(2.5));
+ double d3 = (2.0 / 5.0) / exp(log(4.0) + log(6.5) + log(2.5));
+
+ BOOST_REQUIRE_CLOSE(d1, testDTree.ComputeValue(q1), 1e-10);
+ BOOST_REQUIRE_CLOSE(d2, testDTree.ComputeValue(q2), 1e-10);
+ BOOST_REQUIRE_CLOSE(d3, testDTree.ComputeValue(q3), 1e-10);
+ BOOST_REQUIRE_CLOSE(0.0, testDTree.ComputeValue(q4), 1e-10);
+
+ alpha = testDTree.PruneAndUpdate(alpha, testData.n_cols, false);
+
+ double d = 1.0 / exp(log(4.0) + log(7.0) + log(7.0));
+
+ BOOST_REQUIRE_CLOSE(d, testDTree.ComputeValue(q1), 1e-10);
+ BOOST_REQUIRE_CLOSE(d, testDTree.ComputeValue(q2), 1e-10);
+ BOOST_REQUIRE_CLOSE(d, testDTree.ComputeValue(q3), 1e-10);
+ BOOST_REQUIRE_CLOSE(0.0, testDTree.ComputeValue(q4), 1e-10);
+}
+
/**
* These are not yet implemented.
*
More information about the mlpack-git
mailing list