[mlpack-git] master: - Sparse DTree finally working. (dc8d9b1)
gitdub at mlpack.org
gitdub at mlpack.org
Tue Oct 18 05:43:36 EDT 2016
Repository : https://github.com/mlpack/mlpack
On branch : master
Link : https://github.com/mlpack/mlpack/compare/94d14187222231ca29e4f6419c5999c660db4f8a...981ffa2d67d8fe38df6c699589005835fef710ea
>---------------------------------------------------------------
commit dc8d9b1c7170af2f891538a8585d0494381430c0
Author: theJonan <ivan at jonan.info>
Date: Mon Oct 17 22:55:41 2016 +0300
- Sparse DTree finally working.
>---------------------------------------------------------------
dc8d9b1c7170af2f891538a8585d0494381430c0
src/mlpack/methods/det/dtree.hpp | 25 ++--
src/mlpack/methods/det/dtree_impl.hpp | 244 ++++++++++++++--------------------
2 files changed, 111 insertions(+), 158 deletions(-)
diff --git a/src/mlpack/methods/det/dtree.hpp b/src/mlpack/methods/det/dtree.hpp
index 9a2b175..70c8cc5 100644
--- a/src/mlpack/methods/det/dtree.hpp
+++ b/src/mlpack/methods/det/dtree.hpp
@@ -44,8 +44,9 @@ class DTree
/**
* The actual, underlying type we're working with
*/
- typedef typename MatType::elem_type ElemType;
- typedef typename MatType::vec_type VecType;
+ typedef typename MatType::elem_type ElemType;
+ typedef typename MatType::vec_type VecType;
+ typedef typename arma::Col<ElemType> StatType;
/**
* Create an empty density estimation tree.
@@ -60,8 +61,8 @@ class DTree
* @param minVals Minimum values of the bounding box.
* @param totalPoints Total number of points in the dataset.
*/
- DTree(const VecType& maxVals,
- const VecType& minVals,
+ DTree(const StatType& maxVals,
+ const StatType& minVals,
const size_t totalPoints);
/**
@@ -86,8 +87,8 @@ class DTree
* @param end End of points represented by this node in the data matrix.
* @param error log-negative error of this node.
*/
- DTree(const VecType& maxVals,
- const VecType& minVals,
+ DTree(const StatType& maxVals,
+ const StatType& minVals,
const size_t start,
const size_t end,
const double logNegError);
@@ -103,8 +104,8 @@ class DTree
* @param start Start of points represented by this node in the data matrix.
* @param end End of points represented by this node in the data matrix.
*/
- DTree(const VecType& maxVals,
- const VecType& minVals,
+ DTree(const StatType& maxVals,
+ const StatType& minVals,
const size_t totalPoints,
const size_t start,
const size_t end);
@@ -199,9 +200,9 @@ class DTree
size_t end;
//! Upper half of bounding box for this node.
- VecType maxVals;
+ StatType maxVals;
//! Lower half of bounding box for this node.
- VecType minVals;
+ StatType minVals;
//! The splitting dimension for this node.
size_t splitDim;
@@ -270,10 +271,10 @@ class DTree
TagType BucketTag() const { return subtreeLeaves == 1 ? bucketTag : -1; }
//! Return the maximum values.
- const VecType& MaxVals() const { return maxVals; }
+ const StatType& MaxVals() const { return maxVals; }
//! Return the minimum values.
- const VecType& MinVals() const { return minVals; }
+ const StatType& MinVals() const { return minVals; }
/**
* Serialize the density estimation tree.
diff --git a/src/mlpack/methods/det/dtree_impl.hpp b/src/mlpack/methods/det/dtree_impl.hpp
index 16092b8..ad54039 100644
--- a/src/mlpack/methods/det/dtree_impl.hpp
+++ b/src/mlpack/methods/det/dtree_impl.hpp
@@ -5,113 +5,98 @@
* Implementations of some declared functions in
* the Density Estimation Tree class.
*
+ * Sparsification and optimizations.
+ * @author Ivan Georgiev (ivan at jonan.info)
+ *
*/
#include "dtree.hpp"
#include <stack>
-#include <algorithm>
#include <vector>
-#include <mlpack/core/tree/perform_split.hpp>
using namespace mlpack;
using namespace det;
-namespace arma
+namespace details
{
- template <typename ElemType>
- SpMat<ElemType> sort(const SpMat<ElemType>& data)
+ /**
+ * This one sorts and scand the given per-dimension extract and puts all splits
+ * in a vector, that can easily be iterated afterwards.
+ */
+ template <typename ElemType, typename RowType>
+ std::vector<std::pair<ElemType, size_t>> ExtractSplits(RowType& row, size_t minLeafSize)
{
- typedef std::vector<ElemType> ElemVectorType;
+ typedef std::pair<ElemType, size_t> SplitItem;
+ std::vector<SplitItem> splitVec;
- // Construct the vector of values / locations...
- ElemVectorType indexVec(data.begin(), data.end());
+ // We want to do that inplace for optimization purposes;
+ std::sort(row.begin(), row.end());
- // ... and sort it!
- std::sort(indexVec.begin(), indexVec.end());
-
- // Now prepare the structures for the batch construction of the
- // sorted sparse row.
- arma::umat locations(2, data.n_nonzero);
- arma::Col<ElemType> vals(data.n_nonzero);
- ElemType lastVal = -std::numeric_limits<ElemType>::max();
- size_t padding = 0;
-
- for (size_t ii = 0; ii < indexVec.size(); ++ii)
+ // Ensure the minimum leaf size on both sides. We need to figure out why
+ // there are spikes if this minLeafSize is enforced here...
+ for (size_t i = minLeafSize - 1; i < row.n_elem - minLeafSize; ++i)
{
- const ElemType newVal = indexVec[ii];
- if (lastVal < ElemType(0) && newVal > ElemType(0))
- {
- assert(padding == 0); // we should arrive here once!
- padding = data.n_elem - data.n_nonzero;
- }
+ // This makes sense for real continuous data. This kinda corrupts the
+ // data and estimation if the data is ordinal.
+ const ElemType split = (row[i] + row[i + 1]) / 2.0;
- locations.at(0, ii) = (ii + padding) % data.n_rows;
- locations.at(1, ii) = (ii + padding) / data.n_rows;
- vals.at(ii) = lastVal = newVal;
+ // Check if we can split here (two points are different)
+ if (split != row[i])
+ splitVec.push_back(SplitItem(split, i));
}
- return SpMat<ElemType>(locations, vals, data.n_rows, data.n_cols, false, false);
- };
+ return splitVec;
-};
+ }
-namespace detail
-{
+ // This the custom, sparse optimized implementation of the same routine.
template <typename ElemType>
- class DTreeSplit
+ std::vector<std::pair<ElemType, size_t>> ExtractSplits(arma::SpRow<ElemType>& row, size_t minLeafSize)
{
- public:
- typedef DTreeSplit<ElemType> SplitInfo;
+ typedef std::pair<ElemType, size_t> SplitItem;
+ std::vector<SplitItem> splitVec;
- template<typename VecType>
- static bool AssignToLeftNode(const VecType& point,
- const SplitInfo& splitInfo)
- {
- return point[splitInfo.splitDimension] < splitInfo.splitVal;
- }
+ // Construct a vector of values.
+ std::vector<ElemType> valsVec(row.begin(), row.end());
- private:
- ElemType splitVal;
- size_t splitDimension;
- };
+ // ... and sort it!
+ std::sort(valsVec.begin(), valsVec.end());
- /**
- * Get the values for the dimension and sort them. The old implementation:
- * dimVec = data.row(dim).subvec(start, end - 1);
- * dimVec = arma::sort(dimVec);
- * was quite inefficient, due to many (3) vector copy operations. This could be a
- * problem especially for sparse matrices. That's why they have custom implementation.
- */
- template <typename MatType>
- typename MatType::row_type ExtractSortedRow(const MatType& data,
- size_t dim,
- size_t start,
- size_t end)
- {
- typedef typename MatType::elem_type ElemType;
+ // Now iterate over the values, taking account for the over-the-zeroes
+ // jump and construct the splits vector.
+ ElemType lastVal = -std::numeric_limits<ElemType>::max();
+ size_t padding = 0, zeroes = row.n_elem - row.n_nonzero;
- assert(start < end);
+ for (size_t i = 0; i < valsVec.size(); ++i)
+ {
+ const ElemType newVal = valsVec[i];
+ if (lastVal < ElemType(0) && newVal > ElemType(0) && zeroes > 0)
+ {
+ Log::Assert(padding == 0); // we should arrive here once!
+ if (lastVal >= valsVec[0] && // i.e. we're not in the beginning
+ i >= minLeafSize &&
+ i <= row.n_elem - minLeafSize)
+ splitVec.push_back(SplitItem(lastVal / 2.0, i - 1));
+
+ padding = zeroes;
+ lastVal = ElemType(0);
+ }
- typename MatType::row_type dimVec = data(dim, arma::span(start, end - 1));
- std::sort(dimVec.begin(), dimVec.end());
- return dimVec;
- }
+ if (i + padding >= minLeafSize && i + padding <= row.n_elem - minLeafSize)// the normal case
+ {
+ // This makes sense for real continuous data. This kinda corrupts the
+ // data and estimation if the data is ordinal.
+ const ElemType split = (lastVal + newVal) / 2.0;
- /**
- * We need a custom implementation for sparse matrix, in order to save sorting of
- * all these zeroes.
- */
- template <typename ElemType>
- arma::SpRow<ElemType> ExtractSortedRow(const arma::SpMat<ElemType>& data,
- size_t dim,
- size_t start,
- size_t end)
- {
- assert(start < end);
+ // Check if we can split here (two points are different)
+ if (split != newVal)
+ splitVec.push_back(SplitItem(split, i + padding - 1));
+ }
- arma::SpRow<ElemType> dimVec = data(dim, arma::span(start, end - 1));
- return arma::sort(dimVec);
- }
+ lastVal = newVal;
+ }
+ return splitVec;
+ }
};
template <typename MatType, typename TagType>
@@ -136,8 +121,8 @@ DTree<MatType, TagType>::DTree() :
// Root node initializers
template <typename MatType, typename TagType>
-DTree<MatType, TagType>::DTree(const VecType& maxVals,
- const VecType& minVals,
+DTree<MatType, TagType>::DTree(const StatType& maxVals,
+ const StatType& minVals,
const size_t totalPoints) :
start(0),
end(totalPoints),
@@ -161,6 +146,8 @@ template <typename MatType, typename TagType>
DTree<MatType, TagType>::DTree(MatType & data) :
start(0),
end(data.n_cols),
+ minVals(arma::min(data, 1)),
+ maxVals(arma::max(data, 1)),
splitDim(size_t(-1)),
splitValue(std::numeric_limits<ElemType>::max()),
subtreeLeavesLogNegError(-DBL_MAX),
@@ -173,28 +160,13 @@ DTree<MatType, TagType>::DTree(MatType & data) :
left(NULL),
right(NULL)
{
- maxVals = data.col(0);
- minVals = data.col(0);
-
- typename MatType::row_col_iterator dataEnd = data.end_row_col();
-
- // Loop over data to extract maximum and minimum values in each dimension.
- for (typename MatType::row_col_iterator i = data.begin_row_col(); i != dataEnd; ++i)
- {
- size_t j = i.row();
- if (*i > maxVals[j])
- maxVals[j] = *i;
- else if (*i < minVals[j])
- minVals[j] = *i;
- }
-
logNegError = LogNegativeError(data.n_cols);
}
// Non-root node initializers
template <typename MatType, typename TagType>
-DTree<MatType, TagType>::DTree(const VecType& maxVals,
- const VecType& minVals,
+DTree<MatType, TagType>::DTree(const StatType& maxVals,
+ const StatType& minVals,
const size_t start,
const size_t end,
const double logNegError) :
@@ -217,8 +189,8 @@ DTree<MatType, TagType>::DTree(const VecType& maxVals,
{ /* Nothing to do. */ }
template <typename MatType, typename TagType>
-DTree<MatType, TagType>::DTree(const VecType& maxVals,
- const VecType& minVals,
+DTree<MatType, TagType>::DTree(const StatType& maxVals,
+ const StatType& minVals,
const size_t totalPoints,
const size_t start,
const size_t end) :
@@ -256,13 +228,12 @@ double DTree<MatType, TagType>::LogNegativeError(const size_t totalPoints) const
double err = 2 * std::log((double) (end - start)) -
2 * std::log((double) totalPoints);
- VecType valDiffs = maxVals - minVals;
- typename VecType::iterator valEnd = valDiffs.end();
- for (typename VecType::iterator i = valDiffs.begin(); i != valEnd; ++i)
+ StatType valDiffs = maxVals - minVals;
+ for (size_t i = 0;i < valDiffs.n_elem; ++i)
{
// Ignore very small dimensions to prevent overflow.
- if (*i > 1e-50)
- err -= std::log(*i);
+ if (valDiffs[i] > 1e-50)
+ err -= std::log(valDiffs[i]);
}
return err;
@@ -279,10 +250,12 @@ bool DTree<MatType, TagType>::FindSplit(const MatType& data,
double& rightError,
const size_t minLeafSize) const
{
+ typedef std::pair<ElemType, size_t> SplitItem;
+
// Ensure the dimensionality of the data is the same as the dimensionality of
// the bounding rectangle.
- assert(data.n_rows == maxVals.n_elem);
- assert(data.n_rows == minVals.n_elem);
+ Log::Assert(data.n_rows == maxVals.n_elem);
+ Log::Assert(data.n_rows == minVals.n_elem);
const size_t points = end - start;
@@ -320,42 +293,22 @@ bool DTree<MatType, TagType>::FindSplit(const MatType& data,
// Find the log volume of all the other dimensions.
double volumeWithoutDim = logVolume - std::log(max - min);
- // Get a sorted version of the dimension in interest,
- // from the given samples range. This is the most expensive step.
- typename MatType::row_type dimVec = detail::ExtractSortedRow(data,
- dim,
- start,
- end);
+ // Get the values for splitting. The old implementation:
+ // dimVec = data.row(dim).subvec(start, end - 1);
+ // dimVec = arma::sort(dimVec);
+ // could be quite inefficient for sparse matrices, due to copy operations (3).
+ // This one has custom implementation for dense and sparse matrices.
- typename MatType::row_col_iterator dimVecEnd = dimVec.end_row_col();
- typename MatType::row_col_iterator dI = dimVec.begin_row_col();
+ typename MatType::row_type dimVec = data(dim, arma::span(start, end - 1));
+ std::vector<SplitItem> splitVec = details::ExtractSplits<ElemType>(dimVec, minLeafSize);
- // Find the best split for this dimension.
- for (;;)
+ // Iterate on all the splits for this dimension
+ for (typename std::vector<SplitItem>::iterator i = splitVec.begin();
+ i != splitVec.end();
+ ++i)
{
- const size_t position = dI.col();
-
- // Ensure the minimum leaf size on both sides. We need to figure out why
- // there are spikes if this minLeafSize is enforced here...
- if (position >= dimVec.n_cols - minLeafSize)
- break;
- if (position < minLeafSize - 1)
- continue;
-
- ElemType split = *dI;
-
- // In case of sparse matrices and all present values being negative
- // this can happen - last many elements being 0, so we need to check.
- if (++dI == dimVecEnd)
- break;
-
- // This makes sense for real continuous data. This kinda corrupts the
- // data and estimation if the data is ordinal.
- split += *dI;
- split /= 2.0;
-
- if (split == *dI)
- continue; // We can't split here (two points are the same).
+ const ElemType split = i->first;
+ const size_t position = i->second;
// Another way of picking split is using this:
// split = leftsplit;
@@ -475,14 +428,13 @@ double DTree<MatType, TagType>::Grow(MatType& data,
{
// Move the data around for the children to have points in a node lie
// contiguously (to increase efficiency during the training).
-// const size_t splitIndex = splt::PerformSplit(data, start, end - start, )
const size_t splitIndex = SplitData(data, dim, splitValueTmp, oldFromNew);
// Make max and min vals for the children.
- arma::vec maxValsL(maxVals);
- arma::vec maxValsR(maxVals);
- arma::vec minValsL(minVals);
- arma::vec minValsR(minVals);
+ StatType maxValsL(maxVals);
+ StatType maxValsR(maxVals);
+ StatType minValsL(minVals);
+ StatType minValsR(minVals);
maxValsL[dim] = splitValueTmp;
minValsR[dim] = splitValueTmp;
@@ -524,7 +476,7 @@ double DTree<MatType, TagType>::Grow(MatType& data,
else
{
// We can make this a leaf node.
- assert((size_t) (end - start) >= minLeafSize);
+ Log::Assert((size_t) (end - start) >= minLeafSize);
subtreeLeaves = 1;
subtreeLeavesLogNegError = logNegError;
}
More information about the mlpack-git
mailing list