[mlpack-git] master: - Sparse matrix speed-ups. (4514e4c)
gitdub at mlpack.org
gitdub at mlpack.org
Wed Oct 19 11:29:58 EDT 2016
Repository : https://github.com/mlpack/mlpack
On branch : master
Link : https://github.com/mlpack/mlpack/compare/94d14187222231ca29e4f6419c5999c660db4f8a...981ffa2d67d8fe38df6c699589005835fef710ea
>---------------------------------------------------------------
commit 4514e4c9f9914feaced2b437c5a5b297fa2f1353
Author: theJonan <ivan at jonan.info>
Date: Wed Oct 19 18:29:58 2016 +0300
- Sparse matrix speed-ups.
>---------------------------------------------------------------
4514e4c9f9914feaced2b437c5a5b297fa2f1353
src/mlpack/methods/det/dtree_impl.hpp | 57 +++++++++++++++++++++++++----------
1 file changed, 41 insertions(+), 16 deletions(-)
diff --git a/src/mlpack/methods/det/dtree_impl.hpp b/src/mlpack/methods/det/dtree_impl.hpp
index 92f9078..f14961e 100644
--- a/src/mlpack/methods/det/dtree_impl.hpp
+++ b/src/mlpack/methods/det/dtree_impl.hpp
@@ -22,25 +22,34 @@ namespace details
* This one sorts and scand the given per-dimension extract and puts all splits
* in a vector, that can easily be iterated afterwards.
*/
- template <typename ElemType, typename RowType>
- std::vector<std::pair<ElemType, size_t>> ExtractSplits(RowType& row, size_t minLeafSize)
+ template <typename MatType>
+ std::vector<std::pair<typename MatType::elem_type, size_t>>
+ ExtractSplits(const MatType& data,
+ size_t dim,
+ size_t start,
+ size_t end,
+ size_t minLeafSize)
{
+ typedef typename MatType::elem_type ElemType;
typedef std::pair<ElemType, size_t> SplitItem;
- std::vector<SplitItem> splitVec;
+ typename MatType::row_type dimVec = data(dim, arma::span(start, end - 1));
- // We want to do that inplace for optimization purposes;
- std::sort(row.begin(), row.end());
+ // We sort these, in-place (it's a copy of the data, anyways).
+ std::sort(dimVec.begin(), dimVec.end());
+
+ // We're going to collect results here.
+ std::vector<SplitItem> splitVec;
// Ensure the minimum leaf size on both sides. We need to figure out why
// there are spikes if this minLeafSize is enforced here...
- for (size_t i = minLeafSize - 1; i < row.n_elem - minLeafSize; ++i)
+ for (size_t i = minLeafSize - 1; i < dimVec.n_elem - minLeafSize; ++i)
{
// This makes sense for real continuous data. This kinda corrupts the
// data and estimation if the data is ordinal.
- const ElemType split = (row[i] + row[i + 1]) / 2.0;
+ const ElemType split = (dimVec[i] + dimVec[i + 1]) / 2.0;
// Check if we can split here (two points are different)
- if (split != row[i])
+ if (split != dimVec[i])
splitVec.push_back(SplitItem(split, i));
}
@@ -50,21 +59,34 @@ namespace details
// This the custom, sparse optimized implementation of the same routine.
template <typename ElemType>
- std::vector<std::pair<ElemType, size_t>> ExtractSplits(arma::SpRow<ElemType>& row, size_t minLeafSize)
+ std::vector<std::pair<ElemType, size_t>>
+ ExtractSplits(const arma::SpMat<ElemType>& data,
+ size_t dim,
+ size_t start,
+ size_t end,
+ size_t minLeafSize)
{
+ typedef typename arma::SpMat<ElemType>::const_row_iterator RowIterator;
typedef std::pair<ElemType, size_t> SplitItem;
- std::vector<SplitItem> splitVec;
+ const size_t n_elem = end - start;
// Construct a vector of values.
- std::vector<ElemType> valsVec(row.begin(), row.end());
+ std::vector<ElemType> valsVec;
+ valsVec.reserve(n_elem);
+
+ for (RowIterator j(data, dim, start);j.col() < end; ++j)
+ valsVec.push_back(*j);
// ... and sort it!
std::sort(valsVec.begin(), valsVec.end());
+ // We're going to collect our splits here.
+ std::vector<SplitItem> splitVec;
+
// Now iterate over the values, taking account for the over-the-zeroes
// jump and construct the splits vector.
ElemType lastVal = -std::numeric_limits<ElemType>::max();
- size_t padding = 0, zeroes = row.n_elem - row.n_nonzero;
+ size_t padding = 0, zeroes = n_elem - valsVec.size();
for (size_t i = 0; i < valsVec.size(); ++i)
{
@@ -74,14 +96,14 @@ namespace details
Log::Assert(padding == 0); // we should arrive here once!
if (lastVal >= valsVec[0] && // i.e. we're not in the beginning
i >= minLeafSize &&
- i <= row.n_elem - minLeafSize)
+ i <= n_elem - minLeafSize)
splitVec.push_back(SplitItem(lastVal / 2.0, i - 1));
padding = zeroes;
lastVal = ElemType(0);
}
- if (i + padding >= minLeafSize && i + padding <= row.n_elem - minLeafSize)// the normal case
+ if (i + padding >= minLeafSize && i + padding <= n_elem - minLeafSize)// the normal case
{
// This makes sense for real continuous data. This kinda corrupts the
// data and estimation if the data is ordinal.
@@ -297,8 +319,11 @@ bool DTree<MatType, TagType>::FindSplit(const MatType& data,
// could be quite inefficient for sparse matrices, due to copy operations (3).
// This one has custom implementation for dense and sparse matrices.
- typename MatType::row_type dimVec = data(dim, arma::span(start, end - 1));
- std::vector<SplitItem> splitVec = details::ExtractSplits<ElemType>(dimVec, minLeafSize);
+ std::vector<SplitItem> splitVec = details::ExtractSplits(data,
+ dim,
+ start,
+ end,
+ minLeafSize);
// Iterate on all the splits for this dimension
for (typename std::vector<SplitItem>::iterator i = splitVec.begin();
More information about the mlpack-git
mailing list