[mlpack-git] master: Test with regular kd-trees and cover trees. The test conditions for the cover tree need to be adapted a little bit. (905be1e)
gitdub at big.cc.gt.atl.ga.us
gitdub at big.cc.gt.atl.ga.us
Thu Mar 12 16:05:04 EDT 2015
Repository : https://github.com/mlpack/mlpack
On branch : master
Link : https://github.com/mlpack/mlpack/compare/eddd7167d69b6c88b271ef2e51d1c20e13f1acd8...70342dd8e5c17e0c164cfb8189748671e9c0dd44
>---------------------------------------------------------------
commit 905be1e61438ab1e677d325003eb875b090a8540
Author: Ryan Curtin <ryan at ratml.org>
Date: Mon Feb 16 15:33:57 2015 -0500
Test with regular kd-trees and cover trees. The test conditions for the cover tree need to be adapted a little bit.
>---------------------------------------------------------------
905be1e61438ab1e677d325003eb875b090a8540
src/mlpack/tests/kmeans_test.cpp | 218 ++++++++++++++++++++++++++++++++++++++-
1 file changed, 217 insertions(+), 1 deletion(-)
diff --git a/src/mlpack/tests/kmeans_test.cpp b/src/mlpack/tests/kmeans_test.cpp
index 593ea6e..d274e49 100644
--- a/src/mlpack/tests/kmeans_test.cpp
+++ b/src/mlpack/tests/kmeans_test.cpp
@@ -758,7 +758,7 @@ BOOST_AUTO_TEST_CASE(DualTreeKMeansScoreKDTreeOneLeafTest)
// Create the trees.
typedef BinarySpaceTree<HRectBound<2>, DTNNStatistic> TreeType;
- TreeType pointTree(dataset, 1);
+ TreeType pointTree(dataset, 1);
TreeType centroidTree(centroids, 1);
// Create the Rules object.
@@ -850,4 +850,220 @@ BOOST_AUTO_TEST_CASE(DualTreeKMeansScoreKDTreeOneLeafTest)
}
}
+BOOST_AUTO_TEST_CASE(DualTreeKMeansScoreKDTreeTest)
+{
+ // If we run a dual-tree algorithm, do we get valid results for each point
+ // and/or node when we use the kd-tree with the default leaf size?
+ const size_t points = 5000;
+ const size_t clusters = 100;
+ arma::mat dataset(5, points);
+ dataset.randu();
+ arma::mat centroids(5, clusters);
+ centroids.randu();
+
+ arma::mat datasetCopy(dataset);
+ arma::mat centroidsCopy(centroids);
+
+ // Create the trees.
+ typedef BinarySpaceTree<HRectBound<2>, DTNNStatistic> TreeType;
+ TreeType pointTree(dataset);
+ TreeType centroidTree(centroids);
+
+ // Create the Rules object.
+ arma::Col<size_t> assignments(points);
+ arma::vec upperBounds(points);
+ arma::vec lowerBounds(points);
+ upperBounds.fill(DBL_MAX);
+ lowerBounds.fill(DBL_MAX);
+ std::vector<bool> visited(points, false); // Fill with false.
+ std::vector<size_t> oldFromNewCentroids; // Not used.
+ std::vector<bool> prunedPoints(points, false); // Fill with false.
+
+ EuclideanDistance e;
+ typedef DTNNKMeansRules<EuclideanDistance, TreeType> RuleType;
+ RuleType rules(centroids, dataset, assignments, upperBounds, lowerBounds, e,
+ prunedPoints, oldFromNewCentroids, visited);
+
+ // Now create the traverser.
+ typename TreeType::template BreadthFirstDualTreeTraverser<RuleType>
+ traverser(rules);
+
+ pointTree.Stat().Pruned() = 0;
+ traverser.Traverse(pointTree, centroidTree);
+
+ // Get true bounds.
+ AllkNN allknn(centroids, dataset);
+
+ arma::Mat<size_t> trueAssignments;
+ arma::mat trueDistances;
+ allknn.Search(2, trueAssignments, trueDistances);
+
+ // Check the points first. Lots of weird mappings have to go on in this stage
+ // because the tree building procedure changed all the points around.
+ for (size_t i = 0; i < dataset.n_cols; ++i)
+ {
+ if (visited[i])
+ {
+ BOOST_REQUIRE_GE(upperBounds[i], trueDistances(0, i));
+ BOOST_REQUIRE_EQUAL(assignments[i], trueAssignments(0, i));
+ }
+ }
+
+ // Now traverse the tree to see if it is correct.
+ std::queue<TreeType*> nodeQueue;
+ nodeQueue.push(&pointTree);
+ while (!nodeQueue.empty())
+ {
+ // This is an expensive operation. We must ensure that the upper bound is
+ // valid and that the lower bound is valid. Both can be needlessly loose,
+ // but that will simply affect the pruning of the method, not the
+ // correctness. Here we care about correctness.
+ TreeType* node = nodeQueue.front();
+ nodeQueue.pop();
+
+ // We must make sure the upper bound and lower bound are both valid for all
+ // descendant points. The lower bound only matters if the node was pruned.
+ // So, we must calculate the upper and lower bounds manually for the
+ // descendants.
+ double exactUpperBound = 0.0;
+ double exactLowerBound = DBL_MAX;
+ for (size_t i = 0; i < node->NumDescendants(); ++i)
+ {
+ if (trueDistances(0, node->Descendant(i)) > exactUpperBound)
+ exactUpperBound = trueDistances(0, node->Descendant(i));
+ if (trueDistances(1, node->Descendant(i)) < exactLowerBound)
+ exactLowerBound = trueDistances(1, node->Descendant(i));
+ }
+
+ // Multiplication is to add some tolerance for floating point discrepancies.
+ BOOST_REQUIRE_GE(node->Stat().UpperBound() * 1.000001, exactUpperBound);
+
+ if (node->Stat().Pruned() == centroids.n_cols)
+ {
+ BOOST_REQUIRE_LE(node->Stat().LowerBound() * 0.99999, exactLowerBound);
+ }
+ else
+ {
+ for (size_t i = 0; i < node->NumPoints(); ++i)
+ {
+ const double bestLower = std::min(node->Stat().LowerBound(),
+ lowerBounds[node->Point(i)]);
+ BOOST_REQUIRE_LE(bestLower * 0.99999, trueDistances(1, node->Point(i)));
+ }
+
+ // Recurse.
+ for (size_t i = 0; i < node->NumChildren(); ++i)
+ nodeQueue.push(&node->Child(i));
+ }
+ }
+}
+
+BOOST_AUTO_TEST_CASE(DualTreeKMeansScoreCoverTreeTest)
+{
+ // If we run a dual-tree algorithm, do we get valid results for each point
+ // and/or node when we use the kd-tree with the default leaf size?
+ const size_t points = 5000;
+ const size_t clusters = 100;
+ arma::mat dataset(5, points);
+ dataset.randu();
+ arma::mat centroids(5, clusters);
+ centroids.randu();
+
+ arma::mat datasetCopy(dataset);
+ arma::mat centroidsCopy(centroids);
+
+ // Create the trees.
+ typedef CoverTree<EuclideanDistance, FirstPointIsRoot, DTNNStatistic>
+ TreeType;
+ TreeType pointTree(dataset);
+ TreeType centroidTree(centroids);
+
+ // Create the Rules object.
+ arma::Col<size_t> assignments(points);
+ arma::vec upperBounds(points);
+ arma::vec lowerBounds(points);
+ upperBounds.fill(DBL_MAX);
+ lowerBounds.fill(DBL_MAX);
+ std::vector<bool> visited(points, false); // Fill with false.
+ std::vector<size_t> oldFromNewCentroids; // Not used.
+ std::vector<bool> prunedPoints(points, false); // Fill with false.
+
+ EuclideanDistance e;
+ typedef DTNNKMeansRules<EuclideanDistance, TreeType> RuleType;
+ RuleType rules(centroids, dataset, assignments, upperBounds, lowerBounds, e,
+ prunedPoints, oldFromNewCentroids, visited);
+
+ // Now create the traverser.
+ typename TreeType::template DualTreeTraverser<RuleType> traverser(rules);
+
+ pointTree.Stat().Pruned() = 0;
+ traverser.Traverse(pointTree, centroidTree);
+
+ // Get true bounds.
+ AllkNN allknn(centroids, dataset);
+
+ arma::Mat<size_t> trueAssignments;
+ arma::mat trueDistances;
+ allknn.Search(2, trueAssignments, trueDistances);
+
+ // Check the points first. Lots of weird mappings have to go on in this stage
+ // because the tree building procedure changed all the points around.
+ for (size_t i = 0; i < dataset.n_cols; ++i)
+ {
+ if (visited[i])
+ {
+ BOOST_REQUIRE_GE(upperBounds[i], trueDistances(0, i));
+ BOOST_REQUIRE_EQUAL(assignments[i], trueAssignments(0, i));
+ }
+ }
+
+ // Now traverse the tree to see if it is correct.
+ std::queue<TreeType*> nodeQueue;
+ nodeQueue.push(&pointTree);
+ while (!nodeQueue.empty())
+ {
+ // This is an expensive operation. We must ensure that the upper bound is
+ // valid and that the lower bound is valid. Both can be needlessly loose,
+ // but that will simply affect the pruning of the method, not the
+ // correctness. Here we care about correctness.
+ TreeType* node = nodeQueue.front();
+ nodeQueue.pop();
+
+ // We must make sure the upper bound and lower bound are both valid for all
+ // descendant points. The lower bound only matters if the node was pruned.
+ // So, we must calculate the upper and lower bounds manually for the
+ // descendants.
+ double exactUpperBound = 0.0;
+ double exactLowerBound = DBL_MAX;
+ for (size_t i = 0; i < node->NumDescendants(); ++i)
+ {
+ if (trueDistances(0, node->Descendant(i)) > exactUpperBound)
+ exactUpperBound = trueDistances(0, node->Descendant(i));
+ if (trueDistances(1, node->Descendant(i)) < exactLowerBound)
+ exactLowerBound = trueDistances(1, node->Descendant(i));
+ }
+
+ // Multiplication is to add some tolerance for floating point discrepancies.
+ BOOST_REQUIRE_GE(node->Stat().UpperBound() * 1.000001, exactUpperBound);
+
+ if (node->Stat().Pruned() == centroids.n_cols)
+ {
+ BOOST_REQUIRE_LE(node->Stat().LowerBound() * 0.99999, exactLowerBound);
+ }
+ else if (node->NumChildren() == 0)
+ {
+ // The node has one point.
+ const double bestLower = std::min(node->Stat().LowerBound(),
+ lowerBounds[node->Point(0)]);
+ BOOST_REQUIRE_LE(bestLower * 0.99999, trueDistances(1, node->Point(0)));
+ }
+ else
+ {
+ // Recurse.
+ for (size_t i = 0; i < node->NumChildren(); ++i)
+ nodeQueue.push(&node->Child(i));
+ }
+ }
+}
+
BOOST_AUTO_TEST_SUITE_END();
More information about the mlpack-git
mailing list