[mlpack-git] master: Remove debug output; implement Pelleg-Moore prune. Now this is faster than the regular dtkm branch. Sometimes starting over is a good thing. (50d4a32)

gitdub at big.cc.gt.atl.ga.us gitdub at big.cc.gt.atl.ga.us
Thu Mar 12 16:02:53 EDT 2015


Repository : https://github.com/mlpack/mlpack

On branch  : master
Link       : https://github.com/mlpack/mlpack/compare/eddd7167d69b6c88b271ef2e51d1c20e13f1acd8...70342dd8e5c17e0c164cfb8189748671e9c0dd44

>---------------------------------------------------------------

commit 50d4a325c7fc451700f7ac5535b86fe447e3a447
Author: Ryan Curtin <ryan at ratml.org>
Date:   Wed Jan 28 16:21:58 2015 -0500

    Remove debug output; implement Pelleg-Moore prune. Now this is faster than the regular dtkm branch. Sometimes starting over is a good thing.


>---------------------------------------------------------------

50d4a325c7fc451700f7ac5535b86fe447e3a447
 .../methods/kmeans/dual_tree_kmeans_rules_impl.hpp | 40 ++++++++--------------
 1 file changed, 14 insertions(+), 26 deletions(-)

diff --git a/src/mlpack/methods/kmeans/dual_tree_kmeans_rules_impl.hpp b/src/mlpack/methods/kmeans/dual_tree_kmeans_rules_impl.hpp
index 4615b80..ced3ffa 100644
--- a/src/mlpack/methods/kmeans/dual_tree_kmeans_rules_impl.hpp
+++ b/src/mlpack/methods/kmeans/dual_tree_kmeans_rules_impl.hpp
@@ -50,9 +50,6 @@ inline force_inline double DualTreeKMeansRules<MetricType, TreeType>::BaseCase(
     const size_t queryIndex,
     const size_t referenceIndex)
 {
-  if (referenceIndex == 26038)
-    Log::Warn << "Visit 26038 with query " << queryIndex << ".\n";
-
   // Collect the number of clusters that have been pruned during the traversal.
   // The ternary operator may not be necessary.
   const size_t traversalPruned = (traversalInfo.LastReferenceNode() != NULL) ?
@@ -75,26 +72,17 @@ inline force_inline double DualTreeKMeansRules<MetricType, TreeType>::BaseCase(
     distanceIteration[referenceIndex] = iteration;
     distances[referenceIndex] = distance;
     assignments[referenceIndex] = mappings[queryIndex];
-    if (referenceIndex == 26038)
-      Log::Warn << "assignment for point " << referenceIndex << " set to " <<
-mappings[queryIndex] << ".\n";
   }
   else if (distance < distances[referenceIndex])
   {
     distances[referenceIndex] = distance;
     assignments[referenceIndex] = mappings[queryIndex];
-    if (referenceIndex == 26038)
-      Log::Warn << "assignment for point " << referenceIndex << " set to " <<
-mappings[queryIndex] << ".\n";
   }
 
   ++visited[referenceIndex];
 
   if (visited[referenceIndex] + traversalPruned == centroids.n_cols)
   {
-    if (referenceIndex == 26038)
-      Log::Warn << "assignment for point " << referenceIndex << " committed to " <<
-assignments[referenceIndex] << ".\n";
     newCentroids.col(assignments[referenceIndex]) +=
         dataset.col(referenceIndex);
     ++counts(assignments[referenceIndex]);
@@ -117,12 +105,7 @@ double DualTreeKMeansRules<MetricType, TreeType>::Score(
     TreeType& queryNode,
     TreeType& referenceNode)
 {
-//  if (referenceNode.Begin() == 2432)
-//    Log::Warn << "Visit q" << queryNode.Begin() << "c" << queryNode.Count() <<
-//", r" << referenceNode.Begin() << "c" << referenceNode.Count() << ".\n";
-
   // This won't happen with the root since it is explicitly set to 0.
-  const size_t origPruned = referenceNode.Stat().ClustersPruned();
   if (referenceNode.Stat().ClustersPruned() == size_t(-1))
     referenceNode.Stat().ClustersPruned() =
         referenceNode.Parent()->Stat().ClustersPruned();
@@ -132,9 +115,6 @@ double DualTreeKMeansRules<MetricType, TreeType>::Score(
     // Add to centroids if necessary.
     if (referenceNode.Stat().MinQueryNodeDistance() == DBL_MAX /* hack */)
     {
-      if (referenceNode.Begin() == 26038)
-        Log::Warn << "Add centroid mass for r26038c" << referenceNode.Count() <<
-".\n";
       newCentroids.col(referenceNode.Stat().Owner()) +=
           referenceNode.NumDescendants() * referenceNode.Stat().Centroid();
       counts(referenceNode.Stat().Owner()) += referenceNode.NumDescendants();
@@ -154,9 +134,6 @@ double DualTreeKMeansRules<MetricType, TreeType>::Score(
   // Is this closer than the current best query node?
   if (distances.Lo() < referenceNode.Stat().MinQueryNodeDistance())
   {
-    if (referenceNode.Begin() == 26038)
-      Log::Warn << "r26038c" << referenceNode.Count() << ": new CQN " <<
-queryNode.Begin() << "c" << queryNode.Count() << ".\n";
     // This is the new closest node.
     referenceNode.Stat().SecondClosestQueryNode() =
         referenceNode.Stat().ClosestQueryNode();
@@ -170,14 +147,25 @@ queryNode.Begin() << "c" << queryNode.Count() << ".\n";
   }
   else if (distances.Lo() < referenceNode.Stat().SecondMinQueryNodeDistance())
   {
-    if (referenceNode.Begin() == 26038)
-      Log::Warn << "r26038c" << referenceNode.Count() << ": new SCQN " <<
-queryNode.Begin() << "c" << queryNode.Count() << ".\n";
     // This is the new second closest node.
     referenceNode.Stat().SecondClosestQueryNode() = (void*) &queryNode;
     referenceNode.Stat().SecondMinQueryNodeDistance() = distances.Lo();
     referenceNode.Stat().SecondMaxQueryNodeDistance() = distances.Hi();
   }
+  else if (distances.Lo() > referenceNode.Stat().SecondMaxQueryNodeDistance())
+  {
+    // This is a Pelleg-Moore type prune.
+//    Log::Warn << "Pelleg-Moore prune: " << distances.Lo() << "/" <<
+//distances.Hi() << ", r" << referenceNode.Begin() << "c" << referenceNode.Count()
+//<< ", q" << queryNode.Begin() << "c" << queryNode.Count() << "; mQND " <<
+//referenceNode.Stat().MinQueryNodeDistance() << ", MQND " <<
+//referenceNode.Stat().MaxQueryNodeDistance() << ", smQND " <<
+//referenceNode.Stat().SecondMinQueryNodeDistance() << ", sMQND " <<
+//referenceNode.Stat().SecondMaxQueryNodeDistance() << ".\n";
+
+    referenceNode.Stat().ClustersPruned() += queryNode.NumDescendants();
+    return DBL_MAX;
+  }
 
   return 0.0; // No pruning allowed at this time.
 }



More information about the mlpack-git mailing list