[mlpack-git] master: Beginning to refactor dual-tree k-means. So far the Pelleg-Moore rule is properly refactored (well, sort of, at least). Try to move IterationUpdate() into DualTreeKMeans::TreeUpdate(); for now this will make the algorithm simpler: instead of updating on the fly, update after an iteration. It may make more sense to do updates on demand later, but let's make the algorithm work first... (ee45212)

gitdub at big.cc.gt.atl.ga.us gitdub at big.cc.gt.atl.ga.us
Thu Mar 12 16:01:29 EDT 2015


Repository : https://github.com/mlpack/mlpack

On branch  : master
Link       : https://github.com/mlpack/mlpack/compare/eddd7167d69b6c88b271ef2e51d1c20e13f1acd8...70342dd8e5c17e0c164cfb8189748671e9c0dd44

>---------------------------------------------------------------

commit ee45212808ab178cd9cde2162b030656e0dbdd3b
Author: ryan <ryan at ratml.org>
Date:   Fri Jan 2 00:24:50 2015 -0500

    Beginning to refactor dual-tree k-means. So far the Pelleg-Moore rule is properly refactored (well, sort of, at least). Try to move IterationUpdate() into DualTreeKMeans::TreeUpdate(); for now this will make the algorithm simpler: instead of updating on the fly, update after an iteration. It may make more sense to do updates on demand later, but let's make the algorithm work first...


>---------------------------------------------------------------

ee45212808ab178cd9cde2162b030656e0dbdd3b
 src/mlpack/methods/kmeans/dual_tree_kmeans.hpp     |  4 ++
 .../methods/kmeans/dual_tree_kmeans_impl.hpp       | 80 +++++++++++++++-------
 .../methods/kmeans/dual_tree_kmeans_rules_impl.hpp | 37 ++++++----
 3 files changed, 83 insertions(+), 38 deletions(-)

diff --git a/src/mlpack/methods/kmeans/dual_tree_kmeans.hpp b/src/mlpack/methods/kmeans/dual_tree_kmeans.hpp
index f2b6376..c1e6070 100644
--- a/src/mlpack/methods/kmeans/dual_tree_kmeans.hpp
+++ b/src/mlpack/methods/kmeans/dual_tree_kmeans.hpp
@@ -57,6 +57,10 @@ class DualTreeKMeans
 
   //! Track distance calculations.
   size_t distanceCalculations;
+
+  void TreeUpdate(TreeType* node,
+                  const size_t clusters,
+                  const arma::vec& clusterDistances);
 };
 
 template<typename MetricType, typename MatType>
diff --git a/src/mlpack/methods/kmeans/dual_tree_kmeans_impl.hpp b/src/mlpack/methods/kmeans/dual_tree_kmeans_impl.hpp
index 088d565..0937cb8 100644
--- a/src/mlpack/methods/kmeans/dual_tree_kmeans_impl.hpp
+++ b/src/mlpack/methods/kmeans/dual_tree_kmeans_impl.hpp
@@ -106,6 +106,9 @@ double DualTreeKMeans<MetricType, MatType, TreeType>::Iterate(
   }
   Log::Info << clusterDistances.t();
 
+  // Update the tree with the centroid movement information.
+  TreeUpdate(tree, centroids.n_cols, clusterDistances);
+
   delete centroidTree;
 
   ++iteration;
@@ -148,43 +151,72 @@ void DualTreeKMeans<MetricType, MatType, TreeType>::ClusterTreeUpdate(
   node->Stat().Owner() = maxChangeCluster;
   node->Stat().MinQueryNodeDistance() = maxChange;
 }
+*/
 
 template<typename MetricType, typename MatType, typename TreeType>
 void DualTreeKMeans<MetricType, MatType, TreeType>::TreeUpdate(
-    TreeType* node)
+    TreeType* node,
+    const size_t clusters,
+    const arma::vec& clusterDistances)
 {
   // This is basically IterationUpdate(), but pulled out to be separate from the
   // actual dual-tree algorithm.
 
-  // First, update the iteration.
-  const size_t itDiff = node->Stat().Iteration() - iteration;
+  if (node->Parent() != NULL && node->Parent()->Stat().Owner() < clusters)
+    node->Stat().Owner() = node->Parent()->Stat().Owner();
 
-  if (itDiff == 1)
+  // The easy case: this node had an owner.
+  if (node->Stat().Owner() < clusters)
   {
-    // The easy case.
-    if (node->Stat().Owner() < centroids.n_cols)
+    // During the last iteration, this node was pruned.
+    const size_t owner = node->Stat().Owner();
+    if (node->Stat().MaxQueryNodeDistance() != DBL_MAX)
+      node->Stat().MaxQueryNodeDistance() += clusterDistances[owner];
+    if (node->Stat().MinQueryNodeDistance() != DBL_MAX)
+      node->Stat().MinQueryNodeDistance() += clusterDistances[owner];
+
+/*
+    // During the last iteration, this node was pruned.  In addition, we have
+    // cached a lower bound on the second closest cluster.  So, use the
+    // triangle inequality: if the maximum distance between the point and the
+    // cluster centroid plus the distance that centroid moved is less than the
+    // lower bound minus the maximum moving centroid, then this cluster *must*
+    // still have the same owner.
+    const size_t owner = node->Stat().Owner();
+    const double closestUpperBound = node->Stat().MaxQueryNodeDistance() +
+        clusterDistances[owner];
+    const TreeType* nonOwner = (TreeType*) node->Stat().ClosestNonOwner();
+    const double tightestLowerBound = node->Stat().ClosestNonOwnerDistance() -
+        nonOwner->Stat().MinQueryNodeDistance();
+    if (closestUpperBound <= tightestLowerBound)
     {
-      // During the last iteration, this node was pruned.  In addition, we have
-      // cached a lower bound on the second closest cluster.  So, use the
-      // triangle inequality: if the maximum distance between the point and the
-      // cluster centroid plus the distance that centroid moved is less than the
-      // lower bound minus the maximum moving centroid, then this cluster *must*
-      // still have the same owner.
-      const size_t owner = node->Stat().Owner();
-      const double closestUpperBound = node->Stat().MaxQueryNodeDistance() +
-          clusterDistances[owner];
-      const TreeType* nonOwner = (TreeType*) node->Stat().ClosestNonOwner();
-      const double tightestLowerBound = node->Stat().ClosestNonOwnerDistance() -
-          nonOwner->Stat().MinQueryNodeDistance() /* abused from earlier *;
-      if (closestUpperBound <= tightestLowerBound)
-      {
-        // Then the owner must not have changed.
-
-      }
+      // Then the owner must not have changed.
     }
+*/
   }
+  else
+  {
+    // This node did not have a single owner, but did have a closest query
+    // node.  So we will simply loosen that bound.  The loosening here is too
+    // loose; TODO: tighten to the max cluster movement in the closest query
+    // node.
+    if (node->Stat().MaxQueryNodeDistance() != DBL_MAX)
+      node->Stat().MaxQueryNodeDistance() += clusterDistances[clusters];
+    if (node->Stat().MinQueryNodeDistance() != DBL_MAX)
+      node->Stat().MinQueryNodeDistance() += clusterDistances[clusters];
+  }
+
+  node->Stat().Iteration() = iteration;
+  node->Stat().ClustersPruned() = (node->Parent() == NULL) ? 0 : -1;
+  // We have to set the closest query node to NULL because the cluster tree will
+  // be rebuilt.
+  node->Stat().ClosestQueryNode() = NULL;
+//  node->Stat().MaxQueryNodeDistance() = DBL_MAX;
+//  node->Stat().MinQueryNodeDistance() = DBL_MAX;
+
+  for (size_t i = 0; i < node->NumChildren(); ++i)
+    TreeUpdate(&node->Child(i), clusters, clusterDistances);
 }
-*/
 
 
 } // namespace kmeans
diff --git a/src/mlpack/methods/kmeans/dual_tree_kmeans_rules_impl.hpp b/src/mlpack/methods/kmeans/dual_tree_kmeans_rules_impl.hpp
index 7676e1e..5678521 100644
--- a/src/mlpack/methods/kmeans/dual_tree_kmeans_rules_impl.hpp
+++ b/src/mlpack/methods/kmeans/dual_tree_kmeans_rules_impl.hpp
@@ -53,8 +53,8 @@ inline force_inline double DualTreeKMeansRules<MetricType, TreeType>::BaseCase(
 
   // Collect the number of clusters that have been pruned during the traversal.
   // The ternary operator may not be necessary.
-  const size_t traversalPruned = (traversalInfo.LastReferenceNode() != NULL &&
-      traversalInfo.LastReferenceNode()->Stat().Iteration() == iteration) ?
+  const size_t traversalPruned = (traversalInfo.LastReferenceNode() != NULL) ?
+//      traversalInfo.LastReferenceNode()->Stat().Iteration() == iteration) ?
       traversalInfo.LastReferenceNode()->Stat().ClustersPruned() : 0;
 
   // It's possible that the reference node has been pruned before we got to the
@@ -101,7 +101,7 @@ double DualTreeKMeansRules<MetricType, TreeType>::Score(
     TreeType& referenceNode)
 {
   // Update from previous iteration, if necessary.
-  IterationUpdate(referenceNode);
+//  IterationUpdate(referenceNode);
 
   // No pruning here, for now.
   return 0.0;
@@ -112,26 +112,32 @@ double DualTreeKMeansRules<MetricType, TreeType>::Score(
     TreeType& queryNode,
     TreeType& referenceNode)
 {
-  if (IterationUpdate(referenceNode) == DBL_MAX)
-  {
+//  if (IterationUpdate(referenceNode) == DBL_MAX)
+//  {
     // The iteration update showed that the owner could not possibly change.
-    return DBL_MAX;
-  }
+//    return DBL_MAX;
+//  }
+
+  if (referenceNode.Stat().ClustersPruned() == size_t(-1))
+    referenceNode.Stat().ClustersPruned() =
+        referenceNode.Parent()->Stat().ClustersPruned();
 
   traversalInfo.LastReferenceNode() = &referenceNode;
 
   // Can we update the minimum query node distance for this reference node?
   const double minDistance = referenceNode.MinDistance(&queryNode);
-  ++distanceCalculations;
-  if (minDistance < referenceNode.Stat().MinQueryNodeDistance())
+  const double maxDistance = referenceNode.MaxDistance(&queryNode);
+  distanceCalculations += 2;
+  if (maxDistance < referenceNode.Stat().MaxQueryNodeDistance())
   {
     referenceNode.Stat().ClosestQueryNode() = (void*) &queryNode;
     referenceNode.Stat().MinQueryNodeDistance() = minDistance;
-    referenceNode.Stat().MaxQueryNodeDistance() =
+    referenceNode.Stat().MaxQueryNodeDistance() = maxDistance;
         referenceNode.MaxDistance(&queryNode);
-    ++distanceCalculations;
+//    ++distanceCalculations;
     return 0.0; // Pruning is not possible.
   }
+
   else if (IsDescendantOf(
       *((TreeType*) referenceNode.Stat().ClosestQueryNode()), queryNode))
   {
@@ -144,9 +150,10 @@ double DualTreeKMeansRules<MetricType, TreeType>::Score(
     return 0.0; // Pruning is not possible.
   }
 
-  double score = ElkanTypeScore(queryNode, referenceNode);
-  if (score != DBL_MAX)
-    score = PellegMooreScore(queryNode, referenceNode, minDistance);
+//  double score = ElkanTypeScore(queryNode, referenceNode);
+//  if (score != DBL_MAX)
+
+  double score = PellegMooreScore(queryNode, referenceNode, minDistance);
 
   if (score == DBL_MAX)
   {
@@ -179,6 +186,7 @@ double DualTreeKMeansRules<MetricType, TreeType>::Score(
   }
 
   return score;
+//  return 0.0;
 }
 
 template<typename MetricType, typename TreeType>
@@ -208,6 +216,7 @@ template<typename MetricType, typename TreeType>
 inline double DualTreeKMeansRules<MetricType, TreeType>::IterationUpdate(
     TreeType& referenceNode)
 {
+  Log::Fatal << "Update! Why!\n";
   if (referenceNode.Stat().Iteration() == iteration)
     return 0;
 



More information about the mlpack-git mailing list