[mlpack-svn] r11926 - mlpack/trunk/src/mlpack/methods/kmeans

fastlab-svn at coffeetalk-1.cc.gatech.edu fastlab-svn at coffeetalk-1.cc.gatech.edu
Tue Mar 20 23:04:13 EDT 2012


Author: jcline3
Date: 2012-03-20 23:04:13 -0400 (Tue, 20 Mar 2012)
New Revision: 11926

Modified:
   mlpack/trunk/src/mlpack/methods/kmeans/kmeans_impl.hpp
   mlpack/trunk/src/mlpack/methods/kmeans/kmeans_main.cpp
Log:
Working, mostly finished pelleg and moore mrkd kmeans implementation


Modified: mlpack/trunk/src/mlpack/methods/kmeans/kmeans_impl.hpp
===================================================================
--- mlpack/trunk/src/mlpack/methods/kmeans/kmeans_impl.hpp	2012-03-21 02:34:52 UTC (rev 11925)
+++ mlpack/trunk/src/mlpack/methods/kmeans/kmeans_impl.hpp	2012-03-21 03:04:13 UTC (rev 11926)
@@ -13,6 +13,7 @@
 #include <mlpack/core/metrics/lmetric.hpp>
 
 #include <stack>
+#include <limits>
 
 namespace mlpack {
 namespace kmeans {
@@ -70,97 +71,121 @@
     actualClusters = clusters;
   }
 
-  // TODO: remove
-  // Scale the data to [0,1]
-  if(0){
-    arma::rowvec min = arma::min(data, 0);
-    data = (data - arma::ones<arma::colvec>(data.n_rows) * min) / (arma::ones<arma::colvec>(data.n_rows) * (arma::max(data,0) - min));
-    for(size_t i = 0; i < data.n_cols; ++i)
-      for(size_t j = 0; j < data.n_rows; ++j)
-        assert(data(j,i) >= 0 && data(j,i) <= 1);
-  }
+  size_t dimensionality = data.n_rows;
 
-  if (assignments.n_rows != data.n_cols)
-    assignments.resize(data.n_cols);
-
   // Centroids of each cluster.  Each column corresponds to a centroid.
-  MatType centroids(data.n_rows, actualClusters);
+  MatType centroids(dimensionality, actualClusters);
+  centroids.zeros();
 
   // Counts of points in each cluster.
   arma::Col<size_t> counts(actualClusters);
   counts.zeros();
 
   // Build the mrkd-tree on this dataset
-  tree::BinarySpaceTree<typename bound::HRectBound<2>, tree::MRKDStatistic> tree(data, 1);
+  tree::BinarySpaceTree<typename bound::HRectBound<2>, tree::MRKDStatistic> tree(data, 10);
+  std::cout << "Tree Built \n";
   // A pointer for traversing the mrkd-tree
   tree::BinarySpaceTree<typename bound::HRectBound<2>, tree::MRKDStatistic>* node;
 
-  // We use this to store the furtherst point in a hyperrectangle from a given
-  // vector.
-  arma::colvec p(data.n_rows);
-
-  // Make random centroids and fit them into the root hyperrectangle.
+  // Now, the initial assignments.  First determine if they are necessary.
+  if (assignments.n_elem != data.n_cols)
   {
-    centroids.randu();
-    bound::HRectBound<2>& bound = tree.Bound();
-    size_t dim = bound.Dim();
-    for (size_t i = 0; i < dim; ++i) {
-      double min = bound[i].Lo();
-      double max = bound[i].Hi();
-      for (size_t j = 0; j < centroids.n_cols; ++j)
-      {
-        if (centroids(i,j) < min)
-          centroids(i,j) = min;
-        else if (centroids(i,j) > max)
-          centroids(i,j) = max;
-      }
-    }
+    // Use the partitioner to come up with the partition assignments.
+    partitioner.Cluster(data, actualClusters, assignments);
   }
 
+  // Set counts correctly.
+  for (size_t i = 0; i < assignments.n_elem; i++)
+    counts[assignments[i]]++;
+
+  // Sum the points for each centroid
+  for (size_t i = 0; i < data.n_cols; i++)
+    centroids.col(assignments[i]) += data.col(i);
+
+  // Then divide the sums by the count to get the center of mass for this
+  // centroids assigned points
+  for (size_t i = 0; i < actualClusters; i++)
+    centroids.col(i) /= counts[i];
+
   // Instead of retraversing the tree after an iteration, we will update centroid
   // positions in this matrix, which also prevents clobbering our centroids from
   // the previous iteration.
-  MatType newCentroids(centroids.n_rows, centroids.n_cols);
+  MatType newCentroids(dimensionality, centroids.n_cols);
 
-  std::cout << data.n_cols << std::endl;
+  // Create a stack for traversing the mrkd-tree
+  std::stack<typename tree::BinarySpaceTree<typename bound::HRectBound<2>, 
+                                            tree::MRKDStatistic>* > stack;
+
+  // A variable to keep track of how many kmeans iterations we have made
   size_t iteration = 0;
+
+  // A variable to keep track of how many nodes assignments have changed in
+  // each kmeans iteration
   size_t changedAssignments = 0;
+
+  // A variable to keep track of the number of times something is skipped due
+  // to the blacklist
+  size_t skip = 0;
+
+  // A variable to keep track of the number of distances calculated
+  size_t comps = 0;
+
+  // A variable to keep track of how often we stop at a parent node
+  size_t dominations = 0;
   do 
   {
     // Keep track of what iteration we are on.
     ++iteration;
     changedAssignments = 0;
+
+    // Reset the newCentroids so that we can store the newly calculated ones
+    // here
     newCentroids.zeros();
+
+    // Reset the counts
     counts.zeros();
 
-    // Create a stack for traversing the mrkd-tree
-    std::stack<typename tree::BinarySpaceTree<typename bound::HRectBound<2>, 
-                                              tree::MRKDStatistic>* > stack;
     // Add the root node of the tree to the stack
     stack.push(&tree);
+    // Set the top level whitelist
+    tree.Stat().whiteList.resize(centroids.n_cols, true);
 
+    // Traverse the tree
     while (!stack.empty())
     {
+      // Get the next node in the tree
       node = stack.top();
+      // Remove the node from the stack
       stack.pop();
 
+      // Get a reference to the mrkd statistic for this hyperrectangle
       tree::MRKDStatistic& mrkd = node->Stat();
 
+      // We use this to store the index of the centroid with the minimum
+      // distance from this hyperrectangle or point
       size_t minIndex = 0;
 
       // If this node is a leaf, then we calculate the distance from
       // the centroids to every point the node contains.
       if (node->IsLeaf())
       {
-        //std::cout << "Leaf\t";
         for (size_t i = mrkd.begin; i < mrkd.count + mrkd.begin; ++i)
         {
           // Initialize minDistance to be nonzero.
           double minDistance = metric::SquaredEuclideanDistance::Evaluate(
               data.col(i), centroids.col(0));
+
           // Find the minimal distance centroid for this point.
           for (size_t j = 1; j < centroids.n_cols; ++j)
           {
+            // If this centroid is not in the whitelist, skip it
+            if (!mrkd.whiteList[j])
+            {
+              ++skip;
+              continue;
+            }
+
+            ++comps;
             double distance = metric::SquaredEuclideanDistance::Evaluate(
                 data.col(i), centroids.col(j));
             if ( minDistance > distance )
@@ -170,115 +195,298 @@
             }
           }
 
+          // Add this point to the undivided center of mass summation for
+          // it's assigned centroid
           newCentroids.col(minIndex) += data.col(i);
+
+          // Increment the count for the minimum distance centroid
           ++counts(minIndex);
-          //std::cout << counts(minIndex) << "\t";
+
+          // If we actually changed assignments, increment changedAssignments
+          // and modify the assignment vector for this point
           if (assignments(i) != minIndex)
           {
             ++changedAssignments;
-            // TODO: this if should be removed
-            //if(counts(assignments(i)))
-              //--counts(assignments(i));
             assignments(i) = minIndex;
           }
         }
-        //std::cout << std::endl;
       }
       // If this node is not a leaf, then we continue trying to find dominant
       // centroids
       else
       {
-        //std::cout << "Parent\t";
         bound::HRectBound<2>& bound = node->Bound();
 
+        // A flag to keep track of if we find a single centroid that is closer
+        // to all points in this hyperrectangle than any other centroid.
         bool noDomination = false;
 
-        // There was no centroid inside this hyperrectangle.
-        // We must determine if an external centroid dominates it.
-        for (size_t i = 0; i < centroids.n_cols; ++i) 
+        // Calculate the center of mass of this hyperrectangle
+        arma::vec center = mrkd.centerOfMass / mrkd.count;
+
+        // Set the minDistance to the maximum value of a double so any value 
+        // must be smaller than this
+        double minDistance = std::numeric_limits<double>::max();
+
+        // The candidate distance we calculate for each centroid
+        double distance = 0.0;
+
+        // How many points are inside this hyperrectangle, we stop if we
+        // see more than 1
+        size_t contains = 0;
+
+        // Find the "owner" of this hyperrectangle, if one exists
+        for (size_t i = 0; i < centroids.n_cols; ++i)
         {
-          noDomination = false;
-          for (size_t j = 0; j < centroids.n_cols; ++j)
+          // If this centroid is not in the whitelist, skip it
+          if (!mrkd.whiteList[i])
           {
-            if (j == i)
-              continue;
+            ++skip;
+            continue;
+          }
 
-            for (size_t k = 0; k < p.n_rows; ++k)
+          // Incrememnt the number of distance calculations for what we are
+          // about to do
+          comps += 2;
+
+          // Reinitialize the distance so += works right
+          distance = 0.0;
+
+          // We keep track of how many dimensions have nonzero distance,
+          // if this is 0 then the distance is 0.
+          size_t nonZero = 0;
+
+          /*
+            Compute the distance to the hyperrectangle for this centroid.
+            We do this by finding the furthest point from the centroid inside
+            the hyperrectangle. This is a corner of the hyperrectangle.
+
+            In order to do this faster, we calculate both the distance and the
+            furthest point simultaneously.
+
+            This following code is equivalent to, but faster than:
+
+            arma::vec p;
+            p.zeros(dimensionality);
+
+            for (size_t j = 0; j < dimensionality; ++j)
             {
-              p(k) = (centroids(k,j) > centroids(k,i)) ?
-                bound[k].Hi() : bound[k].Lo();
+              if (centroids(j,i) < bound[j].Lo())
+                p(j) = bound[j].Lo();
+              else
+                p(j) = bound[j].Hi();
             }
 
-            double distancei = metric::SquaredEuclideanDistance::Evaluate(
+            distance = metric::SquaredEuclideanDistance::Evaluate(
                 p.col(0), centroids.col(i));
-            double distancej = metric::SquaredEuclideanDistance::Evaluate(
-                p.col(0), centroids.col(j));
+          */
+          for (size_t j = 0; j < dimensionality; ++j)
+          {
+            double ij = centroids(j,i);
+            double lo = bound[j].Lo();
 
-            if (distancei >= distancej)
+            if (ij < lo)
             {
+              // (ij - lo)^2
+              ij -= lo;
+              ij *= ij;
+
+              distance += ij;
+              ++nonZero;
+            }
+            else
+            {
+              double hi = bound[j].Hi();
+              if (ij > hi)
+              {
+                // (ij - hi)^2
+                ij -= hi;
+                ij *= ij;
+
+                distance += ij;
+                ++nonZero;
+              }
+            }
+          }
+
+          // The centroid is inside the hyperrectangle
+          if (nonZero == 0)
+          {
+            ++contains;
+            minDistance = 0.0;
+            minIndex = i;
+
+            // If more than two points are within this hyperrectangle, then
+            // there can be no dominating centroid, so we should continue
+            // to the children nodes.
+            if (contains > 1)
+            {
               noDomination = true;
               break;
             }
+          }
 
+          if (fabs(distance - minDistance) <= 1e-10)
+          {
+            noDomination = true;
+            break;
           }
+          else if (distance < minDistance)
+          {
+            minIndex = i;
+          }
+        }
 
-          // We identified a centroid that dominates this hyperrectangle.
-          if (!noDomination)
+        distance = minDistance;
+        // Determine if the owner dominates this centroid only if there was
+        // exactly one owner
+        if (!noDomination)
+        {
+          for (size_t i = 0; i < centroids.n_cols; ++i)
           {
-            //std::cout << "Domination\t";
-            newCentroids.col(minIndex) += mrkd.centerOfMass;
-            counts(i) += mrkd.count;
-            //std::cout << counts(i) << std::endl;
-            // Update all assignments for this node
-            const size_t begin = node->Begin();
-            const size_t end = node->End();
-            for (size_t j = begin; j < end; ++j)
+            if (i == minIndex)
+              continue;
+            // If this centroid is blacklisted for this hyperrectangle, then
+            // we skip it
+            if (!mrkd.whiteList[i])
             {
-              if (assignments(j) != i)
+              ++skip;
+              continue;
+            }
+            /*
+              Compute the dominating centroid for this hyperrectangle, if one
+              exists. We do this by calculating the point which is furthest
+              from the i'th centroid in the direction of c_k - c_i. We do this
+              as outlined in the Pelleg and Moore paper.
+
+              This following code is equivalent to, but faster than:
+
+              arma::vec p;
+              p.zeros(dimensionality);
+
+              for (size_t k = 0; k < dimensionality; ++k)
               {
-                ++changedAssignments;
-                //if(counts(assignments(j)))
-                  //--counts(assignments(j));
-                //++counts(i);
-                assignments(j) = i;
+                p(k) = (centroids(k,i) > centroids(k,minIndex)) ?
+                  bound[k].Hi() : bound[k].Lo();
               }
+
+              double distancei = metric::SquaredEuclideanDistance::Evaluate(
+                  p.col(0), centroids.col(i));
+              double distanceMin = metric::SquaredEuclideanDistance::Evaluate(
+                  p.col(0), centroids.col(minIndex));
+            */
+
+            comps += 1;
+            double distancei = 0.0;
+            double distanceMin = 0.0;
+            for (size_t k = 0; k < dimensionality; ++k)
+            {
+              double ci = centroids(k,i);
+              double cm = centroids(k,minIndex);
+              if (ci > cm)
+              {
+                double hi = bound[k].Hi();
+
+                ci -= hi;
+                cm -= hi;
+
+                ci *= ci;
+                cm *= cm;
+
+                distancei += ci;
+                distanceMin += cm;
+              }
+              else
+              {
+                double lo = bound[k].Lo();
+
+                ci -= lo;
+                cm -= lo;
+
+                ci *= ci;
+                cm *= cm;
+
+                distancei += ci;
+                distanceMin += cm;
+              }
             }
-            mrkd.dominatingCentroid = i;
-            break;
+
+            if(distanceMin >= distancei)
+            {
+              noDomination = true;
+              break;
+            }
+            else
+            {
+              mrkd.whiteList[i] = false;
+            }
           }
         }
+        else
+        {
+          noDomination = true;
+        }
 
+        // If did found a centroid that was closer to every point in the
+        // hyperrectangle than every other centroid, then update that centroid
+        if (!noDomination)
+        {
+          // Adjust the new centroid sum for the min distance point to this
+          // hyperrectangle by the center of mass of this hyperrectangle
+          newCentroids.col(minIndex) += mrkd.centerOfMass;
+
+          // Increment the counts for this centroid
+          counts(minIndex) += mrkd.count;
+
+          // Update all assignments for this node
+          const size_t begin = node->Begin();
+          const size_t end = node->End();
+
+          // TODO: Do this outside of the kmeans iterations
+          for (size_t j = begin; j < end; ++j)
+          {
+            if (assignments(j) != minIndex)
+            {
+              ++changedAssignments;
+              assignments(j) = minIndex;
+            }
+          }
+          mrkd.dominatingCentroid = minIndex;
+
+          // Keep track of the number of times we found a dominating centroid
+          ++dominations;
+        }
+
         // If we did not find a dominating centroid then we fall through to the
         // default case, where we add the children of this node to the stack.
-        if (noDomination)
+        else
         {
-          //std::cout << "No Domination" << std::endl;
+          // Add this hyperrectangle's children to our stack
           stack.push(node->Left());
           stack.push(node->Right());
+
+          // (Re)Initialize the whiteList for the children
+          node->Left()->Stat().whiteList = mrkd.whiteList;
+          node->Right()->Stat().whiteList = mrkd.whiteList;
         }
       }
 
     }
 
+    // Divide by the number of points assigned to the centroids so that we
+    // have the actual center of mass and update centroids' positions.
     for (size_t i = 0; i < centroids.n_cols; ++i)
     {
       if (counts(i)) {
-        // Divide by the number of points assigned to this centroid so that we
-        // have the actual center of mass and update centroids' positions.
         centroids.col(i) = newCentroids.col(i) / counts(i);
       }
     }
-    size_t count = 0;
-    for(size_t k = 0; k < counts.n_rows; ++k)
-    {
-      std::cout << counts(k) << '\t';
-      count += counts(k);
-    }
-    std::cout << '\n' << count <<'\t'<< data.n_cols<< std::endl;
-    assert(count <= data.n_cols);
-  } while(0);
-  //} while (changedAssignments > 0 && iteration != maxIterations);
 
+    // Stop when we reach max iterations or we changed no assignments
+    // assignments
+  } while (changedAssignments > 0 && iteration != maxIterations);
+
+  std::cout << "Iterations: " << iteration << '\t' << "Skips: " << skip << '\t' << "Comparisons: " << comps << '\t' << "Dominations: " << dominations << '\n';
 }
 
 /**
@@ -386,6 +594,7 @@
     iteration++;
 
   } while (changedAssignments > 0 && iteration != maxIterations);
+  std::cout << "Iterations: " << iteration << std::endl;
 
   // If we have overclustered, we need to merge the nearest clusters.
   if (actualClusters != clusters)

Modified: mlpack/trunk/src/mlpack/methods/kmeans/kmeans_main.cpp
===================================================================
--- mlpack/trunk/src/mlpack/methods/kmeans/kmeans_main.cpp	2012-03-21 02:34:52 UTC (rev 11925)
+++ mlpack/trunk/src/mlpack/methods/kmeans/kmeans_main.cpp	2012-03-21 03:04:13 UTC (rev 11926)
@@ -86,24 +86,29 @@
   // Now create the KMeans object.  Because we could be using different types,
   // it gets a little weird...
   arma::Col<size_t> assignments;
+
   if (CLI::HasParam("allow_empty_clusters"))
   {
     KMeans<metric::SquaredEuclideanDistance, RandomPartition,
         AllowEmptyClusters> k(maxIterations, overclustering);
 
+    Timer::Start("clustering");
 		if(CLI::HasParam("fast_kmeans"))
 			k.FastCluster(dataset, clusters, assignments);
 		else
 			k.Cluster(dataset, clusters, assignments);
+    Timer::Stop("clustering");
   }
   else
   {
     KMeans<> k(maxIterations, overclustering);
 
+    Timer::Start("clustering");
 		if(CLI::HasParam("fast_kmeans"))
 			k.FastCluster(dataset, clusters, assignments);
 		else
 			k.Cluster(dataset, clusters, assignments);
+    Timer::Stop("clustering");
   }
 
   // Now figure out what to do with our results.




More information about the mlpack-svn mailing list