[mlpack-git] master,mlpack-1.0.x: Remove the Pelleg-Moore k-means implementation; it is being replaced. (1e5af1f)

gitdub at big.cc.gt.atl.ga.us gitdub at big.cc.gt.atl.ga.us
Thu Mar 5 21:47:20 EST 2015


Repository : https://github.com/mlpack/mlpack

On branches: master,mlpack-1.0.x
Link       : https://github.com/mlpack/mlpack/compare/904762495c039e345beba14c1142fd719b3bd50e...f94823c800ad6f7266995c700b1b630d5ffdcf40

>---------------------------------------------------------------

commit 1e5af1f408cef192b3ce931cf9268c92cd68f4d2
Author: Ryan Curtin <ryan at ratml.org>
Date:   Wed May 21 16:58:08 2014 +0000

    Remove the Pelleg-Moore k-means implementation; it is being replaced.


>---------------------------------------------------------------

1e5af1f408cef192b3ce931cf9268c92cd68f4d2
 src/mlpack/methods/kmeans/kmeans.hpp      |  10 -
 src/mlpack/methods/kmeans/kmeans_impl.hpp | 433 ------------------------------
 src/mlpack/methods/kmeans/kmeans_main.cpp |  22 +-
 3 files changed, 3 insertions(+), 462 deletions(-)

diff --git a/src/mlpack/methods/kmeans/kmeans.hpp b/src/mlpack/methods/kmeans/kmeans.hpp
index e070e5d..44883f4 100644
--- a/src/mlpack/methods/kmeans/kmeans.hpp
+++ b/src/mlpack/methods/kmeans/kmeans.hpp
@@ -142,16 +142,6 @@ class KMeans
                const bool initialAssignmentGuess = false,
                const bool initialCentroidGuess = false) const;
 
-  /**
-   * An implementation of k-means using the Pelleg-Moore algorithm; this is
-   * known to not work -- do not use it!  (Fixing it is TODO, of course; see
-   * #251.)
-   */
-  template<typename MatType>
-  void FastCluster(MatType& data,
-                   const size_t clusters,
-                   arma::Col<size_t>& assignments) const;
-
   //! Return the overclustering factor.
   double OverclusteringFactor() const { return overclusteringFactor; }
   //! Set the overclustering factor.  Must be greater than 1.
diff --git a/src/mlpack/methods/kmeans/kmeans_impl.hpp b/src/mlpack/methods/kmeans/kmeans_impl.hpp
index 6d48517..1ef8c6f 100644
--- a/src/mlpack/methods/kmeans/kmeans_impl.hpp
+++ b/src/mlpack/methods/kmeans/kmeans_impl.hpp
@@ -49,439 +49,6 @@ KMeans(const size_t maxIterations,
   }
 }
 
-template<typename MetricType,
-         typename InitialPartitionPolicy,
-         typename EmptyClusterPolicy>
-template<typename MatType>
-void KMeans<
-    MetricType,
-    InitialPartitionPolicy,
-    EmptyClusterPolicy>::
-FastCluster(MatType& data,
-            const size_t clusters,
-            arma::Col<size_t>& assignments) const
-{
-  size_t actualClusters = size_t(overclusteringFactor * clusters);
-  if (actualClusters > data.n_cols)
-  {
-    Log::Warn << "KMeans::Cluster(): overclustering factor is too large.  No "
-        << "overclustering will be done." << std::endl;
-    actualClusters = clusters;
-  }
-
-  size_t dimensionality = data.n_rows;
-
-  // Centroids of each cluster.  Each column corresponds to a centroid.
-  MatType centroids(dimensionality, actualClusters);
-  centroids.zeros();
-
-  // Counts of points in each cluster.
-  arma::Col<size_t> counts(actualClusters);
-  counts.zeros();
-
-  // Build the mrkd-tree on this dataset.
-  tree::BinarySpaceTree<typename bound::HRectBound<2>, tree::MRKDStatistic>
-      tree(data, 1);
-  Log::Debug << "Tree Built." << std::endl;
-  // A pointer for traversing the mrkd-tree.
-  tree::BinarySpaceTree<typename bound::HRectBound<2>, tree::MRKDStatistic>*
-      node;
-
-  // Now, the initial assignments.  First determine if they are necessary.
-  if (assignments.n_elem != data.n_cols)
-  {
-    // Use the partitioner to come up with the partition assignments.
-    partitioner.Cluster(data, actualClusters, assignments);
-  }
-
-  // Set counts correctly.
-  for (size_t i = 0; i < assignments.n_elem; i++)
-    counts[assignments[i]]++;
-
-  // Sum the points for each centroid
-  for (size_t i = 0; i < data.n_cols; i++)
-    centroids.col(assignments[i]) += data.col(i);
-
-  // Then divide the sums by the count to get the center of mass for this
-  // centroids assigned points
-  for (size_t i = 0; i < actualClusters; i++)
-    centroids.col(i) /= counts[i];
-
-  // Instead of retraversing the tree after an iteration, we will update
-  // centroid positions in this matrix, which also prevents clobbering our
-  // centroids from the previous iteration.
-  MatType newCentroids(dimensionality, centroids.n_cols);
-
-  // Create a stack for traversing the mrkd-tree.
-  std::stack<typename tree::BinarySpaceTree<typename bound::HRectBound<2>,
-                                            tree::MRKDStatistic>* > stack;
-
-  // A variable to keep track of how many kmeans iterations we have made.
-  size_t iteration = 0;
-
-  // A variable to keep track of how many nodes assignments have changed in
-  // each kmeans iteration.
-  size_t changedAssignments = 0;
-
-  // A variable to keep track of the number of times something is skipped due
-  // to the blacklist.
-  size_t skip = 0;
-
-  // A variable to keep track of the number of distances calculated.
-  size_t comps = 0;
-
-  // A variable to keep track of how often we stop at a parent node.
-  size_t dominations = 0;
-  do
-  {
-    // Keep track of what iteration we are on.
-    ++iteration;
-    changedAssignments = 0;
-
-    // Reset the newCentroids so that we can store the newly calculated ones
-    // here.
-    newCentroids.zeros();
-
-    // Reset the counts.
-    counts.zeros();
-
-    // Add the root node of the tree to the stack.
-    stack.push(&tree);
-    // Set the top level whitelist.
-    tree.Stat().Whitelist().resize(centroids.n_cols, true);
-
-    // Traverse the tree.
-    while (!stack.empty())
-    {
-      // Get the next node in the tree.
-      node = stack.top();
-      // Remove the node from the stack.
-      stack.pop();
-
-      // Get a reference to the mrkd statistic for this hyperrectangle.
-      tree::MRKDStatistic& mrkd = node->Stat();
-
-      // We use this to store the index of the centroid with the minimum
-      // distance from this hyperrectangle or point.
-      size_t minIndex = 0;
-
-      // If this node is a leaf, then we calculate the distance from
-      // the centroids to every point the node contains.
-      if (node->IsLeaf())
-      {
-        for (size_t i = mrkd.Begin(); i < mrkd.Count() + mrkd.Begin(); ++i)
-        {
-          // Initialize minDistance to be nonzero.
-          double minDistance = metric.Evaluate(data.col(i), centroids.col(0));
-
-          // Find the minimal distance centroid for this point.
-          for (size_t j = 1; j < centroids.n_cols; ++j)
-          {
-            // If this centroid is not in the whitelist, skip it.
-            if (!mrkd.Whitelist()[j])
-            {
-              ++skip;
-              continue;
-            }
-
-            ++comps;
-            double distance = metric.Evaluate(data.col(i), centroids.col(j));
-            if (minDistance > distance)
-            {
-              minIndex = j;
-              minDistance = distance;
-            }
-          }
-
-          // Add this point to the undivided center of mass summation for its
-          // assigned centroid.
-          newCentroids.col(minIndex) += data.col(i);
-
-          // Increment the count for the minimum distance centroid.
-          ++counts(minIndex);
-
-          // If we actually changed assignments, increment changedAssignments
-          // and modify the assignment vector for this point.
-          if (assignments(i) != minIndex)
-          {
-            ++changedAssignments;
-            assignments(i) = minIndex;
-          }
-        }
-      }
-      // If this node is not a leaf, then we continue trying to find dominant
-      // centroids.
-      else
-      {
-        bound::HRectBound<2>& bound = node->Bound();
-
-        // A flag to keep track of if we find a single centroid that is closer
-        // to all points in this hyperrectangle than any other centroid.
-        bool noDomination = false;
-
-        // Calculate the center of mass of this hyperrectangle.
-        arma::vec center = mrkd.CenterOfMass() / mrkd.Count();
-
-        // Set the minDistance to the maximum value of a double so any value
-        // must be smaller than this.
-        double minDistance = std::numeric_limits<double>::max();
-
-        // The candidate distance we calculate for each centroid.
-        double distance = 0.0;
-
-        // How many points are inside this hyperrectangle, we stop if we
-        // see more than 1.
-        size_t contains = 0;
-
-        // Find the "owner" of this hyperrectangle, if one exists.
-        for (size_t i = 0; i < centroids.n_cols; ++i)
-        {
-          // If this centroid is not in the whitelist, skip it.
-          if (!mrkd.Whitelist()[i])
-          {
-            ++skip;
-            continue;
-          }
-
-          // Incrememnt the number of distance calculations for what we are
-          // about to do.
-          comps += 2;
-
-          // Reinitialize the distance so += works right.
-          distance = 0.0;
-
-          // We keep track of how many dimensions have nonzero distance,
-          // if this is 0 then the distance is 0.
-          size_t nonZero = 0;
-
-          /*
-            Compute the distance to the hyperrectangle for this centroid.
-            We do this by finding the furthest point from the centroid inside
-            the hyperrectangle. This is a corner of the hyperrectangle.
-
-            In order to do this faster, we calculate both the distance and the
-            furthest point simultaneously.
-
-            This following code is equivalent to, but faster than:
-
-            arma::vec p;
-            p.zeros(dimensionality);
-
-            for (size_t j = 0; j < dimensionality; ++j)
-            {
-              if (centroids(j,i) < bound[j].Lo())
-                p(j) = bound[j].Lo();
-              else
-                p(j) = bound[j].Hi();
-            }
-
-            distance = metric.Evaluate(p.col(0), centroids.col(i));
-          */
-          for (size_t j = 0; j < dimensionality; ++j)
-          {
-            double ij = centroids(j,i);
-            double lo = bound[j].Lo();
-
-            if (ij < lo)
-            {
-              // (ij - lo)^2
-              ij -= lo;
-              ij *= ij;
-
-              distance += ij;
-              ++nonZero;
-            }
-            else
-            {
-              double hi = bound[j].Hi();
-              if (ij > hi)
-              {
-                // (ij - hi)^2
-                ij -= hi;
-                ij *= ij;
-
-                distance += ij;
-                ++nonZero;
-              }
-            }
-          }
-
-          // The centroid is inside the hyperrectangle.
-          if (nonZero == 0)
-          {
-            ++contains;
-            minDistance = 0.0;
-            minIndex = i;
-
-            // If more than two points are within this hyperrectangle, then
-            // there can be no dominating centroid, so we should continue
-            // to the children nodes.
-            if (contains > 1)
-            {
-              noDomination = true;
-              break;
-            }
-          }
-
-          if (fabs(distance - minDistance) <= 1e-10)
-          {
-            noDomination = true;
-            break;
-          }
-          else if (distance < minDistance)
-          {
-            minIndex = i;
-            minDistance = distance;
-          }
-        }
-
-        distance = minDistance;
-        // Determine if the owner dominates this centroid only if there was
-        // exactly one owner.
-        if (!noDomination)
-        {
-          for (size_t i = 0; i < centroids.n_cols; ++i)
-          {
-            if (i == minIndex)
-              continue;
-            // If this centroid is blacklisted for this hyperrectangle, then
-            // we skip it.
-            if (!mrkd.Whitelist()[i])
-            {
-              ++skip;
-              continue;
-            }
-            /*
-              Compute the dominating centroid for this hyperrectangle, if one
-              exists. We do this by calculating the point which is furthest
-              from the min'th centroid in the direction of c_k - c_min. We do
-              this as outlined in the Pelleg and Moore paper.
-
-              This following code is equivalent to, but faster than:
-
-              arma::vec p;
-              p.zeros(dimensionality);
-
-              for (size_t k = 0; k < dimensionality; ++k)
-              {
-                p(k) = (centroids(k,i) > centroids(k,minIndex)) ?
-                  bound[k].Hi() : bound[k].Lo();
-              }
-
-              double distancei = metric.Evaluate(p.col(0), centroids.col(i));
-              double distanceMin = metric.Evaluate(p.col(0),
-                  centroids.col(minIndex));
-            */
-
-            comps += 1;
-            double distancei = 0.0;
-            double distanceMin = 0.0;
-            for (size_t k = 0; k < dimensionality; ++k)
-            {
-              double ci = centroids(k, i);
-              double cm = centroids(k, minIndex);
-              if (ci > cm)
-              {
-                double hi = bound[k].Hi();
-
-                ci -= hi;
-                cm -= hi;
-
-                ci *= ci;
-                cm *= cm;
-
-                distancei += ci;
-                distanceMin += cm;
-              }
-              else
-              {
-                double lo = bound[k].Lo();
-
-                ci -= lo;
-                cm -= lo;
-
-                ci *= ci;
-                cm *= cm;
-
-                distancei += ci;
-                distanceMin += cm;
-              }
-            }
-
-            if (distanceMin >= distancei)
-            {
-              noDomination = true;
-              break;
-            }
-            else
-            {
-              mrkd.Whitelist()[i] = false;
-            }
-          }
-        }
-
-        // If did found a centroid that was closer to every point in the
-        // hyperrectangle than every other centroid, then update that centroid.
-        if (!noDomination)
-        {
-          // Adjust the new centroid sum for the min distance point to this
-          // hyperrectangle by the center of mass of this hyperrectangle.
-          newCentroids.col(minIndex) += mrkd.CenterOfMass();
-
-          // Increment the counts for this centroid.
-          counts(minIndex) += mrkd.Count();
-
-          // Update all assignments for this node.
-          const size_t begin = node->Begin();
-          const size_t end = node->End();
-
-          // TODO: Do this outside of the kmeans iterations.
-          for (size_t j = begin; j < end; ++j)
-          {
-            if (assignments(j) != minIndex)
-            {
-              ++changedAssignments;
-              assignments(j) = minIndex;
-            }
-          }
-          mrkd.DominatingCentroid() = minIndex;
-
-          // Keep track of the number of times we found a dominating centroid.
-          ++dominations;
-        }
-
-        // If we did not find a dominating centroid then we fall through to the
-        // default case, where we add the children of this node to the stack.
-        else
-        {
-          // Add this hyperrectangle's children to our stack.
-          stack.push(node->Left());
-          stack.push(node->Right());
-
-          // (Re)Initialize the whiteList for the children.
-          node->Left()->Stat().Whitelist() = mrkd.Whitelist();
-          node->Right()->Stat().Whitelist() = mrkd.Whitelist();
-        }
-      }
-
-    }
-
-    // Divide by the number of points assigned to the centroids so that we
-    // have the actual center of mass and update centroids' positions.
-    for (size_t i = 0; i < centroids.n_cols; ++i)
-      if (counts(i))
-        centroids.col(i) = newCentroids.col(i) / counts(i);
-
-    // Stop when we reach max iterations or we changed no assignments
-    // assignments.
-  } while (changedAssignments > 0 && iteration != maxIterations);
-
-  Log::Info << "Iterations: " << iteration << std::endl
-      << "Skips: " << skip << std::endl
-      << "Comparisons: " << comps << std::endl
-      << "Dominations: " << dominations << std::endl;
-}
-
 /**
  * Perform k-means clustering on the data, returning a list of cluster
  * assignments.  This just forward to the other function, which returns the
diff --git a/src/mlpack/methods/kmeans/kmeans_main.cpp b/src/mlpack/methods/kmeans/kmeans_main.cpp
index bd1458a..3f81020 100644
--- a/src/mlpack/methods/kmeans/kmeans_main.cpp
+++ b/src/mlpack/methods/kmeans/kmeans_main.cpp
@@ -55,10 +55,6 @@ PARAM_INT("seed", "Random seed.  If 0, 'std::time(NULL)' is used.", "s", 0);
 PARAM_STRING("initial_centroids", "Start with the specified initial centroids.",
              "I", "");
 
-// This is known to not work (#251).
-//PARAM_FLAG("fast_kmeans", "Use the experimental fast k-means algorithm by "
-//    "Pelleg and Moore.", "f");
-
 // Parameters for "refined start" k-means.
 PARAM_FLAG("refined_start", "Use the refined initial point strategy by Bradley "
     "and Fayyad to choose initial points.", "r");
@@ -151,9 +147,6 @@ int main(int argc, char** argv)
           RefinedStart(samplings, percentage));
 
       Timer::Start("clustering");
-//      if (CLI::HasParam("fast_kmeans"))
-//        k.FastCluster(dataset, clusters, assignments);
-//      else
       k.Cluster(dataset, clusters, assignments, centroids);
       Timer::Stop("clustering");
     }
@@ -163,9 +156,6 @@ int main(int argc, char** argv)
           AllowEmptyClusters> k(maxIterations, overclustering);
 
       Timer::Start("clustering");
-//      if (CLI::HasParam("fast_kmeans"))
-//        k.FastCluster(dataset, clusters, assignments);
-//      else
       k.Cluster(dataset, clusters, assignments, centroids, false,
           initialCentroidGuess);
       Timer::Stop("clustering");
@@ -190,10 +180,7 @@ int main(int argc, char** argv)
           RefinedStart(samplings, percentage));
 
       Timer::Start("clustering");
-//      if (CLI::HasParam("fast_kmeans"))
-//        k.FastCluster(dataset, clusters, assignments);
-//      else
-        k.Cluster(dataset, clusters, assignments, centroids);
+      k.Cluster(dataset, clusters, assignments, centroids);
       Timer::Stop("clustering");
     }
     else
@@ -201,11 +188,8 @@ int main(int argc, char** argv)
       KMeans<> k(maxIterations, overclustering);
 
       Timer::Start("clustering");
-//      if (CLI::HasParam("fast_kmeans"))
-//        k.FastCluster(dataset, clusters, assignments);
-//      else
-        k.Cluster(dataset, clusters, assignments, centroids, false,
-            initialCentroidGuess);
+      k.Cluster(dataset, clusters, assignments, centroids, false,
+          initialCentroidGuess);
       Timer::Stop("clustering");
     }
   }



More information about the mlpack-git mailing list