[mlpack-svn] r11711 - mlpack/tags/mlpack-1.0.1/src/mlpack/methods/kmeans

fastlab-svn at coffeetalk-1.cc.gatech.edu fastlab-svn at coffeetalk-1.cc.gatech.edu
Fri Mar 2 19:08:31 EST 2012


Author: rcurtin
Date: 2012-03-02 19:08:30 -0500 (Fri, 02 Mar 2012)
New Revision: 11711

Modified:
   mlpack/tags/mlpack-1.0.1/src/mlpack/methods/kmeans/kmeans.hpp
   mlpack/tags/mlpack-1.0.1/src/mlpack/methods/kmeans/kmeans_impl.hpp
   mlpack/tags/mlpack-1.0.1/src/mlpack/methods/kmeans/kmeans_main.cpp
Log:
Remove MRKD-tree based stuff; it's not yet done for the release.


Modified: mlpack/tags/mlpack-1.0.1/src/mlpack/methods/kmeans/kmeans.hpp
===================================================================
--- mlpack/tags/mlpack-1.0.1/src/mlpack/methods/kmeans/kmeans.hpp	2012-03-03 00:07:00 UTC (rev 11710)
+++ mlpack/tags/mlpack-1.0.1/src/mlpack/methods/kmeans/kmeans.hpp	2012-03-03 00:08:30 UTC (rev 11711)
@@ -102,10 +102,6 @@
   void Cluster(const MatType& data,
                const size_t clusters,
                arma::Col<size_t>& assignments) const;
-  template<typename MatType>
-  void FastCluster(MatType& data,
-               const size_t clusters,
-               arma::Col<size_t>& assignments) const;
 
   /**
    * Return the overclustering factor.

Modified: mlpack/tags/mlpack-1.0.1/src/mlpack/methods/kmeans/kmeans_impl.hpp
===================================================================
--- mlpack/tags/mlpack-1.0.1/src/mlpack/methods/kmeans/kmeans_impl.hpp	2012-03-03 00:07:00 UTC (rev 11710)
+++ mlpack/tags/mlpack-1.0.1/src/mlpack/methods/kmeans/kmeans_impl.hpp	2012-03-03 00:08:30 UTC (rev 11711)
@@ -7,13 +7,8 @@
  */
 #include "kmeans.hpp"
 
-#include <mlpack/core/tree/binary_space_tree.hpp>
-#include <mlpack/core/tree/hrectbound.hpp>
-#include <mlpack/core/tree/mrkd_statistic.hpp>
 #include <mlpack/core/metrics/lmetric.hpp>
 
-#include <stack>
-
 namespace mlpack {
 namespace kmeans {
 
@@ -50,210 +45,6 @@
   }
 }
 
-template<typename DistanceMetric,
-         typename InitialPartitionPolicy,
-         typename EmptyClusterPolicy>
-template<typename MatType>
-void KMeans<
-    DistanceMetric,
-    InitialPartitionPolicy,
-    EmptyClusterPolicy>::
-FastCluster(MatType& data,
-            const size_t clusters,
-            arma::Col<size_t>& assignments) const
-{
-  size_t actualClusters = size_t(overclusteringFactor * clusters);
-  if (actualClusters > data.n_cols)
-  {
-    Log::Warn << "KMeans::Cluster(): overclustering factor is too large.  No "
-        << "overclustering will be done." << std::endl;
-    actualClusters = clusters;
-  }
-
-  // TODO: remove
-  // Scale the data to [0,1]
-  if(0){
-    arma::rowvec min = arma::min(data, 0);
-    data = (data - arma::ones<arma::colvec>(data.n_rows) * min) / (arma::ones<arma::colvec>(data.n_rows) * (arma::max(data,0) - min));
-    for(size_t i = 0; i < data.n_cols; ++i)
-      for(size_t j = 0; j < data.n_rows; ++j)
-        assert(data(j,i) >= 0 && data(j,i) <= 1);
-  }
-
-  // Centroids of each cluster.  Each column corresponds to a centroid.
-  MatType centroids(data.n_rows, actualClusters);
-
-  // Counts of points in each cluster.
-  arma::Col<size_t> counts(actualClusters);
-  counts.zeros();
-
-  // Build the mrkd-tree on this dataset
-  tree::BinarySpaceTree<typename bound::HRectBound<2>, tree::MRKDStatistic> tree(data, 1);
-  // A pointer for traversing the mrkd-tree
-  tree::BinarySpaceTree<typename bound::HRectBound<2>, tree::MRKDStatistic>* node;
-
-  // We use this to store the furtherst point in a hyperrectangle from a given
-  // vector.
-  arma::colvec p(data.n_rows);
-
-  // Make random centroids and fit them into the root hyperrectangle.
-  {
-    centroids.randu();
-    bound::HRectBound<2>& bound = tree.Bound();
-    size_t dim = bound.Dim();
-    for(size_t i = 0; i < dim; ++i) {
-      double min = bound[i].Lo();
-      double max = bound[i].Hi();
-      for(size_t j = 0; j < centroids.n_cols; ++j)
-      {
-        if(centroids(i,j) < min)
-          centroids(i,j) = min;
-        else if(centroids(i,j) > max)
-          centroids(i,j) = max;
-      }
-    }
-  }
-
-  // Instead of retraversing the tree after an iteration, we will update centroid
-  // positions in this matrix, which also prevents clobbering our centroids from
-  // the previous iteration.
-  MatType newCentroids(centroids.n_rows, centroids.n_cols);
-
-  size_t iteration = 0;
-  size_t changedAssignments = 0;
-  do 
-  {
-    // Keep track of what iteration we are on.
-    ++iteration;
-    changedAssignments = 0;
-    newCentroids.zeros();
-
-    // Create a stack for traversing the mrkd-tree
-    std::stack<typename tree::BinarySpaceTree<typename bound::HRectBound<2>, 
-                                              tree::MRKDStatistic>* > stack;
-    // Add the root node of the tree to the stack
-    stack.push(&tree);
-
-    while (!stack.empty())
-    {
-      node = stack.top();
-      stack.pop();
-
-      tree::MRKDStatistic& mrkd = node->Stat();
-
-      size_t minIndex = 0;
-
-      // If this node is a leaf, then we calculate the distance from
-      // the centroids to every point the node contains.
-      if (node->IsLeaf())
-      {
-        for (size_t i = mrkd.begin; i < mrkd.count + mrkd.begin; ++i)
-        {
-          // Initialize minDistance to be nonzero.
-          double minDistance = metric::SquaredEuclideanDistance::Evaluate(
-              data.col(i), centroids.col(0));
-          // Find the minimal distance centroid for this point.
-          for (size_t j = 1; j < centroids.n_cols; ++j)
-          {
-            double distance = metric::SquaredEuclideanDistance::Evaluate(
-                data.col(i), centroids.col(j));
-            if ( minDistance > distance )
-            {
-              minIndex = j;
-              minDistance = distance;
-            }
-          }
-
-          ++counts[minIndex];
-          newCentroids.col(minIndex) += data.col(i);
-        }
-      }
-      // If this node is not a leaf, then we continue trying to find dominant
-      // centroids
-      else
-      {
-        bound::HRectBound<2>& bound = node->Bound();
-
-        bool noDomination = false;
-
-        // There was no centroid inside this hyperrectangle.
-        // We must determine if an external centroid dominates it.
-        for(size_t i = 0; i < centroids.n_cols; ++i) 
-        {
-          noDomination = false;
-          for(size_t j = 0; j < centroids.n_cols; ++j)
-          {
-            if(j == i)
-              continue;
-
-            for(size_t k = 0; k < p.n_rows; ++k)
-            {
-              p(k) = (centroids(k,j) > centroids(k,i)) ?
-                bound[k].Hi() : bound[k].Lo();
-            }
-
-            double distancei = metric::SquaredEuclideanDistance::Evaluate(
-                p.col(0), centroids.col(i));
-            double distancej = metric::SquaredEuclideanDistance::Evaluate(
-                p.col(0), centroids.col(j));
-
-            if(distancei >= distancej)
-            {
-              noDomination = true;
-              break;
-            }
-
-          }
-
-          // We identified a centroid that dominates this hyperrectangle.
-          if(!noDomination)
-          {
-            mrkd.dominatingCentroid = i;
-            counts[i] += mrkd.count;
-            newCentroids.col(minIndex) += mrkd.centerOfMass;
-            break;
-          }
-        }
-
-        // If we did not find a dominating centroid then we fall through to the
-        // default case, where we add the children of this node to the stack.
-        if(noDomination)
-        {
-          stack.push(node->Left());
-          stack.push(node->Right());
-        }
-      }
-
-    }
-
-    for(size_t i = 0; i < centroids.n_cols; ++i)
-    {
-      if(counts(i))
-        // Divide by the number of points assigned to this centroid so that we
-        // have the actual center of mass.
-        newCentroids.col(i) /= counts(i);
-
-      // TODO: switch to faster way of keeping track of changed assignments
-      if(changedAssignments != 0)
-      {
-        for(size_t j = 0; j < centroids.n_rows; ++j)
-        {
-          if(fabs(newCentroids(j,i) - centroids(j,i)) > 1e-5)
-          {
-            ++changedAssignments;
-            break;
-          }
-        }
-      }
-    }
-
-    // Update the centroids' positions.
-    centroids = newCentroids;
-  } while (changedAssignments > 0 && iteration != maxIterations);
-
-  std::cout << centroids << '\n' << counts << std::endl;
-}
-
 /**
  * Perform K-Means clustering on the data, returning a list of cluster
  * assignments.

Modified: mlpack/tags/mlpack-1.0.1/src/mlpack/methods/kmeans/kmeans_main.cpp
===================================================================
--- mlpack/tags/mlpack-1.0.1/src/mlpack/methods/kmeans/kmeans_main.cpp	2012-03-03 00:07:00 UTC (rev 11710)
+++ mlpack/tags/mlpack-1.0.1/src/mlpack/methods/kmeans/kmeans_main.cpp	2012-03-03 00:08:30 UTC (rev 11711)
@@ -37,7 +37,6 @@
 PARAM_INT("max_iterations", "Maximum number of iterations before K-Means "
     "terminates.", "m", 1000);
 PARAM_INT("seed", "Random seed.  If 0, 'std::time(NULL)' is used.", "s", 0);
-PARAM_FLAG("fast_kmeans", "Use the experimental fast k-means algorithm by Pelleg and Moore", "f")
 
 int main(int argc, char** argv)
 {
@@ -91,19 +90,13 @@
     KMeans<metric::SquaredEuclideanDistance, RandomPartition,
         AllowEmptyClusters> k(maxIterations, overclustering);
 
-		if(CLI::HasParam("fast_kmeans"))
-			k.FastCluster(dataset, clusters, assignments);
-		else
-			k.Cluster(dataset, clusters, assignments);
+    k.Cluster(dataset, clusters, assignments);
   }
   else
   {
     KMeans<> k(maxIterations, overclustering);
 
-		if(CLI::HasParam("fast_kmeans"))
-			k.FastCluster(dataset, clusters, assignments);
-		else
-			k.Cluster(dataset, clusters, assignments);
+    k.Cluster(dataset, clusters, assignments);
   }
 
   // Now figure out what to do with our results.




More information about the mlpack-svn mailing list