[mlpack-git] master, mlpack-1.0.x: Integrate nystroem method into the kernel_pca_main.cpp file. (6296040)

gitdub at big.cc.gt.atl.ga.us gitdub at big.cc.gt.atl.ga.us
Thu Mar 5 21:54:04 EST 2015


Repository : https://github.com/mlpack/mlpack

On branches: master, mlpack-1.0.x
Link       : https://github.com/mlpack/mlpack/compare/904762495c039e345beba14c1142fd719b3bd50e...f94823c800ad6f7266995c700b1b630d5ffdcf40

>---------------------------------------------------------------

commit 629604052193b84abc7a67dbabc84b88bf1ce0e1
Author: Marcus Edel <marcus.edel at fu-berlin.de>
Date:   Tue Jul 22 18:52:36 2014 +0000

    Integrate nystroem method into the kernel_pca_main.cpp file.


>---------------------------------------------------------------

629604052193b84abc7a67dbabc84b88bf1ce0e1
 src/mlpack/methods/kernel_pca/kernel_pca_main.cpp | 102 ++++++++++++++++++----
 1 file changed, 87 insertions(+), 15 deletions(-)

diff --git a/src/mlpack/methods/kernel_pca/kernel_pca_main.cpp b/src/mlpack/methods/kernel_pca/kernel_pca_main.cpp
index 1d6695c..91f6d75 100644
--- a/src/mlpack/methods/kernel_pca/kernel_pca_main.cpp
+++ b/src/mlpack/methods/kernel_pca/kernel_pca_main.cpp
@@ -5,6 +5,11 @@
  * Executable for Kernel PCA.
  */
 #include <mlpack/core.hpp>
+#include <mlpack/methods/nystroem_method/ordered_selection.hpp>
+#include <mlpack/methods/nystroem_method/random_selection.hpp>
+#include <mlpack/methods/nystroem_method/kmeans_selection.hpp>
+#include <mlpack/methods/nystroem_method/nystroem_method.hpp>
+#include <mlpack/methods/kernel_pca/kernel_rules/nystroem_method.hpp>
 
 #include "kernel_pca.hpp"
 
@@ -24,6 +29,14 @@ PROGRAM_INFO("Kernel Principal Components Analysis",
     "For the case where a linear kernel is used, this reduces to regular "
     "PCA."
     "\n\n"
+
+    "For example, the following will perform KPCA on the 'input.csv' file using"
+    " the gaussian kernel and store the transformed date in the "
+    "'transformed.csv' file."
+
+    "\n\n"
+    "$ kernel_pca -i input.csv -k gaussian -o transformed.csv"
+    "\n\n"
     "The kernels that are supported are listed below:"
     "\n\n"
     " * 'linear': the standard linear dot product (same as normal PCA):\n"
@@ -49,7 +62,15 @@ PROGRAM_INFO("Kernel Principal Components Analysis",
     "\n"
     "The parameters for each of the kernels should be specified with the "
     "options --bandwidth, --kernel_scale, --offset, or --degree (or a "
-    "combination of those options).\n");
+    "combination of those options)."
+    "\n\n"
+    "Optionally, the nystroem method (\"Using the Nystroem method to speed up"
+    " kernel machines\", 2001) can be used to calculate the kernel matrix by "
+    "specifying the --nystroem_method (-n) option. This approach works by using"
+    " a subset of the data as basis to reconstruct the kernel matrix; to specify"
+    " the sampling scheme, the --sampling parameter is used, the sampling scheme"
+    " for the nystroem method can be chosen from the following list: kmeans,"
+    " random, ordered.");
 
 PARAM_STRING_REQ("input_file", "Input dataset to perform KPCA on.", "i");
 PARAM_STRING_REQ("output_file", "File to save modified dataset to.", "o");
@@ -63,6 +84,11 @@ PARAM_INT("new_dimensionality", "If not 0, reduce the dimensionality of "
 PARAM_FLAG("center", "If set, the transformed data will be centered about the "
     "origin.", "c");
 
+PARAM_FLAG("nystroem_method", "If set, the nystroem method will be used.", "n");
+
+PARAM_STRING("sampling", "Sampling scheme to use for the nystroem method: "
+    "'kmeans', 'random', 'ordered'", "s", "kmeans");
+
 PARAM_DOUBLE("kernel_scale", "Scale, for 'hyptan' kernel.", "S", 1.0);
 PARAM_DOUBLE("offset", "Offset, for 'hyptan' and 'polynomial' kernels.", "O",
     0.0);
@@ -71,6 +97,48 @@ PARAM_DOUBLE("bandwidth", "Bandwidth, for 'gaussian' and 'laplacian' kernels.",
 PARAM_DOUBLE("degree", "Degree of polynomial, for 'polynomial' kernel.", "D",
     1.0);
 
+//! Run kernel PCA on the specified dataset with the given kernel type.
+template<typename KernelType>
+void RunKPCA(arma::mat& dataset,
+             const bool centerTransformedData,
+             const bool nystroem,
+             const size_t newDim,
+             const string& sampling,
+             KernelType& kernel)
+{
+  if (nystroem)
+  {
+    // Make sure the sampling scheme is valid.
+    if (sampling == "kmeans")
+    {
+      KernelPCA<KernelType, NystroemKernelRule<KernelType,
+          KMeansSelection<> > > kpca(kernel, centerTransformedData);
+      kpca.Apply(dataset, newDim);
+    }
+    else if (sampling == "random")
+    {
+      KernelPCA<KernelType, NystroemKernelRule<KernelType,
+          RandomSelection> > kpca(kernel, centerTransformedData);
+      kpca.Apply(dataset, newDim);
+    }
+    else if (sampling == "ordered")
+    {
+      KernelPCA<KernelType, NystroemKernelRule<KernelType,
+          OrderedSelection> > kpca(kernel, centerTransformedData);
+      kpca.Apply(dataset, newDim);
+    }
+    else
+    {
+      // Invalid sampling scheme.
+      Log::Fatal << "Invalid sampling scheme ('" << sampling << "'); valid "
+        << "choices are 'kmeans', 'random' and 'ordered'" << endl;
+    }
+  }
+  else
+  {
+    KernelPCA<KernelType> kpca(kernel, centerTransformedData);
+    kpca.Apply(dataset, newDim);
+  }
+}
+
 int main(int argc, char** argv)
 {
   // Parse command line options.
@@ -99,19 +167,22 @@ int main(int argc, char** argv)
   const string kernelType = CLI::GetParam<string>("kernel");
 
   const bool centerTransformedData = CLI::HasParam("center");
+  const bool nystroem = CLI::HasParam("nystroem_method");
+  const string sampling = CLI::GetParam<string>("sampling");
 
   if (kernelType == "linear")
   {
-    KernelPCA<LinearKernel> kpca(LinearKernel(), centerTransformedData);
-    kpca.Apply(dataset, newDim);
+    LinearKernel kernel;
+    RunKPCA<LinearKernel>(dataset, centerTransformedData, nystroem, newDim,
+        sampling, kernel);
   }
   else if (kernelType == "gaussian")
   {
     const double bandwidth = CLI::GetParam<double>("bandwidth");
 
     GaussianKernel kernel(bandwidth);
-    KernelPCA<GaussianKernel> kpca(kernel, centerTransformedData);
-    kpca.Apply(dataset, newDim);
+    RunKPCA<GaussianKernel>(dataset, centerTransformedData, nystroem, newDim,
+        sampling, kernel);
   }
   else if (kernelType == "polynomial")
   {
@@ -119,8 +190,8 @@ int main(int argc, char** argv)
     const double offset = CLI::GetParam<double>("offset");
 
     PolynomialKernel kernel(degree, offset);
-    KernelPCA<PolynomialKernel> kpca(kernel, centerTransformedData);
-    kpca.Apply(dataset, newDim);
+    RunKPCA<PolynomialKernel>(dataset, centerTransformedData, nystroem,
+        newDim, sampling, kernel);
   }
   else if (kernelType == "hyptan")
   {
@@ -128,29 +199,30 @@ int main(int argc, char** argv)
     const double offset = CLI::GetParam<double>("offset");
 
     HyperbolicTangentKernel kernel(scale, offset);
-    KernelPCA<HyperbolicTangentKernel> kpca(kernel, centerTransformedData);
-    kpca.Apply(dataset, newDim);
+    RunKPCA<HyperbolicTangentKernel>(dataset, centerTransformedData, nystroem,
+        newDim, sampling, kernel);
   }
   else if (kernelType == "laplacian")
   {
     const double bandwidth = CLI::GetParam<double>("bandwidth");
 
     LaplacianKernel kernel(bandwidth);
-    KernelPCA<LaplacianKernel> kpca(kernel, centerTransformedData);
-    kpca.Apply(dataset, newDim);
+    RunKPCA<LaplacianKernel>(dataset, centerTransformedData, nystroem, newDim,
+        sampling, kernel);
   }
   else if (kernelType == "epanechnikov")
   {
     const double bandwidth = CLI::GetParam<double>("bandwidth");
 
     EpanechnikovKernel kernel(bandwidth);
-    KernelPCA<EpanechnikovKernel> kpca(kernel, centerTransformedData);
-    kpca.Apply(dataset, newDim);
+    RunKPCA<EpanechnikovKernel>(dataset, centerTransformedData, nystroem,
+        newDim, sampling, kernel);
   }
   else if (kernelType == "cosine")
   {
-    KernelPCA<CosineDistance> kpca(CosineDistance(), centerTransformedData);
-    kpca.Apply(dataset, newDim);
+    CosineDistance kernel;
+    RunKPCA<CosineDistance>(dataset, centerTransformedData, nystroem, newDim,
+        sampling, kernel);
   }
   else
   {

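Not part of the commit itself, but for context: with the options added above, a
Nystroem-based run of the executable would look roughly like the following. The
-n (--nystroem_method) and -s (--sampling) flags come from the parameter
definitions in the diff; the file names are only illustrative.

  $ kernel_pca -i input.csv -k gaussian -n -s random -o transformed.csv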

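At the library level, the dispatch in RunKPCA boils down to instantiating
KernelPCA with a NystroemKernelRule. A minimal sketch of that usage follows,
assuming the headers included in the diff and mlpack's usual namespaces
(mlpack::kpca and mlpack::kernel); the file names, bandwidth, and target
dimensionality are placeholders, not values taken from the commit:

  #include <mlpack/core.hpp>
  #include <mlpack/methods/nystroem_method/random_selection.hpp>
  #include <mlpack/methods/nystroem_method/nystroem_method.hpp>
  #include <mlpack/methods/kernel_pca/kernel_rules/nystroem_method.hpp>
  #include <mlpack/methods/kernel_pca/kernel_pca.hpp>

  using namespace mlpack;
  using namespace mlpack::kpca;
  using namespace mlpack::kernel;

  int main()
  {
    // Load the input data (column-major, as mlpack expects).
    arma::mat dataset;
    data::Load("input.csv", dataset, true);

    // Kernel PCA with the Nystroem approximation of the kernel matrix,
    // using random point selection and a Gaussian kernel.
    GaussianKernel kernel(1.0 /* bandwidth */);
    KernelPCA<GaussianKernel,
              NystroemKernelRule<GaussianKernel, RandomSelection> >
        kpca(kernel, false /* do not center the transformed data */);

    // Reduce to two dimensions, overwriting the dataset in place.
    kpca.Apply(dataset, 2);

    data::Save("transformed.csv", dataset, true);

    return 0;
  }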
