[mlpack-svn] r15050 - mlpack/trunk/src/mlpack/methods/kernel_pca
fastlab-svn at coffeetalk-1.cc.gatech.edu
fastlab-svn at coffeetalk-1.cc.gatech.edu
Fri May 10 15:58:40 EDT 2013
Author: rcurtin
Date: 2013-05-10 15:58:40 -0400 (Fri, 10 May 2013)
New Revision: 15050
Modified:
mlpack/trunk/src/mlpack/methods/kernel_pca/kernel_pca.hpp
mlpack/trunk/src/mlpack/methods/kernel_pca/kernel_pca_impl.hpp
mlpack/trunk/src/mlpack/methods/kernel_pca/kernel_pca_main.cpp
Log:
Fix #280 and revamp KernelPCA implementation. It should now be much faster.
Modified: mlpack/trunk/src/mlpack/methods/kernel_pca/kernel_pca.hpp
===================================================================
--- mlpack/trunk/src/mlpack/methods/kernel_pca/kernel_pca.hpp 2013-05-10 19:36:16 UTC (rev 15049)
+++ mlpack/trunk/src/mlpack/methods/kernel_pca/kernel_pca.hpp 2013-05-10 19:58:40 UTC (rev 15050)
@@ -14,46 +14,70 @@
namespace mlpack {
namespace kpca {
+/**
+ * This class performs kernel principal components analysis (Kernel PCA), for a
+ * given kernel. This is a standard machine learning technique and is
+ * well-documented on the Internet and in standard texts. It is often used as a
+ * dimensionality reduction technique, and can also be useful in mapping
+ * linearly inseparable classes of points to different spaces where they are
+ * linearly separable.
+ *
+ * The performance of the method is highly dependent on the kernel choice.
+ * There are numerous available kernels in the mlpack::kernel namespace (see
+ * files in mlpack/core/kernels/) and it is easy to write your own; see other
+ * implementations for examples.
+ */
template <typename KernelType>
class KernelPCA
{
public:
+ /**
+ * Construct the KernelPCA object, optionally passing a kernel. Optionally,
+ * the transformed data can be centered about the origin; to do this, pass
+ * 'true' for centerTransformedData. This will take slightly longer (but not
+ * much).
+ *
+ * @param kernel Kernel to be used for computation.
+ */
KernelPCA(const KernelType kernel = KernelType(),
- const bool scaleData = false);
+ const bool centerTransformedData = false);
/**
- * Apply Kernel Principal Component Analysis to the provided data set.
+ * Apply Kernel Principal Components Analysis to the provided data set.
*
- * @param data - Data matrix
- * @param transformedData - Data with PCA applied
- * @param eigVal - contains eigen values in a column vector
- * @param coeff - PCA Loadings/Coeffs/EigenVectors
+ * @param data Data matrix.
+ * @param transformedData Matrix to output results into.
+ * @param eigval KPCA eigenvalues will be written to this vector.
+ * @param eigvec KPCA eigenvectors will be written to this matrix.
*/
void Apply(const arma::mat& data,
arma::mat& transformedData,
- arma::vec& eigVal,
- arma::mat& coeff);
+ arma::vec& eigval,
+ arma::mat& eigvec);
/**
* Apply Kernel Principal Component Analysis to the provided data set.
*
- * @param data - Data matrix
- * @param transformedData - Data with PCA applied
- * @param eigVal - contains eigen values in a column vector
+ * @param data Data matrix.
+ * @param transformedData Matrix to output results into.
+ * @param eigval KPCA eigenvalues will be written to this vector.
*/
void Apply(const arma::mat& data,
arma::mat& transformedData,
- arma::vec& eigVal);
+ arma::vec& eigval);
/**
- * Apply Dimensionality Reduction using Kernel Principal Component Analysis
- * to the provided data set.
+ * Apply dimensionality reduction using Kernel Principal Component Analysis
+ * to the provided data set. The data matrix will be modified in-place. Note
+ * that the dimension can be larger than the existing dimension because KPCA
+ * works on the kernel matrix, not the covariance matrix. This means the new
+ * dimension can be as large as the number of points (columns) in the dataset.
+ * Note that if you specify newDimension to be larger than the current
+ * dimension of the data (the number of rows), then it's not really
+ * "dimensionality reduction"...
*
- * @param data - M x N Data matrix
- * @param newDimension - matrix consisting of N column vectors,
- * where each vector is the projection of the corresponding data vector
- * from data matrix onto the basis vectors contained in the columns of
- * coeff/eigen vector matrix with only newDimension number of columns chosen.
+ * @param data Data matrix.
+ * @param newDimension New dimension for the dataset.
*/
void Apply(arma::mat& data, const size_t newDimension);
@@ -62,20 +86,26 @@
//! Modify the kernel.
KernelType& Kernel() { return kernel; }
- //! Return whether or not this KernelPCA object will scale (by standard
- //! deviation) the data when kernel PCA is performed.
- bool ScaleData() const { return scaleData; }
- //! Modify whether or not this KernelPCA object will scale (by standard
- //! deviation) the data when kernel PCA is performed.
- bool& ScaleData() { return scaleData; }
+ //! Return whether or not the transformed data is centered.
+ bool CenterTransformedData() const { return centerTransformedData; }
+ //! Return whether or not the transformed data is centered.
+ bool& CenterTransformedData() { return centerTransformedData; }
private:
//! The instantiated kernel.
KernelType kernel;
//! If true, the data will be scaled (by standard deviation) when Apply() is
//! run.
- bool scaleData;
+ bool centerTransformedData;
+ /**
+ * Construct the kernel matrix.
+ *
+ * @param data Input data points.
+ * @param kernelMatrix Matrix to store the constructed kernel matrix in.
+ */
+ void GetKernelMatrix(const arma::mat& data, arma::mat& kernelMatrix);
+
}; // class KernelPCA
}; // namespace kpca
Modified: mlpack/trunk/src/mlpack/methods/kernel_pca/kernel_pca_impl.hpp
===================================================================
--- mlpack/trunk/src/mlpack/methods/kernel_pca/kernel_pca_impl.hpp 2013-05-10 19:36:16 UTC (rev 15049)
+++ mlpack/trunk/src/mlpack/methods/kernel_pca/kernel_pca_impl.hpp 2013-05-10 19:58:40 UTC (rev 15050)
@@ -1,6 +1,7 @@
/**
* @file kernel_pca_impl.hpp
* @author Ajinkya Kale
+ * @author Marcus Edel
*
* Implementation of KernelPCA class to perform Kernel Principal Components
* Analysis on the specified data set.
@@ -21,59 +22,58 @@
template <typename KernelType>
KernelPCA<KernelType>::KernelPCA(const KernelType kernel,
- const bool scaleData) :
+ const bool centerTransformedData) :
kernel(kernel),
- scaleData(scaleData)
+ centerTransformedData(centerTransformedData)
{ }
-/**
- * Apply Kernel Principal Component Analysis to the provided data set.
- *
- * @param data - Data matrix
- * @param transformedData - Data with KernelPCA applied
- * @param eigVal - contains eigen values in a column vector
- * @param coeff - KernelPCA Loadings/Coeffs/EigenVectors
- */
+//! Apply Kernel Principal Component Analysis to the provided data set.
template <typename KernelType>
void KernelPCA<KernelType>::Apply(const arma::mat& data,
arma::mat& transformedData,
- arma::vec& eigVal,
- arma::mat& coeffs)
+ arma::vec& eigval,
+ arma::mat& eigvec)
{
- arma::mat transData = ccov(data);
+ // Construct the kernel matrix.
+ arma::mat kernelMatrix;
+ GetKernelMatrix(data, kernelMatrix);
- // Center the data if necessary.
+ // For PCA the data has to be centered, even if the data is centered. But it
+ // is not guaranteed that the data, when mapped to the kernel space, is also
+ // centered. Since we actually never work in the feature space we cannot
+ // center the data. So, we perform a "psuedo-centering" using the kernel
+ // matrix.
- // Scale the data if necessary.
- if (scaleData)
- {
- transData = transData / (arma::ones<arma::colvec>(transData.n_rows) *
- stddev(transData, 0, 0));
- }
+ // The matrix of ones, made below, may be somewhat large in memory. The
+ // centering expression might be optimizable to avoid creation of ones.
+ arma::mat ones = arma::ones<arma::mat>(kernelMatrix.n_rows,
+ kernelMatrix.n_cols);
+ arma::mat centeredKernelMatrix = kernelMatrix - (ones * kernelMatrix) -
+ (kernelMatrix * ones) + (ones * kernelMatrix * ones);
- arma::mat centeredData = trans(transData);
- arma::mat kernelMat = GetKernelMatrix(kernel, centeredData);
- arma::eig_sym(eigVal, coeffs, kernelMat);
+ // Eigendecompose the centered kernel matrix.
+ arma::eig_sym(eigval, eigvec, centeredKernelMatrix);
- int n_eigVal = eigVal.n_elem;
- for(int i = 0; i < floor(n_eigVal / 2.0); i++)
- eigVal.swap_rows(i, (n_eigVal - 1) - i);
+ // Swap the eigenvalues since they are ordered backwards (we need largest to
+ // smallest).
+ for (size_t i = 0; i < floor(eigval.n_elem / 2.0); ++i)
+ eigval.swap_rows(i, (eigval.n_elem - 1) - i);
- coeffs = arma::fliplr(coeffs);
+ // Flip the coefficients to produce the same effect.
+ eigvec = arma::fliplr(eigvec);
- transformedData = trans(coeffs) * data;
- arma::colvec transformedDataMean = arma::mean(transformedData, 1);
- transformedData = transformedData - (transformedDataMean *
- arma::ones<arma::rowvec>(transformedData.n_cols));
+ transformedData = eigvec.t() * centeredKernelMatrix;
+
+ // Center the transformed data, if the user asked for it.
+ if (centerTransformedData)
+ {
+ arma::colvec transformedDataMean = arma::mean(transformedData, 1);
+ transformedData = transformedData - (transformedDataMean *
+ arma::ones<arma::rowvec>(transformedData.n_cols));
+ }
}
-/**
- * Apply Kernel Principal Component Analysis to the provided data set.
- *
- * @param data - Data matrix
- * @param transformedData - Data with KernelPCA applied
- * @param eigVal - contains eigen values in a column vector
- */
+//! Apply Kernel Principal Component Analysis to the provided data set.
template <typename KernelType>
void KernelPCA<KernelType>::Apply(const arma::mat& data,
arma::mat& transformedData,
@@ -83,16 +83,7 @@
Apply(data, transformedData, eigVal, coeffs);
}
-/**
- * Apply Dimensionality Reduction using Kernel Principal Component Analysis
- * to the provided data set.
- *
- * @param data - M x N Data matrix
- * @param newDimension - matrix consisting of N column vectors,
- * where each vector is the projection of the corresponding data vector
- * from data matrix onto the basis vectors contained in the columns of
- * coeff/eigen vector matrix with only newDimension number of columns chosen.
- */
+//! Use KPCA for dimensionality reduction.
template <typename KernelType>
void KernelPCA<KernelType>::Apply(arma::mat& data, const size_t newDimension)
{
@@ -105,22 +96,31 @@
data.shed_rows(newDimension, data.n_rows - 1);
}
+//! Construct the kernel matrix.
template <typename KernelType>
-arma::mat GetKernelMatrix(KernelType kernel, arma::mat transData)
+void KernelPCA<KernelType>::GetKernelMatrix(const arma::mat& data,
+ arma::mat& kernelMatrix)
{
- arma::mat kernelMat(transData.n_rows, transData.n_rows);
+ // Resize the kernel matrix to the right size.
+ kernelMatrix.set_size(data.n_cols, data.n_cols);
- for (size_t i = 0; i < transData.n_rows; i++)
+ // Note that we only need to calculate the upper triangular part of the kernel
+ // matrix, since it is symmetric. This helps minimize the number of kernel
+ // evaluations.
+ for (size_t i = 0; i < data.n_cols; ++i)
{
- for (size_t j = 0; j < transData.n_rows; j++)
+ for (size_t j = i; j < data.n_cols; ++j)
{
- arma::vec v1 = trans(transData.row(i));
- arma::vec v2 = trans(transData.row(j));
- kernelMat(i, j) = kernel.Evaluate(v1, v2);
+ // Evaluate the kernel on these two points.
+ kernelMatrix(i, j) = kernel.Evaluate(data.unsafe_col(i),
+ data.unsafe_col(j));
}
}
- return kernelMat;
+ // Copy to the lower triangular part of the matrix.
+ for (size_t i = 1; i < data.n_cols; ++i)
+ for (size_t j = 0; j < i; ++j)
+ kernelMatrix(i, j) = kernelMatrix(j, i);
}
}; // namespace mlpack
Modified: mlpack/trunk/src/mlpack/methods/kernel_pca/kernel_pca_main.cpp
===================================================================
--- mlpack/trunk/src/mlpack/methods/kernel_pca/kernel_pca_main.cpp 2013-05-10 19:36:16 UTC (rev 15049)
+++ mlpack/trunk/src/mlpack/methods/kernel_pca/kernel_pca_main.cpp 2013-05-10 19:58:40 UTC (rev 15050)
@@ -63,8 +63,8 @@
"the output dataset by ignoring the dimensions with the smallest "
"eigenvalues.", "d", 0);
-PARAM_FLAG("scale", "If set, the data will be scaled before performing KPCA "
- "such that the variance of each feature is 1.", "s");
+PARAM_FLAG("center", "If set, the transformed data will be centered about the "
+ "origin.", "c");
PARAM_DOUBLE("kernel_scale", "Scale, for 'hyptan' kernel.", "S", 1.0);
PARAM_DOUBLE("offset", "Offset, for 'hyptan' and 'polynomial' kernels.", "O",
@@ -101,11 +101,11 @@
// Get the kernel type and make sure it is valid.
const string kernelType = CLI::GetParam<string>("kernel");
- const bool scaleData = CLI::HasParam("scale");
+ const bool centerTransformedData = CLI::HasParam("center");
if (kernelType == "linear")
{
- KernelPCA<LinearKernel> kpca(LinearKernel(), scaleData);
+ KernelPCA<LinearKernel> kpca(LinearKernel(), centerTransformedData);
kpca.Apply(dataset, newDim);
}
else if (kernelType == "gaussian")
@@ -113,7 +113,7 @@
const double bandwidth = CLI::GetParam<double>("bandwidth");
GaussianKernel kernel(bandwidth);
- KernelPCA<GaussianKernel> kpca(kernel, scaleData);
+ KernelPCA<GaussianKernel> kpca(kernel, centerTransformedData);
kpca.Apply(dataset, newDim);
}
else if (kernelType == "polynomial")
@@ -122,7 +122,7 @@
const double offset = CLI::GetParam<double>("offset");
PolynomialKernel kernel(degree, offset);
- KernelPCA<PolynomialKernel> kpca(kernel, scaleData);
+ KernelPCA<PolynomialKernel> kpca(kernel, centerTransformedData);
kpca.Apply(dataset, newDim);
}
else if (kernelType == "hyptan")
@@ -131,7 +131,7 @@
const double offset = CLI::GetParam<double>("offset");
HyperbolicTangentKernel kernel(scale, offset);
- KernelPCA<HyperbolicTangentKernel> kpca(kernel, scaleData);
+ KernelPCA<HyperbolicTangentKernel> kpca(kernel, centerTransformedData);
kpca.Apply(dataset, newDim);
}
else if (kernelType == "laplacian")
@@ -139,12 +139,12 @@
const double bandwidth = CLI::GetParam<double>("bandwidth");
LaplacianKernel kernel(bandwidth);
- KernelPCA<LaplacianKernel> kpca(kernel, scaleData);
+ KernelPCA<LaplacianKernel> kpca(kernel, centerTransformedData);
kpca.Apply(dataset, newDim);
}
else if (kernelType == "cosine")
{
- KernelPCA<CosineDistance> kpca(CosineDistance(), scaleData);
+ KernelPCA<CosineDistance> kpca(CosineDistance(), centerTransformedData);
kpca.Apply(dataset, newDim);
}
else
More information about the mlpack-svn
mailing list