[mlpack-svn] r13230 - mlpack/trunk/src/mlpack/core/kernels

fastlab-svn at coffeetalk-1.cc.gatech.edu fastlab-svn at coffeetalk-1.cc.gatech.edu
Sun Jul 15 20:57:43 EDT 2012


Author: rcurtin
Date: 2012-07-15 20:57:43 -0400 (Sun, 15 Jul 2012)
New Revision: 13230

Added:
   mlpack/trunk/src/mlpack/core/kernels/pspectrum_string_kernel.cpp
   mlpack/trunk/src/mlpack/core/kernels/pspectrum_string_kernel.hpp
   mlpack/trunk/src/mlpack/core/kernels/pspectrum_string_kernel_impl.hpp
Modified:
   mlpack/trunk/src/mlpack/core/kernels/CMakeLists.txt
Log:
Add PSpectrumStringKernel to be used for maxip/FMKS.


Modified: mlpack/trunk/src/mlpack/core/kernels/CMakeLists.txt
===================================================================
--- mlpack/trunk/src/mlpack/core/kernels/CMakeLists.txt	2012-07-15 15:44:30 UTC (rev 13229)
+++ mlpack/trunk/src/mlpack/core/kernels/CMakeLists.txt	2012-07-16 00:57:43 UTC (rev 13230)
@@ -14,6 +14,9 @@
   laplacian_kernel.hpp
   linear_kernel.hpp
   polynomial_kernel.hpp
+  pspectrum_string_kernel.hpp
+  pspectrum_string_kernel_impl.hpp
+  pspectrum_string_kernel.cpp
   spherical_kernel.hpp
   triangular_kernel.hpp
 )

Added: mlpack/trunk/src/mlpack/core/kernels/pspectrum_string_kernel.cpp
===================================================================
--- mlpack/trunk/src/mlpack/core/kernels/pspectrum_string_kernel.cpp	                        (rev 0)
+++ mlpack/trunk/src/mlpack/core/kernels/pspectrum_string_kernel.cpp	2012-07-16 00:57:43 UTC (rev 13230)
@@ -0,0 +1,83 @@
+/**
+ * @file pspectrum_string_kernel.cpp
+ * @author Ryan Curtin
+ *
+ * Implementation of the p-spectrum string kernel, created for use with FastMKS.
+ * Instead of passing a data matrix to FastMKS which stores the kernels, pass a
+ * one-dimensional data matrix (data vector) to FastMKS which stores indices of
+ * strings; then, the actual strings are given to the PSpectrumStringKernel at
+ * construction time, and the kernel knows to map the indices to actual strings.
+ */
+#include "pspectrum_string_kernel.hpp"
+
+using namespace std;
+using namespace mlpack;
+using namespace mlpack::kernel;
+
+/**
+ * Initialize the PSpectrumStringKernel with the given string datasets.  For
+ * more information on this, see the general class documentation.
+ *
+ * @param datasets Sets of string data.  @param p The length of substrings to
+ * search.
+ */
+mlpack::kernel::PSpectrumStringKernel::PSpectrumStringKernel(
+    const std::vector<std::vector<std::string> >& datasets,
+    const size_t p) :
+    datasets(datasets),
+    p(p)
+{
+  // We have to assemble the counts of substrings.  This is not a particularly
+  // fast operation, unfortunately, but it only needs to be done once.
+  Log::Info << "Assembling counts of substrings of length " << p << "."
+      << std::endl;
+
+  // Resize for number of datasets.
+  counts.resize(datasets.size());
+
+  for (size_t dataset = 0; dataset < datasets.size(); ++dataset)
+  {
+    const std::vector<std::string>& set = datasets[dataset];
+
+    // Resize for number of strings in dataset.
+    counts[dataset].resize(set.size());
+
+    // Inspect each string in the dataset.
+    for (size_t index = 0; index < set.size(); ++index)
+    {
+      // Convenience references.
+      const std::string& str = set[index];
+      std::map<std::string, int>& mapping = counts[dataset][index];
+
+      size_t start = 0;
+      while ((start + p) <= str.length())
+      {
+        string sub = str.substr(start, p);
+
+        // Convert all characters to lowercase.
+        bool invalid = false;
+        for (size_t j = 0; j < p; ++j)
+        {
+          if (!isalnum(sub[j]))
+          {
+            invalid = true;
+            break; // Only consider substrings with alphanumerics.
+          }
+
+          sub[j] = tolower(sub[j]);
+        }
+
+        // Increment position in string.
+        ++start;
+
+        if (!invalid)
+        {
+          // Add to the map.
+          ++mapping[sub];
+        }
+      }
+    }
+  }
+
+  Log::Info << "Substring extraction complete." << std::endl;
+}

Added: mlpack/trunk/src/mlpack/core/kernels/pspectrum_string_kernel.hpp
===================================================================
--- mlpack/trunk/src/mlpack/core/kernels/pspectrum_string_kernel.hpp	                        (rev 0)
+++ mlpack/trunk/src/mlpack/core/kernels/pspectrum_string_kernel.hpp	2012-07-16 00:57:43 UTC (rev 13230)
@@ -0,0 +1,116 @@
+/**
+ * @file pspectrum_string_kernel.hpp
+ * @author Ryan Curtin
+ *
+ * Implementation of the p-spectrum string kernel, created for use with FastMKS.
+ * Instead of passing a data matrix to FastMKS which stores the kernels, pass a
+ * one-dimensional data matrix (data vector) to FastMKS which stores indices of
+ * strings; then, the actual strings are given to the PSpectrumStringKernel at
+ * construction time, and the kernel knows to map the indices to actual strings.
+ */
+#ifndef __MLPACK_CORE_KERNELS_PSPECTRUM_STRING_KERNEL_HPP
+#define __MLPACK_CORE_KERNELS_PSPECTRUM_STRING_KERNEL_HPP
+
+#include <map>
+#include <string>
+#include <vector>
+
+#include <mlpack/core.hpp>
+
+namespace mlpack {
+namespace kernel {
+
+/**
+ * The p-spectrum string kernel.  Given a length p, the p-spectrum kernel finds
+ * the contiguous subsequence match count between two strings.  The kernel will
+ * take every possible substring of length p of one string and count how many
+ * times it appears in the other string.
+ *
+ * The string kernel, when created, must be passed a reference to a series of
+ * string datasets (std::vector<std::vector<std::string> >&).  This is because
+ * MLPACK only supports datasets which are Armadillo matrices -- and a dataset
+ * of variable-length strings cannot be easily cast into an Armadillo matrix.
+ *
+ * Therefore, once the PSpectrumStringKernel is created with a reference to the
+ * string datasets, a "fake" Armadillo data matrix must be created, which simply
+ * holds indices to the strings they represent.  This "fake" matrix has two rows
+ * and n columns (where n is the number of strings in the dataset).  The first
+ * row holds the index of the dataset (remember, the kernel can have multiple
+ * datasets), and the second row holds the index of the string.  A fake matrix
+ * containing only strings from dataset 0 might look like this:
+ *
+ * [[0 0 0 0 0 0 0 0 0]
+ *  [0 1 2 3 4 5 6 7 8]]
+ *
+ * This fake matrix is then given to the machine learning method, which will
+ * eventually call PSpectrumStringKernel::Evaluate(a, b), where a and b are two
+ * columns of the fake matrix.  The string kernel will then map these fake
+ * columns back to the strings they represent, and then correctly evaluate the
+ * kernel.
+ *
+ * Unfortunately, not every machine learning method will work with this kernel.
+ * Only machine learning methods which do not ever operate on the explicit
+ * representation of points can use this kernel.  So, for instance, one cannot
+ * build a kd-tree on strings, because the BinarySpaceTree<> class will split
+ * the data according to the fake data matrix -- resulting in a meaningless
+ * tree.  This kernel was originally written for the FastMKS method; so, at the
+ * very least, it will work with that.
+ */
+class PSpectrumStringKernel
+{
+ public:
+  /**
+   * Initialize the PSpectrumStringKernel with the given string datasets.  For
+   * more information on this, see the general class documentation.
+   *
+   * @param datasets Sets of string data.
+   * @param p The length of substrings to search.
+   */
+  PSpectrumStringKernel(const std::vector<std::vector<std::string> >& datasets,
+                        const size_t p);
+
+  /**
+   * Evaluate the kernel for the string indices given.  As mentioned in the
+   * class documentation, a and b should be 2-element vectors, where the first
+   * element contains the index of the dataset and the second element contains
+   * the index of the string.  Therefore, if [2 3] is passed for a, the string
+   * used will be datasets[2][3] (datasets is of type
+   * std::vector<std::vector<std::string> >&).
+   *
+   * @param a Index of string and dataset for first string.
+   * @param b Index of string and dataset for second string.
+   */
+  template<typename VecType>
+  double Evaluate(const VecType& a, const VecType& b) const;
+
+  //! Access the lists of substrings.
+  const std::vector<std::vector<std::map<std::string, int> > >& Counts() const
+  { return counts; }
+  //! Modify the lists of substrings.
+  std::vector<std::vector<std::map<std::string, int> > >& Counts()
+  { return counts; }
+
+  //! Access the value of p.
+  size_t P() const { return p; }
+  //! Modify the value of p.
+  size_t& P() { return p; }
+
+ private:
+  //! The datasets.
+  const std::vector<std::vector<std::string> >& datasets;
+
+  //! Mappings of the datasets to counts of substrings.  Such a huge structure
+  //! is not wonderful...
+  std::vector<std::vector<std::map<std::string, int> > > counts;
+
+  //! The value of p to use in calculation.
+  size_t p;
+};
+
+}; // namespace kernel
+}; // namespace mlpack
+
+// Include implementation of templated Evaluate().
+#include "pspectrum_string_kernel_impl.hpp"
+
+#endif

Added: mlpack/trunk/src/mlpack/core/kernels/pspectrum_string_kernel_impl.hpp
===================================================================
--- mlpack/trunk/src/mlpack/core/kernels/pspectrum_string_kernel_impl.hpp	                        (rev 0)
+++ mlpack/trunk/src/mlpack/core/kernels/pspectrum_string_kernel_impl.hpp	2012-07-16 00:57:43 UTC (rev 13230)
@@ -0,0 +1,76 @@
+/**
+ * @file pspectrum_string_kernel_impl.hpp
+ * @author Ryan Curtin
+ *
+ * Implementation of the p-spectrum string kernel, created for use with FastMKS.
+ * Instead of passing a data matrix to FastMKS which stores the kernels, pass a
+ * one-dimensional data matrix (data vector) to FastMKS which stores indices of
+ * strings; then, the actual strings are given to the PSpectrumStringKernel at
+ * construction time, and the kernel knows to map the indices to actual strings.
+ */
+#ifndef __MLPACK_CORE_KERNELS_PSPECTRUM_STRING_KERNEL_IMPL_HPP
+#define __MLPACK_CORE_KERNELS_PSPECTRUM_STRING_KERNEL_IMPL_HPP
+
+// In case it has not been included yet.
+#include "pspectrum_string_kernel.hpp"
+
+namespace mlpack {
+namespace kernel {
+
+/**
+ * Evaluate the kernel for the string indices given.  As mentioned in the class
+ * documentation, a and b should be 2-element vectors, where the first element
+ * contains the index of the dataset and the second element contains the index
+ * of the string.  Therefore, if [2 3] is passed for a, the string used will be
+ * datasets[2][3] (datasets is of type std::vector<std::vector<std::string> >&).
+ *
+ * @param a Index of string and dataset for first string.
+ * @param b Index of string and dataset for second string.
+ */
+template<typename VecType>
+double PSpectrumStringKernel::Evaluate(const VecType& a,
+                                       const VecType& b) const
+{
+  // Get the map of substrings for the two strings we are interested in.
+  const std::map<std::string, int>& aMap = counts[a[0]][a[1]];
+  const std::map<std::string, int>& bMap = counts[b[0]][b[1]];
+
+  double eval = 0;
+
+  // Loop through the two maps (which, when iterated through, are sorted
+  // alphabetically).
+  std::map<std::string, int>::const_iterator aIt = aMap.begin();
+  std::map<std::string, int>::const_iterator bIt = bMap.begin();
+
+  while ((aIt != aMap.end()) && (bIt != bMap.end()))
+  {
+    // Compare alphabetically (this is how std::map is ordered).
+    int result = (*aIt).first.compare((*bIt).first);
+
+    if (result == 0) // The same substring.
+    {
+      eval += ((*aIt).second * (*bIt).second);
+
+      // Now increment both.
+      ++aIt;
+      ++bIt;
+    }
+    else if (result > 0)
+    {
+      // aIt is "ahead" of bIt (alphabetically); so increment bIt to "catch up".
+      ++bIt;
+    }
+    else
+    {
+      // bIt is "ahead" of aIt (alphabetically); so increment aIt to "catch up".
+      ++aIt;
+    }
+  }
+
+  return eval;
+}
+
+}; // namespace kernel
+}; // namespace mlpack
+
+#endif




More information about the mlpack-svn mailing list