[mlpack-svn] r13230 - mlpack/trunk/src/mlpack/core/kernels
fastlab-svn at coffeetalk-1.cc.gatech.edu
fastlab-svn at coffeetalk-1.cc.gatech.edu
Sun Jul 15 20:57:43 EDT 2012
Author: rcurtin
Date: 2012-07-15 20:57:43 -0400 (Sun, 15 Jul 2012)
New Revision: 13230
Added:
mlpack/trunk/src/mlpack/core/kernels/pspectrum_string_kernel.cpp
mlpack/trunk/src/mlpack/core/kernels/pspectrum_string_kernel.hpp
mlpack/trunk/src/mlpack/core/kernels/pspectrum_string_kernel_impl.hpp
Modified:
mlpack/trunk/src/mlpack/core/kernels/CMakeLists.txt
Log:
Add PSpectrumStringKernel to be used for maxip/FMKS.
Modified: mlpack/trunk/src/mlpack/core/kernels/CMakeLists.txt
===================================================================
--- mlpack/trunk/src/mlpack/core/kernels/CMakeLists.txt 2012-07-15 15:44:30 UTC (rev 13229)
+++ mlpack/trunk/src/mlpack/core/kernels/CMakeLists.txt 2012-07-16 00:57:43 UTC (rev 13230)
@@ -14,6 +14,9 @@
laplacian_kernel.hpp
linear_kernel.hpp
polynomial_kernel.hpp
+ pspectrum_string_kernel.hpp
+ pspectrum_string_kernel_impl.hpp
+ pspectrum_string_kernel.cpp
spherical_kernel.hpp
triangular_kernel.hpp
)
Added: mlpack/trunk/src/mlpack/core/kernels/pspectrum_string_kernel.cpp
===================================================================
--- mlpack/trunk/src/mlpack/core/kernels/pspectrum_string_kernel.cpp (rev 0)
+++ mlpack/trunk/src/mlpack/core/kernels/pspectrum_string_kernel.cpp 2012-07-16 00:57:43 UTC (rev 13230)
@@ -0,0 +1,83 @@
+/**
+ * @file pspectrum_string_kernel.cpp
+ * @author Ryan Curtin
+ *
+ * Implementation of the p-spectrum string kernel, created for use with FastMKS.
+ * Instead of passing a data matrix to FastMKS which stores the kernels, pass a
+ * one-dimensional data matrix (data vector) to FastMKS which stores indices of
+ * strings; then, the actual strings are given to the PSpectrumStringKernel at
+ * construction time, and the kernel knows to map the indices to actual strings.
+ */
+#include "pspectrum_string_kernel.hpp"
+
+using namespace std;
+using namespace mlpack;
+using namespace mlpack::kernel;
+
+/**
+ * Initialize the PSpectrumStringKernel with the given string datasets. For
+ * more information on this, see the general class documentation.
+ *
+ * @param datasets Sets of string data. @param p The length of substrings to
+ * search.
+ */
+mlpack::kernel::PSpectrumStringKernel::PSpectrumStringKernel(
+ const std::vector<std::vector<std::string> >& datasets,
+ const size_t p) :
+ datasets(datasets),
+ p(p)
+{
+ // We have to assemble the counts of substrings. This is not a particularly
+ // fast operation, unfortunately, but it only needs to be done once.
+ Log::Info << "Assembling counts of substrings of length " << p << "."
+ << std::endl;
+
+ // Resize for number of datasets.
+ counts.resize(datasets.size());
+
+ for (size_t dataset = 0; dataset < datasets.size(); ++dataset)
+ {
+ const std::vector<std::string>& set = datasets[dataset];
+
+ // Resize for number of strings in dataset.
+ counts[dataset].resize(set.size());
+
+ // Inspect each string in the dataset.
+ for (size_t index = 0; index < set.size(); ++index)
+ {
+ // Convenience references.
+ const std::string& str = set[index];
+ std::map<std::string, int>& mapping = counts[dataset][index];
+
+ size_t start = 0;
+ while ((start + p) <= str.length())
+ {
+ string sub = str.substr(start, p);
+
+ // Convert all characters to lowercase.
+ bool invalid = false;
+ for (size_t j = 0; j < p; ++j)
+ {
+ if (!isalnum(sub[j]))
+ {
+ invalid = true;
+ break; // Only consider substrings with alphanumerics.
+ }
+
+ sub[j] = tolower(sub[j]);
+ }
+
+ // Increment position in string.
+ ++start;
+
+ if (!invalid)
+ {
+ // Add to the map.
+ ++mapping[sub];
+ }
+ }
+ }
+ }
+
+ Log::Info << "Substring extraction complete." << std::endl;
+}
Added: mlpack/trunk/src/mlpack/core/kernels/pspectrum_string_kernel.hpp
===================================================================
--- mlpack/trunk/src/mlpack/core/kernels/pspectrum_string_kernel.hpp (rev 0)
+++ mlpack/trunk/src/mlpack/core/kernels/pspectrum_string_kernel.hpp 2012-07-16 00:57:43 UTC (rev 13230)
@@ -0,0 +1,116 @@
+/**
+ * @file pspectrum_string_kernel.hpp
+ * @author Ryan Curtin
+ *
+ * Implementation of the p-spectrum string kernel, created for use with FastMKS.
+ * Instead of passing a data matrix to FastMKS which stores the kernels, pass a
+ * one-dimensional data matrix (data vector) to FastMKS which stores indices of
+ * strings; then, the actual strings are given to the PSpectrumStringKernel at
+ * construction time, and the kernel knows to map the indices to actual strings.
+ */
+#ifndef __MLPACK_CORE_KERNELS_PSPECTRUM_STRING_KERNEL_HPP
+#define __MLPACK_CORE_KERNELS_PSPECTRUM_STRING_KERNEL_HPP
+
+#include <map>
+#include <string>
+#include <vector>
+
+#include <mlpack/core.hpp>
+
+namespace mlpack {
+namespace kernel {
+
+/**
+ * The p-spectrum string kernel. Given a length p, the p-spectrum kernel finds
+ * the contiguous subsequence match count between two strings. The kernel will
+ * take every possible substring of length p of one string and count how many
+ * times it appears in the other string.
+ *
+ * The string kernel, when created, must be passed a reference to a series of
+ * string datasets (std::vector<std::vector<std::string> >&). This is because
+ * MLPACK only supports datasets which are Armadillo matrices -- and a dataset
+ * of variable-length strings cannot be easily cast into an Armadillo matrix.
+ *
+ * Therefore, once the PSpectrumStringKernel is created with a reference to the
+ * string datasets, a "fake" Armadillo data matrix must be created, which simply
+ * holds indices to the strings they represent. This "fake" matrix has two rows
+ * and n columns (where n is the number of strings in the dataset). The first
+ * row holds the index of the dataset (remember, the kernel can have multiple
+ * datasets), and the second row holds the index of the string. A fake matrix
+ * containing only strings from dataset 0 might look like this:
+ *
+ * [[0 0 0 0 0 0 0 0 0]
+ * [0 1 2 3 4 5 6 7 8]]
+ *
+ * This fake matrix is then given to the machine learning method, which will
+ * eventually call PSpectrumStringKernel::Evaluate(a, b), where a and b are two
+ * columns of the fake matrix. The string kernel will then map these fake
+ * columns back to the strings they represent, and then correctly evaluate the
+ * kernel.
+ *
+ * Unfortunately, not every machine learning method will work with this kernel.
+ * Only machine learning methods which do not ever operate on the explicit
+ * representation of points can use this kernel. So, for instance, one cannot
+ * build a kd-tree on strings, because the BinarySpaceTree<> class will split
+ * the data according to the fake data matrix -- resulting in a meaningless
+ * tree. This kernel was originally written for the FastMKS method; so, at the
+ * very least, it will work with that.
+ */
+class PSpectrumStringKernel
+{
+ public:
+ /**
+ * Initialize the PSpectrumStringKernel with the given string datasets. For
+ * more information on this, see the general class documentation.
+ *
+ * @param datasets Sets of string data.
+ * @param p The length of substrings to search.
+ */
+ PSpectrumStringKernel(const std::vector<std::vector<std::string> >& datasets,
+ const size_t p);
+
+ /**
+ * Evaluate the kernel for the string indices given. As mentioned in the
+ * class documentation, a and b should be 2-element vectors, where the first
+ * element contains the index of the dataset and the second element contains
+ * the index of the string. Therefore, if [2 3] is passed for a, the string
+ * used will be datasets[2][3] (datasets is of type
+ * std::vector<std::vector<std::string> >&).
+ *
+ * @param a Index of string and dataset for first string.
+ * @param b Index of string and dataset for second string.
+ */
+ template<typename VecType>
+ double Evaluate(const VecType& a, const VecType& b) const;
+
+ //! Access the lists of substrings.
+ const std::vector<std::vector<std::map<std::string, int> > >& Counts() const
+ { return counts; }
+ //! Modify the lists of substrings.
+ std::vector<std::vector<std::map<std::string, int> > >& Counts()
+ { return counts; }
+
+ //! Access the value of p.
+ size_t P() const { return p; }
+ //! Modify the value of p.
+ size_t& P() { return p; }
+
+ private:
+ //! The datasets.
+ const std::vector<std::vector<std::string> >& datasets;
+
+ //! Mappings of the datasets to counts of substrings. Such a huge structure
+ //! is not wonderful...
+ std::vector<std::vector<std::map<std::string, int> > > counts;
+
+ //! The value of p to use in calculation.
+ size_t p;
+};
+
+}; // namespace kernel
+}; // namespace mlpack
+
+// Include implementation of templated Evaluate().
+#include "pspectrum_string_kernel_impl.hpp"
+
+#endif
Added: mlpack/trunk/src/mlpack/core/kernels/pspectrum_string_kernel_impl.hpp
===================================================================
--- mlpack/trunk/src/mlpack/core/kernels/pspectrum_string_kernel_impl.hpp (rev 0)
+++ mlpack/trunk/src/mlpack/core/kernels/pspectrum_string_kernel_impl.hpp 2012-07-16 00:57:43 UTC (rev 13230)
@@ -0,0 +1,76 @@
+/**
+ * @file pspectrum_string_kernel_impl.hpp
+ * @author Ryan Curtin
+ *
+ * Implementation of the p-spectrum string kernel, created for use with FastMKS.
+ * Instead of passing a data matrix to FastMKS which stores the kernels, pass a
+ * one-dimensional data matrix (data vector) to FastMKS which stores indices of
+ * strings; then, the actual strings are given to the PSpectrumStringKernel at
+ * construction time, and the kernel knows to map the indices to actual strings.
+ */
+#ifndef __MLPACK_CORE_KERNELS_PSPECTRUM_STRING_KERNEL_IMPL_HPP
+#define __MLPACK_CORE_KERNELS_PSPECTRUM_STRING_KERNEL_IMPL_HPP
+
+// In case it has not been included yet.
+#include "pspectrum_string_kernel.hpp"
+
+namespace mlpack {
+namespace kernel {
+
+/**
+ * Evaluate the kernel for the string indices given. As mentioned in the class
+ * documentation, a and b should be 2-element vectors, where the first element
+ * contains the index of the dataset and the second element contains the index
+ * of the string. Therefore, if [2 3] is passed for a, the string used will be
+ * datasets[2][3] (datasets is of type std::vector<std::vector<std::string> >&).
+ *
+ * @param a Index of string and dataset for first string.
+ * @param b Index of string and dataset for second string.
+ */
+template<typename VecType>
+double PSpectrumStringKernel::Evaluate(const VecType& a,
+ const VecType& b) const
+{
+ // Get the map of substrings for the two strings we are interested in.
+ const std::map<std::string, int>& aMap = counts[a[0]][a[1]];
+ const std::map<std::string, int>& bMap = counts[b[0]][b[1]];
+
+ double eval = 0;
+
+ // Loop through the two maps (which, when iterated through, are sorted
+ // alphabetically).
+ std::map<std::string, int>::const_iterator aIt = aMap.begin();
+ std::map<std::string, int>::const_iterator bIt = bMap.begin();
+
+ while ((aIt != aMap.end()) && (bIt != bMap.end()))
+ {
+ // Compare alphabetically (this is how std::map is ordered).
+ int result = (*aIt).first.compare((*bIt).first);
+
+ if (result == 0) // The same substring.
+ {
+ eval += ((*aIt).second * (*bIt).second);
+
+ // Now increment both.
+ ++aIt;
+ ++bIt;
+ }
+ else if (result > 0)
+ {
+ // aIt is "ahead" of bIt (alphabetically); so increment bIt to "catch up".
+ ++bIt;
+ }
+ else
+ {
+ // bIt is "ahead" of aIt (alphabetically); so increment aIt to "catch up".
+ ++aIt;
+ }
+ }
+
+ return eval;
+}
+
+}; // namespace kernel
+}; // namespace mlpack
+
+#endif
More information about the mlpack-svn
mailing list