[mlpack-svn] r13844 - mlpack/trunk/src/mlpack/methods/fastmks
fastlab-svn at coffeetalk-1.cc.gatech.edu
fastlab-svn at coffeetalk-1.cc.gatech.edu
Mon Nov 5 16:53:56 EST 2012
Author: rcurtin
Date: 2012-11-05 16:53:56 -0500 (Mon, 05 Nov 2012)
New Revision: 13844
Added:
mlpack/trunk/src/mlpack/methods/fastmks/fastmks.hpp
mlpack/trunk/src/mlpack/methods/fastmks/fastmks_impl.hpp
mlpack/trunk/src/mlpack/methods/fastmks/fastmks_main.cpp
mlpack/trunk/src/mlpack/methods/fastmks/fastmks_rules.hpp
mlpack/trunk/src/mlpack/methods/fastmks/fastmks_rules_impl.hpp
Removed:
mlpack/trunk/src/mlpack/methods/fastmks/max_ip.hpp
mlpack/trunk/src/mlpack/methods/fastmks/max_ip_impl.hpp
mlpack/trunk/src/mlpack/methods/fastmks/max_ip_main.cpp
mlpack/trunk/src/mlpack/methods/fastmks/max_ip_rules.hpp
mlpack/trunk/src/mlpack/methods/fastmks/max_ip_rules_impl.hpp
Modified:
mlpack/trunk/src/mlpack/methods/fastmks/CMakeLists.txt
mlpack/trunk/src/mlpack/methods/fastmks/ip_metric.hpp
mlpack/trunk/src/mlpack/methods/fastmks/ip_metric_impl.hpp
Log:
Rename all MaxIP files to FastMKS.
Modified: mlpack/trunk/src/mlpack/methods/fastmks/CMakeLists.txt
===================================================================
--- mlpack/trunk/src/mlpack/methods/fastmks/CMakeLists.txt 2012-11-05 10:00:54 UTC (rev 13843)
+++ mlpack/trunk/src/mlpack/methods/fastmks/CMakeLists.txt 2012-11-05 21:53:56 UTC (rev 13844)
@@ -3,10 +3,10 @@
set(SOURCES
ip_metric.hpp
ip_metric_impl.hpp
- max_ip.hpp
- max_ip_impl.hpp
- max_ip_rules.hpp
- max_ip_rules_impl.hpp
+ fastmks.hpp
+ fastmks_impl.hpp
+ fastmks_rules.hpp
+ fastmks_rules_impl.hpp
)
# Add directory name to sources.
@@ -18,18 +18,11 @@
# the parent scope).
set(MLPACK_SRCS ${MLPACK_SRCS} ${DIR_SRCS} PARENT_SCOPE)
-add_executable(maxip
- max_ip_main.cpp
+add_executable(fastmks
+ fastmks_main.cpp
)
-target_link_libraries(maxip
+target_link_libraries(fastmks
mlpack
)
-#add_executable(maxip_string
-# max_ip_string_main.cpp
-#)
-#target_link_libraries(maxip_string
-# mlpack
-#)
-
-install(TARGETS maxip RUNTIME DESTINATION bin)
+install(TARGETS fastmks RUNTIME DESTINATION bin)
Copied: mlpack/trunk/src/mlpack/methods/fastmks/fastmks.hpp (from rev 13837, mlpack/trunk/src/mlpack/methods/fastmks/max_ip.hpp)
===================================================================
--- mlpack/trunk/src/mlpack/methods/fastmks/fastmks.hpp (rev 0)
+++ mlpack/trunk/src/mlpack/methods/fastmks/fastmks.hpp 2012-11-05 21:53:56 UTC (rev 13844)
@@ -0,0 +1,70 @@
+/**
+ * @file fastmks.hpp
+ * @author Ryan Curtin
+ *
+ * Definition of the FastMKS class, which is the fast max-kernel search.
+ */
+#ifndef __MLPACK_METHODS_FASTMKS_FASTMKS_HPP
+#define __MLPACK_METHODS_FASTMKS_FASTMKS_HPP
+
+#include <mlpack/core.hpp>
+#include "ip_metric.hpp"
+#include <mlpack/core/tree/cover_tree/cover_tree.hpp>
+
+namespace mlpack {
+namespace fastmks {
+
+template<typename KernelType>
+class FastMKS
+{
+ public:
+ FastMKS(const arma::mat& referenceSet,
+ KernelType& kernel,
+ bool single = false,
+ bool naive = false,
+ double expansionConstant = 2.0);
+
+ FastMKS(const arma::mat& referenceSet,
+ const arma::mat& querySet,
+ KernelType& kernel,
+ bool single = false,
+ bool naive = false,
+ double expansionConstant = 2.0);
+
+ ~FastMKS();
+
+ void Search(const size_t k,
+ arma::Mat<size_t>& indices,
+ arma::mat& products);
+
+ private:
+ const arma::mat& referenceSet;
+
+ const arma::mat& querySet;
+
+ tree::CoverTree<IPMetric<KernelType> >* referenceTree;
+
+ tree::CoverTree<IPMetric<KernelType> >* queryTree;
+
+ bool single;
+
+ bool naive;
+
+ IPMetric<KernelType> metric;
+
+ // Utility function. Copied too many times from too many places.
+ void InsertNeighbor(arma::Mat<size_t>& indices,
+ arma::mat& products,
+ const size_t queryIndex,
+ const size_t pos,
+ const size_t neighbor,
+ const double distance);
+};
+
+}; // namespace fastmks
+}; // namespace mlpack
+
+// Include implementation.
+#include "fastmks_impl.hpp"
+
+#endif
Copied: mlpack/trunk/src/mlpack/methods/fastmks/fastmks_impl.hpp (from rev 13837, mlpack/trunk/src/mlpack/methods/fastmks/max_ip_impl.hpp)
===================================================================
--- mlpack/trunk/src/mlpack/methods/fastmks/fastmks_impl.hpp (rev 0)
+++ mlpack/trunk/src/mlpack/methods/fastmks/fastmks_impl.hpp 2012-11-05 21:53:56 UTC (rev 13844)
@@ -0,0 +1,748 @@
+/**
+ * @file fastmks_impl.hpp
+ * @author Ryan Curtin
+ *
+ * Implementation of the FastMKS class (fast max-kernel search).
+ */
+#ifndef __MLPACK_METHODS_FASTMKS_FASTMKS_IMPL_HPP
+#define __MLPACK_METHODS_FASTMKS_FASTMKS_IMPL_HPP
+
+// In case it hasn't yet been included.
+#include "fastmks.hpp"
+
+#include "fastmks_rules.hpp"
+
+#include <mlpack/core/kernels/gaussian_kernel.hpp>
+#include <queue>
+
+namespace mlpack {
+namespace fastmks {
+
+template<typename TreeType>
+void RecurseTreeCountLeaves(const TreeType& node, arma::vec& counts)
+{
+ for (size_t i = 0; i < node.NumChildren(); ++i)
+ {
+ if (node.Child(i).NumChildren() == 0)
+ counts[node.Child(i).Point()]++;
+ else
+ RecurseTreeCountLeaves<TreeType>(node.Child(i), counts);
+ }
+}
+
+template<typename TreeType>
+void CheckSelfChild(const TreeType& node)
+{
+ if (node.NumChildren() == 0)
+ return; // No self-child applicable here.
+
+ bool found = false;
+ for (size_t i = 0; i < node.NumChildren(); ++i)
+ {
+ if (node.Child(i).Point() == node.Point())
+ found = true;
+
+ // Recursively check the children.
+ CheckSelfChild(node.Child(i));
+ }
+
+ // Ensure this has its own self-child.
+ Log::Assert(found == true);
+}
+
+template<typename TreeType, typename MetricType>
+void CheckCovering(const TreeType& node)
+{
+ // Return if a leaf. No checking necessary.
+ if (node.NumChildren() == 0)
+ return;
+
+ const arma::mat& dataset = node.Dataset();
+ const size_t nodePoint = node.Point();
+
+ // To ensure that this node satisfies the covering principle, we must ensure
+ // that the distance to each child is less than pow(expansionConstant, scale).
+ double maxDistance = pow(node.ExpansionConstant(), node.Scale());
+ for (size_t i = 0; i < node.NumChildren(); ++i)
+ {
+ const size_t childPoint = node.Child(i).Point();
+
+ double distance = MetricType::Evaluate(dataset.col(nodePoint),
+ dataset.col(childPoint));
+
+ Log::Assert(distance <= maxDistance);
+
+ // Check the child.
+ CheckCovering<TreeType, MetricType>(node.Child(i));
+ }
+}
+
+template<typename TreeType, typename MetricType>
+void CheckIndividualSeparation(const TreeType& constantNode,
+ const TreeType& node)
+{
+ // Don't check points at a lower scale.
+ if (node.Scale() < constantNode.Scale())
+ return;
+
+ // If at a higher scale, recurse.
+ if (node.Scale() > constantNode.Scale())
+ {
+ for (size_t i = 0; i < node.NumChildren(); ++i)
+ {
+ // Don't recurse into leaves.
+ if (node.Child(i).NumChildren() > 0)
+ CheckIndividualSeparation<TreeType, MetricType>(constantNode,
+ node.Child(i));
+ }
+
+ return;
+ }
+
+ // Don't compare the same point against itself.
+ if (node.Point() == constantNode.Point())
+ return;
+
+ // Now we know we are at the same scale, so make the comparison.
+ const arma::mat& dataset = constantNode.Dataset();
+ const size_t constantPoint = constantNode.Point();
+ const size_t nodePoint = node.Point();
+
+ // Make sure the distance is at least the following value (in accordance with
+ // the separation principle of cover trees).
+ double minDistance = pow(constantNode.ExpansionConstant(),
+ constantNode.Scale());
+
+ double distance = MetricType::Evaluate(dataset.col(constantPoint),
+ dataset.col(nodePoint));
+
+}
+
+template<typename TreeType, typename MetricType>
+void CheckSeparation(const TreeType& node, const TreeType& root)
+{
+ // Check the separation between this point and all other points on this scale.
+ CheckIndividualSeparation<TreeType, MetricType>(node, root);
+
+ // Check the children, but only if they are not leaves. Leaves don't need to
+ // be checked.
+ for (size_t i = 0; i < node.NumChildren(); ++i)
+ if (node.Child(i).NumChildren() > 0)
+ CheckSeparation<TreeType, MetricType>(node.Child(i), root);
+}
+
+template<typename TreeType, typename MetricType>
+void GetMaxDistance(TreeType& node,
+ TreeType& constantNode,
+ double& best,
+ size_t& index)
+{
+ const arma::mat& dataset = node.Dataset();
+ const double eval = MetricType::Evaluate(dataset.unsafe_col(node.Point()),
+ dataset.unsafe_col(constantNode.Point()));
+ if (eval > best)
+ {
+ best = eval;
+ index = node.Point();
+ }
+
+ // Recurse into children.
+ for (size_t i = 0; i < node.NumChildren(); ++i)
+ GetMaxDistance<TreeType, MetricType>(node.Child(i), constantNode, best,
+ index);
+}
+
+template<typename TreeType, typename MetricType>
+void CheckMaxDistances(TreeType& node)
+{
+ // Check child distances.
+ for (size_t i = 0; i < node.NumChildren(); ++i)
+ {
+ const arma::mat& dataset = node.Dataset();
+ double eval = MetricType::Evaluate(dataset.unsafe_col(node.Point()),
+ dataset.unsafe_col(node.Child(i).Point()));
+
+ Log::Assert(std::abs(eval - node.Child(i).ParentDistance()) < 1e-10);
+ }
+
+ // Check all descendants.
+ double maxDescendantDistance = 0;
+ size_t maxIndex = 0;
+ GetMaxDistance<TreeType, MetricType>(node, node, maxDescendantDistance,
+ maxIndex);
+
+ Log::Assert(std::abs(maxDescendantDistance -
+ node.FurthestDescendantDistance()) < 1e-10);
+
+ for (size_t i = 0; i < node.NumChildren(); ++i)
+ CheckMaxDistances<TreeType, MetricType>(node.Child(i));
+}
+
+template<typename TreeType>
+struct SearchFrame
+{
+ TreeType* node;
+ double eval;
+};
+
+template<typename TreeType>
+class SearchFrameCompare
+{
+ public:
+ bool operator()(const SearchFrame<TreeType>& lhs,
+ const SearchFrame<TreeType>& rhs)
+ {
+ // Compare scale.
+ if (lhs.node->Scale() != rhs.node->Scale())
+ return (lhs.node->Scale() < rhs.node->Scale());
+ else
+ {
+ // Now we have to compare by evaluation.
+ return (lhs.eval < rhs.eval);
+ }
+ }
+};
+
+template<typename KernelType>
+FastMKS<KernelType>::FastMKS(const arma::mat& referenceSet,
+ KernelType& kernel,
+ bool single,
+ bool naive,
+ double expansionConstant) :
+ referenceSet(referenceSet),
+ querySet(referenceSet), // Gotta point it somewhere...
+ queryTree(NULL),
+ single(single),
+ naive(naive),
+ metric(kernel)
+{
+
+ Timer::Start("tree_building");
+
+ // Build the tree. Could we do this in the initialization list?
+ if (naive)
+ referenceTree = NULL;
+ else
+ referenceTree = new tree::CoverTree<IPMetric<KernelType> >(referenceSet,
+ expansionConstant, &metric);
+
+ Timer::Stop("tree_building");
+}
+
+template<typename KernelType>
+FastMKS<KernelType>::FastMKS(const arma::mat& referenceSet,
+ const arma::mat& querySet,
+ KernelType& kernel,
+ bool single,
+ bool naive,
+ double expansionConstant) :
+ referenceSet(referenceSet),
+ querySet(querySet),
+ single(single),
+ naive(naive),
+ metric(kernel)
+{
+ Timer::Start("tree_building");
+
+ // Build the trees. Could we do this in the initialization lists?
+ if (naive)
+ referenceTree = NULL;
+ else
+ referenceTree = new tree::CoverTree<IPMetric<KernelType> >(referenceSet,
+ expansionConstant, &metric);
+
+ if (single || naive)
+ queryTree = NULL;
+ else
+ queryTree = new tree::CoverTree<IPMetric<KernelType> >(querySet,
+ expansionConstant, &metric);
+
+/* if (referenceTree != NULL)
+ {
+ Log::Debug << "Check counts" << std::endl;
+ // Now loop through the tree and ensure that each leaf is only created once.
+ arma::vec counts;
+ counts.zeros(referenceSet.n_elem);
+ RecurseTreeCountLeaves(*referenceTree, counts);
+
+ // Each point should only have one leaf node representing it.
+ for (size_t i = 0; i < 20; ++i)
+ Log::Assert(counts[i] == 1);
+
+ Log::Debug << "Check self child\n";
+ // Each non-leaf should have a self-child.
+ CheckSelfChild<tree::CoverTree<IPMetric<KernelType> > >(*referenceTree);
+
+ Log::Debug << "Check covering\n";
+ // Each node must satisfy the covering principle (its children must be less
+ // than or equal to a certain distance apart).
+ CheckCovering<tree::CoverTree<IPMetric<KernelType> >, IPMetric<KernelType> >(*referenceTree);
+
+ Log::Debug << "Check max distances\n";
+ // Check maximum distance of children and grandchildren.
+ CheckMaxDistances<tree::CoverTree<IPMetric<KernelType> >, IPMetric<KernelType> >(*referenceTree);
+ Log::Debug << "Done\n";
+ }*/
+
+ Timer::Stop("tree_building");
+}
+
+template<typename KernelType>
+FastMKS<KernelType>::~FastMKS()
+{
+ if (queryTree)
+ delete queryTree;
+ if (referenceTree)
+ delete referenceTree;
+}
+
+template<typename KernelType>
+void FastMKS<KernelType>::Search(const size_t k,
+ arma::Mat<size_t>& indices,
+ arma::mat& products)
+{
+ // No remapping will be necessary.
+ indices.set_size(k, querySet.n_cols);
+ products.set_size(k, querySet.n_cols);
+ products.fill(-1.0);
+
+ Timer::Start("computing_products");
+
+ size_t kernelEvaluations = 0;
+
+ // Naive implementation.
+ if (naive)
+ {
+ // Simple double loop. Stupid, slow, but a good benchmark.
+ for (size_t q = 0; q < querySet.n_cols; ++q)
+ {
+ for (size_t r = 0; r < referenceSet.n_cols; ++r)
+ {
+ const double eval = metric.Kernel().Evaluate(querySet.unsafe_col(q),
+ referenceSet.unsafe_col(r));
+ ++kernelEvaluations;
+
+ size_t insertPosition;
+ for (insertPosition = 0; insertPosition < indices.n_rows;
+ ++insertPosition)
+ if (eval > products(insertPosition, q))
+ break;
+
+ if (insertPosition < indices.n_rows)
+ InsertNeighbor(indices, products, q, insertPosition, r, eval);
+ }
+ }
+
+ Timer::Stop("computing_products");
+
+ Log::Info << "Kernel evaluations: " << kernelEvaluations << "."
+ << std::endl;
+ return;
+ }
+
+ // Single-tree implementation.
+ if (single)
+ {
+ // Calculate number of pruned nodes.
+ size_t numPrunes = 0;
+
+ // Precalculate query products ( || q || for all q).
+ arma::vec queryProducts(querySet.n_cols);
+ for (size_t queryIndex = 0; queryIndex < querySet.n_cols; ++queryIndex)
+ queryProducts[queryIndex] = sqrt(metric.Kernel().Evaluate(
+ querySet.unsafe_col(queryIndex), querySet.unsafe_col(queryIndex)));
+ kernelEvaluations += querySet.n_cols;
+
+ // Screw the CoverTreeTraverser, we'll implement it by hand.
+ for (size_t queryIndex = 0; queryIndex < querySet.n_cols; ++queryIndex)
+ {
+ // Use an array of priority queues?
+ std::priority_queue<
+ SearchFrame<tree::CoverTree<IPMetric<KernelType> > >,
+ std::vector<SearchFrame<tree::CoverTree<IPMetric<KernelType> > > >,
+ SearchFrameCompare<tree::CoverTree<IPMetric<KernelType> > > >
+ frameQueue;
+
+ // Add initial frame.
+ SearchFrame<tree::CoverTree<IPMetric<KernelType> > > nextFrame;
+ nextFrame.node = referenceTree;
+ nextFrame.eval = metric.Kernel().Evaluate(querySet.unsafe_col(queryIndex),
+ referenceSet.unsafe_col(referenceTree->Point()));
+ ++kernelEvaluations;
+
+ // The initial evaluation will be the best so far.
+ indices(0, queryIndex) = referenceTree->Point();
+ products(0, queryIndex) = nextFrame.eval;
+
+ frameQueue.push(nextFrame);
+
+ tree::CoverTree<IPMetric<KernelType> >* referenceNode;
+ double eval;
+ double maxProduct;
+
+ while (!frameQueue.empty())
+ {
+ // Get the information for this node.
+ const SearchFrame<tree::CoverTree<IPMetric<KernelType> > >& frame =
+ frameQueue.top();
+
+ referenceNode = frame.node;
+ eval = frame.eval;
+
+ // Loop through the children, seeing if we can prune them; if not, add
+ // them to the queue. The self-child is different -- it has the same
+ // parent (and therefore the same kernel evaluation).
+ if (referenceNode->NumChildren() > 0)
+ {
+ SearchFrame<tree::CoverTree<IPMetric<KernelType> > > childFrame;
+
+ // We must handle the self-child differently, to avoid adding it to
+ // the results twice.
+ childFrame.node = &(referenceNode->Child(0));
+ childFrame.eval = eval;
+
+// maxProduct = eval + std::pow(childFrame.node->ExpansionConstant(),
+// childFrame.node->Scale() + 1) * queryProducts[queryIndex];
+ // Alternate pruning rule.
+ maxProduct = eval + childFrame.node->FurthestDescendantDistance() *
+ queryProducts[queryIndex];
+
+ // Add self-child if we can't prune it.
+ if (maxProduct > products(products.n_rows - 1, queryIndex))
+ {
+ // But only if it has children of its own.
+ if (childFrame.node->NumChildren() > 0)
+ frameQueue.push(childFrame);
+ }
+ else
+ ++numPrunes;
+
+ for (size_t i = 1; i < referenceNode->NumChildren(); ++i)
+ {
+ // Before we evaluate the child, let's see if it can possibly have
+ // a better evaluation.
+ double maxChildEval = eval + queryProducts[queryIndex] *
+ (referenceNode->Child(i).ParentDistance() +
+ referenceNode->Child(i).FurthestDescendantDistance());
+
+ if (maxChildEval <= products(products.n_rows - 1, queryIndex))
+ {
+ ++numPrunes;
+ continue; // Skip this child; it can't be any better.
+ }
+
+ // Evaluate child.
+ childFrame.node = &(referenceNode->Child(i));
+ childFrame.eval = metric.Kernel().Evaluate(
+ querySet.unsafe_col(queryIndex),
+ referenceSet.unsafe_col(referenceNode->Child(i).Point()));
+ ++kernelEvaluations;
+
+ // Can we prune it? If we can, we can avoid putting it in the queue
+ // (saves time).
+// double maxProduct = childFrame.eval +
+// std::pow(childFrame.node->ExpansionConstant(),
+// childFrame.node->Scale() + 1) * queryProducts[queryIndex];
+ // Alternate pruning rule.
+ maxProduct = childFrame.eval + queryProducts[queryIndex] *
+ childFrame.node->FurthestDescendantDistance();
+
+ if (maxProduct > products(products.n_rows - 1, queryIndex))
+ {
+ // Good enough to recurse into. While we're at it, check the
+ // actual evaluation and see if it's an improvement.
+ if (childFrame.eval > products(products.n_rows - 1, queryIndex))
+ {
+ // This is a better result. Find out where to insert.
+ size_t insertPosition = 0;
+ for ( ; insertPosition < products.n_rows - 1; ++insertPosition)
+ if (childFrame.eval > products(insertPosition, queryIndex))
+ break;
+
+ // Insert into the correct position; we are guaranteed that
+ // insertPosition is valid.
+ InsertNeighbor(indices, products, queryIndex, insertPosition,
+ childFrame.node->Point(), childFrame.eval);
+ }
+
+ // Now add this to the queue (if it has any children which may
+ // need to be recursed into).
+ if (childFrame.node->NumChildren() > 0)
+ frameQueue.push(childFrame);
+ }
+ else
+ {
+ ++numPrunes;
+ }
+ }
+ }
+
+ frameQueue.pop();
+ }
+ }
+
+ Log::Info << "Pruned " << numPrunes << " nodes." << std::endl;
+ Log::Info << "Kernel evaluations: " << kernelEvaluations << "."
+ << std::endl;
+ Log::Info << "Distance evaluations: " << distanceEvaluations << "."
+ << std::endl;
+
+ Timer::Stop("computing_products");
+ return;
+ }
+
+ // Double-tree implementation.
+ Log::Fatal << "Dual-tree search not implemented yet... oops..." << std::endl;
+}
+
+/**
+ * Helper function to insert a point into the neighbors and distances matrices.
+ *
+ * @param queryIndex Index of point whose neighbors we are inserting into.
+ * @param pos Position in list to insert into.
+ * @param neighbor Index of reference point which is being inserted.
+ * @param distance Distance from query point to reference point.
+ */
+template<typename KernelType>
+void FastMKS<KernelType>::InsertNeighbor(arma::Mat<size_t>& indices,
+ arma::mat& products,
+ const size_t queryIndex,
+ const size_t pos,
+ const size_t neighbor,
+ const double distance)
+{
+ // We only memmove() if there is actually a need to shift something.
+ if (pos < (products.n_rows - 1))
+ {
+ int len = (products.n_rows - 1) - pos;
+ memmove(products.colptr(queryIndex) + (pos + 1),
+ products.colptr(queryIndex) + pos,
+ sizeof(double) * len);
+ memmove(indices.colptr(queryIndex) + (pos + 1),
+ indices.colptr(queryIndex) + pos,
+ sizeof(size_t) * len);
+ }
+
+ // Now put the new information in the right index.
+ products(pos, queryIndex) = distance;
+ indices(pos, queryIndex) = neighbor;
+}
+
+// Specialized implementation for tighter bounds for Gaussian.
+template<>
+void FastMKS<kernel::GaussianKernel>::Search(const size_t k,
+ arma::Mat<size_t>& indices,
+ arma::mat& products)
+{
+ Log::Warn << "Alternate implementation!" << std::endl;
+
+ // Terrible copypasta. Bad bad bad.
+ // No remapping will be necessary.
+ indices.set_size(k, querySet.n_cols);
+ products.set_size(k, querySet.n_cols);
+ products.fill(-1.0);
+
+ Timer::Start("computing_products");
+
+ size_t kernelEvaluations = 0;
+
+ // Naive implementation.
+ if (naive)
+ {
+ // Simple double loop. Stupid, slow, but a good benchmark.
+ for (size_t q = 0; q < querySet.n_cols; ++q)
+ {
+ for (size_t r = 0; r < referenceSet.n_cols; ++r)
+ {
+ const double eval = metric.Kernel().Evaluate(querySet.unsafe_col(q),
+ referenceSet.unsafe_col(r));
+ ++kernelEvaluations;
+
+ size_t insertPosition;
+ for (insertPosition = 0; insertPosition < indices.n_rows;
+ ++insertPosition)
+ if (eval > products(insertPosition, q))
+ break;
+
+ if (insertPosition < indices.n_rows)
+ InsertNeighbor(indices, products, q, insertPosition, r, eval);
+ }
+ }
+
+ Timer::Stop("computing_products");
+
+ Log::Info << "Kernel evaluations: " << kernelEvaluations << "." << std::endl;
+ return;
+ }
+
+ // Single-tree implementation.
+ if (single)
+ {
+ // Calculate number of pruned nodes.
+ size_t numPrunes = 0;
+
+ // Precalculate query products ( || q || for all q).
+ arma::vec queryProducts(querySet.n_cols);
+ for (size_t queryIndex = 0; queryIndex < querySet.n_cols; ++queryIndex)
+ queryProducts[queryIndex] = sqrt(metric.Kernel().Evaluate(
+ querySet.unsafe_col(queryIndex), querySet.unsafe_col(queryIndex)));
+ kernelEvaluations += querySet.n_cols;
+
+ // Screw the CoverTreeTraverser, we'll implement it by hand.
+ for (size_t queryIndex = 0; queryIndex < querySet.n_cols; ++queryIndex)
+ {
+ // Use an array of priority queues?
+ std::priority_queue<
+ SearchFrame<tree::CoverTree<IPMetric<kernel::GaussianKernel> > >,
+ std::vector<SearchFrame<tree::CoverTree<IPMetric<
+ kernel::GaussianKernel> > > >,
+ SearchFrameCompare<tree::CoverTree<IPMetric<
+ kernel::GaussianKernel> > > >
+ frameQueue;
+
+ // Add initial frame.
+ SearchFrame<tree::CoverTree<IPMetric<kernel::GaussianKernel> > >
+ nextFrame;
+ nextFrame.node = referenceTree;
+ nextFrame.eval = metric.Kernel().Evaluate(querySet.unsafe_col(queryIndex),
+ referenceSet.unsafe_col(referenceTree->Point()));
+ Log::Assert(nextFrame.eval <= 1);
+ ++kernelEvaluations;
+
+ // The initial evaluation will be the best so far.
+ indices(0, queryIndex) = referenceTree->Point();
+ products(0, queryIndex) = nextFrame.eval;
+
+ frameQueue.push(nextFrame);
+
+ tree::CoverTree<IPMetric<kernel::GaussianKernel> >* referenceNode;
+ double eval;
+ double maxProduct;
+
+ while (!frameQueue.empty())
+ {
+ // Get the information for this node.
+ const SearchFrame<tree::CoverTree<IPMetric<kernel::GaussianKernel> > >&
+ frame = frameQueue.top();
+
+ referenceNode = frame.node;
+ eval = frame.eval;
+
+ // Loop through the children, seeing if we can prune them; if not, add
+ // them to the queue. The self-child is different -- it has the same
+ // parent (and therefore the same kernel evaluation).
+ if (referenceNode->NumChildren() > 0)
+ {
+ SearchFrame<tree::CoverTree<IPMetric<kernel::GaussianKernel> > >
+ childFrame;
+
+ // We must handle the self-child differently, to avoid adding it to
+ // the results twice.
+ childFrame.node = &(referenceNode->Child(0));
+ childFrame.eval = eval;
+
+ // Alternate pruning rule.
+ const double mdd = childFrame.node->FurthestDescendantDistance();
+ if (eval >= (1 - std::pow(mdd, 2.0) / 2.0))
+ maxProduct = 1;
+ else
+ maxProduct = eval * (1 - std::pow(mdd, 2.0) / 2.0) + sqrt(1 -
+ std::pow(eval, 2.0)) * mdd * sqrt(1 - std::pow(mdd, 2.0) / 4.0);
+
+ // Add self-child if we can't prune it.
+ if (maxProduct > products(products.n_rows - 1, queryIndex))
+ {
+ // But only if it has children of its own.
+ if (childFrame.node->NumChildren() > 0)
+ frameQueue.push(childFrame);
+ }
+ else
+ ++numPrunes;
+
+ for (size_t i = 1; i < referenceNode->NumChildren(); ++i)
+ {
+ // Before we evaluate the child, let's see if it can possibly have
+ // a better evaluation.
+ const double mpdd = std::min(
+ referenceNode->Child(i).ParentDistance() +
+ referenceNode->Child(i).FurthestDescendantDistance(), 2.0);
+ double maxChildEval = 1;
+ if (eval < (1 - std::pow(mpdd, 2.0) / 2.0))
+ maxChildEval = eval * (1 - std::pow(mpdd, 2.0) / 2.0) + sqrt(1 -
+ std::pow(eval, 2.0)) * mpdd * sqrt(1 - std::pow(mpdd, 2.0)
+ / 4.0);
+
+ if (maxChildEval > products(products.n_rows - 1, queryIndex))
+ {
+ // Evaluate child.
+ childFrame.node = &(referenceNode->Child(i));
+ childFrame.eval = metric.Kernel().Evaluate(
+ querySet.unsafe_col(queryIndex),
+ referenceSet.unsafe_col(referenceNode->Child(i).Point()));
+ ++kernelEvaluations;
+
+ // Can we prune it? If we can, we can avoid putting it in the
+ // queue (saves time).
+ const double cmdd = childFrame.node->FurthestDescendantDistance();
+ if (childFrame.eval > (1 - std::pow(cmdd, 2.0) / 2.0))
+ maxProduct = 1;
+ else
+ maxProduct = childFrame.eval * (1 - std::pow(cmdd, 2.0) / 2.0)
+ + sqrt(1 - std::pow(eval, 2.0)) * cmdd * sqrt(1 -
+ std::pow(cmdd, 2.0) / 4.0);
+
+ if (maxProduct > products(products.n_rows - 1, queryIndex))
+ {
+ // Good enough to recurse into. While we're at it, check the
+ // actual evaluation and see if it's an improvement.
+ if (childFrame.eval > products(products.n_rows - 1, queryIndex))
+ {
+ // This is a better result. Find out where to insert.
+ size_t insertPosition = 0;
+ for ( ; insertPosition < products.n_rows - 1;
+ ++insertPosition)
+ if (childFrame.eval > products(insertPosition, queryIndex))
+ break;
+
+ // Insert into the correct position; we are guaranteed that
+ // insertPosition is valid.
+ InsertNeighbor(indices, products, queryIndex, insertPosition,
+ childFrame.node->Point(), childFrame.eval);
+ }
+
+ // Now add this to the queue (if it has any children which may
+ // need to be recursed into).
+ if (childFrame.node->NumChildren() > 0)
+ frameQueue.push(childFrame);
+ }
+ else
+ ++numPrunes;
+ }
+ else
+ ++numPrunes;
+ }
+ }
+
+ frameQueue.pop();
+ }
+ }
+
+ Log::Info << "Pruned " << numPrunes << " nodes." << std::endl;
+ Log::Info << "Kernel evaluations: " << kernelEvaluations << "."
+ << std::endl;
+ Log::Info << "Distance evaluations: " << distanceEvaluations << "."
+ << std::endl;
+
+ Timer::Stop("computing_products");
+ return;
+ }
+
+ // Double-tree implementation.
+ Log::Fatal << "Dual-tree search not implemented yet... oops..." << std::endl;
+
+}
+
+}; // namespace fastmks
+}; // namespace mlpack
+
+#endif
Copied: mlpack/trunk/src/mlpack/methods/fastmks/fastmks_main.cpp (from rev 13837, mlpack/trunk/src/mlpack/methods/fastmks/max_ip_main.cpp)
===================================================================
--- mlpack/trunk/src/mlpack/methods/fastmks/fastmks_main.cpp (rev 0)
+++ mlpack/trunk/src/mlpack/methods/fastmks/fastmks_main.cpp 2012-11-05 21:53:56 UTC (rev 13844)
@@ -0,0 +1,283 @@
+/**
+ * @file fastmks_main.cpp
+ * @author Ryan Curtin
+ *
+ * Main executable for maximum inner product search.
+ */
+// I can't believe I'm doing this.
+#include <stddef.h>
+size_t distanceEvaluations;
+
+#include <mlpack/core.hpp>
+#include <mlpack/core/kernels/linear_kernel.hpp>
+#include <mlpack/core/kernels/polynomial_kernel.hpp>
+#include <mlpack/core/kernels/cosine_distance.hpp>
+#include <mlpack/core/kernels/gaussian_kernel.hpp>
+#include <mlpack/core/kernels/hyperbolic_tangent_kernel.hpp>
+#include <mlpack/core/kernels/triangular_kernel.hpp>
+#include <mlpack/core/kernels/epanechnikov_kernel.hpp>
+
+#include "fastmks.hpp"
+
+using namespace std;
+using namespace mlpack;
+using namespace mlpack::fastmks;
+using namespace mlpack::kernel;
+
+// Needs to be fleshed out a little bit.
+PROGRAM_INFO("FastMKS (Fast Max-Kernel Search)",
+ "This program will find the k maximum kernels of a set of points, "
+ "using a query set and a reference set (which can optionally be the same "
+ "set). More specifically, for each point in the query set, the k points in"
+ " the reference set with maximum inner products are found. The inner "
+ "product used is the kernel function specified by --kernel.");
+
+// Define our input parameters.
+PARAM_STRING_REQ("reference_file", "File containing the reference dataset.",
+ "r");
+PARAM_STRING("query_file", "File containing the query dataset.", "q", "");
+
+PARAM_INT_REQ("k", "Number of maximum inner products to find.", "k");
+
+PARAM_STRING("products_file", "File to save inner products into.", "p", "");
+PARAM_STRING("indices_file", "File to save indices of inner products into.",
+ "i", "");
+
+PARAM_STRING("kernel", "Kernel type to use: 'linear', 'polynomial', 'cosine', "
+ "'gaussian', 'epanechnikov', 'triangular', 'hyptan'.", "K", "linear");
+
+PARAM_FLAG("naive", "If true, O(n^2) naive mode is used for computation.", "N");
+PARAM_FLAG("single", "If true, single-tree search is used (as opposed to "
+ "dual-tree search.", "s");
+
+PARAM_DOUBLE("base", "Base to use during cover tree construction.", "b", 2.0);
+
+PARAM_DOUBLE("degree", "Degree of polynomial kernel.", "d", 2.0);
+PARAM_DOUBLE("offset", "Offset of kernel (for polynomial and hyptan kernels).",
+ "o", 0.0);
+PARAM_DOUBLE("bandwidth", "Bandwidth (for Gaussian, Epanechnikov, and "
+ "triangular kernels).", "w", 1.0);
+PARAM_DOUBLE("scale", "Scale of kernel (for hyptan kernel).", "s", 1.0);
+
+template<typename KernelType>
+void RunFastMKS(const arma::mat& referenceData,
+ const bool single,
+ const bool naive,
+ const double base,
+ const size_t k,
+ arma::Mat<size_t>& indices,
+ arma::mat& products,
+ KernelType& kernel)
+{
+ // Create FastMKS object.
+ FastMKS<KernelType> fastmks(referenceData, kernel, (single && !naive), naive,
+ base);
+
+ // Now search with it.
+ fastmks.Search(k, indices, products);
+}
+
+template<typename KernelType>
+void RunFastMKS(const arma::mat& referenceData,
+ const arma::mat& queryData,
+ const bool single,
+ const bool naive,
+ const double base,
+ const size_t k,
+ arma::Mat<size_t>& indices,
+ arma::mat& products,
+ KernelType& kernel)
+{
+ // Create FastMKS object.
+ FastMKS<KernelType> maxip(referenceData, queryData, kernel,
+ (single && !naive), naive, base);
+
+ // Now search with it.
+ maxip.Search(k, indices, products);
+}
+
+int main(int argc, char** argv)
+{
+ distanceEvaluations = 0;
+
+ CLI::ParseCommandLine(argc, argv);
+
+ // Get reference dataset filename.
+ const string referenceFile = CLI::GetParam<string>("reference_file");
+
+ // The number of max kernel values to find.
+ const size_t k = CLI::GetParam<int>("k");
+
+ // Runtime parameters.
+ const bool naive = CLI::HasParam("naive");
+ const bool single = CLI::HasParam("single");
+
+ // For cover tree construction.
+ const double base = CLI::GetParam<double>("base");
+
+ // Kernel parameters.
+ const string kernelType = CLI::GetParam<string>("kernel");
+ const double degree = CLI::GetParam<double>("degree");
+ const double offset = CLI::GetParam<double>("offset");
+ const double bandwidth = CLI::GetParam<double>("bandwidth");
+ const double scale = CLI::GetParam<double>("scale");
+
+ // The datasets. The query matrix may never be used.
+ arma::mat referenceData;
+ arma::mat queryData;
+
+ data::Load(referenceFile, referenceData, true);
+
+ Log::Info << "Loaded reference data from '" << referenceFile << "' ("
+ << referenceData.n_rows << " x " << referenceData.n_cols << ")." << endl;
+
+ // Sanity check on k value.
+ if (k > referenceData.n_cols)
+ {
+ Log::Fatal << "Invalid k: " << k << "; must be greater than 0 and less ";
+ Log::Fatal << "than or equal to the number of reference points (";
+ Log::Fatal << referenceData.n_cols << ")." << endl;
+ }
+
+ // Check on kernel type.
+ if ((kernelType != "linear") && (kernelType != "polynomial") &&
+ (kernelType != "cosine") && (kernelType != "gaussian") &&
+ (kernelType != "graph") && (kernelType != "approxGraph") &&
+ (kernelType != "triangular") && (kernelType != "hyptan") &&
+ (kernelType != "inv-mq") && (kernelType != "epanechnikov"))
+ {
+ Log::Fatal << "Invalid kernel type: '" << kernelType << "'; must be ";
+ Log::Fatal << "'linear' or 'polynomial'." << endl;
+ }
+
+ // Load the query matrix, if we can.
+ if (CLI::HasParam("query_file"))
+ {
+ const string queryFile = CLI::GetParam<string>("query_file");
+ data::Load(queryFile, queryData, true);
+
+ Log::Info << "Loaded query data from '" << queryFile << "' ("
+ << queryData.n_rows << " x " << queryData.n_cols << ")." << endl;
+ }
+ else
+ {
+ Log::Info << "Using reference dataset as query dataset (--query_file not "
+ << "specified)." << endl;
+ }
+
+ // Naive mode overrides single mode.
+ if (naive && single)
+ {
+ Log::Warn << "--single ignored because --naive is present." << endl;
+ }
+
+ // Matrices for output storage.
+ arma::Mat<size_t> indices;
+ arma::mat products;
+
+ // Construct FastMKS object.
+ if (queryData.n_elem == 0)
+ {
+ if (kernelType == "linear")
+ {
+ LinearKernel lk;
+ RunFastMKS<LinearKernel>(referenceData, single, naive, base, k, indices,
+ products, lk);
+ }
+ else if (kernelType == "polynomial")
+ {
+
+ PolynomialKernel pk(degree, offset);
+ RunFastMKS<PolynomialKernel>(referenceData, single, naive, base, k,
+ indices, products, pk);
+ }
+ else if (kernelType == "cosine")
+ {
+ CosineDistance cd;
+ RunFastMKS<CosineDistance>(referenceData, single, naive, base, k, indices,
+ products, cd);
+ }
+ else if (kernelType == "gaussian")
+ {
+ GaussianKernel gk(bandwidth);
+ RunFastMKS<GaussianKernel>(referenceData, single, naive, base, k, indices,
+ products, gk);
+ }
+ else if (kernelType == "epanechnikov")
+ {
+ EpanechnikovKernel ek(bandwidth);
+ RunFastMKS<EpanechnikovKernel>(referenceData, single, naive, base, k,
+ indices, products, ek);
+ }
+ else if (kernelType == "triangular")
+ {
+ TriangularKernel tk(bandwidth);
+ RunFastMKS<TriangularKernel>(referenceData, single, naive, base, k,
+ indices, products, tk);
+ }
+ else if (kernelType == "hyptan")
+ {
+ HyperbolicTangentKernel htk(scale, offset);
+ RunFastMKS<HyperbolicTangentKernel>(referenceData, single, naive, base, k,
+ indices, products, htk);
+ }
+ }
+ else
+ {
+ if (kernelType == "linear")
+ {
+ LinearKernel lk;
+ RunFastMKS<LinearKernel>(referenceData, queryData, single, naive, base, k,
+ indices, products, lk);
+ }
+ else if (kernelType == "polynomial")
+ {
+ PolynomialKernel pk(degree, offset);
+ RunFastMKS<PolynomialKernel>(referenceData, queryData, single, naive,
+ base, k, indices, products, pk);
+ }
+ else if (kernelType == "cosine")
+ {
+ CosineDistance cd;
+ RunFastMKS<CosineDistance>(referenceData, queryData, single, naive, base,
+ k, indices, products, cd);
+ }
+ else if (kernelType == "gaussian")
+ {
+ GaussianKernel gk(bandwidth);
+ RunFastMKS<GaussianKernel>(referenceData, queryData, single, naive, base,
+ k, indices, products, gk);
+ }
+ else if (kernelType == "epanechnikov")
+ {
+ EpanechnikovKernel ek(bandwidth);
+ RunFastMKS<EpanechnikovKernel>(referenceData, queryData, single, naive,
+ base, k, indices, products, ek);
+ }
+ else if (kernelType == "triangular")
+ {
+ TriangularKernel tk(bandwidth);
+ RunFastMKS<TriangularKernel>(referenceData, queryData, single, naive,
+ base, k, indices, products, tk);
+ }
+ else if (kernelType == "hyptan")
+ {
+ HyperbolicTangentKernel htk(scale, offset);
+ RunFastMKS<HyperbolicTangentKernel>(referenceData, queryData, single,
+ naive, base, k, indices, products, htk);
+ }
+ }
+
+ // Save output, if we were asked to.
+ if (CLI::HasParam("products_file"))
+ {
+ const string productsFile = CLI::GetParam<string>("products_file");
+ data::Save(productsFile, products, false);
+ }
+
+ if (CLI::HasParam("indices_file"))
+ {
+ const string indicesFile = CLI::GetParam<string>("indices_file");
+ data::Save(indicesFile, indices, false);
+ }
+}
Copied: mlpack/trunk/src/mlpack/methods/fastmks/fastmks_rules.hpp (from rev 13837, mlpack/trunk/src/mlpack/methods/fastmks/max_ip_rules.hpp)
===================================================================
--- mlpack/trunk/src/mlpack/methods/fastmks/fastmks_rules.hpp (rev 0)
+++ mlpack/trunk/src/mlpack/methods/fastmks/fastmks_rules.hpp 2012-11-05 21:53:56 UTC (rev 13844)
@@ -0,0 +1,54 @@
+/**
+ * @file fastmks_rules.hpp
+ * @author Ryan Curtin
+ *
+ * Rules for the single or dual tree traversal for fast max-kernel search.
+ */
+#ifndef __MLPACK_METHODS_FASTMKS_FASTMKS_RULES_HPP
+#define __MLPACK_METHODS_FASTMKS_FASTMKS_RULES_HPP
+
+#include <mlpack/core.hpp>
+#include <mlpack/core/tree/cover_tree/cover_tree.hpp>
+
+namespace mlpack {
+namespace fastmks {
+
+template<typename MetricType>
+class FastMKSRules
+{
+ public:
+ FastMKSRules(const arma::mat& referenceSet,
+ const arma::mat& querySet,
+ arma::Mat<size_t>& indices,
+ arma::mat& products);
+
+ void BaseCase(const size_t queryIndex, const size_t referenceIndex);
+
+ bool CanPrune(const size_t queryIndex,
+ tree::CoverTree<MetricType>& referenceNode,
+ const size_t parentIndex);
+
+ private:
+ const arma::mat& referenceSet;
+
+ const arma::mat& querySet;
+
+ arma::Mat<size_t>& indices;
+
+ arma::mat& products;
+
+ arma::vec queryKernels; // || q || for each q.
+
+ void InsertNeighbor(const size_t queryIndex,
+ const size_t pos,
+ const size_t neighbor,
+ const double distance);
+};
+
+}; // namespace fastmks
+}; // namespace mlpack
+
+// Include implementation.
+#include "fastmks_rules_impl.hpp"
+
+#endif
Copied: mlpack/trunk/src/mlpack/methods/fastmks/fastmks_rules_impl.hpp (from rev 13837, mlpack/trunk/src/mlpack/methods/fastmks/max_ip_rules_impl.hpp)
===================================================================
--- mlpack/trunk/src/mlpack/methods/fastmks/fastmks_rules_impl.hpp (rev 0)
+++ mlpack/trunk/src/mlpack/methods/fastmks/fastmks_rules_impl.hpp 2012-11-05 21:53:56 UTC (rev 13844)
@@ -0,0 +1,103 @@
+/**
+ * @file fastmks_rules_impl.hpp
+ * @author Ryan Curtin
+ *
+ * Implementation of FastMKSRules for cover tree search.
+ */
+#ifndef __MLPACK_METHODS_FASTMKS_FASTMKS_RULES_IMPL_HPP
+#define __MLPACK_METHODS_FASTMKS_FASTMKS_RULES_IMPL_HPP
+
+// In case it hasn't already been included.
+#include "fastmks_rules.hpp"
+
+namespace mlpack {
+namespace fastmks {
+
+template<typename MetricType>
+FastMKSRules<MetricType>::FastMKSRules(const arma::mat& referenceSet,
+ const arma::mat& querySet,
+ arma::Mat<size_t>& indices,
+ arma::mat& products) :
+ referenceSet(referenceSet),
+ querySet(querySet),
+ indices(indices),
+ products(products)
+{
+ // Precompute each self-kernel.
+// queryKernels.set_size(querySet.n_cols);
+// for (size_t i = 0; i < querySet.n_cols; ++i)
+// queryKernels[i] = sqrt(MetricType::Kernel::Evaluate(querySet.unsafe_col(i),
+// querySet.unsafe_col(i)));
+}
+
+template<typename MetricType>
+bool FastMKSRules<MetricType>::CanPrune(const size_t queryIndex,
+ tree::CoverTree<MetricType>& referenceNode,
+ const size_t parentIndex)
+{
+ // The maximum possible inner product is given by
+ // <q, p_0> + R_p || q ||
+ // and since we are using cover trees, p_0 is the point referred to by the
+ // node, and R_p will be the expansion constant to the power of the scale plus
+ // one.
+ const double eval = MetricType::Kernel::Evaluate(querySet.col(queryIndex),
+ referenceSet.col(referenceNode.Point()));
+
+ // See if base case can be added.
+ if (eval > products(products.n_rows - 1, queryIndex))
+ {
+ size_t insertPosition;
+ for (insertPosition = 0; insertPosition < indices.n_rows; ++insertPosition)
+ if (eval > products(insertPosition, queryIndex))
+ break;
+
+ // We are guaranteed insertPosition is in the valid range.
+ InsertNeighbor(queryIndex, insertPosition, referenceNode.Point(), eval);
+ }
+
+ double maxProduct = eval + std::pow(referenceNode.ExpansionConstant(),
+ referenceNode.Scale() + 1) *
+ sqrt(MetricType::Kernel::Evaluate(querySet.col(queryIndex),
+ querySet.col(queryIndex)));
+
+ if (maxProduct > products(products.n_rows - 1, queryIndex))
+ return false;
+ else
+ return true;
+}
+
+/**
+ * Helper function to insert a point into the neighbors and distances matrices.
+ *
+ * @param queryIndex Index of point whose neighbors we are inserting into.
+ * @param pos Position in list to insert into.
+ * @param neighbor Index of reference point which is being inserted.
+ * @param distance Distance from query point to reference point.
+ */
+template<typename MetricType>
+void FastMKSRules<MetricType>::InsertNeighbor(const size_t queryIndex,
+ const size_t pos,
+ const size_t neighbor,
+ const double distance)
+{
+ // We only memmove() if there is actually a need to shift something.
+ if (pos < (products.n_rows - 1))
+ {
+ int len = (products.n_rows - 1) - pos;
+ memmove(products.colptr(queryIndex) + (pos + 1),
+ products.colptr(queryIndex) + pos,
+ sizeof(double) * len);
+ memmove(indices.colptr(queryIndex) + (pos + 1),
+ indices.colptr(queryIndex) + pos,
+ sizeof(size_t) * len);
+ }
+
+ // Now put the new information in the right index.
+ products(pos, queryIndex) = distance;
+ indices(pos, queryIndex) = neighbor;
+}
+
+}; // namespace fastmks
+}; // namespace mlpack
+
+#endif
Modified: mlpack/trunk/src/mlpack/methods/fastmks/ip_metric.hpp
===================================================================
--- mlpack/trunk/src/mlpack/methods/fastmks/ip_metric.hpp 2012-11-05 10:00:54 UTC (rev 13843)
+++ mlpack/trunk/src/mlpack/methods/fastmks/ip_metric.hpp 2012-11-05 21:53:56 UTC (rev 13844)
@@ -5,11 +5,11 @@
* Inner product induced metric. If given a kernel function, this gives the
* complementary metric.
*/
-#ifndef __MLPACK_METHODS_MAXIP_IP_METRIC_HPP
-#define __MLPACK_METHODS_MAXIP_IP_METRIC_HPP
+#ifndef __MLPACK_METHODS_FASTMKS_IP_METRIC_HPP
+#define __MLPACK_METHODS_FASTMKS_IP_METRIC_HPP
namespace mlpack {
-namespace maxip /** The maximum inner product problem. */ {
+namespace fastmks /** The fast maximum kernel search problem. */ {
template<typename KernelType>
class IPMetric
@@ -35,7 +35,7 @@
KernelType& kernel;
};
-}; // namespace maxip
+}; // namespace fastmks
}; // namespace mlpack
// Include implementation.
Modified: mlpack/trunk/src/mlpack/methods/fastmks/ip_metric_impl.hpp
===================================================================
--- mlpack/trunk/src/mlpack/methods/fastmks/ip_metric_impl.hpp 2012-11-05 10:00:54 UTC (rev 13843)
+++ mlpack/trunk/src/mlpack/methods/fastmks/ip_metric_impl.hpp 2012-11-05 21:53:56 UTC (rev 13844)
@@ -4,8 +4,8 @@
*
* Implementation of the IPMetric.
*/
-#ifndef __MLPACK_METHODS_MAXIP_IP_METRIC_IMPL_HPP
-#define __MLPACK_METHODS_MAXIP_IP_METRIC_IMPL_HPP
+#ifndef __MLPACK_METHODS_FASTMKS_IP_METRIC_IMPL_HPP
+#define __MLPACK_METHODS_FASTMKS_IP_METRIC_IMPL_HPP
// In case it hasn't been included yet.
#include "ip_metric_impl.hpp"
@@ -14,7 +14,7 @@
#include <mlpack/core/kernels/linear_kernel.hpp>
namespace mlpack {
-namespace maxip {
+namespace fastmks {
template<typename KernelType>
template<typename Vec1Type, typename Vec2Type>
@@ -37,7 +37,7 @@
return metric::LMetric<2, true>::Evaluate(a, b);
}
-}; // namespace maxip
+}; // namespace fastmks
}; // namespace mlpack
#endif
Deleted: mlpack/trunk/src/mlpack/methods/fastmks/max_ip.hpp
===================================================================
--- mlpack/trunk/src/mlpack/methods/fastmks/max_ip.hpp 2012-11-05 10:00:54 UTC (rev 13843)
+++ mlpack/trunk/src/mlpack/methods/fastmks/max_ip.hpp 2012-11-05 21:53:56 UTC (rev 13844)
@@ -1,70 +0,0 @@
-/**
- * @file max_ip.hpp
- * @author Ryan Curtin
- *
- * Definition of the MaxIP class, which is the maximum inner product search.
- */
-#ifndef __MLPACK_METHODS_MAXIP_MAX_IP_HPP
-#define __MLPACK_METHODS_MAXIP_MAX_IP_HPP
-
-#include <mlpack/core.hpp>
-#include "ip_metric.hpp"
-#include <mlpack/core/tree/cover_tree/cover_tree.hpp>
-
-namespace mlpack {
-namespace maxip {
-
-template<typename KernelType>
-class MaxIP
-{
- public:
- MaxIP(const arma::mat& referenceSet,
- KernelType& kernel,
- bool single = false,
- bool naive = false,
- double expansionConstant = 2.0);
-
- MaxIP(const arma::mat& referenceSet,
- const arma::mat& querySet,
- KernelType& kernel,
- bool single = false,
- bool naive = false,
- double expansionConstant = 2.0);
-
- ~MaxIP();
-
- void Search(const size_t k,
- arma::Mat<size_t>& indices,
- arma::mat& products);
-
- private:
- const arma::mat& referenceSet;
-
- const arma::mat& querySet;
-
- tree::CoverTree<IPMetric<KernelType> >* referenceTree;
-
- tree::CoverTree<IPMetric<KernelType> >* queryTree;
-
- bool single;
-
- bool naive;
-
- IPMetric<KernelType> metric;
-
- // Utility function. Copied too many times from too many places.
- void InsertNeighbor(arma::Mat<size_t>& indices,
- arma::mat& products,
- const size_t queryIndex,
- const size_t pos,
- const size_t neighbor,
- const double distance);
-};
-
-}; // namespace maxip
-}; // namespace mlpack
-
-// Include implementation.
-#include "max_ip_impl.hpp"
-
-#endif
Deleted: mlpack/trunk/src/mlpack/methods/fastmks/max_ip_impl.hpp
===================================================================
--- mlpack/trunk/src/mlpack/methods/fastmks/max_ip_impl.hpp 2012-11-05 10:00:54 UTC (rev 13843)
+++ mlpack/trunk/src/mlpack/methods/fastmks/max_ip_impl.hpp 2012-11-05 21:53:56 UTC (rev 13844)
@@ -1,743 +0,0 @@
-/**
- * @file max_ip_impl.hpp
- * @author Ryan Curtin
- *
- * Implementation of the MaxIP class (maximum inner product search).
- */
-#ifndef __MLPACK_METHODS_MAXIP_MAX_IP_IMPL_HPP
-#define __MLPACK_METHODS_MAXIP_MAX_IP_IMPL_HPP
-
-// In case it hasn't yet been included.
-#include "max_ip.hpp"
-
-#include "max_ip_rules.hpp"
-
-#include <mlpack/core/kernels/gaussian_kernel.hpp>
-#include <queue>
-
-namespace mlpack {
-namespace maxip {
-
-template<typename TreeType>
-void RecurseTreeCountLeaves(const TreeType& node, arma::vec& counts)
-{
- for (size_t i = 0; i < node.NumChildren(); ++i)
- {
- if (node.Child(i).NumChildren() == 0)
- counts[node.Child(i).Point()]++;
- else
- RecurseTreeCountLeaves<TreeType>(node.Child(i), counts);
- }
-}
-
-template<typename TreeType>
-void CheckSelfChild(const TreeType& node)
-{
- if (node.NumChildren() == 0)
- return; // No self-child applicable here.
-
- bool found = false;
- for (size_t i = 0; i < node.NumChildren(); ++i)
- {
- if (node.Child(i).Point() == node.Point())
- found = true;
-
- // Recursively check the children.
- CheckSelfChild(node.Child(i));
- }
-
- // Ensure this has its own self-child.
- Log::Assert(found == true);
-}
-
-template<typename TreeType, typename MetricType>
-void CheckCovering(const TreeType& node)
-{
- // Return if a leaf. No checking necessary.
- if (node.NumChildren() == 0)
- return;
-
- const arma::mat& dataset = node.Dataset();
- const size_t nodePoint = node.Point();
-
- // To ensure that this node satisfies the covering principle, we must ensure
- // that the distance to each child is less than pow(expansionConstant, scale).
- double maxDistance = pow(node.ExpansionConstant(), node.Scale());
- for (size_t i = 0; i < node.NumChildren(); ++i)
- {
- const size_t childPoint = node.Child(i).Point();
-
- double distance = MetricType::Evaluate(dataset.col(nodePoint),
- dataset.col(childPoint));
-
- Log::Assert(distance <= maxDistance);
-
- // Check the child.
- CheckCovering<TreeType, MetricType>(node.Child(i));
- }
-}
-
-template<typename TreeType, typename MetricType>
-void CheckIndividualSeparation(const TreeType& constantNode,
- const TreeType& node)
-{
- // Don't check points at a lower scale.
- if (node.Scale() < constantNode.Scale())
- return;
-
- // If at a higher scale, recurse.
- if (node.Scale() > constantNode.Scale())
- {
- for (size_t i = 0; i < node.NumChildren(); ++i)
- {
- // Don't recurse into leaves.
- if (node.Child(i).NumChildren() > 0)
- CheckIndividualSeparation<TreeType, MetricType>(constantNode,
- node.Child(i));
- }
-
- return;
- }
-
- // Don't compare the same point against itself.
- if (node.Point() == constantNode.Point())
- return;
-
- // Now we know we are at the same scale, so make the comparison.
- const arma::mat& dataset = constantNode.Dataset();
- const size_t constantPoint = constantNode.Point();
- const size_t nodePoint = node.Point();
-
- // Make sure the distance is at least the following value (in accordance with
- // the separation principle of cover trees).
- double minDistance = pow(constantNode.ExpansionConstant(),
- constantNode.Scale());
-
- double distance = MetricType::Evaluate(dataset.col(constantPoint),
- dataset.col(nodePoint));
-
-}
-
-template<typename TreeType, typename MetricType>
-void CheckSeparation(const TreeType& node, const TreeType& root)
-{
- // Check the separation between this point and all other points on this scale.
- CheckIndividualSeparation<TreeType, MetricType>(node, root);
-
- // Check the children, but only if they are not leaves. Leaves don't need to
- // be checked.
- for (size_t i = 0; i < node.NumChildren(); ++i)
- if (node.Child(i).NumChildren() > 0)
- CheckSeparation<TreeType, MetricType>(node.Child(i), root);
-}
-
-template<typename TreeType, typename MetricType>
-void GetMaxDistance(TreeType& node, TreeType& constantNode, double& best, size_t& index)
-{
- const arma::mat& dataset = node.Dataset();
- const double eval = MetricType::Evaluate(dataset.unsafe_col(node.Point()),
- dataset.unsafe_col(constantNode.Point()));
- if (eval > best)
- {
- best = eval;
- index = node.Point();
- }
-
- // Recurse into children.
- for (size_t i = 0; i < node.NumChildren(); ++i)
- GetMaxDistance<TreeType, MetricType>(node.Child(i), constantNode, best, index);
-}
-
-template<typename TreeType, typename MetricType>
-void CheckMaxDistances(TreeType& node)
-{
- // Check child distances.
- for (size_t i = 0; i < node.NumChildren(); ++i)
- {
- const arma::mat& dataset = node.Dataset();
- double eval = MetricType::Evaluate(dataset.unsafe_col(node.Point()),
- dataset.unsafe_col(node.Child(i).Point()));
-
-// Log::Debug << "Point " << node.Point() << ", child point " << node.Child(i).Point() << "\n";
-// Log::Debug << "Actual eval " << eval << ", supposed parent distance " << node.Child(i).ParentDistance() << "\n";
-// Log::Debug << "Diff " << eval - node.Child(i).ParentDistance() << "\n";
- Log::Assert(std::abs(eval - node.Child(i).ParentDistance()) < 1e-10);
- }
-
- // Check all descendants.
- double maxDescendantDistance = 0;
- size_t maxIndex = 0;
- GetMaxDistance<TreeType, MetricType>(node, node, maxDescendantDistance, maxIndex);
-
-// Log::Debug << "Point " << node.Point() << ", max desc. index " << maxIndex << "\n";
-// Log::Debug << "Calculated maxDescendantDistance " << maxDescendantDistance << " given " << node.FurthestDescendantDistance() << std::endl;
- Log::Assert(std::abs(maxDescendantDistance - node.FurthestDescendantDistance()) < 1e-10);
-
- for (size_t i = 0; i < node.NumChildren(); ++i)
- CheckMaxDistances<TreeType, MetricType>(node.Child(i));
-}
-
-template<typename TreeType>
-struct SearchFrame
-{
- TreeType* node;
- double eval;
-};
-
-template<typename TreeType>
-class SearchFrameCompare
-{
- public:
- bool operator()(const SearchFrame<TreeType>& lhs,
- const SearchFrame<TreeType>& rhs)
- {
- // Compare scale.
- if (lhs.node->Scale() != rhs.node->Scale())
- return (lhs.node->Scale() < rhs.node->Scale());
- else
- {
- // Now we have to compare by evaluation.
- return (lhs.eval < rhs.eval);
- }
- }
-};
-
-template<typename KernelType>
-MaxIP<KernelType>::MaxIP(const arma::mat& referenceSet,
- KernelType& kernel,
- bool single,
- bool naive,
- double expansionConstant) :
- referenceSet(referenceSet),
- querySet(referenceSet), // Gotta point it somewhere...
- queryTree(NULL),
- single(single),
- naive(naive),
- metric(kernel)
-{
-
- Timer::Start("tree_building");
-
- // Build the tree. Could we do this in the initialization list?
- if (naive)
- referenceTree = NULL;
- else
- referenceTree = new tree::CoverTree<IPMetric<KernelType> >(referenceSet,
- expansionConstant, &metric);
-
- Timer::Stop("tree_building");
-}
-
-template<typename KernelType>
-MaxIP<KernelType>::MaxIP(const arma::mat& referenceSet,
- const arma::mat& querySet,
- KernelType& kernel,
- bool single,
- bool naive,
- double expansionConstant) :
- referenceSet(referenceSet),
- querySet(querySet),
- single(single),
- naive(naive),
- metric(kernel)
-{
- Timer::Start("tree_building");
-
- // Build the trees. Could we do this in the initialization lists?
- if (naive)
- referenceTree = NULL;
- else
- referenceTree = new tree::CoverTree<IPMetric<KernelType> >(referenceSet,
- expansionConstant, &metric);
-
- if (single || naive)
- queryTree = NULL;
- else
- queryTree = new tree::CoverTree<IPMetric<KernelType> >(querySet,
- expansionConstant, &metric);
-
-/* if (referenceTree != NULL)
- {
- Log::Debug << "Check counts" << std::endl;
- // Now loop through the tree and ensure that each leaf is only created once.
- arma::vec counts;
- counts.zeros(referenceSet.n_elem);
- RecurseTreeCountLeaves(*referenceTree, counts);
-
- // Each point should only have one leaf node representing it.
- for (size_t i = 0; i < 20; ++i)
- Log::Assert(counts[i] == 1);
-
- Log::Debug << "Check self child\n";
- // Each non-leaf should have a self-child.
- CheckSelfChild<tree::CoverTree<IPMetric<KernelType> > >(*referenceTree);
-
- Log::Debug << "Check covering\n";
- // Each node must satisfy the covering principle (its children must be less
- // than or equal to a certain distance apart).
- CheckCovering<tree::CoverTree<IPMetric<KernelType> >, IPMetric<KernelType> >(*referenceTree);
-
- Log::Debug << "Check max distances\n";
- // Check maximum distance of children and grandchildren.
- CheckMaxDistances<tree::CoverTree<IPMetric<KernelType> >, IPMetric<KernelType> >(*referenceTree);
- Log::Debug << "Done\n";
- }*/
-
- Timer::Stop("tree_building");
-}
-
-template<typename KernelType>
-MaxIP<KernelType>::~MaxIP()
-{
- if (queryTree)
- delete queryTree;
- if (referenceTree)
- delete referenceTree;
-}
-
-template<typename KernelType>
-void MaxIP<KernelType>::Search(const size_t k,
- arma::Mat<size_t>& indices,
- arma::mat& products)
-{
- // No remapping will be necessary.
- indices.set_size(k, querySet.n_cols);
- products.set_size(k, querySet.n_cols);
- products.fill(-1.0);
-
- Timer::Start("computing_products");
-
- size_t kernelEvaluations = 0;
-
- // Naive implementation.
- if (naive)
- {
- // Simple double loop. Stupid, slow, but a good benchmark.
- for (size_t q = 0; q < querySet.n_cols; ++q)
- {
- for (size_t r = 0; r < referenceSet.n_cols; ++r)
- {
- const double eval = metric.Kernel().Evaluate(querySet.unsafe_col(q),
- referenceSet.unsafe_col(r));
- ++kernelEvaluations;
-
- size_t insertPosition;
- for (insertPosition = 0; insertPosition < indices.n_rows;
- ++insertPosition)
- if (eval > products(insertPosition, q))
- break;
-
- if (insertPosition < indices.n_rows)
- InsertNeighbor(indices, products, q, insertPosition, r, eval);
- }
- }
-
- Timer::Stop("computing_products");
-
- Log::Info << "Kernel evaluations: " << kernelEvaluations << "." << std::endl;
- return;
- }
-
- // Single-tree implementation.
- if (single)
- {
- // Calculate number of pruned nodes.
- size_t numPrunes = 0;
-
- // Precalculate query products ( || q || for all q).
- arma::vec queryProducts(querySet.n_cols);
- for (size_t queryIndex = 0; queryIndex < querySet.n_cols; ++queryIndex)
- queryProducts[queryIndex] = sqrt(metric.Kernel().Evaluate(
- querySet.unsafe_col(queryIndex), querySet.unsafe_col(queryIndex)));
- kernelEvaluations += querySet.n_cols;
-
- // Screw the CoverTreeTraverser, we'll implement it by hand.
- for (size_t queryIndex = 0; queryIndex < querySet.n_cols; ++queryIndex)
- {
- // Use an array of priority queues?
- std::priority_queue<
- SearchFrame<tree::CoverTree<IPMetric<KernelType> > >,
- std::vector<SearchFrame<tree::CoverTree<IPMetric<KernelType> > > >,
- SearchFrameCompare<tree::CoverTree<IPMetric<KernelType> > > >
- frameQueue;
-
- // Add initial frame.
- SearchFrame<tree::CoverTree<IPMetric<KernelType> > > nextFrame;
- nextFrame.node = referenceTree;
- nextFrame.eval = metric.Kernel().Evaluate(querySet.unsafe_col(queryIndex),
- referenceSet.unsafe_col(referenceTree->Point()));
- ++kernelEvaluations;
-
- // The initial evaluation will be the best so far.
- indices(0, queryIndex) = referenceTree->Point();
- products(0, queryIndex) = nextFrame.eval;
-
- frameQueue.push(nextFrame);
-
- tree::CoverTree<IPMetric<KernelType> >* referenceNode;
- double eval;
- double maxProduct;
-
- while (!frameQueue.empty())
- {
- // Get the information for this node.
- const SearchFrame<tree::CoverTree<IPMetric<KernelType> > >& frame =
- frameQueue.top();
-
- referenceNode = frame.node;
- eval = frame.eval;
-
- // Loop through the children, seeing if we can prune them; if not, add
- // them to the queue. The self-child is different -- it has the same
- // parent (and therefore the same kernel evaluation).
- if (referenceNode->NumChildren() > 0)
- {
- SearchFrame<tree::CoverTree<IPMetric<KernelType> > > childFrame;
-
- // We must handle the self-child differently, to avoid adding it to
- // the results twice.
- childFrame.node = &(referenceNode->Child(0));
- childFrame.eval = eval;
-
-// maxProduct = eval + std::pow(childFrame.node->ExpansionConstant(),
-// childFrame.node->Scale() + 1) * queryProducts[queryIndex];
- // Alternate pruning rule.
- maxProduct = eval + childFrame.node->FurthestDescendantDistance() *
- queryProducts[queryIndex];
-
- // Add self-child if we can't prune it.
- if (maxProduct > products(products.n_rows - 1, queryIndex))
- {
- // But only if it has children of its own.
- if (childFrame.node->NumChildren() > 0)
- frameQueue.push(childFrame);
- }
- else
- ++numPrunes;
-
- for (size_t i = 1; i < referenceNode->NumChildren(); ++i)
- {
- // Before we evaluate the child, let's see if it can possibly have
- // a better evaluation.
- double maxChildEval = eval + queryProducts[queryIndex] *
- (referenceNode->Child(i).ParentDistance() +
- referenceNode->Child(i).FurthestDescendantDistance());
-
- if (maxChildEval <= products(products.n_rows - 1, queryIndex))
- {
- ++numPrunes;
- continue; // Skip this child; it can't be any better.
- }
-
- // Evaluate child.
- childFrame.node = &(referenceNode->Child(i));
- childFrame.eval = metric.Kernel().Evaluate(
- querySet.unsafe_col(queryIndex),
- referenceSet.unsafe_col(referenceNode->Child(i).Point()));
- ++kernelEvaluations;
-
- // Can we prune it? If we can, we can avoid putting it in the queue
- // (saves time).
-// double maxProduct = childFrame.eval +
-// std::pow(childFrame.node->ExpansionConstant(),
-// childFrame.node->Scale() + 1) * queryProducts[queryIndex];
- // Alternate pruning rule.
- maxProduct = childFrame.eval + queryProducts[queryIndex] *
- childFrame.node->FurthestDescendantDistance();
-
- if (maxProduct > products(products.n_rows - 1, queryIndex))
- {
- // Good enough to recurse into. While we're at it, check the
- // actual evaluation and see if it's an improvement.
- if (childFrame.eval > products(products.n_rows - 1, queryIndex))
- {
- // This is a better result. Find out where to insert.
- size_t insertPosition = 0;
- for ( ; insertPosition < products.n_rows - 1; ++insertPosition)
- if (childFrame.eval > products(insertPosition, queryIndex))
- break;
-
- // Insert into the correct position; we are guaranteed that
- // insertPosition is valid.
- InsertNeighbor(indices, products, queryIndex, insertPosition,
- childFrame.node->Point(), childFrame.eval);
- }
-
- // Now add this to the queue (if it has any children which may
- // need to be recursed into).
- if (childFrame.node->NumChildren() > 0)
- frameQueue.push(childFrame);
- }
- else
- {
- ++numPrunes;
- }
- }
- }
-
- frameQueue.pop();
- }
- }
-
- Log::Info << "Pruned " << numPrunes << " nodes." << std::endl;
- Log::Info << "Kernel evaluations: " << kernelEvaluations << "."
- << std::endl;
- Log::Info << "Distance evaluations: " << distanceEvaluations << "."
- << std::endl;
-
- Timer::Stop("computing_products");
- return;
- }
-
- // Double-tree implementation.
- Log::Fatal << "Dual-tree search not implemented yet... oops..." << std::endl;
-}
-
-/**
- * Helper function to insert a point into the neighbors and distances matrices.
- *
- * @param queryIndex Index of point whose neighbors we are inserting into.
- * @param pos Position in list to insert into.
- * @param neighbor Index of reference point which is being inserted.
- * @param distance Distance from query point to reference point.
- */
-template<typename KernelType>
-void MaxIP<KernelType>::InsertNeighbor(arma::Mat<size_t>& indices,
- arma::mat& products,
- const size_t queryIndex,
- const size_t pos,
- const size_t neighbor,
- const double distance)
-{
- // We only memmove() if there is actually a need to shift something.
- if (pos < (products.n_rows - 1))
- {
- int len = (products.n_rows - 1) - pos;
- memmove(products.colptr(queryIndex) + (pos + 1),
- products.colptr(queryIndex) + pos,
- sizeof(double) * len);
- memmove(indices.colptr(queryIndex) + (pos + 1),
- indices.colptr(queryIndex) + pos,
- sizeof(size_t) * len);
- }
-
- // Now put the new information in the right index.
- products(pos, queryIndex) = distance;
- indices(pos, queryIndex) = neighbor;
-}
-
-// Specialized implementation for tighter bounds for Gaussian.
-template<>
-void MaxIP<kernel::GaussianKernel>::Search(const size_t k,
- arma::Mat<size_t>& indices,
- arma::mat& products)
-{
- Log::Warn << "Alternate implementation!" << std::endl;
-
- // Terrible copypasta. Bad bad bad.
- // No remapping will be necessary.
- indices.set_size(k, querySet.n_cols);
- products.set_size(k, querySet.n_cols);
- products.fill(-1.0);
-
- Timer::Start("computing_products");
-
- size_t kernelEvaluations = 0;
-
- // Naive implementation.
- if (naive)
- {
- // Simple double loop. Stupid, slow, but a good benchmark.
- for (size_t q = 0; q < querySet.n_cols; ++q)
- {
- for (size_t r = 0; r < referenceSet.n_cols; ++r)
- {
- const double eval = metric.Kernel().Evaluate(querySet.unsafe_col(q),
- referenceSet.unsafe_col(r));
- ++kernelEvaluations;
-
- size_t insertPosition;
- for (insertPosition = 0; insertPosition < indices.n_rows;
- ++insertPosition)
- if (eval > products(insertPosition, q))
- break;
-
- if (insertPosition < indices.n_rows)
- InsertNeighbor(indices, products, q, insertPosition, r, eval);
- }
- }
-
- Timer::Stop("computing_products");
-
- Log::Info << "Kernel evaluations: " << kernelEvaluations << "." << std::endl;
- return;
- }
-
- // Single-tree implementation.
- if (single)
- {
- // Calculate number of pruned nodes.
- size_t numPrunes = 0;
-
- // Precalculate query products ( || q || for all q).
- arma::vec queryProducts(querySet.n_cols);
- for (size_t queryIndex = 0; queryIndex < querySet.n_cols; ++queryIndex)
- queryProducts[queryIndex] = sqrt(metric.Kernel().Evaluate(
- querySet.unsafe_col(queryIndex), querySet.unsafe_col(queryIndex)));
- kernelEvaluations += querySet.n_cols;
-
- // Screw the CoverTreeTraverser, we'll implement it by hand.
- for (size_t queryIndex = 0; queryIndex < querySet.n_cols; ++queryIndex)
- {
- // Use an array of priority queues?
- std::priority_queue<
- SearchFrame<tree::CoverTree<IPMetric<kernel::GaussianKernel> > >,
- std::vector<SearchFrame<tree::CoverTree<IPMetric<kernel::GaussianKernel> > > >,
- SearchFrameCompare<tree::CoverTree<IPMetric<kernel::GaussianKernel> > > >
- frameQueue;
-
- // Add initial frame.
- SearchFrame<tree::CoverTree<IPMetric<kernel::GaussianKernel> > > nextFrame;
- nextFrame.node = referenceTree;
- nextFrame.eval = metric.Kernel().Evaluate(querySet.unsafe_col(queryIndex),
- referenceSet.unsafe_col(referenceTree->Point()));
- Log::Assert(nextFrame.eval <= 1);
- ++kernelEvaluations;
-
- // The initial evaluation will be the best so far.
- indices(0, queryIndex) = referenceTree->Point();
- products(0, queryIndex) = nextFrame.eval;
-
- frameQueue.push(nextFrame);
-
- tree::CoverTree<IPMetric<kernel::GaussianKernel> >* referenceNode;
- double eval;
- double maxProduct;
-
- while (!frameQueue.empty())
- {
- // Get the information for this node.
- const SearchFrame<tree::CoverTree<IPMetric<kernel::GaussianKernel> > >&
- frame = frameQueue.top();
-
- referenceNode = frame.node;
- eval = frame.eval;
-
- // Loop through the children, seeing if we can prune them; if not, add
- // them to the queue. The self-child is different -- it has the same
- // parent (and therefore the same kernel evaluation).
- if (referenceNode->NumChildren() > 0)
- {
- SearchFrame<tree::CoverTree<IPMetric<kernel::GaussianKernel> > >
- childFrame;
-
- // We must handle the self-child differently, to avoid adding it to
- // the results twice.
- childFrame.node = &(referenceNode->Child(0));
- childFrame.eval = eval;
-
- // Alternate pruning rule.
- const double mdd = childFrame.node->FurthestDescendantDistance();
- if (eval >= (1 - std::pow(mdd, 2.0) / 2.0))
- maxProduct = 1;
- else
- maxProduct = eval * (1 - std::pow(mdd, 2.0) / 2.0) + sqrt(1 -
- std::pow(eval, 2.0)) * mdd * sqrt(1 - std::pow(mdd, 2.0) / 4.0);
-
- // Add self-child if we can't prune it.
- if (maxProduct > products(products.n_rows - 1, queryIndex))
- {
- // But only if it has children of its own.
- if (childFrame.node->NumChildren() > 0)
- frameQueue.push(childFrame);
- }
- else
- ++numPrunes;
-
- for (size_t i = 1; i < referenceNode->NumChildren(); ++i)
- {
- // Before we evaluate the child, let's see if it can possibly have
- // a better evaluation.
- const double mpdd = std::min(
- referenceNode->Child(i).ParentDistance() +
- referenceNode->Child(i).FurthestDescendantDistance(), 2.0);
- double maxChildEval = 1;
- if (eval < (1 - std::pow(mpdd, 2.0) / 2.0))
- maxChildEval = eval * (1 - std::pow(mpdd, 2.0) / 2.0) + sqrt(1 -
- std::pow(eval, 2.0)) * mpdd * sqrt(1 - std::pow(mpdd, 2.0)
- / 4.0);
-
- if (maxChildEval > products(products.n_rows - 1, queryIndex))
- {
- // Evaluate child.
- childFrame.node = &(referenceNode->Child(i));
- childFrame.eval = metric.Kernel().Evaluate(
- querySet.unsafe_col(queryIndex),
- referenceSet.unsafe_col(referenceNode->Child(i).Point()));
- ++kernelEvaluations;
-
- // Can we prune it? If we can, we can avoid putting it in the
- // queue (saves time).
- const double cmdd = childFrame.node->FurthestDescendantDistance();
- if (childFrame.eval > (1 - std::pow(cmdd, 2.0) / 2.0))
- maxProduct = 1;
- else
- maxProduct = childFrame.eval * (1 - std::pow(cmdd, 2.0) / 2.0)
- + sqrt(1 - std::pow(eval, 2.0)) * cmdd * sqrt(1 -
- std::pow(cmdd, 2.0) / 4.0);
-
- if (maxProduct > products(products.n_rows - 1, queryIndex))
- {
- // Good enough to recurse into. While we're at it, check the
- // actual evaluation and see if it's an improvement.
- if (childFrame.eval > products(products.n_rows - 1, queryIndex))
- {
- // This is a better result. Find out where to insert.
- size_t insertPosition = 0;
- for ( ; insertPosition < products.n_rows - 1;
- ++insertPosition)
- if (childFrame.eval > products(insertPosition, queryIndex))
- break;
-
- // Insert into the correct position; we are guaranteed that
- // insertPosition is valid.
- InsertNeighbor(indices, products, queryIndex, insertPosition,
- childFrame.node->Point(), childFrame.eval);
- }
-
- // Now add this to the queue (if it has any children which may
- // need to be recursed into).
- if (childFrame.node->NumChildren() > 0)
- frameQueue.push(childFrame);
- }
- else
- ++numPrunes;
- }
- else
- ++numPrunes;
- }
- }
-
- frameQueue.pop();
- }
- }
-
- Log::Info << "Pruned " << numPrunes << " nodes." << std::endl;
- Log::Info << "Kernel evaluations: " << kernelEvaluations << "."
- << std::endl;
- Log::Info << "Distance evaluations: " << distanceEvaluations << "."
- << std::endl;
-
- Timer::Stop("computing_products");
- return;
- }
-
- // Double-tree implementation.
- Log::Fatal << "Dual-tree search not implemented yet... oops..." << std::endl;
-
-}
-
-}; // namespace maxip
-}; // namespace mlpack
-
-#endif
Deleted: mlpack/trunk/src/mlpack/methods/fastmks/max_ip_main.cpp
===================================================================
--- mlpack/trunk/src/mlpack/methods/fastmks/max_ip_main.cpp 2012-11-05 10:00:54 UTC (rev 13843)
+++ mlpack/trunk/src/mlpack/methods/fastmks/max_ip_main.cpp 2012-11-05 21:53:56 UTC (rev 13844)
@@ -1,284 +0,0 @@
-/**
- * @file max_ip_main.cpp
- * @author Ryan Curtin
- *
- * Main executable for maximum inner product search.
- */
-// I can't believe I'm doing this.
-#include <stddef.h>
-size_t distanceEvaluations;
-
-#include <mlpack/core.hpp>
-#include <mlpack/core/kernels/linear_kernel.hpp>
-#include <mlpack/core/kernels/polynomial_kernel.hpp>
-#include <mlpack/core/kernels/cosine_distance.hpp>
-#include <mlpack/core/kernels/gaussian_kernel.hpp>
-#include <mlpack/core/kernels/hyperbolic_tangent_kernel.hpp>
-#include <mlpack/core/kernels/triangular_kernel.hpp>
-#include <mlpack/core/kernels/epanechnikov_kernel.hpp>
-
-#include "max_ip.hpp"
-
-using namespace std;
-using namespace mlpack;
-using namespace mlpack::maxip;
-using namespace mlpack::kernel;
-
-// Needs to be fleshed out a little bit.
-PROGRAM_INFO("Maximum Inner Product Search",
- "This program will find the k maximum inner products of a set of points, "
- "using a query set and a reference set (which can optionally be the same "
- "set). More specifically, for each point in the query set, the k points in"
- " the reference set with maximum inner products are found. The inner "
- "product used is the kernel function specified by --kernel. Currently the"
- " linear kernel is all that is used.");
-
-// Define our input parameters.
-PARAM_STRING_REQ("reference_file", "File containing the reference dataset.",
- "r");
-PARAM_STRING("query_file", "File containing the query dataset.", "q", "");
-
-PARAM_INT_REQ("k", "Number of maximum inner products to find.", "k");
-
-PARAM_STRING("products_file", "File to save inner products into.", "p", "");
-PARAM_STRING("indices_file", "File to save indices of inner products into.",
- "i", "");
-
-PARAM_STRING("kernel", "Kernel type to use: 'linear', 'polynomial', 'cosine', "
- "'gaussian', 'epanechnikov', 'triangular', 'hyptan'.", "K", "linear");
-
-PARAM_FLAG("naive", "If true, O(n^2) naive mode is used for computation.", "N");
-PARAM_FLAG("single", "If true, single-tree search is used (as opposed to "
- "dual-tree search.", "s");
-
-PARAM_DOUBLE("base", "Base to use during cover tree construction.", "b", 2.0);
-
-PARAM_DOUBLE("degree", "Degree of polynomial kernel.", "d", 2.0);
-PARAM_DOUBLE("offset", "Offset of kernel (for polynomial and hyptan kernels).",
- "o", 0.0);
-PARAM_DOUBLE("bandwidth", "Bandwidth (for Gaussian, Epanechnikov, and "
- "triangular kernels).", "w", 1.0);
-PARAM_DOUBLE("scale", "Scale of kernel (for hyptan kernel).", "s", 1.0);
-
-template<typename KernelType>
-void RunMaxIP(const arma::mat& referenceData,
- const bool single,
- const bool naive,
- const double base,
- const size_t k,
- arma::Mat<size_t>& indices,
- arma::mat& products,
- KernelType& kernel)
-{
- // Create MaxIP object.
- MaxIP<KernelType> maxip(referenceData, kernel, (single && !naive), naive,
- base);
-
- // Now search with it.
- maxip.Search(k, indices, products);
-}
-
-template<typename KernelType>
-void RunMaxIP(const arma::mat& referenceData,
- const arma::mat& queryData,
- const bool single,
- const bool naive,
- const double base,
- const size_t k,
- arma::Mat<size_t>& indices,
- arma::mat& products,
- KernelType& kernel)
-{
- // Create MaxIP object.
- MaxIP<KernelType> maxip(referenceData, queryData, kernel, (single && !naive),
- naive, base);
-
- // Now search with it.
- maxip.Search(k, indices, products);
-}
-
-int main(int argc, char** argv)
-{
- distanceEvaluations = 0;
-
- CLI::ParseCommandLine(argc, argv);
-
- // Get reference dataset filename.
- const string referenceFile = CLI::GetParam<string>("reference_file");
-
- // The number of max kernel values to find.
- const size_t k = CLI::GetParam<int>("k");
-
- // Runtime parameters.
- const bool naive = CLI::HasParam("naive");
- const bool single = CLI::HasParam("single");
-
- // For cover tree construction.
- const double base = CLI::GetParam<double>("base");
-
- // Kernel parameters.
- const string kernelType = CLI::GetParam<string>("kernel");
- const double degree = CLI::GetParam<double>("degree");
- const double offset = CLI::GetParam<double>("offset");
- const double bandwidth = CLI::GetParam<double>("bandwidth");
- const double scale = CLI::GetParam<double>("scale");
-
- // The datasets. The query matrix may never be used.
- arma::mat referenceData;
- arma::mat queryData;
-
- data::Load(referenceFile, referenceData, true);
-
- Log::Info << "Loaded reference data from '" << referenceFile << "' ("
- << referenceData.n_rows << " x " << referenceData.n_cols << ")." << endl;
-
- // Sanity check on k value.
- if (k > referenceData.n_cols)
- {
- Log::Fatal << "Invalid k: " << k << "; must be greater than 0 and less ";
- Log::Fatal << "than or equal to the number of reference points (";
- Log::Fatal << referenceData.n_cols << ")." << endl;
- }
-
- // Check on kernel type.
- if ((kernelType != "linear") && (kernelType != "polynomial") &&
- (kernelType != "cosine") && (kernelType != "gaussian") &&
- (kernelType != "graph") && (kernelType != "approxGraph") &&
- (kernelType != "triangular") && (kernelType != "hyptan") &&
- (kernelType != "inv-mq") && (kernelType != "epanechnikov"))
- {
- Log::Fatal << "Invalid kernel type: '" << kernelType << "'; must be ";
- Log::Fatal << "'linear' or 'polynomial'." << endl;
- }
-
- // Load the query matrix, if we can.
- if (CLI::HasParam("query_file"))
- {
- const string queryFile = CLI::GetParam<string>("query_file");
- data::Load(queryFile, queryData, true);
-
- Log::Info << "Loaded query data from '" << queryFile << "' ("
- << queryData.n_rows << " x " << queryData.n_cols << ")." << endl;
- }
- else
- {
- Log::Info << "Using reference dataset as query dataset (--query_file not "
- << "specified)." << endl;
- }
-
- // Naive mode overrides single mode.
- if (naive && single)
- {
- Log::Warn << "--single ignored because --naive is present." << endl;
- }
-
- // Matrices for output storage.
- arma::Mat<size_t> indices;
- arma::mat products;
-
- // Construct MaxIP object.
- if (queryData.n_elem == 0)
- {
- if (kernelType == "linear")
- {
- LinearKernel lk;
- RunMaxIP<LinearKernel>(referenceData, single, naive, base, k, indices,
- products, lk);
- }
- else if (kernelType == "polynomial")
- {
-
- PolynomialKernel pk(degree, offset);
- RunMaxIP<PolynomialKernel>(referenceData, single, naive, base, k, indices,
- products, pk);
- }
- else if (kernelType == "cosine")
- {
- CosineDistance cd;
- RunMaxIP<CosineDistance>(referenceData, single, naive, base, k, indices,
- products, cd);
- }
- else if (kernelType == "gaussian")
- {
- GaussianKernel gk(bandwidth);
- RunMaxIP<GaussianKernel>(referenceData, single, naive, base, k, indices,
- products, gk);
- }
- else if (kernelType == "epanechnikov")
- {
- EpanechnikovKernel ek(bandwidth);
- RunMaxIP<EpanechnikovKernel>(referenceData, single, naive, base, k,
- indices, products, ek);
- }
- else if (kernelType == "triangular")
- {
- TriangularKernel tk(bandwidth);
- RunMaxIP<TriangularKernel>(referenceData, single, naive, base, k, indices,
- products, tk);
- }
- else if (kernelType == "hyptan")
- {
- HyperbolicTangentKernel htk(scale, offset);
- RunMaxIP<HyperbolicTangentKernel>(referenceData, single, naive, base, k,
- indices, products, htk);
- }
- }
- else
- {
- if (kernelType == "linear")
- {
- LinearKernel lk;
- RunMaxIP<LinearKernel>(referenceData, queryData, single, naive, base, k,
- indices, products, lk);
- }
- else if (kernelType == "polynomial")
- {
- PolynomialKernel pk(degree, offset);
- RunMaxIP<PolynomialKernel>(referenceData, queryData, single, naive, base,
- k, indices, products, pk);
- }
- else if (kernelType == "cosine")
- {
- CosineDistance cd;
- RunMaxIP<CosineDistance>(referenceData, queryData, single, naive, base, k,
- indices, products, cd);
- }
- else if (kernelType == "gaussian")
- {
- GaussianKernel gk(bandwidth);
- RunMaxIP<GaussianKernel>(referenceData, queryData, single, naive, base, k,
- indices, products, gk);
- }
- else if (kernelType == "epanechnikov")
- {
- EpanechnikovKernel ek(bandwidth);
- RunMaxIP<EpanechnikovKernel>(referenceData, queryData, single, naive,
- base, k, indices, products, ek);
- }
- else if (kernelType == "triangular")
- {
- TriangularKernel tk(bandwidth);
- RunMaxIP<TriangularKernel>(referenceData, queryData, single, naive, base,
- k, indices, products, tk);
- }
- else if (kernelType == "hyptan")
- {
- HyperbolicTangentKernel htk(scale, offset);
- RunMaxIP<HyperbolicTangentKernel>(referenceData, queryData, single, naive,
- base, k, indices, products, htk);
- }
- }
-
- // Save output, if we were asked to.
- if (CLI::HasParam("products_file"))
- {
- const string productsFile = CLI::GetParam<string>("products_file");
- data::Save(productsFile, products, false);
- }
-
- if (CLI::HasParam("indices_file"))
- {
- const string indicesFile = CLI::GetParam<string>("indices_file");
- data::Save(indicesFile, indices, false);
- }
-}
Deleted: mlpack/trunk/src/mlpack/methods/fastmks/max_ip_rules.hpp
===================================================================
--- mlpack/trunk/src/mlpack/methods/fastmks/max_ip_rules.hpp 2012-11-05 10:00:54 UTC (rev 13843)
+++ mlpack/trunk/src/mlpack/methods/fastmks/max_ip_rules.hpp 2012-11-05 21:53:56 UTC (rev 13844)
@@ -1,55 +0,0 @@
-/**
- * @file max_ip_rules.hpp
- * @author Ryan Curtin
- *
- * Rules for the single or dual tree traversal for the maximum inner product
- * search.
- */
-#ifndef __MLPACK_METHODS_MAXIP_MAX_IP_RULES_HPP
-#define __MLPACK_METHODS_MAXIP_MAX_IP_RULES_HPP
-
-#include <mlpack/core.hpp>
-#include <mlpack/core/tree/cover_tree/cover_tree.hpp>
-
-namespace mlpack {
-namespace maxip {
-
-template<typename MetricType>
-class MaxIPRules
-{
- public:
- MaxIPRules(const arma::mat& referenceSet,
- const arma::mat& querySet,
- arma::Mat<size_t>& indices,
- arma::mat& products);
-
- void BaseCase(const size_t queryIndex, const size_t referenceIndex);
-
- bool CanPrune(const size_t queryIndex,
- tree::CoverTree<MetricType>& referenceNode,
- const size_t parentIndex);
-
- private:
- const arma::mat& referenceSet;
-
- const arma::mat& querySet;
-
- arma::Mat<size_t>& indices;
-
- arma::mat& products;
-
- arma::vec queryKernels; // || q || for each q.
-
- void InsertNeighbor(const size_t queryIndex,
- const size_t pos,
- const size_t neighbor,
- const double distance);
-};
-
-}; // namespace maxip
-}; // namespace mlpack
-
-// Include implementation.
-#include "max_ip_rules_impl.hpp"
-
-#endif
Deleted: mlpack/trunk/src/mlpack/methods/fastmks/max_ip_rules_impl.hpp
===================================================================
--- mlpack/trunk/src/mlpack/methods/fastmks/max_ip_rules_impl.hpp 2012-11-05 10:00:54 UTC (rev 13843)
+++ mlpack/trunk/src/mlpack/methods/fastmks/max_ip_rules_impl.hpp 2012-11-05 21:53:56 UTC (rev 13844)
@@ -1,103 +0,0 @@
-/**
- * @file max_ip_rules_impl.hpp
- * @author Ryan Curtin
- *
- * Implementation of MaxIPRules for cover tree search.
- */
-#ifndef __MLPACK_METHODS_MAXIP_MAX_IP_RULES_IMPL_HPP
-#define __MLPACK_METHODS_MAXIP_MAX_IP_RULES_IMPL_HPP
-
-// In case it hasn't already been included.
-#include "max_ip_rules.hpp"
-
-namespace mlpack {
-namespace maxip {
-
-template<typename MetricType>
-MaxIPRules<MetricType>::MaxIPRules(const arma::mat& referenceSet,
- const arma::mat& querySet,
- arma::Mat<size_t>& indices,
- arma::mat& products) :
- referenceSet(referenceSet),
- querySet(querySet),
- indices(indices),
- products(products)
-{
- // Precompute each self-kernel.
-// queryKernels.set_size(querySet.n_cols);
-// for (size_t i = 0; i < querySet.n_cols; ++i)
-// queryKernels[i] = sqrt(MetricType::Kernel::Evaluate(querySet.unsafe_col(i),
-// querySet.unsafe_col(i)));
-}
-
-template<typename MetricType>
-bool MaxIPRules<MetricType>::CanPrune(const size_t queryIndex,
- tree::CoverTree<MetricType>& referenceNode,
- const size_t parentIndex)
-{
- // The maximum possible inner product is given by
- // <q, p_0> + R_p || q ||
- // and since we are using cover trees, p_0 is the point referred to by the
- // node, and R_p will be the expansion constant to the power of the scale plus
- // one.
- const double eval = MetricType::Kernel::Evaluate(querySet.col(queryIndex),
- referenceSet.col(referenceNode.Point()));
-
- // See if base case can be added.
- if (eval > products(products.n_rows - 1, queryIndex))
- {
- size_t insertPosition;
- for (insertPosition = 0; insertPosition < indices.n_rows; ++insertPosition)
- if (eval > products(insertPosition, queryIndex))
- break;
-
- // We are guaranteed insertPosition is in the valid range.
- InsertNeighbor(queryIndex, insertPosition, referenceNode.Point(), eval);
- }
-
- double maxProduct = eval + std::pow(referenceNode.ExpansionConstant(),
- referenceNode.Scale() + 1) *
-sqrt(MetricType::Kernel::Evaluate(querySet.col(queryIndex),
-querySet.col(queryIndex)));
-
- if (maxProduct > products(products.n_rows - 1, queryIndex))
- return false;
- else
- return true;
-}
-
-/**
- * Helper function to insert a point into the neighbors and distances matrices.
- *
- * @param queryIndex Index of point whose neighbors we are inserting into.
- * @param pos Position in list to insert into.
- * @param neighbor Index of reference point which is being inserted.
- * @param distance Distance from query point to reference point.
- */
-template<typename MetricType>
-void MaxIPRules<MetricType>::InsertNeighbor(const size_t queryIndex,
- const size_t pos,
- const size_t neighbor,
- const double distance)
-{
- // We only memmove() if there is actually a need to shift something.
- if (pos < (products.n_rows - 1))
- {
- int len = (products.n_rows - 1) - pos;
- memmove(products.colptr(queryIndex) + (pos + 1),
- products.colptr(queryIndex) + pos,
- sizeof(double) * len);
- memmove(indices.colptr(queryIndex) + (pos + 1),
- indices.colptr(queryIndex) + pos,
- sizeof(size_t) * len);
- }
-
- // Now put the new information in the right index.
- products(pos, queryIndex) = distance;
- indices(pos, queryIndex) = neighbor;
-}
-
-}; // namespace maxip
-}; // namespace mlpack
-
-#endif
More information about the mlpack-svn
mailing list