[mlpack-svn] r13786 - mlpack/trunk/src/mlpack/core/optimizers/sgd

fastlab-svn at coffeetalk-1.cc.gatech.edu
Tue Oct 30 15:32:29 EDT 2012


Author: rcurtin
Date: 2012-10-30 15:32:29 -0400 (Tue, 30 Oct 2012)
New Revision: 13786

Added:
   mlpack/trunk/src/mlpack/core/optimizers/sgd/sgd.hpp
   mlpack/trunk/src/mlpack/core/optimizers/sgd/sgd_impl.hpp
Log:
Not yet tested -- but implemented in a few minutes.  Turns out SGD is pretty
straightforward...


Added: mlpack/trunk/src/mlpack/core/optimizers/sgd/sgd.hpp
===================================================================
--- mlpack/trunk/src/mlpack/core/optimizers/sgd/sgd.hpp	                        (rev 0)
+++ mlpack/trunk/src/mlpack/core/optimizers/sgd/sgd.hpp	2012-10-30 19:32:29 UTC (rev 13786)
@@ -0,0 +1,148 @@
+/**
+ * @file sgd.hpp
+ * @author Ryan Curtin
+ *
+ * Stochastic Gradient Descent (SGD).
+ */
+#ifndef __MLPACK_CORE_OPTIMIZERS_SGD_SGD_HPP
+#define __MLPACK_CORE_OPTIMIZERS_SGD_SGD_HPP
+
+#include <mlpack/core.hpp>
+
+namespace mlpack {
+namespace optimizers {
+
+/**
+ * Stochastic Gradient Descent is a technique for minimizing a function which
+ * can be expressed as a sum of other functions.  That is, suppose we have
+ *
+ * \f[
+ * f(A) = \sum_{i = 0}^{n} f_i(A)
+ * \f]
+ *
+ * and our task is to minimize \f$ f(A) \f$ with respect to \f$ A \f$.
+ * Stochastic gradient descent iterates over each function \f$ f_i(A) \f$,
+ * producing the following update scheme:
+ *
+ * \f[
+ * A_{j + 1} = A_j - \alpha \nabla f_i(A)
+ * \f]
+ *
+ * where \f$ \alpha \f$ is a parameter which specifies the step size.  \f$ i \f$
+ * is chosen according to \f$ j \f$ (the iteration number).  The SGD class
+ * supports either scanning through each of the \f$ n \f$ functions \f$ f_i(A)
+ * \f$ linearly, or in a random sequence.  The algorithm continues until \f$ j
+ * \f$ reaches the maximum number of iterations -- or until a full sequence of
+ * updates through each of the \f$ n \f$ functions \f$ f_i(A) \f$ produces an
+ * improvement smaller than a certain tolerance \f$ \epsilon \f$.  That is,
+ *
+ * \f[
+ * | f(A_{j + n}) - f(A_j) | < \epsilon.
+ * \f]
+ *
+ * The parameter \f$\epsilon\f$ is specified by the tolerance parameter to the
+ * constructor; the maximum number of iterations is given by maxIterations.
+ *
+ * This class is useful for data-dependent functions whose objective function
+ * can be expressed as a sum of objective functions operating on an individual
+ * point.  Then, SGD considers the gradient of the objective function operating
+ * on an individual point in its update of \f$ A \f$.
+ *
+ * For SGD to work, a DecomposableFunctionType template parameter is required.
+ * This class must implement the following function:
+ *
+ *   size_t NumFunctions();
+ *   double Evaluate(const arma::mat& coordinates, const size_t i);
+ *   void Gradient(const arma::mat& coordinates,
+ *                 const size_t i,
+ *                 arma::mat& gradient);
+ *
+ * NumFunctions() should return the number of functions (\f$n\f$), and in the
+ * other two functions, the parameter i refers to which individual function (or
+ * gradient) is being evaluated.  So, for the case of a data-dependent function,
+ * such as NCA (see mlpack::nca::NCA), NumFunctions() should return the number
+ * of points in the dataset, and Evaluate(coordinates, 0) will evaluate the
+ * objective function on the first point in the dataset (presumably, the dataset
+ * is held internally in the DecomposableFunctionType).
+ *
+ * @tparam DecomposableFunctionType Decomposable objective function type to be
+ *     minimized.
+ */
+template<typename DecomposableFunctionType>
+class SGD
+{
+ public:
+  /**
+   * Construct the SGD optimizer with the given function and parameters.
+   *
+   * @param function Function to be optimized (minimized).
+   * @param stepSize Step size for each iteration.
+   * @param maxIterations Maximum number of iterations allowed (0 means no
+   *     limit).
+   * @param tolerance Maximum absolute tolerance to terminate algorithm.
+   * @param shuffle If true, the function order is shuffled; otherwise, each
+   *     function is visited in linear order.
+   */
+  SGD(DecomposableFunctionType& function,
+      const double stepSize = 0.01,
+      const size_t maxIterations = 100000,
+      const double tolerance = 1e-5,
+      const bool shuffle = true);
+
+  /**
+   * Optimize the given function using stochastic gradient descent.  The given
+   * starting point will be modified to store the finishing point of the
+   * algorithm, and the final objective value is returned.
+   *
+   * @param iterate Starting point (will be modified).
+   * @return Objective value of the final point.
+   */
+  double Optimize(arma::mat& iterate);
+
+  //! Get the instantiated function to be optimized.
+  const DecomposableFunctionType& Function() const { return function; }
+  //! Modify the instantiated function.
+  DecomposableFunctionType& Function() { return function; }
+
+  //! Get the step size.
+  double StepSize() const { return stepSize; }
+  //! Modify the step size.
+  double& StepSize() { return stepSize; }
+
+  //! Get the maximum number of iterations (0 indicates no limit).
+  size_t MaxIterations() const { return maxIterations; }
+  //! Modify the maximum number of iterations (0 indicates no limit).
+  size_t& MaxIterations() { return maxIterations; }
+
+  //! Get the tolerance for termination.
+  double Tolerance() const { return tolerance; }
+  //! Modify the tolerance for termination.
+  double& Tolerance() { return tolerance; }
+
+  //! Get whether or not the individual functions are shuffled.
+  bool Shuffle() const { return shuffle; }
+  //! Modify whether or not the individual functions are shuffled.
+  bool& Shuffle() { return shuffle; }
+
+ private:
+  //! The instantiated function.
+  DecomposableFunctionType& function;
+
+  //! The step size for each example.
+  double stepSize;
+
+  //! The maximum number of allowed iterations.
+  size_t maxIterations;
+
+  //! The tolerance for termination.
+  double tolerance;
+
+  //! Controls whether or not the individual functions are shuffled when
+  //! iterating.
+  bool shuffle;
+};
+
+}; // namespace optimizers
+}; // namespace mlpack
+
+// Include implementation.
+#include "sgd_impl.hpp"
+
+#endif
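
As an illustrative sketch (not part of this commit), a class satisfying the
DecomposableFunctionType interface described above might look like the
following.  The quadratic objective f(A) = sum_i || A - x_i ||^2 and the name
ExampleSquaredError are hypothetical, chosen only to show the required
NumFunctions() / Evaluate() / Gradient() signatures:

    #include <mlpack/core.hpp>

    // Each column of 'data' is one point x_i; the objective decomposes into
    // one function f_i(A) = || A - x_i ||^2 per point.
    class ExampleSquaredError
    {
     public:
      ExampleSquaredError(const arma::mat& data) : data(data) { }

      // Number of separable functions f_i (one per data point).
      size_t NumFunctions() const { return data.n_cols; }

      // Objective contribution of point i: || A - x_i ||^2.
      double Evaluate(const arma::mat& coordinates, const size_t i) const
      {
        return arma::accu(arma::square(coordinates - data.col(i)));
      }

      // Gradient of f_i at the given coordinates: 2 (A - x_i).
      void Gradient(const arma::mat& coordinates,
                    const size_t i,
                    arma::mat& gradient) const
      {
        gradient = 2 * (coordinates - data.col(i));
      }

     private:
      const arma::mat& data;
    };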

Added: mlpack/trunk/src/mlpack/core/optimizers/sgd/sgd_impl.hpp
===================================================================
--- mlpack/trunk/src/mlpack/core/optimizers/sgd/sgd_impl.hpp	                        (rev 0)
+++ mlpack/trunk/src/mlpack/core/optimizers/sgd/sgd_impl.hpp	2012-10-30 19:32:29 UTC (rev 13786)
@@ -0,0 +1,95 @@
+/**
+ * @file sgd_impl.hpp
+ * @author Ryan Curtin
+ *
+ * Implementation of stochastic gradient descent.
+ */
+#ifndef __MLPACK_CORE_OPTIMIZERS_SGD_SGD_IMPL_HPP
+#define __MLPACK_CORE_OPTIMIZERS_SGD_SGD_IMPL_HPP
+
+// In case it hasn't been included yet.
+#include "sgd.hpp"
+
+namespace mlpack {
+namespace optimizers {
+
+template<typename DecomposableFunctionType>
+SGD<DecomposableFunctionType>::SGD(DecomposableFunctionType& function,
+                                   const double stepSize,
+                                   const size_t maxIterations,
+                                   const double tolerance,
+                                   const bool shuffle) :
+    function(function),
+    stepSize(stepSize),
+    maxIterations(maxIterations),
+    tolerance(tolerance),
+    shuffle(shuffle)
+{ /* Nothing to do. */ }
+
+//! Optimize the function (minimize).
+template<typename DecomposableFunctionType>
+double SGD<DecomposableFunctionType>::Optimize(arma::mat& iterate)
+{
+  // Find the number of functions to use.
+  const size_t numFunctions = function.NumFunctions();
+
+  // This is used only if shuffle is true.
+  arma::vec visitationOrder;
+  if (shuffle)
+    visitationOrder = arma::shuffle(arma::linspace<arma::vec>(0,
+        (numFunctions - 1), numFunctions));
+
+  // To keep track of where we are and how things are going.
+  size_t currentFunction = 0;
+  double overallObjective = 0;
+  double lastObjective = DBL_MAX;
+
+  // Calculate the first objective function.
+  for (size_t i = 0; i < numFunctions; ++i)
+    overallObjective += function.Evaluate(iterate, i);
+
+  // Now iterate!
+  arma::mat gradient(iterate.n_rows, iterate.n_cols);
+  for (size_t i = 1; i != maxIterations; ++i, ++currentFunction)
+  {
+    // Is this iteration the start of a sequence?
+    if ((currentFunction % numFunctions) == 0)
+    {
+      // Output current objective function.
+      Log::Info << "SGD: iteration " << i << ", objective " << overallObjective
+          << "." << std::endl;
+
+      if (std::abs(lastObjective - overallObjective) < tolerance)
+      {
+        Log::Info << "SGD: minimized within tolerance " << tolerance << "; "
+            << "terminating optimization." << std::endl;
+        return overallObjective;
+      }
+
+      // Reset the counter variables.
+      lastObjective = overallObjective;
+      overallObjective = 0;
+      currentFunction = 0;
+
+      if (shuffle) // Determine order of visitation.
+        visitationOrder = arma::shuffle(visitationOrder);
+    }
+
+    // Evaluate the gradient for this iteration.
+    if (shuffle)
+      function.Gradient(iterate, visitationOrder[currentFunction], gradient);
+    else
+      function.Gradient(iterate, currentFunction, gradient);
+
+    // And update the iterate.
+    iterate -= stepSize * gradient;
+
+    // Now add that to the overall objective function.
+    if (shuffle)
+      overallObjective += function.Evaluate(iterate,
+          visitationOrder[currentFunction]);
+    else
+      overallObjective += function.Evaluate(iterate, currentFunction);
+  }
+
+  Log::Info << "SGD: maximum iterations (" << maxIterations << ") reached; "
+      << "terminating optimization." << std::endl;
+  return overallObjective;
+}
+
+}; // namespace optimizers
+}; // namespace mlpack
+
+#endif
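
For reference, here is a sketch of how the new optimizer could be driven once
it compiles, assuming a decomposable function type like the hypothetical
ExampleSquaredError sketch above; the data and parameter values below are
arbitrary (the parameters shown are simply the constructor defaults):

    #include <mlpack/core.hpp>
    #include <mlpack/core/optimizers/sgd/sgd.hpp>

    using namespace mlpack;
    using namespace mlpack::optimizers;

    int main()
    {
      // Three 2-dimensional points; the minimizer of sum_i || A - x_i ||^2
      // is their mean.
      arma::mat data("1.0 2.0 3.0; 4.0 5.0 6.0");

      // Hypothetical decomposable function from the sketch above.
      ExampleSquaredError f(data);

      // Step size 0.01, at most 100000 iterations, tolerance 1e-5, shuffled
      // visitation order.
      SGD<ExampleSquaredError> sgd(f, 0.01, 100000, 1e-5, true);

      // Starting point; it is overwritten with the final coordinates.
      arma::mat coordinates = arma::zeros<arma::mat>(2, 1);
      const double objective = sgd.Optimize(coordinates);

      Log::Info << "Final objective: " << objective << "." << std::endl;

      return 0;
    }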



