[mlpack-git] master: Add RMSprop implementation. (da1207a)

gitdub at mlpack.org
Fri Feb 19 08:22:53 EST 2016


Repository : https://github.com/mlpack/mlpack
On branch  : master
Link       : https://github.com/mlpack/mlpack/compare/f6dd2f7a9752a7db8ec284a938b3e84a13d0bfb2...6205f3e0b62b56452b2a4afc4da24fce5b21e72f

>---------------------------------------------------------------

commit da1207a9e6407e8350d9904964402b0b08513a0a
Author: marcus <marcus.edel at fu-berlin.de>
Date:   Tue Feb 16 21:53:58 2016 +0100

    Add RMSprop implementation.


>---------------------------------------------------------------

da1207a9e6407e8350d9904964402b0b08513a0a
 src/mlpack/core/optimizers/CMakeLists.txt          |  1 +
 .../{minibatch_sgd => rmsprop}/CMakeLists.txt      |  4 +-
 .../{sgd/sgd.hpp => rmsprop/rmsprop.hpp}           | 99 ++++++++++++----------
 .../{sgd/sgd_impl.hpp => rmsprop/rmsprop_impl.hpp} | 50 ++++++-----
 4 files changed, 88 insertions(+), 66 deletions(-)

diff --git a/src/mlpack/core/optimizers/CMakeLists.txt b/src/mlpack/core/optimizers/CMakeLists.txt
index 13731a6..3e16f02 100644
--- a/src/mlpack/core/optimizers/CMakeLists.txt
+++ b/src/mlpack/core/optimizers/CMakeLists.txt
@@ -2,6 +2,7 @@ set(DIRS
   aug_lagrangian
   lbfgs
   minibatch_sgd
+  rmsprop
   sa
   sdp
   sgd
diff --git a/src/mlpack/core/optimizers/minibatch_sgd/CMakeLists.txt b/src/mlpack/core/optimizers/rmsprop/CMakeLists.txt
similarity index 80%
copy from src/mlpack/core/optimizers/minibatch_sgd/CMakeLists.txt
copy to src/mlpack/core/optimizers/rmsprop/CMakeLists.txt
index e88c3ed..75c30c6 100644
--- a/src/mlpack/core/optimizers/minibatch_sgd/CMakeLists.txt
+++ b/src/mlpack/core/optimizers/rmsprop/CMakeLists.txt
@@ -1,6 +1,6 @@
 set(SOURCES
-  minibatch_sgd.hpp
-  minibatch_sgd_impl.hpp
+  rmsprop.hpp
+  rmsprop_impl.hpp
 )
 
 set(DIR_SRCS)
diff --git a/src/mlpack/core/optimizers/sgd/sgd.hpp b/src/mlpack/core/optimizers/rmsprop/rmsprop.hpp
similarity index 61%
copy from src/mlpack/core/optimizers/sgd/sgd.hpp
copy to src/mlpack/core/optimizers/rmsprop/rmsprop.hpp
index 5572f10..690da6a 100644
--- a/src/mlpack/core/optimizers/sgd/sgd.hpp
+++ b/src/mlpack/core/optimizers/rmsprop/rmsprop.hpp
@@ -1,11 +1,13 @@
 /**
- * @file sgd.hpp
+ * @file rmsprop.hpp
  * @author Ryan Curtin
+ * @author Marcus Edel
  *
- * Stochastic Gradient Descent (SGD).
+ * RMSprop optimizer. RMSprop is an optimizer that utilizes the magnitude of
+ * recent gradients to normalize the gradients.
  */
-#ifndef __MLPACK_CORE_OPTIMIZERS_SGD_SGD_HPP
-#define __MLPACK_CORE_OPTIMIZERS_SGD_SGD_HPP
+#ifndef __MLPACK_CORE_OPTIMIZERS_RMSPROP_RMSPROP_HPP
+#define __MLPACK_CORE_OPTIMIZERS_RMSPROP_RMSPROP_HPP
 
 #include <mlpack/core.hpp>
 
@@ -13,42 +15,28 @@ namespace mlpack {
 namespace optimization {
 
 /**
- * Stochastic Gradient Descent is a technique for minimizing a function which
- * can be expressed as a sum of other functions.  That is, suppose we have
+ * RMSprop is an optimizer that utilizes the magnitude of recent gradients to
+ * normalize the gradients. In its basic form, given a step rate \f$ \gamma \f$
+ * and a decay term \f$ \alpha \f$ we perform the following updates:
  *
- * \f[
- * f(A) = \sum_{i = 0}^{n} f_i(A)
- * \f]
+ * \f{eqnarray*}{
+ * r_t &=& (1 - \alpha) f'(\Delta_t)^2 + \alpha r_{t - 1} \\
+ * v_{t + 1} &=& \frac{\gamma}{\sqrt{r_t}}f'(\Delta_t) \\
+ * \Delta_{t + 1} &=& \Delta_t - v_{t + 1}
+ * \f}
  *
- * and our task is to minimize \f$ A \f$.  Stochastic gradient descent iterates
- * over each function \f$ f_i(A) \f$, producing the following update scheme:
+ * For more information, see the following.
  *
- * \f[
- * A_{j + 1} = A_j + \alpha \nabla f_i(A)
- * \f]
+ * @code
+ * @misc{tieleman2012,
+ *   title={Lecture 6.5 - rmsprop, COURSERA: Neural Networks for Machine
+ *   Learning},
+ *   year={2012}
+ * }
+ * @endcode
  *
- * where \f$ \alpha \f$ is a parameter which specifies the step size.  \f$ i \f$
- * is chosen according to \f$ j \f$ (the iteration number).  The SGD class
- * supports either scanning through each of the \f$ n \f$ functions \f$ f_i(A)
- * \f$ linearly, or in a random sequence.  The algorithm continues until \f$ j
- * \f$ reaches the maximum number of iterations---or when a full sequence of
- * updates through each of the \f$ n \f$ functions \f$ f_i(A) \f$ produces an
- * improvement within a certain tolerance \f$ \epsilon \f$.  That is,
- *
- * \f[
- * | f(A_{j + n}) - f(A_j) | < \epsilon.
- * \f]
- *
- * The parameter \f$\epsilon\f$ is specified by the tolerance parameter to the
- * constructor; \f$n\f$ is specified by the maxIterations parameter.
- *
- * This class is useful for data-dependent functions whose objective function
- * can be expressed as a sum of objective functions operating on an individual
- * point.  Then, SGD considers the gradient of the objective function operating
- * on an individual point in its update of \f$ A \f$.
- *
- * For SGD to work, a DecomposableFunctionType template parameter is required.
- * This class must implement the following function:
+ * For RMSprop to work, a DecomposableFunctionType template parameter is
+ * required. This class must implement the following functions:
  *
  *   size_t NumFunctions();
  *   double Evaluate(const arma::mat& coordinates, const size_t i);
@@ -68,11 +56,11 @@ namespace optimization {
  *     minimized.
  */
 template<typename DecomposableFunctionType>
-class SGD
+class RMSprop
 {
  public:
   /**
-   * Construct the SGD optimizer with the given function and parameters.  The
+   * Construct the RMSprop optimizer with the given function and parameters. The
    * defaults here are not necessarily good for the given problem, so it is
    * suggested that the values used be tailored to the task at hand.  The
    * maximum number of iterations refers to the maximum number of points that
@@ -81,22 +69,27 @@ class SGD
    *
    * @param function Function to be optimized (minimized).
    * @param stepSize Step size for each iteration.
+   * @param alpha Smoothing constant, similar to that used in AdaDelta and
+   *        momentum methods.
+   * @param eps Small value added to avoid division by zero in the update.
    * @param maxIterations Maximum number of iterations allowed (0 means no
-   *     limit).
+   *        limit).
    * @param tolerance Maximum absolute tolerance to terminate algorithm.
    * @param shuffle If true, the function order is shuffled; otherwise, each
-   *     function is visited in linear order.
+   *        function is visited in linear order.
    */
-  SGD(DecomposableFunctionType& function,
+  RMSprop(DecomposableFunctionType& function,
       const double stepSize = 0.01,
+      const double alpha = 0.99,
+      const double eps = 1e-8,
       const size_t maxIterations = 100000,
       const double tolerance = 1e-5,
       const bool shuffle = true);
 
   /**
-   * Optimize the given function using stochastic gradient descent.  The given
-   * starting point will be modified to store the finishing point of the
-   * algorithm, and the final objective value is returned.
+   * Optimize the given function using RMSprop. The given starting point will be
+   * modified to store the finishing point of the algorithm, and the final
+   * objective value is returned.
    *
    * @param iterate Starting point (will be modified).
    * @return Objective value of the final point.
@@ -113,6 +106,16 @@ class SGD
   //! Modify the step size.
   double& StepSize() { return stepSize; }
 
+  //! Get the smoothing parameter.
+  double Alpha() const { return alpha; }
+  //! Modify the smoothing parameter.
+  double& Alpha() { return alpha; }
+
+  //! Get the value added to the denominator to avoid division by zero.
+  double Epsilon() const { return eps; }
+  //! Modify the value added to the denominator to avoid division by zero.
+  double& Epsilon() { return eps; }
+
   //! Get the maximum number of iterations (0 indicates no limit).
   size_t MaxIterations() const { return maxIterations; }
   //! Modify the maximum number of iterations (0 indicates no limit).
@@ -135,6 +138,12 @@ class SGD
   //! The step size for each example.
   double stepSize;
 
+  //! The smoothing parameter.
+  double alpha;
+
+  //! Small value added to the denominator to avoid division by zero.
+  double eps;
+
   //! The maximum number of allowed iterations.
   size_t maxIterations;
 
@@ -150,6 +159,6 @@ class SGD
 } // namespace mlpack
 
 // Include implementation.
-#include "sgd_impl.hpp"
+#include "rmsprop_impl.hpp"
 
 #endif
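
For readers following the new header above, here is a minimal usage sketch. It relies only on what the header documents (NumFunctions()/Evaluate()/Gradient() and the constructor arguments); the TestFunction class, its target member, and the numbers chosen are hypothetical illustrations, not part of this commit.

// Sketch only: a decomposable objective f(A) = sum_i (A(i) - target(i))^2,
// exposing the interface RMSprop expects.  TestFunction is a made-up type.
#include <cmath>
#include <iostream>
#include <mlpack/core.hpp>
#include <mlpack/core/optimizers/rmsprop/rmsprop.hpp>

class TestFunction
{
 public:
  TestFunction(const arma::vec& target) : target(target) { }

  //! Number of separable functions f_i in the objective.
  size_t NumFunctions() const { return target.n_elem; }

  //! Value of the i-th function at the given coordinates.
  double Evaluate(const arma::mat& coordinates, const size_t i) const
  {
    return std::pow(coordinates(i) - target(i), 2.0);
  }

  //! Gradient of the i-th function at the given coordinates.
  void Gradient(const arma::mat& coordinates,
                const size_t i,
                arma::mat& gradient) const
  {
    gradient.zeros(coordinates.n_rows, coordinates.n_cols);
    gradient(i) = 2.0 * (coordinates(i) - target(i));
  }

 private:
  arma::vec target;
};

int main()
{
  TestFunction f(arma::vec("1.0 2.0 3.0"));

  // Arguments: (function, stepSize, alpha, eps, maxIterations, tolerance,
  // shuffle), matching the constructor declared in rmsprop.hpp.
  mlpack::optimization::RMSprop<TestFunction> optimizer(
      f, 0.01, 0.99, 1e-8, 100000, 1e-5, true);

  arma::mat coordinates = arma::zeros<arma::mat>(3, 1);
  const double objective = optimizer.Optimize(coordinates);

  std::cout << "final objective: " << objective << std::endl;
  coordinates.print("coordinates (should approach 1, 2, 3):");
  return 0;
}
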
diff --git a/src/mlpack/core/optimizers/sgd/sgd_impl.hpp b/src/mlpack/core/optimizers/rmsprop/rmsprop_impl.hpp
similarity index 62%
copy from src/mlpack/core/optimizers/sgd/sgd_impl.hpp
copy to src/mlpack/core/optimizers/rmsprop/rmsprop_impl.hpp
index d95624a..539fa05 100644
--- a/src/mlpack/core/optimizers/sgd/sgd_impl.hpp
+++ b/src/mlpack/core/optimizers/rmsprop/rmsprop_impl.hpp
@@ -1,27 +1,31 @@
 /**
- * @file sgd_impl.hpp
+ * @file rmsprop_impl.hpp
  * @author Ryan Curtin
+ * @author Marcus Edel
  *
- * Implementation of stochastic gradient descent.
+ * Implementation of the RMSprop optimizer.
  */
-#ifndef __MLPACK_CORE_OPTIMIZERS_SGD_SGD_IMPL_HPP
-#define __MLPACK_CORE_OPTIMIZERS_SGD_SGD_IMPL_HPP
+#ifndef __MLPACK_CORE_OPTIMIZERS_RMSPROP_RMSPROP_IMPL_HPP
+#define __MLPACK_CORE_OPTIMIZERS_RMSPROP_RMSPROP_IMPL_HPP
 
-#include <mlpack/methods/regularized_svd/regularized_svd_function.hpp>
 // In case it hasn't been included yet.
-#include "sgd.hpp"
+#include "rmsprop.hpp"
 
 namespace mlpack {
 namespace optimization {
 
 template<typename DecomposableFunctionType>
-SGD<DecomposableFunctionType>::SGD(DecomposableFunctionType& function,
-                                   const double stepSize,
-                                   const size_t maxIterations,
-                                   const double tolerance,
-                                   const bool shuffle) :
+RMSprop<DecomposableFunctionType>::RMSprop(DecomposableFunctionType& function,
+                                           const double stepSize,
+                                           const double alpha,
+                                           const double eps,
+                                           const size_t maxIterations,
+                                           const double tolerance,
+                                           const bool shuffle) :
     function(function),
     stepSize(stepSize),
+    alpha(alpha),
+    eps(eps),
     maxIterations(maxIterations),
     tolerance(tolerance),
     shuffle(shuffle)
@@ -29,7 +33,7 @@ SGD<DecomposableFunctionType>::SGD(DecomposableFunctionType& function,
 
 //! Optimize the function (minimize).
 template<typename DecomposableFunctionType>
-double SGD<DecomposableFunctionType>::Optimize(arma::mat& iterate)
+double RMSprop<DecomposableFunctionType>::Optimize(arma::mat& iterate)
 {
   // Find the number of functions to use.
   const size_t numFunctions = function.NumFunctions();
@@ -51,25 +55,31 @@ double SGD<DecomposableFunctionType>::Optimize(arma::mat& iterate)
 
   // Now iterate!
   arma::mat gradient(iterate.n_rows, iterate.n_cols);
+
+  // Leaky (exponentially weighted) average of the squared parameter gradient.
+  arma::mat meanSquaredGradient = arma::zeros<arma::mat>(iterate.n_rows,
+      iterate.n_cols);
+
   for (size_t i = 1; i != maxIterations; ++i, ++currentFunction)
   {
     // Is this iteration the start of a sequence?
     if ((currentFunction % numFunctions) == 0)
     {
       // Output current objective function.
-      Log::Info << "SGD: iteration " << i << ", objective " << overallObjective
-          << "." << std::endl;
+      Log::Info << "RMSprop: iteration " << i << ", objective "
+          << overallObjective << "." << std::endl;
 
       if (std::isnan(overallObjective) || std::isinf(overallObjective))
       {
-        Log::Warn << "SGD: converged to " << overallObjective << "; terminating"
-            << " with failure.  Try a smaller step size?" << std::endl;
+        Log::Warn << "RMSprop: converged to " << overallObjective
+            << "; terminating with failure. Try a smaller step size?"
+            << std::endl;
         return overallObjective;
       }
 
       if (std::abs(lastObjective - overallObjective) < tolerance)
       {
-        Log::Info << "SGD: minimized within tolerance " << tolerance << "; "
+        Log::Info << "RMSprop: minimized within tolerance " << tolerance << "; "
             << "terminating optimization." << std::endl;
         return overallObjective;
       }
@@ -90,7 +100,9 @@ double SGD<DecomposableFunctionType>::Optimize(arma::mat& iterate)
       function.Gradient(iterate, currentFunction, gradient);
 
     // And update the iterate.
-    iterate -= stepSize * gradient;
+    meanSquaredGradient *= alpha;
+    meanSquaredGradient += (1 - alpha) * (gradient % gradient);
+    iterate -= stepSize * gradient / (arma::sqrt(meanSquaredGradient) + eps);
 
     // Now add that to the overall objective function.
     if (shuffle)
@@ -100,7 +112,7 @@ double SGD<DecomposableFunctionType>::Optimize(arma::mat& iterate)
       overallObjective += function.Evaluate(iterate, currentFunction);
   }
 
-  Log::Info << "SGD: maximum iterations (" << maxIterations << ") reached; "
+  Log::Info << "RMSprop: maximum iterations (" << maxIterations << ") reached; "
       << "terminating optimization." << std::endl;
   // Calculate final objective.
   overallObjective = 0;
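
To make the update step above concrete, here is a self-contained sketch using plain Armadillo only; the toy objective, matrix sizes, and iteration count are illustrative and not part of the commit.

// Standalone illustration of the RMSprop step applied per point in Optimize():
//   r_t = alpha * r_{t-1} + (1 - alpha) * (g_t % g_t)
//   x_{t+1} = x_t - stepSize * g_t / (sqrt(r_t) + eps)
#include <armadillo>

int main()
{
  const double stepSize = 0.01, alpha = 0.99, eps = 1e-8;

  arma::mat iterate = arma::ones<arma::mat>(5, 1);
  arma::mat meanSquaredGradient = arma::zeros<arma::mat>(5, 1);

  for (size_t i = 0; i < 2000; ++i)
  {
    // Gradient of a toy objective f(x) = 0.5 * ||x||^2 is simply x.
    arma::mat gradient = iterate;

    // Leaky average of the squared gradient, then the normalized step.
    meanSquaredGradient *= alpha;
    meanSquaredGradient += (1 - alpha) * (gradient % gradient);
    iterate -= stepSize * gradient / (arma::sqrt(meanSquaredGradient) + eps);
  }

  iterate.print("iterate after 2000 RMSprop steps (should be near zero):");
  return 0;
}
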



