[mlpack-git] master: Update the learning rate by calculating the bias correction values. (d673e3e)

gitdub at mlpack.org gitdub at mlpack.org
Mon Mar 14 18:12:23 EDT 2016


Repository : https://github.com/mlpack/mlpack
On branch  : master
Link       : https://github.com/mlpack/mlpack/compare/c0886a18f63c9335a0c39dcc34c27b8925dcb91b...b864df8cf10592b3874b079302774dbe7a4c1dbc

>---------------------------------------------------------------

commit d673e3e33b005341243dc70c2ad79fe11b980b47
Author: marcus <marcus.edel at fu-berlin.de>
Date:   Mon Mar 14 23:10:45 2016 +0100

    Update the learning rate by calculating the bias correction values.


>---------------------------------------------------------------

d673e3e33b005341243dc70c2ad79fe11b980b47
 .../optimizers/{rmsprop => adam}/CMakeLists.txt    |  4 +-
 .../{rmsprop/rmsprop.hpp => adam/adam.hpp}         | 71 ++++++++++++----------
 .../rmsprop_impl.hpp => adam/adam_impl.hpp}        | 60 ++++++++++--------
 3 files changed, 76 insertions(+), 59 deletions(-)
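
For reference, the scheme this commit implements is the Adam update from the Kingma and Ba (2014) paper cited in the new header: exponential moving averages of the gradient and of the squared gradient are maintained, and, since both averages start at zero, the step size is rescaled by bias-correction terms. In the paper's notation (g_t is the current gradient, theta_t the iterate), the update is roughly:

    m_t     = beta1 * m_{t-1} + (1 - beta1) * g_t
    v_t     = beta2 * v_{t-1} + (1 - beta2) * g_t^2
    theta_t = theta_{t-1} - stepSize * sqrt(1 - beta2^t) / (1 - beta1^t)
                          * m_t / (sqrt(v_t) + eps)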

diff --git a/src/mlpack/core/optimizers/rmsprop/CMakeLists.txt b/src/mlpack/core/optimizers/adam/CMakeLists.txt
similarity index 85%
copy from src/mlpack/core/optimizers/rmsprop/CMakeLists.txt
copy to src/mlpack/core/optimizers/adam/CMakeLists.txt
index 75c30c6..3cbcfd8 100644
--- a/src/mlpack/core/optimizers/rmsprop/CMakeLists.txt
+++ b/src/mlpack/core/optimizers/adam/CMakeLists.txt
@@ -1,6 +1,6 @@
 set(SOURCES
-  rmsprop.hpp
-  rmsprop_impl.hpp
+  adam.hpp
+  adam_impl.hpp
 )
 
 set(DIR_SRCS)
diff --git a/src/mlpack/core/optimizers/rmsprop/rmsprop.hpp b/src/mlpack/core/optimizers/adam/adam.hpp
similarity index 73%
copy from src/mlpack/core/optimizers/rmsprop/rmsprop.hpp
copy to src/mlpack/core/optimizers/adam/adam.hpp
index 690da6a..fae4362 100644
--- a/src/mlpack/core/optimizers/rmsprop/rmsprop.hpp
+++ b/src/mlpack/core/optimizers/adam/adam.hpp
@@ -1,13 +1,15 @@
 /**
- * @file rmsprop.hpp
+ * @file adam.hpp
  * @author Ryan Curtin
+ * @author Vasanth Kalingeri
  * @author Marcus Edel
  *
- * RMSprop optimizer. RmsProp is an optimizer that utilizes the magnitude of
- * recent gradients to normalize the gradients.
+ * Adam optimizer. Adam is an algorithm for first-order gradient-based
+ * optimization of stochastic objective functions, based on adaptive estimates
+ * of lower-order moments.
  */
-#ifndef __MLPACK_CORE_OPTIMIZERS_RMSPROP_RMSPROP_HPP
-#define __MLPACK_CORE_OPTIMIZERS_RMSPROP_RMSPROP_HPP
+#ifndef __MLPACK_CORE_OPTIMIZERS_ADAM_ADAM_HPP
+#define __MLPACK_CORE_OPTIMIZERS_ADAM_ADAM_HPP
 
 #include <mlpack/core.hpp>
 
@@ -15,28 +17,24 @@ namespace mlpack {
 namespace optimization {
 
 /**
- * RMSprop is an optimizer that utilizes the magnitude of recent gradients to
- * normalize the gradients. In its basic form, given a step rate \f$ \gamma \f$
- * and a decay term \f$ \alpha \f$ we perform the following updates:
- *
- * \f{eqnarray*}{
- * r_t &=& (1 - \gamma) f'(\Delta_t)^2 + \gamma r_{t - 1} \\
- * v_{t + 1} &=& \frac{\alpha}{\sqrt{r_t}}f'(\Delta_t) \\
- * \Delta_{t + 1} &=& \Delta_t - v_{t + 1}
- * \f}
+ * Adam is an optimizer that computes individual adaptive learning rates for
+ * different parameters from estimates of first and second moments of the
+ * gradients.
  *
  * For more information, see the following.
  *
  * @code
- * @misc{tieleman2012,
- *   title={Lecture 6.5 - rmsprop, COURSERA: Neural Networks for Machine
- *   Learning},
- *   year={2012}
+ * @article{Kingma2014,
+ *   author    = {Diederik P. Kingma and Jimmy Ba},
+ *   title     = {Adam: {A} Method for Stochastic Optimization},
+ *   journal   = {CoRR},
+ *   year      = {2014}
  * }
  * @endcode
  *
- * For RMSprop to work, a DecomposableFunctionType template parameter is
- * required. This class must implement the following function:
+ *
+ * For Adam to work, a DecomposableFunctionType template parameter is required.
+ * This class must implement the following functions:
  *
  *   size_t NumFunctions();
  *   double Evaluate(const arma::mat& coordinates, const size_t i);
@@ -56,11 +54,11 @@ namespace optimization {
  *     minimized.
  */
 template<typename DecomposableFunctionType>
-class RMSprop
+class Adam
 {
  public:
   /**
-   * Construct the RMSprop optimizer with the given function and parameters. The
+   * Construct the Adam optimizer with the given function and parameters. The
    * defaults here are not necessarily good for the given problem, so it is
    * suggested that the values used be tailored to the task at hand.  The
    * maximum number of iterations refers to the maximum number of points that
@@ -69,8 +67,8 @@ class RMSprop
    *
    * @param function Function to be optimized (minimized).
    * @param stepSize Step size for each iteration.
-   * @param alpha Smoothing constant, similar to that used in AdaDelta and
-   *        momentum methods.
+   * @param beta1 The first moment coefficient.
+   * @param beta2 The second moment coefficient.
    * @param eps Value used to initialise the mean squared gradient parameter.
    * @param maxIterations Maximum number of iterations allowed (0 means no
    *        limit).
@@ -78,16 +76,17 @@ class RMSprop
    * @param shuffle If true, the function order is shuffled; otherwise, each
    *        function is visited in linear order.
    */
-  RMSprop(DecomposableFunctionType& function,
+  Adam(DecomposableFunctionType& function,
       const double stepSize = 0.01,
-      const double alpha = 0.99,
+      const double beta1 = 0.9,
+      const double beta2 = 0.999,
       const double eps = 1e-8,
       const size_t maxIterations = 100000,
       const double tolerance = 1e-5,
       const bool shuffle = true);
 
   /**
-   * Optimize the given function using RMSprop. The given starting point will be
+   * Optimize the given function using Adam. The given starting point will be
    * modified to store the finishing point of the algorithm, and the final
    * objective value is returned.
    *
@@ -107,9 +106,14 @@ class RMSprop
   double& StepSize() { return stepSize; }
 
   //! Get the smoothing parameter.
-  double Alpha() const { return alpha; }
+  double Beta1() const { return beta1; }
   //! Modify the smoothing parameter.
-  double& Alpha() { return alpha; }
+  double& Beta1() { return beta1; }
+
+  //! Get the second moment coefficient.
+  double Beta2() const { return beta2; }
+  //! Modify the second moment coefficient.
+  double& Beta2() { return beta2; }
 
   //! Get the value used to initialise the mean squared gradient parameter.
   double Epsilon() const { return eps; }
@@ -138,8 +142,11 @@ class RMSprop
   //! The step size for each example.
   double stepSize;
 
-  //! The smoothing parameter.
-  double alpha;
+  //! The value used as first moment coefficient.
+  double beta1;
+
+  //! The value used as second moment coefficient.
+  double beta2;
 
   //! The value used to initialise the mean squared gradient parameter.
   double eps;
@@ -159,6 +166,6 @@ class RMSprop
 } // namespace mlpack
 
 // Include implementation.
-#include "rmsprop_impl.hpp"
+#include "adam_impl.hpp"
 
 #endif
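
For anyone who wants to try the new optimizer, here is a minimal usage sketch. The SquaredErrorFunction below is a made-up DecomposableFunctionType, not part of this commit; it only exists to satisfy the NumFunctions()/Evaluate()/Gradient() interface documented in adam.hpp, and the include path assumes the usual installed layout.

    #include <iostream>
    #include <mlpack/core.hpp>
    #include <mlpack/core/optimizers/adam/adam.hpp>

    using namespace mlpack::optimization;

    // Hypothetical separable objective: f(x) = sum_i ||x - a_i||^2, with one
    // function per target point a_i (one column of 'targets').
    class SquaredErrorFunction
    {
     public:
      SquaredErrorFunction(const arma::mat& targets) : targets(targets) { }

      // Number of separable functions (one per target point).
      size_t NumFunctions() const { return targets.n_cols; }

      // Value of the i-th function at the given coordinates.
      double Evaluate(const arma::mat& coordinates, const size_t i) const
      {
        return arma::accu(arma::square(coordinates - targets.col(i)));
      }

      // Gradient of the i-th function at the given coordinates.
      void Gradient(const arma::mat& coordinates,
                    const size_t i,
                    arma::mat& gradient) const
      {
        gradient = 2 * (coordinates - targets.col(i));
      }

     private:
      arma::mat targets;
    };

    int main()
    {
      arma::mat targets(2, 100, arma::fill::randn);
      SquaredErrorFunction f(targets);

      // stepSize, beta1, beta2, eps, maxIterations, tolerance, shuffle.
      Adam<SquaredErrorFunction> adam(f, 0.01, 0.9, 0.999, 1e-8, 100000, 1e-5,
          true);

      // Start at the origin; Optimize() overwrites this with the solution and
      // returns the final objective value.
      arma::mat coordinates(2, 1, arma::fill::zeros);
      const double objective = adam.Optimize(coordinates);

      std::cout << "Final objective: " << objective << std::endl;
      return 0;
    }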
diff --git a/src/mlpack/core/optimizers/rmsprop/rmsprop_impl.hpp b/src/mlpack/core/optimizers/adam/adam_impl.hpp
similarity index 61%
copy from src/mlpack/core/optimizers/rmsprop/rmsprop_impl.hpp
copy to src/mlpack/core/optimizers/adam/adam_impl.hpp
index 539fa05..51efa05 100644
--- a/src/mlpack/core/optimizers/rmsprop/rmsprop_impl.hpp
+++ b/src/mlpack/core/optimizers/adam/adam_impl.hpp
@@ -1,30 +1,33 @@
 /**
- * @file rmsprop_impl.hpp
+ * @file adam_impl.hpp
  * @author Ryan Curtin
+ * @author Vasanth Kalingeri
  * @author Marcus Edel
  *
- * Implementation of the RMSprop optimizer.
+ * Implementation of the Adam optimizer.
  */
-#ifndef __MLPACK_CORE_OPTIMIZERS_RMSPROP_RMSPROP_IMPL_HPP
-#define __MLPACK_CORE_OPTIMIZERS_RMSPROP_RMSPROP_IMPL_HPP
+#ifndef __MLPACK_CORE_OPTIMIZERS_ADAM_ADAM_IMPL_HPP
+#define __MLPACK_CORE_OPTIMIZERS_ADAM_ADAM_IMPL_HPP
 
 // In case it hasn't been included yet.
-#include "rmsprop.hpp"
+#include "adam.hpp"
 
 namespace mlpack {
 namespace optimization {
 
 template<typename DecomposableFunctionType>
-RMSprop<DecomposableFunctionType>::RMSprop(DecomposableFunctionType& function,
-                                           const double stepSize,
-                                           const double alpha,
-                                           const double eps,
-                                           const size_t maxIterations,
-                                           const double tolerance,
-                                           const bool shuffle) :
+Adam<DecomposableFunctionType>::Adam(DecomposableFunctionType& function,
+                                     const double stepSize,
+                                     const double beta1,
+                                     const double beta2,
+                                     const double eps,
+                                     const size_t maxIterations,
+                                     const double tolerance,
+                                     const bool shuffle) :
     function(function),
     stepSize(stepSize),
-    alpha(alpha),
+    beta1(beta1),
+    beta2(beta2),
     eps(eps),
     maxIterations(maxIterations),
     tolerance(tolerance),
@@ -33,7 +36,7 @@ RMSprop<DecomposableFunctionType>::RMSprop(DecomposableFunctionType& function,
 
 //! Optimize the function (minimize).
 template<typename DecomposableFunctionType>
-double RMSprop<DecomposableFunctionType>::Optimize(arma::mat& iterate)
+double Adam<DecomposableFunctionType>::Optimize(arma::mat& iterate)
 {
   // Find the number of functions to use.
   const size_t numFunctions = function.NumFunctions();
@@ -56,9 +59,11 @@ double RMSprop<DecomposableFunctionType>::Optimize(arma::mat& iterate)
   // Now iterate!
   arma::mat gradient(iterate.n_rows, iterate.n_cols);
 
-  // Leaky sum of squares of parameter gradient.
-  arma::mat meanSquaredGradient = arma::zeros<arma::mat>(iterate.n_rows,
-      iterate.n_cols);
+  // Exponential moving average of gradient values.
+  arma::mat mean = arma::zeros<arma::mat>(iterate.n_rows, iterate.n_cols);
+
+  // Exponential moving average of squared gradient values.
+  arma::mat variance = arma::zeros<arma::mat>(iterate.n_rows, iterate.n_cols);
 
   for (size_t i = 1; i != maxIterations; ++i, ++currentFunction)
   {
@@ -66,12 +71,12 @@ double RMSprop<DecomposableFunctionType>::Optimize(arma::mat& iterate)
     if ((currentFunction % numFunctions) == 0)
     {
       // Output current objective function.
-      Log::Info << "RMSprop: iteration " << i << ", objective "
-          << overallObjective << "." << std::endl;
+      Log::Info << "Adam: iteration " << i << ", objective " << overallObjective
+          << "." << std::endl;
 
       if (std::isnan(overallObjective) || std::isinf(overallObjective))
       {
-        Log::Warn << "RMSprop: converged to " << overallObjective
+        Log::Warn << "Adam: converged to " << overallObjective
             << "; terminating with failure. Try a smaller step size?"
             << std::endl;
         return overallObjective;
@@ -79,7 +84,7 @@ double RMSprop<DecomposableFunctionType>::Optimize(arma::mat& iterate)
 
       if (std::abs(lastObjective - overallObjective) < tolerance)
       {
-        Log::Info << "RMSprop: minimized within tolerance " << tolerance << "; "
+        Log::Info << "Adam: minimized within tolerance " << tolerance << "; "
             << "terminating optimization." << std::endl;
         return overallObjective;
       }
@@ -100,9 +105,14 @@ double RMSprop<DecomposableFunctionType>::Optimize(arma::mat& iterate)
       function.Gradient(iterate, currentFunction, gradient);
 
     // And update the iterate.
-    meanSquaredGradient *= alpha;
-    meanSquaredGradient += (1 - alpha) * (gradient % gradient);
-    iterate -= stepSize * gradient / (arma::sqrt(meanSquaredGradient) + eps);
+    mean += (1 - beta1) * (gradient - mean);
+    variance += (1 - beta2) * (gradient % gradient - variance);
+
+    double biasCorrection1 = 1.0 - std::pow(beta1, (double) i);
+    double biasCorrection2 = 1.0 - std::pow(beta2, (double) i);
+
+    iterate -= (stepSize * std::sqrt(biasCorrection2) / biasCorrection1) *
+        mean / (arma::sqrt(variance) + eps);
 
     // Now add that to the overall objective function.
     if (shuffle)
@@ -112,7 +122,7 @@ double RMSprop<DecomposableFunctionType>::Optimize(arma::mat& iterate)
       overallObjective += function.Evaluate(iterate, currentFunction);
   }
 
-  Log::Info << "RMSprop: maximum iterations (" << maxIterations << ") reached; "
+  Log::Info << "Adam: maximum iterations (" << maxIterations << ") reached; "
       << "terminating optimization." << std::endl;
   // Calculate final objective.
   overallObjective = 0;
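
Two small notes on the update step above. The in-place moment updates are just a rewrite of the usual exponential moving averages, since

    beta1 * mean + (1 - beta1) * gradient = mean + (1 - beta1) * (gradient - mean),

and likewise for the squared-gradient average with beta2. And because both averages are initialised to zero, they are biased towards zero during the first iterations; the biasCorrection1 and biasCorrection2 factors (1 - beta1^t and 1 - beta2^t) fold the paper's bias correction into the step size, so the effective step approximately matches stepSize * m_hat / (sqrt(v_hat) + eps).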




More information about the mlpack-git mailing list