[mlpack-git] master: Update the learning rate by calculating the bias correction values. (d673e3e)
gitdub at mlpack.org
Mon Mar 14 18:12:23 EDT 2016
Repository : https://github.com/mlpack/mlpack
On branch : master
Link : https://github.com/mlpack/mlpack/compare/c0886a18f63c9335a0c39dcc34c27b8925dcb91b...b864df8cf10592b3874b079302774dbe7a4c1dbc
>---------------------------------------------------------------
commit d673e3e33b005341243dc70c2ad79fe11b980b47
Author: marcus <marcus.edel at fu-berlin.de>
Date: Mon Mar 14 23:10:45 2016 +0100
Update the learning rate by calculating the bias correction values.
>---------------------------------------------------------------
d673e3e33b005341243dc70c2ad79fe11b980b47
.../optimizers/{rmsprop => adam}/CMakeLists.txt | 4 +-
.../{rmsprop/rmsprop.hpp => adam/adam.hpp} | 71 ++++++++++++----------
.../rmsprop_impl.hpp => adam/adam_impl.hpp} | 60 ++++++++++--------
3 files changed, 76 insertions(+), 59 deletions(-)
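For readers of this diff: the update being implemented corresponds to the bias-corrected Adam rule from the Kingma & Ba reference cited in adam.hpp below. In that paper's notation (\f$ \beta_1, \beta_2 \f$ are beta1/beta2, \f$ \alpha \f$ the step size, \f$ \epsilon \f$ the eps parameter, \f$ g_t \f$ the gradient):
\f{eqnarray*}{
 m_t &=& \beta_1 m_{t-1} + (1 - \beta_1) g_t \\
 v_t &=& \beta_2 v_{t-1} + (1 - \beta_2) g_t^2 \\
 \hat{m}_t &=& m_t / (1 - \beta_1^t), \quad \hat{v}_t = v_t / (1 - \beta_2^t) \\
 x_{t+1} &=& x_t - \alpha \hat{m}_t / (\sqrt{\hat{v}_t} + \epsilon)
\f}
The implementation folds both bias corrections into the step size, using \f$ \alpha \sqrt{1 - \beta_2^t} / (1 - \beta_1^t) \f$, which is equivalent up to where eps enters the denominator; the incremental form mean += (1 - beta1) * (gradient - mean) is just an algebraic rewrite of the first line.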
diff --git a/src/mlpack/core/optimizers/rmsprop/CMakeLists.txt b/src/mlpack/core/optimizers/adam/CMakeLists.txt
similarity index 85%
copy from src/mlpack/core/optimizers/rmsprop/CMakeLists.txt
copy to src/mlpack/core/optimizers/adam/CMakeLists.txt
index 75c30c6..3cbcfd8 100644
--- a/src/mlpack/core/optimizers/rmsprop/CMakeLists.txt
+++ b/src/mlpack/core/optimizers/adam/CMakeLists.txt
@@ -1,6 +1,6 @@
set(SOURCES
- rmsprop.hpp
- rmsprop_impl.hpp
+ adam.hpp
+ adam_impl.hpp
)
set(DIR_SRCS)
diff --git a/src/mlpack/core/optimizers/rmsprop/rmsprop.hpp b/src/mlpack/core/optimizers/adam/adam.hpp
similarity index 73%
copy from src/mlpack/core/optimizers/rmsprop/rmsprop.hpp
copy to src/mlpack/core/optimizers/adam/adam.hpp
index 690da6a..fae4362 100644
--- a/src/mlpack/core/optimizers/rmsprop/rmsprop.hpp
+++ b/src/mlpack/core/optimizers/adam/adam.hpp
@@ -1,13 +1,15 @@
/**
- * @file rmsprop.hpp
+ * @file adam.hpp
* @author Ryan Curtin
+ * @author Vasanth Kalingeri
* @author Marcus Edel
*
- * RMSprop optimizer. RmsProp is an optimizer that utilizes the magnitude of
- * recent gradients to normalize the gradients.
+ * Adam optimizer. Adam is an algorithm for first-order gradient-based
+ * optimization of stochastic objective functions, based on adaptive estimates
+ * of lower-order moments.
*/
-#ifndef __MLPACK_CORE_OPTIMIZERS_RMSPROP_RMSPROP_HPP
-#define __MLPACK_CORE_OPTIMIZERS_RMSPROP_RMSPROP_HPP
+#ifndef __MLPACK_CORE_OPTIMIZERS_ADAM_ADAM_HPP
+#define __MLPACK_CORE_OPTIMIZERS_ADAM_ADAM_HPP
#include <mlpack/core.hpp>
@@ -15,28 +17,24 @@ namespace mlpack {
namespace optimization {
/**
- * RMSprop is an optimizer that utilizes the magnitude of recent gradients to
- * normalize the gradients. In its basic form, given a step rate \f$ \gamma \f$
- * and a decay term \f$ \alpha \f$ we perform the following updates:
- *
- * \f{eqnarray*}{
- * r_t &=& (1 - \gamma) f'(\Delta_t)^2 + \gamma r_{t - 1} \\
- * v_{t + 1} &=& \frac{\alpha}{\sqrt{r_t}}f'(\Delta_t) \\
- * \Delta_{t + 1} &=& \Delta_t - v_{t + 1}
- * \f}
+ * Adam is an optimizer that computes individual adaptive learning rates for
+ * different parameters from estimates of first and second moments of the
+ * gradients.
*
* For more information, see the following.
*
* @code
- * @misc{tieleman2012,
- * title={Lecture 6.5 - rmsprop, COURSERA: Neural Networks for Machine
- * Learning},
- * year={2012}
+ * @article{Kingma2014,
+ * author = {Diederik P. Kingma and Jimmy Ba},
+ * title = {Adam: {A} Method for Stochastic Optimization},
+ * journal = {CoRR},
+ * year = {2014}
* }
* @endcode
*
- * For RMSprop to work, a DecomposableFunctionType template parameter is
- * required. This class must implement the following function:
+ *
+ * For Adam to work, a DecomposableFunctionType template parameter is required.
+ * This class must implement the following function:
*
* size_t NumFunctions();
* double Evaluate(const arma::mat& coordinates, const size_t i);
@@ -56,11 +54,11 @@ namespace optimization {
* minimized.
*/
template<typename DecomposableFunctionType>
-class RMSprop
+class Adam
{
public:
/**
- * Construct the RMSprop optimizer with the given function and parameters. The
+ * Construct the Adam optimizer with the given function and parameters. The
* defaults here are not necessarily good for the given problem, so it is
* suggested that the values used be tailored to the task at hand. The
* maximum number of iterations refers to the maximum number of points that
@@ -69,8 +67,8 @@ class RMSprop
*
* @param function Function to be optimized (minimized).
* @param stepSize Step size for each iteration.
- * @param alpha Smoothing constant, similar to that used in AdaDelta and
- * momentum methods.
+ * @param beta1 The first moment coefficient.
+ * @param beta2 The second moment coefficient.
* @param eps Value used to initialise the mean squared gradient parameter.
* @param maxIterations Maximum number of iterations allowed (0 means no
* limit).
@@ -78,16 +76,17 @@ class RMSprop
* @param shuffle If true, the function order is shuffled; otherwise, each
* function is visited in linear order.
*/
- RMSprop(DecomposableFunctionType& function,
+ Adam(DecomposableFunctionType& function,
const double stepSize = 0.01,
- const double alpha = 0.99,
+ const double beta1 = 0.9,
+ const double beta2 = 0.999,
const double eps = 1e-8,
const size_t maxIterations = 100000,
const double tolerance = 1e-5,
const bool shuffle = true);
/**
- * Optimize the given function using RMSprop. The given starting point will be
+ * Optimize the given function using Adam. The given starting point will be
* modified to store the finishing point of the algorithm, and the final
* objective value is returned.
*
@@ -107,9 +106,14 @@ class RMSprop
double& StepSize() { return stepSize; }
//! Get the smoothing parameter.
- double Alpha() const { return alpha; }
+ double Beta1() const { return beta1; }
//! Modify the smoothing parameter.
- double& Alpha() { return alpha; }
+ double& Beta1() { return beta1; }
+
+ //! Get the second moment coefficient.
+ double Beta2() const { return beta2; }
+ //! Modify the second moment coefficient.
+ double& Beta2() { return beta2; }
//! Get the value used to initialise the mean squared gradient parameter.
double Epsilon() const { return eps; }
@@ -138,8 +142,11 @@ class RMSprop
//! The step size for each example.
double stepSize;
- //! The smoothing parameter.
- double alpha;
+ //! The value used as first moment coefficient.
+ double beta1;
+
+ //! The value used as second moment coefficient.
+ double beta2;
//! The value used to initialise the mean squared gradient parameter.
double eps;
@@ -159,6 +166,6 @@ class RMSprop
} // namespace mlpack
// Include implementation.
-#include "rmsprop_impl.hpp"
+#include "adam_impl.hpp"
#endif
diff --git a/src/mlpack/core/optimizers/rmsprop/rmsprop_impl.hpp b/src/mlpack/core/optimizers/adam/adam_impl.hpp
similarity index 61%
copy from src/mlpack/core/optimizers/rmsprop/rmsprop_impl.hpp
copy to src/mlpack/core/optimizers/adam/adam_impl.hpp
index 539fa05..51efa05 100644
--- a/src/mlpack/core/optimizers/rmsprop/rmsprop_impl.hpp
+++ b/src/mlpack/core/optimizers/adam/adam_impl.hpp
@@ -1,30 +1,33 @@
/**
- * @file rmsprop_impl.hpp
+ * @file adam_impl.hpp
* @author Ryan Curtin
+ * @author Vasanth Kalingeri
* @author Marcus Edel
*
- * Implementation of the RMSprop optimizer.
+ * Implementation of the Adam optimizer.
*/
-#ifndef __MLPACK_CORE_OPTIMIZERS_RMSPROP_RMSPROP_IMPL_HPP
-#define __MLPACK_CORE_OPTIMIZERS_RMSPROP_RMSPROP_IMPL_HPP
+#ifndef __MLPACK_CORE_OPTIMIZERS_ADAM_ADAM_IMPL_HPP
+#define __MLPACK_CORE_OPTIMIZERS_ADAM_ADAM_IMPL_HPP
// In case it hasn't been included yet.
-#include "rmsprop.hpp"
+#include "adam.hpp"
namespace mlpack {
namespace optimization {
template<typename DecomposableFunctionType>
-RMSprop<DecomposableFunctionType>::RMSprop(DecomposableFunctionType& function,
- const double stepSize,
- const double alpha,
- const double eps,
- const size_t maxIterations,
- const double tolerance,
- const bool shuffle) :
+Adam<DecomposableFunctionType>::Adam(DecomposableFunctionType& function,
+ const double stepSize,
+ const double beta1,
+ const double beta2,
+ const double eps,
+ const size_t maxIterations,
+ const double tolerance,
+ const bool shuffle) :
function(function),
stepSize(stepSize),
- alpha(alpha),
+ beta1(beta1),
+ beta2(beta2),
eps(eps),
maxIterations(maxIterations),
tolerance(tolerance),
@@ -33,7 +36,7 @@ RMSprop<DecomposableFunctionType>::RMSprop(DecomposableFunctionType& function,
//! Optimize the function (minimize).
template<typename DecomposableFunctionType>
-double RMSprop<DecomposableFunctionType>::Optimize(arma::mat& iterate)
+double Adam<DecomposableFunctionType>::Optimize(arma::mat& iterate)
{
// Find the number of functions to use.
const size_t numFunctions = function.NumFunctions();
@@ -56,9 +59,11 @@ double RMSprop<DecomposableFunctionType>::Optimize(arma::mat& iterate)
// Now iterate!
arma::mat gradient(iterate.n_rows, iterate.n_cols);
- // Leaky sum of squares of parameter gradient.
- arma::mat meanSquaredGradient = arma::zeros<arma::mat>(iterate.n_rows,
- iterate.n_cols);
+ // Exponential moving average of gradient values.
+ arma::mat mean = arma::zeros<arma::mat>(iterate.n_rows, iterate.n_cols);
+
+ // Exponential moving average of squared gradient values.
+ arma::mat variance = arma::zeros<arma::mat>(iterate.n_rows, iterate.n_cols);
for (size_t i = 1; i != maxIterations; ++i, ++currentFunction)
{
@@ -66,12 +71,12 @@ double RMSprop<DecomposableFunctionType>::Optimize(arma::mat& iterate)
if ((currentFunction % numFunctions) == 0)
{
// Output current objective function.
- Log::Info << "RMSprop: iteration " << i << ", objective "
- << overallObjective << "." << std::endl;
+ Log::Info << "Adam: iteration " << i << ", objective " << overallObjective
+ << "." << std::endl;
if (std::isnan(overallObjective) || std::isinf(overallObjective))
{
- Log::Warn << "RMSprop: converged to " << overallObjective
+ Log::Warn << "Adam: converged to " << overallObjective
<< "; terminating with failure. Try a smaller step size?"
<< std::endl;
return overallObjective;
@@ -79,7 +84,7 @@ double RMSprop<DecomposableFunctionType>::Optimize(arma::mat& iterate)
if (std::abs(lastObjective - overallObjective) < tolerance)
{
- Log::Info << "RMSprop: minimized within tolerance " << tolerance << "; "
+ Log::Info << "Adam: minimized within tolerance " << tolerance << "; "
<< "terminating optimization." << std::endl;
return overallObjective;
}
@@ -100,9 +105,14 @@ double RMSprop<DecomposableFunctionType>::Optimize(arma::mat& iterate)
function.Gradient(iterate, currentFunction, gradient);
// And update the iterate.
- meanSquaredGradient *= alpha;
- meanSquaredGradient += (1 - alpha) * (gradient % gradient);
- iterate -= stepSize * gradient / (arma::sqrt(meanSquaredGradient) + eps);
+ mean += (1 - beta1) * (gradient - mean);
+ variance += (1 - beta2) * (gradient % gradient - variance);
+
+ double biasCorrection1 = 1.0 - std::pow(beta1, (double) i);
+ double biasCorrection2 = 1.0 - std::pow(beta2, (double) i);
+
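+ // Fold both corrections into the step size: the effective learning rate
+ // for iteration i is stepSize * std::sqrt(biasCorrection2) / biasCorrection1.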
+ iterate -= (stepSize * std::sqrt(biasCorrection2) / biasCorrection1) *
+ mean / (arma::sqrt(variance) + eps);
// Now add that to the overall objective function.
if (shuffle)
@@ -112,7 +122,7 @@ double RMSprop<DecomposableFunctionType>::Optimize(arma::mat& iterate)
overallObjective += function.Evaluate(iterate, currentFunction);
}
- Log::Info << "RMSprop: maximum iterations (" << maxIterations << ") reached; "
+ Log::Info << "Adam: maximum iterations (" << maxIterations << ") reached; "
<< "terminating optimization." << std::endl;
// Calculate final objective.
overallObjective = 0;
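Not part of the commit, but for anyone who wants to try the new class: a minimal usage sketch against the interface added above. QuadraticFunction and its numbers are invented for illustration; only the Adam constructor defaults and the Optimize() signature come from the diff.

#include <iostream>
#include <mlpack/core.hpp>
#include <mlpack/core/optimizers/adam/adam.hpp>

// Toy decomposable objective: f(x) = sum_i 0.5 * (x - i)^2 for i = 0, 1, 2.
// Only the interface (NumFunctions(), Evaluate(), Gradient()) matters here.
class QuadraticFunction
{
 public:
  size_t NumFunctions() const { return 3; }

  double Evaluate(const arma::mat& coordinates, const size_t i) const
  {
    const double d = coordinates(0) - (double) i;
    return 0.5 * d * d;
  }

  void Gradient(const arma::mat& coordinates,
                const size_t i,
                arma::mat& gradient) const
  {
    gradient.set_size(coordinates.n_rows, coordinates.n_cols);
    gradient(0) = coordinates(0) - (double) i;
  }
};

int main()
{
  QuadraticFunction f;

  // Defaults from this commit: stepSize = 0.01, beta1 = 0.9, beta2 = 0.999,
  // eps = 1e-8, maxIterations = 100000, tolerance = 1e-5, shuffle = true.
  mlpack::optimization::Adam<QuadraticFunction> optimizer(f);

  arma::mat coordinates("10.0");  // 1x1 starting point.
  const double objective = optimizer.Optimize(coordinates);

  // The summed objective is minimized at x = 1 (the mean of 0, 1, 2).
  std::cout << "objective: " << objective << ", x: " << coordinates(0)
      << std::endl;
}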