[mlpack-git] master: Add RMSprop implementation. (da1207a)
gitdub at mlpack.org
gitdub at mlpack.org
Fri Feb 19 08:22:53 EST 2016
Repository : https://github.com/mlpack/mlpack
On branch : master
Link : https://github.com/mlpack/mlpack/compare/f6dd2f7a9752a7db8ec284a938b3e84a13d0bfb2...6205f3e0b62b56452b2a4afc4da24fce5b21e72f
>---------------------------------------------------------------
commit da1207a9e6407e8350d9904964402b0b08513a0a
Author: marcus <marcus.edel at fu-berlin.de>
Date: Tue Feb 16 21:53:58 2016 +0100
Add RMSprop implementation.
>---------------------------------------------------------------
da1207a9e6407e8350d9904964402b0b08513a0a
src/mlpack/core/optimizers/CMakeLists.txt | 1 +
.../{minibatch_sgd => rmsprop}/CMakeLists.txt | 4 +-
.../{sgd/sgd.hpp => rmsprop/rmsprop.hpp} | 99 ++++++++++++----------
.../{sgd/sgd_impl.hpp => rmsprop/rmsprop_impl.hpp} | 50 ++++++-----
4 files changed, 88 insertions(+), 66 deletions(-)
diff --git a/src/mlpack/core/optimizers/CMakeLists.txt b/src/mlpack/core/optimizers/CMakeLists.txt
index 13731a6..3e16f02 100644
--- a/src/mlpack/core/optimizers/CMakeLists.txt
+++ b/src/mlpack/core/optimizers/CMakeLists.txt
@@ -2,6 +2,7 @@ set(DIRS
aug_lagrangian
lbfgs
minibatch_sgd
+ rmsprop
sa
sdp
sgd
diff --git a/src/mlpack/core/optimizers/minibatch_sgd/CMakeLists.txt b/src/mlpack/core/optimizers/rmsprop/CMakeLists.txt
similarity index 80%
copy from src/mlpack/core/optimizers/minibatch_sgd/CMakeLists.txt
copy to src/mlpack/core/optimizers/rmsprop/CMakeLists.txt
index e88c3ed..75c30c6 100644
--- a/src/mlpack/core/optimizers/minibatch_sgd/CMakeLists.txt
+++ b/src/mlpack/core/optimizers/rmsprop/CMakeLists.txt
@@ -1,6 +1,6 @@
set(SOURCES
- minibatch_sgd.hpp
- minibatch_sgd_impl.hpp
+ rmsprop.hpp
+ rmsprop_impl.hpp
)
set(DIR_SRCS)
diff --git a/src/mlpack/core/optimizers/sgd/sgd.hpp b/src/mlpack/core/optimizers/rmsprop/rmsprop.hpp
similarity index 61%
copy from src/mlpack/core/optimizers/sgd/sgd.hpp
copy to src/mlpack/core/optimizers/rmsprop/rmsprop.hpp
index 5572f10..690da6a 100644
--- a/src/mlpack/core/optimizers/sgd/sgd.hpp
+++ b/src/mlpack/core/optimizers/rmsprop/rmsprop.hpp
@@ -1,11 +1,13 @@
/**
- * @file sgd.hpp
+ * @file rmsprop.hpp
* @author Ryan Curtin
+ * @author Marcus Edel
*
- * Stochastic Gradient Descent (SGD).
+ * RMSprop optimizer. RMSprop is an optimizer that utilizes the magnitude of
+ * recent gradients to normalize the gradients.
*/
-#ifndef __MLPACK_CORE_OPTIMIZERS_SGD_SGD_HPP
-#define __MLPACK_CORE_OPTIMIZERS_SGD_SGD_HPP
+#ifndef __MLPACK_CORE_OPTIMIZERS_RMSPROP_RMSPROP_HPP
+#define __MLPACK_CORE_OPTIMIZERS_RMSPROP_RMSPROP_HPP
#include <mlpack/core.hpp>
@@ -13,42 +15,28 @@ namespace mlpack {
namespace optimization {
/**
- * Stochastic Gradient Descent is a technique for minimizing a function which
- * can be expressed as a sum of other functions. That is, suppose we have
+ * RMSprop is an optimizer that utilizes the magnitude of recent gradients to
+ * normalize the gradients. In its basic form, given a step rate \f$ \gamma \f$
+ * and a decay term \f$ \alpha \f$ we perform the following updates:
*
- * \f[
- * f(A) = \sum_{i = 0}^{n} f_i(A)
- * \f]
+ * \f{eqnarray*}{
+ * r_t &=& (1 - \gamma) f'(\Delta_t)^2 + \gamma r_{t - 1} \\
+ * v_{t + 1} &=& \frac{\alpha}{\sqrt{r_t}}f'(\Delta_t) \\
+ * \Delta_{t + 1} &=& \Delta_t - v_{t + 1}
+ * \f}
*
- * and our task is to minimize \f$ A \f$. Stochastic gradient descent iterates
- * over each function \f$ f_i(A) \f$, producing the following update scheme:
+ * For more information, see the following.
*
- * \f[
- * A_{j + 1} = A_j + \alpha \nabla f_i(A)
- * \f]
+ * @code
+ * @misc{tieleman2012,
+ * title={Lecture 6.5 - rmsprop, COURSERA: Neural Networks for Machine
+ * Learning},
+ * year={2012}
+ * }
+ * @endcode
*
- * where \f$ \alpha \f$ is a parameter which specifies the step size. \f$ i \f$
- * is chosen according to \f$ j \f$ (the iteration number). The SGD class
- * supports either scanning through each of the \f$ n \f$ functions \f$ f_i(A)
- * \f$ linearly, or in a random sequence. The algorithm continues until \f$ j
- * \f$ reaches the maximum number of iterations---or when a full sequence of
- * updates through each of the \f$ n \f$ functions \f$ f_i(A) \f$ produces an
- * improvement within a certain tolerance \f$ \epsilon \f$. That is,
- *
- * \f[
- * | f(A_{j + n}) - f(A_j) | < \epsilon.
- * \f]
- *
- * The parameter \f$\epsilon\f$ is specified by the tolerance parameter to the
- * constructor; \f$n\f$ is specified by the maxIterations parameter.
- *
- * This class is useful for data-dependent functions whose objective function
- * can be expressed as a sum of objective functions operating on an individual
- * point. Then, SGD considers the gradient of the objective function operating
- * on an individual point in its update of \f$ A \f$.
- *
- * For SGD to work, a DecomposableFunctionType template parameter is required.
- * This class must implement the following function:
+ * For RMSprop to work, a DecomposableFunctionType template parameter is
+ * required. This class must implement the following function:
*
* size_t NumFunctions();
* double Evaluate(const arma::mat& coordinates, const size_t i);
@@ -68,11 +56,11 @@ namespace optimization {
* minimized.
*/
template<typename DecomposableFunctionType>
-class SGD
+class RMSprop
{
public:
/**
- * Construct the SGD optimizer with the given function and parameters. The
+ * Construct the RMSprop optimizer with the given function and parameters. The
* defaults here are not necessarily good for the given problem, so it is
* suggested that the values used be tailored to the task at hand. The
* maximum number of iterations refers to the maximum number of points that
@@ -81,22 +69,27 @@ class SGD
*
* @param function Function to be optimized (minimized).
* @param stepSize Step size for each iteration.
+ * @param alpha Smoothing constant, similar to that used in AdaDelta and
+ * momentum methods.
+ * @param eps Value used to initialise the mean squared gradient parameter.
* @param maxIterations Maximum number of iterations allowed (0 means no
- * limit).
+ * limit).
* @param tolerance Maximum absolute tolerance to terminate algorithm.
* @param shuffle If true, the function order is shuffled; otherwise, each
- * function is visited in linear order.
+ * function is visited in linear order.
*/
- SGD(DecomposableFunctionType& function,
+ RMSprop(DecomposableFunctionType& function,
const double stepSize = 0.01,
+ const double alpha = 0.99,
+ const double eps = 1e-8,
const size_t maxIterations = 100000,
const double tolerance = 1e-5,
const bool shuffle = true);
/**
- * Optimize the given function using stochastic gradient descent. The given
- * starting point will be modified to store the finishing point of the
- * algorithm, and the final objective value is returned.
+ * Optimize the given function using RMSprop. The given starting point will be
+ * modified to store the finishing point of the algorithm, and the final
+ * objective value is returned.
*
* @param iterate Starting point (will be modified).
* @return Objective value of the final point.
@@ -113,6 +106,16 @@ class SGD
//! Modify the step size.
double& StepSize() { return stepSize; }
+ //! Get the smoothing parameter.
+ double Alpha() const { return alpha; }
+ //! Modify the smoothing parameter.
+ double& Alpha() { return alpha; }
+
+ //! Get the value used to initialise the mean squared gradient parameter.
+ double Epsilon() const { return eps; }
+ //! Modify the value used to initialise the mean squared gradient parameter.
+ double& Epsilon() { return eps; }
+
//! Get the maximum number of iterations (0 indicates no limit).
size_t MaxIterations() const { return maxIterations; }
//! Modify the maximum number of iterations (0 indicates no limit).
@@ -135,6 +138,12 @@ class SGD
//! The step size for each example.
double stepSize;
+ //! The smoothing parameter.
+ double alpha;
+
+ //! The value used to initialise the mean squared gradient parameter.
+ double eps;
+
//! The maximum number of allowed iterations.
size_t maxIterations;
@@ -150,6 +159,6 @@ class SGD
} // namespace mlpack
// Include implementation.
-#include "sgd_impl.hpp"
+#include "rmsprop_impl.hpp"
#endif
diff --git a/src/mlpack/core/optimizers/sgd/sgd_impl.hpp b/src/mlpack/core/optimizers/rmsprop/rmsprop_impl.hpp
similarity index 62%
copy from src/mlpack/core/optimizers/sgd/sgd_impl.hpp
copy to src/mlpack/core/optimizers/rmsprop/rmsprop_impl.hpp
index d95624a..539fa05 100644
--- a/src/mlpack/core/optimizers/sgd/sgd_impl.hpp
+++ b/src/mlpack/core/optimizers/rmsprop/rmsprop_impl.hpp
@@ -1,27 +1,31 @@
/**
- * @file sgd_impl.hpp
+ * @file rmsprop_impl.hpp
* @author Ryan Curtin
+ * @author Marcus Edel
*
- * Implementation of stochastic gradient descent.
+ * Implementation of the RMSprop optimizer.
*/
-#ifndef __MLPACK_CORE_OPTIMIZERS_SGD_SGD_IMPL_HPP
-#define __MLPACK_CORE_OPTIMIZERS_SGD_SGD_IMPL_HPP
+#ifndef __MLPACK_CORE_OPTIMIZERS_RMSPROP_RMSPROP_IMPL_HPP
+#define __MLPACK_CORE_OPTIMIZERS_RMSPROP_RMSPROP_IMPL_HPP
-#include <mlpack/methods/regularized_svd/regularized_svd_function.hpp>
// In case it hasn't been included yet.
-#include "sgd.hpp"
+#include "rmsprop.hpp"
namespace mlpack {
namespace optimization {
template<typename DecomposableFunctionType>
-SGD<DecomposableFunctionType>::SGD(DecomposableFunctionType& function,
- const double stepSize,
- const size_t maxIterations,
- const double tolerance,
- const bool shuffle) :
+RMSprop<DecomposableFunctionType>::RMSprop(DecomposableFunctionType& function,
+ const double stepSize,
+ const double alpha,
+ const double eps,
+ const size_t maxIterations,
+ const double tolerance,
+ const bool shuffle) :
function(function),
stepSize(stepSize),
+ alpha(alpha),
+ eps(eps),
maxIterations(maxIterations),
tolerance(tolerance),
shuffle(shuffle)
@@ -29,7 +33,7 @@ SGD<DecomposableFunctionType>::SGD(DecomposableFunctionType& function,
//! Optimize the function (minimize).
template<typename DecomposableFunctionType>
-double SGD<DecomposableFunctionType>::Optimize(arma::mat& iterate)
+double RMSprop<DecomposableFunctionType>::Optimize(arma::mat& iterate)
{
// Find the number of functions to use.
const size_t numFunctions = function.NumFunctions();
@@ -51,25 +55,31 @@ double SGD<DecomposableFunctionType>::Optimize(arma::mat& iterate)
// Now iterate!
arma::mat gradient(iterate.n_rows, iterate.n_cols);
+
+ // Leaky sum of squares of parameter gradient.
+ arma::mat meanSquaredGradient = arma::zeros<arma::mat>(iterate.n_rows,
+ iterate.n_cols);
+
for (size_t i = 1; i != maxIterations; ++i, ++currentFunction)
{
// Is this iteration the start of a sequence?
if ((currentFunction % numFunctions) == 0)
{
// Output current objective function.
- Log::Info << "SGD: iteration " << i << ", objective " << overallObjective
- << "." << std::endl;
+ Log::Info << "RMSprop: iteration " << i << ", objective "
+ << overallObjective << "." << std::endl;
if (std::isnan(overallObjective) || std::isinf(overallObjective))
{
- Log::Warn << "SGD: converged to " << overallObjective << "; terminating"
- << " with failure. Try a smaller step size?" << std::endl;
+ Log::Warn << "RMSprop: converged to " << overallObjective
+ << "; terminating with failure. Try a smaller step size?"
+ << std::endl;
return overallObjective;
}
if (std::abs(lastObjective - overallObjective) < tolerance)
{
- Log::Info << "SGD: minimized within tolerance " << tolerance << "; "
+ Log::Info << "RMSprop: minimized within tolerance " << tolerance << "; "
<< "terminating optimization." << std::endl;
return overallObjective;
}
@@ -90,7 +100,9 @@ double SGD<DecomposableFunctionType>::Optimize(arma::mat& iterate)
function.Gradient(iterate, currentFunction, gradient);
// And update the iterate.
- iterate -= stepSize * gradient;
+ meanSquaredGradient *= alpha;
+ meanSquaredGradient += (1 - alpha) * (gradient % gradient);
+ iterate -= stepSize * gradient / (arma::sqrt(meanSquaredGradient) + eps);
// Now add that to the overall objective function.
if (shuffle)
@@ -100,7 +112,7 @@ double SGD<DecomposableFunctionType>::Optimize(arma::mat& iterate)
overallObjective += function.Evaluate(iterate, currentFunction);
}
- Log::Info << "SGD: maximum iterations (" << maxIterations << ") reached; "
+ Log::Info << "RMSprop: maximum iterations (" << maxIterations << ") reached; "
<< "terminating optimization." << std::endl;
// Calculate final objective.
overallObjective = 0;
More information about the mlpack-git
mailing list