[mlpack-git] master: Add ADAM optimizer. (68157b9)
gitdub at big.cc.gt.atl.ga.us
Thu Oct 15 06:21:23 EDT 2015
Repository : https://github.com/mlpack/mlpack
On branch : master
Link : https://github.com/mlpack/mlpack/compare/90f9f8f270c743ecc38542bb5b04aae83608bc9c...68157b9f819db2636ace8348840e8588e7bc7bc2
>---------------------------------------------------------------
commit 68157b9f819db2636ace8348840e8588e7bc7bc2
Author: Marcus Edel <marcus.edel at fu-berlin.de>
Date: Thu Oct 15 12:21:17 2015 +0200
Add ADAM optimizer.
>---------------------------------------------------------------
68157b9f819db2636ace8348840e8588e7bc7bc2
src/mlpack/methods/ann/optimizer/adam.hpp | 177 ++++++++++++++++++++++++++++++
1 file changed, 177 insertions(+)
diff --git a/src/mlpack/methods/ann/optimizer/adam.hpp b/src/mlpack/methods/ann/optimizer/adam.hpp
new file mode 100644
index 0000000..8749469
--- /dev/null
+++ b/src/mlpack/methods/ann/optimizer/adam.hpp
@@ -0,0 +1,177 @@
+/**
+ * @file adam.hpp
+ * @author Marcus Edel
+ *
+ * Implementation of the Adam optimizer. Adam is an algorithm for first-
+ * order gradient-based optimization of stochastic objective functions, based on
+ * adaptive estimates of lower-order moments.
+ */
+#ifndef __MLPACK_METHODS_ANN_OPTIMIZER_ADAM_HPP
+#define __MLPACK_METHODS_ANN_OPTIMIZER_ADAM_HPP
+
+#include <mlpack/core.hpp>
+
+namespace mlpack {
+namespace ann /** Artificial Neural Network. */ {
+
+/**
+ * Adam is an optimizer that computes individual adaptive learning rates for
+ * different parameters from estimates of first and second moments of the
+ * gradients.
+ *
+ * For more information, see the following.
+ *
+ * @code
+ * @article{Kingma2014,
+ * author = {Diederik P. Kingma and Jimmy Ba},
+ * title = {Adam: {A} Method for Stochastic Optimization},
+ * journal = {CoRR},
+ * year = {2014}
+ * }
+ * @endcode
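+ *
+ * Written out, each update step of this implementation computes
+ *
+ *   mean     = beta1 * mean     + (1 - beta1) * gradient
+ *   variance = beta2 * variance + (1 - beta2) * gradient^2  (element-wise)
+ *   weights  = weights - lr * mean / (sqrt(variance) + eps)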
+ */
+template<typename DecomposableFunctionType, typename DataType>
+class Adam
+{
+ public:
+ /**
+ * Construct the Adam optimizer with the given function and parameters.
+ *
+ * @param function Function to be optimized (minimized).
+ * @param lr The learning rate coefficient.
+ * @param beta1 The first moment coefficient.
+ * @param beta2 The second moment coefficient.
+ * @param eps The epsilon value used to avoid division by zero (numerical
+ *        stability).
+ */
+ Adam(DecomposableFunctionType& function,
+ const double lr = 0.001,
+ const double beta1 = 0.9,
+ const double beta2 = 0.999,
+ const double eps = 1e-8) :
+ function(function),
+ lr(lr),
+ beta1(beta1),
+ beta2(beta2),
+ eps(eps)
+ {
+ // Nothing to do here.
+ }
+
+ /**
+ * Optimize the given function using Adam.
+ */
+ void Optimize()
+ {
+ if (mean.n_elem == 0)
+ {
+ mean = function.Weights();
+ mean.zeros();
+
+ variance = mean;
+ }
+
+ Optimize(function.Weights(), gradient, mean, variance);
+ }
+
+ /*
+ * Sum up all gradients and store the result in the gradient storage.
+ */
+ void Update()
+ {
+ if (gradient.n_elem != 0)
+ {
+ gradient += function.Gradient();
+ }
+ else
+ {
+ gradient = function.Gradient();
+ }
+ }
+
+ /*
+ * Reset the gradient storage.
+ */
+ void Reset()
+ {
+ gradient.zeros();
+ }
+
+ //! Get the gradient.
+ const DataType& Gradient() const { return gradient; }
+ //! Modify the gradient.
+ DataType& Gradient() { return gradient; }
+
+ private:
+ /**
+ * Optimize the given weights using Adam, slice by slice.
+ *
+ * @param weights The weights that should be updated.
+ * @param gradient The gradient used to update the weights.
+ * @param mean The current mean parameter.
+ * @param variance The current variance parameter.
+ */
+ template<typename eT>
+ void Optimize(arma::Cube<eT>& weights,
+ arma::Cube<eT>& gradient,
+ arma::Cube<eT>& mean,
+ arma::Cube<eT>& variance)
+ {
+ for (size_t s = 0; s < weights.n_slices; s++)
+ {
+ Optimize(weights.slice(s), gradient.slice(s), mean.slice(s),
+ variance.slice(s));
+ }
+ }
+
+ /**
+ * Optimize the given weights using Adam.
+ *
+ * @param weights The weights that should be updated.
+ * @param gradient The gradient used to update the weights.
+ * @param mean The current mean parameter.
+ * @param variance The current variance parameter.
+ */
+ template<typename eT>
+ void Optimize(arma::Mat<eT>& weights,
+ arma::Mat<eT>& gradient,
+ arma::Mat<eT>& mean,
+ arma::Mat<eT>& variance)
+ {
+ // Accumulate updates.
+ mean += (1 - beta1) * (gradient - mean);
+ variance += (1 - beta2) * (gradient % gradient - variance);
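+
+ // Note: mean and variance are exponential moving averages of the gradient
+ // and the element-wise squared gradient; the bias-correction terms from
+ // the Adam paper are not applied here.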
+
+ // Apply update.
+ weights -= lr * mean / (arma::sqrt(variance) + eps);
+ }
+
+ //! The instantiated function.
+ DecomposableFunctionType& function;
+
+ //! The value used as the learning rate.
+ const double lr;
+
+ //! The value used as the first moment coefficient.
+ const double beta1;
+
+ //! The value used as the second moment coefficient.
+ const double beta2;
+
+ //! The value used for numerical stability (eps).
+ const double eps;
+
+ //! The current gradient.
+ DataType gradient;
+
+ //! The current mean parameter.
+ DataType mean;
+
+ //! The current variance parameter.
+ DataType variance;
+}; // class Adam
+
+} // namespace ann
+} // namespace mlpack
+
+#endif
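
For context, a minimal sketch of how this class could be driven. The
QuadraticFunction type below is hypothetical and not part of mlpack; it only
illustrates the interface Adam actually relies on, a Weights() accessor
returning a reference to the parameters and a Gradient() function, with
DataType = arma::mat:

#include <mlpack/core.hpp>
#include <mlpack/methods/ann/optimizer/adam.hpp>

// Hypothetical decomposable function: f(w) = 0.5 * ||w||^2, whose
// gradient at w is simply w.
struct QuadraticFunction
{
  //! The weights being optimized; Adam updates these in place.
  arma::mat& Weights() { return weights; }

  //! Gradient of f at the current weights.
  arma::mat Gradient() const { return weights; }

  arma::mat weights = arma::randu<arma::mat>(3, 3);
};

int main()
{
  QuadraticFunction f;
  mlpack::ann::Adam<QuadraticFunction, arma::mat> opt(f);

  for (size_t i = 0; i < 1000; ++i)
  {
    opt.Update();    // Accumulate the current gradient.
    opt.Optimize();  // Apply one Adam step to f.Weights().
    opt.Reset();     // Clear the gradient storage for the next step.
  }

  // The weights should now be close to zero.
  f.Weights().print("optimized weights:");
}

The Update()/Optimize()/Reset() cycle above follows the lifecycle suggested
by the methods in the header: gradients are accumulated with Update(),
applied with Optimize(), and cleared with Reset().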