[mlpack-git] master: Add GlimpseLayer class which takes an input image and a location to extract a retina-like representation of the input image. (be43684)

Fri May 20 15:38:02 EDT 2016

Repository : https://github.com/mlpack/mlpack
On branch  : master
Link       : https://github.com/mlpack/mlpack/compare/986620375ce84cdc75fdfd99f63f17b5c8ee507a...989dd35359ee0c2258616ea57675f639ff47bfaa

>---------------------------------------------------------------

commit be43684ec0507b0b74e4f8a551c80d28e50b3168
Author: Marcus Edel <marcus.edel at fu-berlin.de>
Date:   Wed Apr 20 18:26:56 2016 +0200

    Add GlimpseLayer class which takes an input image and a location to extract a retina-like representation of the input image.


>---------------------------------------------------------------

be43684ec0507b0b74e4f8a551c80d28e50b3168
 src/mlpack/methods/ann/layer/glimpse_layer.hpp | 480 +++++++++++++++++++++++++
 1 file changed, 480 insertions(+)

diff --git a/src/mlpack/methods/ann/layer/glimpse_layer.hpp b/src/mlpack/methods/ann/layer/glimpse_layer.hpp
new file mode 100644
index 0000000..f41a615
--- /dev/null
+++ b/src/mlpack/methods/ann/layer/glimpse_layer.hpp
@@ -0,0 +1,480 @@
+/**
+ * @file glimpse_layer.hpp
+ * @author Marcus Edel
+ *
+ * Definition of the GlimpseLayer class, which takes an input image and a
+ * location to extract a retina-like representation of the input image at
+ * different increasing scales.
+ *
+ * For more information, see the following.
+ *
+ * @code
+ * @article{CoRR2014,
+ *   author  = {Volodymyr Mnih, Nicolas Heess, Alex Graves, Koray Kavukcuoglu},
+ *   title   = {Recurrent Models of Visual Attention},
+ *   journal = {CoRR},
+ *   volume  = {abs/1406.6247},
+ *   year    = {2014},
+ * }
+ * @endcode
+ */
+#ifndef __MLPACK_METHODS_ANN_LAYER_GLIMPSE_LAYER_HPP
+#define __MLPACK_METHODS_ANN_LAYER_GLIMPSE_LAYER_HPP
+
+#include <mlpack/core.hpp>
+#include <mlpack/methods/ann/pooling_rules/mean_pooling.hpp>
+#include <algorithm>
+
+namespace mlpack {
+namespace ann /** Artificial Neural Network. */ {
+
+/**
+ * The glimpse layer returns a retina-like representation
+ * (down-scaled cropped images) of increasing scale around a given location in a
+ * given image.
+ *
+ * @tparam InputDataType Type of the input data (arma::colvec, arma::mat,
+ *         arma::sp_mat or arma::cube).
+ * @tparam OutputDataType Type of the output data (arma::colvec, arma::mat,
+ *         arma::sp_mat or arma::cube).
+ */
+template <
+    typename InputDataType = arma::cube,
+    typename OutputDataType = arma::cube
+>
+class GlimpseLayer
+{
+ public:
+
+  /**
+   * Create the GlimpseLayer object using the specified glimpse size, number of
+   * patches and scaling factor.
+   *
+   * @param inSize The number of input units the input slices are divided into.
+   * @param size The used glimpse size (height = width).
+   * @param depth The number of patches to crop per glimpse.
+   * @param scale The factor by which the patch size grows from one depth level
+   *        to the next.
+   */
+  GlimpseLayer(const size_t inSize,
+               const size_t size,
+               const size_t depth = 3,
+               const size_t scale = 2) :
+      inSize(inSize),
+      size(size),
+      depth(depth),
+      scale(scale)
+  {
+    // Nothing to do here; all parameters are stored by the initializer list.
+  }
+
+  /**
+   * Forward pass: crop `depth` patches of growing size around the stored
+   * location from each input unit and reduce each one to size x size.
+   *
+   * @param input Input cube; its slices are split evenly across inSize units.
+   * @param output Resulting glimpses (size x size x depth * input.n_slices).
+   */
+  template<typename eT>
+  void Forward(const arma::Cube<eT>& input, arma::Cube<eT>& output)
+  {
+    // Zero-fill the output: Pooling() accumulates into it with +=.
+    output = arma::zeros<arma::Cube<eT> >(size, size, depth * input.n_slices);
+    inputDepth = input.n_slices / inSize;
+
+    for (size_t inputIdx = 0; inputIdx < inSize; inputIdx++)
+    {
+      for (size_t depthIdx = 0, glimpseSize = size;
+          depthIdx < depth; depthIdx++, glimpseSize *= scale)
+      {
+        size_t padSize = std::floor((glimpseSize - 1) / 2);
+
+        arma::Cube<eT> inputPadded = arma::zeros<arma::Cube<eT> >(
+            input.n_rows + padSize * 2, input.n_cols + padSize * 2,
+            input.n_slices / inSize);
+
+        inputPadded.tube(padSize, padSize, padSize + input.n_rows - 1,
+            padSize + input.n_cols - 1) = input.subcube(0, 0,
+            inputIdx * inputDepth, input.n_rows - 1, input.n_cols - 1,
+            (inputIdx + 1) * inputDepth - 1);
+
+        size_t h = inputPadded.n_rows - glimpseSize;
+        size_t w = inputPadded.n_cols - glimpseSize;
+
+        size_t x = std::min(h, (size_t) std::max(0.0,
+            (location(0, inputIdx) + 1) / 2.0 * h));
+        size_t y = std::min(w, (size_t) std::max(0.0,
+            (location(1, inputIdx) + 1) / 2.0 * w));
+
+        if (depthIdx == 0)
+        {
+          for (size_t j = (inputIdx + depthIdx), paddedSlice = 0;
+              j < output.n_slices; j += (inSize * depth), paddedSlice++)
+          {
+            output.slice(j) = inputPadded.subcube(x, y,
+                paddedSlice, x + glimpseSize - 1, y + glimpseSize - 1,
+                paddedSlice);
+          }
+        }
+        else
+        {
+          for (size_t j = (inputIdx + depthIdx * (depth - 1)), paddedSlice = 0;
+              j < output.n_slices; j += (inSize * depth), paddedSlice++)
+          {
+            arma::Mat<eT> poolingInput = inputPadded.subcube(x, y,
+                paddedSlice, x + glimpseSize - 1, y + glimpseSize - 1,
+                paddedSlice);
+
+            if (scale == 2)
+            {
+              Pooling(glimpseSize / size, poolingInput, output.slice(j));
+            }
+            else
+            {
+              ReSampling(poolingInput, output.slice(j));
+            }
+          }
+        }
+      }
+    }
+  }
+
+  /**
+   * Backward pass: scatter the backpropagated error into the input geometry.
+   *
+   * @param input Cube whose dimensions define the shape of the mapped error.
+   * @param gy Backpropagated error; each column holds flattened error slices.
+   * @param g Resulting gradient, shaped like the stored input parameter.
+   */
+  template<typename InputType, typename ErrorType, typename eT>
+  void Backward(const InputType& input,
+                const ErrorType& gy,
+                arma::Cube<eT>& g)
+  {
+    // Generate a cube using the backpropagated error matrix.
+    arma::Cube<eT> mappedError = arma::zeros<arma::Cube<eT> >(input.n_rows,
+        input.n_cols, input.n_slices);
+
+    for (size_t s = 0, j = 0; s < mappedError.n_slices; s += gy.n_cols, j++)
+    {
+      for (size_t i = 0; i < gy.n_cols; i++)
+      {
+        arma::Col<eT> temp = gy.col(i).subvec(
+            j * input.n_rows * input.n_cols,
+            (j + 1) * input.n_rows * input.n_cols - 1);
+
+        mappedError.slice(s + i) = arma::Mat<eT>(temp.memptr(),
+            input.n_rows, input.n_cols);
+      }
+    }
+
+    g = arma::zeros<arma::Cube<eT> >(inputParameter.n_rows,
+        inputParameter.n_cols, inputParameter.n_slices);
+
+    for (size_t inputIdx = 0; inputIdx < inSize; inputIdx++)
+    {
+      for (size_t depthIdx = 0, glimpseSize = size;
+          depthIdx < depth; depthIdx++, glimpseSize *= scale)
+      {
+        size_t padSize = std::floor((glimpseSize - 1) / 2);
+
+        arma::Cube<eT> inputPadded = arma::zeros<arma::Cube<eT> >(
+            inputParameter.n_rows + padSize * 2, inputParameter.n_cols +
+            padSize * 2, inputParameter.n_slices / inSize);
+
+        size_t h = inputPadded.n_rows - glimpseSize;
+        size_t w = inputPadded.n_cols - glimpseSize;
+
+        size_t x = std::min(h, (size_t) std::max(0.0,
+            (location(0, inputIdx) + 1) / 2.0 * h));
+        size_t y = std::min(w, (size_t) std::max(0.0,
+            (location(1, inputIdx) + 1) / 2.0 * w));
+
+        if (depthIdx == 0)
+        {
+          for (size_t j = (inputIdx + depthIdx), paddedSlice = 0;
+              j < mappedError.n_slices; j += (inSize * depth), paddedSlice++)
+          {
+            inputPadded.subcube(x, y,
+            paddedSlice, x + glimpseSize - 1, y + glimpseSize - 1,
+            paddedSlice) = mappedError.slice(j);
+          }
+        }
+        else
+        {
+          for (size_t j = (inputIdx + depthIdx * (depth - 1)), paddedSlice = 0;
+              j < mappedError.n_slices; j += (inSize * depth), paddedSlice++)
+          {
+            arma::Mat<eT> poolingOutput = inputPadded.subcube(x, y,
+                 paddedSlice, x + glimpseSize - 1, y + glimpseSize - 1,
+                 paddedSlice);
+
+            if (scale == 2)
+            {
+              Unpooling(inputParameter.slice(paddedSlice), mappedError.slice(j),
+                  poolingOutput);
+            }
+            else
+            {
+              DownwardReSampling(inputParameter.slice(paddedSlice),
+                  mappedError.slice(j), poolingOutput);
+            }
+
+            inputPadded.subcube(x, y,
+                paddedSlice, x + glimpseSize - 1, y + glimpseSize - 1,
+                paddedSlice) = poolingOutput;
+          }
+        }
+
+        g += inputPadded.tube(padSize, padSize, padSize +
+            inputParameter.n_rows - 1, padSize + inputParameter.n_cols - 1);
+      }
+    }
+    // Transform() is only overloaded for arma::mat/arma::cube (eT = double).
+    Transform(g);
+  }
+
+  //! Get the input parameter.
+  const InputDataType& InputParameter() const { return inputParameter; }
+  //! Modify the input parameter.
+  InputDataType& InputParameter() { return inputParameter; }
+
+  //! Get the output parameter.
+  const OutputDataType& OutputParameter() const { return outputParameter; }
+  //! Modify the output parameter.
+  OutputDataType& OutputParameter() { return outputParameter; }
+
+  //! Get the delta.
+  const OutputDataType& Delta() const { return delta; }
+  //! Modify the delta.
+  OutputDataType& Delta() { return delta; }
+
+  //! Set the location, i.e. the x and y coordinate of the center of the output
+  //! glimpse.
+  void Location(const arma::mat& location)
+  {
+    // Log::Debug << "location: " << location.t() << std::endl;
+    this->location = location;
+  }
+
+ private:
+  /**
+   * Transform the given matrix in place: element i (linear index) of the
+   * original is written to row i / n_cols, column i % n_cols.
+   *
+   * @param w The matrix to transform; it is overwritten with the result.
+   */
+  void Transform(arma::mat& w)
+  {
+    arma::mat t = w;
+    // Read from the copy so the in-place writes do not disturb the source.
+    for (size_t i = 0, k = 0; i < w.n_elem; k++)
+    {
+      for (size_t j = 0; j < w.n_cols; j++, i++)
+      {
+        w(k, j) = t(i);
+      }      
+    }
+  }
+
+  /**
+   * Apply the matrix version of Transform() to every slice of the given cube.
+   *
+   * @param w The cube whose slices are transformed in place.
+   */
+  void Transform(arma::cube& w)
+  {
+    for (size_t i = 0; i < w.n_slices; i++)
+    {
+      arma::mat t = w.slice(i);
+      Transform(t);
+      w.slice(i) = t;
+    }
+  }
+
+  /**
+   * Pool kSize x kSize blocks of the input and add each result to the output.
+   *
+   * @param kSize The kernel size (block edge and step) used for pooling.
+   * @param input The input to apply the pooling rule to.
+   * @param output The matrix the pooled values are accumulated into.
+   */
+  template<typename eT>
+  void Pooling(const size_t kSize,
+               const arma::Mat<eT>& input,
+               arma::Mat<eT>& output)
+  {
+    // Results are added with +=, so output must already hold start values.
+    const size_t rStep = kSize;
+    const size_t cStep = kSize;
+
+    for (size_t j = 0; j < input.n_cols; j += cStep)
+    {
+      for (size_t i = 0; i < input.n_rows; i += rStep)
+      {
+        output(i / rStep, j / cStep) += pooling.Pooling(
+            input(arma::span(i, i + rStep - 1), arma::span(j, j + cStep - 1)));
+      }
+    }
+  }
+
+  /**
+   * Unpool the error over blocks of the input and add the results to output.
+   *
+   * @param input The input to apply the unpooling rule to.
+   * @param error The error used to perform the unpooling operation.
+   * @param output The matrix the unpooled error is accumulated into.
+   */
+  template<typename eT>
+  void Unpooling(const arma::Mat<eT>& input,
+                 const arma::Mat<eT>& error,
+                 arma::Mat<eT>& output)
+  {
+    const size_t rStep = input.n_rows / error.n_rows;
+    const size_t cStep = input.n_cols / error.n_cols;
+
+    arma::Mat<eT> unpooledError;
+    for (size_t j = 0; j < input.n_cols; j += cStep)
+    {
+      for (size_t i = 0; i < input.n_rows; i += rStep)
+      {
+        const arma::Mat<eT>& inputArea = input(arma::span(i, i + rStep - 1),
+                                               arma::span(j, j + cStep - 1));
+
+        pooling.Unpooling(inputArea, error(i / rStep, j / cStep),
+            unpooledError);
+
+        output(arma::span(i, i + rStep - 1),
+            arma::span(j, j + cStep - 1)) += unpooledError;
+      }
+    }
+  }
+
+  /**
+   * Resample the input to size x size using a weighted sum of the four
+   * nearest neighbors and store the result in the output parameter.
+   *
+   * @param input The input to apply the ReSampling rule to.
+   * @param output The resampled result.
+   */
+  template<typename eT>
+  void ReSampling(const arma::Mat<eT>& input, arma::Mat<eT>& output)
+  {
+    double wRatio = (double) (input.n_rows - 1) / (size - 1);
+    double hRatio = (double) (input.n_cols - 1) / (size - 1);
+    // NOTE(review): both ratios divide by (size - 1), i.e. by 0 if size == 1.
+    double iWidth = input.n_rows - 1;
+    double iHeight = input.n_cols - 1;
+
+    for (size_t y = 0; y < size; y++)
+    {
+      for (size_t x = 0; x < size; x++)
+      {
+        double ix = wRatio * x;
+        double iy = hRatio * y;
+
+        // Get the 4 nearest neighbors.
+        double ixNw = std::floor(ix);
+        double iyNw = std::floor(iy);
+        double ixNe = ixNw + 1;
+        double iySw = iyNw + 1;
+
+        // Get surfaces to each neighbor.
+        double se = (ix - ixNw) * (iy - iyNw);
+        double sw = (ixNe - ix) * (iy - iyNw);
+        double ne = (ix - ixNw) * (iySw - iy);
+        double nw = (ixNe - ix) * (iySw - iy);
+
+        // Calculate the weighted sum; indices are clamped to the last row/col.
+        output(y, x) = input(iyNw, ixNw) * nw + 
+            input(iyNw, std::min(ixNe,  iWidth)) * ne +
+            input(std::min(iySw, iHeight), ixNw) * sw + 
+            input(std::min(iySw, iHeight), std::min(ixNe, iWidth)) * se;
+      }
+    }
+  }
+
+  /**
+   * Distribute each error value over the four nearest neighbors of its
+   * resampling position and add the weighted contributions to the output.
+   *
+   * @param input The input; only its dimensions are used here.
+   * @param error The error used to perform the DownwardReSampling operation.
+   * @param output The matrix the weighted error is accumulated into.
+   */
+  template<typename eT>
+  void DownwardReSampling(const arma::Mat<eT>& input,
+                          const arma::Mat<eT>& error,
+                          arma::Mat<eT>& output)
+  {
+    double iWidth = input.n_rows - 1;
+    double iHeight = input.n_cols - 1;
+
+    double wRatio = iWidth / (size - 1);
+    double hRatio = iHeight / (size - 1);    
+
+    for (size_t y = 0; y < size; y++)
+    {
+      for (size_t x = 0; x < size; x++)
+      {
+        double ix = wRatio * x;
+        double iy = hRatio * y;
+
+        // Get the 4 nearest neighbors.
+        double ixNw = std::floor(ix);
+        double iyNw = std::floor(iy);
+        double ixNe = ixNw + 1;
+        double iySw = iyNw + 1;
+
+        // Get surfaces to each neighbor.
+        double se = (ix - ixNw) * (iy - iyNw);
+        double sw = (ixNe - ix) * (iy - iyNw);
+        double ne = (ix - ixNw) * (iySw - iy);
+        double nw = (ixNe - ix) * (iySw - iy);
+
+        double ograd = error(y, x);
+
+        output(iyNw, ixNw) = output(iyNw, ixNw) + nw * ograd;
+        output(iyNw, std::min(ixNe, iWidth)) = output(iyNw,
+            std::min(ixNe, iWidth)) + ne * ograd;
+        output(std::min(iySw, iHeight), ixNw) = output(std::min(iySw, iHeight),
+            ixNw) + sw * ograd;
+        output(std::min(iySw, iHeight), std::min(ixNe, iWidth)) = output(
+            std::min(iySw, iHeight), std::min(ixNe, iWidth)) + se * ograd;
+      }
+    }
+  }
+
+  //! Locally-stored delta object.
+  OutputDataType delta;
+
+  //! Locally-stored input parameter object.
+  InputDataType inputParameter;
+
+  //! Locally-stored output parameter object.
+  OutputDataType outputParameter;
+
+  //! Locally-stored depth of the input.
+  size_t inputDepth;
+
+  //! The size of the input units.
+  size_t inSize;
+
+  //! The used glimpse size (height = width).
+  size_t size;
+
+  //! The number of patches to crop per glimpse.
+  size_t depth;
+
+  //! The scale fraction.
+  size_t scale;
+
+  //! The x and y coordinate of the center of the output glimpse.
+  arma::mat location;
+
+  //! Locally-stored object to perform the mean pooling operation.
+  MeanPooling pooling;
+}; // class GlimpseLayer
+
+}; // namespace ann
+}; // namespace mlpack
+
+#endif