[mlpack-git] master: Allow mini-batch SGD optimizer. (6e8e873)
gitdub at mlpack.org
gitdub at mlpack.org
Mon Feb 22 13:32:56 EST 2016
Repository : https://github.com/mlpack/mlpack
On branch : master
Link : https://github.com/mlpack/mlpack/compare/29dc196be362c8af44f36f331ac719d5a0e34acd...f3d692c0124e9667076b97318f6d64661015d368
>---------------------------------------------------------------
commit 6e8e87311b91eaf8b5fdc805726dc0639f083c90
Author: Ryan Curtin <ryan at ratml.org>
Date: Mon Feb 22 10:32:34 2016 -0800
Allow mini-batch SGD optimizer.
>---------------------------------------------------------------
6e8e87311b91eaf8b5fdc805726dc0639f083c90
src/mlpack/methods/nca/nca_main.cpp | 94 +++++++++++++++++++++++++------------
1 file changed, 63 insertions(+), 31 deletions(-)
diff --git a/src/mlpack/methods/nca/nca_main.cpp b/src/mlpack/methods/nca/nca_main.cpp
index 0da3b56..de16626 100644
--- a/src/mlpack/methods/nca/nca_main.cpp
+++ b/src/mlpack/methods/nca/nca_main.cpp
@@ -10,6 +10,7 @@
#include "nca.hpp"
#include <mlpack/core/optimizers/lbfgs/lbfgs.hpp>
+#include <mlpack/core/optimizers/minibatch_sgd/minibatch_sgd.hpp>
// Define parameters.
PROGRAM_INFO("Neighborhood Components Analysis (NCA)",
@@ -19,18 +20,18 @@ PROGRAM_INFO("Neighborhood Components Analysis (NCA)",
"by scaling the dimensions. The method is nonparametric, and does not "
"require a value of k. It works by using stochastic (\"soft\") neighbor "
"assignments and using optimization techniques over the gradient of the "
- "accuracy of the neighbor assignments.\n"
- "\n"
+ "accuracy of the neighbor assignments."
+ "\n\n"
"To work, this algorithm needs labeled data. It can be given as the last "
"row of the input dataset (--input_file), or alternatively in a separate "
- "file (--labels_file).\n"
- "\n"
- "This implementation of NCA uses either stochastic gradient descent or the "
- "L_BFGS optimizer. Both of these optimizers do not guarantee global "
- "convergence for a nonconvex objective function (NCA's objective function "
- "is nonconvex), so the final results could depend on the random seed or "
- "other optimizer parameters.\n"
- "\n"
+ "file (--labels_file)."
+ "\n\n"
+ "This implementation of NCA uses stochastic gradient descent, mini-batch "
+ "stochastic gradient descent, or the L_BFGS optimizer. These optimizers do"
+ " not guarantee global convergence for a nonconvex objective function "
+ "(NCA's objective function is nonconvex), so the final results could depend"
+ " on the random seed or other optimizer parameters."
+ "\n\n"
"Stochastic gradient descent, specified by --optimizer \"sgd\", depends "
"primarily on two parameters: the step size (--step_size) and the maximum "
"number of iterations (--max_iterations). In addition, a normalized "
@@ -44,10 +45,18 @@ PROGRAM_INFO("Neighborhood Components Analysis (NCA)",
"the maximum iterations to a large number and allow SGD to find a minimum, "
"or set the maximum iterations to 0 (allowing infinite iterations) and set "
"the tolerance (--tolerance) to define the maximum allowed difference "
- "between objectives for SGD to terminate. Be careful -- setting the "
+ "between objectives for SGD to terminate. Be careful---setting the "
"tolerance instead of the maximum iterations can take a very long time and "
- "may actually never converge due to the properties of the SGD optimizer.\n"
- "\n"
+ "may actually never converge due to the properties of the SGD optimizer. "
+ "Note that a single iteration of SGD refers to a single point, so to take "
+ "a single pass over the dataset, set --max_iterations equal to the number "
+ "of points in the dataset."
+ "\n\n"
+ "The mini-batch SGD optimizer, specified by --optimizer \"minibatch-sgd\", "
+ "has the same parameters as SGD, but the batch size may also be specified "
+ "with the --batch_size (-b) option. Each iteration of mini-batch SGD "
+ "refers to a single mini-batch."
+ "\n\n"
"The L-BFGS optimizer, specified by --optimizer \"lbfgs\", uses a "
"back-tracking line search algorithm to minimize a function. The "
"following parameters are used by L-BFGS: --num_basis (specifies the number"
@@ -57,15 +66,16 @@ PROGRAM_INFO("Neighborhood Components Analysis (NCA)",
"--max_step (which both refer to the line search routine). For more "
"details on the L-BFGS optimizer, consult either the mlpack L-BFGS "
"documentation (in lbfgs.hpp) or the vast set of published literature on "
- "L-BFGS.\n"
- "\n"
+ "L-BFGS."
+ "\n\n"
"By default, the SGD optimizer is used.");
PARAM_STRING_REQ("input_file", "Input dataset to run NCA on.", "i");
PARAM_STRING_REQ("output_file", "Output file for learned distance matrix.",
"o");
PARAM_STRING("labels_file", "File of labels for input dataset.", "l", "");
-PARAM_STRING("optimizer", "Optimizer to use; \"sgd\" or \"lbfgs\".", "O", "sgd");
+PARAM_STRING("optimizer", "Optimizer to use; 'sgd', 'minibatch-sgd', or "
+ "'lbfgs'.", "O", "sgd");
PARAM_FLAG("normalize", "Use a normalized starting point for optimization. This"
" is useful for when points are far apart, or when SGD is returning NaN.",
@@ -79,7 +89,8 @@ PARAM_DOUBLE("tolerance", "Maximum tolerance for termination of SGD or L-BFGS.",
PARAM_DOUBLE("step_size", "Step size for stochastic gradient descent (alpha).",
"a", 0.01);
PARAM_FLAG("linear_scan", "Don't shuffle the order in which data points are "
- "visited for SGD.", "L");
+ "visited for SGD or mini-batch SGD.", "L");
+PARAM_INT("batch_size", "Batch size for mini-batch SGD.", "b", 50);
PARAM_INT("num_basis", "Number of memory points to be stored for L-BFGS.", "B",
5);
@@ -92,7 +103,6 @@ PARAM_DOUBLE("max_step", "Maximum step of line search for L-BFGS.", "M", 1e20);
PARAM_INT("seed", "Random seed. If 0, 'std::time(NULL)' is used.", "s", 0);
-
using namespace mlpack;
using namespace mlpack::nca;
using namespace mlpack::metric;
@@ -115,47 +125,56 @@ int main(int argc, char* argv[])
const string optimizerType = CLI::GetParam<string>("optimizer");
- if ((optimizerType != "sgd") && (optimizerType != "lbfgs"))
+ if ((optimizerType != "sgd") && (optimizerType != "lbfgs") &&
+ (optimizerType != "minibatch-sgd"))
{
Log::Fatal << "Optimizer type '" << optimizerType << "' unknown; must be "
- << "'sgd' or 'lbfgs'!" << std::endl;
+ << "'sgd', 'minibatch-sgd', or 'lbfgs'!" << endl;
}
// Warn on unused parameters.
- if (optimizerType == "sgd")
+ if (optimizerType == "sgd" || optimizerType == "minibatch-sgd")
{
if (CLI::HasParam("num_basis"))
Log::Warn << "Parameter --num_basis ignored (not using 'lbfgs' "
- << "optimizer)." << std::endl;
+ << "optimizer)." << endl;
if (CLI::HasParam("armijo_constant"))
Log::Warn << "Parameter --armijo_constant ignored (not using 'lbfgs' "
- << "optimizer)." << std::endl;
+ << "optimizer)." << endl;
if (CLI::HasParam("wolfe"))
Log::Warn << "Parameter --wolfe ignored (not using 'lbfgs' optimizer).\n";
if (CLI::HasParam("max_line_search_trials"))
Log::Warn << "Parameter --max_line_search_trials ignored (not using "
- << "'lbfgs' optimizer." << std::endl;
+ << "'lbfgs' optimizer)." << endl;
if (CLI::HasParam("min_step"))
Log::Warn << "Parameter --min_step ignored (not using 'lbfgs' optimizer)."
- << std::endl;
+ << endl;
if (CLI::HasParam("max_step"))
Log::Warn << "Parameter --max_step ignored (not using 'lbfgs' optimizer)."
- << std::endl;
+ << endl;
+
+ if (optimizerType == "sgd" && CLI::HasParam("batch_size"))
+ Log::Warn << "Parameter --batch_size ignored (not using 'minibatch-sgd' "
+ << "optimizer)." << endl;
}
else if (optimizerType == "lbfgs")
{
if (CLI::HasParam("step_size"))
- Log::Warn << "Parameter --step_size ignored (not using 'sgd' optimizer)."
- << std::endl;
+ Log::Warn << "Parameter --step_size ignored (not using 'sgd' or "
+ << "'minibatch-sgd' optimizer)." << endl;
if (CLI::HasParam("linear_scan"))
- Log::Warn << "Parameter --linear_scan ignored (not using 'sgd' "
- << "optimizer)." << std::endl;
+ Log::Warn << "Parameter --linear_scan ignored (not using 'sgd' or "
+ << "'minibatch-sgd' optimizer)." << endl;
+
+ if (CLI::HasParam("batch_size"))
+ Log::Warn << "Parameter --batch_size ignored (not using 'minibatch-sgd' "
+ << "optimizer)." << endl;
}
const double stepSize = CLI::GetParam<double>("step_size");
@@ -169,6 +188,7 @@ int main(int argc, char* argv[])
const int maxLineSearchTrials = CLI::GetParam<int>("max_line_search_trials");
const double minStep = CLI::GetParam<double>("min_step");
const double maxStep = CLI::GetParam<double>("max_step");
+ const size_t batchSize = (size_t) CLI::GetParam<int>("batch_size");
// Load data.
arma::mat data;
@@ -188,6 +208,7 @@ int main(int argc, char* argv[])
}
else
{
+ Log::Info << "Using last column of input dataset as labels." << endl;
for (size_t i = 0; i < data.n_cols; i++)
rawLabels[i] = (int) data(data.n_rows - 1, i);
@@ -212,7 +233,7 @@ int main(int argc, char* argv[])
distance = diagmat(1.0 / ranges);
Log::Info << "Using normalized starting point for optimization."
- << std::endl;
+ << endl;
}
else
{
@@ -244,6 +265,17 @@ int main(int argc, char* argv[])
nca.LearnDistance(distance);
}
+ else if (optimizerType == "minibatch-sgd")
+ {
+ NCA<LMetric<2>, MiniBatchSGD> nca(data, labels);
+ nca.Optimizer().StepSize() = stepSize;
+ nca.Optimizer().MaxIterations() = maxIterations;
+ nca.Optimizer().Tolerance() = tolerance;
+ nca.Optimizer().Shuffle() = shuffle;
+ nca.Optimizer().BatchSize() = batchSize;
+
+ nca.LearnDistance(distance);
+ }
// Save the output.
data::Save(CLI::GetParam<string>("output_file"), distance, true);
More information about the mlpack-git
mailing list