[mlpack-git] master: Add model save/load support to sparse_coding. (a03c124)

gitdub at big.cc.gt.atl.ga.us gitdub at big.cc.gt.atl.ga.us
Fri Dec 11 12:47:08 EST 2015


Repository : https://github.com/mlpack/mlpack

On branch  : master
Link       : https://github.com/mlpack/mlpack/compare/dd7c8b93fe5f299cb534cda70c1c786456f9a78f...3b926fd86ab143eb8af7327b9fb89fead7538df0

>---------------------------------------------------------------

commit a03c1246bcf8d5b40da11a4ccba39a68fab450bc
Author: Ryan Curtin <ryan at ratml.org>
Date:   Fri Dec 11 08:12:37 2015 +0000

    Add model save/load support to sparse_coding.


>---------------------------------------------------------------

a03c1246bcf8d5b40da11a4ccba39a68fab450bc
 .../methods/sparse_coding/sparse_coding_main.cpp   | 240 ++++++++++++++-------
 1 file changed, 165 insertions(+), 75 deletions(-)

diff --git a/src/mlpack/methods/sparse_coding/sparse_coding_main.cpp b/src/mlpack/methods/sparse_coding/sparse_coding_main.cpp
index 7f43092..d631f3f 100644
--- a/src/mlpack/methods/sparse_coding/sparse_coding_main.cpp
+++ b/src/mlpack/methods/sparse_coding/sparse_coding_main.cpp
@@ -22,49 +22,62 @@ PROGRAM_INFO("Sparse Coding", "An implementation of Sparse Coding with "
     "dictionary step, which updates the dictionary D, and a sparse coding step,"
     " which updates the sparse coding matrix."
     "\n\n"
-    "To run this program, the input matrix X must be specified (with -i), along"
-    " with the number of atoms in the dictionary (-k).  An initial dictionary "
-    "may also be specified with the --initial_dictionary option.  The l1 and l2"
-    " norm regularization parameters may be specified with -l and -L, "
-    "respectively.  For example, to run sparse coding on the dataset in "
+    "Once a dictionary D is found, the sparse coding model may be used to "
+    "encode other matrices, and saved for future usage."
+    "\n\n"
+    "To run this program, either an input matrix or an already-saved sparse "
+    "coding model must be specified.  An input matrix may be specified with the"
+    " --training_file (-t) option, along with the number of atoms in the "
+    "dictionary (--atoms, or -k).  It is also possible to specify an initial "
+    "dictionary for the optimization, with the --initial_dictionary (-i) "
+    "option. An input model may be specified with the --input_model_file (-m) "
+    "option. There are also other training options available."
+    "\n\n"
+    "As an example, to build a sparse coding model on the dataset in "
     "data.csv using 200 atoms and an l1-regularization parameter of 0.1, saving"
-    " the dictionary into dict.csv and the codes into codes.csv, use "
+    " the model into model.xml, use "
+    "\n\n"
+    "$ sparse_coding -t data.csv -k 200 -l 0.1 -M model.xml"
     "\n\n"
-    "$ sparse_coding -i data.csv -k 200 -l 0.1 -d dict.csv -c codes.csv"
+    "Then, this model could be used to encode a new matrix, otherdata.csv, and "
+    "save the output codes to codes.csv:"
     "\n\n"
-    "The maximum number of iterations may be specified with the -n option. "
-    "Optionally, the input data matrix X can be normalized before coding with "
-    "the -N option.");
+    "$ sparse_coding -m model.xml -T otherdata.csv -c codes.csv");
 
-PARAM_STRING_REQ("input_file", "Filename of the input data.", "i");
-PARAM_INT_REQ("atoms", "Number of atoms in the dictionary.", "k");
+// Train the model.
+PARAM_STRING("training_file", "Filename of the training data (X).", "t", "");
+PARAM_INT("atoms", "Number of atoms in the dictionary.", "k", 0);
 
 PARAM_DOUBLE("lambda1", "Sparse coding l1-norm regularization parameter.", "l",
     0);
 PARAM_DOUBLE("lambda2", "Sparse coding l2-norm regularization parameter.", "L",
     0);
-
 PARAM_INT("max_iterations", "Maximum number of iterations for sparse coding (0 "
     "indicates no limit).", "n", 0);
-
 PARAM_STRING("initial_dictionary", "Filename for optional initial dictionary.",
-    "D", "");
-
-PARAM_STRING("dictionary_file", "Filename to save the output dictionary to.",
-    "d", "dictionary.csv");
-PARAM_STRING("codes_file", "Filename to save the output sparse codes to.", "c",
-    "codes.csv");
-
+    "i", "");
 PARAM_FLAG("normalize", "If set, the input data matrix will be normalized "
     "before coding.", "N");
-
 PARAM_INT("seed", "Random seed.  If 0, 'std::time(NULL)' is used.", "s", 0);
-
 PARAM_DOUBLE("objective_tolerance", "Tolerance for convergence of the objective"
     " function.", "o", 0.01);
 PARAM_DOUBLE("newton_tolerance", "Tolerance for convergence of Newton method.",
     "w", 1e-6);
 
+// Load/save a model.
+PARAM_STRING("input_model_file", "File containing input sparse coding model.",
+    "m", "");
+PARAM_STRING("output_model_file", "File to save trained sparse coding model "
+    "to.", "M", "");
+
+PARAM_STRING("dictionary_file", "Filename to save the output dictionary to.",
+    "d", "");
+PARAM_STRING("codes_file", "Filename to save the output sparse codes to.", "c",
+    "");
+
+PARAM_STRING("test_file", "File containing data matrix to be encoded by trained"
+    " model.", "T", "");
+
 using namespace arma;
 using namespace std;
 using namespace mlpack;
@@ -80,76 +93,153 @@ int main(int argc, char* argv[])
   else
     RandomSeed((size_t) std::time(NULL));
 
-  const double lambda1 = CLI::GetParam<double>("lambda1");
-  const double lambda2 = CLI::GetParam<double>("lambda2");
-
-  const string inputFile = CLI::GetParam<string>("input_file");
-  const string dictionaryFile = CLI::GetParam<string>("dictionary_file");
-  const string codesFile = CLI::GetParam<string>("codes_file");
-  const string initialDictionaryFile =
-      CLI::GetParam<string>("initial_dictionary");
+  // Check for parameter validity.
+  if (CLI::HasParam("input_model_file") && CLI::HasParam("initial_dictionary"))
+    Log::Fatal << "Cannot specify both --input_model_file (-m) and "
+        << "--initial_dictionary (-i)!" << endl;
 
-  const size_t maxIterations = CLI::GetParam<int>("max_iterations");
-  const size_t atoms = CLI::GetParam<int>("atoms");
+  if (CLI::HasParam("training_file") && !CLI::HasParam("atoms"))
+    Log::Fatal << "If --training_file is specified, the number of atoms in the "
+        << "dictionary must be specified with --atoms (-k)!" << endl;
 
-  const bool normalize = CLI::HasParam("normalize");
+  if (!CLI::HasParam("training_file") && !CLI::HasParam("input_model_file"))
+    Log::Fatal << "One of --training_file (-t) or --input_model_file (-m) must "
+        << "be specified!" << endl;
 
-  const double objTolerance = CLI::GetParam<double>("objective_tolerance");
-  const double newtonTolerance = CLI::GetParam<double>("newton_tolerance");
+  if (!CLI::HasParam("codes_file") && !CLI::HasParam("dictionary_file") &&
+      !CLI::HasParam("output_model_file"))
+    Log::Warn << "Neither --codes_file (-c), --dictionary_file (-d), nor "
+        << "--output_model_file (-M) are specified; no output will be saved."
+        << endl;
 
-  mat matX;
-  data::Load(inputFile, matX, true);
-
-  Log::Info << "Loaded " << matX.n_cols << " points in " << matX.n_rows <<
-      " dimensions." << endl;
+  if (!CLI::HasParam("training_file"))
+  {
+    if (CLI::HasParam("atoms"))
+      Log::Warn << "--atoms (-k) ignored because --training_file (-t) is not "
+          << "specified." << endl;
+    if (CLI::HasParam("lambda1"))
+      Log::Warn << "--lambda1 (-l) ignored because --training_file (-t) is not "
+          << "specified." << endl;
+    if (CLI::HasParam("lambda2"))
+      Log::Warn << "--lambda2 (-L) ignored because --training_file (-t) is not "
+          << "specified." << endl;
+    if (CLI::HasParam("initial_dictionary"))
+      Log::Warn << "--initial_dictionary (-i) ignored because --training_file "
+          << "(-t) is not specified." << endl;
+    if (CLI::HasParam("max_iterations"))
+      Log::Warn << "--max_iterations (-n) ignored because --training_file (-t) "
+          << "is not specified." << endl;
+    if (CLI::HasParam("normalize"))
+      Log::Warn << "--normalize (-N) ignored because --training_file (-t) is "
+          << "not specified." << endl;
+    if (CLI::HasParam("objective_tolerance"))
+      Log::Warn << "--objective_tolerance (-o) ignored because --training_file "
+          << "(-t) is not specified." << endl;
+    if (CLI::HasParam("newton_tolerance"))
+      Log::Warn << "--newton_tolerance (-w) ignored because --training_file "
+          << "(-t) is not specified." << endl;
+  }
 
-  // Normalize each point if the user asked for it.
-  if (normalize)
+  // Do we have an existing model?
+  SparseCoding sc(0, 0.0);
+  if (CLI::HasParam("input_model_file"))
   {
-    Log::Info << "Normalizing data before coding..." << std::endl;
-    for (size_t i = 0; i < matX.n_cols; ++i)
-      matX.col(i) /= norm(matX.col(i), 2);
+    data::Load(CLI::GetParam<string>("input_model_file"), "sparse_coding_model",
+        sc, true);
   }
 
-  // If there is an initial dictionary, be sure we do not initialize one.
-  SparseCoding sc(atoms, lambda1, lambda2, maxIterations, objTolerance,
-      newtonTolerance);
-  if (initialDictionaryFile != "")
+  if (CLI::HasParam("training_file"))
   {
-    // Load initial dictionary directly into sparse coding object.
-    data::Load(initialDictionaryFile, sc.Dictionary(), true);
+    mat matX;
+    data::Load(CLI::GetParam<string>("training_file"), matX, true);
 
-    // Validate size of initial dictionary.
-    if (sc.Dictionary().n_cols != atoms)
+    // Normalize each point if the user asked for it.
+    if (CLI::HasParam("normalize"))
     {
-      Log::Fatal << "The initial dictionary has " << sc.Dictionary().n_cols
-          << " atoms, but the number of atoms was specified to be " << atoms
-          << "!" << endl;
+      Log::Info << "Normalizing data before coding..." << std::endl;
+      for (size_t i = 0; i < matX.n_cols; ++i)
+        matX.col(i) /= norm(matX.col(i), 2);
     }
 
-    if (sc.Dictionary().n_rows != matX.n_rows)
+    sc.Lambda1() = CLI::GetParam<double>("lambda1");
+    sc.Lambda2() = CLI::GetParam<double>("lambda2");
+    sc.MaxIterations() = CLI::GetParam<int>("max_iterations");
+    sc.Atoms() = CLI::GetParam<int>("atoms");
+    sc.ObjTolerance() = CLI::GetParam<double>("objective_tolerance");
+    sc.NewtonTolerance() = CLI::GetParam<double>("newton_tolerance");
+
+    // Inform the user if we are overwriting their model.
+    if (CLI::HasParam("input_model_file"))
     {
-      Log::Fatal << "The initial dictionary has " << sc.Dictionary().n_rows
-          << " dimensions, but the data has " << matX.n_rows << " dimensions!"
-          << endl;
+      Log::Info << "Using dictionary from existing model in '"
+          << CLI::GetParam<string>("input_model_file") << "' as initial "
+          << "dictionary for training." << endl;
+      sc.Train<NothingInitializer>(matX);
+    }
+    else if (CLI::HasParam("initial_dictionary"))
+    {
+      // Load initial dictionary directly into sparse coding object.
+      data::Load(CLI::GetParam<string>("initial_dictionary"), sc.Dictionary(),
+          true);
+
+      // Validate size of initial dictionary.
+      if (sc.Dictionary().n_cols != sc.Atoms())
+      {
+        Log::Fatal << "The initial dictionary has " << sc.Dictionary().n_cols
+            << " atoms, but the number of atoms was specified to be "
+            << sc.Atoms() << "!" << endl;
+      }
+
+      if (sc.Dictionary().n_rows != matX.n_rows)
+      {
+        Log::Fatal << "The initial dictionary has " << sc.Dictionary().n_rows
+            << " dimensions, but the data has " << matX.n_rows << " dimensions!"
+            << endl;
+      }
+
+      // Run sparse coding.
+      sc.Train<NothingInitializer>(matX);
+    }
+    else
+    {
+      // Run sparse coding with the default initialization.
+      sc.Train(matX);
     }
-
-    // Run sparse coding.
-    sc.Train<NothingInitializer>(matX);
   }
-  else
+
+  // Now, do we have any matrix to encode?
+  if (CLI::HasParam("test_file"))
   {
-    // Run sparse coding with the default initialization.
-    sc.Train(matX);
+    mat matY;
+    data::Load(CLI::GetParam<string>("test_file"), matY, true);
+
+    if (matY.n_rows != sc.Dictionary().n_rows)
+      Log::Fatal << "Model was trained with a dimensionality of "
+          << sc.Dictionary().n_rows << ", but data in test file '"
+          << CLI::GetParam<string>("test_file") << " has a dimensionality of "
+          << matY.n_rows << "!" << endl;
+
+    // Normalize each point if the user asked for it.
+    if (CLI::HasParam("normalize"))
+    {
+      Log::Info << "Normalizing test data before coding..." << std::endl;
+      for (size_t i = 0; i < matY.n_cols; ++i)
+        matY.col(i) /= norm(matY.col(i), 2);
+    }
+
+    mat codes;
+    sc.OptimizeCode(matY, codes);
+
+    if (CLI::HasParam("codes_file"))
+      data::Save(CLI::GetParam<string>("codes_file"), codes);
   }
 
-  // Encode the input matrix.
-  arma::mat codes;
-  sc.OptimizeCode(matX, codes);
+  // Did the user want to save the dictionary?
+  if (CLI::HasParam("dictionary_file"))
+    data::Save(CLI::GetParam<string>("dictionary_file"), sc.Dictionary());
 
-  // Save the results.
-  Log::Info << "Saving dictionary matrix to '" << dictionaryFile << "'.\n";
-  data::Save(dictionaryFile, sc.Dictionary());
-  Log::Info << "Saving sparse codes to '" << codesFile << "'.\n";
-  data::Save(codesFile, codes);
+  // Did the user want to save the model?
+  if (CLI::HasParam("output_model_file"))
+    data::Save(CLI::GetParam<string>("output_model_file"),
+        "sparse_coding_model", sc, false); // Non-fatal on failure.
 }
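
For reference, the serialization pattern this commit introduces can also be used directly from C++.  The sketch below is not part of the patch; it simply strings together the calls that appear in the new sparse_coding_main.cpp (data::Load()/data::Save() with a model name, Train(), and OptimizeCode()).  The file names, the 200-atom/0.1-lambda1 parameters, and the namespace/header assumptions come from the surrounding file, not from this commit.

// Sketch (not part of this commit): train, serialize, reload, and encode
// with the SparseCoding class, using the same calls as the new
// sparse_coding_main.cpp.  File names and parameters are placeholders.
#include <mlpack/core.hpp>
#include <mlpack/methods/sparse_coding/sparse_coding.hpp>

using namespace mlpack;
using namespace mlpack::sparse_coding;

int main()
{
  // Load training data and fit a model with 200 atoms and lambda1 = 0.1.
  arma::mat trainingData;
  data::Load("data.csv", trainingData, true);

  SparseCoding sc(200, 0.1);
  sc.Train(trainingData);

  // Save the whole model (dictionary plus parameters) for later reuse.
  data::Save("model.xml", "sparse_coding_model", sc, true);

  // Later, reload the model and encode a new matrix with the same
  // dimensionality as the training data.
  SparseCoding sc2(0, 0.0);
  data::Load("model.xml", "sparse_coding_model", sc2, true);

  arma::mat newData, codes;
  data::Load("otherdata.csv", newData, true);
  sc2.OptimizeCode(newData, codes);
  data::Save("codes.csv", codes);
}

As in the patch, calling sc.Train<NothingInitializer>(matX) instead of sc.Train(matX) would keep an already-loaded dictionary rather than reinitializing it.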


