[mlpack-git] master: Refactor executable to allow saving/loading models. (9e76f0b)

gitdub at big.cc.gt.atl.ga.us gitdub at big.cc.gt.atl.ga.us
Thu Dec 3 14:37:19 EST 2015


Repository : https://github.com/mlpack/mlpack

On branch  : master
Link       : https://github.com/mlpack/mlpack/compare/c829fc1a2415f3dddb672431bb51ff05cbc40a76...9e76f0b82b8bc4fe038179abe77a78146b40c195

>---------------------------------------------------------------

commit 9e76f0b82b8bc4fe038179abe77a78146b40c195
Author: ryan <ryan at ratml.org>
Date:   Thu Dec 3 14:37:03 2015 -0500

    Refactor executable to allow saving/loading models.


>---------------------------------------------------------------

9e76f0b82b8bc4fe038179abe77a78146b40c195
 src/mlpack/methods/lars/lars_main.cpp | 145 +++++++++++++++++++++++-----------
 1 file changed, 101 insertions(+), 44 deletions(-)

diff --git a/src/mlpack/methods/lars/lars_main.cpp b/src/mlpack/methods/lars/lars_main.cpp
index 4b23472..7981ca0 100644
--- a/src/mlpack/methods/lars/lars_main.cpp
+++ b/src/mlpack/methods/lars/lars_main.cpp
@@ -13,34 +13,46 @@ PROGRAM_INFO("LARS", "An implementation of LARS: Least Angle Regression "
     "L1-regularized linear regression (LASSO) and L1+L2-regularized linear "
     "regression (Elastic Net).\n"
     "\n"
+    "This program is able to train a LARS/LASSO/Elastic Net model or load a "
+    "model from file, output regression predictions for a test set, and save "
+    "the trained model to a file.  The LARS algorithm is described in more "
+    "detail below:\n"
+    "\n"
     "Let X be a matrix where each row is a point and each column is a "
     "dimension, and let y be a vector of targets.\n"
     "\n"
     "The Elastic Net problem is to solve\n\n"
     "  min_beta 0.5 || X * beta - y ||_2^2 + lambda_1 ||beta||_1 +\n"
     "      0.5 lambda_2 ||beta||_2^2\n\n"
-    "If lambda_1 > 0 and lambda_2 = 0, the problem is the LASSO.\n"
-    "If lambda_1 > 0 and lambda_2 > 0, the problem is the Elastic Net.\n"
-    "If lambda_1 = 0 and lambda_2 > 0, the problem is ridge regression.\n"
-    "If lambda_1 = 0 and lambda_2 = 0, the problem is unregularized linear "
+    "If --lambda1 > 0 and --lambda2 = 0, the problem is the LASSO.\n"
+    "If --lambda1 > 0 and --lambda2 > 0, the problem is the Elastic Net.\n"
+    "If --lambda1 = 0 and --lambda2 > 0, the problem is ridge regression.\n"
+    "If --lambda1 = 0 and --lambda2 = 0, the problem is unregularized linear "
     "regression.\n"
     "\n"
     "For efficiency reasons, it is not recommended to use this algorithm with "
-    "lambda_1 = 0.  In that case, use the 'linear_regression' program, which "
-    "implements both unregularized linear regression and ridge regression.\n");
+    "--lambda1 = 0.  In that case, use the 'linear_regression' program, which "
+    "implements both unregularized linear regression and ridge regression.\n"
+    "\n"
+    "To train a LARS/LASSO/Elastic Net model, the --input_file and "
+    "--responses_file parameters must be given.  The --lambda1, --lambda2, and "
+    "--use_cholesky arguments control the training parameters.  A trained model"
+    " can be saved with the --output_model_file, or, if training is not desired"
+    " at all, a model can be loaded with --input_model_file.  Any output "
+    "predictions from a test file can be saved into the file specified by the "
+    "--output_predictions option.");
 
-PARAM_STRING_REQ("input_file", "File containing covariates (X).",
-    "i");
-PARAM_STRING_REQ("responses_file", "File containing y "
-    "(responses/observations).", "r");
+PARAM_STRING("input_file", "File containing covariates (X).", "i", "");
+PARAM_STRING("responses_file", "File containing y (responses/observations).",
+    "r", "");
 
-PARAM_STRING("output_file", "File to save beta (linear estimator) to.", "o",
-    "output.csv");
+PARAM_STRING("input_model_file", "File to load model from.", "m", "");
+PARAM_STRING("output_model_file", "File to save model to.", "M", "");
 
 PARAM_STRING("test_file", "File containing points to regress on (test points).",
     "t", "");
 PARAM_STRING("output_predictions", "If --test_file is specified, this file is "
-    "where the predicted responses will be saved.", "p", "predictions.csv");
+    "where the predicted responses will be saved.", "o", "predictions.csv");
 
 PARAM_DOUBLE("lambda1", "Regularization parameter for l1-norm penalty.", "l",
     0);
@@ -63,36 +75,69 @@ int main(int argc, char* argv[])
   double lambda2 = CLI::GetParam<double>("lambda2");
   bool useCholesky = CLI::HasParam("use_cholesky");
 
-  // Load covariates.  We can avoid LARS transposing our data by choosing to not
-  // transpose this data.
-  const string matXFilename = CLI::GetParam<string>("input_file");
-  mat matX;
-  data::Load(matXFilename, matX, true, false);
-
-  // Load responses.  The responses should be a one-dimensional vector, and it
-  // seems more likely that these will be stored with one response per line (one
-  // per row).  So we should not transpose upon loading.
-  const string yFilename = CLI::GetParam<string>("responses_file");
-  mat matY; // Will be a vector.
-  data::Load(yFilename, matY, true, false);
-
-  // Make sure y is oriented the right way.
-  if (matY.n_rows == 1)
-    matY = trans(matY);
-  if (matY.n_cols > 1)
-    Log::Fatal << "Only one column or row allowed in responses file!" << endl;
-
-  if (matY.n_elem != matX.n_rows)
-    Log::Fatal << "Number of responses must be equal to number of rows of X!"
-        << endl;
-
-  // Do LARS.
+  // Check parameters -- make sure everything given makes sense.
+  if (CLI::HasParam("input_file") && !CLI::HasParam("responses_file"))
+    Log::Fatal << "--input_file (-i) is specified, but --responses_file (-r) is"
+        << " not!" << endl;
+
+  if (CLI::HasParam("responses_file") && !CLI::HasParam("input_file"))
+    Log::Fatal << "--responses_file (-r) is specified, but --input_file (-i) is"
+        << " not!" << endl;
+
+  if (!CLI::HasParam("input_file") && !CLI::HasParam("input_model_file"))
+    Log::Fatal << "No input data specified (with --input_file (-i) and "
+        << "--responses_file (-r)), and no input model specified (with "
+        << "--input_model_file (-m))!" << endl;
+
+  if (CLI::HasParam("input_file") && CLI::HasParam("input_model_file"))
+    Log::Fatal << "Both --input_file (-i) and --input_model_file (-m) are "
+        << "specified, but only one may be specified!" << endl;
+
+  if (!CLI::HasParam("output_predictions") &&
+      !CLI::HasParam("output_model_file"))
+    Log::Warn << "--output_predictions (-o) and --output_model_file (-M) are "
+        << "not specified; no results will be saved!" << endl;
+
+  if (CLI::HasParam("output_predictions") && !CLI::HasParam("test_file"))
+    Log::Warn << "--output_predictions (-o) specified, but --test_file (-t) is "
+        << "not; no results will be saved." << endl;
+
+  // Initialize the object.
   LARS lars(useCholesky, lambda1, lambda2);
-  vec beta;
-  lars.Train(matX, matY.unsafe_col(0), beta, false /* do not transpose */);
 
-  const string betaFilename = CLI::GetParam<string>("output_file");
-  beta.save(betaFilename, raw_ascii);
+  if (CLI::HasParam("input_file"))
+  {
+    // Load covariates.  We can avoid LARS transposing our data by choosing to
+    // not transpose this data.
+    const string matXFilename = CLI::GetParam<string>("input_file");
+    mat matX;
+    data::Load(matXFilename, matX, true, false);
+
+    // Load responses.  The responses should be a one-dimensional vector, and it
+    // seems more likely that these will be stored with one response per line
+    // (one per row).  So we should not transpose upon loading.
+    const string yFilename = CLI::GetParam<string>("responses_file");
+    mat matY; // Will be a vector.
+    data::Load(yFilename, matY, true, false);
+
+    // Make sure y is oriented the right way.
+    if (matY.n_rows == 1)
+      matY = trans(matY);
+    if (matY.n_cols > 1)
+      Log::Fatal << "Only one column or row allowed in responses file!" << endl;
+
+    if (matY.n_elem != matX.n_rows)
+      Log::Fatal << "Number of responses must be equal to number of rows of X!"
+          << endl;
+
+    vec beta;
+    lars.Train(matX, matY.unsafe_col(0), beta, false /* do not transpose */);
+  }
+  else // We must have --input_model_file.
+  {
+    const string modelFile = CLI::GetParam<string>("input_model_file");
+    data::Load(modelFile, "lars_model", lars, true);
+  }
 
   if (CLI::HasParam("test_file"))
   {
@@ -105,11 +150,23 @@ int main(int argc, char* argv[])
     mat testPoints;
     data::Load(testFile, testPoints, true, false);
 
+    // Make sure the dimensionality is right.  We haven't transposed, so, we
+    // check n_cols not n_rows.
+    if (testPoints.n_cols != lars.BetaPath().back().n_elem)
+      Log::Fatal << "Dimensionality of test set (" << testPoints.n_cols << ") "
+          << "is not equal to the dimensionality of the model ("
+          << lars.BetaPath().back().n_elem << ")!" << endl;
+
     arma::vec predictions;
     lars.Predict(testPoints.t(), predictions, false);
 
-    // Save test predictions.  One per line, so, we need a rowvec.
-    arma::rowvec predToSave = predictions.t();
-    data::Save(outputPredictionsFile, predToSave);
+    // Save test predictions.  One per line, so, don't transpose on save.
+    data::Save(outputPredictionsFile, predictions, true, false);
+  }
+
+  if (CLI::HasParam("output_model_file"))
+  {
+    const string outputModelFile = CLI::GetParam<string>("output_model_file");
+    data::Save(outputModelFile, "lars_model", lars, true);
   }
 }



More information about the mlpack-git mailing list