[mlpack-svn] r13808 - mlpack/trunk/src/mlpack/methods/nca

fastlab-svn at coffeetalk-1.cc.gatech.edu fastlab-svn at coffeetalk-1.cc.gatech.edu
Wed Oct 31 17:27:43 EDT 2012


Author: rcurtin
Date: 2012-10-31 17:27:41 -0400 (Wed, 31 Oct 2012)
New Revision: 13808

Modified:
   mlpack/trunk/src/mlpack/methods/nca/nca_main.cpp
Log:
Slightly smarter normalization strategy for large datasets.


Modified: mlpack/trunk/src/mlpack/methods/nca/nca_main.cpp
===================================================================
--- mlpack/trunk/src/mlpack/methods/nca/nca_main.cpp	2012-10-31 21:19:56 UTC (rev 13807)
+++ mlpack/trunk/src/mlpack/methods/nca/nca_main.cpp	2012-10-31 21:27:41 UTC (rev 13808)
@@ -33,6 +33,8 @@
     "gradient descent (0 indicates no limit).", "n", 500000);
 PARAM_DOUBLE("tolerance", "Maximum tolerance for termination of stochastic "
     "gradient descent.", "t", 1e-7);
+PARAM_FLAG("normalize", "Normalize data; useful for datasets where points are "
+    "far apart, or when SGD is converging to an objective of NaN.", "N");
 
 using namespace mlpack;
 using namespace mlpack::nca;
@@ -52,6 +54,7 @@
   const double stepSize = CLI::GetParam<double>("step_size");
   const size_t maxIterations = CLI::GetParam<int>("max_iterations");
   const double tolerance = CLI::GetParam<double>("tolerance");
+  const bool normalize = CLI::HasParam("normalize");
 
   // Load data.
   mat data;
@@ -77,6 +80,25 @@
     data.shed_row(data.n_rows - 1);
   }
 
+  // Normalize the data, if necessary.
+  if (normalize)
+  {
+    // Find the minimum and maximum values for each dimension.
+    arma::vec range = arma::max(data, 1) - arma::min(data, 1);
+
+    // Now find the maximum range.
+    double maxRange = arma::max(range);
+
+    // We can place a (lazy) upper bound on the distance with range^2 * d.
+    // Since we want no distance greater than 700 (because std::exp(-750)
+    // underflows), we can normalize with (range^2 * d) / 700.
+    double normalization = (std::pow(maxRange, 2.0) * data.n_rows) / 700.0;
+    data /= normalization; // Element-wise division.
+
+    Log::Info << "Data normalized (normalization constant " << normalization
+        << ")." << std::endl;
+  }
+
   // Now create the NCA object and run the optimization.
   NCA<LMetric<2> > nca(data, labels.unsafe_col(0), stepSize, maxIterations,
       tolerance);




More information about the mlpack-svn mailing list