[mlpack-git] master: add descriptive statistics cli executable (27ac82e)

gitdub at mlpack.org gitdub at mlpack.org
Wed Jul 27 13:05:26 EDT 2016


Repository : https://github.com/mlpack/mlpack
On branch  : master
Link       : https://github.com/mlpack/mlpack/compare/44a2b52f4d9ece563a5b9850db43ab60f71e5ec7...acd81e11579f69e75aa8406b2982328c88cf1fde

>---------------------------------------------------------------

commit 27ac82e876c3d68637d9e72d28bf0dadbc229ace
Author: Keon Kim <kwk236 at gmail.com>
Date:   Thu Jul 28 02:05:26 2016 +0900

    add descriptive statistics cli executable


>---------------------------------------------------------------

27ac82e876c3d68637d9e72d28bf0dadbc229ace
 .../preprocess/preprocess_describe_main.cpp        | 327 ++++++++++++---------
 1 file changed, 180 insertions(+), 147 deletions(-)

diff --git a/src/mlpack/methods/preprocess/preprocess_describe_main.cpp b/src/mlpack/methods/preprocess/preprocess_describe_main.cpp
index ae454f0..2e56c86 100644
--- a/src/mlpack/methods/preprocess/preprocess_describe_main.cpp
+++ b/src/mlpack/methods/preprocess/preprocess_describe_main.cpp
@@ -5,194 +5,227 @@
  * Descriptive Statistics Class and CLI executable.
  */
 #include <mlpack/core.hpp>
+#include <boost/format.hpp>
+#include <boost/lexical_cast.hpp>
 
 using namespace mlpack;
 using namespace mlpack::data;
 using namespace std;
-
-PROGRAM_INFO("Descriptive Statistics", "This utility takes a dataset prints "
-    "out the statistical facts about the data.");
+using namespace boost;
+
+PROGRAM_INFO("Descriptive Statistics", "This utility takes a dataset and "
+    "prints out the descriptive statistics of the data. Descriptive statistics "
+    "is the discipline of quantitatively describing the main features of a "
+    "collection of information, or the quantitative description itself. The "
+    "program does not modify the original file, but instead prints out the "
+    "statistics to the console. The printed result will look like a table."
+    "\n\n"
+    "Optionally, width and precision of the output can be adjusted by a user "
+    "using the --width (-w) and --precision (-p). A user can also select a "
+    "specific dimension to analyize if he or she has too many dimensions."
+    "--population (-P) is a flag which can be used when the user wants the "
+    "dataset to be considered as a population. Otherwise, the dataset will "
+    "be considered as a sample."
+    "\n\n"
+    "So, a simple example where we want to print out statistical facts about "
+    "dataset.csv, and keep the default settings, we could run"
+    "\n\n"
+    "$ mlpack_preprocess_describe -i dataset.csv -v"
+    "\n\n"
+    "If we want to customize the width to 10 and precision to 5 and consider "
+    "the dataset as a population, we could run"
+    "\n\n"
+    "$ mlpack_preprocess_describe -i dataset.csv -w 10 -p 5 -P -v");
 
 // Define parameters for data.
-PARAM_STRING_REQ("input_file", "File containing data,", "i");
-PARAM_INT("dimension", "Dimension of the data", "d", 0);
-PARAM_INT("precision", "preferred precision of the result", "p", 2);
+PARAM_STRING_IN_REQ("input_file", "File containing data,", "i");
+PARAM_INT_IN("dimension", "Dimension of the data. Use this to specify a "
+    "dimension", "d", 0);
+PARAM_INT_IN("precision", "Precision of the output statistics.", "p", 4);
+PARAM_INT_IN("width", "Width of the output table.", "w", 8);
+PARAM_FLAG("population", "If specified, the program will calculate statistics "
+    "assuming the dataset is the population. By default, the program will "
+    "assume the dataset as a sample.", "P");
 
-// Statistics class, it calculates most of the statistical elements in its
-// constructor.
-template <typename T>
-class Statistics
+/**
+* Calculates the sum of deviations from the mean, each raised to the Nth power.
+*
+* @param input Vector that captures a dimension of a dataset
+* @param rowMean Mean of the given vector.
+* @return sum of nth power deviations
+*/
+double SumNthPowerDeviations(const arma::rowvec& input,
+    const double& rowMean,
+    const size_t Nth) // Degree of Power
 {
- public:
-  Statistics(arma::Mat<T> input, size_t norm_type = 1, bool columnMajor = true):
-      data(input)
-  {
-    minVec = arma::min(data, columnMajor);
-    maxVec = arma::max(data, columnMajor);
-    meanVec = arma::mean(data, columnMajor);
-    medianVec = arma::median(data, columnMajor);
-    stdVec = arma::stddev(data, 1, columnMajor);
-    varVec = arma::var(data, 1, columnMajor);
-  }
-  double Min(const size_t dimension) const
+  double sum = 0;
+  for (size_t i = 0; i < input.n_elem; ++i)
   {
-    return minVec(dimension);
+    sum += pow(input(i) - rowMean, Nth);
   }
-
-  double Max(const size_t dimension) const
-  {
-    return maxVec(dimension);
-  }
-
-  double Range(const size_t dimension) const
-  {
-    return maxVec(dimension) - minVec(dimension);
-  }
-
-  double Mean(const size_t dimension) const
-  {
-    return meanVec(dimension);
-  }
-
-  double Median(const size_t dimension) const
-  {
-    return medianVec(dimension);
-  }
-
-  double Variance(const size_t dimension) const
-  {
-    return varVec(dimension);
-  }
-
-  double StandardDeviation(const size_t dimension) const
-  {
-    return stdVec(dimension);
-  }
-
-  double Skewness(const size_t dimension) const
-  {
-    return this->CentralMoment(3, dimension);
-  }
-
-  double Kurtosis(const size_t dimension) const
+  return sum;
+}
+/**
+ * Calculates Skewness of the given vector.
+ *
+ * @param input Vector that captures a dimension of a dataset
+ * @param rowStd Standard Deviation of the given vector.
+ * @param rowMean Mean of the given vector.
+ * @return Skewness of the given vector.
+ */
+double Skewness(const arma::rowvec& input,
+    const double& rowStd,
+    const double& rowMean,
+    const bool population)
+{
+  double skewness = 0;
+  double S3 = pow(rowStd, 3);
+  double M3 = SumNthPowerDeviations(input, rowMean, 3);
+  double n = input.n_elem;
+  if (population)
   {
-    return this->CentralMoment(4, dimension);
+    // Calculate Population Skewness
+    skewness = n * M3 / (n * n * S3);
   }
-
-  double RawMoment(const size_t order, const size_t dimension) const
+  else
   {
-    // E(x)^order
-    double moment = 0;
-    for (size_t i = 0; i < data.n_cols; ++i)
-    {
-      moment += pow(data(dimension, i), order);
-    }
-    return moment / data.n_cols;
+    // Calculate Sample Skewness
+    skewness = n * M3 / ((n-1) * (n-2) * S3);
   }
-
-  double CentralMoment(const size_t order, const size_t dimension) const
+  return skewness;
+}
+/**
+ * Calculates kurtosis of the given vector.
+ *
+ * @param input Vector that captures a dimension of a dataset
+ * @param rowStd Standard Deviation of the given vector.
+ * @param rowMean Mean of the given vector.
+ * @return Kurtosis of the given vector.
+ */
+double Kurtosis(const arma::rowvec& input,
+    const double& rowStd,
+    const double& rowMean,
+    const bool population)
+{
+  double kurtosis = 0;
+  double M4 = SumNthPowerDeviations(input, rowMean, 4);
+  double n = input.n_elem;
+  if (population)
   {
-    // E(X-u)^order
-    if (order == 1)
-    {
-      return 0.0;
-    }
-    double moment = 0;
-    for (size_t i = 0; i < data.n_cols; ++i)
-    {
-      moment += pow(data(dimension, i) - meanVec(dimension), order);
-    }
-    return moment / data.n_cols;
+    // Calculate Population Excess Kurtosis
+    double M2 = SumNthPowerDeviations(input, rowMean, 2);
+    kurtosis = n * (M4 / pow(M2, 2)) - 3;
   }
-
-  double StandardError(const size_t dimension) const
+  else
   {
-     return stdVec(dimension) / sqrt(data.n_cols);
+    // Calculate Sample Excess Kurtosis
+    double S4 = pow(rowStd, 4);
+    double norm3 = (3 * (n-1) * (n-1)) / ((n-2) * (n-3));
+    double normC = (n * (n+1))/((n-1) * (n-2) * (n-3));
+    double normM = M4 / S4;
+    kurtosis = normC * normM - norm3;
   }
- private:
-  arma::Mat<T> data;
-
-  arma::vec minVec;
-  arma::vec maxVec;
-  arma::vec meanVec;
-  arma::vec medianVec;
-  arma::vec stdVec;
-  arma::vec varVec;
-};
-
+  return kurtosis;
+}
 /**
- * Make sure a CSV is loaded correctly.
+ * Calculates the standard error of the mean.
+ *
+ * @param input Vector that captures a dimension of a dataset
+ * @param rowStd Standard Deviation of the given vector.
+ * @return Standard error of the mean of the given vector.
  */
+double StandardError(const arma::rowvec& input, const double rowStd)
+{
+   return rowStd / sqrt(input.n_elem);
+}
+
 int main(int argc, char** argv)
 {
   // Parse command line options.
   CLI::ParseCommandLine(argc, argv);
   const string inputFile = CLI::GetParam<string>("input_file");
-  const size_t dimension = (size_t) CLI::GetParam<int>("dimension");
-  const size_t precision = (size_t) CLI::GetParam<int>("precision");
+  const size_t dimension = static_cast<size_t>(CLI::GetParam<int>("dimension"));
+  const size_t precision = static_cast<size_t>(CLI::GetParam<int>("precision"));
+  const size_t width = static_cast<size_t>(CLI::GetParam<int>("width"));
+  const bool population = CLI::HasParam("population");
 
   // Load the data
   arma::mat data;
-  data::Load(inputFile, data);
-
-  Statistics<double> stats(data);
+  data::Load(inputFile, data, false, true /*transpose*/);
+
+  // Generate boost format recipe.
+  const string widthPrecision("%-"+
+    to_string(width)+ "." +
+    to_string(precision));
+  const string widthOnly("%-"+
+    to_string(width)+ ".");
+  string stringFormat = "";
+  string numberFormat = "";
+  for (size_t i = 0; i < 11; ++i)
+  {
+    stringFormat += widthOnly + "s";
+    numberFormat += widthPrecision + "f";
+  }
 
+  Timer::Start("statistics");
   // Headers
-  Log::Info << left << setw(8) << "dim";
-  Log::Info << left << setw(8) << "var";
-  Log::Info << left << setw(8) << "mean";
-  Log::Info << left << setw(8) << "std";
-  Log::Info << left << setw(8) << "median";
-  Log::Info << left << setw(8) << "min";
-  Log::Info << left << setw(8) << "max";
-  Log::Info << left << setw(8) << "range";
-  Log::Info << left << setw(10) << "skewness";
-  Log::Info << left << setw(10) << "kurtosis";
-  Log::Info << left << setw(10) << "SE";
-  Log::Info << endl;
+  Log::Info << boost::format(stringFormat)
+      % "dim" % "var" % "mean" % "std" % "median" % "min" % "max"
+      % "range" % "skew" % "kurt" % "SE" << endl;
 
   // If the user specified dimension, describe statistics of the given
   // dimension. If it dimension not specified, describe all dimensions.
   if (CLI::HasParam("dimension"))
   {
-      // Options
-      Log::Info << fixed;
-      Log::Info << setprecision(2);
-      // Describe Data
-      Log::Info << left << setw(8) << dimension;
-      Log::Info << left << setw(8) << stats.Variance(dimension);
-      Log::Info << left << setw(8) << stats.Mean(dimension);
-      Log::Info << left << setw(8) << stats.StandardDeviation(dimension);
-      Log::Info << left << setw(8) << stats.Median(dimension);
-      Log::Info << left << setw(8) << stats.Min(dimension);
-      Log::Info << left << setw(8) << stats.Max(dimension);
-      Log::Info << left << setw(8) << stats.Range(dimension);
-      Log::Info << left << setw(10) << stats.Skewness(dimension);
-      Log::Info << left << setw(10) << stats.Kurtosis(dimension);
-      Log::Info << left << setw(10) << stats.StandardError(dimension);
-      Log::Info << endl;
+    // Extract row of the data with the given dimension.
+    arma::rowvec row = data.row(dimension);
+    // These variables are kept for future calculations.
+    double rowMax = arma::max(row);
+    double rowMin = arma::min(row);
+    double rowMean = arma::mean(row);
+    double rowStd = arma::stddev(row, population);
+
+    // Print statistics of the given dimension.
+    Log::Info << boost::format(numberFormat)
+        % dimension
+        % arma::var(row, population)
+        % rowMean
+        % rowStd
+        % arma::median(row)
+        % rowMin
+        % rowMax
+        % (rowMax - rowMin) // range
+        % Skewness(row, rowStd, rowMean, population)
+        % Kurtosis(row, rowStd, rowMean, population)
+        % StandardError(row, rowStd) << endl;
   }
   else
   {
     for (size_t i = 0; i < data.n_rows; ++i)
     {
-      // Options
-      Log::Info << fixed;
-      Log::Info << setprecision(2);
-      // Describe Data
-      Log::Info << left << setw(8) << i;
-      Log::Info << left << setw(8) << stats.Variance(i);
-      Log::Info << left << setw(8) << stats.Mean(i);
-      Log::Info << left << setw(8) << stats.StandardDeviation(i);
-      Log::Info << left << setw(8) << stats.Median(i);
-      Log::Info << left << setw(8) << stats.Min(i);
-      Log::Info << left << setw(8) << stats.Max(i);
-      Log::Info << left << setw(8) << stats.Range(i);
-      Log::Info << left << setw(10) << stats.Skewness(i);
-      Log::Info << left << setw(10) << stats.Kurtosis(i);
-      Log::Info << left << setw(10) << stats.StandardError(i);
-      Log::Info << endl;
+      // Extract each dimension of the data.
+      arma::rowvec row = data.row(i);
+      // These variables are kept for future calculations.
+      double rowMax = arma::max(row);
+      double rowMin = arma::min(row);
+      double rowMean = arma::mean(row);
+      double rowStd = arma::stddev(row, population);
+
+      // Print statistics of the row i.
+      Log::Info << boost::format(numberFormat)
+          % i
+          % arma::var(row, population)
+          % rowMean
+          % rowStd
+          % arma::median(row)
+          % rowMin
+          % rowMax
+          % (rowMax - rowMin) // range
+          % Skewness(row, rowStd, rowMean, population)
+          % Kurtosis(row, rowStd, rowMean, population)
+          % StandardError(row, rowStd) << endl;
     }
   }
+  Timer::Stop("statistics");
 }
 




More information about the mlpack-git mailing list