[mlpack-git] master: add descriptive statistics executable (5aed5ba)

gitdub at mlpack.org gitdub at mlpack.org
Mon Aug 8 00:19:22 EDT 2016


Repository : https://github.com/mlpack/mlpack
On branch  : master
Link       : https://github.com/mlpack/mlpack/compare/44a2b52f4d9ece563a5b9850db43ab60f71e5ec7...acd81e11579f69e75aa8406b2982328c88cf1fde

>---------------------------------------------------------------

commit 5aed5ba9c78e4584f445217e9c66e52f79d6daec
Author: Keon Kim <kwk236 at gmail.com>
Date:   Tue Jul 12 04:35:54 2016 +0900

    add descriptive statistics executable


>---------------------------------------------------------------

5aed5ba9c78e4584f445217e9c66e52f79d6daec
 src/mlpack/methods/preprocess/CMakeLists.txt       |   1 +
 .../preprocess/preprocess_describe_main.cpp        | 198 +++++++++++++++++++++
 2 files changed, 199 insertions(+)

diff --git a/src/mlpack/methods/preprocess/CMakeLists.txt b/src/mlpack/methods/preprocess/CMakeLists.txt
index b10c8ea..b0b9f39 100644
--- a/src/mlpack/methods/preprocess/CMakeLists.txt
+++ b/src/mlpack/methods/preprocess/CMakeLists.txt
@@ -15,5 +15,6 @@ set(MLPACK_SRCS ${MLPACK_SRCS} ${DIR_SRCS} PARENT_SCOPE)
 #add_cli_executable(preprocess_stats)
 add_cli_executable(preprocess_split)
 add_cli_executable(preprocess_binarize)
+add_cli_executable(preprocess_describe)
 #add_cli_executable(preprocess_scan)
 #add_cli_executable(preprocess_imputer)
diff --git a/src/mlpack/methods/preprocess/preprocess_describe_main.cpp b/src/mlpack/methods/preprocess/preprocess_describe_main.cpp
new file mode 100644
index 0000000..ae454f0
--- /dev/null
+++ b/src/mlpack/methods/preprocess/preprocess_describe_main.cpp
@@ -0,0 +1,198 @@
+/**
+ * @file preprocess_describe_main.cpp
+ * @author Keon Kim
+ *
+ * Descriptive Statistics Class and CLI executable.
+ */
+#include <mlpack/core.hpp>
+
+using namespace mlpack;
+using namespace mlpack::data;
+using namespace std;
+
+PROGRAM_INFO("Descriptive Statistics", "This utility takes a dataset and "
+    "prints out the statistical facts about the data.");
+
+// Command-line parameters read by main() through CLI::GetParam().
+PARAM_STRING_REQ("input_file", "File containing data.", "i");
+PARAM_INT("dimension", "Dimension of the data to describe.", "d", 0);
+PARAM_INT("precision", "Preferred precision of the result.", "p", 2);
+
+/**
+ * Descriptive statistics of a dataset, computed per dimension.
+ *
+ * The data is held with one dimension per row and one observation per column.
+ * Minimum, maximum, mean, median, standard deviation and variance are computed
+ * once in the constructor; moments are recomputed on every call.
+ *
+ * @tparam T Element type of the input matrix.
+ */
+template <typename T>
+class Statistics
+{
+ public:
+  /**
+   * Copy the dataset and precompute the per-dimension statistics.
+   *
+   * @param input Dataset to describe.
+   * @param norm_type Normalisation used for the standard deviation and the
+   *     variance, as defined by Armadillo: 0 divides by N - 1, 1 by N.
+   * @param columnMajor True if every column of input is an observation; false
+   *     if every row is an observation, in which case the copy is transposed.
+   */
+  Statistics(const arma::Mat<T>& input,
+             const size_t norm_type = 1,
+             const bool columnMajor = true) :
+      data(input)
+  {
+    // Every accessor reads data(dimension, observation), so normalise the
+    // layout once here instead of branching on it in each member.
+    if (!columnMajor)
+      arma::inplace_trans(data);
+
+    // Reducing along dim = 1 yields one value per row, i.e. per dimension.
+    minVec = arma::min(data, 1);
+    maxVec = arma::max(data, 1);
+    meanVec = arma::mean(data, 1);
+    medianVec = arma::median(data, 1);
+    stdVec = arma::stddev(data, norm_type, 1);
+    varVec = arma::var(data, norm_type, 1);
+  }
+
+  //! Smallest value of the given dimension.
+  double Min(const size_t dimension) const
+  {
+    return minVec(dimension);
+  }
+
+  //! Largest value of the given dimension.
+  double Max(const size_t dimension) const
+  {
+    return maxVec(dimension);
+  }
+
+  //! Difference between the largest and smallest value of the dimension.
+  double Range(const size_t dimension) const
+  {
+    return maxVec(dimension) - minVec(dimension);
+  }
+
+  //! Arithmetic mean of the given dimension.
+  double Mean(const size_t dimension) const
+  {
+    return meanVec(dimension);
+  }
+
+  //! Median of the given dimension.
+  double Median(const size_t dimension) const
+  {
+    return medianVec(dimension);
+  }
+
+  //! Variance of the given dimension (normalised according to norm_type).
+  double Variance(const size_t dimension) const
+  {
+    return varVec(dimension);
+  }
+
+  //! Standard deviation of the given dimension (normalised by norm_type).
+  double StandardDeviation(const size_t dimension) const
+  {
+    return stdVec(dimension);
+  }
+
+  /**
+   * Skewness of the given dimension: the third central moment divided by the
+   * second central moment raised to the power 3/2.  The result is NaN when all
+   * observations of the dimension are equal (zero variance).
+   */
+  double Skewness(const size_t dimension) const
+  {
+    const double secondMoment = CentralMoment(2, dimension);
+    return CentralMoment(3, dimension) / pow(secondMoment, 1.5);
+  }
+
+  /**
+   * Kurtosis of the given dimension: the fourth central moment divided by the
+   * squared second central moment.  This is the non-excess form; subtract 3
+   * to obtain excess kurtosis.  The result is NaN when all observations of
+   * the dimension are equal (zero variance).
+   */
+  double Kurtosis(const size_t dimension) const
+  {
+    const double secondMoment = CentralMoment(2, dimension);
+    return CentralMoment(4, dimension) / (secondMoment * secondMoment);
+  }
+
+  /**
+   * Raw moment E(X^order) of the given dimension.
+   *
+   * @param order Order of the moment.
+   * @param dimension Dimension to evaluate.
+   */
+  double RawMoment(const size_t order, const size_t dimension) const
+  {
+    double moment = 0;
+    for (size_t i = 0; i < data.n_cols; ++i)
+    {
+      moment += pow(data(dimension, i), order);
+    }
+    return moment / data.n_cols;
+  }
+
+  /**
+   * Central moment E((X - mean)^order) of the given dimension.
+   *
+   * @param order Order of the moment.
+   * @param dimension Dimension to evaluate.
+   */
+  double CentralMoment(const size_t order, const size_t dimension) const
+  {
+    // The first central moment is zero by definition; skip the summation.
+    if (order == 1)
+    {
+      return 0.0;
+    }
+    double moment = 0;
+    for (size_t i = 0; i < data.n_cols; ++i)
+    {
+      moment += pow(data(dimension, i) - meanVec(dimension), order);
+    }
+    return moment / data.n_cols;
+  }
+
+  //! Standard deviation divided by the square root of the observation count.
+  double StandardError(const size_t dimension) const
+  {
+    return stdVec(dimension) / sqrt(data.n_cols);
+  }
+
+ private:
+  //! Copy of the dataset, one dimension per row.
+  arma::Mat<T> data;
+
+  //! Per-dimension statistics filled in by the constructor.
+  arma::vec minVec;
+  arma::vec maxVec;
+  arma::vec meanVec;
+  arma::vec medianVec;
+  arma::vec stdVec;
+  arma::vec varVec;
+};
+
+/**
+ * Print one row of the statistics table for a single dimension.
+ *
+ * @param stats Statistics computed from the loaded dataset.
+ * @param dimension Dimension (row of the dataset) to describe.
+ * @param precision Number of digits printed after the decimal point.
+ */
+static void PrintRow(const Statistics<double>& stats,
+                     const size_t dimension,
+                     const size_t precision)
+{
+  // Options
+  Log::Info << fixed;
+  Log::Info << setprecision(static_cast<int>(precision));
+  // Describe Data
+  Log::Info << left << setw(8) << dimension;
+  Log::Info << left << setw(8) << stats.Variance(dimension);
+  Log::Info << left << setw(8) << stats.Mean(dimension);
+  Log::Info << left << setw(8) << stats.StandardDeviation(dimension);
+  Log::Info << left << setw(8) << stats.Median(dimension);
+  Log::Info << left << setw(8) << stats.Min(dimension);
+  Log::Info << left << setw(8) << stats.Max(dimension);
+  Log::Info << left << setw(8) << stats.Range(dimension);
+  Log::Info << left << setw(10) << stats.Skewness(dimension);
+  Log::Info << left << setw(10) << stats.Kurtosis(dimension);
+  Log::Info << left << setw(10) << stats.StandardError(dimension);
+  Log::Info << endl;
+}
+
+/**
+ * Load the dataset named by the input_file parameter and print a table of
+ * descriptive statistics, either for the dimension given with the dimension
+ * parameter or, when that parameter is absent, for every dimension.
+ */
+int main(int argc, char** argv)
+{
+  // Parse command line options.
+  CLI::ParseCommandLine(argc, argv);
+  const string inputFile = CLI::GetParam<string>("input_file");
+  const int dimensionParam = CLI::GetParam<int>("dimension");
+  const int precisionParam = CLI::GetParam<int>("precision");
+
+  // Reject negative values before they wrap around in the size_t conversion.
+  if (dimensionParam < 0)
+    Log::Fatal << "Dimension must be non-negative!" << endl;
+  if (precisionParam < 0)
+    Log::Fatal << "Precision must be non-negative!" << endl;
+  const size_t dimension = static_cast<size_t>(dimensionParam);
+  const size_t precision = static_cast<size_t>(precisionParam);
+
+  // Load the data, asking Load() to treat a failure as fatal.
+  arma::mat data;
+  data::Load(inputFile, data, true);
+
+  // A requested dimension must exist in the dataset.
+  if (CLI::HasParam("dimension") && dimension >= data.n_rows)
+  {
+    Log::Fatal << "Dimension " << dimension << " does not exist; the dataset "
+        << "has " << data.n_rows << " dimensions." << endl;
+  }
+
+  Statistics<double> stats(data);
+
+  // Headers
+  Log::Info << left << setw(8) << "dim";
+  Log::Info << left << setw(8) << "var";
+  Log::Info << left << setw(8) << "mean";
+  Log::Info << left << setw(8) << "std";
+  Log::Info << left << setw(8) << "median";
+  Log::Info << left << setw(8) << "min";
+  Log::Info << left << setw(8) << "max";
+  Log::Info << left << setw(8) << "range";
+  Log::Info << left << setw(10) << "skewness";
+  Log::Info << left << setw(10) << "kurtosis";
+  Log::Info << left << setw(10) << "SE";
+  Log::Info << endl;
+
+  // If the user specified a dimension, describe only that dimension.  If no
+  // dimension was specified, describe all dimensions.
+  if (CLI::HasParam("dimension"))
+  {
+    PrintRow(stats, dimension, precision);
+  }
+  else
+  {
+    for (size_t i = 0; i < data.n_rows; ++i)
+      PrintRow(stats, i, precision);
+  }
+
+  return 0;
+}
+




More information about the mlpack-git mailing list