[mlpack-git] master: add descriptive statistics cli executable (27ac82e)
gitdub at mlpack.org
gitdub at mlpack.org
Wed Jul 27 13:05:26 EDT 2016
Repository : https://github.com/mlpack/mlpack
On branch : master
Link : https://github.com/mlpack/mlpack/compare/44a2b52f4d9ece563a5b9850db43ab60f71e5ec7...acd81e11579f69e75aa8406b2982328c88cf1fde
>---------------------------------------------------------------
commit 27ac82e876c3d68637d9e72d28bf0dadbc229ace
Author: Keon Kim <kwk236 at gmail.com>
Date: Thu Jul 28 02:05:26 2016 +0900
add descriptive statistics cli executable
>---------------------------------------------------------------
27ac82e876c3d68637d9e72d28bf0dadbc229ace
.../preprocess/preprocess_describe_main.cpp | 327 ++++++++++++---------
1 file changed, 180 insertions(+), 147 deletions(-)
diff --git a/src/mlpack/methods/preprocess/preprocess_describe_main.cpp b/src/mlpack/methods/preprocess/preprocess_describe_main.cpp
index ae454f0..2e56c86 100644
--- a/src/mlpack/methods/preprocess/preprocess_describe_main.cpp
+++ b/src/mlpack/methods/preprocess/preprocess_describe_main.cpp
@@ -5,194 +5,227 @@
* Descriptive Statistics Class and CLI executable.
*/
#include <mlpack/core.hpp>
+#include <boost/format.hpp>
+#include <boost/lexical_cast.hpp>
using namespace mlpack;
using namespace mlpack::data;
using namespace std;
-
-PROGRAM_INFO("Descriptive Statistics", "This utility takes a dataset prints "
- "out the statistical facts about the data.");
+using namespace boost;
+
+PROGRAM_INFO("Descriptive Statistics", "This utility takes a dataset and "
+ "prints out the descriptive statistics of the data. Descriptive statistics "
+ "is the discipline of quantitatively describing the main features of a "
+ "collection of information, or the quantitative description itself. The "
+ "program does not modify the original file, but instead prints out the "
+ "statistics to the console. The printed result will look like a table."
+ "\n\n"
+ "Optionally, width and precision of the output can be adjusted by a user "
+ "using the --width (-w) and --precision (-p). A user can also select a "
+ "specific dimension to analyize if he or she has too many dimensions."
+ "--population (-P) is a flag which can be used when the user wants the "
+ "dataset to be considered as a population. Otherwise, the dataset will "
+ "be considered as a sample."
+ "\n\n"
+ "So, a simple example where we want to print out statistical facts about "
+ "dataset.csv, and keep the default settings, we could run"
+ "\n\n"
+ "$ mlpack_preprocess_describe -i dataset.csv -v"
+ "\n\n"
+ "If we want to customize the width to 10 and precision to 5 and consider "
+ "the dataset as a population, we could run"
+ "\n\n"
+ "$ mlpack_preprocess_describe -i dataset.csv -w 10 -p 5 -P -v");
// Define parameters for data.
-PARAM_STRING_REQ("input_file", "File containing data,", "i");
-PARAM_INT("dimension", "Dimension of the data", "d", 0);
-PARAM_INT("precision", "preferred precision of the result", "p", 2);
+PARAM_STRING_IN_REQ("input_file", "File containing data,", "i");
+PARAM_INT_IN("dimension", "Dimension of the data. Use this to specify a "
+ "dimension", "d", 0);
+PARAM_INT_IN("precision", "Precision of the output statistics.", "p", 4);
+PARAM_INT_IN("width", "Width of the output table.", "w", 8);
+PARAM_FLAG("population", "If specified, the program will calculate statistics "
+ "assuming the dataset is the population. By default, the program will "
+ "assume the dataset as a sample.", "P");
-// Statistics class, it calculates most of the statistical elements in its
-// constructor.
-template <typename T>
-class Statistics
+/**
+* Calculates the sum of deviations to the Nth Power
+*
+* @param input Vector that captures a dimension of a dataset
+* @param rowMean Mean of the given vector.
+* @return sum of nth power deviations
+*/
+double SumNthPowerDeviations(const arma::rowvec& input,
+ const double& rowMean,
+ const size_t Nth) // Degree of Power
{
- public:
- Statistics(arma::Mat<T> input, size_t norm_type = 1, bool columnMajor = true):
- data(input)
- {
- minVec = arma::min(data, columnMajor);
- maxVec = arma::max(data, columnMajor);
- meanVec = arma::mean(data, columnMajor);
- medianVec = arma::median(data, columnMajor);
- stdVec = arma::stddev(data, 1, columnMajor);
- varVec = arma::var(data, 1, columnMajor);
- }
- double Min(const size_t dimension) const
+ double sum = 0;
+ for (size_t i = 0; i < input.n_elem; ++i)
{
- return minVec(dimension);
+ sum += pow(input(i) - rowMean, Nth);
}
-
- double Max(const size_t dimension) const
- {
- return maxVec(dimension);
- }
-
- double Range(const size_t dimension) const
- {
- return maxVec(dimension) - minVec(dimension);
- }
-
- double Mean(const size_t dimension) const
- {
- return meanVec(dimension);
- }
-
- double Median(const size_t dimension) const
- {
- return medianVec(dimension);
- }
-
- double Variance(const size_t dimension) const
- {
- return varVec(dimension);
- }
-
- double StandardDeviation(const size_t dimension) const
- {
- return stdVec(dimension);
- }
-
- double Skewness(const size_t dimension) const
- {
- return this->CentralMoment(3, dimension);
- }
-
- double Kurtosis(const size_t dimension) const
+ return sum;
+}
+/**
+ * Calculates Skewness of the given vector.
+ *
+ * @param input Vector that captures a dimension of a dataset
+ * @param rowStd Standard Deviation of the given vector.
+ * @param rowMean Mean of the given vector.
+ * @return Skewness of the given vector.
+ */
+double Skewness(const arma::rowvec& input,
+ const double& rowStd,
+ const double& rowMean,
+ const bool population)
+{
+ double skewness = 0;
+ double S3 = pow(rowStd, 3);
+ double M3 = SumNthPowerDeviations(input, rowMean, 3);
+ double n = input.n_elem;
+ if (population)
{
- return this->CentralMoment(4, dimension);
+ // Calculate Population Skewness
+ skewness = n * M3 / (n * n * S3);
}
-
- double RawMoment(const size_t order, const size_t dimension) const
+ else
{
- // E(x)^order
- double moment = 0;
- for (size_t i = 0; i < data.n_cols; ++i)
- {
- moment += pow(data(dimension, i), order);
- }
- return moment / data.n_cols;
+ // Calculate Sample Skewness
+ skewness = n * M3 / ((n-1) * (n-2) * S3);
}
-
- double CentralMoment(const size_t order, const size_t dimension) const
+ return skewness;
+}
+/**
+ * Calculates kurtosis of the given vector.
+ *
+ * @param input Vector that captures a dimension of a dataset
+ * @param rowStd Standard Deviation of the given vector.
+ * @param rowMean Mean of the given vector.
+ * @return Kurtosis of the given vector.
+ */
+double Kurtosis(const arma::rowvec& input,
+ const double& rowStd,
+ const double& rowMean,
+ const bool population)
+{
+ double kurtosis = 0;
+ double M4 = SumNthPowerDeviations(input, rowMean, 4);
+ double n = input.n_elem;
+ if (population)
{
- // E(X-u)^order
- if (order == 1)
- {
- return 0.0;
- }
- double moment = 0;
- for (size_t i = 0; i < data.n_cols; ++i)
- {
- moment += pow(data(dimension, i) - meanVec(dimension), order);
- }
- return moment / data.n_cols;
+ // Calculate Population Excess Kurtosis
+ double M2 = SumNthPowerDeviations(input, rowMean, 2);
+ kurtosis = n * (M4 / pow(M2, 2)) - 3;
}
-
- double StandardError(const size_t dimension) const
+ else
{
- return stdVec(dimension) / sqrt(data.n_cols);
+ // Calculate Sample Excess Kurtosis
+ double S4 = pow(rowStd, 4);
+ double norm3 = (3 * (n-1) * (n-1)) / ((n-2) * (n-3));
+ double normC = (n * (n+1))/((n-1) * (n-2) * (n-3));
+ double normM = M4 / S4;
+ kurtosis = normC * normM - norm3;
}
- private:
- arma::Mat<T> data;
-
- arma::vec minVec;
- arma::vec maxVec;
- arma::vec meanVec;
- arma::vec medianVec;
- arma::vec stdVec;
- arma::vec varVec;
-};
-
+ return kurtosis;
+}
/**
- * Make sure a CSV is loaded correctly.
+ * Calculates standard error of standard deviation.
+ *
+ * @param input Vector that captures a dimension of a dataset
+ * @param rowStd Standard Deviation of the given vector.
+ * @return Standard error of the standard deviation of the given vector.
*/
+double StandardError(const arma::rowvec& input, const double rowStd)
+{
+ return rowStd / sqrt(input.n_elem);
+}
+
int main(int argc, char** argv)
{
// Parse command line options.
CLI::ParseCommandLine(argc, argv);
const string inputFile = CLI::GetParam<string>("input_file");
- const size_t dimension = (size_t) CLI::GetParam<int>("dimension");
- const size_t precision = (size_t) CLI::GetParam<int>("precision");
+ const size_t dimension = static_cast<size_t>(CLI::GetParam<int>("dimension"));
+ const size_t precision = static_cast<size_t>(CLI::GetParam<int>("precision"));
+ const size_t width = static_cast<size_t>(CLI::GetParam<int>("width"));
+ const bool population = CLI::HasParam("population");
// Load the data
arma::mat data;
- data::Load(inputFile, data);
-
- Statistics<double> stats(data);
+ data::Load(inputFile, data, false, true /*transpose*/);
+
+ // Generate boost format recipe.
+ const string widthPrecision("%-"+
+ to_string(width)+ "." +
+ to_string(precision));
+ const string widthOnly("%-"+
+ to_string(width)+ ".");
+ string stringFormat = "";
+ string numberFormat = "";
+ for (size_t i = 0; i < 11; ++i)
+ {
+ stringFormat += widthOnly + "s";
+ numberFormat += widthPrecision + "f";
+ }
+ Timer::Start("statistics");
// Headers
- Log::Info << left << setw(8) << "dim";
- Log::Info << left << setw(8) << "var";
- Log::Info << left << setw(8) << "mean";
- Log::Info << left << setw(8) << "std";
- Log::Info << left << setw(8) << "median";
- Log::Info << left << setw(8) << "min";
- Log::Info << left << setw(8) << "max";
- Log::Info << left << setw(8) << "range";
- Log::Info << left << setw(10) << "skewness";
- Log::Info << left << setw(10) << "kurtosis";
- Log::Info << left << setw(10) << "SE";
- Log::Info << endl;
+ Log::Info << boost::format(stringFormat)
+ % "dim" % "var" % "mean" % "std" % "median" % "min" % "max"
+ % "range" % "skew" % "kurt" % "SE" << endl;
// If the user specified dimension, describe statistics of the given
// dimension. If it dimension not specified, describe all dimensions.
if (CLI::HasParam("dimension"))
{
- // Options
- Log::Info << fixed;
- Log::Info << setprecision(2);
- // Describe Data
- Log::Info << left << setw(8) << dimension;
- Log::Info << left << setw(8) << stats.Variance(dimension);
- Log::Info << left << setw(8) << stats.Mean(dimension);
- Log::Info << left << setw(8) << stats.StandardDeviation(dimension);
- Log::Info << left << setw(8) << stats.Median(dimension);
- Log::Info << left << setw(8) << stats.Min(dimension);
- Log::Info << left << setw(8) << stats.Max(dimension);
- Log::Info << left << setw(8) << stats.Range(dimension);
- Log::Info << left << setw(10) << stats.Skewness(dimension);
- Log::Info << left << setw(10) << stats.Kurtosis(dimension);
- Log::Info << left << setw(10) << stats.StandardError(dimension);
- Log::Info << endl;
+ // Extract row of the data with the given dimension.
+ arma::rowvec row = data.row(dimension);
+ // These variables are kept for future calculations.
+ double rowMax = arma::max(row);
+ double rowMin = arma::min(row);
+ double rowMean = arma::mean(row);
+ double rowStd = arma::stddev(row, population);
+
+ // Print statistics of the given dimension.
+ Log::Info << boost::format(numberFormat)
+ % dimension
+ % arma::var(row, population)
+ % rowMean
+ % rowStd
+ % arma::median(row)
+ % rowMin
+ % rowMax
+ % (rowMax - rowMin) // range
+ % Skewness(row, rowStd, rowMean, population)
+ % Kurtosis(row, rowStd, rowMean, population)
+ % StandardError(row, rowStd) << endl;
}
else
{
for (size_t i = 0; i < data.n_rows; ++i)
{
- // Options
- Log::Info << fixed;
- Log::Info << setprecision(2);
- // Describe Data
- Log::Info << left << setw(8) << i;
- Log::Info << left << setw(8) << stats.Variance(i);
- Log::Info << left << setw(8) << stats.Mean(i);
- Log::Info << left << setw(8) << stats.StandardDeviation(i);
- Log::Info << left << setw(8) << stats.Median(i);
- Log::Info << left << setw(8) << stats.Min(i);
- Log::Info << left << setw(8) << stats.Max(i);
- Log::Info << left << setw(8) << stats.Range(i);
- Log::Info << left << setw(10) << stats.Skewness(i);
- Log::Info << left << setw(10) << stats.Kurtosis(i);
- Log::Info << left << setw(10) << stats.StandardError(i);
- Log::Info << endl;
+ // Extract each dimension of the data.
+ arma::rowvec row = data.row(i);
+ // These variables are kept for future calculations.
+ double rowMax = arma::max(row);
+ double rowMin = arma::min(row);
+ double rowMean = arma::mean(row);
+ double rowStd = arma::stddev(row, population);
+
+ // Print statistics of the row i.
+ Log::Info << boost::format(numberFormat)
+ % i
+ % arma::var(row, population)
+ % rowMean
+ % rowStd
+ % arma::median(row)
+ % rowMin
+ % rowMax
+ % (rowMax - rowMin) // range
+ % Skewness(row, rowStd, rowMean, population)
+ % Kurtosis(row, rowStd, rowMean, population)
+ % StandardError(row, rowStd) << endl;
}
}
+ Timer::Stop("statistics");
}
More information about the mlpack-git
mailing list