[mlpack-git] master: Change 'attribute' to 'dimension' for consistency. (cb74433)
gitdub at big.cc.gt.atl.ga.us
gitdub at big.cc.gt.atl.ga.us
Mon Nov 30 17:24:23 EST 2015
Repository : https://github.com/mlpack/mlpack
On branch : master
Link : https://github.com/mlpack/mlpack/compare/10b9d45b806a3e879b0564d78ccb183ebc7051ba...31c557d9cc7e4da57fd8a246085c19e076d12271
>---------------------------------------------------------------
commit cb7443381af756aebd673390d763d8560d7ab82e
Author: Ryan Curtin <ryan at ratml.org>
Date: Sat Nov 21 01:51:53 2015 +0000
Change 'attribute' to 'dimension' for consistency.
The rest of the mlpack codebase uses the term 'dimension', so it's probably a good idea to use that term everywhere, instead of using the term 'attribute' only in DecisionStump. This should hopefully help avoid confusion.
Also, templatize TrainOnDim to avoid copying the row.
>---------------------------------------------------------------
cb7443381af756aebd673390d763d8560d7ab82e
.../methods/decision_stump/decision_stump.hpp | 39 +++++-----
.../methods/decision_stump/decision_stump_impl.hpp | 88 +++++++++++-----------
src/mlpack/tests/decision_stump_test.cpp | 10 +--
src/mlpack/tests/serialization_test.cpp | 6 +-
4 files changed, 72 insertions(+), 71 deletions(-)
diff --git a/src/mlpack/methods/decision_stump/decision_stump.hpp b/src/mlpack/methods/decision_stump/decision_stump.hpp
index ccfe678..18e78da 100644
--- a/src/mlpack/methods/decision_stump/decision_stump.hpp
+++ b/src/mlpack/methods/decision_stump/decision_stump.hpp
@@ -17,7 +17,7 @@ namespace decision_stump {
* decision tree, i.e., a decision stump. It uses entropy to decide splitting
* ranges.
*
- * The stump is parameterized by a splitting attribute (the dimension on which
+ * The stump is parameterized by a splitting dimension (the dimension on which
* points are split), a vector of bin split values, and a vector of labels for
* each bin. Bin i is specified by the range [split[i], split[i + 1]). The
* last bin has range up to \infty (split[i + 1] does not exist in that case).
@@ -91,10 +91,10 @@ class DecisionStump
*/
void Classify(const MatType& test, arma::Row<size_t>& predictedLabels);
- //! Access the splitting attribute.
- size_t SplitAttribute() const { return splitAttribute; }
- //! Modify the splitting attribute (be careful!).
- size_t& SplitAttribute() { return splitAttribute; }
+ //! Access the splitting dimension.
+ size_t SplitDimension() const { return splitDimension; }
+ //! Modify the splitting dimension (be careful!).
+ size_t& SplitDimension() { return splitDimension; }
//! Access the splitting values.
const arma::vec& Split() const { return split; }
@@ -116,34 +116,35 @@ class DecisionStump
//! The minimum number of points in a bucket.
size_t bucketSize;
- //! Stores the value of the attribute on which to split.
- size_t splitAttribute;
+ //! Stores the value of the dimension on which to split.
+ size_t splitDimension;
//! Stores the splitting values after training.
arma::vec split;
//! Stores the labels for each splitting bin.
arma::Col<size_t> binLabels;
/**
- * Sets up attribute as if it were splitting on it and finds entropy when
- * splitting on attribute.
+ * Sets up dimension as if it were splitting on it and finds entropy when
+ * splitting on dimension.
*
- * @param attribute A row from the training data, which might be a
- * candidate for the splitting attribute.
+ * @param dimension A row from the training data, which might be a
+ * candidate for the splitting dimension.
* @tparam UseWeights Whether we need to run a weighted Decision Stump.
*/
template<bool UseWeights>
- double SetupSplitAttribute(const arma::rowvec& attribute,
+ double SetupSplitDimension(const arma::rowvec& dimension,
const arma::Row<size_t>& labels,
const arma::rowvec& weightD);
/**
- * After having decided the attribute on which to split, train on that
- * attribute.
+ * After having decided the dimension on which to split, train on that
+ * dimension.
*
- * @tparam attribute attribute is the attribute decided by the constructor
+ * @tparam dimension dimension is the dimension decided by the constructor
* on which we now train the decision stump.
*/
- void TrainOnAtt(const arma::rowvec& attribute,
+ template<typename VecType>
+ void TrainOnDim(const VecType& dimension,
const arma::Row<size_t>& labels);
/**
@@ -164,15 +165,15 @@ class DecisionStump
/**
* Returns 1 if all the values of featureRow are not same.
*
- * @param featureRow The attribute which is checked for identical values.
+ * @param featureRow The dimension which is checked for identical values.
*/
template<typename VecType>
int IsDistinct(const VecType& featureRow);
/**
- * Calculate the entropy of the given attribute.
+ * Calculate the entropy of the given dimension.
*
- * @param labels Corresponding labels of the attribute.
+ * @param labels Corresponding labels of the dimension.
* @param classes Number of classes.
* @param weights Weights for this set of labels.
* @tparam UseWeights If true, the weights in the weight vector will be used
diff --git a/src/mlpack/methods/decision_stump/decision_stump_impl.hpp b/src/mlpack/methods/decision_stump/decision_stump_impl.hpp
index c4b8b77..7b17ae2 100644
--- a/src/mlpack/methods/decision_stump/decision_stump_impl.hpp
+++ b/src/mlpack/methods/decision_stump/decision_stump_impl.hpp
@@ -41,7 +41,7 @@ template<typename MatType>
DecisionStump<MatType>::DecisionStump() :
classes(1),
bucketSize(0),
- splitAttribute(0),
+ splitDimension(0),
split(1),
binLabels(1)
{
@@ -83,37 +83,37 @@ void DecisionStump<MatType>::Train(const MatType& data,
this->bucketSize = bucketSize;
// If classLabels are not all identical, proceed with training.
- size_t bestAtt = 0;
+ size_t bestDim = 0;
double entropy;
const double rootEntropy = CalculateEntropy<UseWeights>(labels, weights);
double gain, bestGain = 0.0;
for (size_t i = 0; i < data.n_rows; i++)
{
- // Go through each attribute of the data.
+ // Go through each dimension of the data.
if (IsDistinct(data.row(i)))
{
- // For each attribute with non-identical values, treat it as a potential
- // splitting attribute and calculate entropy if split on it.
- entropy = SetupSplitAttribute<UseWeights>(data.row(i), labels, weights);
+ // For each dimension with non-identical values, treat it as a potential
+ // splitting dimension and calculate entropy if split on it.
+ entropy = SetupSplitDimension<UseWeights>(data.row(i), labels, weights);
gain = rootEntropy - entropy;
- // Find the attribute with the best entropy so that the gain is
+ // Find the dimension with the best entropy so that the gain is
// maximized.
// We are maximizing gain, which is what is returned from
- // SetupSplitAttribute().
+ // SetupSplitDimension().
if (gain < bestGain)
{
- bestAtt = i;
+ bestDim = i;
bestGain = gain;
}
}
}
- splitAttribute = bestAtt;
+ splitDimension = bestDim;
- // Once the splitting column/attribute has been decided, train on it.
- TrainOnAtt(data.row(splitAttribute), labels);
+ // Once the splitting column/dimension has been decided, train on it.
+ TrainOnDim(data.row(splitDimension), labels);
}
/**
@@ -135,7 +135,7 @@ void DecisionStump<MatType>::Classify(const MatType& test,
// Assume first that it falls into the first bin, then proceed through the
// bins until it is known which bin it falls into.
size_t bin = 0;
- const double val = test(splitAttribute, i);
+ const double val = test(splitDimension, i);
while (bin < split.n_elem - 1)
{
@@ -186,46 +186,46 @@ void DecisionStump<MatType>::Serialize(Archive& ar,
// None need special handling.
ar & CreateNVP(classes, "classes");
ar & CreateNVP(bucketSize, "bucketSize");
- ar & CreateNVP(splitAttribute, "splitAttribute");
+ ar & CreateNVP(splitDimension, "splitDimension");
ar & CreateNVP(split, "split");
ar & CreateNVP(binLabels, "binLabels");
}
/**
- * Sets up attribute as if it were splitting on it and finds entropy when
- * splitting on attribute.
+ * Sets up dimension as if it were splitting on it and finds entropy when
+ * splitting on dimension.
*
- * @param attribute A row from the training data, which might be a candidate for
- * the splitting attribute.
+ * @param dimension A row from the training data, which might be a candidate for
+ * the splitting dimension.
* @param UseWeights Whether we need to run a weighted Decision Stump.
*/
template<typename MatType>
template<bool UseWeights>
-double DecisionStump<MatType>::SetupSplitAttribute(
- const arma::rowvec& attribute,
+double DecisionStump<MatType>::SetupSplitDimension(
+ const arma::rowvec& dimension,
const arma::Row<size_t>& labels,
const arma::rowvec& weights)
{
size_t i, count, begin, end;
double entropy = 0.0;
- // Sort the attribute in order to calculate splitting ranges.
- arma::rowvec sortedAtt = arma::sort(attribute);
+ // Sort the dimension in order to calculate splitting ranges.
+ arma::rowvec sortedDim = arma::sort(dimension);
- // Store the indices of the sorted attribute to build a vector of sorted
+ // Store the indices of the sorted dimension to build a vector of sorted
// labels. This sort is stable.
- arma::uvec sortedIndexAtt = arma::stable_sort_index(attribute.t());
+ arma::uvec sortedIndexDim = arma::stable_sort_index(dimension.t());
- arma::Row<size_t> sortedLabels(attribute.n_elem);
- arma::rowvec sortedWeights(attribute.n_elem);
+ arma::Row<size_t> sortedLabels(dimension.n_elem);
+ arma::rowvec sortedWeights(dimension.n_elem);
- for (i = 0; i < attribute.n_elem; i++)
+ for (i = 0; i < dimension.n_elem; i++)
{
- sortedLabels(i) = labels(sortedIndexAtt(i));
+ sortedLabels(i) = labels(sortedIndexDim(i));
// Apply weights if necessary.
if (UseWeights)
- sortedWeights(i) = weights(sortedIndexAtt(i));
+ sortedWeights(i) = weights(sortedIndexDim(i));
}
i = 0;
@@ -286,25 +286,26 @@ double DecisionStump<MatType>::SetupSplitAttribute(
}
/**
- * After having decided the attribute on which to split, train on that
- * attribute.
+ * After having decided the dimension on which to split, train on that
+ * dimension.
*
- * @param attribute Attribute is the attribute decided by the constructor on
+ * @param dimension Dimension is the dimension decided by the constructor on
* which we now train the decision stump.
*/
template<typename MatType>
-void DecisionStump<MatType>::TrainOnAtt(const arma::rowvec& attribute,
+template<typename VecType>
+void DecisionStump<MatType>::TrainOnDim(const VecType& dimension,
const arma::Row<size_t>& labels)
{
size_t i, count, begin, end;
- arma::rowvec sortedSplitAtt = arma::sort(attribute);
- arma::uvec sortedSplitIndexAtt = arma::stable_sort_index(attribute.t());
- arma::Row<size_t> sortedLabels(attribute.n_elem);
+ arma::rowvec sortedSplitDim = arma::sort(dimension);
+ arma::uvec sortedSplitIndexDim = arma::stable_sort_index(dimension.t());
+ arma::Row<size_t> sortedLabels(dimension.n_elem);
sortedLabels.fill(0);
- for (i = 0; i < attribute.n_elem; i++)
- sortedLabels(i) = labels(sortedSplitIndexAtt(i));
+ for (i = 0; i < dimension.n_elem; i++)
+ sortedLabels(i) = labels(sortedSplitIndexDim(i));
arma::rowvec subCols;
double mostFreq;
@@ -321,7 +322,7 @@ void DecisionStump<MatType>::TrainOnAtt(const arma::rowvec& attribute,
mostFreq = CountMostFreq(sortedLabels.cols(begin, end));
split.resize(split.n_elem + 1);
- split(split.n_elem - 1) = sortedSplitAtt(begin);
+ split(split.n_elem - 1) = sortedSplitDim(begin);
binLabels.resize(binLabels.n_elem + 1);
binLabels(binLabels.n_elem - 1) = mostFreq;
@@ -349,7 +350,7 @@ void DecisionStump<MatType>::TrainOnAtt(const arma::rowvec& attribute,
mostFreq = CountMostFreq(sortedLabels.cols(begin, end));
split.resize(split.n_elem + 1);
- split(split.n_elem - 1) = sortedSplitAtt(begin);
+ split(split.n_elem - 1) = sortedSplitDim(begin);
binLabels.resize(binLabels.n_elem + 1);
binLabels(binLabels.n_elem - 1) = mostFreq;
@@ -422,7 +423,7 @@ double DecisionStump<MatType>::CountMostFreq(const VecType& subCols)
/**
* Returns 1 if all the values of featureRow are not the same.
*
- * @param featureRow The attribute which is checked for identical values.
+ * @param featureRow The dimension which is checked for identical values.
*/
template<typename MatType>
template<typename VecType>
@@ -436,10 +437,9 @@ int DecisionStump<MatType>::IsDistinct(const VecType& featureRow)
}
/**
- * Calculate entropy of attribute.
+ * Calculate entropy of dimension.
*
- * @param attribute The attribute for which we calculate the entropy.
- * @param labels Corresponding labels of the attribute.
+ * @param labels Corresponding labels of the dimension.
* @param UseWeights Whether we need to run a weighted Decision Stump.
*/
template<typename MatType>
diff --git a/src/mlpack/tests/decision_stump_test.cpp b/src/mlpack/tests/decision_stump_test.cpp
index 69e279d..36c83f7 100644
--- a/src/mlpack/tests/decision_stump_test.cpp
+++ b/src/mlpack/tests/decision_stump_test.cpp
@@ -52,16 +52,16 @@ BOOST_AUTO_TEST_CASE(OneClass)
/**
* This tests whether the entropy is being correctly calculated by checking the
* correct value of the splitting column value. This test is for an
- * inpBucketSize of 4 and the correct value of the splitting attribute is 0.
+ * inpBucketSize of 4 and the correct value of the splitting dimension is 0.
*/
-BOOST_AUTO_TEST_CASE(CorrectAttributeChosen)
+BOOST_AUTO_TEST_CASE(CorrectDimensionChosen)
{
const size_t numClasses = 2;
const size_t inpBucketSize = 4;
// This dataset comes from Chapter 6 of the book "Data Mining: Concepts,
// Models, Methods, and Algorithms" (2nd Edition) by Mehmed Kantardzic. It is
- // found on page 176 (and a description of the correct splitting attribute is
+ // found on page 176 (and a description of the correct splitting dimension is
// given below that).
mat trainingData;
trainingData << 0 << 0 << 0 << 0 << 0 << 1 << 1 << 1 << 1
@@ -80,7 +80,7 @@ BOOST_AUTO_TEST_CASE(CorrectAttributeChosen)
// Only need to check the value of the splitting column, no need of
// classification.
- BOOST_CHECK_EQUAL(ds.SplitAttribute(), 0);
+ BOOST_CHECK_EQUAL(ds.SplitDimension(), 0);
}
/**
@@ -295,7 +295,7 @@ BOOST_AUTO_TEST_CASE(DimensionSelectionTest)
DecisionStump<> ds(dataset, labels, numClasses, inpBucketSize);
// Make sure it split on the dimension that is most separable.
- BOOST_CHECK_EQUAL(ds.SplitAttribute(), 1);
+ BOOST_CHECK_EQUAL(ds.SplitDimension(), 1);
// Make sure every bin below -1 classifies as label 0, and every bin above 1
// classifies as label 1 (What happens in [-1, 1] isn't that big a deal.).
diff --git a/src/mlpack/tests/serialization_test.cpp b/src/mlpack/tests/serialization_test.cpp
index f2d3077..d63a516 100644
--- a/src/mlpack/tests/serialization_test.cpp
+++ b/src/mlpack/tests/serialization_test.cpp
@@ -1458,9 +1458,9 @@ BOOST_AUTO_TEST_CASE(DecisionStumpTest)
SerializeObjectAll(ds, xmlDs, textDs, binaryDs);
// Make sure that everything is the same about the new decision stumps.
- BOOST_REQUIRE_EQUAL(ds.SplitAttribute(), xmlDs.SplitAttribute());
- BOOST_REQUIRE_EQUAL(ds.SplitAttribute(), textDs.SplitAttribute());
- BOOST_REQUIRE_EQUAL(ds.SplitAttribute(), binaryDs.SplitAttribute());
+ BOOST_REQUIRE_EQUAL(ds.SplitDimension(), xmlDs.SplitDimension());
+ BOOST_REQUIRE_EQUAL(ds.SplitDimension(), textDs.SplitDimension());
+ BOOST_REQUIRE_EQUAL(ds.SplitDimension(), binaryDs.SplitDimension());
CheckMatrices(ds.Split(), xmlDs.Split(), textDs.Split(), binaryDs.Split());
CheckMatrices(ds.BinLabels(), xmlDs.BinLabels(), textDs.BinLabels(),
More information about the mlpack-git mailing list