[mlpack-git] master: Change 'attribute' to 'dimension' for consistency. (cb74433)

Mon Nov 30 17:24:23 EST 2015

Repository : https://github.com/mlpack/mlpack

On branch  : master
Link       : https://github.com/mlpack/mlpack/compare/10b9d45b806a3e879b0564d78ccb183ebc7051ba...31c557d9cc7e4da57fd8a246085c19e076d12271

>---------------------------------------------------------------

commit cb7443381af756aebd673390d763d8560d7ab82e
Author: Ryan Curtin <ryan at ratml.org>
Date:   Sat Nov 21 01:51:53 2015 +0000

    Change 'attribute' to 'dimension' for consistency.
    
    The rest of the mlpack codebase uses the term 'dimension', so it's probably a good idea to use that term everywhere rather than having DecisionStump alone use the term 'attribute'.  This should hopefully help avoid confusion.
    
    Also, templatize TrainOnDim on the vector type (VecType) to avoid copying the row.


>---------------------------------------------------------------

cb7443381af756aebd673390d763d8560d7ab82e
 .../methods/decision_stump/decision_stump.hpp      | 39 +++++-----
 .../methods/decision_stump/decision_stump_impl.hpp | 88 +++++++++++-----------
 src/mlpack/tests/decision_stump_test.cpp           | 10 +--
 src/mlpack/tests/serialization_test.cpp            |  6 +-
 4 files changed, 72 insertions(+), 71 deletions(-)

diff --git a/src/mlpack/methods/decision_stump/decision_stump.hpp b/src/mlpack/methods/decision_stump/decision_stump.hpp
index ccfe678..18e78da 100644
--- a/src/mlpack/methods/decision_stump/decision_stump.hpp
+++ b/src/mlpack/methods/decision_stump/decision_stump.hpp
@@ -17,7 +17,7 @@ namespace decision_stump {
  * decision tree, i.e., a decision stump. It uses entropy to decide splitting
  * ranges.
  *
- * The stump is parameterized by a splitting attribute (the dimension on which
+ * The stump is parameterized by a splitting dimension (the dimension on which
  * points are split), a vector of bin split values, and a vector of labels for
  * each bin.  Bin i is specified by the range [split[i], split[i + 1]).  The
  * last bin has range up to \infty (split[i + 1] does not exist in that case).
@@ -91,10 +91,10 @@ class DecisionStump
    */
   void Classify(const MatType& test, arma::Row<size_t>& predictedLabels);
 
-  //! Access the splitting attribute.
-  size_t SplitAttribute() const { return splitAttribute; }
-  //! Modify the splitting attribute (be careful!).
-  size_t& SplitAttribute() { return splitAttribute; }
+  //! Access the splitting dimension.
+  size_t SplitDimension() const { return splitDimension; }
+  //! Modify the splitting dimension (be careful!).
+  size_t& SplitDimension() { return splitDimension; }
 
   //! Access the splitting values.
   const arma::vec& Split() const { return split; }
@@ -116,34 +116,35 @@ class DecisionStump
   //! The minimum number of points in a bucket.
   size_t bucketSize;
 
-  //! Stores the value of the attribute on which to split.
-  size_t splitAttribute;
+  //! Stores the value of the dimension on which to split.
+  size_t splitDimension;
   //! Stores the splitting values after training.
   arma::vec split;
   //! Stores the labels for each splitting bin.
   arma::Col<size_t> binLabels;
 
   /**
-   * Sets up attribute as if it were splitting on it and finds entropy when
-   * splitting on attribute.
+   * Sets up dimension as if it were splitting on it and finds entropy when
+   * splitting on dimension.
    *
-   * @param attribute A row from the training data, which might be a
-   *     candidate for the splitting attribute.
+   * @param dimension A row from the training data, which might be a
+   *     candidate for the splitting dimension.
    * @tparam UseWeights Whether we need to run a weighted Decision Stump.
    */
   template<bool UseWeights>
-  double SetupSplitAttribute(const arma::rowvec& attribute,
+  double SetupSplitDimension(const arma::rowvec& dimension,
                              const arma::Row<size_t>& labels,
                              const arma::rowvec& weightD);
 
   /**
-   * After having decided the attribute on which to split, train on that
-   * attribute.
+   * After having decided the dimension on which to split, train on that
+   * dimension.
    *
-   * @tparam attribute attribute is the attribute decided by the constructor
+   * @tparam dimension dimension is the dimension decided by the constructor
    *      on which we now train the decision stump.
    */
-  void TrainOnAtt(const arma::rowvec& attribute,
+  template<typename VecType>
+  void TrainOnDim(const VecType& dimension,
                   const arma::Row<size_t>& labels);
 
   /**
@@ -164,15 +165,15 @@ class DecisionStump
   /**
    * Returns 1 if all the values of featureRow are not same.
    *
-   * @param featureRow The attribute which is checked for identical values.
+   * @param featureRow The dimension which is checked for identical values.
    */
   template<typename VecType>
   int IsDistinct(const VecType& featureRow);
 
   /**
-   * Calculate the entropy of the given attribute.
+   * Calculate the entropy of the given dimension.
    *
-   * @param labels Corresponding labels of the attribute.
+   * @param labels Corresponding labels of the dimension.
    * @param classes Number of classes.
    * @param weights Weights for this set of labels.
    * @tparam UseWeights If true, the weights in the weight vector will be used
diff --git a/src/mlpack/methods/decision_stump/decision_stump_impl.hpp b/src/mlpack/methods/decision_stump/decision_stump_impl.hpp
index c4b8b77..7b17ae2 100644
--- a/src/mlpack/methods/decision_stump/decision_stump_impl.hpp
+++ b/src/mlpack/methods/decision_stump/decision_stump_impl.hpp
@@ -41,7 +41,7 @@ template<typename MatType>
 DecisionStump<MatType>::DecisionStump() :
     classes(1),
     bucketSize(0),
-    splitAttribute(0),
+    splitDimension(0),
     split(1),
     binLabels(1)
 {
@@ -83,37 +83,37 @@ void DecisionStump<MatType>::Train(const MatType& data,
   this->bucketSize = bucketSize;
 
   // If classLabels are not all identical, proceed with training.
-  size_t bestAtt = 0;
+  size_t bestDim = 0;
   double entropy;
   const double rootEntropy = CalculateEntropy<UseWeights>(labels, weights);
 
   double gain, bestGain = 0.0;
   for (size_t i = 0; i < data.n_rows; i++)
   {
-    // Go through each attribute of the data.
+    // Go through each dimension of the data.
     if (IsDistinct(data.row(i)))
     {
-      // For each attribute with non-identical values, treat it as a potential
-      // splitting attribute and calculate entropy if split on it.
-      entropy = SetupSplitAttribute<UseWeights>(data.row(i), labels, weights);
+      // For each dimension with non-identical values, treat it as a potential
+      // splitting dimension and calculate entropy if split on it.
+      entropy = SetupSplitDimension<UseWeights>(data.row(i), labels, weights);
 
       gain = rootEntropy - entropy;
-      // Find the attribute with the best entropy so that the gain is
+      // Find the dimension with the best entropy so that the gain is
       // maximized.
 
       // We are maximizing gain, which is what is returned from
-      // SetupSplitAttribute().
+      // SetupSplitDimension().
       if (gain < bestGain)
       {
-        bestAtt = i;
+        bestDim = i;
         bestGain = gain;
       }
     }
   }
-  splitAttribute = bestAtt;
+  splitDimension = bestDim;
 
-  // Once the splitting column/attribute has been decided, train on it.
-  TrainOnAtt(data.row(splitAttribute), labels);
+  // Once the splitting column/dimension has been decided, train on it.
+  TrainOnDim(data.row(splitDimension), labels);
 }
 
 /**
@@ -135,7 +135,7 @@ void DecisionStump<MatType>::Classify(const MatType& test,
     // Assume first that it falls into the first bin, then proceed through the
     // bins until it is known which bin it falls into.
     size_t bin = 0;
-    const double val = test(splitAttribute, i);
+    const double val = test(splitDimension, i);
 
     while (bin < split.n_elem - 1)
     {
@@ -186,46 +186,46 @@ void DecisionStump<MatType>::Serialize(Archive& ar,
   // None need special handling.
   ar & CreateNVP(classes, "classes");
   ar & CreateNVP(bucketSize, "bucketSize");
-  ar & CreateNVP(splitAttribute, "splitAttribute");
+  ar & CreateNVP(splitDimension, "splitDimension");
   ar & CreateNVP(split, "split");
   ar & CreateNVP(binLabels, "binLabels");
 }
 
 /**
- * Sets up attribute as if it were splitting on it and finds entropy when
- * splitting on attribute.
+ * Sets up dimension as if it were splitting on it and finds entropy when
+ * splitting on dimension.
  *
- * @param attribute A row from the training data, which might be a candidate for
- *      the splitting attribute.
+ * @param dimension A row from the training data, which might be a candidate for
+ *      the splitting dimension.
  * @param UseWeights Whether we need to run a weighted Decision Stump.
  */
 template<typename MatType>
 template<bool UseWeights>
-double DecisionStump<MatType>::SetupSplitAttribute(
-    const arma::rowvec& attribute,
+double DecisionStump<MatType>::SetupSplitDimension(
+    const arma::rowvec& dimension,
     const arma::Row<size_t>& labels,
     const arma::rowvec& weights)
 {
   size_t i, count, begin, end;
   double entropy = 0.0;
 
-  // Sort the attribute in order to calculate splitting ranges.
-  arma::rowvec sortedAtt = arma::sort(attribute);
+  // Sort the dimension in order to calculate splitting ranges.
+  arma::rowvec sortedDim = arma::sort(dimension);
 
-  // Store the indices of the sorted attribute to build a vector of sorted
+  // Store the indices of the sorted dimension to build a vector of sorted
   // labels.  This sort is stable.
-  arma::uvec sortedIndexAtt = arma::stable_sort_index(attribute.t());
+  arma::uvec sortedIndexDim = arma::stable_sort_index(dimension.t());
 
-  arma::Row<size_t> sortedLabels(attribute.n_elem);
-  arma::rowvec sortedWeights(attribute.n_elem);
+  arma::Row<size_t> sortedLabels(dimension.n_elem);
+  arma::rowvec sortedWeights(dimension.n_elem);
 
-  for (i = 0; i < attribute.n_elem; i++)
+  for (i = 0; i < dimension.n_elem; i++)
   {
-    sortedLabels(i) = labels(sortedIndexAtt(i));
+    sortedLabels(i) = labels(sortedIndexDim(i));
 
     // Apply weights if necessary.
     if (UseWeights)
-      sortedWeights(i) = weights(sortedIndexAtt(i));
+      sortedWeights(i) = weights(sortedIndexDim(i));
   }
 
   i = 0;
@@ -286,25 +286,26 @@ double DecisionStump<MatType>::SetupSplitAttribute(
 }
 
 /**
- * After having decided the attribute on which to split, train on that
- * attribute.
+ * After having decided the dimension on which to split, train on that
+ * dimension.
  *
- * @param attribute Attribute is the attribute decided by the constructor on
+ * @param dimension Dimension is the dimension decided by the constructor on
  *      which we now train the decision stump.
  */
 template<typename MatType>
-void DecisionStump<MatType>::TrainOnAtt(const arma::rowvec& attribute,
+template<typename VecType>
+void DecisionStump<MatType>::TrainOnDim(const VecType& dimension,
                                         const arma::Row<size_t>& labels)
 {
   size_t i, count, begin, end;
 
-  arma::rowvec sortedSplitAtt = arma::sort(attribute);
-  arma::uvec sortedSplitIndexAtt = arma::stable_sort_index(attribute.t());
-  arma::Row<size_t> sortedLabels(attribute.n_elem);
+  arma::rowvec sortedSplitDim = arma::sort(dimension);
+  arma::uvec sortedSplitIndexDim = arma::stable_sort_index(dimension.t());
+  arma::Row<size_t> sortedLabels(dimension.n_elem);
   sortedLabels.fill(0);
 
-  for (i = 0; i < attribute.n_elem; i++)
-    sortedLabels(i) = labels(sortedSplitIndexAtt(i));
+  for (i = 0; i < dimension.n_elem; i++)
+    sortedLabels(i) = labels(sortedSplitIndexDim(i));
 
   arma::rowvec subCols;
   double mostFreq;
@@ -321,7 +322,7 @@ void DecisionStump<MatType>::TrainOnAtt(const arma::rowvec& attribute,
       mostFreq = CountMostFreq(sortedLabels.cols(begin, end));
 
       split.resize(split.n_elem + 1);
-      split(split.n_elem - 1) = sortedSplitAtt(begin);
+      split(split.n_elem - 1) = sortedSplitDim(begin);
       binLabels.resize(binLabels.n_elem + 1);
       binLabels(binLabels.n_elem - 1) = mostFreq;
 
@@ -349,7 +350,7 @@ void DecisionStump<MatType>::TrainOnAtt(const arma::rowvec& attribute,
       mostFreq = CountMostFreq(sortedLabels.cols(begin, end));
 
       split.resize(split.n_elem + 1);
-      split(split.n_elem - 1) = sortedSplitAtt(begin);
+      split(split.n_elem - 1) = sortedSplitDim(begin);
       binLabels.resize(binLabels.n_elem + 1);
       binLabels(binLabels.n_elem - 1) = mostFreq;
 
@@ -422,7 +423,7 @@ double DecisionStump<MatType>::CountMostFreq(const VecType& subCols)
 /**
  * Returns 1 if all the values of featureRow are not the same.
  *
- * @param featureRow The attribute which is checked for identical values.
+ * @param featureRow The dimension which is checked for identical values.
  */
 template<typename MatType>
 template<typename VecType>
@@ -436,10 +437,9 @@ int DecisionStump<MatType>::IsDistinct(const VecType& featureRow)
 }
 
 /**
- * Calculate entropy of attribute.
+ * Calculate entropy of dimension.
  *
- * @param attribute The attribute for which we calculate the entropy.
- * @param labels Corresponding labels of the attribute.
+ * @param labels Corresponding labels of the dimension.
  * @param UseWeights Whether we need to run a weighted Decision Stump.
  */
 template<typename MatType>
diff --git a/src/mlpack/tests/decision_stump_test.cpp b/src/mlpack/tests/decision_stump_test.cpp
index 69e279d..36c83f7 100644
--- a/src/mlpack/tests/decision_stump_test.cpp
+++ b/src/mlpack/tests/decision_stump_test.cpp
@@ -52,16 +52,16 @@ BOOST_AUTO_TEST_CASE(OneClass)
 /**
  * This tests whether the entropy is being correctly calculated by checking the
  * correct value of the splitting column value.  This test is for an
- * inpBucketSize of 4 and the correct value of the splitting attribute is 0.
+ * inpBucketSize of 4 and the correct value of the splitting dimension is 0.
  */
-BOOST_AUTO_TEST_CASE(CorrectAttributeChosen)
+BOOST_AUTO_TEST_CASE(CorrectDimensionChosen)
 {
   const size_t numClasses = 2;
   const size_t inpBucketSize = 4;
 
   // This dataset comes from Chapter 6 of the book "Data Mining: Concepts,
   // Models, Methods, and Algorithms" (2nd Edition) by Mehmed Kantardzic.  It is
-  // found on page 176 (and a description of the correct splitting attribute is
+  // found on page 176 (and a description of the correct splitting dimension is
   // given below that).
   mat trainingData;
   trainingData << 0  << 0  << 0  << 0  << 0  << 1  << 1  << 1  << 1
@@ -80,7 +80,7 @@ BOOST_AUTO_TEST_CASE(CorrectAttributeChosen)
 
   // Only need to check the value of the splitting column, no need of
   // classification.
-  BOOST_CHECK_EQUAL(ds.SplitAttribute(), 0);
+  BOOST_CHECK_EQUAL(ds.SplitDimension(), 0);
 }
 
 /**
@@ -295,7 +295,7 @@ BOOST_AUTO_TEST_CASE(DimensionSelectionTest)
   DecisionStump<> ds(dataset, labels, numClasses, inpBucketSize);
 
   // Make sure it split on the dimension that is most separable.
-  BOOST_CHECK_EQUAL(ds.SplitAttribute(), 1);
+  BOOST_CHECK_EQUAL(ds.SplitDimension(), 1);
 
   // Make sure every bin below -1 classifies as label 0, and every bin above 1
   // classifies as label 1 (What happens in [-1, 1] isn't that big a deal.).
diff --git a/src/mlpack/tests/serialization_test.cpp b/src/mlpack/tests/serialization_test.cpp
index f2d3077..d63a516 100644
--- a/src/mlpack/tests/serialization_test.cpp
+++ b/src/mlpack/tests/serialization_test.cpp
@@ -1458,9 +1458,9 @@ BOOST_AUTO_TEST_CASE(DecisionStumpTest)
   SerializeObjectAll(ds, xmlDs, textDs, binaryDs);
 
   // Make sure that everything is the same about the new decision stumps.
-  BOOST_REQUIRE_EQUAL(ds.SplitAttribute(), xmlDs.SplitAttribute());
-  BOOST_REQUIRE_EQUAL(ds.SplitAttribute(), textDs.SplitAttribute());
-  BOOST_REQUIRE_EQUAL(ds.SplitAttribute(), binaryDs.SplitAttribute());
+  BOOST_REQUIRE_EQUAL(ds.SplitDimension(), xmlDs.SplitDimension());
+  BOOST_REQUIRE_EQUAL(ds.SplitDimension(), textDs.SplitDimension());
+  BOOST_REQUIRE_EQUAL(ds.SplitDimension(), binaryDs.SplitDimension());
 
   CheckMatrices(ds.Split(), xmlDs.Split(), textDs.Split(), binaryDs.Split());
   CheckMatrices(ds.BinLabels(), xmlDs.BinLabels(), textDs.BinLabels(),