[mlpack-git] master, mlpack-1.0.x: Remove oneClass and defaultClass variables. There is a shortcut that can be taken when all the labels are the same, but the Entropy() function does not appear to be working correctly. (5800c8a)

gitdub at big.cc.gt.atl.ga.us gitdub at big.cc.gt.atl.ga.us
Thu Mar 5 21:49:57 EST 2015


Repository : https://github.com/mlpack/mlpack

On branches: master,mlpack-1.0.x
Link       : https://github.com/mlpack/mlpack/compare/904762495c039e345beba14c1142fd719b3bd50e...f94823c800ad6f7266995c700b1b630d5ffdcf40

>---------------------------------------------------------------

commit 5800c8adf6c7866d1cbf68c378753eb73a8e5054
Author: Ryan Curtin <ryan at ratml.org>
Date:   Thu Jun 26 23:21:07 2014 +0000

    Remove oneClass and defaultClass variables.  There is a shortcut that can be taken when all the labels are the same, but the Entropy() function does not appear to be working correctly.


>---------------------------------------------------------------

5800c8adf6c7866d1cbf68c378753eb73a8e5054
 .../methods/decision_stump/decision_stump.hpp      |   6 --
 .../methods/decision_stump/decision_stump_impl.hpp | 113 ++++++++++-----------
 2 files changed, 56 insertions(+), 63 deletions(-)

diff --git a/src/mlpack/methods/decision_stump/decision_stump.hpp b/src/mlpack/methods/decision_stump/decision_stump.hpp
index 9b16a39..fb7515d 100644
--- a/src/mlpack/methods/decision_stump/decision_stump.hpp
+++ b/src/mlpack/methods/decision_stump/decision_stump.hpp
@@ -49,15 +49,9 @@ class DecisionStump
   //! Stores the number of classes.
   size_t numClass;
 
-  //! Stores the default class. Provided for handling missing attribute values.
-  size_t defaultClass;
-
   //! Stores the value of the attribute on which to split.
   int splitCol;
 
-  //! Flag value for distinct input class labels.
-  bool oneClass;
-
   //! Size of bucket while determining splitting criterion.
   size_t bucketSize;
 
diff --git a/src/mlpack/methods/decision_stump/decision_stump_impl.hpp b/src/mlpack/methods/decision_stump/decision_stump_impl.hpp
index 5f6bf8c..625e12e 100644
--- a/src/mlpack/methods/decision_stump/decision_stump_impl.hpp
+++ b/src/mlpack/methods/decision_stump/decision_stump_impl.hpp
@@ -34,48 +34,55 @@ DecisionStump<MatType>::DecisionStump(const MatType& data,
   numClass = classes;
   bucketSize = inpBucketSize;
 
-  // Check whether the input labels are not all identical.
-  if (!isDistinct<size_t>(labels))
-  {
-    // If the labels are all identical, the default class is the only class.
-    oneClass = true;
-    defaultClass = labels(0);
-  }
-  else
-  {
-    // If labels are not all identical, proceed with training.
-    oneClass = false;
-    int bestAtt = -1;
-    double entropy;
-    double bestEntropy = DBL_MAX;
+  // If classLabels are not all identical, proceed with training.
+  int bestAtt = -1;
+  double entropy;
+  double bestEntropy = DBL_MAX;
 
-    // Set the default class to handle attribute values which are not present in
-    // the training data.
-    defaultClass = CountMostFreq<size_t>(labels);
+  // Set the default class to handle attribute values which are not present in
+  // the training data.
+  //defaultClass = CountMostFreq<size_t>(classLabels);
 
-    for (int i = 0; i < data.n_rows; i++)
+  for (int i = 0; i < data.n_rows; i++)
+  {
+    // Go through each attribute of the data.
+    if (isDistinct<double>(data.row(i)))
     {
-      // Go through each attribute of the data.
-      if (isDistinct<double>(data.row(i)))
+      // For each attribute with non-identical values, treat it as a potential
+      // splitting attribute and calculate entropy if split on it.
+      entropy = SetupSplitAttribute(data.row(i), labels);
+
+      // Find the attribute with the bestEntropy so that the gain is
+      // maximized.
+      if (entropy < bestEntropy)
       {
-        // For each attribute with non-identical values, treat it as a potential
-        // splitting attribute and calculate entropy if split on it.
-        entropy = SetupSplitAttribute(data.row(i), labels);
-
-        // Find the attribute with the bestEntropy so that the gain is
-        // maximized.
-        if (entropy < bestEntropy)
-        {
-          bestAtt = i;
-          bestEntropy = entropy;
-        }
+        bestAtt = i;
+        bestEntropy = entropy;
       }
-    }
-    splitCol = bestAtt;
 
-    // Once the splitting column/attribute has been decided, train on it.
-    TrainOnAtt<double>(data.row(splitCol), labels);
+      /* This section is commented out because I believe entropy calculation is
+       * wrong.  Entropy should only be 0 if there is only one class, in which
+       * case classification is perfect and we can take the shortcut below.
+
+      // If the entropy is 0, then all the labels are the same and we are done.
+      Log::Debug << "Entropy is " << entropy << "\n";
+      if (entropy == 0)
+      {
+        // Only one split element... there is no split at all, just one bin.
+        split.set_size(1);
+        binLabels.set_size(1);
+        split[0] = -DBL_MAX;
+        binLabels[0] = labels[0];
+        splitCol = 0; // It doesn't matter.
+        return;
+      }
+      */
+    }
   }
+  splitCol = bestAtt;
+
+  // Once the splitting column/attribute has been decided, train on it.
+  TrainOnAtt<double>(data.row(splitCol), labels);
 }
 
 /**
@@ -90,31 +97,23 @@ template<typename MatType>
 void DecisionStump<MatType>::Classify(const MatType& test,
                                       arma::Row<size_t>& predictedLabels)
 {
-  if (!oneClass)
+  for (int i = 0; i < test.n_cols; i++)
   {
-    for (int i = 0; i < test.n_cols; i++)
-    {
-      // Determine which bin the test point falls into.
-      // Assume first that it falls into the first bin, then proceed through the
-      // bins until it is known which bin it falls into.
-      int bin = 0;
-      const double val = test(splitCol, i);
+    // Determine which bin the test point falls into.
+    // Assume first that it falls into the first bin, then proceed through the
+    // bins until it is known which bin it falls into.
+    int bin = 0;
+    const double val = test(splitCol, i);
 
-      while (bin < split.n_elem - 1)
-      {
-        if (val < split(bin + 1))
-          break;
-
-        ++bin;
-      }
+    while (bin < split.n_elem - 1)
+    {
+      if (val < split(bin + 1))
+        break;
 
-      predictedLabels(i) = binLabels(bin);
+      ++bin;
     }
-  }
-  else
-  {
-    for (int i = 0; i < test.n_cols; i++)
-      predictedLabels(i) = defaultClass;
+
+    predictedLabels(i) = binLabels(bin);
   }
 }
 
@@ -408,7 +407,7 @@ double DecisionStump<MatType>::CalculateEntropy(const arma::rowvec& attribute,
     {
       if (uniqueAtt[j] == attribute[i])
       {
-        entropyArray(j,labels(i))++;
+        entropyArray(j, labels(i))++;
         numElem(j)++;
       }
     }



More information about the mlpack-git mailing list