[mlpack-git] master, mlpack-1.0.x: New test added. Improved entropy calculation. (1c27cea)

Thu Mar 5 21:51:09 EST 2015

Repository : https://github.com/mlpack/mlpack

On branches: master,mlpack-1.0.x
Link       : https://github.com/mlpack/mlpack/compare/904762495c039e345beba14c1142fd719b3bd50e...f94823c800ad6f7266995c700b1b630d5ffdcf40

>---------------------------------------------------------------

commit 1c27cea95221c437c0bc1ea9358749c6d9f786e4
Author: Udit Saxena <saxena.udit at gmail.com>
Date:   Thu Jul 3 18:39:04 2014 +0000

    New test added. Improved entropy calculation.


>---------------------------------------------------------------

1c27cea95221c437c0bc1ea9358749c6d9f786e4
 .../methods/decision_stump/decision_stump.hpp      |  6 +--
 .../methods/decision_stump/decision_stump_impl.hpp | 54 ++++++++--------------
 src/mlpack/tests/decision_stump_test.cpp           | 30 ++++++++++++
 3 files changed, 52 insertions(+), 38 deletions(-)

diff --git a/src/mlpack/methods/decision_stump/decision_stump.hpp b/src/mlpack/methods/decision_stump/decision_stump.hpp
index 689d7d3..3f90729 100644
--- a/src/mlpack/methods/decision_stump/decision_stump.hpp
+++ b/src/mlpack/methods/decision_stump/decision_stump.hpp
@@ -45,12 +45,13 @@ class DecisionStump
    */
   void Classify(const MatType& test, arma::Row<size_t>& predictedLabels);
 
+  int splitCol;
  private:
   //! Stores the number of classes.
   size_t numClass;
 
   //! Stores the value of the attribute on which to split.
-  int splitCol;
+  // int splitCol;
 
   //! Size of bucket while determining splitting criterion.
   size_t bucketSize;
@@ -109,8 +110,7 @@ class DecisionStump
    * @param labels Corresponding labels of the attribute.
    */
   template <typename AttType, typename LabelType>
-  double CalculateEntropy(arma::subview_row<AttType> attribute,
-                          arma::subview_row<LabelType> labels);
+  double CalculateEntropy(arma::subview_row<LabelType> labels);
 };
 
 }; // namespace decision_stump
diff --git a/src/mlpack/methods/decision_stump/decision_stump_impl.hpp b/src/mlpack/methods/decision_stump/decision_stump_impl.hpp
index b9c58a5..bdf531c 100644
--- a/src/mlpack/methods/decision_stump/decision_stump_impl.hpp
+++ b/src/mlpack/methods/decision_stump/decision_stump_impl.hpp
@@ -147,7 +147,7 @@ double DecisionStump<MatType>::SetupSplitAttribute(
 
   i = 0;
   count = 0;
-
+  double ratioEl;
   // This splits the sorted into buckets of size greater than or equal to
   // inpBucketSize.
   while (i < sortedLabels.n_elem)
@@ -160,8 +160,10 @@ double DecisionStump<MatType>::SetupSplitAttribute(
       begin = i - count + 1;
       end = i;
       
-      entropy += CalculateEntropy<double, size_t>(
-                 sortedAtt.subvec(begin,end),sortedLabels.subvec(begin,end));
+      // ratioEl is the fraction of all elements that fall in this split.
+      ratioEl = ((double)(end - begin + 1)/sortedLabels.n_elem);
+      
+      entropy += ratioEl * CalculateEntropy<size_t>(sortedLabels.subvec(begin,end));
       i++;
     }
     else if (sortedLabels(i) != sortedLabels(i + 1))
@@ -171,6 +173,8 @@ double DecisionStump<MatType>::SetupSplitAttribute(
       if (count < bucketSize)
       {
         // if it is, then take the minimum bucket size anyways
+        // This is where inpBucketSize comes into use: it makes sure
+        // there isn't a separate bucket for every change in labels.
         begin = i - count + 1;
         end = begin + bucketSize - 1;
 
@@ -183,9 +187,9 @@ double DecisionStump<MatType>::SetupSplitAttribute(
         begin = i - count + 1;
         end = i;
       }
+      ratioEl = ((double)(end - begin + 1)/sortedLabels.n_elem);
     
-      entropy += CalculateEntropy<double, size_t>(
-                 sortedAtt.subvec(begin,end),sortedLabels.subvec(begin,end));
+      entropy +=ratioEl * CalculateEntropy<size_t>(sortedLabels.subvec(begin,end));
 
       i = end + 1;
       count = 0;
@@ -269,7 +273,7 @@ void DecisionStump<MatType>::TrainOnAtt(const arma::rowvec& attribute,
 
       // Find the most frequent element in subCols so as to assign a label to
       // the bucket of subCols.
-      mostFreq = CountMostFreq<double>(subCols);//sortedLabels.subvec(begin, end));
+      mostFreq = CountMostFreq<double>(subCols);
 
       split.resize(split.n_elem + 1);
       split(split.n_elem - 1) = sortedSplitAtt(begin);
@@ -372,45 +376,25 @@ int DecisionStump<MatType>::isDistinct(const arma::Row<rType>& featureRow)
  */
 template<typename MatType>
 template<typename AttType, typename LabelType>
-double DecisionStump<MatType>::CalculateEntropy(arma::subview_row<AttType> attribute,
-                                                arma::subview_row<LabelType> labels)
+double DecisionStump<MatType>::CalculateEntropy(arma::subview_row<LabelType> labels)
 {
   double entropy = 0.0;
+  size_t j;
   
-  arma::rowvec uniqueAtt = arma::unique(attribute);
-  arma::Row<LabelType> uniqueLabel = arma::unique(labels);
-  arma::Row<size_t> numElem(uniqueAtt.n_elem);
+  arma::Row<size_t> numElem(numClass); 
   numElem.fill(0);
-  arma::Mat<size_t> entropyArray(uniqueAtt.n_elem,numClass);
-  entropyArray.fill(0);
 
-  // Populate entropyArray and numElem; they are used as helpers to calculate
+  // Populate numElem; it is used as a helper to calculate
   // entropy.
-  for (int j = 0; j < uniqueAtt.n_elem; j++)
-  {
-    for (int i = 0; i < attribute.n_elem; i++)
-    {
-      if (uniqueAtt[j] == attribute[i])
-      {
-        entropyArray(j, labels(i))++;
-        numElem(j)++;
-      }
-    }
-  }
+  for (j = 0; j < labels.n_elem; j++)
+    numElem(labels(j))++;
 
-  for (int j = 0; j < uniqueAtt.size(); j++)
+  for (j = 0; j < numClass; j++)
   {
-    const double p1 = ((double) numElem(j) / attribute.n_elem);
-
-    for (int i = 0; i < numClass; i++)
-    {
-      const double p2 = ((double) entropyArray(j, i) / numElem(j));
-      const double p3 = (p2 == 0) ? 0 : p2 * log2(p2);
+    const double p1 = ((double) numElem(j) / labels.n_elem);
   
-      entropy += p1 * p3;
-    }
+    entropy += (p1 == 0) ? 0 : p1 * log2(p1);
   }
-
   return entropy;
 }
 
diff --git a/src/mlpack/tests/decision_stump_test.cpp b/src/mlpack/tests/decision_stump_test.cpp
index efb55d5..04fbf41 100644
--- a/src/mlpack/tests/decision_stump_test.cpp
+++ b/src/mlpack/tests/decision_stump_test.cpp
@@ -47,6 +47,36 @@ BOOST_AUTO_TEST_CASE(OneClass)
     BOOST_CHECK_EQUAL(predictedLabels(i), 1);
 
 }
+/*
+This tests whether the entropy is being calculated correctly by
+checking that the splitting column has the expected value.
+This test uses an inpBucketSize of 4, for which the correct value of
+splitCol is 1.
+*/
+BOOST_AUTO_TEST_CASE(CorrectAttributeChosen)
+{
+  const size_t numClasses = 2;
+  const size_t inpBucketSize = 4;
+
+  mat trainingData;
+  trainingData << 0 << 0 << 0 << 0 << 0 << 1 << 1 << 1 << 1
+               << 2  << 2  << 2  << 2  << 2 << endr
+               << 70 << 90 << 85 << 95 << 70 << 90 << 78 << 65 << 75
+               << 80  << 70  << 80  << 80  << 96 << endr
+               << 1 << 1 << 0 << 0 << 0 << 1 << 0 << 1 << 0
+               << 1  << 1  << 0  << 0  << 0 << endr;
+
+  // No need to normalize labels here.
+  Mat<size_t> labelsIn;
+  labelsIn << 0 << 1 << 1 << 1 << 0 << 0 << 0 << 0
+           << 0 << 1 << 1 << 0 << 0 << 0;
+
+  DecisionStump<> ds(trainingData, labelsIn.row(0), numClasses, inpBucketSize);
+
+  // Only the value of the splitting column needs checking; no classification is needed.
+
+  BOOST_CHECK_EQUAL(ds.splitCol,1);
+}
 
 /**
  * This tests for the classification: