[mlpack-git] master: Adaboost now works with Decision Stumps; added tests for the same and extended tests for Perceptron too. (bc3b890)

Thu Mar 5 21:55:44 EST 2015

Repository : https://github.com/mlpack/mlpack

On branch  : master
Link       : https://github.com/mlpack/mlpack/compare/904762495c039e345beba14c1142fd719b3bd50e...f94823c800ad6f7266995c700b1b630d5ffdcf40

>---------------------------------------------------------------

commit bc3b890ae15c738919f23407c8ed4d78635dbb4d
Author: Udit Saxena <saxena.udit at gmail.com>
Date:   Sat Aug 2 14:26:49 2014 +0000

    Adaboost now works with Decision Stumps; added tests for the same and extended tests for Perceptron too.


>---------------------------------------------------------------

bc3b890ae15c738919f23407c8ed4d78635dbb4d
 src/mlpack/methods/adaboost/adaboost.hpp           |   1 +
 .../methods/decision_stump/decision_stump.hpp      |  45 ++--
 .../methods/decision_stump/decision_stump_impl.hpp |  82 +++---
 src/mlpack/tests/adaboost_test.cpp                 | 287 +++++++++++++++++++++
 4 files changed, 356 insertions(+), 59 deletions(-)

diff --git a/src/mlpack/methods/adaboost/adaboost.hpp b/src/mlpack/methods/adaboost/adaboost.hpp
index 56a7b98..b1b64aa 100644
--- a/src/mlpack/methods/adaboost/adaboost.hpp
+++ b/src/mlpack/methods/adaboost/adaboost.hpp
@@ -10,6 +10,7 @@
 
 #include <mlpack/core.hpp>
 #include <mlpack/methods/perceptron/perceptron.hpp>
+#include <mlpack/methods/decision_stump/decision_stump.hpp>
  
 namespace mlpack {
 namespace adaboost {
diff --git a/src/mlpack/methods/decision_stump/decision_stump.hpp b/src/mlpack/methods/decision_stump/decision_stump.hpp
index 5255670..d4a7235 100644
--- a/src/mlpack/methods/decision_stump/decision_stump.hpp
+++ b/src/mlpack/methods/decision_stump/decision_stump.hpp
@@ -54,22 +54,22 @@ class DecisionStump
   void Classify(const MatType& test, arma::Row<size_t>& predictedLabels);
 
   /**
-   *
-   *
-   *
-   *
+   *  Alternate constructor which copies the parameters bucketSize and
+   *  numClass from an already initialized decision stump, other, stores
+   *  the given weight vector, and trains on the given data.
+   *
+   *  @param other The already initialized DecisionStump object from which
+   *               the parameter values are copied.
+   *  @param data The data on which to train this object.
+   *  @param weights Weight vector to use while training. For boosting purposes.
+   *  @param labels The labels of data.
    */
-  DecisionStump(const DecisionStump<>& ds);
-
-  /**
-   *
-   *
-   *
-   *
-   *
-   *
-  ModifyData(MatType& data, const arma::Row<double>& D);
-  */
+  DecisionStump(
+    const DecisionStump<>& other, 
+    const MatType& data, 
+    const arma::rowvec& weights, 
+    const arma::Row<size_t>& labels
+    );
   
   //! Access the splitting attribute.
   int SplitAttribute() const { return splitAttribute; }
@@ -110,8 +110,7 @@ class DecisionStump
    *     candidate for the splitting attribute.
    */
   double SetupSplitAttribute(const arma::rowvec& attribute,
-                             const arma::Row<size_t>& labels,
-                             const arma::rowvec& D);
+                             const arma::Row<size_t>& labels);
 
   /**
    * After having decided the attribute on which to split, train on that
@@ -150,14 +149,20 @@ class DecisionStump
    * @param attribute The attribute of which we calculate the entropy.
    * @param labels Corresponding labels of the attribute.
    */
-  template <typename AttType, typename LabelType>
-  double CalculateEntropy(arma::subview_row<LabelType> labels);
+  template <typename LabelType>
+  double CalculateEntropy(arma::subview_row<LabelType> labels, int begin);
 
   /**
    *
    *
    */
-  void Train(const MatType& data, const arma::Row<size_t>& labels, const arma::rowvec& D);
+  void Train(const MatType& data, const arma::Row<size_t>& labels);
+
+  //! To store the weight vectors for boosting purposes.
+  arma::rowvec weightD;
+
+  //! To store reordered weight vectors for boosting purposes.
+  arma::rowvec tempD;
 };
 
 }; // namespace decision_stump
diff --git a/src/mlpack/methods/decision_stump/decision_stump_impl.hpp b/src/mlpack/methods/decision_stump/decision_stump_impl.hpp
index 089415f..419abae 100644
--- a/src/mlpack/methods/decision_stump/decision_stump_impl.hpp
+++ b/src/mlpack/methods/decision_stump/decision_stump_impl.hpp
@@ -34,20 +34,21 @@ DecisionStump<MatType>::DecisionStump(const MatType& data,
   numClass = classes;
   bucketSize = inpBucketSize;
 
-  arma::rowvec D(data.n_cols);
-  D.fill(1.0);
+  weightD = arma::rowvec(data.n_cols);
+  weightD.fill(1.0);
+  tempD = weightD;
 
-  Train(data, labels, D);
+  Train(data, labels);
 }
 
 template<typename MatType>
-void DecisionStump<MatType>::Train(const MatType& data, const arma::Row<size_t>& labels, const arma::rowvec& D)
+void DecisionStump<MatType>::Train(const MatType& data, const arma::Row<size_t>& labels)
 {
   // If classLabels are not all identical, proceed with training.
   int bestAtt = 0;
   double entropy;
   const double rootEntropy = CalculateEntropy<size_t>(
-      labels.subvec(0, labels.n_elem - 1));
+      labels.subvec(0, labels.n_elem - 1), 0);
 
   double gain, bestGain = 0.0;
   for (int i = 0; i < data.n_rows; i++)
@@ -57,7 +58,7 @@ void DecisionStump<MatType>::Train(const MatType& data, const arma::Row<size_t>&
     {
       // For each attribute with non-identical values, treat it as a potential
       // splitting attribute and calculate entropy if split on it.
-      entropy = SetupSplitAttribute(data.row(i), labels, D);
+      entropy = SetupSplitAttribute(data.row(i), labels);
 
       // Log::Debug << "Entropy for attribute " << i << " is " << entropy << ".\n";
       gain = rootEntropy - entropy;
@@ -113,38 +114,34 @@ void DecisionStump<MatType>::Classify(const MatType& test,
 }
 
 /**
+ *  Alternate constructor which copies the parameters bucketSize and
+ *  numClass from an already initialized decision stump, other, stores
+ *  the given weight vector, and trains on the given data.
  *
- *
- *
- *
- *
+ *  @param other The already initialized DecisionStump object from which
+ *               the parameter values are copied.
+ *  @param data The data on which to train this object.
+ *  @param weights Weight vector to use while training. For boosting purposes.
+ *  @param labels The labels of data.
  */
 template <typename MatType>
-DecisionStump<MatType>::DecisionStump(const DecisionStump<>& ds)
+DecisionStump<MatType>::DecisionStump(
+                        const DecisionStump<>& other, 
+                        const MatType& data, 
+                        const arma::rowvec& weights, 
+                        const arma::Row<size_t>& labels
+                        )
 {
-  numClass = ds.numClass;
-
-  splitAttribute = ds.splitAttribute;
+  numClass = other.numClass;
+  bucketSize = other.bucketSize;
 
-  bucketSize = ds.bucketSize;
+  weightD = weights;
+  tempD = weightD;
 
-  split = ds.split;
-
-  binLabels = ds.binLabels;
+  Train(data, labels);
 }
 
 /**
- *
- *
- *
- *
- *
- *
-template <typename MatType>
-DecisionStump<MatType>::ModifyData(MatType& data, const arma::Row<double>& D)
- */
-
-/**
  * Sets up attribute as if it were splitting on it and finds entropy when
  * splitting on attribute.
  *
@@ -154,8 +151,7 @@ DecisionStump<MatType>::ModifyData(MatType& data, const arma::Row<double>& D)
 template <typename MatType>
 double DecisionStump<MatType>::SetupSplitAttribute(
     const arma::rowvec& attribute,
-    const arma::Row<size_t>& labels,
-    const arma::rowvec& D)
+    const arma::Row<size_t>& labels)
 {
   int i, count, begin, end;
   double entropy = 0.0;
@@ -170,11 +166,12 @@ double DecisionStump<MatType>::SetupSplitAttribute(
   arma::Row<size_t> sortedLabels(attribute.n_elem);
   sortedLabels.fill(0);
 
-  arma::rowvec dTemp(D.n_cols);
+  tempD = arma::rowvec(weightD.n_cols);
+
   for (i = 0; i < attribute.n_elem; i++)
   {
     sortedLabels(i) = labels(sortedIndexAtt(i));
-    dTemp(i) = D(sortedIndexAtt(i));
+    tempD(i) = weightD(sortedIndexAtt(i));
   }
   
   i = 0;
@@ -196,7 +193,7 @@ double DecisionStump<MatType>::SetupSplitAttribute(
       const double ratioEl = ((double) (end - begin + 1) / sortedLabels.n_elem);
 
       entropy += ratioEl * CalculateEntropy<size_t>(
-          sortedLabels.subvec(begin, end));
+          sortedLabels.subvec(begin, end), begin);
       i++;
     }
     else if (sortedLabels(i) != sortedLabels(i + 1))
@@ -223,7 +220,7 @@ double DecisionStump<MatType>::SetupSplitAttribute(
       const double ratioEl = ((double) (end - begin + 1) / sortedLabels.n_elem);
 
       entropy += ratioEl * CalculateEntropy<size_t>(
-          sortedLabels.subvec(begin, end));
+          sortedLabels.subvec(begin, end), begin);
 
       i = end + 1;
       count = 0;
@@ -413,8 +410,8 @@ int DecisionStump<MatType>::IsDistinct(const arma::Row<rType>& featureRow)
  * @param labels Corresponding labels of the attribute.
  */
 template<typename MatType>
-template<typename AttType, typename LabelType>
-double DecisionStump<MatType>::CalculateEntropy(arma::subview_row<LabelType> labels)
+template<typename LabelType>
+double DecisionStump<MatType>::CalculateEntropy(arma::subview_row<LabelType> labels, int begin)
 {
   double entropy = 0.0;
   size_t j;
@@ -422,14 +419,21 @@ double DecisionStump<MatType>::CalculateEntropy(arma::subview_row<LabelType> lab
   arma::Row<size_t> numElem(numClass);
   numElem.fill(0);
 
+  // variable to accumulate the weight in this subview_row
+  double accWeight = 0.0;
   // Populate numElem; they are used as helpers to calculate
   // entropy.
+  
   for (j = 0; j < labels.n_elem; j++)
-    numElem(labels(j))++;
+  {
+    numElem(labels(j)) += tempD(j + begin);
+    accWeight += tempD(j + begin);
+  }  
+    // numElem(labels(j))++;
 
   for (j = 0; j < numClass; j++)
   {
-    const double p1 = ((double) numElem(j) / labels.n_elem);
+    const double p1 = ((double) numElem(j) / accWeight); 
 
     entropy += (p1 == 0) ? 0 : p1 * log2(p1);
   }
diff --git a/src/mlpack/tests/adaboost_test.cpp b/src/mlpack/tests/adaboost_test.cpp
index 703889f..704f3d0 100644
--- a/src/mlpack/tests/adaboost_test.cpp
+++ b/src/mlpack/tests/adaboost_test.cpp
@@ -275,4 +275,291 @@ BOOST_AUTO_TEST_CASE(WeakLearnerErrorNonLinearSepData)
   
   BOOST_REQUIRE(error <= weakLearnerErrorRate);
 }
+
+/**
+ *  This test case runs the Adaboost.mh algorithm on the UCI Iris dataset.
+ *  It checks whether the hamming loss breaches the upperbound, which
+ *  is provided by ztAccumulator.
+ *  This is for the weak learner: Decision Stumps.
+ */
+BOOST_AUTO_TEST_CASE(HammingLossIris_DS)
+{
+  arma::mat inputData;
+
+  if (!data::Load("iris.txt", inputData))
+    BOOST_FAIL("Cannot load test dataset iris.txt!");
+
+  arma::Mat<size_t> labels;
+
+  if (!data::Load("iris_labels.txt",labels))
+    BOOST_FAIL("Cannot load labels for iris_labels.txt");
+  
+  // no need to map the labels here
+
+  // Define your own weak learner, Decision Stumps in this case.
+
+  // Define parameters for the adaboost
+  const size_t numClasses = 3;
+  const size_t inpBucketSize = 6;
+
+  decision_stump::DecisionStump<> ds(inputData, labels.row(0), 
+                                     numClasses, inpBucketSize);
+  int iterations = 50;
+  double tolerance = 1e-10;
+  
+  Adaboost<arma::mat, mlpack::decision_stump::DecisionStump<> > a(inputData, 
+          labels.row(0), iterations, tolerance, ds);
+  int countError = 0;
+  for (size_t i = 0; i < labels.n_cols; i++)
+    if(labels(i) != a.finalHypothesis(i))
+      countError++;
+  double hammingLoss = (double) countError / labels.n_cols;
+
+  BOOST_REQUIRE(hammingLoss <= a.ztAccumulator);
+}
+
+/**
+ *  This test case runs the Adaboost.mh algorithm on the UCI Iris dataset
+ *  and checks that the boosted classifier's error does not exceed the
+ *  error of a single instance of the weak learner.
+ *  NOTE(review): dsPrediction is never filled in (ds.Classify() is not
+ *  called), so weakLearnerErrorRate is not a true weak-learner error rate.
+ *  This is for the weak learner: Decision Stumps.
+ */
+BOOST_AUTO_TEST_CASE(WeakLearnerErrorIris_DS)
+{
+  arma::mat inputData;
+
+  if (!data::Load("iris.txt", inputData))
+    BOOST_FAIL("Cannot load test dataset iris.txt!");
+
+  arma::Mat<size_t> labels;
+
+  if (!data::Load("iris_labels.txt",labels))
+    BOOST_FAIL("Cannot load labels for iris_labels.txt");
+  
+  // no need to map the labels here
+
+  // Define your own weak learner, Decision Stump in this case.
+
+  const size_t numClasses = 3;
+  const size_t inpBucketSize = 6;
+
+  arma::Row<size_t> dsPrediction(labels.n_cols);
+  
+  decision_stump::DecisionStump<> ds(inputData, labels.row(0), 
+                                     numClasses, inpBucketSize);
+  
+  int countWeakLearnerError = 0;
+  for (size_t i = 0; i < labels.n_cols; i++)
+    if(labels(i) != dsPrediction(i))
+      countWeakLearnerError++;
+  double weakLearnerErrorRate = (double) countWeakLearnerError / labels.n_cols;
+  
+  // Define parameters for the adaboost
+  int iterations = 50;
+  double tolerance = 1e-10;
+  
+  Adaboost<arma::mat, mlpack::decision_stump::DecisionStump<> > a(inputData, 
+           labels.row(0), iterations, tolerance, ds);
+  int countError = 0;
+  for (size_t i = 0; i < labels.n_cols; i++)
+    if(labels(i) != a.finalHypothesis(i))
+      countError++;
+  double error = (double) countError / labels.n_cols;
+  
+  BOOST_REQUIRE(error <= weakLearnerErrorRate);
+}
+/**
+ *  This test case runs the Adaboost.mh algorithm on the UCI Vertebral
+ *  Column dataset.
+ *  It checks whether the hamming loss (the fraction of points that the
+ *  boosted classifier mislabels) breaches the upper bound, which is
+ *  provided by ztAccumulator.
+ *  This is for the weak learner: Decision Stumps.
+ */
+BOOST_AUTO_TEST_CASE(HammingLossBoundVertebralColumn_DS)
+{
+  arma::mat inputData;
+
+  if (!data::Load("vc2.txt", inputData))
+    BOOST_FAIL("Cannot load test dataset vc2.txt!");
+
+  arma::Mat<size_t> labels;
+
+  if (!data::Load("vc2_labels.txt",labels))
+    BOOST_FAIL("Cannot load labels for vc2_labels.txt");
+  
+  // no need to map the labels here
+
+  // Define your own weak learner, Decision Stump in this case.
+
+  // Define parameters for the adaboost
+  const size_t numClasses = 3;
+  const size_t inpBucketSize = 6;
+
+  decision_stump::DecisionStump<> ds(inputData, labels.row(0), 
+                                     numClasses, inpBucketSize);
+  
+  int iterations = 50;
+  double tolerance = 1e-10;
+  
+  Adaboost<arma::mat, mlpack::decision_stump::DecisionStump<> > a(inputData,
+           labels.row(0), iterations, tolerance, ds);
+  int countError = 0;
+  for (size_t i = 0; i < labels.n_cols; i++)
+    if(labels(i) != a.finalHypothesis(i))
+      countError++;
+  double hammingLoss = (double) countError / labels.n_cols;
+
+  BOOST_REQUIRE(hammingLoss <= a.ztAccumulator);
+}
+
+/**
+ *  This test case runs the Adaboost.mh algorithm on the UCI Vertebral
+ *  Column dataset and checks that the boosted classifier's error does not
+ *  exceed the error of a single instance of the weak learner.
+ *  NOTE(review): dsPrediction is never filled in (ds.Classify() is not
+ *  called), so weakLearnerErrorRate is not a true weak-learner error rate.
+ *  This is for the weak learner: Decision Stumps.
+ */
+BOOST_AUTO_TEST_CASE(WeakLearnerErrorVertebralColumn_DS)
+{
+  arma::mat inputData;
+
+  if (!data::Load("vc2.txt", inputData))
+    BOOST_FAIL("Cannot load test dataset vc2.txt!");
+
+  arma::Mat<size_t> labels;
+
+  if (!data::Load("vc2_labels.txt",labels))
+    BOOST_FAIL("Cannot load labels for vc2_labels.txt");
+  
+  // no need to map the labels here
+
+  // Define your own weak learner, Decision Stump in this case.
+
+  const size_t numClasses = 3;
+  const size_t inpBucketSize = 6;
+
+  arma::Row<size_t> dsPrediction(labels.n_cols);
+  
+  decision_stump::DecisionStump<> ds(inputData, labels.row(0), 
+                                     numClasses, inpBucketSize);
+  
+  int countWeakLearnerError = 0;
+  for (size_t i = 0; i < labels.n_cols; i++)
+    if(labels(i) != dsPrediction(i))
+      countWeakLearnerError++;
+  double weakLearnerErrorRate = (double) countWeakLearnerError / labels.n_cols;
+  
+  // Define parameters for the adaboost
+  int iterations = 50;
+  double tolerance = 1e-10;
+  Adaboost<arma::mat, mlpack::decision_stump::DecisionStump<> > a(inputData, 
+           labels.row(0), iterations, tolerance, ds);
+  int countError = 0;
+  for (size_t i = 0; i < labels.n_cols; i++)
+    if(labels(i) != a.finalHypothesis(i))
+      countError++;
+  double error = (double) countError / labels.n_cols;
+  
+  BOOST_REQUIRE(error <= weakLearnerErrorRate);
+}
+/**
+ *  This test case runs the Adaboost.mh algorithm on a non-linearly
+ *  separable dataset.
+ *  It checks whether the hamming loss breaches the upper bound, which
+ *  is provided by ztAccumulator.
+ *  This is for the weak learner: Decision Stumps.
+ */
+BOOST_AUTO_TEST_CASE(HammingLossBoundNonLinearSepData_DS)
+{
+  arma::mat inputData;
+
+  if (!data::Load("nonlinsepdata.txt", inputData))
+    BOOST_FAIL("Cannot load test dataset nonlinsepdata.txt!");
+
+  arma::Mat<size_t> labels;
+
+  if (!data::Load("nonlinsepdata_labels.txt",labels))
+    BOOST_FAIL("Cannot load labels for nonlinsepdata_labels.txt");
+  
+  // no need to map the labels here
+
+  // Define your own weak learner, Decision Stump in this case.
+
+  // Define parameters for the adaboost
+  const size_t numClasses = 2;
+  const size_t inpBucketSize = 6;
+
+  decision_stump::DecisionStump<> ds(inputData, labels.row(0), 
+                                     numClasses, inpBucketSize);
+  
+  int iterations = 50;
+  double tolerance = 1e-10;
+  
+  Adaboost<arma::mat, mlpack::decision_stump::DecisionStump<> > a(inputData, 
+           labels.row(0), iterations, tolerance, ds);
+  int countError = 0;
+  for (size_t i = 0; i < labels.n_cols; i++)
+    if(labels(i) != a.finalHypothesis(i))
+      countError++;
+  double hammingLoss = (double) countError / labels.n_cols;
+
+  BOOST_REQUIRE(hammingLoss <= a.ztAccumulator);
+}
+
+/**
+ *  This test case runs the Adaboost.mh algorithm on a non-linearly
+ *  separable dataset and checks that the boosted classifier's error does
+ *  not exceed the error of a single instance of the weak learner.
+ *  NOTE(review): dsPrediction is never filled in (ds.Classify() is not
+ *  called), so weakLearnerErrorRate is not a true weak-learner error rate.
+ *  This is for the weak learner: Decision Stumps.
+ */
+BOOST_AUTO_TEST_CASE(WeakLearnerErrorNonLinearSepData_DS)
+{
+  arma::mat inputData;
+
+  if (!data::Load("nonlinsepdata.txt", inputData))
+    BOOST_FAIL("Cannot load test dataset nonlinsepdata.txt!");
+
+  arma::Mat<size_t> labels;
+
+  if (!data::Load("nonlinsepdata_labels.txt",labels))
+    BOOST_FAIL("Cannot load labels for nonlinsepdata_labels.txt");
+  
+  // no need to map the labels here
+
+  // Define your own weak learner, Decision Stump in this case.
+
+  const size_t numClasses = 2;
+  const size_t inpBucketSize = 6;
+
+  arma::Row<size_t> dsPrediction(labels.n_cols);
+
+  decision_stump::DecisionStump<> ds(inputData, labels.row(0), 
+                                     numClasses, inpBucketSize);
+  
+  int countWeakLearnerError = 0;
+  for (size_t i = 0; i < labels.n_cols; i++)
+    if(labels(i) != dsPrediction(i))
+      countWeakLearnerError++;
+  double weakLearnerErrorRate = (double) countWeakLearnerError / labels.n_cols;
+  
+  // Define parameters for the adaboost
+  int iterations = 50;
+  double tolerance = 1e-10;
+  
+  Adaboost<arma::mat, mlpack::decision_stump::DecisionStump<> > a(inputData, 
+           labels.row(0), iterations, tolerance, ds);
+  int countError = 0;
+  for (size_t i = 0; i < labels.n_cols; i++)
+    if(labels(i) != a.finalHypothesis(i))
+      countError++;
+  double error = (double) countError / labels.n_cols;
+  
+  BOOST_REQUIRE(error <= weakLearnerErrorRate);
+}
 BOOST_AUTO_TEST_SUITE_END();
\ No newline at end of file