[mlpack-git] master: More tests for Adaboost added, with tolerance for change in rt also provided. (fa79435)

Thu Mar 5 21:55:18 EST 2015

Repository : https://github.com/mlpack/mlpack

On branch  : master
Link       : https://github.com/mlpack/mlpack/compare/904762495c039e345beba14c1142fd719b3bd50e...f94823c800ad6f7266995c700b1b630d5ffdcf40

>---------------------------------------------------------------

commit fa79435d73928832748a028978992111da63adc5
Author: Udit Saxena <saxena.udit at gmail.com>
Date:   Tue Jul 29 18:56:10 2014 +0000

    More tests for Adaboost added, with tolerance for change in rt also provided.


>---------------------------------------------------------------

fa79435d73928832748a028978992111da63adc5
 src/mlpack/methods/adaboost/adaboost.hpp           |   5 +-
 src/mlpack/methods/adaboost/adaboost_impl.hpp      |  13 +-
 src/mlpack/methods/adaboost/adaboost_main.cpp      |   5 +-
 .../methods/decision_stump/decision_stump.hpp      |   9 +-
 .../methods/decision_stump/decision_stump_impl.hpp |  18 +-
 src/mlpack/tests/adaboost_test.cpp                 |  99 +++++++++-
 src/mlpack/tests/data/nonlinsepdata.txt            | 200 +++++++++++++++++++++
 .../{vc2_labels.txt => nonlinsepdata_labels.txt}   | 190 +++++---------------
 8 files changed, 372 insertions(+), 167 deletions(-)

diff --git a/src/mlpack/methods/adaboost/adaboost.hpp b/src/mlpack/methods/adaboost/adaboost.hpp
index cfca3bc..56a7b98 100644
--- a/src/mlpack/methods/adaboost/adaboost.hpp
+++ b/src/mlpack/methods/adaboost/adaboost.hpp
@@ -25,10 +25,11 @@ public:
    *  @param data Input data
    *  @param labels Corresponding labels
    *  @param iterations Number of boosting rounds 
+   *  @param tol The tolerance for change in values of rt.
    *  @param other Weak Learner, which has been initialized already
    */
   Adaboost(const MatType& data, const arma::Row<size_t>& labels,
-           int iterations, const WeakLearner& other);
+           int iterations, double tol, const WeakLearner& other);
 
   /**
    *  This function helps in building a classification Matrix which is of 
@@ -59,6 +60,8 @@ public:
   // To check for the bound for the hammingLoss.
   double ztAccumulator;
 
+  // Stopping tolerance: boosting ends once rt changes by less than this.
+  double tolerance;
 }; // class Adaboost
 
 } // namespace adaboost
diff --git a/src/mlpack/methods/adaboost/adaboost_impl.hpp b/src/mlpack/methods/adaboost/adaboost_impl.hpp
index 3d0d663..a6ed804 100644
--- a/src/mlpack/methods/adaboost/adaboost_impl.hpp
+++ b/src/mlpack/methods/adaboost/adaboost_impl.hpp
@@ -46,15 +46,15 @@ namespace adaboost {
  */
 template<typename MatType, typename WeakLearner>
 Adaboost<MatType, WeakLearner>::Adaboost(const MatType& data, 
-        const arma::Row<size_t>& labels, int iterations, 
+        const arma::Row<size_t>& labels, int iterations, double tol,
         const WeakLearner& other)
 {
   // Counting the number of classes into numClasses.
   size_t numClasses = (arma::max(labels) - arma::min(labels)) + 1;
-
+  tolerance = tol;
   int i, j, k;
   double rt, crt, alphat = 0.0, zt;
-  double tolerance = 1e-20;
+  // double tolerance = 1e-8;
   // std::cout<<"Tolerance is "<<tolerance<<"\n";
   // crt is for stopping the iterations when rt 
   // stops changing by less than a tolerant value.
@@ -127,11 +127,8 @@ Adaboost<MatType, WeakLearner>::Adaboost(const MatType& data,
 
     if (i > 0)
     {
-      if ( (rt - crt) < tolerance)
-      {
-        // std::cout<<(rt-crt)<<"\n";
-        i = iterations;
-      }
+      if ( std::abs(rt - crt) < tolerance )
+        break;
     }
     crt = rt;
 
diff --git a/src/mlpack/methods/adaboost/adaboost_main.cpp b/src/mlpack/methods/adaboost/adaboost_main.cpp
index 3cb9028..82e25b7 100644
--- a/src/mlpack/methods/adaboost/adaboost_main.cpp
+++ b/src/mlpack/methods/adaboost/adaboost_main.cpp
@@ -27,6 +27,7 @@ PARAM_STRING("output", "The file in which the predicted labels for the test set"
 PARAM_INT("iterations","The maximum number of boosting iterations "
   "to be run", "i", 1000);
 PARAM_INT_REQ("classes","The number of classes in the input label set.","c");
+PARAM_DOUBLE("tolerance","The tolerance for change in values of rt","e",1e-10);
 
 int main(int argc, char *argv[])
 {
@@ -75,6 +76,8 @@ int main(int argc, char *argv[])
   mat testingData;
   data::Load(testingDataFilename, testingData, true);
 
+  const double tolerance = CLI::GetParam<double>("tolerance");
+
   if (testingData.n_rows != trainingData.n_rows)
     Log::Fatal << "Test data dimensionality (" << testingData.n_rows << ") "
         << "must be the same as training data (" << trainingData.n_rows - 1
@@ -88,7 +91,7 @@ int main(int argc, char *argv[])
   perceptron::Perceptron<> p(trainingData, labels.t(), iter);
   
   Timer::Start("Training");
-  Adaboost<> a(trainingData, labels.t(), iterations, p);
+  Adaboost<> a(trainingData, labels.t(), iterations, tolerance, p);
   Timer::Stop("Training");
 
   return 0;
diff --git a/src/mlpack/methods/decision_stump/decision_stump.hpp b/src/mlpack/methods/decision_stump/decision_stump.hpp
index 3c0adcb..5255670 100644
--- a/src/mlpack/methods/decision_stump/decision_stump.hpp
+++ b/src/mlpack/methods/decision_stump/decision_stump.hpp
@@ -110,7 +110,8 @@ class DecisionStump
    *     candidate for the splitting attribute.
    */
   double SetupSplitAttribute(const arma::rowvec& attribute,
-                             const arma::Row<size_t>& labels);
+                             const arma::Row<size_t>& labels,
+                             const arma::rowvec& D);
 
   /**
    * After having decided the attribute on which to split, train on that
@@ -151,6 +152,12 @@ class DecisionStump
    */
   template <typename AttType, typename LabelType>
   double CalculateEntropy(arma::subview_row<LabelType> labels);
+
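+  /**
+   * Trains the stump on data and labels. D is indexed in parallel with
+   * labels (presumably a per-point weight -- TODO confirm); ctor passes 1s.
+   */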
+  void Train(const MatType& data, const arma::Row<size_t>& labels, const arma::rowvec& D);
 };
 
 }; // namespace decision_stump
diff --git a/src/mlpack/methods/decision_stump/decision_stump_impl.hpp b/src/mlpack/methods/decision_stump/decision_stump_impl.hpp
index 80d961c..089415f 100644
--- a/src/mlpack/methods/decision_stump/decision_stump_impl.hpp
+++ b/src/mlpack/methods/decision_stump/decision_stump_impl.hpp
@@ -34,6 +34,15 @@ DecisionStump<MatType>::DecisionStump(const MatType& data,
   numClass = classes;
   bucketSize = inpBucketSize;
 
+  arma::rowvec D(data.n_cols);
+  D.fill(1.0);
+
+  Train(data, labels, D);
+}
+
+template<typename MatType>
+void DecisionStump<MatType>::Train(const MatType& data, const arma::Row<size_t>& labels, const arma::rowvec& D)
+{
   // If classLabels are not all identical, proceed with training.
   int bestAtt = 0;
   double entropy;
@@ -48,7 +57,7 @@ DecisionStump<MatType>::DecisionStump(const MatType& data,
     {
       // For each attribute with non-identical values, treat it as a potential
       // splitting attribute and calculate entropy if split on it.
-      entropy = SetupSplitAttribute(data.row(i), labels);
+      entropy = SetupSplitAttribute(data.row(i), labels, D);
 
       // Log::Debug << "Entropy for attribute " << i << " is " << entropy << ".\n";
       gain = rootEntropy - entropy;
@@ -145,7 +154,8 @@ DecisionStump<MatType>::ModifyData(MatType& data, const arma::Row<double>& D)
 template <typename MatType>
 double DecisionStump<MatType>::SetupSplitAttribute(
     const arma::rowvec& attribute,
-    const arma::Row<size_t>& labels)
+    const arma::Row<size_t>& labels,
+    const arma::rowvec& D)
 {
   int i, count, begin, end;
   double entropy = 0.0;
@@ -160,8 +170,12 @@ double DecisionStump<MatType>::SetupSplitAttribute(
   arma::Row<size_t> sortedLabels(attribute.n_elem);
   sortedLabels.fill(0);
 
+  arma::rowvec dTemp(D.n_cols);
   for (i = 0; i < attribute.n_elem; i++)
+  {
     sortedLabels(i) = labels(sortedIndexAtt(i));
+    dTemp(i) = D(sortedIndexAtt(i));
+  }
 
   i = 0;
   count = 0;
diff --git a/src/mlpack/tests/adaboost_test.cpp b/src/mlpack/tests/adaboost_test.cpp
index d613e21..703889f 100644
--- a/src/mlpack/tests/adaboost_test.cpp
+++ b/src/mlpack/tests/adaboost_test.cpp
@@ -44,7 +44,8 @@ BOOST_AUTO_TEST_CASE(HammingLossBoundIris)
 
   // Define parameters for the adaboost
   int iterations = 100;
-  Adaboost<> a(inputData, labels.row(0), iterations, p);
+  double tolerance = 1e-10;
+  Adaboost<> a(inputData, labels.row(0), iterations, tolerance, p);
   int countError = 0;
   for (size_t i = 0; i < labels.n_cols; i++)
     if(labels(i) != a.finalHypothesis(i))
@@ -90,7 +91,8 @@ BOOST_AUTO_TEST_CASE(WeakLearnerErrorIris)
 
   // Define parameters for the adaboost
   int iterations = 100;
-  Adaboost<> a(inputData, labels.row(0), iterations, p);
+  double tolerance = 1e-10;
+  Adaboost<> a(inputData, labels.row(0), iterations, tolerance, p);
   int countError = 0;
   for (size_t i = 0; i < labels.n_cols; i++)
     if(labels(i) != a.finalHypothesis(i))
@@ -128,7 +130,8 @@ BOOST_AUTO_TEST_CASE(HammingLossBoundVertebralColumn)
 
   // Define parameters for the adaboost
   int iterations = 50;
-  Adaboost<> a(inputData, labels.row(0), iterations, p);
+  double tolerance = 1e-10;
+  Adaboost<> a(inputData, labels.row(0), iterations, tolerance, p);
   int countError = 0;
   for (size_t i = 0; i < labels.n_cols; i++)
     if(labels(i) != a.finalHypothesis(i))
@@ -175,7 +178,95 @@ BOOST_AUTO_TEST_CASE(WeakLearnerErrorVertebralColumn)
 
   // Define parameters for the adaboost
   int iterations = 50;
-  Adaboost<> a(inputData, labels.row(0), iterations, p);
+  double tolerance = 1e-10;
+  Adaboost<> a(inputData, labels.row(0), iterations, tolerance, p);
+  int countError = 0;
+  for (size_t i = 0; i < labels.n_cols; i++)
+    if(labels(i) != a.finalHypothesis(i))
+      countError++;
+  double error = (double) countError / labels.n_cols;
+  
+  BOOST_REQUIRE(error <= weakLearnerErrorRate);
+}
+
+/**
+ *  This test case runs the Adaboost.mh algorithm on a non-linearly
+ *  separable dataset.
+ *  It checks whether the Hamming loss breaches the upper bound, which
+ *  is provided by ztAccumulator.
+ */
+BOOST_AUTO_TEST_CASE(HammingLossBoundNonLinearSepData)
+{
+  arma::mat inputData;
+
+  if (!data::Load("nonlinsepdata.txt", inputData))
+    BOOST_FAIL("Cannot load test dataset nonlinsepdata.txt!");
+
+  arma::Mat<size_t> labels;
+
+  if (!data::Load("nonlinsepdata_labels.txt",labels))
+    BOOST_FAIL("Cannot load labels for nonlinsepdata_labels.txt");
+  
+  // no need to map the labels here
+
+  // Define your own weak learner, perceptron in this case.
+  // Run the perceptron for perceptron_iter iterations.
+  int perceptron_iter = 800;
+
+  perceptron::Perceptron<> p(inputData, labels.row(0), perceptron_iter);
+
+  // Define parameters for the adaboost
+  int iterations = 50;
+  double tolerance = 1e-10;
+  Adaboost<> a(inputData, labels.row(0), iterations, tolerance, p);
+  int countError = 0;
+  for (size_t i = 0; i < labels.n_cols; i++)
+    if(labels(i) != a.finalHypothesis(i))
+      countError++;
+  double hammingLoss = (double) countError / labels.n_cols;
+
+  BOOST_REQUIRE(hammingLoss <= a.ztAccumulator);
+}
+
+/**
+ *  This test case runs the Adaboost.mh algorithm on a non-linearly 
+ *  separable dataset. 
+ *  It checks if the error returned by running a single instance of the 
+ *  weak learner is worse than running the boosted weak learner using 
+ *  adaboost.
+ */
+BOOST_AUTO_TEST_CASE(WeakLearnerErrorNonLinearSepData)
+{
+  arma::mat inputData;
+
+  if (!data::Load("nonlinsepdata.txt", inputData))
+    BOOST_FAIL("Cannot load test dataset nonlinsepdata.txt!");
+
+  arma::Mat<size_t> labels;
+
+  if (!data::Load("nonlinsepdata_labels.txt",labels))
+    BOOST_FAIL("Cannot load labels for nonlinsepdata_labels.txt");
+  
+  // no need to map the labels here
+
+  // Define your own weak learner, perceptron in this case.
+  // Run the perceptron for perceptron_iter iterations.
+  int perceptron_iter = 800;
+
+  arma::Row<size_t> perceptronPrediction(labels.n_cols);
+  perceptron::Perceptron<> p(inputData, labels.row(0), perceptron_iter);
+  p.Classify(inputData, perceptronPrediction);
+  
+  int countWeakLearnerError = 0;
+  for (size_t i = 0; i < labels.n_cols; i++)
+    if(labels(i) != perceptronPrediction(i))
+      countWeakLearnerError++;
+  double weakLearnerErrorRate = (double) countWeakLearnerError / labels.n_cols;
+
+  // Define parameters for the adaboost
+  int iterations = 50;
+  double tolerance = 1e-10;
+  Adaboost<> a(inputData, labels.row(0), iterations, tolerance, p);
   int countError = 0;
   for (size_t i = 0; i < labels.n_cols; i++)
     if(labels(i) != a.finalHypothesis(i))
diff --git a/src/mlpack/tests/data/nonlinsepdata.txt b/src/mlpack/tests/data/nonlinsepdata.txt
new file mode 100644
index 0000000..aae2e37
--- /dev/null
+++ b/src/mlpack/tests/data/nonlinsepdata.txt
@@ -0,0 +1,200 @@
+-0.299105532	0.572326729
+-0.836483249	-0.14359759
+0.008063874	-0.007024867
+-0.343167143	0.42961481
+0.32154837	-0.208236731
+-0.217072934	-0.212645094
+0.429448087	0.042831669
+0.531008929	-0.252171061
+-0.228575587	-0.586958836
+0.733937526	-0.012346747
+0.303535501	0.571892222
+0.539039196	0.628216718
+0.412215252	-0.185824745
+-0.307331594	-0.885248399
+0.151464995	0.066677945
+0.02810081	-0.818338472
+0.015731441	-0.645799755
+0.661724665	-0.347577756
+-0.089698701	-0.363787138
+0.316161623	0.313278339
+0.88787143	0.111484946
+0.970688757	0.161322385
+0.55101173	0.099046711
+0.019806601	0.831823487
+0.596855325	-0.721216246
+-0.917824177	-0.228485105
+-0.27570897	-0.095019869
+0.108012122	-0.1654937
+0.292911812	0.289884615
+-0.170870048	0.921382619
+-0.269166632	-0.160922833
+0.637122848	-0.123215673
+-0.907638043	-0.366173065
+0.725175629	0.08535568
+0.260180377	-0.790052711
+0.059974592	0.558496803
+0.290949275	-0.775789564
+-0.696033218	0.591746086
+0.088498834	-0.102255727
+-0.015662941	-0.865967082
+-0.932972607	-0.344126727
+0.566055791	0.654023314
+0.492482144	0.194692911
+0.64246928	0.613606187
+0.308782039	0.088767443
+0.522317298	0.41842343
+0.144916266	0.942122486
+-0.124875724	-0.52727621
+-0.499126147	-0.196592035
+0.359473181	0.028418378
+0.125175757	0.060811083
+0.806560524	-0.16821312
+-0.418556149	-0.19088967
+0.149065896	0.493280871
+-0.010245202	0.701047926
+-0.008959135	-0.230829545
+0.254738853	0.713600534
+-0.334082867	-0.187395872
+0.754083034	-0.143750879
+0.478110324	-0.686417423
+0.020391919	0.319246924
+-0.642978854	0.025182941
+-0.607613175	-0.570268698
+0.306423566	-0.565936275
+0.123538578	0.186257023
+0.72706875	0.085249277
+0.737146843	0.347000254
+-0.714918394	0.061488226
+-0.762951097	-0.602080044
+-0.695314036	-0.321081659
+-0.390384895	0.866603455
+0.369827758	-0.026497804
+-0.327322825	-0.17926549
+-0.655559774	0.614518243
+-0.598580033	-0.198440877
+-0.011918728	-0.406462964
+-0.468026012	-0.841700798
+-0.055200503	0.587127554
+0.792314883	0.350783736
+-0.583600401	0.62081282
+-0.64458229	-0.68397351
+0.195561235	0.765132675
+0.015002733	-0.866439468
+0.539805874	0.239808175
+-0.033649068	0.518455043
+0.319246005	-0.891048936
+0.227677062	0.61339575
+-0.628677954	-0.754721192
+-0.804753322	-0.12575209
+0.702105593	-0.358808384
+-0.028267632	0.337560238
+0.891922873	0.392380999
+-0.06284684	-0.667804169
+0.105634707	-0.753147345
+0.534655451	-0.349686075
+0.443053473	-0.201818235
+0.946813295	0.083211205
+0.074300943	0.75376313
+-0.039789138	-0.035876894
+0.745621743	-0.247372451
+0.743619596	-1.81591802
+1.155134773	-1.155691663
+-0.730475116	-1.792547114
+-1.549717954	-0.723237783
+0.781720033	1.24857396
+-1.533967653	0.323906224
+-0.944213794	1.386142763
+0.254188113	1.607492509
+-1.624602712	-1.067337954
+-0.739085468	1.791634164
+1.278347298	-0.741926562
+1.662805028	-1.060597485
+-0.055156833	1.232406071
+0.900610262	0.753584576
+0.41579285	1.11011431
+-1.586495923	0.004145099
+-0.138408011	-1.022456668
+0.509090382	1.893340333
+0.02305279	1.865694236
+0.609535781	-0.820684516
+-1.067528965	0.623675136
+-1.049854852	-0.945513359
+0.120914993	1.174040076
+1.484419861	1.106783517
+-1.4191842	-0.141866709
+-1.934438955	0.279764857
+0.951389268	1.356798069
+-1.375209835	-1.321095706
+-1.808529076	-0.513250346
+-1.092175944	-1.588498871
+0.948952249	1.216195061
+0.015594362	-1.624437522
+0.321967496	1.814427652
+-1.352713814	-0.16529335
+0.345744458	-1.533369276
+1.656086814	0.233896107
+-0.460792886	0.913383678
+1.163585001	-1.194367686
+0.850341757	-1.256163979
+-1.796190808	-0.442470528
+0.932987115	-1.036952001
+1.82338821	-0.594913071
+0.763599686	-1.466453943
+0.758080019	-0.988607753
+1.492253497	1.066286642
+-0.978672999	-1.370508188
+-0.310337647	-1.032191049
+-1.157032863	-0.960133321
+0.534533646	-1.65875866
+1.892449042	0.417662491
+0.126897732	-0.992523572
+-0.227095688	-1.969442814
+-0.0888739	1.456654514
+-0.087928817	1.97828692
+-1.13053945	-1.239366312
+-1.052421933	1.542383058
+0.476271294	-1.580284838
+-1.602001068	0.563756307
+0.278412374	-1.333872331
+1.658498678	-0.099409044
+-0.55976856	1.713687384
+1.177482802	1.479349776
+1.470575571	-1.050073357
+-1.699394959	-0.454123902
+1.871811971	-0.299596785
+-0.917079278	-1.061606607
+1.73434779	-0.066363522
+-1.470010888	-0.437823934
+-1.444880111	-0.24213575
+0.985071168	1.123418374
+-0.915193748	1.169127518
+1.378278589	0.22912872
+0.903525291	1.431572615
+-0.0414839	-1.512951056
+1.944240347	0.375946415
+-1.164397187	1.295214988
+0.819463581	-0.961347546
+-0.532169769	-1.413116543
+-0.516650608	-1.327821851
+0.692724746	1.706765607
+-1.78217547	0.681062009
+-1.678951498	-0.944800369
+1.725046866	-0.111232858
+0.384280254	1.289555533
+0.018881382	1.325422045
+-0.144267356	1.88251979
+0.001741105	-1.607215265
+-1.778635709	0.492887681
+1.544133898	0.577162072
+-1.786134271	-0.095665791
+0.378568063	1.553615494
+0.547230462	-1.735854416
+-0.482079187	1.572950613
+1.525034275	0.267207967
+-1.185673846	-0.788824603
+0.610287572	-0.912237428
+-1.071198843	0.993129184
+-0.000990903	-1.113048629
+-1.982204157	-0.138989282
+-1.201996822	-0.316294472
diff --git a/src/mlpack/tests/data/vc2_labels.txt b/src/mlpack/tests/data/nonlinsepdata_labels.txt
similarity index 51%
copy from src/mlpack/tests/data/vc2_labels.txt
copy to src/mlpack/tests/data/nonlinsepdata_labels.txt
index 7601f70..cbfde7b 100644
--- a/src/mlpack/tests/data/vc2_labels.txt
+++ b/src/mlpack/tests/data/nonlinsepdata_labels.txt
@@ -58,6 +58,46 @@
 0
 0
 0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
 1
 1
 1
@@ -158,153 +198,3 @@
 1
 1
 1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2
-2