[mlpack-git] master: Handle numeric and categorical attributes simultaneously. (55dfc55)

gitdub at big.cc.gt.atl.ga.us gitdub at big.cc.gt.atl.ga.us
Wed Dec 23 11:42:59 EST 2015


Repository : https://github.com/mlpack/mlpack

On branch  : master
Link       : https://github.com/mlpack/mlpack/compare/de9cc4b05069e1fa4793d9355f2f595af5ff45d2...6070527af14296cd99739de6c62666cc5d2a2125

>---------------------------------------------------------------

commit 55dfc552834bc7c9f2f84b8b636320c4ae207905
Author: ryan <ryan at ratml.org>
Date:   Tue Sep 29 12:21:41 2015 -0400

    Handle numeric and categorical attributes simultaneously.


>---------------------------------------------------------------

55dfc552834bc7c9f2f84b8b636320c4ae207905
 .../hoeffding_trees/hoeffding_numeric_split.hpp    |  2 ++
 .../methods/hoeffding_trees/hoeffding_split.hpp    |  2 ++
 .../hoeffding_trees/hoeffding_split_impl.hpp       | 23 ++++++++++++++++++----
 .../streaming_decision_tree_main.cpp               | 23 ++++++++++++++++++++--
 4 files changed, 44 insertions(+), 6 deletions(-)

diff --git a/src/mlpack/methods/hoeffding_trees/hoeffding_numeric_split.hpp b/src/mlpack/methods/hoeffding_trees/hoeffding_numeric_split.hpp
index 5b3df29..567fd76 100644
--- a/src/mlpack/methods/hoeffding_trees/hoeffding_numeric_split.hpp
+++ b/src/mlpack/methods/hoeffding_trees/hoeffding_numeric_split.hpp
@@ -64,6 +64,8 @@ class HoeffdingNumericSplit
 
   size_t MajorityClass() const;
 
+  size_t Bins() const { return bins; }
+
  private:
   // Cache the values of the points seen before we make bins.
   arma::Col<ObservationType> observations;
diff --git a/src/mlpack/methods/hoeffding_trees/hoeffding_split.hpp b/src/mlpack/methods/hoeffding_trees/hoeffding_split.hpp
index bc96870..3c1e9ff 100644
--- a/src/mlpack/methods/hoeffding_trees/hoeffding_split.hpp
+++ b/src/mlpack/methods/hoeffding_trees/hoeffding_split.hpp
@@ -53,6 +53,8 @@ class HoeffdingSplit
   std::vector<NumericSplitType> numericSplits;
   std::vector<CategoricalSplitType> categoricalSplits;
 
+  std::unordered_map<size_t, std::pair<size_t, size_t>> dimensionMappings;
+
   size_t numSamples;
   size_t numClasses;
   arma::Col<size_t> classCounts;
diff --git a/src/mlpack/methods/hoeffding_trees/hoeffding_split_impl.hpp b/src/mlpack/methods/hoeffding_trees/hoeffding_split_impl.hpp
index 5dffd01..765a197 100644
--- a/src/mlpack/methods/hoeffding_trees/hoeffding_split_impl.hpp
+++ b/src/mlpack/methods/hoeffding_trees/hoeffding_split_impl.hpp
@@ -33,10 +33,18 @@ HoeffdingSplit<
   for (size_t i = 0; i < dimensionality; ++i)
   {
     if (datasetInfo.Type(i) == data::Datatype::categorical)
+    {
       categoricalSplits.push_back(
           CategoricalSplitType(datasetInfo.NumMappings(i), numClasses));
+      dimensionMappings[i] = std::make_pair(data::Datatype::categorical,
+          categoricalSplits.size() - 1);
+    }
     else
+    {
       numericSplits.push_back(NumericSplitType(numClasses));
+      dimensionMappings[i] = std::make_pair(data::Datatype::numeric,
+          numericSplits.size() - 1);
+    }
   }
 }
 
@@ -98,8 +106,15 @@ size_t HoeffdingSplit<
       std::log(1.0 / (1.0 - successProbability)) / (2 * numSamples));
 
   arma::vec gains(categoricalSplits.size() + numericSplits.size());
-  for (size_t i = 0; i < categoricalSplits.size(); ++i)
-    gains[i] = categoricalSplits[i].EvaluateFitnessFunction();
+  for (size_t i = 0; i < gains.n_elem; ++i)
+  {
+    size_t type = dimensionMappings[i].first;
+    size_t index = dimensionMappings[i].second;
+    if (type == data::Datatype::categorical)
+      gains[i] = categoricalSplits[index].EvaluateFitnessFunction();
+    else if (type == data::Datatype::numeric)
+      gains[i] = numericSplits[index].EvaluateFitnessFunction();
+  }
 
   // Now find the largest and second-largest.
   double largest = -DBL_MAX;
@@ -132,8 +147,8 @@ size_t HoeffdingSplit<
     }
     else
     {
-      majorityClass = 0;
-      return 0; // I have no idea what to do yet.
+      majorityClass = numericSplits[largestIndex].MajorityClass();
+      return numericSplits[largestIndex].Bins();
     }
   }
   else
diff --git a/src/mlpack/methods/hoeffding_trees/streaming_decision_tree_main.cpp b/src/mlpack/methods/hoeffding_trees/streaming_decision_tree_main.cpp
index 073ea1d..9d6226c 100644
--- a/src/mlpack/methods/hoeffding_trees/streaming_decision_tree_main.cpp
+++ b/src/mlpack/methods/hoeffding_trees/streaming_decision_tree_main.cpp
@@ -7,6 +7,7 @@
 #include <mlpack/core.hpp>
 #include <mlpack/methods/hoeffding_trees/streaming_decision_tree.hpp>
 #include <mlpack/methods/hoeffding_trees/hoeffding_split.hpp>
+#include <stack>
 
 using namespace std;
 using namespace mlpack;
@@ -27,12 +28,30 @@ int main(int argc, char** argv)
   DatasetInfo datasetInfo;
   data::Load(trainingFile, trainingSet, datasetInfo, true);
 
-  arma::Row<size_t> labels;
-  data::Load(labelsFile, labels, true);
+  arma::Col<size_t> labelsIn;
+  data::Load(labelsFile, labelsIn, true, false);
+  arma::Row<size_t> labels = labelsIn.t();
 
   // Now create the decision tree.
   StreamingDecisionTree<HoeffdingSplit<>> tree(trainingSet, datasetInfo, labels,
       max(labels) + 1);
 
   // Great.  Good job team.
+  std::stack<StreamingDecisionTree<HoeffdingSplit<>>*> stack;
+  stack.push(&tree);
+  while (!stack.empty())
+  {
+    StreamingDecisionTree<HoeffdingSplit<>>* node = stack.top();
+    stack.pop();
+
+    Log::Info << "Node:\n";
+    Log::Info << "  split dimension " << node->Split().SplitDimension()
+        << ".\n";
+    Log::Info << "  majority class " << node->Split().Classify(arma::vec())
+        << ".\n";
+    Log::Info << "  children " << node->NumChildren() << ".\n";
+
+    for (size_t i = 0; i < node->NumChildren(); ++i)
+      stack.push(&node->Child(i));
+  }
 }



More information about the mlpack-git mailing list