[mlpack-git] master: Avoid copies of dimensionMappings. (a4415f3)

gitdub at big.cc.gt.atl.ga.us gitdub at big.cc.gt.atl.ga.us
Wed Dec 23 11:43:13 EST 2015


Repository : https://github.com/mlpack/mlpack

On branch  : master
Link       : https://github.com/mlpack/mlpack/compare/de9cc4b05069e1fa4793d9355f2f595af5ff45d2...6070527af14296cd99739de6c62666cc5d2a2125

>---------------------------------------------------------------

commit a4415f3815eb29bcf58a6edcb8857f30acadf6b4
Author: Ryan Curtin <ryan at ratml.org>
Date:   Wed Sep 30 15:37:58 2015 -0400

    Avoid copies of dimensionMappings.
    
    This currently fails, but I need to move it to another system to use gdb.


>---------------------------------------------------------------

a4415f3815eb29bcf58a6edcb8857f30acadf6b4
 .../methods/hoeffding_trees/hoeffding_split.hpp    | 11 +++-
 .../hoeffding_trees/hoeffding_split_impl.hpp       | 76 +++++++++++++++++-----
 .../hoeffding_trees/streaming_decision_tree.hpp    |  4 +-
 .../streaming_decision_tree_impl.hpp               |  6 +-
 4 files changed, 74 insertions(+), 23 deletions(-)

diff --git a/src/mlpack/methods/hoeffding_trees/hoeffding_split.hpp b/src/mlpack/methods/hoeffding_trees/hoeffding_split.hpp
index 4d18b75..31d352a 100644
--- a/src/mlpack/methods/hoeffding_trees/hoeffding_split.hpp
+++ b/src/mlpack/methods/hoeffding_trees/hoeffding_split.hpp
@@ -27,7 +27,11 @@ class HoeffdingSplit
                  const size_t numClasses,
                  const data::DatasetInfo& datasetInfo,
                  const double successProbability,
-                 const size_t maxSamples);
+                 const size_t maxSamples,
+                 std::unordered_map<size_t, std::pair<size_t, size_t>>*
+                     dimensionMappings = NULL);
+
+  ~HoeffdingSplit();
 
   template<typename VecType>
   void Train(const VecType& point, const size_t label);
@@ -59,7 +63,10 @@ class HoeffdingSplit
   std::vector<NumericSplitType> numericSplits;
   std::vector<CategoricalSplitType> categoricalSplits;
 
-  std::unordered_map<size_t, std::pair<size_t, size_t>> dimensionMappings;
+  // This structure is owned by this node only if it is the root of the tree.
+  std::unordered_map<size_t, std::pair<size_t, size_t>>* dimensionMappings;
+  // Indicates whether or not we own the mappings.
+  bool ownsMappings;
 
   size_t numSamples;
   size_t numClasses;
diff --git a/src/mlpack/methods/hoeffding_trees/hoeffding_split_impl.hpp b/src/mlpack/methods/hoeffding_trees/hoeffding_split_impl.hpp
index b43a426..558dfb6 100644
--- a/src/mlpack/methods/hoeffding_trees/hoeffding_split_impl.hpp
+++ b/src/mlpack/methods/hoeffding_trees/hoeffding_split_impl.hpp
@@ -21,7 +21,12 @@ HoeffdingSplit<
                   const size_t numClasses,
                   const data::DatasetInfo& datasetInfo,
                   const double successProbability,
-                  const size_t maxSamples) :
+                  const size_t maxSamples,
+                  std::unordered_map<size_t, std::pair<size_t, size_t>>*
+                      dimensionMappings) :
+    dimensionMappings((dimensionMappings != NULL) ? dimensionMappings :
+        new std::unordered_map<size_t, std::pair<size_t, size_t>>()),
+    ownsMappings(dimensionMappings == NULL),
     numSamples(0),
     numClasses(numClasses),
     maxSamples(maxSamples),
@@ -31,20 +36,39 @@ HoeffdingSplit<
     categoricalSplit(0),
     numericSplit()
 {
-  for (size_t i = 0; i < dimensionality; ++i)
+  // Do we need to generate the mappings too?
+  if (ownsMappings)
   {
-    if (datasetInfo.Type(i) == data::Datatype::categorical)
+    for (size_t i = 0; i < dimensionality; ++i)
     {
-      categoricalSplits.push_back(
-          CategoricalSplitType(datasetInfo.NumMappings(i), numClasses));
-      dimensionMappings[i] = std::make_pair(data::Datatype::categorical,
-          categoricalSplits.size() - 1);
+      if (datasetInfo.Type(i) == data::Datatype::categorical)
+      {
+        categoricalSplits.push_back(
+            CategoricalSplitType(datasetInfo.NumMappings(i), numClasses));
+        dimensionMappings->at(i) = std::make_pair(data::Datatype::categorical,
+            categoricalSplits.size() - 1);
+      }
+      else
+      {
+        numericSplits.push_back(NumericSplitType(numClasses));
+        dimensionMappings->at(i) = std::make_pair(data::Datatype::numeric,
+            numericSplits.size() - 1);
+      }
     }
-    else
+  }
+  else
+  {
+    for (size_t i = 0; i < dimensionality; ++i)
     {
-      numericSplits.push_back(NumericSplitType(numClasses));
-      dimensionMappings[i] = std::make_pair(data::Datatype::numeric,
-          numericSplits.size() - 1);
+      if (datasetInfo.Type(i) == data::Datatype::categorical)
+      {
+        categoricalSplits.push_back(
+            CategoricalSplitType(datasetInfo.NumMappings(i), numClasses));
+      }
+      else
+      {
+        numericSplits.push_back(NumericSplitType(numClasses));
+      }
     }
   }
 }
@@ -52,6 +76,16 @@ HoeffdingSplit<
 template<typename FitnessFunction,
          typename NumericSplitType,
          typename CategoricalSplitType>
+HoeffdingSplit<FitnessFunction, NumericSplitType, CategoricalSplitType>::
+    ~HoeffdingSplit()
+{
+  if (ownsMappings)
+    delete dimensionMappings;
+}
+
+template<typename FitnessFunction,
+         typename NumericSplitType,
+         typename CategoricalSplitType>
 template<typename VecType>
 void HoeffdingSplit<
     FitnessFunction,
@@ -103,8 +137,8 @@ size_t HoeffdingSplit<
   arma::vec gains(categoricalSplits.size() + numericSplits.size());
   for (size_t i = 0; i < gains.n_elem; ++i)
   {
-    size_t type = dimensionMappings[i].first;
-    size_t index = dimensionMappings[i].second;
+    size_t type = dimensionMappings->at(i).first;
+    size_t index = dimensionMappings->at(i).second;
     if (type == data::Datatype::categorical)
       gains[i] = categoricalSplits[index].EvaluateFitnessFunction();
     else if (type == data::Datatype::numeric)
@@ -243,14 +277,16 @@ void HoeffdingSplit<
 {
   // Create the children.
   arma::Col<size_t> childMajorities;
-  if (dimensionMappings[splitDimension].first == data::Datatype::categorical)
+  if (dimensionMappings->at(splitDimension).first ==
+      data::Datatype::categorical)
   {
-    categoricalSplits[dimensionMappings[splitDimension].second].Split(
+    categoricalSplits[dimensionMappings->at(splitDimension).second].Split(
         childMajorities, categoricalSplit);
   }
-  else if (dimensionMappings[splitDimension].first == data::Datatype::numeric)
+  else if (dimensionMappings->at(splitDimension).first ==
+           data::Datatype::numeric)
   {
-    numericSplits[dimensionMappings[splitDimension].second].Split(
+    numericSplits[dimensionMappings->at(splitDimension).second].Split(
         childMajorities, numericSplit);
   }
 
@@ -259,9 +295,13 @@ void HoeffdingSplit<
   for (size_t i = 0; i < childMajorities.n_elem; ++i)
   {
     children.push_back(StreamingDecisionTreeType(datasetInfo, dimensionality,
-        numClasses, successProbability, numSamples));
+        numClasses, successProbability, numSamples, dimensionMappings));
     children[i].MajorityClass() = childMajorities[i];
   }
+
+  // Eliminate now-unnecessary split information.
+  numericSplits.clear();
+  categoricalSplits.clear();
 }
 
 } // namespace tree
diff --git a/src/mlpack/methods/hoeffding_trees/streaming_decision_tree.hpp b/src/mlpack/methods/hoeffding_trees/streaming_decision_tree.hpp
index 71b4315..bcfb2c2 100644
--- a/src/mlpack/methods/hoeffding_trees/streaming_decision_tree.hpp
+++ b/src/mlpack/methods/hoeffding_trees/streaming_decision_tree.hpp
@@ -30,7 +30,9 @@ class StreamingDecisionTree
                         const size_t dimensionality,
                         const size_t numClasses,
                         const double confidence = 0.95,
-                        const size_t numSamples = 5000);
+                        const size_t numSamples = 5000,
+                        std::unordered_map<size_t, std::pair<size_t, size_t>>*
+                            dimensionMappings = NULL);
 
   StreamingDecisionTree(const StreamingDecisionTree& other);
 
diff --git a/src/mlpack/methods/hoeffding_trees/streaming_decision_tree_impl.hpp b/src/mlpack/methods/hoeffding_trees/streaming_decision_tree_impl.hpp
index 0c0a3b5..31e9857 100644
--- a/src/mlpack/methods/hoeffding_trees/streaming_decision_tree_impl.hpp
+++ b/src/mlpack/methods/hoeffding_trees/streaming_decision_tree_impl.hpp
@@ -32,8 +32,10 @@ StreamingDecisionTree<SplitType, MatType>::StreamingDecisionTree(
     const size_t dimensionality,
     const size_t numClasses,
     const double confidence,
-    const size_t numSamples) :
-    split(dimensionality, numClasses, datasetInfo, confidence, numSamples)
+    const size_t numSamples,
+    std::unordered_map<size_t, std::pair<size_t, size_t>>* dimensionMappings) :
+    split(dimensionality, numClasses, datasetInfo, confidence, numSamples,
+        dimensionMappings)
 {
   // No training.  Anything else to do...?
 }



More information about the mlpack-git mailing list