[mlpack-svn] r13226 - mlpack/trunk/src/mlpack/methods/det

fastlab-svn at coffeetalk-1.cc.gatech.edu fastlab-svn at coffeetalk-1.cc.gatech.edu
Fri Jul 13 11:43:19 EDT 2012


Author: rcurtin
Date: 2012-07-13 11:43:19 -0400 (Fri, 13 Jul 2012)
New Revision: 13226

Modified:
   mlpack/trunk/src/mlpack/methods/det/dtree.hpp
   mlpack/trunk/src/mlpack/methods/det/dtree_impl.hpp
Log:
Rename FindSplit and its variables to names that adhere to the style guidelines.
Remove a few unused variables.


Modified: mlpack/trunk/src/mlpack/methods/det/dtree.hpp
===================================================================
--- mlpack/trunk/src/mlpack/methods/det/dtree.hpp	2012-07-12 20:51:58 UTC (rev 13225)
+++ mlpack/trunk/src/mlpack/methods/det/dtree.hpp	2012-07-13 15:43:19 UTC (rev 13226)
@@ -145,15 +145,15 @@
   ////////////////////// Private Functions ////////////////////////////////////
  private:
 
-  inline double LogNegativeError(const size_t total_points);
+  inline double LogNegativeError(const size_t total_points) const;
 
-  bool FindSplit_(const arma::mat& data,
-                  size_t* split_dim,
-                  size_t* split_ind,
-                  cT* left_error,
-                  cT* right_error,
-                  size_t maxLeafSize = 10,
-                  size_t minLeafSize = 5);
+  bool FindSplit(const arma::mat& data,
+                 size_t& split_dim,
+                 size_t& split_ind,
+                 double& left_error,
+                 double& right_error,
+                 const size_t maxLeafSize = 10,
+                 const size_t minLeafSize = 5) const;
 
   void SplitData_(MatType* data,
                   size_t split_dim,

Modified: mlpack/trunk/src/mlpack/methods/det/dtree_impl.hpp
===================================================================
--- mlpack/trunk/src/mlpack/methods/det/dtree_impl.hpp	2012-07-12 20:51:58 UTC (rev 13225)
+++ mlpack/trunk/src/mlpack/methods/det/dtree_impl.hpp	2012-07-13 15:43:19 UTC (rev 13226)
@@ -17,11 +17,11 @@
 // This function computes the log-l2-negative-error of a given node from the
 // formula R(t) = log(|t|^2 / (N^2 V_t)).
 template<typename eT, typename cT>
-inline double DTree<eT, cT>::LogNegativeError(const size_t total_points)
+inline double DTree<eT, cT>::LogNegativeError(const size_t totalPoints) const
 {
   // log(-|t|^2 / (N^2 V_t)) = log(-1) + 2 log(|t|) - 2 log(N) - log(V_t).
   return 2 * std::log((double) (end_ - start_)) -
-         2 * std::log((double) total_points) -
+         2 * std::log((double) totalPoints) -
          arma::accu(arma::log((*max_vals_) - (*min_vals_)));
 }
 
@@ -29,24 +29,23 @@
 // all possible splits.  The dataset is the full data set but the start_ and
 // end_ are used to obtain the point in this node.
 template<typename eT, typename cT>
-bool DTree<eT, cT>::FindSplit_(const arma::mat& data,
-                               size_t *split_dim,
-                               size_t *split_ind,
-                               cT *left_error,
-                               cT *right_error,
-                               size_t maxLeafSize,
-                               size_t minLeafSize)
+bool DTree<eT, cT>::FindSplit(const arma::mat& data,
+                              size_t& splitDim,
+                              size_t& splitIndex,
+                              double& leftError,
+                              double& rightError,
+                              const size_t maxLeafSize,
+                              const size_t minLeafSize) const
 {
   // Ensure the dimensionality of the data is the same as the dimensionality of
   // the bounding rectangle.
   assert(data.n_rows == max_vals_->n_elem);
   assert(data.n_rows == min_vals_->n_elem);
 
-  const size_t n_t = end_ - start_;
+  const size_t points = end_ - start_;
 
-  double min_error = std::log(-error_);
-  bool some_split_found = false;
-  size_t point_mass_in_dim = 0;
+  double minError = std::log(-error_);
+  bool splitFound = false;
 
   // Loop through each dimension.
   for (size_t dim = 0; dim < max_vals_->n_elem; dim++)
@@ -58,50 +57,42 @@
 
     // If there is nothing to split in this dimension, move on.
     if (max - min == 0.0)
-    {
-      ++point_mass_in_dim;
       continue; // Skip to next dimension.
-    }
 
     // Initializing all the stuff for this dimension.
-    bool dim_split_found = false;
+    bool dimSplitFound = false;
     // Take an error estimate for this dimension.
-    double min_dim_error = n_t / (max - min);
-    double temp_lval = 0.0;
-    double temp_rval = 0.0;
-    size_t dim_split_ind = -1;
+    double minDimError = points / (max - min);
+    double dimLeftError = 0.0;
+    double dimRightError = 0.0;
+    size_t dimSplitIndex = -1;
 
     // Find the log volume of all the other dimensions.
-    double log_range_all_not_dim = 0;
+    double volumeWithoutDim = 0;
     for (size_t i = 0; i < max_vals_->n_elem; ++i)
     {
       if (((*max_vals_)[i] - (*min_vals_)[i] > 0.0) && (i != dim))
       {
-        log_range_all_not_dim += std::log((*max_vals_)[i] - (*min_vals_)[i]);
+        volumeWithoutDim += std::log((*max_vals_)[i] - (*min_vals_)[i]);
       }
     }
 
     // Get the values for the dimension.
-    arma::rowvec dim_val_vec = data.row(dim).subvec(start_, end_ - 1);
+    arma::rowvec dimVec = data.row(dim).subvec(start_, end_ - 1);
 
     // Sort the values in ascending order.
-    dim_val_vec = arma::sort(dim_val_vec);
+    dimVec = arma::sort(dimVec);
 
     // Get ready to go through the sorted list and compute error.
-    assert(dim_val_vec.n_elem > maxLeafSize);
+    assert(dimVec.n_elem > maxLeafSize);
 
-    // Enforce that the leaves have a minimum number of points to avoid
-    // spikes.  One way of doing this is to only consider splits resulting in
-    // sizes > some constant (minLeafSize).
-    size_t right_child_size;
-
     // Find the best split for this dimension.  We need to figure out why
     // there are spikes if this min_leaf_size is enforced here...
-    for (size_t i = minLeafSize - 1; i < dim_val_vec.n_elem - minLeafSize; ++i)
+    for (size_t i = minLeafSize - 1; i < dimVec.n_elem - minLeafSize; ++i)
     {
       double split;
-      double lsplit = dim_val_vec[i];
-      double rsplit = dim_val_vec[i + 1];
+      double lsplit = dimVec[i];
+      double rsplit = dimVec[i + 1];
 
       if (lsplit == rsplit)
         continue; // We can't split here.
@@ -114,56 +105,57 @@
       //   split = left_split;
       if ((split - min > 0.0) && (max - split > 0.0))
       {
+        // Ensure that the right node will have at least the minimum number of
+        // points.
+        Log::Assert((points - i - 1) >= minLeafSize);
+
         // Now we have to see if the error will be reduced.  Simple manipulation
         // of the error function gives us the condition we must satisfy:
         //   |t_l|^2 / V_l + |t_r|^2 / V_r  >= |t|^2 / (V_l + V_r)
         // and because the volume is only dependent on the dimension we are
         // splitting, we can assume V_l is just the range of the left and V_r is
         // just the range of the right.
-        right_child_size = n_t - i - 1;
-        Log::Assert(right_child_size >= minLeafSize);
-
         double negLeftError = std::pow(i + 1, 2.0) / (split - min);
-        double negRightError = std::pow(n_t - i - 1, 2.0) / (max - split);
+        double negRightError = std::pow(points - i - 1, 2.0) / (max - split);
 
         // If this is better, take it.
-        if ((negLeftError + negRightError) >= min_dim_error)
+        if ((negLeftError + negRightError) >= minDimError)
         {
-          min_dim_error = negLeftError + negRightError;
-          temp_lval = negLeftError;
-          temp_rval = negRightError;
-          dim_split_ind = i;
-          dim_split_found = true;
+          minDimError = negLeftError + negRightError;
+          dimLeftError = negLeftError;
+          dimRightError = negRightError;
+          dimSplitIndex = i;
+          dimSplitFound = true;
         }
       }
     }
 
-    dim_val_vec.clear();
+    dimVec.clear();
 
-    double actualMinDimError = std::log(min_dim_error) -
-        2 * std::log(data.n_cols) - log_range_all_not_dim;
+    double actualMinDimError = std::log(minDimError) - 2 * std::log(data.n_cols)
+        - volumeWithoutDim;
 
-    if ((actualMinDimError > min_error) && dim_split_found)
+    if ((actualMinDimError > minError) && dimSplitFound)
     {
       // Calculate actual error (in logspace) by adding terms back to our
       // estimate.
-      min_error = actualMinDimError;
-      *split_dim = dim;
-      *split_ind = dim_split_ind;
-      *left_error = std::log(temp_lval) - 2 * std::log(data.n_cols) -
-          log_range_all_not_dim;
-      *right_error = std::log(temp_rval) - 2 * std::log(data.n_cols) -
-          log_range_all_not_dim;
-      some_split_found = true;
+      minError = actualMinDimError;
+      splitDim = dim;
+      splitIndex = dimSplitIndex;
+      leftError = std::log(dimLeftError) - 2 * std::log(data.n_cols) -
+          volumeWithoutDim;
+      rightError = std::log(dimRightError) - 2 * std::log(data.n_cols) -
+          volumeWithoutDim;
+      splitFound = true;
     } // end if better split found in this dimension.
   }
 
   // Map out of logspace.
-  min_error = -std::exp(min_error);
-  *left_error = -std::exp(*left_error);
-  *right_error = -std::exp(*right_error);
+  minError = -std::exp(minError);
+  leftError = -std::exp(leftError);
+  rightError = -std::exp(rightError);
 
-  return some_split_found;
+  return splitFound;
 }
 
 template<typename eT, typename cT>
@@ -176,13 +168,13 @@
                                eT *rsplit_val)
 {
   // Get the values for the split dimension.
-  RowVecType dim_val_vec = data->row(split_dim).subvec(start_, end_ - 1);
+  RowVecType dimVec = data->row(split_dim).subvec(start_, end_ - 1);
 
   // Sort the values.
-  dim_val_vec = arma::sort(dim_val_vec);
+  dimVec = arma::sort(dimVec);
 
-  *lsplit_val = dim_val_vec[split_ind];
-  *rsplit_val = dim_val_vec[split_ind + 1];
+  *lsplit_val = dimVec[split_ind];
+  *rsplit_val = dimVec[split_ind + 1];
   *split_val = (*lsplit_val + *rsplit_val) / 2 ;
 
   std::vector<bool> left_membership;
@@ -380,9 +372,9 @@
 
     // Find the split.
     size_t dim, split_ind;
-    cT left_error, right_error;
-    if (FindSplit_(*data, &dim, &split_ind, &left_error, &right_error,
-        maxLeafSize, minLeafSize))
+    double left_error, right_error;
+    if (FindSplit(*data, dim, split_ind, left_error, right_error, maxLeafSize,
+        minLeafSize))
     {
       // Move the data around for the children to have points in a node lie
       // contiguously (to increase efficiency during the training).
@@ -405,9 +397,9 @@
 
       // Recursively grow the children.
       left_ = new DTree(max_vals_l, min_vals_l, start_, start_ + split_ind + 1,
-          left_error);
+          (cT) left_error);
       right_ = new DTree(max_vals_r, min_vals_r, start_ + split_ind + 1, end_,
-          right_error);
+          (cT) right_error);
 
       left_g = left_->Grow(data, old_from_new, useVolReg, maxLeafSize,
           minLeafSize);




More information about the mlpack-svn mailing list