[mlpack-git] master, mlpack-1.0.x: Combined CosineNode and CosineTree classes. (79fd529)

gitdub at big.cc.gt.atl.ga.us gitdub at big.cc.gt.atl.ga.us
Thu Mar 5 21:50:09 EST 2015


Repository : https://github.com/mlpack/mlpack

On branches: master,mlpack-1.0.x
Link       : https://github.com/mlpack/mlpack/compare/904762495c039e345beba14c1142fd719b3bd50e...f94823c800ad6f7266995c700b1b630d5ffdcf40

>---------------------------------------------------------------

commit 79fd529120de4e775b90281d8808c66d66c8daa5
Author: Siddharth Agrawal <siddharth.950 at gmail.com>
Date:   Mon Jun 30 15:25:43 2014 +0000

    Combined CosineNode and CosineTree classes.


>---------------------------------------------------------------

79fd529120de4e775b90281d8808c66d66c8daa5
 src/mlpack/core/tree/CMakeLists.txt                |   4 +-
 src/mlpack/core/tree/cosine_tree/cosine_node.hpp   | 178 ----------------
 .../core/tree/cosine_tree/cosine_node_impl.hpp     | 230 ---------------------
 src/mlpack/core/tree/cosine_tree/cosine_tree.hpp   | 158 +++++++++++++-
 .../core/tree/cosine_tree/cosine_tree_impl.hpp     | 225 +++++++++++++++++++-
 src/mlpack/tests/cosine_tree_test.cpp              |  20 +-
 6 files changed, 381 insertions(+), 434 deletions(-)

diff --git a/src/mlpack/core/tree/CMakeLists.txt b/src/mlpack/core/tree/CMakeLists.txt
index 5003336..c526a70 100644
--- a/src/mlpack/core/tree/CMakeLists.txt
+++ b/src/mlpack/core/tree/CMakeLists.txt
@@ -13,10 +13,8 @@ set(SOURCES
   binary_space_tree/single_tree_traverser_impl.hpp
   binary_space_tree/traits.hpp
   bounds.hpp
-  cosine_tree/cosine_node.hpp
-  cosine_tree/cosine_node_impl.hpp
-  cosine_tree/cosine_tree_impl.hpp
   cosine_tree/cosine_tree.hpp
+  cosine_tree/cosine_tree_impl.hpp
   cover_tree/cover_tree.hpp
   cover_tree/cover_tree_impl.hpp
   cover_tree/first_point_is_root.hpp
diff --git a/src/mlpack/core/tree/cosine_tree/cosine_node.hpp b/src/mlpack/core/tree/cosine_tree/cosine_node.hpp
deleted file mode 100644
index 2f69dbe..0000000
--- a/src/mlpack/core/tree/cosine_tree/cosine_node.hpp
+++ /dev/null
@@ -1,178 +0,0 @@
-/**
- * @file cosine_node.hpp
- * @author Siddharth Agrawal
- *
- * Definition of Cosine Node.
- */
- 
-#ifndef __MLPACK_CORE_TREE_COSINE_TREE_COSINE_NODE_HPP
-#define __MLPACK_CORE_TREE_COSINE_TREE_COSINE_NODE_HPP
-
-#include <mlpack/core.hpp>
-
-namespace mlpack {
-namespace tree {
-
-class CosineNode
-{
- public:
- 
-  /**
-   * CosineNode constructor for the root node of the tree. It initializes the
-   * necessary variables required for splitting of the node, and building the
-   * tree further. It takes a pointer to the input matrix and calculates the
-   * relevant variables using it.
-   *
-   * @param dataset Matrix for which cosine tree is constructed.
-   */
-  CosineNode(const arma::mat& dataset);
-  
-  /**
-   * CosineNode constructor for nodes other than the root node of the tree. It
-   * takes in a pointer to the parent node and a list of column indices which
-   * mentions the columns to be included in the node. The function calculate the
-   * relevant variables just like the constructor above.
-   *
-   * @param parentNode Pointer to the parent CosineNode.
-   * @param subIndices Pointer to vector of column indices to be included.
-   */
-  CosineNode(CosineNode& parentNode, const std::vector<size_t>& subIndices);
-  
-  /**
-   * This function splits the CosineNode into two children based on the cosines
-   * of the columns contained in the node, with respect to the sampled splitting
-   * point. The function also calls the CosineNode constructor for the children.
-   */
-  void CosineNodeSplit();
-  
-  /**
-   * Sample 'numSamples' points from the Length-Squared distribution of the
-   * CosineNode. The function uses 'l2NormsSquared' to calculate the cumulative
-   * probability distribution of the column vectors. The sampling is based on a
-   * randomly generated values in the range [0, 1].
-   */
-  void ColumnSamplesLS(std::vector<size_t>& sampledIndices, 
-                       arma::vec& probabilities, size_t numSamples);
-  
-  /**
-   * Sample a point from the Length-Squared distribution of the CosineNode. The
-   * function uses 'l2NormsSquared' to calculate the cumulative probability
-   * distribution of the column vectors. The sampling is based on a randomly
-   * generated value in the range [0, 1].
-   */
-  size_t ColumnSampleLS();
-  
-  /**
-   * Sample a column based on the cumulative Length-Squared distribution of the
-   * CosineNode, and a randomly generated value in the range [0, 1]. Binary
-   * search is more efficient than searching linearly for the same. This leads
-   * a significant speedup when there are large number of columns to choose from
-   * and when a number of samples are to be drawn from the distribution.
-   *
-   * @param cDistribution Cumulative LS distibution of columns in the node.
-   * @param value Randomly generated value in the range [0, 1].
-   * @param start Starting index of the distribution interval to search in.
-   * @param end Ending index of the distribution interval to search in.
-   */
-  size_t BinarySearch(arma::vec& cDistribution, double value, size_t start,
-                      size_t end);
-  
-  /**
-   * Calculate cosines of the columns present in the node, with respect to the
-   * sampled splitting point. The calculated cosine values are useful for
-   * splitting the node into its children.
-   *
-   * @param cosines Vector to store the cosine values in.
-   */
-  void CalculateCosines(arma::vec& cosines);
-  
-  /**
-   * Calculate centroid of the columns present in the node. The calculated
-   * centroid is used as a basis vector for the cosine tree being constructed.
-   */
-  void CalculateCentroid();
-  
-  //! Get pointer to the dataset matrix.
-  const arma::mat& GetDataset() const { return dataset; }
-  
-  //! Get the indices of columns in the node.
-  std::vector<size_t>& VectorIndices() { return indices; }
-  
-  //! Set the Monte Carlo error.
-  void L2Error(const double error) { this->l2Error = error; }
-  
-  //! Get the Monte Carlo error.
-  double L2Error() const { return l2Error; }
-  
-  //! Get pointer to the centroid vector.
-  arma::vec& Centroid() { return centroid; }
-  
-  //! Set the basis vector of the node.
-  void BasisVector(arma::vec& bVector) { this->basisVector = bVector; }
-  
-  //! Get the basis vector of the node.
-  arma::vec& BasisVector() { return basisVector; }
-  
-  //! Get pointer to the left child of the node.
-  CosineNode* Left() { return left; }
-  
-  //! Get pointer to the right child of the node.
-  CosineNode* Right() { return right; }
-  
-  //! Get number of columns of input matrix in the node.
-  size_t NumColumns() const { return numColumns; }
-  
-  //! Get the Frobenius norm squared of columns in the node.
-  double FrobNormSquared() const { return frobNormSquared; }
-  
-  //! Get the column index of split point of the node.
-  size_t SplitPointIndex() const { return indices[splitPointIndex]; }
- 
- private:
-  //! Matrix for which cosine tree is constructed.
-  const arma::mat& dataset;
-  //! Parent of the node.
-  CosineNode* parent;
-  //! Right child of the node.
-  CosineNode* right;
-  //! Left child of the node.
-  CosineNode* left;
-  //! Indices of columns of input matrix in the node.
-  std::vector<size_t> indices;
-  //! L2-norm squared of columns in the node.
-  arma::vec l2NormsSquared;
-  //! Centroid of columns of input matrix in the node.
-  arma::vec centroid;
-  //! Orthonormalized basis vector of the node.
-  arma::vec basisVector;
-  //! Index of split point of cosine node.
-  size_t splitPointIndex;
-  //! Number of columns of input matrix in the node.
-  size_t numColumns;
-  //! Monte Carlo error for this node.
-  double l2Error;
-  //! Frobenius norm squared of columns in the node.
-  double frobNormSquared;
-  
-  // Friend class to facilitate construction of priority queue.
-  friend class CompareCosineNode;
-};
-
-class CompareCosineNode
-{
- public:
- 
-  // Comparison function for construction of priority queue.
-  bool operator() (const CosineNode* a, const CosineNode* b) const
-  {
-    return a->l2Error < b->l2Error;
-  }
-};
-
-}; // namespace tree
-}; // namespace mlpack
-
-// Include implementation.
-#include "cosine_node_impl.hpp"
-
-#endif
diff --git a/src/mlpack/core/tree/cosine_tree/cosine_node_impl.hpp b/src/mlpack/core/tree/cosine_tree/cosine_node_impl.hpp
deleted file mode 100644
index fb92665..0000000
--- a/src/mlpack/core/tree/cosine_tree/cosine_node_impl.hpp
+++ /dev/null
@@ -1,230 +0,0 @@
-/**
- * @file cosine_node_impl.hpp
- * @author Siddharth Agrawal
- *
- * Implementation of cosine node.
- */
-#ifndef __MLPACK_CORE_TREE_COSINE_TREE_COSINE_NODE_IMPL_HPP
-#define __MLPACK_CORE_TREE_COSINE_TREE_COSINE_NODE_IMPL_HPP
-
-// In case it wasn't included already for some reason.
-#include "cosine_node.hpp"
-
-namespace mlpack {
-namespace tree {
-
-CosineNode::CosineNode(const arma::mat& dataset) :
-    dataset(dataset),
-    parent(NULL),
-    right(NULL),
-    left(NULL),
-    numColumns(dataset.n_cols)
-{  
-  // Initialize sizes of column indices and l2 norms.
-  indices.resize(numColumns);
-  l2NormsSquared.zeros(numColumns);
-  
-  // Set indices and calculate squared norms of the columns.
-  for(size_t i = 0; i < numColumns; i++)
-  {
-    indices[i] = i;
-    double l2Norm = arma::norm(dataset.col(i), 2);
-    l2NormsSquared(i) = l2Norm * l2Norm;
-  }
-  
-  // Frobenius norm of columns in the node.
-  frobNormSquared = arma::accu(l2NormsSquared);
-  
-  // Calculate centroid of columns in the node.
-  CalculateCentroid();
-  
-  splitPointIndex = ColumnSampleLS();
-}
-
-CosineNode::CosineNode(CosineNode& parentNode,
-                       const std::vector<size_t>& subIndices) :
-    dataset(parentNode.GetDataset()),
-    parent(&parentNode),
-    right(NULL),
-    left(NULL),
-    numColumns(subIndices.size())
-{
-  // Initialize sizes of column indices and l2 norms.
-  indices.resize(numColumns);
-  l2NormsSquared.zeros(numColumns);
-  
-  // Set indices and squared norms of the columns.
-  for(size_t i = 0; i < numColumns; i++)
-  {
-    indices[i] = parentNode.indices[subIndices[i]];
-    l2NormsSquared(i) = parentNode.l2NormsSquared(subIndices[i]);
-  }
-  
-  // Frobenius norm of columns in the node.
-  frobNormSquared = arma::accu(l2NormsSquared);
-  
-  // Calculate centroid of columns in the node.
-  CalculateCentroid();
-  
-  splitPointIndex = ColumnSampleLS();
-}
-
-void CosineNode::CosineNodeSplit()
-{
-  //! If less than two nodes, splitting does not make sense.
-  if(numColumns < 3) return;
-  
-  //! Calculate cosines with respect to the splitting point.
-  arma::vec cosines;
-  CalculateCosines(cosines);
-  
-  //! Compute maximum and minimum cosine values.
-  double cosineMax, cosineMin;
-  cosineMax = arma::max(cosines % (cosines < 1));
-  cosineMin = arma::min(cosines);
-  
-  std::vector<size_t> leftIndices, rightIndices;
-  
-  // Split columns into left and right children. The splitting condition for the
-  // column to be in the left child is as follows:
-  // 			cos_max - cos(i) <= cos(i) - cos_min
-  for(size_t i = 0; i < numColumns; i++)
-  {
-    if(cosineMax - cosines(i) <= cosines(i) - cosineMin)
-    {
-      leftIndices.push_back(i);
-    }
-    else
-    {
-      rightIndices.push_back(i);
-    }
-  }
-  
-  // Split the node into left and right children.
-  left = new CosineNode(*this, leftIndices);
-  right = new CosineNode(*this, rightIndices);
-}
-
-void CosineNode::ColumnSamplesLS(std::vector<size_t>& sampledIndices,
-                                 arma::vec& probabilities,
-                                 size_t numSamples)
-{
-  // Initialize the cumulative distribution vector size.
-  arma::vec cDistribution;
-  cDistribution.zeros(numColumns + 1);
-  
-  // Calculate cumulative length-squared distribution for the node.
-  for(size_t i = 0; i < numColumns; i++)
-  {
-    cDistribution(i+1) = cDistribution(i) + l2NormsSquared(i) / frobNormSquared;
-  }
-  
-  // Intialize sizes of the 'sampledIndices' and 'probabilities' vectors.
-  sampledIndices.resize(numSamples);
-  probabilities.zeros(numSamples);
-  
-  for(size_t i = 0; i < numSamples; i++)
-  {
-    // Generate a random value for sampling.
-    double randValue = arma::randu();
-    size_t start = 0, end = numColumns, searchIndex;
-    
-    // Sample from the distribution and store corresponding probability.
-    searchIndex = BinarySearch(cDistribution, randValue, start, end);
-    sampledIndices[i] = indices[searchIndex];
-    probabilities(i) = l2NormsSquared(searchIndex) / frobNormSquared;
-  }
-}
-
-size_t CosineNode::ColumnSampleLS()
-{
-  // If only one element is present, there can only be one sample.
-  if(numColumns < 2)
-  {
-    return 0;
-  }
-
-  // Initialize the cumulative distribution vector size.
-  arma::vec cDistribution;
-  cDistribution.zeros(numColumns + 1);
-  
-  // Calculate cumulative length-squared distribution for the node.
-  for(size_t i = 0; i < numColumns; i++)
-  {
-    cDistribution(i+1) = cDistribution(i) + l2NormsSquared(i) / frobNormSquared;
-  }
-  
-  // Generate a random value for sampling.
-  double randValue = arma::randu();
-  size_t start = 0, end = numColumns;
-  
-  // Sample from the distribution.
-  return BinarySearch(cDistribution, randValue, start, end);
-}
-
-size_t CosineNode::BinarySearch(arma::vec& cDistribution,
-                                double value,
-                                size_t start,
-                                size_t end)
-{
-  size_t pivot = (start + end) / 2;
-  
-  // If pivot is zero, first point is the sampled point.
-  if(!pivot)
-  {
-    return pivot;
-  }
-  
-  // Binary search recursive algorithm.
-  if(value > cDistribution(pivot - 1) && value <= cDistribution(pivot))
-  {
-    return (pivot - 1);
-  }
-  else if(value < cDistribution(pivot - 1))
-  {
-    return BinarySearch(cDistribution, value, start, pivot - 1);
-  }
-  else
-  {
-    return BinarySearch(cDistribution, value, pivot + 1, end);
-  }
-}
-
-void CosineNode::CalculateCosines(arma::vec& cosines)
-{
-  // Initialize cosine vector as a vector of zeros.
-  cosines.zeros(numColumns);
-  
-  for(size_t i = 0; i < numColumns; i++)
-  {
-    // If norm is zero, store cosine value as zero. Else, calculate cosine value
-    // between two vectors.
-    if(l2NormsSquared(i) == 0)
-    {
-      cosines(i) = 0;
-    }
-    else
-    {
-      cosines(i) = arma::norm_dot(dataset.col(indices[splitPointIndex]),
-                                  dataset.col(indices[i]));
-    }
-  }
-}
-
-void CosineNode::CalculateCentroid()
-{
-  // Initialize centroid as vector of zeros.
-  centroid.zeros(dataset.n_rows);
-  
-  // Calculate centroid of columns in the node.
-  for(size_t i = 0; i < numColumns; i++)
-  {
-    centroid += dataset.col(indices[i]);
-  }
-  centroid /= numColumns;
-}
-
-}; // namespace tree
-}; // namespace mlpack
-
-#endif
diff --git a/src/mlpack/core/tree/cosine_tree/cosine_tree.hpp b/src/mlpack/core/tree/cosine_tree/cosine_tree.hpp
index e073381..4a411d9 100644
--- a/src/mlpack/core/tree/cosine_tree/cosine_tree.hpp
+++ b/src/mlpack/core/tree/cosine_tree/cosine_tree.hpp
@@ -11,18 +11,41 @@
 #include <mlpack/core.hpp>
 #include <boost/heap/priority_queue.hpp>
 
-#include "cosine_node.hpp"
-
 namespace mlpack {
 namespace tree {
 
+// Predeclare classes for CosineNodeQueue typedef.
+class CompareCosineNode;
+class CosineTree;
+
+// CosineNodeQueue typedef.
+typedef boost::heap::priority_queue<CosineTree*,
+    boost::heap::compare<CompareCosineNode> > CosineNodeQueue;
+
 class CosineTree
 {
  public:
       
-  // Type definition for CosineNode priority queue.
-  typedef boost::heap::priority_queue<CosineNode*,
-      boost::heap::compare<CompareCosineNode> > CosineNodeQueue;
+  /**
+   * CosineTree constructor for the root node of the tree. It initializes the
+   * necessary variables required for splitting of the node, and building the
+   * tree further. It takes a pointer to the input matrix and calculates the
+   * relevant variables using it.
+   *
+   * @param dataset Matrix for which cosine tree is constructed.
+   */
+  CosineTree(const arma::mat& dataset);
+  
+  /**
+   * CosineTree constructor for nodes other than the root node of the tree. It
+   * takes in a pointer to the parent node and a list of column indices which
+   * mentions the columns to be included in the node. The function calculate the
+   * relevant variables just like the constructor above.
+   *
+   * @param parentNode Pointer to the parent cosine node.
+   * @param subIndices Pointer to vector of column indices to be included.
+   */
+  CosineTree(CosineTree& parentNode, const std::vector<size_t>& subIndices);
  
   /**
    * Construct the CosineTree and the basis for the given matrix, and passed
@@ -68,7 +91,7 @@ class CosineTree
    * @param addBasisVector1 Address to first additional basis vector.
    * @param addBasisVector2 Address to second additional basis vector.
    */                         
-  double MonteCarloError(CosineNode* node,
+  double MonteCarloError(CosineTree* node,
                          CosineNodeQueue& treeQueue,
                          arma::vec* addBasisVector1 = NULL,
                          arma::vec* addBasisVector2 = NULL);
@@ -80,9 +103,99 @@ class CosineTree
    */                       
   void ConstructBasis(CosineNodeQueue& treeQueue);
                          
+  /**
+   * This function splits the cosine node into two children based on the cosines
+   * of the columns contained in the node, with respect to the sampled splitting
+   * point. The function also calls the CosineTree constructor for the children.
+   */
+  void CosineNodeSplit();
+                         
+  /**
+   * Sample 'numSamples' points from the Length-Squared distribution of the
+   * cosine node. The function uses 'l2NormsSquared' to calculate the cumulative
+   * probability distribution of the column vectors. The sampling is based on a
+   * randomly generated values in the range [0, 1].
+   */
+  void ColumnSamplesLS(std::vector<size_t>& sampledIndices, 
+                       arma::vec& probabilities, size_t numSamples);
+  
+  /**
+   * Sample a point from the Length-Squared distribution of the cosine node. The
+   * function uses 'l2NormsSquared' to calculate the cumulative probability
+   * distribution of the column vectors. The sampling is based on a randomly
+   * generated value in the range [0, 1].
+   */
+  size_t ColumnSampleLS();
+  
+  /**
+   * Sample a column based on the cumulative Length-Squared distribution of the
+   * cosine node, and a randomly generated value in the range [0, 1]. Binary
+   * search is more efficient than searching linearly for the same. This leads
+   * a significant speedup when there are large number of columns to choose from
+   * and when a number of samples are to be drawn from the distribution.
+   *
+   * @param cDistribution Cumulative LS distibution of columns in the node.
+   * @param value Randomly generated value in the range [0, 1].
+   * @param start Starting index of the distribution interval to search in.
+   * @param end Ending index of the distribution interval to search in.
+   */
+  size_t BinarySearch(arma::vec& cDistribution, double value, size_t start,
+                      size_t end);
+  
+  /**
+   * Calculate cosines of the columns present in the node, with respect to the
+   * sampled splitting point. The calculated cosine values are useful for
+   * splitting the node into its children.
+   *
+   * @param cosines Vector to store the cosine values in.
+   */
+  void CalculateCosines(arma::vec& cosines);
+  
+  /**
+   * Calculate centroid of the columns present in the node. The calculated
+   * centroid is used as a basis vector for the cosine tree being constructed.
+   */
+  void CalculateCentroid();
+  
   //! Returns the basis of the constructed subspace.
   void GetFinalBasis(arma::mat& finalBasis) { finalBasis = basis; }
   
+  //! Get pointer to the dataset matrix.
+  const arma::mat& GetDataset() const { return dataset; }
+  
+  //! Get the indices of columns in the node.
+  std::vector<size_t>& VectorIndices() { return indices; }
+  
+  //! Set the Monte Carlo error.
+  void L2Error(const double error) { this->l2Error = error; }
+  
+  //! Get the Monte Carlo error.
+  double L2Error() const { return l2Error; }
+  
+  //! Get pointer to the centroid vector.
+  arma::vec& Centroid() { return centroid; }
+  
+  //! Set the basis vector of the node.
+  void BasisVector(arma::vec& bVector) { this->basisVector = bVector; }
+  
+  //! Get the basis vector of the node.
+  arma::vec& BasisVector() { return basisVector; }
+  
+  //! Get pointer to the left child of the node.
+  CosineTree* Left() { return left; }
+  
+  //! Get pointer to the right child of the node.
+  CosineTree* Right() { return right; }
+  
+  //! Get number of columns of input matrix in the node.
+  size_t NumColumns() const { return numColumns; }
+  
+  //! Get the Frobenius norm squared of columns in the node.
+  double FrobNormSquared() const { return frobNormSquared; }
+  
+  //! Get the column index of split point of the node.
+  size_t SplitPointIndex() const { return indices[splitPointIndex]; }
+  
  private:
   //! Matrix for which cosine tree is constructed.
   const arma::mat& dataset;
@@ -92,6 +205,39 @@ class CosineTree
   double delta;
   //! Subspace basis of the input dataset.
   arma::mat basis;
+  //! Parent of the node.
+  CosineTree* parent;
+  //! Right child of the node.
+  CosineTree* right;
+  //! Left child of the node.
+  CosineTree* left;
+  //! Indices of columns of input matrix in the node.
+  std::vector<size_t> indices;
+  //! L2-norm squared of columns in the node.
+  arma::vec l2NormsSquared;
+  //! Centroid of columns of input matrix in the node.
+  arma::vec centroid;
+  //! Orthonormalized basis vector of the node.
+  arma::vec basisVector;
+  //! Index of split point of cosine node.
+  size_t splitPointIndex;
+  //! Number of columns of input matrix in the node.
+  size_t numColumns;
+  //! Monte Carlo error for this node.
+  double l2Error;
+  //! Frobenius norm squared of columns in the node.
+  double frobNormSquared;
+};
+
+class CompareCosineNode
+{
+ public:
+ 
+  // Comparison function for construction of priority queue.
+  bool operator() (const CosineTree* a, const CosineTree* b) const
+  {
+    return a->L2Error() < b->L2Error();
+  }
 };
 
 }; // namespace tree
diff --git a/src/mlpack/core/tree/cosine_tree/cosine_tree_impl.hpp b/src/mlpack/core/tree/cosine_tree/cosine_tree_impl.hpp
index 263d0b1..46167d5 100644
--- a/src/mlpack/core/tree/cosine_tree/cosine_tree_impl.hpp
+++ b/src/mlpack/core/tree/cosine_tree/cosine_tree_impl.hpp
@@ -15,6 +15,62 @@
 namespace mlpack {
 namespace tree {
 
+CosineTree::CosineTree(const arma::mat& dataset) :
+    dataset(dataset),
+    parent(NULL),
+    right(NULL),
+    left(NULL),
+    numColumns(dataset.n_cols)
+{  
+  // Initialize sizes of column indices and l2 norms.
+  indices.resize(numColumns);
+  l2NormsSquared.zeros(numColumns);
+  
+  // Set indices and calculate squared norms of the columns.
+  for(size_t i = 0; i < numColumns; i++)
+  {
+    indices[i] = i;
+    double l2Norm = arma::norm(dataset.col(i), 2);
+    l2NormsSquared(i) = l2Norm * l2Norm;
+  }
+  
+  // Frobenius norm of columns in the node.
+  frobNormSquared = arma::accu(l2NormsSquared);
+  
+  // Calculate centroid of columns in the node.
+  CalculateCentroid();
+  
+  splitPointIndex = ColumnSampleLS();
+}
+
+CosineTree::CosineTree(CosineTree& parentNode,
+                       const std::vector<size_t>& subIndices) :
+    dataset(parentNode.GetDataset()),
+    parent(&parentNode),
+    right(NULL),
+    left(NULL),
+    numColumns(subIndices.size())
+{
+  // Initialize sizes of column indices and l2 norms.
+  indices.resize(numColumns);
+  l2NormsSquared.zeros(numColumns);
+  
+  // Set indices and squared norms of the columns.
+  for(size_t i = 0; i < numColumns; i++)
+  {
+    indices[i] = parentNode.indices[subIndices[i]];
+    l2NormsSquared(i) = parentNode.l2NormsSquared(subIndices[i]);
+  }
+  
+  // Frobenius norm of columns in the node.
+  frobNormSquared = arma::accu(l2NormsSquared);
+  
+  // Calculate centroid of columns in the node.
+  CalculateCentroid();
+  
+  splitPointIndex = ColumnSampleLS();
+}
+
 CosineTree::CosineTree(const arma::mat& dataset,
                        const double epsilon,
                        const double delta) :
@@ -26,7 +82,7 @@ CosineTree::CosineTree(const arma::mat& dataset,
   CosineNodeQueue treeQueue;
   
   // Define root node of the tree and add it to the queue.
-  CosineNode root(dataset);
+  CosineTree root(dataset);
   arma::vec tempVector = arma::zeros(dataset.n_rows);
   root.L2Error(0);
   root.BasisVector(tempVector);
@@ -38,7 +94,7 @@ CosineTree::CosineTree(const arma::mat& dataset,
   while(monteCarloError > epsilon * root.FrobNormSquared())
   {
     // Pop node from queue with highest projection error.
-    CosineNode* currentNode;
+    CosineTree* currentNode;
     currentNode = treeQueue.top();
     treeQueue.pop();
     
@@ -46,7 +102,7 @@ CosineTree::CosineTree(const arma::mat& dataset,
     currentNode->CosineNodeSplit();
     
     // Obtain pointers to the left and right children of the current node.
-    CosineNode *currentLeft, *currentRight;
+    CosineTree *currentLeft, *currentRight;
     currentLeft = currentNode->Left();
     currentRight = currentNode->Right();
     
@@ -88,7 +144,7 @@ void CosineTree::ModifiedGramSchmidt(CosineNodeQueue& treeQueue,
   newBasisVector = centroid;
 
   // Variables for iterating throught the priority queue.
-  CosineNode *currentNode;
+  CosineTree *currentNode;
   CosineNodeQueue::const_iterator i = treeQueue.begin();
 
   // For every vector in the current basis, remove its projection from the
@@ -113,7 +169,7 @@ void CosineTree::ModifiedGramSchmidt(CosineNodeQueue& treeQueue,
     newBasisVector /= arma::norm(newBasisVector, 2);
 }
 
-double CosineTree::MonteCarloError(CosineNode* node,
+double CosineTree::MonteCarloError(CosineTree* node,
                                    CosineNodeQueue& treeQueue,
                                    arma::vec* addBasisVector1,
                                    arma::vec* addBasisVector2)
@@ -148,7 +204,7 @@ double CosineTree::MonteCarloError(CosineNode* node,
     arma::vec projection;
     projection.zeros(projectionSize);
 
-    CosineNode *currentNode;
+    CosineTree *currentNode;
     CosineNodeQueue::const_iterator j = treeQueue.begin();
   
     size_t k = 0;
@@ -204,7 +260,7 @@ void CosineTree::ConstructBasis(CosineNodeQueue& treeQueue)
   basis.zeros(dataset.n_rows, treeQueue.size());
   
   // Variables for iterating through the priority queue.
-  CosineNode *currentNode;
+  CosineTree *currentNode;
   CosineNodeQueue::const_iterator i = treeQueue.begin();
   
   // Transfer basis vectors from the queue to the basis matrix.
@@ -216,6 +272,161 @@ void CosineTree::ConstructBasis(CosineNodeQueue& treeQueue)
   }
 }
 
+void CosineTree::CosineNodeSplit()
+{
+  //! If less than two nodes, splitting does not make sense.
+  if(numColumns < 3) return;
+  
+  //! Calculate cosines with respect to the splitting point.
+  arma::vec cosines;
+  CalculateCosines(cosines);
+  
+  //! Compute maximum and minimum cosine values.
+  double cosineMax, cosineMin;
+  cosineMax = arma::max(cosines % (cosines < 1));
+  cosineMin = arma::min(cosines);
+  
+  std::vector<size_t> leftIndices, rightIndices;
+  
+  // Split columns into left and right children. The splitting condition for the
+  // column to be in the left child is as follows:
+  // 			cos_max - cos(i) <= cos(i) - cos_min
+  for(size_t i = 0; i < numColumns; i++)
+  {
+    if(cosineMax - cosines(i) <= cosines(i) - cosineMin)
+    {
+      leftIndices.push_back(i);
+    }
+    else
+    {
+      rightIndices.push_back(i);
+    }
+  }
+  
+  // Split the node into left and right children.
+  left = new CosineTree(*this, leftIndices);
+  right = new CosineTree(*this, rightIndices);
+}
+
+void CosineTree::ColumnSamplesLS(std::vector<size_t>& sampledIndices,
+                                 arma::vec& probabilities,
+                                 size_t numSamples)
+{
+  // Initialize the cumulative distribution vector size.
+  arma::vec cDistribution;
+  cDistribution.zeros(numColumns + 1);
+  
+  // Calculate cumulative length-squared distribution for the node.
+  for(size_t i = 0; i < numColumns; i++)
+  {
+    cDistribution(i+1) = cDistribution(i) + l2NormsSquared(i) / frobNormSquared;
+  }
+  
+  // Intialize sizes of the 'sampledIndices' and 'probabilities' vectors.
+  sampledIndices.resize(numSamples);
+  probabilities.zeros(numSamples);
+  
+  for(size_t i = 0; i < numSamples; i++)
+  {
+    // Generate a random value for sampling.
+    double randValue = arma::randu();
+    size_t start = 0, end = numColumns, searchIndex;
+    
+    // Sample from the distribution and store corresponding probability.
+    searchIndex = BinarySearch(cDistribution, randValue, start, end);
+    sampledIndices[i] = indices[searchIndex];
+    probabilities(i) = l2NormsSquared(searchIndex) / frobNormSquared;
+  }
+}
+
+size_t CosineTree::ColumnSampleLS()
+{
+  // If only one element is present, there can only be one sample.
+  if(numColumns < 2)
+  {
+    return 0;
+  }
+
+  // Initialize the cumulative distribution vector size.
+  arma::vec cDistribution;
+  cDistribution.zeros(numColumns + 1);
+  
+  // Calculate cumulative length-squared distribution for the node.
+  for(size_t i = 0; i < numColumns; i++)
+  {
+    cDistribution(i+1) = cDistribution(i) + l2NormsSquared(i) / frobNormSquared;
+  }
+  
+  // Generate a random value for sampling.
+  double randValue = arma::randu();
+  size_t start = 0, end = numColumns;
+  
+  // Sample from the distribution.
+  return BinarySearch(cDistribution, randValue, start, end);
+}
+
+size_t CosineTree::BinarySearch(arma::vec& cDistribution,
+                                double value,
+                                size_t start,
+                                size_t end)
+{
+  size_t pivot = (start + end) / 2;
+  
+  // If pivot is zero, first point is the sampled point.
+  if(!pivot)
+  {
+    return pivot;
+  }
+  
+  // Binary search recursive algorithm.
+  if(value > cDistribution(pivot - 1) && value <= cDistribution(pivot))
+  {
+    return (pivot - 1);
+  }
+  else if(value < cDistribution(pivot - 1))
+  {
+    return BinarySearch(cDistribution, value, start, pivot - 1);
+  }
+  else
+  {
+    return BinarySearch(cDistribution, value, pivot + 1, end);
+  }
+}
+
+void CosineTree::CalculateCosines(arma::vec& cosines)
+{
+  // Initialize cosine vector as a vector of zeros.
+  cosines.zeros(numColumns);
+  
+  for(size_t i = 0; i < numColumns; i++)
+  {
+    // If norm is zero, store cosine value as zero. Else, calculate cosine value
+    // between two vectors.
+    if(l2NormsSquared(i) == 0)
+    {
+      cosines(i) = 0;
+    }
+    else
+    {
+      cosines(i) = arma::norm_dot(dataset.col(indices[splitPointIndex]),
+                                  dataset.col(indices[i]));
+    }
+  }
+}
+
+void CosineTree::CalculateCentroid()
+{
+  // Initialize centroid as vector of zeros.
+  centroid.zeros(dataset.n_rows);
+  
+  // Calculate centroid of columns in the node.
+  for(size_t i = 0; i < numColumns; i++)
+  {
+    centroid += dataset.col(indices[i]);
+  }
+  centroid /= numColumns;
+}
+
 }; // namespace tree
 }; // namespace mlpack
 
diff --git a/src/mlpack/tests/cosine_tree_test.cpp b/src/mlpack/tests/cosine_tree_test.cpp
index d9c2f37..10447ba 100644
--- a/src/mlpack/tests/cosine_tree_test.cpp
+++ b/src/mlpack/tests/cosine_tree_test.cpp
@@ -43,7 +43,7 @@ BOOST_AUTO_TEST_CASE(CosineTreeNoSplit)
 }
 
 /**
- * Checks CosineNode::CosineNodeSplit() by doing a depth first search on a
+ * Checks CosineTree::CosineNodeSplit() by doing a depth first search on a
  * random dataset and checking if it satisfies the split condition.
  */
 BOOST_AUTO_TEST_CASE(CosineNodeCosineSplit)
@@ -54,17 +54,17 @@ BOOST_AUTO_TEST_CASE(CosineNodeCosineSplit)
   
   // Make a random dataset and the root object.
   arma::mat data = arma::randu(numRows, numCols);
-  CosineNode root(data);
+  CosineTree root(data);
   
   // Stack for depth first search of the tree.
-  std::vector<CosineNode*> nodeStack;
+  std::vector<CosineTree*> nodeStack;
   nodeStack.push_back(&root);
   
   // While stack is not empty.
   while(nodeStack.size())
   {
     // Pop a node from the stack and split it.
-    CosineNode *currentNode, *currentLeft, *currentRight;
+    CosineTree *currentNode, *currentLeft, *currentRight;
     currentNode = nodeStack.back();
     currentNode->CosineNodeSplit();
     nodeStack.pop_back();
@@ -139,14 +139,14 @@ BOOST_AUTO_TEST_CASE(CosineTreeModifiedGramSchmidt)
   arma::mat data = arma::randu(numRows, numCols);
   
   // Declare a queue and a dummy CosineTree object.
-  CosineTree::CosineNodeQueue basisQueue;
+  CosineNodeQueue basisQueue;
   CosineTree dummyTree(data, epsilon, delta);
   
   for(size_t i = 0; i < numCols; i++)
   {
     // Make a new CosineNode object.
-    CosineNode* basisNode;
-    basisNode = new CosineNode(data);
+    CosineTree* basisNode;
+    basisNode = new CosineTree(data);
     
     // Use the columns of the dataset as random centroids.
     arma::vec centroid = data.col(i);
@@ -156,8 +156,8 @@ BOOST_AUTO_TEST_CASE(CosineTreeModifiedGramSchmidt)
     dummyTree.ModifiedGramSchmidt(basisQueue, centroid, newBasisVector);   
     
     // Check if the obtained vector is orthonormal to the basis vectors.
-    CosineTree::CosineNodeQueue::const_iterator j = basisQueue.begin();
-    CosineNode* currentNode;
+    CosineNodeQueue::const_iterator j = basisQueue.begin();
+    CosineTree* currentNode;
     
     for(; j != basisQueue.end(); j++)
     {
@@ -175,7 +175,7 @@ BOOST_AUTO_TEST_CASE(CosineTreeModifiedGramSchmidt)
   // Deallocate memory given to the objects.
   for(size_t i = 0; i < numCols; i++)
   {
-    CosineNode* currentNode;
+    CosineTree* currentNode;
     currentNode = basisQueue.top();
     basisQueue.pop();
     



More information about the mlpack-git mailing list