[mlpack-svn] r16730 - in mlpack/trunk/src/mlpack: core/tree core/tree/cosine_tree tests
fastlab-svn at coffeetalk-1.cc.gatech.edu
fastlab-svn at coffeetalk-1.cc.gatech.edu
Mon Jun 30 11:25:43 EDT 2014
Author: siddharth.950
Date: Mon Jun 30 11:25:43 2014
New Revision: 16730
Log:
Combined CosineNode and CosineTree classes.
Removed:
mlpack/trunk/src/mlpack/core/tree/cosine_tree/cosine_node.hpp
mlpack/trunk/src/mlpack/core/tree/cosine_tree/cosine_node_impl.hpp
Modified:
mlpack/trunk/src/mlpack/core/tree/CMakeLists.txt
mlpack/trunk/src/mlpack/core/tree/cosine_tree/cosine_tree.hpp
mlpack/trunk/src/mlpack/core/tree/cosine_tree/cosine_tree_impl.hpp
mlpack/trunk/src/mlpack/tests/cosine_tree_test.cpp
Modified: mlpack/trunk/src/mlpack/core/tree/CMakeLists.txt
==============================================================================
--- mlpack/trunk/src/mlpack/core/tree/CMakeLists.txt (original)
+++ mlpack/trunk/src/mlpack/core/tree/CMakeLists.txt Mon Jun 30 11:25:43 2014
@@ -13,10 +13,8 @@
binary_space_tree/single_tree_traverser_impl.hpp
binary_space_tree/traits.hpp
bounds.hpp
- cosine_tree/cosine_node.hpp
- cosine_tree/cosine_node_impl.hpp
- cosine_tree/cosine_tree_impl.hpp
cosine_tree/cosine_tree.hpp
+ cosine_tree/cosine_tree_impl.hpp
cover_tree/cover_tree.hpp
cover_tree/cover_tree_impl.hpp
cover_tree/first_point_is_root.hpp
Modified: mlpack/trunk/src/mlpack/core/tree/cosine_tree/cosine_tree.hpp
==============================================================================
--- mlpack/trunk/src/mlpack/core/tree/cosine_tree/cosine_tree.hpp (original)
+++ mlpack/trunk/src/mlpack/core/tree/cosine_tree/cosine_tree.hpp Mon Jun 30 11:25:43 2014
@@ -11,18 +11,41 @@
#include <mlpack/core.hpp>
#include <boost/heap/priority_queue.hpp>
-#include "cosine_node.hpp"
-
namespace mlpack {
namespace tree {
+// Predeclare classes for CosineNodeQueue typedef.
+class CompareCosineNode;
+class CosineTree;
+
+// CosineNodeQueue typedef.
+typedef boost::heap::priority_queue<CosineTree*,
+ boost::heap::compare<CompareCosineNode> > CosineNodeQueue;
+
class CosineTree
{
public:
-
- // Type definition for CosineNode priority queue.
- typedef boost::heap::priority_queue<CosineNode*,
- boost::heap::compare<CompareCosineNode> > CosineNodeQueue;
+
+ /**
+ * CosineTree constructor for the root node of the tree. It initializes the
+ * necessary variables required for splitting of the node, and building the
+ * tree further. It takes a pointer to the input matrix and calculates the
+ * relevant variables using it.
+ *
+ * @param dataset Matrix for which cosine tree is constructed.
+ */
+ CosineTree(const arma::mat& dataset);
+
+ /**
+ * CosineTree constructor for nodes other than the root node of the tree. It
+ * takes in a pointer to the parent node and a list of column indices which
+ * mentions the columns to be included in the node. The function calculate the
+ * relevant variables just like the constructor above.
+ *
+ * @param parentNode Pointer to the parent cosine node.
+ * @param subIndices Pointer to vector of column indices to be included.
+ */
+ CosineTree(CosineTree& parentNode, const std::vector<size_t>& subIndices);
/**
* Construct the CosineTree and the basis for the given matrix, and passed
@@ -68,7 +91,7 @@
* @param addBasisVector1 Address to first additional basis vector.
* @param addBasisVector2 Address to second additional basis vector.
*/
- double MonteCarloError(CosineNode* node,
+ double MonteCarloError(CosineTree* node,
CosineNodeQueue& treeQueue,
arma::vec* addBasisVector1 = NULL,
arma::vec* addBasisVector2 = NULL);
@@ -79,10 +102,100 @@
* @param treeQueue Priority queue of cosine nodes.
*/
void ConstructBasis(CosineNodeQueue& treeQueue);
+
+ /**
+ * This function splits the cosine node into two children based on the cosines
+ * of the columns contained in the node, with respect to the sampled splitting
+ * point. The function also calls the CosineTree constructor for the children.
+ */
+ void CosineNodeSplit();
+
+ /**
+ * Sample 'numSamples' points from the Length-Squared distribution of the
+ * cosine node. The function uses 'l2NormsSquared' to calculate the cumulative
+ * probability distribution of the column vectors. The sampling is based on a
+ * randomly generated values in the range [0, 1].
+ */
+ void ColumnSamplesLS(std::vector<size_t>& sampledIndices,
+ arma::vec& probabilities, size_t numSamples);
+
+ /**
+ * Sample a point from the Length-Squared distribution of the cosine node. The
+ * function uses 'l2NormsSquared' to calculate the cumulative probability
+ * distribution of the column vectors. The sampling is based on a randomly
+ * generated value in the range [0, 1].
+ */
+ size_t ColumnSampleLS();
+
+ /**
+ * Sample a column based on the cumulative Length-Squared distribution of the
+ * cosine node, and a randomly generated value in the range [0, 1]. Binary
+ * search is more efficient than searching linearly for the same. This leads
+ * a significant speedup when there are large number of columns to choose from
+ * and when a number of samples are to be drawn from the distribution.
+ *
+ * @param cDistribution Cumulative LS distibution of columns in the node.
+ * @param value Randomly generated value in the range [0, 1].
+ * @param start Starting index of the distribution interval to search in.
+ * @param end Ending index of the distribution interval to search in.
+ */
+ size_t BinarySearch(arma::vec& cDistribution, double value, size_t start,
+ size_t end);
+
+ /**
+ * Calculate cosines of the columns present in the node, with respect to the
+ * sampled splitting point. The calculated cosine values are useful for
+ * splitting the node into its children.
+ *
+ * @param cosines Vector to store the cosine values in.
+ */
+ void CalculateCosines(arma::vec& cosines);
+
+ /**
+ * Calculate centroid of the columns present in the node. The calculated
+ * centroid is used as a basis vector for the cosine tree being constructed.
+ */
+ void CalculateCentroid();
//! Returns the basis of the constructed subspace.
void GetFinalBasis(arma::mat& finalBasis) { finalBasis = basis; }
+ //! Get pointer to the dataset matrix.
+ const arma::mat& GetDataset() const { return dataset; }
+
+ //! Get the indices of columns in the node.
+ std::vector<size_t>& VectorIndices() { return indices; }
+
+ //! Set the Monte Carlo error.
+ void L2Error(const double error) { this->l2Error = error; }
+
+ //! Get the Monte Carlo error.
+ double L2Error() const { return l2Error; }
+
+ //! Get pointer to the centroid vector.
+ arma::vec& Centroid() { return centroid; }
+
+ //! Set the basis vector of the node.
+ void BasisVector(arma::vec& bVector) { this->basisVector = bVector; }
+
+ //! Get the basis vector of the node.
+ arma::vec& BasisVector() { return basisVector; }
+
+ //! Get pointer to the left child of the node.
+ CosineTree* Left() { return left; }
+
+ //! Get pointer to the right child of the node.
+ CosineTree* Right() { return right; }
+
+ //! Get number of columns of input matrix in the node.
+ size_t NumColumns() const { return numColumns; }
+
+ //! Get the Frobenius norm squared of columns in the node.
+ double FrobNormSquared() const { return frobNormSquared; }
+
+ //! Get the column index of split point of the node.
+ size_t SplitPointIndex() const { return indices[splitPointIndex]; }
+
private:
//! Matrix for which cosine tree is constructed.
const arma::mat& dataset;
@@ -92,6 +205,39 @@
double delta;
//! Subspace basis of the input dataset.
arma::mat basis;
+ //! Parent of the node.
+ CosineTree* parent;
+ //! Right child of the node.
+ CosineTree* right;
+ //! Left child of the node.
+ CosineTree* left;
+ //! Indices of columns of input matrix in the node.
+ std::vector<size_t> indices;
+ //! L2-norm squared of columns in the node.
+ arma::vec l2NormsSquared;
+ //! Centroid of columns of input matrix in the node.
+ arma::vec centroid;
+ //! Orthonormalized basis vector of the node.
+ arma::vec basisVector;
+ //! Index of split point of cosine node.
+ size_t splitPointIndex;
+ //! Number of columns of input matrix in the node.
+ size_t numColumns;
+ //! Monte Carlo error for this node.
+ double l2Error;
+ //! Frobenius norm squared of columns in the node.
+ double frobNormSquared;
+};
+
+class CompareCosineNode
+{
+ public:
+
+ // Comparison function for construction of priority queue.
+ bool operator() (const CosineTree* a, const CosineTree* b) const
+ {
+ return a->L2Error() < b->L2Error();
+ }
};
}; // namespace tree
Modified: mlpack/trunk/src/mlpack/core/tree/cosine_tree/cosine_tree_impl.hpp
==============================================================================
--- mlpack/trunk/src/mlpack/core/tree/cosine_tree/cosine_tree_impl.hpp (original)
+++ mlpack/trunk/src/mlpack/core/tree/cosine_tree/cosine_tree_impl.hpp Mon Jun 30 11:25:43 2014
@@ -15,6 +15,62 @@
namespace mlpack {
namespace tree {
+CosineTree::CosineTree(const arma::mat& dataset) :
+ dataset(dataset),
+ parent(NULL),
+ right(NULL),
+ left(NULL),
+ numColumns(dataset.n_cols)
+{
+ // Initialize sizes of column indices and l2 norms.
+ indices.resize(numColumns);
+ l2NormsSquared.zeros(numColumns);
+
+ // Set indices and calculate squared norms of the columns.
+ for(size_t i = 0; i < numColumns; i++)
+ {
+ indices[i] = i;
+ double l2Norm = arma::norm(dataset.col(i), 2);
+ l2NormsSquared(i) = l2Norm * l2Norm;
+ }
+
+ // Frobenius norm of columns in the node.
+ frobNormSquared = arma::accu(l2NormsSquared);
+
+ // Calculate centroid of columns in the node.
+ CalculateCentroid();
+
+ splitPointIndex = ColumnSampleLS();
+}
+
+CosineTree::CosineTree(CosineTree& parentNode,
+ const std::vector<size_t>& subIndices) :
+ dataset(parentNode.GetDataset()),
+ parent(&parentNode),
+ right(NULL),
+ left(NULL),
+ numColumns(subIndices.size())
+{
+ // Initialize sizes of column indices and l2 norms.
+ indices.resize(numColumns);
+ l2NormsSquared.zeros(numColumns);
+
+ // Set indices and squared norms of the columns.
+ for(size_t i = 0; i < numColumns; i++)
+ {
+ indices[i] = parentNode.indices[subIndices[i]];
+ l2NormsSquared(i) = parentNode.l2NormsSquared(subIndices[i]);
+ }
+
+ // Frobenius norm of columns in the node.
+ frobNormSquared = arma::accu(l2NormsSquared);
+
+ // Calculate centroid of columns in the node.
+ CalculateCentroid();
+
+ splitPointIndex = ColumnSampleLS();
+}
+
CosineTree::CosineTree(const arma::mat& dataset,
const double epsilon,
const double delta) :
@@ -26,7 +82,7 @@
CosineNodeQueue treeQueue;
// Define root node of the tree and add it to the queue.
- CosineNode root(dataset);
+ CosineTree root(dataset);
arma::vec tempVector = arma::zeros(dataset.n_rows);
root.L2Error(0);
root.BasisVector(tempVector);
@@ -38,7 +94,7 @@
while(monteCarloError > epsilon * root.FrobNormSquared())
{
// Pop node from queue with highest projection error.
- CosineNode* currentNode;
+ CosineTree* currentNode;
currentNode = treeQueue.top();
treeQueue.pop();
@@ -46,7 +102,7 @@
currentNode->CosineNodeSplit();
// Obtain pointers to the left and right children of the current node.
- CosineNode *currentLeft, *currentRight;
+ CosineTree *currentLeft, *currentRight;
currentLeft = currentNode->Left();
currentRight = currentNode->Right();
@@ -88,7 +144,7 @@
newBasisVector = centroid;
// Variables for iterating throught the priority queue.
- CosineNode *currentNode;
+ CosineTree *currentNode;
CosineNodeQueue::const_iterator i = treeQueue.begin();
// For every vector in the current basis, remove its projection from the
@@ -113,7 +169,7 @@
newBasisVector /= arma::norm(newBasisVector, 2);
}
-double CosineTree::MonteCarloError(CosineNode* node,
+double CosineTree::MonteCarloError(CosineTree* node,
CosineNodeQueue& treeQueue,
arma::vec* addBasisVector1,
arma::vec* addBasisVector2)
@@ -148,7 +204,7 @@
arma::vec projection;
projection.zeros(projectionSize);
- CosineNode *currentNode;
+ CosineTree *currentNode;
CosineNodeQueue::const_iterator j = treeQueue.begin();
size_t k = 0;
@@ -204,7 +260,7 @@
basis.zeros(dataset.n_rows, treeQueue.size());
// Variables for iterating through the priority queue.
- CosineNode *currentNode;
+ CosineTree *currentNode;
CosineNodeQueue::const_iterator i = treeQueue.begin();
// Transfer basis vectors from the queue to the basis matrix.
@@ -216,6 +272,161 @@
}
}
+void CosineTree::CosineNodeSplit()
+{
+ //! If less than two nodes, splitting does not make sense.
+ if(numColumns < 3) return;
+
+ //! Calculate cosines with respect to the splitting point.
+ arma::vec cosines;
+ CalculateCosines(cosines);
+
+ //! Compute maximum and minimum cosine values.
+ double cosineMax, cosineMin;
+ cosineMax = arma::max(cosines % (cosines < 1));
+ cosineMin = arma::min(cosines);
+
+ std::vector<size_t> leftIndices, rightIndices;
+
+ // Split columns into left and right children. The splitting condition for the
+ // column to be in the left child is as follows:
+ // cos_max - cos(i) <= cos(i) - cos_min
+ for(size_t i = 0; i < numColumns; i++)
+ {
+ if(cosineMax - cosines(i) <= cosines(i) - cosineMin)
+ {
+ leftIndices.push_back(i);
+ }
+ else
+ {
+ rightIndices.push_back(i);
+ }
+ }
+
+ // Split the node into left and right children.
+ left = new CosineTree(*this, leftIndices);
+ right = new CosineTree(*this, rightIndices);
+}
+
+void CosineTree::ColumnSamplesLS(std::vector<size_t>& sampledIndices,
+ arma::vec& probabilities,
+ size_t numSamples)
+{
+ // Initialize the cumulative distribution vector size.
+ arma::vec cDistribution;
+ cDistribution.zeros(numColumns + 1);
+
+ // Calculate cumulative length-squared distribution for the node.
+ for(size_t i = 0; i < numColumns; i++)
+ {
+ cDistribution(i+1) = cDistribution(i) + l2NormsSquared(i) / frobNormSquared;
+ }
+
+ // Intialize sizes of the 'sampledIndices' and 'probabilities' vectors.
+ sampledIndices.resize(numSamples);
+ probabilities.zeros(numSamples);
+
+ for(size_t i = 0; i < numSamples; i++)
+ {
+ // Generate a random value for sampling.
+ double randValue = arma::randu();
+ size_t start = 0, end = numColumns, searchIndex;
+
+ // Sample from the distribution and store corresponding probability.
+ searchIndex = BinarySearch(cDistribution, randValue, start, end);
+ sampledIndices[i] = indices[searchIndex];
+ probabilities(i) = l2NormsSquared(searchIndex) / frobNormSquared;
+ }
+}
+
+size_t CosineTree::ColumnSampleLS()
+{
+ // If only one element is present, there can only be one sample.
+ if(numColumns < 2)
+ {
+ return 0;
+ }
+
+ // Initialize the cumulative distribution vector size.
+ arma::vec cDistribution;
+ cDistribution.zeros(numColumns + 1);
+
+ // Calculate cumulative length-squared distribution for the node.
+ for(size_t i = 0; i < numColumns; i++)
+ {
+ cDistribution(i+1) = cDistribution(i) + l2NormsSquared(i) / frobNormSquared;
+ }
+
+ // Generate a random value for sampling.
+ double randValue = arma::randu();
+ size_t start = 0, end = numColumns;
+
+ // Sample from the distribution.
+ return BinarySearch(cDistribution, randValue, start, end);
+}
+
+size_t CosineTree::BinarySearch(arma::vec& cDistribution,
+ double value,
+ size_t start,
+ size_t end)
+{
+ size_t pivot = (start + end) / 2;
+
+ // If pivot is zero, first point is the sampled point.
+ if(!pivot)
+ {
+ return pivot;
+ }
+
+ // Binary search recursive algorithm.
+ if(value > cDistribution(pivot - 1) && value <= cDistribution(pivot))
+ {
+ return (pivot - 1);
+ }
+ else if(value < cDistribution(pivot - 1))
+ {
+ return BinarySearch(cDistribution, value, start, pivot - 1);
+ }
+ else
+ {
+ return BinarySearch(cDistribution, value, pivot + 1, end);
+ }
+}
+
+void CosineTree::CalculateCosines(arma::vec& cosines)
+{
+ // Initialize cosine vector as a vector of zeros.
+ cosines.zeros(numColumns);
+
+ for(size_t i = 0; i < numColumns; i++)
+ {
+ // If norm is zero, store cosine value as zero. Else, calculate cosine value
+ // between two vectors.
+ if(l2NormsSquared(i) == 0)
+ {
+ cosines(i) = 0;
+ }
+ else
+ {
+ cosines(i) = arma::norm_dot(dataset.col(indices[splitPointIndex]),
+ dataset.col(indices[i]));
+ }
+ }
+}
+
+void CosineTree::CalculateCentroid()
+{
+ // Initialize centroid as vector of zeros.
+ centroid.zeros(dataset.n_rows);
+
+ // Calculate centroid of columns in the node.
+ for(size_t i = 0; i < numColumns; i++)
+ {
+ centroid += dataset.col(indices[i]);
+ }
+ centroid /= numColumns;
+}
+
}; // namespace tree
}; // namespace mlpack
Modified: mlpack/trunk/src/mlpack/tests/cosine_tree_test.cpp
==============================================================================
--- mlpack/trunk/src/mlpack/tests/cosine_tree_test.cpp (original)
+++ mlpack/trunk/src/mlpack/tests/cosine_tree_test.cpp Mon Jun 30 11:25:43 2014
@@ -43,7 +43,7 @@
}
/**
- * Checks CosineNode::CosineNodeSplit() by doing a depth first search on a
+ * Checks CosineTree::CosineNodeSplit() by doing a depth first search on a
* random dataset and checking if it satisfies the split condition.
*/
BOOST_AUTO_TEST_CASE(CosineNodeCosineSplit)
@@ -54,17 +54,17 @@
// Make a random dataset and the root object.
arma::mat data = arma::randu(numRows, numCols);
- CosineNode root(data);
+ CosineTree root(data);
// Stack for depth first search of the tree.
- std::vector<CosineNode*> nodeStack;
+ std::vector<CosineTree*> nodeStack;
nodeStack.push_back(&root);
// While stack is not empty.
while(nodeStack.size())
{
// Pop a node from the stack and split it.
- CosineNode *currentNode, *currentLeft, *currentRight;
+ CosineTree *currentNode, *currentLeft, *currentRight;
currentNode = nodeStack.back();
currentNode->CosineNodeSplit();
nodeStack.pop_back();
@@ -139,14 +139,14 @@
arma::mat data = arma::randu(numRows, numCols);
// Declare a queue and a dummy CosineTree object.
- CosineTree::CosineNodeQueue basisQueue;
+ CosineNodeQueue basisQueue;
CosineTree dummyTree(data, epsilon, delta);
for(size_t i = 0; i < numCols; i++)
{
// Make a new CosineNode object.
- CosineNode* basisNode;
- basisNode = new CosineNode(data);
+ CosineTree* basisNode;
+ basisNode = new CosineTree(data);
// Use the columns of the dataset as random centroids.
arma::vec centroid = data.col(i);
@@ -156,8 +156,8 @@
dummyTree.ModifiedGramSchmidt(basisQueue, centroid, newBasisVector);
// Check if the obtained vector is orthonormal to the basis vectors.
- CosineTree::CosineNodeQueue::const_iterator j = basisQueue.begin();
- CosineNode* currentNode;
+ CosineNodeQueue::const_iterator j = basisQueue.begin();
+ CosineTree* currentNode;
for(; j != basisQueue.end(); j++)
{
@@ -175,7 +175,7 @@
// Deallocate memory given to the objects.
for(size_t i = 0; i < numCols; i++)
{
- CosineNode* currentNode;
+ CosineTree* currentNode;
currentNode = basisQueue.top();
basisQueue.pop();
More information about the mlpack-svn
mailing list