[mlpack-git] master: Merged changes from #690 and #675 (840efe9)

Wed Jun 22 14:31:02 EDT 2016

Repository : https://github.com/mlpack/mlpack
On branch  : master
Link       : https://github.com/mlpack/mlpack/compare/eaa7182ebed8cce3fd6191dc1f8170546ea297da...812048c7c6bee0b6c8d936677f23bbb5930c6cfc

>---------------------------------------------------------------

commit 840efe976789936d718501b0c392b5b082520f5b
Merge: ae81ee5 a9f5622
Author: Yannis Mentekidis <mentekid at gmail.com>
Date:   Wed Jun 22 19:31:02 2016 +0100

    Merged changes from #690 and #675


>---------------------------------------------------------------

840efe976789936d718501b0c392b5b082520f5b
 .appveyor.yml                                      |  12 +-
 .gitignore                                         |   5 -
 CMakeLists.txt                                     |   2 +-
 HISTORY.md                                         |   9 +-
 LICENSE.txt                                        |  12 +-
 README.md                                          |   2 +-
 doc/tutorials/kmeans/kmeans.txt                    |  10 +-
 src/mlpack/core.hpp                                |   2 +-
 src/mlpack/core/arma_extend/CMakeLists.txt         |   4 -
 src/mlpack/core/arma_extend/Mat_extra_bones.hpp    |  10 +
 src/mlpack/core/arma_extend/Mat_extra_meat.hpp     |  10 +
 src/mlpack/core/arma_extend/README.md              |  23 +
 src/mlpack/core/arma_extend/SpMat_extra_bones.hpp  |  40 +-
 src/mlpack/core/arma_extend/SpMat_extra_meat.hpp   | 323 +------------
 src/mlpack/core/arma_extend/arma_extend.hpp        |   4 -
 src/mlpack/core/arma_extend/fn_ind2sub.hpp         |   8 +
 src/mlpack/core/arma_extend/hdf5_misc.hpp          |  11 +
 src/mlpack/core/arma_extend/operator_minus.hpp     |  10 +
 src/mlpack/core/arma_extend/promote_type.hpp       |  50 --
 src/mlpack/core/arma_extend/restrictors.hpp        |  22 -
 src/mlpack/core/arma_extend/traits.hpp             |  49 --
 src/mlpack/core/arma_extend/typedef.hpp            |  30 --
 src/mlpack/core/boost_backport/LICENSE.txt         |  23 +
 src/mlpack/core/boost_backport/README.md           |  11 +-
 .../unordered_collections_load_imp.hpp             |   3 +
 .../unordered_collections_save_imp.hpp             |   3 +
 src/mlpack/core/boost_backport/unordered_map.hpp   |   5 +-
 src/mlpack/core/data/CMakeLists.txt                |   1 +
 src/mlpack/core/data/binarize.hpp                  |  99 ++++
 src/mlpack/core/metrics/ip_metric.hpp              |   2 +-
 src/mlpack/core/metrics/ip_metric_impl.hpp         |  10 +-
 src/mlpack/core/metrics/lmetric.hpp                |   3 +-
 src/mlpack/core/metrics/lmetric_impl.hpp           |  57 ++-
 src/mlpack/core/optimizers/lbfgs/lbfgs.hpp         |   4 +-
 .../binary_space_tree/binary_space_tree_impl.hpp   |   1 -
 .../core/tree/cover_tree/cover_tree_impl.hpp       |   1 -
 .../core/tree/rectangle_tree/r_star_tree_split.hpp |  12 +-
 .../tree/rectangle_tree/r_star_tree_split_impl.hpp |  25 +-
 .../core/tree/rectangle_tree/r_tree_split.hpp      |  18 +-
 .../core/tree/rectangle_tree/r_tree_split_impl.hpp |  51 +-
 .../core/tree/rectangle_tree/rectangle_tree.hpp    |   4 +-
 .../tree/rectangle_tree/rectangle_tree_impl.hpp    |   6 +-
 .../core/tree/rectangle_tree/x_tree_split.hpp      |   9 +-
 .../core/tree/rectangle_tree/x_tree_split_impl.hpp |  24 +-
 src/mlpack/core/util/CMakeLists.txt                |   2 -
 src/mlpack/core/util/cli.cpp                       |  57 +--
 src/mlpack/core/util/cli.hpp                       |  18 +-
 src/mlpack/core/util/cli_impl.hpp                  |  45 +-
 src/mlpack/core/util/prefixedoutstream.hpp         |   1 -
 src/mlpack/core/util/string_util.cpp               |  47 --
 src/mlpack/core/util/string_util.hpp               |  23 -
 src/mlpack/methods/ann/ffn.hpp                     |  24 +-
 src/mlpack/methods/ann/rnn.hpp                     |  20 +-
 src/mlpack/methods/cf/cf_main.cpp                  |   2 +-
 src/mlpack/methods/det/det_main.cpp                |   2 +-
 src/mlpack/methods/emst/dtb_rules.hpp              |  28 --
 src/mlpack/methods/emst/dtb_rules_impl.hpp         |  46 --
 src/mlpack/methods/emst/emst_main.cpp              |  25 +-
 src/mlpack/methods/gmm/gmm_generate_main.cpp       |  11 +-
 src/mlpack/methods/gmm/gmm_probability_main.cpp    |  17 +-
 src/mlpack/methods/hmm/hmm_generate_main.cpp       |  16 +-
 src/mlpack/methods/hmm/hmm_train_main.cpp          |   7 +-
 src/mlpack/methods/hmm/hmm_viterbi_main.cpp        |  13 +-
 .../hoeffding_trees/hoeffding_tree_main.cpp        |  45 +-
 src/mlpack/methods/kmeans/CMakeLists.txt           |   1 +
 src/mlpack/methods/kmeans/allow_empty_clusters.hpp |  15 +-
 .../methods/kmeans/dual_tree_kmeans_impl.hpp       |   1 -
 src/mlpack/methods/kmeans/elkan_kmeans_impl.hpp    |   2 -
 src/mlpack/methods/kmeans/hamerly_kmeans_impl.hpp  |   2 -
 ..._empty_clusters.hpp => kill_empty_clusters.hpp} |  25 +-
 src/mlpack/methods/kmeans/kmeans_main.cpp          |  26 +-
 src/mlpack/methods/kmeans/naive_kmeans.hpp         |   5 +-
 src/mlpack/methods/kmeans/naive_kmeans_impl.hpp    |   2 -
 .../methods/kmeans/pelleg_moore_kmeans_impl.hpp    |   6 +-
 src/mlpack/methods/lars/lars_main.cpp              |  37 +-
 .../linear_regression/linear_regression_main.cpp   |  54 ++-
 src/mlpack/methods/lsh/lsh_main.cpp                |  17 +-
 src/mlpack/methods/lsh/lsh_search.hpp              |  71 ++-
 src/mlpack/methods/lsh/lsh_search_impl.hpp         | 241 ++++++----
 src/mlpack/methods/mvu/mvu_main.cpp                |  18 +-
 src/mlpack/methods/neighbor_search/CMakeLists.txt  |  30 +-
 .../methods/neighbor_search/neighbor_search.hpp    |   4 +-
 src/mlpack/methods/neighbor_search/ns_model.hpp    | 210 +++++++-
 .../methods/neighbor_search/ns_model_impl.hpp      | 532 +++++++++------------
 src/mlpack/methods/pca/pca_main.cpp                |   7 +-
 src/mlpack/methods/perceptron/perceptron_main.cpp  |  15 +-
 src/mlpack/methods/preprocess/CMakeLists.txt       |   1 +
 .../preprocess/preprocess_binarize_main.cpp        |  85 ++++
 .../methods/preprocess/preprocess_split_main.cpp   |  69 +--
 src/mlpack/methods/radical/radical_main.cpp        |   6 +-
 src/mlpack/methods/rann/CMakeLists.txt             |  20 +-
 .../rann/{allkrann_main.cpp => krann_main.cpp}     |   6 +-
 src/mlpack/methods/rann/ra_typedef.hpp             |  43 +-
 .../softmax_regression/softmax_regression_main.cpp |  78 ++-
 src/mlpack/tests/CMakeLists.txt                    |   3 +-
 src/mlpack/tests/binarize_test.cpp                 |  66 +++
 ...krann_search_test.cpp => krann_search_test.cpp} |  10 +-
 src/mlpack/tests/lsh_test.cpp                      | 218 +++++----
 src/mlpack/tests/metric_test.cpp                   |   4 +-
 src/mlpack/tests/rectangle_tree_test.cpp           |   1 -
 src/mlpack/tests/serialization_test.cpp            |  12 +-
 101 files changed, 1713 insertions(+), 1699 deletions(-)

diff --cc src/mlpack/methods/lsh/lsh_main.cpp
index b2681da,c4d6ff6..0c22f93

--- a/src/mlpack/methods/lsh/lsh_main.cpp
+++ b/src/mlpack/methods/lsh/lsh_main.cpp
@@@ -64,11 -64,10 +64,12 @@@ PARAM_INT("tables", "The number of has
  PARAM_DOUBLE("hash_width", "The hash width for the first-level hashing in the "
      "LSH preprocessing. By default, the LSH class automatically estimates a "
      "hash width for its use.", "H", 0.0);
 +PARAM_INT("num_probes", "Number of additional probes for Multiprobe LSH"
 +    " If 0, traditional LSH is used.", "T", 0);
  PARAM_INT("second_hash_size", "The size of the second level hash table.", "S",
      99901);
- PARAM_INT("bucket_size", "The size of a bucket in the second level hash.", "B",
+ PARAM_INT("bucket_size", "The maximum size of a bucket in the second level "
+     "hash; 0 indicates no limit (so the table can be arbitrarily large!).", "B",
      500);
  PARAM_INT("seed", "Random seed.  If 0, 'std::time(NULL)' is used.", "s", 0);
  
diff --cc src/mlpack/methods/lsh/lsh_search_impl.hpp
index a89ea36,149beab..2f81b66
--- a/src/mlpack/methods/lsh/lsh_search_impl.hpp
+++ b/src/mlpack/methods/lsh/lsh_search_impl.hpp
@@@ -106,8 -105,8 +106,9 @@@ void LSHSearch<SortPolicy>::Train(cons
  
    if (hashWidth == 0.0) // The user has not provided any value.
    {
++    const size_t numSamples = 25;
      // Compute a heuristic hash width from the data.
--    for (size_t i = 0; i < 25; i++)
++    for (size_t i = 0; i < numSamples; i++)
      {
        size_t p1 = (size_t) math::RandInt(referenceSet.n_cols);
        size_t p2 = (size_t) math::RandInt(referenceSet.n_cols);
@@@ -116,7 -115,7 +117,7 @@@
            referenceSet.unsafe_col(p1), referenceSet.unsafe_col(p2)));
      }
  
--    hashWidth /= 25;
++    hashWidth /= numSamples;
    }
  
    Log::Info << "Hash width chosen as: " << hashWidth << std::endl;
@@@ -625,69 -354,31 +618,70 @@@ void LSHSearch<SortPolicy>::ReturnIndic
  
    // Compute the projection of the query in each table.
    arma::mat allProjInTables(numProj, numTablesToSearch);
 +  arma::mat queryCodesNotFloored(numProj, numTablesToSearch);
    for (size_t i = 0; i < numTablesToSearch; i++)
 -    //allProjInTables.unsafe_col(i) = projections[i].t() * queryPoint;
 -    allProjInTables.unsafe_col(i) = projections.slice(i).t() * queryPoint;
 -  allProjInTables += offsets.cols(0, numTablesToSearch - 1);
 -  allProjInTables /= hashWidth;
 +    queryCodesNotFloored.unsafe_col(i) = projections.slice(i).t() * queryPoint;
 +  queryCodesNotFloored += offsets.cols(0, numTablesToSearch - 1);
 +  allProjInTables = arma::floor(queryCodesNotFloored/hashWidth);
  
- 
    // Compute the hash value of each key of the query into a bucket of the
    // 'secondHashTable' using the 'secondHashWeights'.
 -  arma::rowvec hashVec = secondHashWeights.t() * arma::floor(allProjInTables);
 -
 +  arma::Row<size_t> hashVec = 
 +    arma::conv_to< arma::Row<size_t> >::
 +    from( secondHashWeights.t() * allProjInTables ); // typecast to floor
 +  // mod to compute 2nd-level codes
    for (size_t i = 0; i < hashVec.n_elem; i++)
 -    hashVec[i] = (double) ((size_t) hashVec[i] % secondHashSize);
 +    hashVec[i] = (hashVec[i] % secondHashSize);
  
    Log::Assert(hashVec.n_elem == numTablesToSearch);
  
 +  // Compute hashVectors of additional probing bins
 +  arma::Mat<size_t> hashMat;
 +  if (T > 0)
 +  {
 +    hashMat.zeros(T, numTablesToSearch);
 +
 +    for (size_t i = 0; i < numTablesToSearch; ++i)
 +    {
 +      // construct this table's probing sequence of length T
 +      arma::mat additionalProbingBins;
 +      GetAdditionalProbingBins(allProjInTables.unsafe_col(i),
 +                                queryCodesNotFloored.unsafe_col(i),
 +                                T,
 +                                additionalProbingBins);
 +
 +      // map the probing bin to second hash table bins
 +      hashMat.col(i) = 
 +        arma::conv_to< arma::Col<size_t> >::
 +        from(additionalProbingBins.t() * secondHashWeights); // typecast floor
 +      for (size_t p = 0; p < T; ++p)
 +        hashMat(p, i) = (hashMat(p, i) % secondHashSize);
 +    }
 +
 +    // top row of hashMat is primary bins for each table
 +    hashMat = arma::join_vert(hashVec, hashMat);
 +  }
 +  else
 +  {
 +    // if not multiprobe, hashMat is only hashVec's elements
 +    hashMat.set_size(1, numTablesToSearch);
 +    hashMat.row(0) = hashVec;
 +  }
 +
-   // Count number of points hashed in the same bucket as the query
++
+   // Count number of points hashed in the same bucket as the query.
    size_t maxNumPoints = 0;
-   for (size_t i = 0; i < numTablesToSearch; ++i) //For all tables
+   for (size_t i = 0; i < numTablesToSearch; ++i)
    {
 -    const size_t hashInd = (size_t) hashVec[i];
 -    const size_t tableRow = bucketRowInHashTable[hashInd];
 -    if (tableRow != secondHashSize)
 -      maxNumPoints += bucketContentSize[tableRow];
 +    for (size_t p = 0; p < T + 1; ++p)
 +    {
-       size_t hashInd = (size_t) hashMat(p, i); //find query's bucket
-       maxNumPoints += bucketContentSize[hashInd]; //count bucket contents
++      const size_t hashInd = hashMat(p, i); // find query's bucket
++      const size_t tableRow = bucketRowInHashTable[hashInd];
++      if (tableRow != secondHashSize)
++        maxNumPoints += bucketContentSize[hashInd]; // count bucket contents
 +    }
    }
  
- 
    // There are two ways to proceed here:
    // Either allocate a maxNumPoints-size vector, place all candidates, and run
    // unique on the vector to discard duplicates.
@@@ -707,25 -398,15 +701,25 @@@
      arma::Col<size_t> refPointsConsidered;
      refPointsConsidered.zeros(referenceSet->n_cols);
  
 -    for (size_t i = 0; i < hashVec.n_elem; ++i)
 +    for (size_t i = 0; i < numTablesToSearch; ++i) // for all tables
      {
 -      const size_t hashInd = (size_t) hashVec[i];
 -      const size_t tableRow = bucketRowInHashTable[hashInd];
 +      for (size_t p = 0; p < T + 1; ++p) // for entire probing sequence
 +      {
  
 -      // Pick the indices in the bucket corresponding to 'hashInd'.
 -      if (tableRow != secondHashSize)
 -        for (size_t j = 0; j < bucketContentSize[tableRow]; j++)
 -          refPointsConsidered[secondHashTable[tableRow](j)]++;
 +        // get the sequence code
-         size_t hashInd = (size_t) hashMat(p, i);
++        size_t hashInd = hashMat(p, i);
 +
 +        if (bucketContentSize[hashInd] > 0)
 +        {
 +          // Pick the indices in the bucket corresponding to hashInd.
 +          size_t tableRow = bucketRowInHashTable[hashInd];
-           assert(tableRow < secondHashSize);
-           assert(tableRow < secondHashTable.n_rows);
- 
-           for (size_t j = 0; j < bucketContentSize[hashInd]; ++j)
-             refPointsConsidered[secondHashTable(tableRow, j)]++;
++          if (tableRow != secondHashSize)
++          {
++            for (size_t j = 0; j < bucketContentSize[tableRow]; ++j)
++              refPointsConsidered[ secondHashTable[tableRow](j) ]++;
++          }
 +        }
 +      }
      }
  
      // Only keep reference points found in at least one bucket.
@@@ -744,22 -425,13 +738,16 @@@
      size_t start = 0;
      for (size_t i = 0; i < numTablesToSearch; ++i) // For all tables
      {
 -      const size_t hashInd = (size_t) hashVec[i]; // Find the query's bucket.
 -      const size_t tableRow = bucketRowInHashTable[hashInd];
 +      for (size_t p = 0; p < T + 1; ++p)
 +      {
-         size_t hashInd =  hashMat(p, i); // Find the query's bucket.
++        const size_t hashInd =  hashMat(p, i); // Find the query's bucket.
++        const size_t tableRow = bucketRowInHashTable[hashInd];
  
-         if (bucketContentSize[hashInd] > 0)
-         {
-           // tableRow hash indices corresponding to query.
-           size_t tableRow = bucketRowInHashTable[hashInd];
-           assert(tableRow < secondHashSize);
-           assert(tableRow < secondHashTable.n_rows);
- 
-           // Store all secondHashTable points in the candidates set.
-           for (size_t j = 0; j < bucketContentSize[hashInd]; ++j)
-             refPointsConsideredSmall(start++) = secondHashTable(tableRow, j);
-         }
 -      // Store all secondHashTable points in the candidates set.
 -      if (tableRow != secondHashSize)
 -        for (size_t j = 0; j < bucketContentSize[tableRow]; ++j)
 -          refPointsConsideredSmall(start++) = secondHashTable[tableRow][j];
++        if (tableRow != secondHashSize)
++         // Store all secondHashTable points in the candidates set.
++         for (size_t j = 0; j < bucketContentSize[tableRow]; ++j)
++           refPointsConsideredSmall(start++) = secondHashTable[tableRow][j];
 +      }
      }
  
      // Only keep unique candidates.
diff --cc src/mlpack/tests/lsh_test.cpp
index 2d9f556,65d1d78..7ae48d1
--- a/src/mlpack/tests/lsh_test.cpp
+++ b/src/mlpack/tests/lsh_test.cpp
@@@ -477,160 -454,18 +454,155 @@@ BOOST_AUTO_TEST_CASE(DeterministicNoMer
      if (neighbors(j, 0) == N || neighbors(j, 1) == N)
        continue;
  
-     //query 1 is in cluster 3, which is points 20:29
+     // Query 1 is in cluster 3, which is points 20:29.
      q = 0;
-     BOOST_REQUIRE(
-         neighbors(j, q) < 3 * N / 4 &&
-         neighbors(j, q) >= N / 2
-         );
+     BOOST_REQUIRE_LT(neighbors(j, q), 3 * N / 4);
+     BOOST_REQUIRE_GE(neighbors(j, q), N / 2);
  
-     //query 2 is in cluster 2, which is points 10:19
+     // Query 2 is in cluster 2, which is points 10:19.
      q = 1;
-     BOOST_REQUIRE(
-         neighbors(j, q) < N / 2 &&
-         neighbors(j, q) >= N / 4
-         );
+     BOOST_REQUIRE_LT(neighbors(j, q), N / 2);
+     BOOST_REQUIRE_GE(neighbors(j, q), N / 4);
    }
- 
  }
  
 +/**
 + * Test: Create an LSHSearch object and use an increasing number of probes to
 + * search for points. Require that recall for the same object doesn't decrease
 + * with increasing number of probes. Also require that at least a few times
 + * there's some increase in recall.
 + */
 +BOOST_AUTO_TEST_CASE(MultiprobeTest)
 +{
 +  const double epsilonIncrease = 0.05;
 +  const size_t repetitions = 5; // train five objects
 +
 +  const size_t probeTrials = 5;
 +  const size_t numProbes[probeTrials] = {0, 1, 2, 3, 4};
 +
 +
 +  /// algorithm parameters
 +  const int k = 4;
 +  const int numTables = 16;
 +  const int numProj = 3;
 +  const double hashWidth = 0;
 +  const int secondHashSize = 99901;
 +  const int bucketSize = 500;
 +
 +  const string trainSet = "iris_train.csv";
 +  const string testSet = "iris_test.csv";
 +  arma::mat rdata;
 +  arma::mat qdata;
 +  data::Load(trainSet, rdata, true);
 +  data::Load(testSet, qdata, true);
 +
 +  // Run classic knn on reference set
 +  KNN knn(rdata);
 +  arma::Mat<size_t> groundTruth;
 +  arma::mat groundDistances;
 +  knn.Search(qdata, k, groundTruth, groundDistances);
 +  
 +  bool foundIncrease = 0;
 +
 +  for (size_t rep = 0; rep < repetitions; ++rep)
 +  {
 +    // train a model
 +    LSHSearch<> multiprobeTest(
 +        rdata,
 +        numProj,
 +        numTables,
 +        hashWidth,
 +        secondHashSize,
 +        bucketSize);
 +
 +    double prevRecall = 0;
 +    // search with varying number of probes
 +    for (size_t p = 0; p < probeTrials; ++p)
 +    {
 +      arma::Mat<size_t> lshNeighbors;
 +      arma::mat lshDistances; //move outside of loop for speed?
 +      
 +      multiprobeTest.Search(
 +          qdata, 
 +          k, 
 +          lshNeighbors, 
 +          lshDistances, 
 +          0, 
 +          numProbes[p]);
 +
 +      // compute recall
 +      double recall = ComputeRecall(lshNeighbors, groundTruth); //TODO: change to LSHSearch::ComputeRecall??
 +      if (p > 0)
 +      {
 +        // more probes should at the very least not lower recall...
 +        BOOST_REQUIRE_GE(recall, prevRecall);
 +
 +        // ... and should ideally increase it a bit
 +        if (recall > prevRecall + epsilonIncrease)
 +         foundIncrease = true;
 +        prevRecall = recall;
 +      }
 +    }
 +  }
 +
 +  BOOST_REQUIRE(foundIncrease);
 +}
 +
 +/**
 + * Test: This is a deterministic test that verifies multiprobe LSH works
 + * correctly. To do this, we generate a query , q1, that will end up
 + * outside of the bucket of its nearest neighbors in C1. The neighbors will be
 + * found only by searching additional probing bins.
 + *
 + * We verify this actually is the case.
 + */
 +BOOST_AUTO_TEST_CASE(MultiprobeDeterministicTest)
 +{
 +  // generate known deterministic clusters of points
 +  const size_t N = 40;
 +  arma::mat rdata;
 +  GetPointset(N, rdata);
 +
 +  const int k = N/2;
 +  const double hashWidth = 1;
 +  const int secondHashSize = 99901;
 +  const int bucketSize = 500;
 +
 +  // 1 table, projections on orthonormal plane
 +  arma::cube projections(2, 2, 1);
 +  projections(0, 0, 0) = 1;
 +  projections(1, 0, 0) = 0;
 +  projections(0, 1, 0) = 0;
 +  projections(1, 1, 0) = 1;
 +
 +  // construct LSH object with given tables
 +  LSHSearch<> lshTest(rdata, projections,
 +                      hashWidth, secondHashSize, bucketSize);
 +
 +  // get offset of each projection dimension. Since projection table is I(2),
 +  // offsets will map to offsets of actual dimensions. This is to enforce that
 +  // q1 is right outside the bounding box of C1.
 +  arma::mat offsets = lshTest.Offsets();
 +
 +
 +  // two points near the clusters but outside their bins. q1's lowest scoring
 +  // perturbation vectors will map to C1 cluster's bins.
 +  arma::mat q1;
 +  q1 << 1.1 << arma::endr << 3.3; // vector [1.1, 3.3]
 +  q1 += offsets;
 +
 +  arma::Mat<size_t> neighbors;
 +  arma::mat distances;
 +
 +  // Test that q1 simple search comes up empty
 +  lshTest.Search(q1, k, neighbors, distances);
 +  BOOST_REQUIRE( arma::all(neighbors.col(0) == N) );
 +
 +  // Searching with 3 additional probing bins should find neighbors
 +  lshTest.Search(q1, k, neighbors, distances, 0, 3);
 +  BOOST_REQUIRE( arma::all(neighbors.col(0) == N || neighbors.col(0) < 10) );
 +}
 +
  BOOST_AUTO_TEST_CASE(LSHTrainTest)
  {
    // This is a not very good test that simply checks that the re-trained LSH