You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
549 lines
20 KiB
549 lines
20 KiB
/***********************************************************************
|
|
* Software License Agreement (BSD License)
|
|
*
|
|
* Copyright 2008-2009 Marius Muja (mariusm@cs.ubc.ca). All rights reserved.
|
|
* Copyright 2008-2009 David G. Lowe (lowe@cs.ubc.ca). All rights reserved.
|
|
*
|
|
* THE BSD LICENSE
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions
|
|
* are met:
|
|
*
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
* notice, this list of conditions and the following disclaimer.
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
* documentation and/or other materials provided with the distribution.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
|
|
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
|
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
|
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
|
|
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
|
|
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
|
|
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
*************************************************************************/
|
|
|
|
/***********************************************************************
|
|
* Author: Vincent Rabaud
|
|
*************************************************************************/
|
|
|
|
#ifndef FLANN_LSH_INDEX_H_
|
|
#define FLANN_LSH_INDEX_H_
|
|
|
|
#include <algorithm>
|
|
#include <cassert>
|
|
#include <cstring>
|
|
#include <map>
|
|
#include <vector>
|
|
|
|
#include "FLANN/general.h"
|
|
#include "FLANN/algorithms/nn_index.h"
|
|
#include "FLANN/util/matrix.h"
|
|
#include "FLANN/util/result_set.h"
|
|
#include "FLANN/util/heap.h"
|
|
#include "FLANN/util/lsh_table.h"
|
|
#include "FLANN/util/allocator.h"
|
|
#include "FLANN/util/random.h"
|
|
#include "FLANN/util/saving.h"
|
|
|
|
namespace flann
|
|
{
|
|
|
|
struct LshIndexParams : public IndexParams
|
|
{
|
|
LshIndexParams(unsigned int table_number = 12, unsigned int key_size = 20, unsigned int multi_probe_level = 2)
|
|
{
|
|
(* this)["algorithm"] = FLANN_INDEX_LSH;
|
|
// The number of hash tables to use
|
|
(*this)["table_number"] = table_number;
|
|
// The length of the key in the hash tables
|
|
(*this)["key_size"] = key_size;
|
|
// Number of levels to use in multi-probe (0 for standard LSH)
|
|
(*this)["multi_probe_level"] = multi_probe_level;
|
|
}
|
|
};
|
|
|
|
/**
|
|
* Locality-sensitive hashing index
|
|
*
|
|
* Contains the tables and other information for indexing a set of points
|
|
* for nearest-neighbor matching.
|
|
*/
|
|
template<typename Distance>
|
|
class LshIndex : public NNIndex<Distance>
|
|
{
|
|
public:
|
|
typedef typename Distance::ElementType ElementType;
|
|
typedef typename Distance::ResultType DistanceType;
|
|
|
|
typedef NNIndex<Distance> BaseClass;
|
|
|
|
/** Constructor
|
|
* @param params parameters passed to the LSH algorithm
|
|
* @param d the distance used
|
|
*/
|
|
LshIndex(const IndexParams& params = LshIndexParams(), Distance d = Distance()) :
|
|
BaseClass(params, d)
|
|
{
|
|
table_number_ = get_param<unsigned int>(index_params_,"table_number",12);
|
|
key_size_ = get_param<unsigned int>(index_params_,"key_size",20);
|
|
multi_probe_level_ = get_param<unsigned int>(index_params_,"multi_probe_level",2);
|
|
|
|
fill_xor_mask(0, key_size_, multi_probe_level_, xor_masks_);
|
|
}
|
|
|
|
|
|
/** Constructor
|
|
* @param input_data dataset with the input features
|
|
* @param params parameters passed to the LSH algorithm
|
|
* @param d the distance used
|
|
*/
|
|
LshIndex(const Matrix<ElementType>& input_data, const IndexParams& params = LshIndexParams(), Distance d = Distance()) :
|
|
BaseClass(params, d)
|
|
{
|
|
table_number_ = get_param<unsigned int>(index_params_,"table_number",12);
|
|
key_size_ = get_param<unsigned int>(index_params_,"key_size",20);
|
|
multi_probe_level_ = get_param<unsigned int>(index_params_,"multi_probe_level",2);
|
|
|
|
fill_xor_mask(0, key_size_, multi_probe_level_, xor_masks_);
|
|
|
|
setDataset(input_data);
|
|
}
|
|
|
|
LshIndex(const LshIndex& other) : BaseClass(other),
|
|
tables_(other.tables_),
|
|
table_number_(other.table_number_),
|
|
key_size_(other.key_size_),
|
|
multi_probe_level_(other.multi_probe_level_),
|
|
xor_masks_(other.xor_masks_)
|
|
{
|
|
}
|
|
|
|
LshIndex& operator=(LshIndex other)
|
|
{
|
|
this->swap(other);
|
|
return *this;
|
|
}
|
|
|
|
virtual ~LshIndex()
|
|
{
|
|
freeIndex();
|
|
}
|
|
|
|
|
|
BaseClass* clone() const
|
|
{
|
|
return new LshIndex(*this);
|
|
}
|
|
|
|
using BaseClass::buildIndex;
|
|
|
|
void addPoints(const Matrix<ElementType>& points, float rebuild_threshold = 2)
|
|
{
|
|
assert(points.cols==veclen_);
|
|
size_t old_size = size_;
|
|
|
|
extendDataset(points);
|
|
|
|
if (rebuild_threshold>1 && size_at_build_*rebuild_threshold<size_) {
|
|
buildIndex();
|
|
}
|
|
else {
|
|
for (unsigned int i = 0; i < table_number_; ++i) {
|
|
lsh::LshTable<ElementType>& table = tables_[i];
|
|
for (size_t i=old_size;i<size_;++i) {
|
|
table.add(i, points_[i]);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
flann_algorithm_t getType() const
|
|
{
|
|
return FLANN_INDEX_LSH;
|
|
}
|
|
|
|
|
|
template<typename Archive>
|
|
void serialize(Archive& ar)
|
|
{
|
|
ar.setObject(this);
|
|
|
|
ar & *static_cast<NNIndex<Distance>*>(this);
|
|
|
|
ar & table_number_;
|
|
ar & key_size_;
|
|
ar & multi_probe_level_;
|
|
|
|
ar & xor_masks_;
|
|
ar & tables_;
|
|
|
|
if (Archive::is_loading::value) {
|
|
index_params_["algorithm"] = getType();
|
|
index_params_["table_number"] = table_number_;
|
|
index_params_["key_size"] = key_size_;
|
|
index_params_["multi_probe_level"] = multi_probe_level_;
|
|
}
|
|
}
|
|
|
|
void saveIndex(FILE* stream)
|
|
{
|
|
serialization::SaveArchive sa(stream);
|
|
sa & *this;
|
|
}
|
|
|
|
void loadIndex(FILE* stream)
|
|
{
|
|
serialization::LoadArchive la(stream);
|
|
la & *this;
|
|
}
|
|
|
|
/**
|
|
* Computes the index memory usage
|
|
* Returns: memory used by the index
|
|
*/
|
|
int usedMemory() const
|
|
{
|
|
return size_ * sizeof(int);
|
|
}
|
|
|
|
/**
|
|
* \brief Perform k-nearest neighbor search
|
|
* \param[in] queries The query points for which to find the nearest neighbors
|
|
* \param[out] indices The indices of the nearest neighbors found
|
|
* \param[out] dists Distances to the nearest neighbors found
|
|
* \param[in] knn Number of nearest neighbors to return
|
|
* \param[in] params Search parameters
|
|
*/
|
|
int knnSearch(const Matrix<ElementType>& queries,
|
|
Matrix<size_t>& indices,
|
|
Matrix<DistanceType>& dists,
|
|
size_t knn,
|
|
const SearchParams& params) const
|
|
{
|
|
assert(queries.cols == veclen_);
|
|
assert(indices.rows >= queries.rows);
|
|
assert(dists.rows >= queries.rows);
|
|
assert(indices.cols >= knn);
|
|
assert(dists.cols >= knn);
|
|
|
|
int count = 0;
|
|
if (params.use_heap==FLANN_True) {
|
|
#pragma omp parallel num_threads(params.cores)
|
|
{
|
|
KNNUniqueResultSet<DistanceType> resultSet(knn);
|
|
#pragma omp for schedule(dynamic) reduction(+:count)
|
|
for (int i = 0; i < (int)queries.rows; i++) {
|
|
resultSet.clear();
|
|
findNeighbors(resultSet, queries[i], params);
|
|
size_t n = std::min(resultSet.size(), knn);
|
|
resultSet.copy(indices[i], dists[i], n, params.sorted);
|
|
indices_to_ids(indices[i], indices[i], n);
|
|
count += n;
|
|
}
|
|
}
|
|
}
|
|
else {
|
|
#pragma omp parallel num_threads(params.cores)
|
|
{
|
|
KNNResultSet<DistanceType> resultSet(knn);
|
|
#pragma omp for schedule(dynamic) reduction(+:count)
|
|
for (int i = 0; i < (int)queries.rows; i++) {
|
|
resultSet.clear();
|
|
findNeighbors(resultSet, queries[i], params);
|
|
size_t n = std::min(resultSet.size(), knn);
|
|
resultSet.copy(indices[i], dists[i], n, params.sorted);
|
|
indices_to_ids(indices[i], indices[i], n);
|
|
count += n;
|
|
}
|
|
}
|
|
}
|
|
|
|
return count;
|
|
}
|
|
|
|
/**
|
|
* \brief Perform k-nearest neighbor search
|
|
* \param[in] queries The query points for which to find the nearest neighbors
|
|
* \param[out] indices The indices of the nearest neighbors found
|
|
* \param[out] dists Distances to the nearest neighbors found
|
|
* \param[in] knn Number of nearest neighbors to return
|
|
* \param[in] params Search parameters
|
|
*/
|
|
int knnSearch(const Matrix<ElementType>& queries,
|
|
std::vector< std::vector<size_t> >& indices,
|
|
std::vector<std::vector<DistanceType> >& dists,
|
|
size_t knn,
|
|
const SearchParams& params) const
|
|
{
|
|
assert(queries.cols == veclen_);
|
|
if (indices.size() < queries.rows ) indices.resize(queries.rows);
|
|
if (dists.size() < queries.rows ) dists.resize(queries.rows);
|
|
|
|
int count = 0;
|
|
if (params.use_heap==FLANN_True) {
|
|
#pragma omp parallel num_threads(params.cores)
|
|
{
|
|
KNNUniqueResultSet<DistanceType> resultSet(knn);
|
|
#pragma omp for schedule(dynamic) reduction(+:count)
|
|
for (int i = 0; i < (int)queries.rows; i++) {
|
|
resultSet.clear();
|
|
findNeighbors(resultSet, queries[i], params);
|
|
size_t n = std::min(resultSet.size(), knn);
|
|
indices[i].resize(n);
|
|
dists[i].resize(n);
|
|
if (n > 0) {
|
|
resultSet.copy(&indices[i][0], &dists[i][0], n, params.sorted);
|
|
indices_to_ids(&indices[i][0], &indices[i][0], n);
|
|
}
|
|
count += n;
|
|
}
|
|
}
|
|
}
|
|
else {
|
|
#pragma omp parallel num_threads(params.cores)
|
|
{
|
|
KNNResultSet<DistanceType> resultSet(knn);
|
|
#pragma omp for schedule(dynamic) reduction(+:count)
|
|
for (int i = 0; i < (int)queries.rows; i++) {
|
|
resultSet.clear();
|
|
findNeighbors(resultSet, queries[i], params);
|
|
size_t n = std::min(resultSet.size(), knn);
|
|
indices[i].resize(n);
|
|
dists[i].resize(n);
|
|
if (n > 0) {
|
|
resultSet.copy(&indices[i][0], &dists[i][0], n, params.sorted);
|
|
indices_to_ids(&indices[i][0], &indices[i][0], n);
|
|
}
|
|
count += n;
|
|
}
|
|
}
|
|
}
|
|
|
|
return count;
|
|
}
|
|
|
|
/**
|
|
* Find set of nearest neighbors to vec. Their indices are stored inside
|
|
* the result object.
|
|
*
|
|
* Params:
|
|
* result = the result object in which the indices of the nearest-neighbors are stored
|
|
* vec = the vector for which to search the nearest neighbors
|
|
* maxCheck = the maximum number of restarts (in a best-bin-first manner)
|
|
*/
|
|
void findNeighbors(ResultSet<DistanceType>& result, const ElementType* vec, const SearchParams& /*searchParams*/) const
|
|
{
|
|
getNeighbors(vec, result);
|
|
}
|
|
|
|
protected:
|
|
|
|
/**
|
|
* Builds the index
|
|
*/
|
|
void buildIndexImpl()
|
|
{
|
|
tables_.resize(table_number_);
|
|
std::vector<std::pair<size_t,ElementType*> > features;
|
|
features.reserve(points_.size());
|
|
for (size_t i=0;i<points_.size();++i) {
|
|
features.push_back(std::make_pair(i, points_[i]));
|
|
}
|
|
for (unsigned int i = 0; i < table_number_; ++i) {
|
|
lsh::LshTable<ElementType>& table = tables_[i];
|
|
table = lsh::LshTable<ElementType>(veclen_, key_size_);
|
|
|
|
// Add the features to the table
|
|
table.add(features);
|
|
}
|
|
}
|
|
|
|
void freeIndex()
|
|
{
|
|
/* nothing to do here */
|
|
}
|
|
|
|
|
|
private:
|
|
/** Defines the comparator on score and index
|
|
*/
|
|
typedef std::pair<float, unsigned int> ScoreIndexPair;
|
|
struct SortScoreIndexPairOnSecond
|
|
{
|
|
bool operator()(const ScoreIndexPair& left, const ScoreIndexPair& right) const
|
|
{
|
|
return left.second < right.second;
|
|
}
|
|
};
|
|
|
|
/** Fills the different xor masks to use when getting the neighbors in multi-probe LSH
|
|
* @param key the key we build neighbors from
|
|
* @param lowest_index the lowest index of the bit set
|
|
* @param level the multi-probe level we are at
|
|
* @param xor_masks all the xor mask
|
|
*/
|
|
void fill_xor_mask(lsh::BucketKey key, int lowest_index, unsigned int level,
|
|
std::vector<lsh::BucketKey>& xor_masks)
|
|
{
|
|
xor_masks.push_back(key);
|
|
if (level == 0) return;
|
|
for (int index = lowest_index - 1; index >= 0; --index) {
|
|
// Create a new key
|
|
lsh::BucketKey new_key = key | (lsh::BucketKey(1) << index);
|
|
fill_xor_mask(new_key, index, level - 1, xor_masks);
|
|
}
|
|
}
|
|
|
|
/** Performs the approximate nearest-neighbor search.
|
|
* @param vec the feature to analyze
|
|
* @param do_radius flag indicating if we check the radius too
|
|
* @param radius the radius if it is a radius search
|
|
* @param do_k flag indicating if we limit the number of nn
|
|
* @param k_nn the number of nearest neighbors
|
|
* @param checked_average used for debugging
|
|
*/
|
|
void getNeighbors(const ElementType* vec, bool do_radius, float radius, bool do_k, unsigned int k_nn,
|
|
float& checked_average)
|
|
{
|
|
static std::vector<ScoreIndexPair> score_index_heap;
|
|
|
|
if (do_k) {
|
|
unsigned int worst_score = std::numeric_limits<unsigned int>::max();
|
|
typename std::vector<lsh::LshTable<ElementType> >::const_iterator table = tables_.begin();
|
|
typename std::vector<lsh::LshTable<ElementType> >::const_iterator table_end = tables_.end();
|
|
for (; table != table_end; ++table) {
|
|
size_t key = table->getKey(vec);
|
|
std::vector<lsh::BucketKey>::const_iterator xor_mask = xor_masks_.begin();
|
|
std::vector<lsh::BucketKey>::const_iterator xor_mask_end = xor_masks_.end();
|
|
for (; xor_mask != xor_mask_end; ++xor_mask) {
|
|
size_t sub_key = key ^ (*xor_mask);
|
|
const lsh::Bucket* bucket = table->getBucketFromKey(sub_key);
|
|
if (bucket == 0) continue;
|
|
|
|
// Go over each descriptor index
|
|
std::vector<lsh::FeatureIndex>::const_iterator training_index = bucket->begin();
|
|
std::vector<lsh::FeatureIndex>::const_iterator last_training_index = bucket->end();
|
|
DistanceType hamming_distance;
|
|
|
|
// Process the rest of the candidates
|
|
for (; training_index < last_training_index; ++training_index) {
|
|
if (removed_ && removed_points_.test(*training_index)) continue;
|
|
hamming_distance = distance_(vec, points_[*training_index].point, veclen_);
|
|
|
|
if (hamming_distance < worst_score) {
|
|
// Insert the new element
|
|
score_index_heap.push_back(ScoreIndexPair(hamming_distance, training_index));
|
|
std::push_heap(score_index_heap.begin(), score_index_heap.end());
|
|
|
|
if (score_index_heap.size() > (unsigned int)k_nn) {
|
|
// Remove the highest distance value as we have too many elements
|
|
std::pop_heap(score_index_heap.begin(), score_index_heap.end());
|
|
score_index_heap.pop_back();
|
|
// Keep track of the worst score
|
|
worst_score = score_index_heap.front().first;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
else {
|
|
typename std::vector<lsh::LshTable<ElementType> >::const_iterator table = tables_.begin();
|
|
typename std::vector<lsh::LshTable<ElementType> >::const_iterator table_end = tables_.end();
|
|
for (; table != table_end; ++table) {
|
|
size_t key = table->getKey(vec);
|
|
std::vector<lsh::BucketKey>::const_iterator xor_mask = xor_masks_.begin();
|
|
std::vector<lsh::BucketKey>::const_iterator xor_mask_end = xor_masks_.end();
|
|
for (; xor_mask != xor_mask_end; ++xor_mask) {
|
|
size_t sub_key = key ^ (*xor_mask);
|
|
const lsh::Bucket* bucket = table->getBucketFromKey(sub_key);
|
|
if (bucket == 0) continue;
|
|
|
|
// Go over each descriptor index
|
|
std::vector<lsh::FeatureIndex>::const_iterator training_index = bucket->begin();
|
|
std::vector<lsh::FeatureIndex>::const_iterator last_training_index = bucket->end();
|
|
DistanceType hamming_distance;
|
|
|
|
// Process the rest of the candidates
|
|
for (; training_index < last_training_index; ++training_index) {
|
|
if (removed_ && removed_points_.test(*training_index)) continue;
|
|
// Compute the Hamming distance
|
|
hamming_distance = distance_(vec, points_[*training_index].point, veclen_);
|
|
if (hamming_distance < radius) score_index_heap.push_back(ScoreIndexPair(hamming_distance, training_index));
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/** Performs the approximate nearest-neighbor search.
|
|
* This is a slower version than the above as it uses the ResultSet
|
|
* @param vec the feature to analyze
|
|
*/
|
|
void getNeighbors(const ElementType* vec, ResultSet<DistanceType>& result) const
|
|
{
|
|
typename std::vector<lsh::LshTable<ElementType> >::const_iterator table = tables_.begin();
|
|
typename std::vector<lsh::LshTable<ElementType> >::const_iterator table_end = tables_.end();
|
|
for (; table != table_end; ++table) {
|
|
size_t key = table->getKey(vec);
|
|
std::vector<lsh::BucketKey>::const_iterator xor_mask = xor_masks_.begin();
|
|
std::vector<lsh::BucketKey>::const_iterator xor_mask_end = xor_masks_.end();
|
|
for (; xor_mask != xor_mask_end; ++xor_mask) {
|
|
size_t sub_key = key ^ (*xor_mask);
|
|
const lsh::Bucket* bucket = table->getBucketFromKey(sub_key);
|
|
if (bucket == 0) continue;
|
|
|
|
// Go over each descriptor index
|
|
std::vector<lsh::FeatureIndex>::const_iterator training_index = bucket->begin();
|
|
std::vector<lsh::FeatureIndex>::const_iterator last_training_index = bucket->end();
|
|
DistanceType hamming_distance;
|
|
|
|
// Process the rest of the candidates
|
|
for (; training_index < last_training_index; ++training_index) {
|
|
if (removed_ && removed_points_.test(*training_index)) continue;
|
|
// Compute the Hamming distance
|
|
hamming_distance = distance_(vec, points_[*training_index], veclen_);
|
|
result.addPoint(hamming_distance, *training_index);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
void swap(LshIndex& other)
|
|
{
|
|
BaseClass::swap(other);
|
|
std::swap(tables_, other.tables_);
|
|
std::swap(size_at_build_, other.size_at_build_);
|
|
std::swap(table_number_, other.table_number_);
|
|
std::swap(key_size_, other.key_size_);
|
|
std::swap(multi_probe_level_, other.multi_probe_level_);
|
|
std::swap(xor_masks_, other.xor_masks_);
|
|
}
|
|
|
|
/** The different hash tables */
|
|
std::vector<lsh::LshTable<ElementType> > tables_;
|
|
|
|
/** table number */
|
|
unsigned int table_number_;
|
|
/** key size */
|
|
unsigned int key_size_;
|
|
/** How far should we look for neighbors in multi-probe LSH */
|
|
unsigned int multi_probe_level_;
|
|
|
|
/** The XOR masks to apply to a key to get the neighboring buckets */
|
|
std::vector<lsh::BucketKey> xor_masks_;
|
|
|
|
USING_BASECLASS_SYMBOLS
|
|
};
|
|
}
|
|
|
|
#endif //FLANN_LSH_INDEX_H_
|