/*********************************************************************** * Software License Agreement (BSD License) * * Copyright 2008-2011 Marius Muja (mariusm@cs.ubc.ca). All rights reserved. * Copyright 2008-2011 David G. Lowe (lowe@cs.ubc.ca). All rights reserved. * * THE BSD LICENSE * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************************/ #ifndef FLANN_HIERARCHICAL_CLUSTERING_INDEX_H_ #define FLANN_HIERARCHICAL_CLUSTERING_INDEX_H_ #include #include #include #include #include #include #ifndef SIZE_MAX #define SIZE_MAX ((size_t) -1) #endif #include "FLANN/general.h" #include "FLANN/algorithms/nn_index.h" #include "FLANN/algorithms/dist.h" #include "FLANN/util/matrix.h" #include "FLANN/util/result_set.h" #include "FLANN/util/heap.h" #include "FLANN/util/allocator.h" #include "FLANN/util/random.h" #include "FLANN/util/saving.h" #include "FLANN/util/serialization.h" namespace flann { struct HierarchicalClusteringIndexParams : public IndexParams { HierarchicalClusteringIndexParams(int branching = 32, flann_centers_init_t centers_init = FLANN_CENTERS_RANDOM, int trees = 4, int leaf_max_size = 100) { (*this)["algorithm"] = FLANN_INDEX_HIERARCHICAL; // The branching factor used in the hierarchical clustering (*this)["branching"] = branching; // Algorithm used for picking the initial cluster centers (*this)["centers_init"] = centers_init; // number of parallel trees to build (*this)["trees"] = trees; // maximum leaf size (*this)["leaf_max_size"] = leaf_max_size; } }; /** * Hierarchical index * * Contains a tree constructed through a hierarchical clustering * and other information for indexing a set of points for nearest-neighbour matching. */ template class HierarchicalClusteringIndex : public NNIndex { public: typedef typename Distance::ElementType ElementType; typedef typename Distance::ResultType DistanceType; typedef NNIndex BaseClass; /** * Constructor. * * @param index_params * @param d */ HierarchicalClusteringIndex(const IndexParams& index_params = HierarchicalClusteringIndexParams(), Distance d = Distance()) : BaseClass(index_params, d) { memoryCounter_ = 0; branching_ = get_param(index_params_,"branching",32); centers_init_ = get_param(index_params_,"centers_init", FLANN_CENTERS_RANDOM); trees_ = get_param(index_params_,"trees",4); leaf_max_size_ = get_param(index_params_,"leaf_max_size",100); initCenterChooser(); } /** * Index constructor * * Params: * inputData = dataset with the input features * params = parameters passed to the hierarchical k-means algorithm */ HierarchicalClusteringIndex(const Matrix& inputData, const IndexParams& index_params = HierarchicalClusteringIndexParams(), Distance d = Distance()) : BaseClass(index_params, d) { memoryCounter_ = 0; branching_ = get_param(index_params_,"branching",32); centers_init_ = get_param(index_params_,"centers_init", FLANN_CENTERS_RANDOM); trees_ = get_param(index_params_,"trees",4); leaf_max_size_ = get_param(index_params_,"leaf_max_size",100); initCenterChooser(); setDataset(inputData); chooseCenters_->setDataSize(veclen_); } HierarchicalClusteringIndex(const HierarchicalClusteringIndex& other) : BaseClass(other), memoryCounter_(other.memoryCounter_), branching_(other.branching_), trees_(other.trees_), centers_init_(other.centers_init_), leaf_max_size_(other.leaf_max_size_) { initCenterChooser(); tree_roots_.resize(other.tree_roots_.size()); for (size_t i=0;iswap(other); return *this; } void initCenterChooser() { switch(centers_init_) { case FLANN_CENTERS_RANDOM: chooseCenters_ = new RandomCenterChooser(distance_, points_); break; case FLANN_CENTERS_GONZALES: chooseCenters_ = new GonzalesCenterChooser(distance_, points_); break; case FLANN_CENTERS_KMEANSPP: chooseCenters_ = new KMeansppCenterChooser(distance_, points_); break; case FLANN_CENTERS_GROUPWISE: chooseCenters_ = new GroupWiseCenterChooser(distance_, points_); break; default: throw FLANNException("Unknown algorithm for choosing initial centers."); } } /** * Index destructor. * * Release the memory used by the index. */ virtual ~HierarchicalClusteringIndex() { delete chooseCenters_; freeIndex(); } BaseClass* clone() const { return new HierarchicalClusteringIndex(*this); } /** * Computes the inde memory usage * Returns: memory used by the index */ int usedMemory() const { return pool_.usedMemory+pool_.wastedMemory+memoryCounter_; } using BaseClass::buildIndex; void addPoints(const Matrix& points, float rebuild_threshold = 2) { assert(points.cols==veclen_); size_t old_size = size_; extendDataset(points); if (rebuild_threshold>1 && size_at_build_*rebuild_threshold void serialize(Archive& ar) { ar.setObject(this); ar & *static_cast*>(this); ar & branching_; ar & trees_; ar & centers_init_; ar & leaf_max_size_; if (Archive::is_loading::value) { tree_roots_.resize(trees_); } for (size_t i=0;i& result, const ElementType* vec, const SearchParams& searchParams) const { if (removed_) { findNeighborsWithRemoved(result, vec, searchParams); } else { findNeighborsWithRemoved(result, vec, searchParams); } } protected: /** * Builds the index */ void buildIndexImpl() { chooseCenters_->setDataSize(veclen_); if (branching_<2) { throw FLANNException("Branching factor must be at least 2"); } tree_roots_.resize(trees_); std::vector indices(size_); for (int i=0; i void serialize(Archive& ar) { typedef HierarchicalClusteringIndex Index; Index* obj = static_cast(ar.getObject()); ar & index; // ar & point; if (Archive::is_loading::value) { point = obj->points_[index]; } } friend struct serialization::access; }; /** * Struture representing a node in the hierarchical k-means tree. */ struct Node { /** * The cluster center */ ElementType* pivot; size_t pivot_index; /** * Child nodes (only for non-terminal nodes) */ std::vector childs; /** * Node points (only for terminal nodes) */ std::vector points; Node(){ pivot = NULL; pivot_index = SIZE_MAX; } /** * destructor * calling Node destructor explicitly */ ~Node() { for(size_t i=0; i~Node(); pivot = NULL; pivot_index = -1; } }; private: template void serialize(Archive& ar) { typedef HierarchicalClusteringIndex Index; Index* obj = static_cast(ar.getObject()); ar & pivot_index; if (Archive::is_loading::value) { if (pivot_index != SIZE_MAX) pivot = obj->points_[pivot_index]; else pivot = NULL; } size_t childs_size; if (Archive::is_saving::value) { childs_size = childs.size(); } ar & childs_size; if (childs_size==0) { ar & points; } else { if (Archive::is_loading::value) { childs.resize(childs_size); } for (size_t i=0;ipool_) Node(); } ar & *childs[i]; } } } friend struct serialization::access; }; typedef Node* NodePtr; /** * Alias definition for a nicer syntax. */ typedef BranchStruct BranchSt; /** * Clears Node tree * calling Node destructor explicitly */ void freeIndex(){ for (size_t i=0; i~Node(); } pool_.free(); } void copyTree(NodePtr& dst, const NodePtr& src) { dst = new(pool_) Node(); dst->pivot_index = src->pivot_index; dst->pivot = points_[dst->pivot_index]; if (src->childs.size()==0) { dst->points = src->points; } else { dst->childs.resize(src->childs.size()); for (size_t i=0;ichilds.size();++i) { copyTree(dst->childs[i], src->childs[i]); } } } void computeLabels(int* indices, int indices_length, int* centers, int centers_length, int* labels, DistanceType& cost) { cost = 0; for (int i=0; inew_dist) { labels[i] = j; dist = new_dist; } } cost += dist; } } /** * The method responsible with actually doing the recursive hierarchical * clustering * * Params: * node = the node to cluster * indices = indices of the points belonging to the current node * branching = the branching factor to use in the clustering * */ void computeClustering(NodePtr node, int* indices, int indices_length) { if (indices_length < leaf_max_size_) { // leaf node node->points.resize(indices_length); for (int i=0;ipoints[i].index = indices[i]; node->points[i].point = points_[indices[i]]; } node->childs.clear(); return; } std::vector centers(branching_); std::vector labels(indices_length); int centers_length; (*chooseCenters_)(branching_, indices, indices_length, ¢ers[0], centers_length); if (centers_lengthpoints.resize(indices_length); for (int i=0;ipoints[i].index = indices[i]; node->points[i].point = points_[indices[i]]; } node->childs.clear(); return; } // assign points to clusters DistanceType cost; computeLabels(indices, indices_length, ¢ers[0], centers_length, &labels[0], cost); node->childs.resize(branching_); int start = 0; int end = start; for (int i=0; ichilds[i] = new(pool_) Node(); node->childs[i]->pivot_index = centers[i]; node->childs[i]->pivot = points_[centers[i]]; node->childs[i]->points.clear(); computeClustering(node->childs[i],indices+start, end-start); start=end; } } template void findNeighborsWithRemoved(ResultSet& result, const ElementType* vec, const SearchParams& searchParams) const { int maxChecks = searchParams.checks; // Priority queue storing intermediate branches in the best-bin-first search Heap* heap = new Heap(size_); DynamicBitset checked(size_); int checks = 0; for (int i=0; i(tree_roots_[i], result, vec, checks, maxChecks, heap, checked); } BranchSt branch; while (heap->popMin(branch) && (checks(node, result, vec, checks, maxChecks, heap, checked); } delete heap; } /** * Performs one descent in the hierarchical k-means tree. The branches not * visited are stored in a priority queue. * * Params: * node = node to explore * result = container for the k-nearest neighbors found * vec = query points * checks = how many points in the dataset have been checked so far * maxChecks = maximum dataset points to checks */ template void findNN(NodePtr node, ResultSet& result, const ElementType* vec, int& checks, int maxChecks, Heap* heap, DynamicBitset& checked) const { if (node->childs.empty()) { if (checks>=maxChecks) { if (result.full()) return; } for (size_t i=0; ipoints.size(); ++i) { PointInfo& pointInfo = node->points[i]; if (with_removed) { if (removed_points_.test(pointInfo.index)) continue; } if (checked.test(pointInfo.index)) continue; DistanceType dist = distance_(pointInfo.point, vec, veclen_); result.addPoint(dist, pointInfo.index); checked.set(pointInfo.index); ++checks; } } else { DistanceType* domain_distances = new DistanceType[branching_]; int best_index = 0; domain_distances[best_index] = distance_(vec, node->childs[best_index]->pivot, veclen_); for (int i=1; ichilds[i]->pivot, veclen_); if (domain_distances[i]insert(BranchSt(node->childs[i],domain_distances[i])); } } delete[] domain_distances; findNN(node->childs[best_index],result,vec, checks, maxChecks, heap, checked); } } void addPointToTree(NodePtr node, size_t index) { ElementType* point = points_[index]; if (node->childs.empty()) { // leaf node PointInfo pointInfo; pointInfo.point = point; pointInfo.index = index; node->points.push_back(pointInfo); if (node->points.size()>=size_t(branching_)) { std::vector indices(node->points.size()); for (size_t i=0;ipoints.size();++i) { indices[i] = node->points[i].index; } computeClustering(node, &indices[0], indices.size()); } } else { // find the closest child int closest = 0; ElementType* center = node->childs[closest]->pivot; DistanceType dist = distance_(center, point, veclen_); for (size_t i=1;ichilds[i]->pivot; DistanceType crt_dist = distance_(center, point, veclen_); if (crt_distchilds[closest], index); } } void swap(HierarchicalClusteringIndex& other) { BaseClass::swap(other); std::swap(tree_roots_, other.tree_roots_); std::swap(pool_, other.pool_); std::swap(memoryCounter_, other.memoryCounter_); std::swap(branching_, other.branching_); std::swap(trees_, other.trees_); std::swap(centers_init_, other.centers_init_); std::swap(leaf_max_size_, other.leaf_max_size_); std::swap(chooseCenters_, other.chooseCenters_); } private: /** * The root nodes in the tree. */ std::vector tree_roots_; /** * Pooled memory allocator. * * Using a pooled memory allocator is more efficient * than allocating memory directly when there is a large * number small of memory allocations. */ PooledAllocator pool_; /** * Memory occupied by the index. */ int memoryCounter_; /** index parameters */ /** * Branching factor to use for clustering */ int branching_; /** * How many parallel trees to build */ int trees_; /** * Algorithm to use for choosing cluster centers */ flann_centers_init_t centers_init_; /** * Max size of leaf nodes */ int leaf_max_size_; /** * Algorithm used to choose initial centers */ CenterChooser* chooseCenters_; USING_BASECLASS_SYMBOLS }; } #endif /* FLANN_HIERARCHICAL_CLUSTERING_INDEX_H_ */