/**********************************************************************
 * Software License Agreement (BSD License)
 *
 * Copyright 2011 Andreas Muetzel (amuetzel@uni-koblenz.de). All rights reserved.
 *
 * THE BSD LICENSE
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *************************************************************************/

#ifndef FLANN_UTIL_CUDA_RESULTSET_H
#define FLANN_UTIL_CUDA_RESULTSET_H

// Provides flann::cuda::heap::make_heap, flann::cuda::heap::sift_down and flann::cuda::swap used below.
#include <flann/util/cuda/heap.h>
#include <limits>

__device__ __forceinline__
float infinity()
{
    return __int_as_float(0x7f800000);
}

#ifndef INFINITY
#define INFINITY infinity()
#endif

namespace flann
{
namespace cuda
{

//! result set for the 1nn search. Doesn't do any global memory accesses on its own;
//! the best match is kept in registers and only written to the result buffers in finish().
template< typename DistanceType >
struct SingleResultSet
{
    int bestIndex;
    DistanceType bestDist;
    const DistanceType epsError;

    __device__ __host__
    SingleResultSet( DistanceType eps ) : bestIndex(-1), bestDist(INFINITY), epsError(eps){ }

    __device__
    inline float
    worstDist()
    {
        return bestDist;
    }

    __device__
    inline void
    insert(int index, DistanceType dist)
    {
        if( dist <= bestDist ) {
            bestIndex=index;
            bestDist=dist;
        }
    }

    DistanceType* resultDist;
    int* resultIndex;

    __device__
    inline void
    setResultLocation( DistanceType* dists, int* index, int thread, int stride )
    {
        resultDist=dists+thread*stride;
        resultIndex=index+thread*stride;
        if( stride != 1 ) {
            for( int i=1; i<stride; i++ ) {
                resultDist[i]=INFINITY;
                resultIndex[i]=-1;
            }
        }
    }

    __device__
    inline void
    finish()
    {
        resultDist[0]=bestDist;
        resultIndex[0]=bestIndex;
    }
};

template< typename DistanceType >
struct GreaterThan
{
    __device__
    bool operator()(DistanceType a, DistanceType b)
    {
        return a>b;
    }
};

// Using this and the template uses 2 or 3 registers more than the direct implementation in the kNearestKernel, but
// there is no speed difference.
// Setting useHeap as a template parameter leads to a whole lot of things being
// optimized away by nvcc.
// Register counts are the same as when removing not-needed variables in explicit specializations,
// and the "if( useHeap )" branches are eliminated at compile time.
// The downside of this: a bit more complex kernel launch code.
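//
// Illustrative sketch of that launch code (an assumption for illustration only, not FLANN's
// actual API): because useHeap is a template parameter, the host side branches once on a
// runtime flag and instantiates the kernel twice, so the "if( useHeap )" branches inside the
// result set are resolved at compile time. kNearestKernel is named in the comment above, but
// the signature and launch configuration below are hypothetical.
//
//   template< bool useHeap >
//   __global__ void kNearestKernel( const float* queries, float* dists, int* indices, int n, int k );
//
//   void launchKnnSearch( bool use_heap, const float* queries, float* dists, int* indices, int n, int k )
//   {
//       dim3 block( 256 );
//       dim3 grid( (n + block.x - 1) / block.x );
//       if( use_heap ) kNearestKernel<true><<<grid, block>>>( queries, dists, indices, n, k );
//       else           kNearestKernel<false><<<grid, block>>>( queries, dists, indices, n, k );
//   }
//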
template< typename DistanceType, bool useHeap >
struct KnnResultSet
{
    int foundNeighbors;
    DistanceType largestHeapDist;
    int maxDistIndex;
    const int k;
    const bool sorted;
    const DistanceType epsError;

    __device__ __host__
    KnnResultSet(int knn, bool sortResults, DistanceType eps) : foundNeighbors(0),largestHeapDist(INFINITY),k(knn), sorted(sortResults), epsError(eps){ }

    //     __host__ __device__
    //     KnnResultSet(const KnnResultSet& o):foundNeighbors(o.foundNeighbors),largestHeapDist(o.largestHeapDist),k(o.k){ }

    __device__
    inline DistanceType
    worstDist()
    {
        return largestHeapDist;
    }

    __device__
    inline void
    insert(int index, DistanceType dist)
    {
        if( foundNeighbors<k ) {
            resultDist[foundNeighbors]=dist;
            resultIndex[foundNeighbors]=index;
            if( foundNeighbors==k-1 ) {
                if( useHeap ) {
                    flann::cuda::heap::make_heap(resultDist,resultIndex,k,GreaterThan<DistanceType>());
                    largestHeapDist=resultDist[0];
                }
                else {
                    findLargestDistIndex();
                }
            }
            foundNeighbors++;
        }
        else if( dist < largestHeapDist ) {
            if( useHeap ) {
                resultDist[0]=dist;
                resultIndex[0]=index;
                flann::cuda::heap::sift_down(resultDist,resultIndex,0,k,GreaterThan<DistanceType>());
                largestHeapDist=resultDist[0];
            }
            else {
                resultDist[maxDistIndex]=dist;
                resultIndex[maxDistIndex]=index;
                findLargestDistIndex();
            }
        }
    }

    __device__
    void
    findLargestDistIndex( )
    {
        largestHeapDist=resultDist[0];
        maxDistIndex=0;
        for( int i=1; i<k; i++ )
            if( resultDist[i] > largestHeapDist ) {
                maxDistIndex=i;
                largestHeapDist=resultDist[i];
            }
    }

    float* resultDist;
    int* resultIndex;

    __device__
    inline void
    setResultLocation( DistanceType* dists, int* index, int thread, int stride )
    {
        resultDist=dists+stride*thread;
        resultIndex=index+stride*thread;
        for( int i=0; i<stride; i++ ) {
            resultDist[i]=INFINITY;
            resultIndex[i]=-1;
        }
    }

    __device__
    inline void
    finish()
    {
        if( sorted ) {
            if( !useHeap ) flann::cuda::heap::make_heap(resultDist,resultIndex,k,GreaterThan<DistanceType>());
            for( int i=k-1; i>0; i-- ) {
                flann::cuda::swap( resultDist[0], resultDist[i] );
                flann::cuda::swap( resultIndex[0], resultIndex[i] );
                flann::cuda::heap::sift_down( resultDist,resultIndex, 0, i, GreaterThan<DistanceType>() );
            }
        }
    }
};

template< typename DistanceType >
struct CountingRadiusResultSet
{
    int count_;
    DistanceType radius_sq_;
    int max_neighbors_;

    __device__ __host__
    CountingRadiusResultSet(DistanceType radius, int max_neighbors) : count_(0),radius_sq_(radius), max_neighbors_(max_neighbors){ }

    __device__
    inline DistanceType
    worstDist()
    {
        return radius_sq_;
    }

    __device__
    inline void
    insert(int index, float dist)
    {
        if( dist < radius_sq_ ) {
            count_++;
        }
    }

    int* resultIndex;

    __device__
    inline void
    setResultLocation( DistanceType* /*dists*/, int* count, int thread, int stride )
    {
        resultIndex=count+thread*stride;
    }

    __device__
    inline void
    finish()
    {
        if( ( max_neighbors_<=0 ) || ( count_<=max_neighbors_ ) ) resultIndex[0]=count_;
        else resultIndex[0]=max_neighbors_;
    }
};

template< typename DistanceType, bool useHeap >
struct RadiusKnnResultSet
{
    int foundNeighbors;
    DistanceType largestHeapDist;
    int maxDistElem;
    const int k;
    const bool sorted;
    const DistanceType radius_sq_;
    int* segment_starts_;
    //     int count_;

    __device__ __host__
    RadiusKnnResultSet(DistanceType radius, int knn, int* segment_starts, bool sortResults) : foundNeighbors(0),largestHeapDist(radius),k(knn), sorted(sortResults), radius_sq_(radius),segment_starts_(segment_starts) { }

    //     __host__ __device__
    //     KnnResultSet(const KnnResultSet& o):foundNeighbors(o.foundNeighbors),largestHeapDist(o.largestHeapDist),k(o.k){ }

    __device__
    inline DistanceType
    worstDist()
    {
        return largestHeapDist;
    }

    __device__
    inline void
    insert(int index, DistanceType dist)
    {
        if( dist < radius_sq_ ) {
            if( foundNeighbors<k ) {
                resultDist[foundNeighbors]=dist;
                resultIndex[foundNeighbors]=index;
                if( foundNeighbors==k-1 ) {
                    if( useHeap ) {
                        flann::cuda::heap::make_heap(resultDist,resultIndex,k,GreaterThan<DistanceType>());
                        largestHeapDist=resultDist[0];
                    }
                    else {
                        findLargestDistIndex();
                    }
                }
                foundNeighbors++;
            }
            else if( dist < largestHeapDist ) {
                if( useHeap ) {
                    resultDist[0]=dist;
                    resultIndex[0]=index;
                    flann::cuda::heap::sift_down(resultDist,resultIndex,0,k,GreaterThan<DistanceType>());
                    largestHeapDist=resultDist[0];
                }
                else {
                    resultDist[maxDistElem]=dist;
                    resultIndex[maxDistElem]=index;
                    findLargestDistIndex();
                }
            }
        }
    }
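
    // Linear scan over the k stored candidates to recompute the worst (largest) distance
    // and remember which slot holds it; only used on the non-heap (useHeap == false) path.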
    __device__
    void
    findLargestDistIndex( )
    {
        largestHeapDist=resultDist[0];
        maxDistElem=0;
        for( int i=1; i<k; i++ )
            if( resultDist[i] > largestHeapDist ) {
                maxDistElem=i;
                largestHeapDist=resultDist[i];
            }
    }

    DistanceType* resultDist;
    int* resultIndex;

    __device__
    inline void
    setResultLocation( DistanceType* dists, int* index, int thread, int /*stride*/ )
    {
        resultDist=dists+segment_starts_[thread];
        resultIndex=index+segment_starts_[thread];
    }

    __device__
    inline void
    finish()
    {
        if( sorted ) {
            if( !useHeap ) flann::cuda::heap::make_heap(resultDist,resultIndex,k,GreaterThan<DistanceType>());
            for( int i=foundNeighbors-1; i>0; i-- ) {
                flann::cuda::swap( resultDist[0], resultDist[i] );
                flann::cuda::swap( resultIndex[0], resultIndex[i] );
                flann::cuda::heap::sift_down( resultDist,resultIndex, 0, i, GreaterThan<DistanceType>() );
            }
        }
    }
};

// Difference to RadiusKnnResultSet: works like KnnResultSet, i.e. it doesn't pack the results densely (as the RadiusResultSet does).
template< typename DistanceType, bool useHeap >
struct KnnRadiusResultSet
{
    int foundNeighbors;
    DistanceType largestHeapDist;
    int maxDistIndex;
    const int k;
    const bool sorted;
    const DistanceType epsError;
    const DistanceType radius_sq;

    __device__ __host__
    KnnRadiusResultSet(int knn, bool sortResults, DistanceType eps, DistanceType radius) : foundNeighbors(0),largestHeapDist(radius),k(knn), sorted(sortResults), epsError(eps),radius_sq(radius){ }

    //     __host__ __device__
    //     KnnResultSet(const KnnResultSet& o):foundNeighbors(o.foundNeighbors),largestHeapDist(o.largestHeapDist),k(o.k){ }

    __device__
    inline DistanceType
    worstDist()
    {
        return largestHeapDist;
    }

    __device__
    inline void
    insert(int index, DistanceType dist)
    {
        if( dist < largestHeapDist ) {
            if( foundNeighbors<k ) {
                resultDist[foundNeighbors]=dist;
                resultIndex[foundNeighbors]=index;
                if( foundNeighbors==k-1 ) {
                    if( useHeap ) {
                        flann::cuda::heap::make_heap(resultDist,resultIndex,k,GreaterThan<DistanceType>());
                        largestHeapDist=resultDist[0];
                    }
                    else {
                        findLargestDistIndex();
                    }
                }
                foundNeighbors++;
            }
            else { //if( dist < largestHeapDist )
                if( useHeap ) {
                    resultDist[0]=dist;
                    resultIndex[0]=index;
                    flann::cuda::heap::sift_down(resultDist,resultIndex,0,k,GreaterThan<DistanceType>());
                    largestHeapDist=resultDist[0];
                }
                else {
                    resultDist[maxDistIndex]=dist;
                    resultIndex[maxDistIndex]=index;
                    findLargestDistIndex();
                }
            }
        }
    }

    __device__
    void
    findLargestDistIndex( )
    {
        largestHeapDist=resultDist[0];
        maxDistIndex=0;
        for( int i=1; i<k; i++ )
            if( resultDist[i] > largestHeapDist ) {
                maxDistIndex=i;
                largestHeapDist=resultDist[i];
            }
    }

    DistanceType* resultDist;
    int* resultIndex;

    __device__
    inline void
    setResultLocation( DistanceType* dists, int* index, int thread, int stride )
    {
        resultDist=dists+stride*thread;
        resultIndex=index+stride*thread;
        for( int i=0; i<stride; i++ ) {
            resultDist[i]=INFINITY;
            resultIndex[i]=-1;
        }
    }

    __device__
    inline void
    finish()
    {
        if( sorted ) {
            if( !useHeap ) flann::cuda::heap::make_heap(resultDist,resultIndex,k,GreaterThan<DistanceType>());
            for( int i=k-1; i>0; i-- ) {
                flann::cuda::swap( resultDist[0], resultDist[i] );
                flann::cuda::swap( resultIndex[0], resultIndex[i] );
                flann::cuda::heap::sift_down( resultDist,resultIndex, 0, i, GreaterThan<DistanceType>() );
            }
        }
    }
};

//! fills the radius output buffer.
//! IMPORTANT ASSERTION: ASSUMES THAT THERE IS ENOUGH SPACE FOR EVERY NEIGHBOR! IF THIS ISN'T
//! TRUE, USE KnnRadiusResultSet! (Otherwise, the neighbors of one element might overflow into the next element, or past the end of the buffer.)
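//
// One way to guarantee enough space per query (an assumption about typical usage, not
// something this header prescribes) is to run a counting pass first, e.g. with
// CountingRadiusResultSet, and then take an exclusive prefix sum over the per-query counts
// so that segment_starts_[i] marks where query i's output segment begins:
//
//   #include <thrust/device_vector.h>
//   #include <thrust/scan.h>
//
//   thrust::device_vector<int> counts( num_queries );          // filled by the counting pass
//   thrust::device_vector<int> segment_starts( num_queries );
//   thrust::exclusive_scan( counts.begin(), counts.end(), segment_starts.begin() );
//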
template< typename DistanceType >
struct RadiusResultSet
{
    DistanceType radius_sq_;
    int* segment_starts_;
    int count_;
    bool sorted_;

    __device__ __host__
    RadiusResultSet(DistanceType radius, int* segment_starts, bool sorted) : radius_sq_(radius), segment_starts_(segment_starts), count_(0), sorted_(sorted){ }

    __device__
    inline DistanceType
    worstDist()
    {
        return radius_sq_;
    }

    __device__
    inline void
    insert(int index, DistanceType dist)
    {
        if( dist < radius_sq_ ) {
            resultIndex[count_]=index;
            resultDist[count_]=dist;
            count_++;
        }
    }

    int* resultIndex;
    DistanceType* resultDist;

    __device__
    inline void
    setResultLocation( DistanceType* dists, int* index, int thread, int /*stride*/ )
    {
        resultIndex=index+segment_starts_[thread];
        resultDist=dists+segment_starts_[thread];
    }

    __device__
    inline void
    finish()
    {
        if( sorted_ ) {
            flann::cuda::heap::make_heap( resultDist,resultIndex, count_, GreaterThan<DistanceType>() );
            for( int i=count_-1; i>0; i-- ) {
                flann::cuda::swap( resultDist[0], resultDist[i] );
                flann::cuda::swap( resultIndex[0], resultIndex[i] );
                flann::cuda::heap::sift_down( resultDist,resultIndex, 0, i, GreaterThan<DistanceType>() );
            }
        }
    }
};

}
}

#endif // FLANN_UTIL_CUDA_RESULTSET_H