/***********************************************************************
 * Software License Agreement (BSD License)
 *
 * Copyright 2011  Andreas Muetzel (amuetzel@uni-koblenz.de). All rights reserved.
 *
 * THE BSD LICENSE
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *************************************************************************/

#ifndef FLANN_CUDA_KD_TREE_BUILDER_H_
#define FLANN_CUDA_KD_TREE_BUILDER_H_

// NOTE: include list reconstructed from the Thrust/CUDA calls used in this file.
#include <cuda_runtime.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/copy.h>
#include <thrust/sort.h>
#include <thrust/scan.h>
#include <thrust/scatter.h>
#include <thrust/transform.h>
#include <thrust/unique.h>
#include <thrust/for_each.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <flann/util/cutil_math.h>   // float4 arithmetic (operator+, operator-, operator*=)
#include <iostream>
#include <algorithm>                 // std::swap

// #define PRINT_DEBUG_TIMING

namespace flann
{
// template< typename T >
// void print_vector( const thrust::device_vector<T>& v )
// {
//     for( int i=0; i< v.size(); i++ )
//     {
//         std::cout<<v[i]<<std::endl;
//     }
// }
//
// template< typename T1, typename T2 >
// void print_vector( const thrust::device_vector<T1>& v1, const thrust::device_vector<T2>& v2 )
// {
//     for( int i=0; i< v1.size(); i++ )
//     {
//         std::cout<<v1[i]<<" "<<v2[i]<<std::endl;
//     }
// }
//
// template< typename T1, typename T2, typename T3 >
// void print_vector( const thrust::device_vector<T1>& v1, const thrust::device_vector<T2>& v2, const thrust::device_vector<T3>& v3 )
// {
//     for( int i=0; i< v1.size(); i++ )
//     {
//         std::cout<<v1[i]<<" "<<v2[i]<<" "<<v3[i]<<std::endl;
//     }
// }
//
// template< typename T >
// void print_vector_by_index( const thrust::device_vector<T>& v, const thrust::device_vector<int>& ind )
// {
//     for( int i=0; i< v.size(); i++ )
//     {
//         std::cout<<v[ind[i]]<<" ";
//     }
//     std::cout<<std::endl;
// }

namespace cuda
{
namespace kd_tree_builder_detail
{

//! split node: contains the split dimension and value
//! leaf node: left == index of the first point, right == one past the index of the last point
struct SplitInfo
{
    union {
        struct
        {
            // begin/end of the point range of a leaf node
            int left;
            int right;
        };
        struct
        {
            int split_dim;
            float split_val;
        };
    };
};

//! returns true for tuples whose second element (the node index) is even
struct IsEven
{
    __host__ __device__
    bool operator()( const thrust::tuple<int,int>& i )
    {
        return (thrust::get<1>(i)& 1)==0;
    }
};

//! just for convenience: access a float4 by an index in [0,1,2]
//! (casting it to a float* and accessing it by the index is way slower...)
__host__ __device__
float get_value_by_index( const float4& f, int i )
{
    switch(i) {
    case 0:
        return f.x;
    case 1:
        return f.y;
    default:
        return f.z;
    }
}

//! mark a point as belonging to the left or right child of its current parent
//! called after parents are split
struct MovePointsToChildNodes
{
    MovePointsToChildNodes( int* child1, SplitInfo* splits, float* x, float* y, float* z, int* ox, int* oy, int* oz, int* lrx, int* lry, int* lrz )
        : child1_(child1), splits_(splits), x_(x), y_(y), z_(z), ox_(ox), oy_(oy), oz_(oz), lrx_(lrx), lry_(lry), lrz_(lrz){}

    //  int dim;
    //  float threshold;

    int* child1_;
    SplitInfo* splits_;
    // coordinate values
    float* x_, * y_, * z_;
    // owner indices -> which node does the point belong to?
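    // (one owner array per coordinate-sorted index array; before a split, the entry at a given
    // array position is the same node id in all three of them)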
    int* ox_, * oy_, * oz_;
    // temp info: will be set to 1 if a point is moved to the right child node, 0 otherwise
    // (used later in the scan op to separate the points of the children into continuous ranges)
    int* lrx_, * lry_, * lrz_;

    __device__
    void operator()( const thrust::tuple<int, int, int, int>& data )
    {
        int index = thrust::get<0>(data);
        int owner = ox_[index]; // before a split, all points at the same position in the index arrays have the same owner
        int point_ind1=thrust::get<1>(data);
        int point_ind2=thrust::get<2>(data);
        int point_ind3=thrust::get<3>(data);
        int leftChild=child1_[owner];
        int split_dim;
        float dim_val1, dim_val2, dim_val3;
        SplitInfo split;
        lrx_[index]=0;
        lry_[index]=0;
        lrz_[index]=0;
        // this element already belongs to a leaf node -> everything alright, no need to change anything
        if( leftChild==-1 ) {
            return;
        }

        // otherwise: load split data, and assign this index to the new owner
        split = splits_[owner];
        split_dim=split.split_dim;
        switch( split_dim ) {
        case 0:
            dim_val1=x_[point_ind1];
            dim_val2=x_[point_ind2];
            dim_val3=x_[point_ind3];
            break;
        case 1:
            dim_val1=y_[point_ind1];
            dim_val2=y_[point_ind2];
            dim_val3=y_[point_ind3];
            break;
        default:
            dim_val1=z_[point_ind1];
            dim_val2=z_[point_ind2];
            dim_val3=z_[point_ind3];
            break;
        }

        int r1=leftChild +(dim_val1 > split.split_val);
        ox_[index]=r1;
        int r2=leftChild+(dim_val2 > split.split_val);
        oy_[index]=r2;
        oz_[index]=leftChild+(dim_val3 > split.split_val);

        lrx_[index] = (dim_val1 > split.split_val);
        lry_[index] = (dim_val2 > split.split_val);
        lrz_[index] = (dim_val3 > split.split_val);
        //      return thrust::make_tuple( r1, r2, leftChild+(dim_val > split.split_val) );
    }
};

//! used to update the left/right pointers and aabb infos after the node splits
struct SetLeftAndRightAndAABB
{
    int maxPoints;
    int nElements;

    SplitInfo* nodes;
    int* counts;
    int* labels;
    float4* aabbMin;
    float4* aabbMax;
    const float* x, * y, * z;
    const int* ix, * iy, * iz;

    __host__ __device__
    void operator()( int i )
    {
        int index=labels[i];
        int right;
        int left = counts[i];
        nodes[index].left=left;
        if( i < nElements-1 ) {
            right=counts[i+1];
        }
        else { // index==nNodes
            right=maxPoints;
        }
        nodes[index].right=right;
        aabbMin[index].x=x[ix[left]];
        aabbMin[index].y=y[iy[left]];
        aabbMin[index].z=z[iz[left]];
        aabbMax[index].x=x[ix[right-1]];
        aabbMax[index].y=y[iy[right-1]];
        aabbMax[index].z=z[iz[right-1]];
    }
};

//! - decide whether a node has to be split
//! if yes:
//! - allocate child nodes
//! - set split axis as axis of maximum aabb length
struct SplitNodes
{
    int maxPointsPerNode;
    int* node_count;
    int* nodes_allocated;
    int* out_of_space;
    int* child1_;
    int* parent_;
    SplitInfo* splits;

    // note: the exact tuple element types were reconstructed; value types suffice here because
    // all persistent writes go through the raw pointers above
    __device__
    void operator()( thrust::tuple<int, int, SplitInfo, float4, float4, int> node ) // (parent, child1, split info, aabbMin, aabbMax, node index)
    {
        int& parent=thrust::get<0>(node);
        int& child1=thrust::get<1>(node);
        SplitInfo& s=thrust::get<2>(node);
        const float4& aabbMin=thrust::get<3>(node);
        const float4& aabbMax=thrust::get<4>(node);
        int my_index = thrust::get<5>(node);
        bool split_node=false;
        // first, each thread block counts the number of nodes that it needs to allocate...
        __shared__ int block_nodes_to_allocate;
        if( threadIdx.x==0 ) block_nodes_to_allocate=0;
        __syncthreads();

        // don't split if all points are equal
        // (could lead to an infinite loop, and doesn't make any sense anyway)
        bool all_points_in_node_are_equal=aabbMin.x == aabbMax.x && aabbMin.y==aabbMax.y && aabbMin.z==aabbMax.z;

        int offset_to_global=0;

        // maybe this could be replaced with a reduction...
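        // a node is split only if it is still a leaf (child1==-1), holds more than
        // maxPointsPerNode points, and its points are not all identical; e.g. if 13 threads
        // of this block pass the test, block_nodes_to_allocate ends up at 26 and a single
        // global atomicAdd later reserves all 26 child slots for the whole block at once.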
        if(( child1==-1) && ( s.right-s.left > maxPointsPerNode) && !all_points_in_node_are_equal ) { // leaf node
            split_node=true;
            offset_to_global = atomicAdd( &block_nodes_to_allocate, 2 );
        }

        __syncthreads();
        __shared__ int block_left;
        __shared__ bool enough_space;
        // ... then the first thread tries to allocate this many nodes...
        if( threadIdx.x==0 ) {
            block_left = atomicAdd( node_count, block_nodes_to_allocate );
            enough_space = block_left+block_nodes_to_allocate < *nodes_allocated;
            // if it doesn't succeed, no nodes will be created by this block
            if( !enough_space ) {
                atomicAdd( node_count, -block_nodes_to_allocate );
                *out_of_space=1;
            }
        }
        __syncthreads();
        // this thread needs to split its node && there was enough space for all the nodes
        // in this block.
        // (The whole "allocate-per-block" thing is much faster than letting each element allocate
        // its space on its own, because shared memory atomics are A LOT faster than
        // global mem atomics!)
        if( split_node && enough_space ) {
            int left = block_left + offset_to_global;

            splits[left].left=s.left;
            splits[left].right=s.right;
            splits[left+1].left=0;
            splits[left+1].right=0;

            // split axis/position: middle of longest aabb extent
            float4 aabbDim=aabbMax-aabbMin;
            int maxDim=0;
            float maxDimLength=aabbDim.x;
            float4 splitVal=(aabbMax+aabbMin);
            splitVal*=0.5f;
            for( int i=1; i<=2; i++ ) {
                float val = get_value_by_index(aabbDim,i);
                if( val > maxDimLength ) {
                    maxDim=i;
                    maxDimLength=val;
                }
            }
            s.split_dim=maxDim;
            s.split_val=get_value_by_index(splitVal,maxDim);

            child1_[my_index]=left;
            splits[my_index]=s;

            parent_[left]=my_index;
            parent_[left+1]=my_index;
            child1_[left]=-1;
            child1_[left+1]=-1;
        }
    }
};

//! computes the scatter target address for the split operation, see Sengupta, Harris, Zhang, Owens: "Scan Primitives for GPU Computing"
//! in my use case, this is about 2x as fast as thrust::partition
struct set_addr3
{
    const int* val_, * f_;

    int npoints_;
    __device__
    int operator()( int id )
    {
        int nf = f_[npoints_-1] + (val_[npoints_-1]);
        int f=f_[id];
        int t = id -f+nf;
        return val_[id] ? f : t;
    }
};
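// Worked example of the address computation above (hypothetical input):
//   val_ (1 = point goes to the right child) = [0, 1, 0, 1]
//   f_   (exclusive scan of val_)            = [0, 0, 1, 1],  nf = 1 + 1 = 2
//   id=0: val=0 -> t = 0-0+2 = 2      id=1: val=1 -> f = 0
//   id=2: val=0 -> t = 2-1+2 = 3      id=3: val=1 -> f = 1
// i.e. the marked points are packed (stably) in front of the unmarked ones,
// which keeps the points of every node in one contiguous range.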
//! converts a float4 point (xyz) to a tuple of three float vals (used to separate the
//! float4 input buffer into three arrays in the beginning of the tree build)
struct pointxyz_to_px_py_pz
{
    __device__
    thrust::tuple<float, float, float> operator()( const float4& val )
    {
        return thrust::make_tuple(val.x, val.y, val.z);
    }
};

} // namespace kd_tree_builder_detail
} // namespace cuda

std::ostream& operator <<(std::ostream& stream, const cuda::kd_tree_builder_detail::SplitInfo& s)
{
    stream<<"(split l/r: "<< s.left <<" "<< s.right<< " split:"<<s.split_dim<<" "<<s.split_val<<")";
    return stream;
}

//! Builds a kd-tree on the GPU by repeatedly splitting nodes at the middle of their longest AABB axis.
class CudaKdTreeBuilder
{
public:
    CudaKdTreeBuilder( const thrust::device_vector<float4>& points, int max_leaf_size ) : /*out_of_space_(1,0),node_count_(1,1),*/ max_leaf_size_(max_leaf_size)
    {
        points_=&points;
        int prealloc = points.size()/max_leaf_size_*16;
        allocation_info_.resize(3);
        allocation_info_[NodeCount]=1;
        allocation_info_[NodesAllocated]=prealloc;
        allocation_info_[OutOfSpace]=0;

        //      std::cout<<prealloc<<" "<<points_->size()<<std::endl;

        child1_=new thrust::device_vector<int>(prealloc,-1);
        parent_=new thrust::device_vector<int>(prealloc,-1);

        cuda::kd_tree_builder_detail::SplitInfo s;
        s.left=0;
        s.right=0;
        splits_=new thrust::device_vector<cuda::kd_tree_builder_detail::SplitInfo>(prealloc,s);
        s.right=points.size();
        (*splits_)[0]=s;

        aabb_min_=new thrust::device_vector<float4>(prealloc);
        aabb_max_=new thrust::device_vector<float4>(prealloc);

        index_x_=new thrust::device_vector<int>(points_->size());
        index_y_=new thrust::device_vector<int>(points_->size());
        index_z_=new thrust::device_vector<int>(points_->size());

        owners_x_=new thrust::device_vector<int>(points_->size(),0);
        owners_y_=new thrust::device_vector<int>(points_->size(),0);
        owners_z_=new thrust::device_vector<int>(points_->size(),0);

        leftright_x_ = new thrust::device_vector<int>(points_->size(),0);
        leftright_y_ = new thrust::device_vector<int>(points_->size(),0);
        leftright_z_ = new thrust::device_vector<int>(points_->size(),0);

        tmp_index_=new thrust::device_vector<int>(points_->size());
        tmp_owners_=new thrust::device_vector<int>(points_->size());
        tmp_misc_=new thrust::device_vector<int>(points_->size());

        points_x_=new thrust::device_vector<float>(points_->size());
        points_y_=new thrust::device_vector<float>(points_->size());
        points_z_=new thrust::device_vector<float>(points_->size());
        delete_node_info_=false;
    }

    ~CudaKdTreeBuilder()
    {
        // the node info vectors (and index_x_) are only freed if this builder still owns them
        if( delete_node_info_ ) {
            delete child1_;
            delete parent_;
            delete splits_;
            delete aabb_min_;
            delete aabb_max_;
            delete index_x_;
        }

        delete index_y_;
        delete index_z_;

        delete owners_x_;
        delete owners_y_;
        delete owners_z_;

        delete points_x_;
        delete points_y_;
        delete points_z_;

        delete leftright_x_;
        delete leftright_y_;
        delete leftright_z_;

        delete tmp_index_;
        delete tmp_owners_;
        delete tmp_misc_;
    }

    //! build the tree
    //! general idea:
    //! - build sorted lists of the points in x, y and z order (to be able to compute tight AABBs in O(1) )
    //! - while( nodes to split exist )
    //!    - split non-child nodes along longest axis if the number of points is > max_points_per_node
    //!    - for each point: determine whether it is in a node that was split. If yes, mark it as
    //!      belonging to the left or right child node of its current parent node
    //!    - reorder the points so that the points of a single node are continuous in the node array
    //!    - update the left/right pointers and AABBs of all nodes
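    //!
    //! Minimal usage sketch (assumes the caller already has the points in a
    //! thrust::device_vector<float4>; variable names are illustrative only):
    //!
    //!   thrust::device_vector<float4> points = ...;   // xyz coordinates, w unused
    //!   CudaKdTreeBuilder builder( points, 64 );      // at most 64 points per leaf
    //!   builder.buildTree();
    //!   // afterwards the node vectors (child1_, splits_, aabb_min_, aabb_max_, index_x_)
    //!   // describe the tree; KDTreeCuda3dIndex accesses them as a friend class.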
    void buildTree()
    {
        //      std::cout<<"buildTree()"<<std::endl;
        thrust::transform( points_->begin(), points_->end(),
                           thrust::make_zip_iterator(thrust::make_tuple(points_x_->begin(), points_y_->begin(), points_z_->begin()) ),
                           cuda::kd_tree_builder_detail::pointxyz_to_px_py_pz() );
        thrust::counting_iterator<int> it(0);
        thrust::copy( it, it+points_->size(), index_x_->begin() );
        thrust::copy( index_x_->begin(), index_x_->end(), index_y_->begin() );
        thrust::copy( index_x_->begin(), index_x_->end(), index_z_->begin() );

        thrust::device_vector<float> tmpv(points_->size());

        // create sorted index list -> can be used to compute AABBs in O(1)
        thrust::copy(points_x_->begin(), points_x_->end(), tmpv.begin());
        thrust::sort_by_key( tmpv.begin(), tmpv.end(), index_x_->begin() );

        thrust::copy(points_y_->begin(), points_y_->end(), tmpv.begin());
        thrust::sort_by_key( tmpv.begin(), tmpv.end(), index_y_->begin() );

        thrust::copy(points_z_->begin(), points_z_->end(), tmpv.begin());
        thrust::sort_by_key( tmpv.begin(), tmpv.end(), index_z_->begin() );

        // AABB of the root node = AABB of all points
        (*aabb_min_)[0]=make_float4((*points_x_)[(*index_x_)[0]], (*points_y_)[(*index_y_)[0]], (*points_z_)[(*index_z_)[0]], 0);
        (*aabb_max_)[0]=make_float4((*points_x_)[(*index_x_)[points_->size()-1]], (*points_y_)[(*index_y_)[points_->size()-1]], (*points_z_)[(*index_z_)[points_->size()-1]], 0);

#ifdef PRINT_DEBUG_TIMING
        cudaDeviceSynchronize();
        std::cout<<" initial stuff done"<<std::endl; // elapsed-time output omitted here and below
#endif

        int last_node_count=1;
        for(;; ) {
            // split all nodes known so far (the functor only allocates children for nodes
            // that are still leaves and hold too many points)
            cuda::kd_tree_builder_detail::SplitNodes sn;
            sn.maxPointsPerNode=max_leaf_size_;
            sn.node_count=thrust::raw_pointer_cast(&allocation_info_[NodeCount]);
            sn.nodes_allocated=thrust::raw_pointer_cast(&allocation_info_[NodesAllocated]);
            sn.out_of_space=thrust::raw_pointer_cast(&allocation_info_[OutOfSpace]);
            sn.child1_=thrust::raw_pointer_cast(&(*child1_)[0]);
            sn.parent_=thrust::raw_pointer_cast(&(*parent_)[0]);
            sn.splits=thrust::raw_pointer_cast(&(*splits_)[0]);
            thrust::counting_iterator<int> cit(0);
            thrust::for_each( thrust::make_zip_iterator(thrust::make_tuple( parent_->begin(), child1_->begin(), splits_->begin(), aabb_min_->begin(), aabb_max_->begin(), cit )),
                              thrust::make_zip_iterator(thrust::make_tuple( parent_->begin()+last_node_count, child1_->begin()+last_node_count, splits_->begin()+last_node_count, aabb_min_->begin()+last_node_count, aabb_max_->begin()+last_node_count, cit+last_node_count )),
                              sn );

            // copy allocation info to host
            thrust::host_vector<int> alloc_info = allocation_info_;

            if( last_node_count == alloc_info[NodeCount] ) { // no more nodes were split -> done
                break;
            }
            last_node_count=alloc_info[NodeCount];

            // a node was un-splittable due to a lack of space
            if( alloc_info[OutOfSpace]==1 ) {
                resize_node_vectors(alloc_info[NodesAllocated]*2);
                alloc_info[OutOfSpace]=0;
                alloc_info[NodesAllocated]*=2;
                allocation_info_=alloc_info;
            }

#ifdef PRINT_DEBUG_TIMING
            cudaDeviceSynchronize();
            std::cout<<" node split done"<<std::endl;
#endif

            // for each point: if its node was split, assign it to the correct child node
            // and remember whether it went to the left or right side
            cuda::kd_tree_builder_detail::MovePointsToChildNodes sno(
                thrust::raw_pointer_cast(&(*child1_)[0]),
                thrust::raw_pointer_cast(&(*splits_)[0]),
                thrust::raw_pointer_cast(&(*points_x_)[0]),
                thrust::raw_pointer_cast(&(*points_y_)[0]),
                thrust::raw_pointer_cast(&(*points_z_)[0]),
                thrust::raw_pointer_cast(&(*owners_x_)[0]),
                thrust::raw_pointer_cast(&(*owners_y_)[0]),
                thrust::raw_pointer_cast(&(*owners_z_)[0]),
                thrust::raw_pointer_cast(&(*leftright_x_)[0]),
                thrust::raw_pointer_cast(&(*leftright_y_)[0]),
                thrust::raw_pointer_cast(&(*leftright_z_)[0]) );
            thrust::counting_iterator<int> ci0(0);
            thrust::for_each( thrust::make_zip_iterator( thrust::make_tuple( ci0, index_x_->begin(), index_y_->begin(), index_z_->begin()) ),
                              thrust::make_zip_iterator( thrust::make_tuple( ci0+points_->size(), index_x_->end(), index_y_->end(), index_z_->end()) ),
                              sno );

#ifdef PRINT_DEBUG_TIMING
            cudaDeviceSynchronize();
            std::cout<<" set new owners done"<<std::endl;
#endif

            // reorder the three index arrays (and the owner array) so that the points of each
            // child node are contiguous again; only the first call needs to scatter the owners
            separate_left_and_right_children( *index_x_, *owners_x_, *tmp_index_, *tmp_owners_, *leftright_x_ );
            std::swap( index_x_, tmp_index_ );
            std::swap( owners_x_, tmp_owners_ );
            separate_left_and_right_children( *index_y_, *owners_y_, *tmp_index_, *tmp_owners_, *leftright_y_, false );
            std::swap( index_y_, tmp_index_ );
            separate_left_and_right_children( *index_z_, *owners_z_, *tmp_index_, *tmp_owners_, *leftright_z_, false );
            std::swap( index_z_, tmp_index_ );

            // recompute the point ranges and AABBs of the freshly created child nodes
            update_leftright_and_aabb( *points_x_, *points_y_, *points_z_,
                                       *index_x_, *index_y_, *index_z_,
                                       *owners_x_, *splits_, *aabb_min_, *aabb_max_ );
        }
    }

    template< typename Distance >
    friend class KDTreeCuda3dIndex;

protected:
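    // Worked illustration of the helper below (hypothetical values): after the reordering,
    // the owner array might look like
    //     owners = [1, 1, 1, 2, 2, 4, 4, 4]
    // unique_by_key_copy then yields labelsUnique = [1, 2, 4] and countsUnique = [0, 3, 5],
    // i.e. node 1 owns points [0,3), node 2 owns [3,5) and node 4 owns [5,8) in all three
    // sorted index arrays; these ranges are written back into the SplitInfo entries.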
    //! takes the partitioned nodes, and sets the left-/right info of leaf nodes, as well as the AABBs
    void update_leftright_and_aabb( const thrust::device_vector<float>& x, const thrust::device_vector<float>& y, const thrust::device_vector<float>& z,
                                    const thrust::device_vector<int>& ix, const thrust::device_vector<int>& iy, const thrust::device_vector<int>& iz,
                                    const thrust::device_vector<int>& owners,
                                    thrust::device_vector<cuda::kd_tree_builder_detail::SplitInfo>& splits,
                                    thrust::device_vector<float4>& aabbMin, thrust::device_vector<float4>& aabbMax )
    {
        thrust::device_vector<int>* labelsUnique=tmp_owners_;
        thrust::device_vector<int>* countsUnique=tmp_index_;
        // assume: points of each node are continuous in the array

        // find which nodes are here, and where each node's points begin and end
        int unique_labels = thrust::unique_by_key_copy( owners.begin(), owners.end(), thrust::counting_iterator<int>(0),
                                                        labelsUnique->begin(), countsUnique->begin() ).first - labelsUnique->begin();

        // update the info
        cuda::kd_tree_builder_detail::SetLeftAndRightAndAABB s;
        s.maxPoints=x.size();
        s.nElements=unique_labels;
        s.nodes=thrust::raw_pointer_cast(&(splits[0]));
        s.counts=thrust::raw_pointer_cast(&((*countsUnique)[0]));
        s.labels=thrust::raw_pointer_cast(&((*labelsUnique)[0]));
        s.x=thrust::raw_pointer_cast(&x[0]);
        s.y=thrust::raw_pointer_cast(&y[0]);
        s.z=thrust::raw_pointer_cast(&z[0]);
        s.ix=thrust::raw_pointer_cast(&ix[0]);
        s.iy=thrust::raw_pointer_cast(&iy[0]);
        s.iz=thrust::raw_pointer_cast(&iz[0]);
        s.aabbMin=thrust::raw_pointer_cast(&aabbMin[0]);
        s.aabbMax=thrust::raw_pointer_cast(&aabbMax[0]);

        thrust::counting_iterator<int> it(0);
        thrust::for_each(it, it+unique_labels, s);
    }

    //! Separates the left and right children of each node into continuous parts of the array.
    //! More specifically, it separates children with even and odd node indices because nodes are always
    //! allocated in pairs -> child1==child2+1 -> child1 even and child2 odd, or vice-versa.
    //! Since the split operation is stable, this results in continuous partitions
    //! for all the single nodes.
    //! (basically the split primitive according to Sengupta et al.)
    //! about twice as fast as thrust::partition
    void separate_left_and_right_children( thrust::device_vector<int>& key_in, thrust::device_vector<int>& val_in,
                                           thrust::device_vector<int>& key_out, thrust::device_vector<int>& val_out,
                                           thrust::device_vector<int>& left_right_marks, bool scatter_val_out=true )
    {
        thrust::device_vector<int>* f_tmp = &val_out;
        thrust::device_vector<int>* addr_tmp = tmp_misc_;

        thrust::exclusive_scan( /*thrust::make_transform_iterator(*/ left_right_marks.begin() /*,cuda::kd_tree_builder_detail::IsEven())*/,
                                /*thrust::make_transform_iterator(*/ left_right_marks.end() /*,cuda::kd_tree_builder_detail::IsEven())*/,
                                f_tmp->begin() );
        cuda::kd_tree_builder_detail::set_addr3 sa;
        sa.val_=thrust::raw_pointer_cast(&left_right_marks[0]);
        sa.f_=thrust::raw_pointer_cast(&(*f_tmp)[0]);
        sa.npoints_=key_in.size();
        thrust::counting_iterator<int> it(0);
        thrust::transform( it, it+val_in.size(), addr_tmp->begin(), sa );

        thrust::scatter( key_in.begin(), key_in.end(), addr_tmp->begin(), key_out.begin() );
        if( scatter_val_out ) thrust::scatter( val_in.begin(), val_in.end(), addr_tmp->begin(), val_out.begin() );
    }
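    // Worked illustration (hypothetical values): suppose two parent nodes were just split and
    //     key_in           = [5, 8, 2, 9, 7, 4]   (point indices; 5,8,2 belong to the first parent, 9,7,4 to the second)
    //     left_right_marks = [0, 1, 0, 1, 1, 0]   (1 = point moves to the right child)
    // The stable split then produces
    //     key_out          = [8, 9, 7, 5, 2, 4]
    // i.e. the right-child points of both parents come first (still grouped per node),
    // followed by the left-child points, so every child owns one contiguous range.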
    //! allocates additional space in all the node-related vectors;
    //! the vectors are grown so that they hold new_size elements afterwards.
    void resize_node_vectors( size_t new_size )
    {
        size_t add = new_size - child1_->size();
        child1_->insert(child1_->end(), add, -1);
        parent_->insert(parent_->end(), add, -1);

        cuda::kd_tree_builder_detail::SplitInfo s;
        s.left=0;
        s.right=0;
        splits_->insert(splits_->end(), add, s);

        float4 f; // dummy value; real AABBs are written when a node is created
        aabb_min_->insert(aabb_min_->end(), add, f);
        aabb_max_->insert(aabb_max_->end(), add, f);
    }

    const thrust::device_vector<float4>* points_;

    // tree data, stored per node

    //! left child of each node. (right child==left child + 1, due to the alloc mechanism)
    //! child1_[node]==-1 if node is a leaf node
    thrust::device_vector<int>* child1_;
    //! parent node of each node
    thrust::device_vector<int>* parent_;
    //! split info (dim/value or left/right pointers)
    thrust::device_vector<cuda::kd_tree_builder_detail::SplitInfo>* splits_;
    //! min aabb value of each node
    thrust::device_vector<float4>* aabb_min_;
    //! max aabb value of each node
    thrust::device_vector<float4>* aabb_max_;

    enum AllocationInfo
    {
        NodeCount=0,
        NodesAllocated=1,
        OutOfSpace=2
    };
    // those were put into a single vector of 3 elements so that only one mem transfer will be needed for all three of them
    //  thrust::device_vector<int> out_of_space_;
    //  thrust::device_vector<int> node_count_;
    //  thrust::device_vector<int> nodes_allocated_;
    thrust::device_vector<int> allocation_info_;

    int max_leaf_size_;

    // coordinate values of the points
    thrust::device_vector<float>* points_x_, * points_y_, * points_z_;
    // point indices, sorted by the x/y/z coordinate
    thrust::device_vector<int>* index_x_, * index_y_, * index_z_;
    // owner node of each point
    thrust::device_vector<int>* owners_x_, * owners_y_, * owners_z_;
    // contains info about whether a point was partitioned to the left or right child after a split
    thrust::device_vector<int>* leftright_x_, * leftright_y_, * leftright_z_;
    thrust::device_vector<int>* tmp_index_, * tmp_owners_, * tmp_misc_;
    bool delete_node_info_;
};

}  // namespace flann

#endif // FLANN_CUDA_KD_TREE_BUILDER_H_