// exercise_2/colmap-dev/lib/FLANN/algorithms/kdtree_cuda_builder.h
/***********************************************************************
* Software License Agreement (BSD License)
*
* Copyright 2011 Andreas Muetzel (amuetzel@uni-koblenz.de). All rights reserved.
*
* THE BSD LICENSE
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*************************************************************************/
#ifndef FLANN_CUDA_KD_TREE_BUILDER_H_
#define FLANN_CUDA_KD_TREE_BUILDER_H_
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <thrust/partition.h>
#include <thrust/unique.h>
#include <thrust/scan.h>
#include <FLANN/util/cutil_math.h>
#include <stdlib.h>
// #define PRINT_DEBUG_TIMING
namespace flann
{
// template< typename T >
// void print_vector( const thrust::device_vector<T>& v )
// {
// for( int i=0; i< v.size(); i++ )
// {
// std::cout<<v[i]<<std::endl;
// }
// }
//
// template< typename T1, typename T2 >
// void print_vector( const thrust::device_vector<T1>& v1, const thrust::device_vector<T2>& v2 )
// {
// for( int i=0; i< v1.size(); i++ )
// {
// std::cout<<i<<": "<<v1[i]<<" "<<v2[i]<<std::endl;
// }
// }
//
// template< typename T1, typename T2, typename T3 >
// void print_vector( const thrust::device_vector<T1>& v1, const thrust::device_vector<T2>& v2, const thrust::device_vector<T3>& v3 )
// {
// for( int i=0; i< v1.size(); i++ )
// {
// std::cout<<i<<": "<<v1[i]<<" "<<v2[i]<<" "<<v3[i]<<std::endl;
// }
// }
//
// template< typename T >
// void print_vector_by_index( const thrust::device_vector<T>& v,const thrust::device_vector<int>& ind )
// {
// for( int i=0; i< v.size(); i++ )
// {
// std::cout<<v[ind[i]]<<std::endl;
// }
// }
// std::ostream& operator <<(std::ostream& stream, const cuda::kd_tree_builder_detail::SplitInfo& s) {
// stream<<"(split l/r: "<< s.left <<" "<< s.right<< " split:"<<s.split_dim<<" "<<s.split_val<<")";
// return stream;
// }
//
//
// std::ostream& operator <<(std::ostream& stream, const cuda::kd_tree_builder_detail::NodeInfo& s) {
// stream<<"(node: "<<s.child1()<<" "<<s.parent()<<" "<<s.child2()<<")";
// return stream;
// }
//
// std::ostream& operator <<(std::ostream& stream, const float4& s) {
// stream<<"("<<s.x<<","<<s.y<<","<<s.z<<","<<s.w<<")";
// return stream;
// }
namespace cuda
{
namespace kd_tree_builder_detail
{
//! normal node: contains the split dimension and value
//! leaf node: left == index of the first point, right == index of the last point + 1
struct SplitInfo
{
union {
struct
{
// index of this node's first point (for leaf nodes)
int left;
// one past the index of this node's last point (for leaf nodes)
int right;
};
struct
{
int split_dim;
float split_val;
};
};
};
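// Example (for illustration): a leaf node owning the points at index positions
// 10..14 would store left==10 and right==15, while an inner node split along y
// at 1.5f would store split_dim==1 and split_val==1.5f. Which interpretation
// applies is determined by child1_: -1 marks a leaf node, any other value an inner node.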
struct IsEven
{
typedef int result_type;
__device__
int operator()(int i )
{
return (i& 1)==0;
}
};
struct SecondElementIsEven
{
__host__ __device__
bool operator()( const thrust::tuple<int,int>& i )
{
return (thrust::get<1>(i)& 1)==0;
}
};
//! just for convenience: access a float4 by an index in [0,1,2]
//! (casting it to a float* and accessing it by the index is way slower...)
__host__ __device__
float get_value_by_index( const float4& f, int i )
{
switch(i) {
case 0:
return f.x;
case 1:
return f.y;
default:
return f.z;
}
}
//! mark a point as belonging to the left or right child of its current parent
//! called after parents are split
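//! (example for illustration: if a point's owner node was split along x at 2.0f and the
//! point's x value is 3.1f, its new owner becomes child1_[owner]+1, i.e. the right child,
//! and the corresponding left/right flag is set to 1)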
struct MovePointsToChildNodes
{
MovePointsToChildNodes( int* child1, SplitInfo* splits, float* x, float* y, float* z, int* ox, int* oy, int* oz, int* lrx, int* lry, int* lrz )
: child1_(child1), splits_(splits), x_(x), y_(y), z_(z), ox_(ox), oy_(oy), oz_(oz), lrx_(lrx), lry_(lry), lrz_(lrz){}
// int dim;
// float threshold;
int* child1_;
SplitInfo* splits_;
// coordinate values
float* x_, * y_, * z_;
// owner indices -> which node does the point belong to?
int* ox_, * oy_, * oz_;
// temp info: will be set to 1 if a point is moved to the right child node, 0 otherwise
// (used later in the scan op to separate the points of the children into contiguous ranges)
int* lrx_, * lry_, * lrz_;
__device__
void operator()( const thrust::tuple<int, int, int, int>& data )
{
int index = thrust::get<0>(data);
int owner = ox_[index]; // before a split, all points at the same position in the index array have the same owner
int point_ind1=thrust::get<1>(data);
int point_ind2=thrust::get<2>(data);
int point_ind3=thrust::get<3>(data);
int leftChild=child1_[owner];
int split_dim;
float dim_val1, dim_val2, dim_val3;
SplitInfo split;
lrx_[index]=0;
lry_[index]=0;
lrz_[index]=0;
// this element already belongs to a leaf node -> everything alright, no need to change anything
if( leftChild==-1 ) {
return;
}
// otherwise: load split data, and assign this index to the new owner
split = splits_[owner];
split_dim=split.split_dim;
switch( split_dim ) {
case 0:
dim_val1=x_[point_ind1];
dim_val2=x_[point_ind2];
dim_val3=x_[point_ind3];
break;
case 1:
dim_val1=y_[point_ind1];
dim_val2=y_[point_ind2];
dim_val3=y_[point_ind3];
break;
default:
dim_val1=z_[point_ind1];
dim_val2=z_[point_ind2];
dim_val3=z_[point_ind3];
break;
}
int r1=leftChild +(dim_val1 > split.split_val);
ox_[index]=r1;
int r2=leftChild+(dim_val2 > split.split_val);
oy_[index]=r2;
oz_[index]=leftChild+(dim_val3 > split.split_val);
lrx_[index] = (dim_val1 > split.split_val);
lry_[index] = (dim_val2 > split.split_val);
lrz_[index] = (dim_val3 > split.split_val);
// return thrust::make_tuple( r1, r2, leftChild+(dim_val > split.split_val) );
}
};
//! used to update the left/right pointers and aabb infos after the node splits
struct SetLeftAndRightAndAABB
{
int maxPoints;
int nElements;
SplitInfo* nodes;
int* counts;
int* labels;
float4* aabbMin;
float4* aabbMax;
const float* x,* y,* z;
const int* ix, * iy, * iz;
__host__ __device__
void operator()( int i )
{
int index=labels[i];
int right;
int left = counts[i];
nodes[index].left=left;
if( i < nElements-1 ) {
right=counts[i+1];
}
else { // last node in the list: its point range ends at the total number of points
right=maxPoints;
}
nodes[index].right=right;
aabbMin[index].x=x[ix[left]];
aabbMin[index].y=y[iy[left]];
aabbMin[index].z=z[iz[left]];
aabbMax[index].x=x[ix[right-1]];
aabbMax[index].y=y[iy[right-1]];
aabbMax[index].z=z[iz[right-1]];
}
};
//! - decide whether a node has to be split
//! if yes:
//! - allocate child nodes
//! - set split axis as axis of maximum aabb length
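//! (example for illustration: a node whose AABB spans (0,0,0)..(2,6,1) is split along
//! dimension 1, the longest extent, with split_val==3.0f, the midpoint of that extent)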
struct SplitNodes
{
int maxPointsPerNode;
int* node_count;
int* nodes_allocated;
int* out_of_space;
int* child1_;
int* parent_;
SplitInfo* splits;
__device__
void operator()( thrust::tuple<int&, int&,SplitInfo&,float4&,float4&, int> node ) // float4: aabbMin, aabbMax
{
int& parent=thrust::get<0>(node);
int& child1=thrust::get<1>(node);
SplitInfo& s=thrust::get<2>(node);
const float4& aabbMin=thrust::get<3>(node);
const float4& aabbMax=thrust::get<4>(node);
int my_index = thrust::get<5>(node);
bool split_node=false;
// first, each thread block counts the number of nodes that it needs to allocate...
__shared__ int block_nodes_to_allocate;
if( threadIdx.x== 0 ) block_nodes_to_allocate=0;
__syncthreads();
// don't split if all points are equal
// (could lead to an infinite loop, and doesn't make any sense anyway)
bool all_points_in_node_are_equal=aabbMin.x == aabbMax.x && aabbMin.y==aabbMax.y && aabbMin.z==aabbMax.z;
int offset_to_global=0;
// maybe this could be replaced with a reduction...
if(( child1==-1) &&( s.right-s.left > maxPointsPerNode) && !all_points_in_node_are_equal ) { // leaf node with too many points -> split it
split_node=true;
offset_to_global = atomicAdd( &block_nodes_to_allocate,2 );
}
__syncthreads();
__shared__ int block_left;
__shared__ bool enough_space;
// ... then the first thread tries to allocate this many nodes...
if( threadIdx.x==0) {
block_left = atomicAdd( node_count, block_nodes_to_allocate );
enough_space = block_left+block_nodes_to_allocate < *nodes_allocated;
// if it doesn't succeed, no nodes will be created by this block
if( !enough_space ) {
atomicAdd( node_count, -block_nodes_to_allocate );
*out_of_space=1;
}
}
__syncthreads();
// this thread needs to split its node && there was enough space for all the nodes
// in this block.
//(The whole "allocate-per-block-thing" is much faster than letting each element allocate
// its space on its own, because shared memory atomics are A LOT faster than
// global mem atomics!)
if( split_node && enough_space ) {
int left = block_left + offset_to_global;
splits[left].left=s.left;
splits[left].right=s.right;
splits[left+1].left=0;
splits[left+1].right=0;
// split axis/position: middle of longest aabb extent
float4 aabbDim=aabbMax-aabbMin;
int maxDim=0;
float maxDimLength=aabbDim.x;
float4 splitVal=(aabbMax+aabbMin);
splitVal*=0.5f;
for( int i=1; i<=2; i++ ) {
float val = get_value_by_index(aabbDim,i);
if( val > maxDimLength ) {
maxDim=i;
maxDimLength=val;
}
}
s.split_dim=maxDim;
s.split_val=get_value_by_index(splitVal,maxDim);
child1_[my_index]=left;
splits[my_index]=s;
parent_[left]=my_index;
parent_[left+1]=my_index;
child1_[left]=-1;
child1_[left+1]=-1;
}
}
};
//! computes the scatter target address for the split operation, see Sengupta, Harris, Zhang, Owens: "Scan Primitives for GPU Computing"
//! in my use case, this is about 2x as fast as thrust::partition
struct set_addr3
{
const int* val_, * f_;
int npoints_;
__device__
int operator()( int id )
{
int nf = f_[npoints_-1] + (val_[npoints_-1]);
int f=f_[id];
int t = id -f+nf;
return val_[id] ? f : t;
}
};
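// Worked example (for illustration): with marks val_ = [1,0,1,0], the exclusive scan
// gives f_ = [0,1,1,2] and nf = f_[3] + val_[3] = 2, so the returned scatter addresses
// are [0,2,1,3]: marked elements are packed to the front in their original order and
// unmarked elements follow, i.e. a stable split.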
//! converts a float4 point (xyz) to a tuple of three float values (used to separate the
//! float4 input buffer into three arrays at the beginning of the tree build)
struct pointxyz_to_px_py_pz
{
__device__
thrust::tuple<float,float,float> operator()( const float4& val )
{
return thrust::make_tuple(val.x, val.y, val.z);
}
};
} // namespace kd_tree_builder_detail
} // namespace cuda
inline std::ostream& operator <<(std::ostream& stream, const cuda::kd_tree_builder_detail::SplitInfo& s)
{
stream<<"(split l/r: "<< s.left <<" "<< s.right<< " split:"<<s.split_dim<<" "<<s.split_val<<")";
return stream;
}
class CudaKdTreeBuilder
{
public:
CudaKdTreeBuilder( const thrust::device_vector<float4>& points, int max_leaf_size ) : /*out_of_space_(1,0),node_count_(1,1),*/ max_leaf_size_(max_leaf_size)
{
points_=&points;
int prealloc = points.size()/max_leaf_size_*16;
allocation_info_.resize(3);
allocation_info_[NodeCount]=1;
allocation_info_[NodesAllocated]=prealloc;
allocation_info_[OutOfSpace]=0;
// std::cout<<points_->size()<<std::endl;
child1_=new thrust::device_vector<int>(prealloc,-1);
parent_=new thrust::device_vector<int>(prealloc,-1);
cuda::kd_tree_builder_detail::SplitInfo s;
s.left=0;
s.right=0;
splits_=new thrust::device_vector<cuda::kd_tree_builder_detail::SplitInfo>(prealloc,s);
s.right=points.size();
(*splits_)[0]=s;
aabb_min_=new thrust::device_vector<float4>(prealloc);
aabb_max_=new thrust::device_vector<float4>(prealloc);
index_x_=new thrust::device_vector<int>(points_->size());
index_y_=new thrust::device_vector<int>(points_->size());
index_z_=new thrust::device_vector<int>(points_->size());
owners_x_=new thrust::device_vector<int>(points_->size(),0);
owners_y_=new thrust::device_vector<int>(points_->size(),0);
owners_z_=new thrust::device_vector<int>(points_->size(),0);
leftright_x_ = new thrust::device_vector<int>(points_->size(),0);
leftright_y_ = new thrust::device_vector<int>(points_->size(),0);
leftright_z_ = new thrust::device_vector<int>(points_->size(),0);
tmp_index_=new thrust::device_vector<int>(points_->size());
tmp_owners_=new thrust::device_vector<int>(points_->size());
tmp_misc_=new thrust::device_vector<int>(points_->size());
points_x_=new thrust::device_vector<float>(points_->size());
points_y_=new thrust::device_vector<float>(points_->size());
points_z_=new thrust::device_vector<float>(points_->size());
delete_node_info_=false;
}
~CudaKdTreeBuilder()
{
if( delete_node_info_ ) {
delete child1_;
delete parent_;
delete splits_;
delete aabb_min_;
delete aabb_max_;
delete index_x_;
}
delete index_y_;
delete index_z_;
delete owners_x_;
delete owners_y_;
delete owners_z_;
delete points_x_;
delete points_y_;
delete points_z_;
delete leftright_x_;
delete leftright_y_;
delete leftright_z_;
delete tmp_index_;
delete tmp_owners_;
delete tmp_misc_;
}
//! build the tree
//! general idea:
//! - build sorted lists of the points in x y and z order (to be able to compute tight AABBs in O(1) )
//! - while( nodes to split exist )
//! - split nodes that do not have children yet along their longest axis if their number of points is > max_points_per_node
//! - for each point: determine whether it is in a node that was split. If yes, mark it as belonging to the left or right child node of its current parent node
//! - reorder the points so that the points of a single node are contiguous in the index arrays
//! - update the left/right pointers and AABBs of all nodes
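//!
//! minimal usage sketch (illustrative only; gpu_points is a placeholder for the
//! caller's data, stored as float4 with the w component unused):
//!
//!   thrust::device_vector<float4> gpu_points = ...; // filled by the caller
//!   flann::CudaKdTreeBuilder builder( gpu_points, 64 /* max_leaf_size */ );
//!   builder.buildTree();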
void buildTree()
{
// std::cout<<"buildTree()"<<std::endl;
// sleep(1);
// Util::Timer stepTimer;
thrust::transform( points_->begin(), points_->end(), thrust::make_zip_iterator(thrust::make_tuple(points_x_->begin(), points_y_->begin(),points_z_->begin()) ), cuda::kd_tree_builder_detail::pointxyz_to_px_py_pz() );
thrust::counting_iterator<int> it(0);
thrust::copy( it, it+points_->size(), index_x_->begin() );
thrust::copy( index_x_->begin(), index_x_->end(), index_y_->begin() );
thrust::copy( index_x_->begin(), index_x_->end(), index_z_->begin() );
thrust::device_vector<float> tmpv(points_->size());
// create sorted index list -> can be used to compute AABBs in O(1)
thrust::copy(points_x_->begin(), points_x_->end(), tmpv.begin());
thrust::sort_by_key( tmpv.begin(), tmpv.end(), index_x_->begin() );
thrust::copy(points_y_->begin(), points_y_->end(), tmpv.begin());
thrust::sort_by_key( tmpv.begin(), tmpv.end(), index_y_->begin() );
thrust::copy(points_z_->begin(), points_z_->end(), tmpv.begin());
thrust::sort_by_key( tmpv.begin(), tmpv.end(), index_z_->begin() );
(*aabb_min_)[0]=make_float4((*points_x_)[(*index_x_)[0]],(*points_y_)[(*index_y_)[0]],(*points_z_)[(*index_z_)[0]],0);
(*aabb_max_)[0]=make_float4((*points_x_)[(*index_x_)[points_->size()-1]],(*points_y_)[(*index_y_)[points_->size()-1]],(*points_z_)[(*index_z_)[points_->size()-1]],0);
#ifdef PRINT_DEBUG_TIMING
cudaDeviceSynchronize();
std::cout<<" initial stuff:"<<stepTimer.elapsed()<<std::endl;
stepTimer.restart();
#endif
int last_node_count=0;
for( int i=0;; i++ ) {
cuda::kd_tree_builder_detail::SplitNodes sn;
sn.maxPointsPerNode=max_leaf_size_;
sn.node_count=thrust::raw_pointer_cast(&allocation_info_[NodeCount]);
sn.nodes_allocated=thrust::raw_pointer_cast(&allocation_info_[NodesAllocated]);
sn.out_of_space=thrust::raw_pointer_cast(&allocation_info_[OutOfSpace]);
sn.child1_=thrust::raw_pointer_cast(&(*child1_)[0]);
sn.parent_=thrust::raw_pointer_cast(&(*parent_)[0]);
sn.splits=thrust::raw_pointer_cast(&(*splits_)[0]);
thrust::counting_iterator<int> cit(0);
thrust::for_each( thrust::make_zip_iterator(thrust::make_tuple( parent_->begin(), child1_->begin(), splits_->begin(), aabb_min_->begin(), aabb_max_->begin(), cit )),
thrust::make_zip_iterator(thrust::make_tuple( parent_->begin()+last_node_count, child1_->begin()+last_node_count,splits_->begin()+last_node_count, aabb_min_->begin()+last_node_count, aabb_max_->begin()+last_node_count,cit+last_node_count )),
sn );
// copy allocation info to host
thrust::host_vector<int> alloc_info = allocation_info_;
if( last_node_count == alloc_info[NodeCount] ) { // no more nodes were split -> done
break;
}
last_node_count=alloc_info[NodeCount];
// a node was un-splittable due to a lack of space
if( alloc_info[OutOfSpace]==1 ) {
resize_node_vectors(alloc_info[NodesAllocated]*2);
alloc_info[OutOfSpace]=0;
alloc_info[NodesAllocated]*=2;
allocation_info_=alloc_info;
}
#ifdef PRINT_DEBUG_TIMING
cudaDeviceSynchronize();
std::cout<<" node split:"<<stepTimer.elapsed()<<std::endl;
stepTimer.restart();
#endif
// for each point: if its node was split, move it to the corresponding child node; otherwise do nothing
cuda::kd_tree_builder_detail::MovePointsToChildNodes sno( thrust::raw_pointer_cast(&(*child1_)[0]),
thrust::raw_pointer_cast(&(*splits_)[0]),
thrust::raw_pointer_cast(&(*points_x_)[0]),
thrust::raw_pointer_cast(&(*points_y_)[0]),
thrust::raw_pointer_cast(&(*points_z_)[0]),
thrust::raw_pointer_cast(&(*owners_x_)[0]),
thrust::raw_pointer_cast(&(*owners_y_)[0]),
thrust::raw_pointer_cast(&(*owners_z_)[0]),
thrust::raw_pointer_cast(&(*leftright_x_)[0]),
thrust::raw_pointer_cast(&(*leftright_y_)[0]),
thrust::raw_pointer_cast(&(*leftright_z_)[0])
);
thrust::counting_iterator<int> ci0(0);
thrust::for_each( thrust::make_zip_iterator( thrust::make_tuple( ci0, index_x_->begin(), index_y_->begin(), index_z_->begin()) ),
thrust::make_zip_iterator( thrust::make_tuple( ci0+points_->size(), index_x_->end(), index_y_->end(), index_z_->end()) ),sno );
#ifdef PRINT_DEBUG_TIMING
cudaDeviceSynchronize();
std::cout<<" set new owners:"<<stepTimer.elapsed()<<std::endl;
stepTimer.restart();
#endif
// move points around so that each leaf node's points are contiguous
separate_left_and_right_children(*index_x_,*owners_x_,*tmp_index_,*tmp_owners_, *leftright_x_);
std::swap(tmp_index_, index_x_);
std::swap(tmp_owners_, owners_x_);
separate_left_and_right_children(*index_y_,*owners_y_,*tmp_index_,*tmp_owners_, *leftright_y_,false);
std::swap(tmp_index_, index_y_);
separate_left_and_right_children(*index_z_,*owners_z_,*tmp_index_,*tmp_owners_, *leftright_z_,false);
std::swap(tmp_index_, index_z_);
#ifdef PRINT_DEBUG_TIMING
cudaDeviceSynchronize();
std::cout<<" split:"<<stepTimer.elapsed()<<std::endl;
stepTimer.restart();
#endif
// calculate new AABB etc
update_leftright_and_aabb( *points_x_, *points_y_, *points_z_, *index_x_, *index_y_, *index_z_, *owners_x_, *splits_,*aabb_min_, *aabb_max_);
#ifdef PRINT_DEBUG_TIMING
cudaDeviceSynchronize();
std::cout<<" update_leftright_and_aabb:"<<stepTimer.elapsed()<<std::endl;
stepTimer.restart();
// print_vector(node_count_);
#endif
}
}
template<class Distance>
friend class KDTreeCuda3dIndex;
protected:
//! takes the partitioned nodes, and sets the left-/right info of leaf nodes, as well as the AABBs
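//! (example for illustration: owners [3,3,4,4,4] yield labelsUnique==[3,4] and
//! countsUnique==[0,2], so node 3 gets the point range [0,2) and node 4 the range [2,5))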
void
update_leftright_and_aabb( const thrust::device_vector<float>& x, const thrust::device_vector<float>& y,const thrust::device_vector<float>& z,
const thrust::device_vector<int>& ix, const thrust::device_vector<int>& iy,const thrust::device_vector<int>& iz,
const thrust::device_vector<int>& owners,
thrust::device_vector<cuda::kd_tree_builder_detail::SplitInfo>& splits, thrust::device_vector<float4>& aabbMin,thrust::device_vector<float4>& aabbMax)
{
thrust::device_vector<int>* labelsUnique=tmp_owners_;
thrust::device_vector<int>* countsUnique=tmp_index_;
// assume: points of each node are contiguous in the array
// find which nodes are here, and where each node's points begin and end
int unique_labels = thrust::unique_by_key_copy( owners.begin(), owners.end(), thrust::counting_iterator<int>(0), labelsUnique->begin(), countsUnique->begin()).first - labelsUnique->begin();
// update the info
cuda::kd_tree_builder_detail::SetLeftAndRightAndAABB s;
s.maxPoints=x.size();
s.nElements=unique_labels;
s.nodes=thrust::raw_pointer_cast(&(splits[0]));
s.counts=thrust::raw_pointer_cast(&( (*countsUnique)[0]));
s.labels=thrust::raw_pointer_cast(&( (*labelsUnique)[0]));
s.x=thrust::raw_pointer_cast(&x[0]);
s.y=thrust::raw_pointer_cast(&y[0]);
s.z=thrust::raw_pointer_cast(&z[0]);
s.ix=thrust::raw_pointer_cast(&ix[0]);
s.iy=thrust::raw_pointer_cast(&iy[0]);
s.iz=thrust::raw_pointer_cast(&iz[0]);
s.aabbMin=thrust::raw_pointer_cast(&aabbMin[0]);
s.aabbMax=thrust::raw_pointer_cast(&aabbMax[0]);
thrust::counting_iterator<int> it(0);
thrust::for_each(it, it+unique_labels, s);
}
//! Separates the left and right children of each node into contiguous parts of the array.
//! More specifically, it separates children with even and odd node indices because nodes are always
//! allocated in pairs -> child2==child1+1 -> child1 even and child2 odd, or vice-versa.
//! Since the split operation is stable, this results in contiguous partitions
//! for all the single nodes.
//! (basically the split primitive according to Sengupta et al.)
//! about twice as fast as thrust::partition
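//! Steps: 1) exclusive_scan over the 0/1 marks, 2) compute each element's scatter
//! address with set_addr3, 3) scatter the keys (and, if scatter_val_out is set,
//! also the values) to their new positions.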
void separate_left_and_right_children( thrust::device_vector<int>& key_in, thrust::device_vector<int>& val_in, thrust::device_vector<int>& key_out, thrust::device_vector<int>& val_out, thrust::device_vector<int>& left_right_marks, bool scatter_val_out=true )
{
thrust::device_vector<int>* f_tmp = &val_out;
thrust::device_vector<int>* addr_tmp = tmp_misc_;
thrust::exclusive_scan( left_right_marks.begin(), left_right_marks.end(), f_tmp->begin() );
cuda::kd_tree_builder_detail::set_addr3 sa;
sa.val_=thrust::raw_pointer_cast(&left_right_marks[0]);
sa.f_=thrust::raw_pointer_cast(&(*f_tmp)[0]);
sa.npoints_=key_in.size();
thrust::counting_iterator<int> it(0);
thrust::transform(it, it+val_in.size(), addr_tmp->begin(), sa);
thrust::scatter(key_in.begin(), key_in.end(), addr_tmp->begin(), key_out.begin());
if( scatter_val_out ) thrust::scatter(val_in.begin(), val_in.end(), addr_tmp->begin(), val_out.begin());
}
//! allocates additional space in all the node-related vectors.
//! the vectors are grown to a total size of new_size elements.
void resize_node_vectors( size_t new_size )
{
size_t add = new_size - child1_->size();
child1_->insert(child1_->end(), add, -1);
parent_->insert(parent_->end(), add, -1);
cuda::kd_tree_builder_detail::SplitInfo s;
s.left=0;
s.right=0;
splits_->insert(splits_->end(), add, s);
float4 f;
aabb_min_->insert(aabb_min_->end(), add, f);
aabb_max_->insert(aabb_max_->end(), add, f);
}
const thrust::device_vector<float4>* points_;
// tree data, stored per node
//! left child of each node. (right child==left child + 1, due to the alloc mechanism)
//! child1_[node]==-1 if node is a leaf node
thrust::device_vector<int>* child1_;
//! parent node of each node
thrust::device_vector<int>* parent_;
//! split info (dim/value or left/right pointers)
thrust::device_vector<cuda::kd_tree_builder_detail::SplitInfo>* splits_;
//! min aabb value of each node
thrust::device_vector<float4>* aabb_min_;
//! max aabb value of each node
thrust::device_vector<float4>* aabb_max_;
enum AllocationInfo
{
NodeCount=0,
NodesAllocated=1,
OutOfSpace=2
};
// these three values are stored in a single vector of 3 elements so that only one memory transfer is needed to read all of them
// thrust::device_vector<int> out_of_space_;
// thrust::device_vector<int> node_count_;
// thrust::device_vector<int> nodes_allocated_;
thrust::device_vector<int> allocation_info_;
int max_leaf_size_;
// coordinate values of the points
thrust::device_vector<float>* points_x_, * points_y_, * points_z_;
// indices
thrust::device_vector<int>* index_x_, * index_y_, * index_z_;
// owner node
thrust::device_vector<int>* owners_x_, * owners_y_, * owners_z_;
// contains info about whether a point was partitioned to the left or right child after a split
thrust::device_vector<int>* leftright_x_, * leftright_y_, * leftright_z_;
thrust::device_vector<int>* tmp_index_, * tmp_owners_, * tmp_misc_;
bool delete_node_info_;
};
} // namespace flann
#endif