From ce8f56a2400725a8d7ae9778e403e6bda01143fd Mon Sep 17 00:00:00 2001 From: yuyi1005 Date: Wed, 9 May 2018 19:34:35 +0800 Subject: [PATCH 1/7] Add files via upload fix bug for knnSearch when knn!=ostride --- src/cpp/flann/algorithms/kdtree_cuda_3d_index.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpp/flann/algorithms/kdtree_cuda_3d_index.cu b/src/cpp/flann/algorithms/kdtree_cuda_3d_index.cu index 8465322e..813d0f94 100644 --- a/src/cpp/flann/algorithms/kdtree_cuda_3d_index.cu +++ b/src/cpp/flann/algorithms/kdtree_cuda_3d_index.cu @@ -430,7 +430,7 @@ void KDTreeCuda3dIndex::knnSearchGpu(const Matrix& querie ); } } - thrust::transform(id, id+knn*queries.rows, id, map_indices(thrust::raw_pointer_cast( &((*gpu_helper_->gpu_vind_))[0]) )); + thrust::transform(id, id+ostride*queries.rows, id, map_indices(thrust::raw_pointer_cast( &((*gpu_helper_->gpu_vind_))[0]) )); } } From c0843304fe3797a9a6c7e7082ecfdb54346d75bc Mon Sep 17 00:00:00 2001 From: yuyi1005 Date: Sun, 13 May 2018 15:51:10 +0800 Subject: [PATCH 2/7] create cuda stream for each index Most NVIDIA GPUs now support kernel concurrency, but FLANN always uses the default stream, which means the GPU cannot process two indices simultaneously when we use multithreading. Creating a CUDA stream for each index fixes this problem. 
--- .../flann/algorithms/kdtree_cuda_3d_index.cu | 148 +++++---- .../flann/algorithms/kdtree_cuda_builder.h | 307 +++++++++++++----- 2 files changed, 297 insertions(+), 158 deletions(-) diff --git a/src/cpp/flann/algorithms/kdtree_cuda_3d_index.cu b/src/cpp/flann/algorithms/kdtree_cuda_3d_index.cu index 813d0f94..70b91ece 100644 --- a/src/cpp/flann/algorithms/kdtree_cuda_3d_index.cu +++ b/src/cpp/flann/algorithms/kdtree_cuda_3d_index.cu @@ -161,14 +161,17 @@ void nearestKernel(const cuda::kd_tree_builder_detail::SplitInfo* splits, template struct KDTreeCuda3dIndex::GpuHelper { - thrust::device_vector< cuda::kd_tree_builder_detail::SplitInfo >* gpu_splits_; - thrust::device_vector< int >* gpu_parent_; - thrust::device_vector< int >* gpu_child1_; - thrust::device_vector< float4 >* gpu_aabb_min_; - thrust::device_vector< float4 >* gpu_aabb_max_; - thrust::device_vector* gpu_points_; - thrust::device_vector* gpu_vind_; - GpuHelper() : gpu_splits_(0), gpu_parent_(0), gpu_child1_(0), gpu_aabb_min_(0), gpu_aabb_max_(0), gpu_points_(0), gpu_vind_(0){ + cudaStream_t gpu_stream; + flann::cuda::device_vector_noinit< cuda::kd_tree_builder_detail::SplitInfo >* gpu_splits_; + flann::cuda::device_vector_noinit< int >* gpu_parent_; + flann::cuda::device_vector_noinit< int >* gpu_child1_; + flann::cuda::device_vector_noinit< float4 >* gpu_aabb_min_; + flann::cuda::device_vector_noinit< float4 >* gpu_aabb_max_; + flann::cuda::device_vector_noinit* gpu_points_; + flann::cuda::device_vector_noinit* gpu_vind_; + GpuHelper() : gpu_splits_(0), gpu_parent_(0), gpu_child1_(0), gpu_aabb_min_(0), gpu_aabb_max_(0), gpu_points_(0), gpu_vind_(0) + { + cudaStreamCreate(&gpu_stream); } ~GpuHelper() { @@ -187,6 +190,8 @@ struct KDTreeCuda3dIndex::GpuHelper delete gpu_points_; gpu_points_=0; + + cudaStreamDestroy(gpu_stream); } }; @@ -302,15 +307,15 @@ void KDTreeCuda3dIndex::knnSearchGpu(const Matrix& querie // std::cout<<" knn:"< queriesDev(istride* queries.rows,0); - thrust::copy( queries.ptr(), 
queries.ptr()+istride*queries.rows, queriesDev.begin() ); - thrust::device_vector distsDev(queries.rows* ostride); - thrust::device_vector indicesDev(queries.rows* ostride); + flann::cuda::device_vector_noinit queriesDev(thrust::cuda::par.on(gpu_helper_->gpu_stream), istride* queries.rows,0); + thrust::detail::two_system_copy(thrust::host, thrust::cuda::par.on(gpu_helper_->gpu_stream), queries.ptr(), queries.ptr()+istride*queries.rows, queriesDev.begin() ); + flann::cuda::device_vector_noinit distsDev(queries.rows* ostride); + flann::cuda::device_vector_noinit indicesDev(queries.rows* ostride); if( knn==1 ) { - KdTreeCudaPrivate::nearestKernel<<>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), + KdTreeCudaPrivate::nearestKernel<<gpu_stream>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])), @@ -322,7 +327,7 @@ void KDTreeCuda3dIndex::knnSearchGpu(const Matrix& querie thrust::raw_pointer_cast(&indicesDev[0]), thrust::raw_pointer_cast(&distsDev[0]), queries.rows, flann::cuda::SingleResultSet(epsError),distance); - // KdTreeCudaPrivate::nearestKernel<<>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_nodes_)[0])), + // KdTreeCudaPrivate::nearestKernel<<gpu_stream>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_nodes_)[0])), // thrust::raw_pointer_cast( &((*gpu_helper_->gpu_points_)[0]) ), // thrust::raw_pointer_cast(&queriesDev[0]), // queries.stride, @@ -333,7 +338,7 @@ void KDTreeCuda3dIndex::knnSearchGpu(const Matrix& querie } else { if( use_heap ) { - KdTreeCudaPrivate::nearestKernel<<>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), + KdTreeCudaPrivate::nearestKernel<<gpu_stream>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])), 
thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])), @@ -348,7 +353,7 @@ void KDTreeCuda3dIndex::knnSearchGpu(const Matrix& querie , distance); } else { - KdTreeCudaPrivate::nearestKernel<<>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), + KdTreeCudaPrivate::nearestKernel<<gpu_stream>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])), @@ -364,9 +369,9 @@ void KDTreeCuda3dIndex::knnSearchGpu(const Matrix& querie ); } } - thrust::copy( distsDev.begin(), distsDev.end(), dists.ptr() ); - thrust::transform(indicesDev.begin(), indicesDev.end(), indicesDev.begin(), map_indices(thrust::raw_pointer_cast( &((*gpu_helper_->gpu_vind_))[0]) )); - thrust::copy( indicesDev.begin(), indicesDev.end(), indices.ptr() ); + thrust::detail::two_system_copy(thrust::cuda::par.on(gpu_helper_->gpu_stream), thrust::host, distsDev.begin(), distsDev.end(), dists.ptr() ); + thrust::transform(thrust::cuda::par.on(gpu_helper_->gpu_stream), indicesDev.begin(), indicesDev.end(), indicesDev.begin(), map_indices(thrust::raw_pointer_cast( &((*gpu_helper_->gpu_vind_))[0]) )); + thrust::detail::two_system_copy(thrust::cuda::par.on(gpu_helper_->gpu_stream), thrust::host, indicesDev.begin(), indicesDev.end(), indices.ptr() ); } else { thrust::device_ptr qd = thrust::device_pointer_cast(queries.ptr()); @@ -376,7 +381,7 @@ void KDTreeCuda3dIndex::knnSearchGpu(const Matrix& querie if( knn==1 ) { - KdTreeCudaPrivate::nearestKernel<<>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), + KdTreeCudaPrivate::nearestKernel<<gpu_stream>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])), 
thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])), @@ -388,7 +393,7 @@ void KDTreeCuda3dIndex::knnSearchGpu(const Matrix& querie id.get(), dd.get(), queries.rows, flann::cuda::SingleResultSet(epsError),distance); - // KdTreeCudaPrivate::nearestKernel<<>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_nodes_)[0])), + // KdTreeCudaPrivate::nearestKernel<<gpu_stream>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_nodes_)[0])), // thrust::raw_pointer_cast( &((*gpu_helper_->gpu_points_)[0]) ), // thrust::raw_pointer_cast(&queriesDev[0]), // queries.stride, @@ -399,7 +404,7 @@ void KDTreeCuda3dIndex::knnSearchGpu(const Matrix& querie } else { if( use_heap ) { - KdTreeCudaPrivate::nearestKernel<<>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), + KdTreeCudaPrivate::nearestKernel<<gpu_stream>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])), @@ -414,7 +419,7 @@ void KDTreeCuda3dIndex::knnSearchGpu(const Matrix& querie , distance); } else { - KdTreeCudaPrivate::nearestKernel<<>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), + KdTreeCudaPrivate::nearestKernel<<gpu_stream>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])), @@ -430,7 +435,7 @@ void KDTreeCuda3dIndex::knnSearchGpu(const Matrix& querie ); } } - thrust::transform(id, id+ostride*queries.rows, id, map_indices(thrust::raw_pointer_cast( &((*gpu_helper_->gpu_vind_))[0]) )); + thrust::transform(thrust::cuda::par.on(gpu_helper_->gpu_stream), id, id+ostride*queries.rows, id, map_indices(thrust::raw_pointer_cast( &((*gpu_helper_->gpu_vind_))[0]) )); } } @@ -450,9 +455,9 @@ int 
KDTreeCuda3dIndex::radiusSearchGpu(const Matrix& que int istride=queries.stride/sizeof(ElementType); - thrust::device_vector queriesDev(istride* queries.rows,0); - thrust::copy( queries.ptr(), queries.ptr()+istride*queries.rows, queriesDev.begin() ); - thrust::device_vector countsDev(queries.rows); + flann::cuda::device_vector_noinit queriesDev(thrust::cuda::par.on(gpu_helper_->gpu_stream), istride* queries.rows,0); + thrust::detail::two_system_copy(thrust::host, thrust::cuda::par.on(gpu_helper_->gpu_stream), queries.ptr(), queries.ptr()+istride*queries.rows, queriesDev.begin() ); + flann::cuda::device_vector_noinit countsDev(queries.rows); typename GpuDistance::type distance; @@ -460,7 +465,7 @@ int KDTreeCuda3dIndex::radiusSearchGpu(const Matrix& que int blocksPerGrid=(queries.rows+threadsPerBlock-1)/threadsPerBlock; - KdTreeCudaPrivate::nearestKernel<<>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), + KdTreeCudaPrivate::nearestKernel<<gpu_stream>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])), @@ -475,7 +480,8 @@ int KDTreeCuda3dIndex::radiusSearchGpu(const Matrix& que distance ); - thrust::host_vector counts_host=countsDev; + thrust::host_vector counts_host(countsDev.size()); + thrust::detail::two_system_copy(thrust::cuda::par.on(gpu_helper_->gpu_stream), thrust::host, countsDev.begin(), countsDev.end(), counts_host.begin()); if( max_neighbors!=0 ) { // we'll need this later, but the exclusive_scan will change the array for( size_t i=0; i::radiusSearchGpu(const Matrix& que } int neighbors_last_elem = countsDev.back(); - thrust::exclusive_scan( countsDev.begin(), countsDev.end(), countsDev.begin() ); + thrust::exclusive_scan(thrust::cuda::par.on(gpu_helper_->gpu_stream), countsDev.begin(), countsDev.end(), countsDev.begin() ); size_t 
total_neighbors=neighbors_last_elem+countsDev.back(); if( max_neighbors==0 ) return total_neighbors; - thrust::device_vector indicesDev(total_neighbors,-1); - thrust::device_vector distsDev(total_neighbors,std::numeric_limits::infinity()); + flann::cuda::device_vector_noinit indicesDev(thrust::cuda::par.on(gpu_helper_->gpu_stream), total_neighbors,-1); + flann::cuda::device_vector_noinit distsDev(thrust::cuda::par.on(gpu_helper_->gpu_stream), total_neighbors,std::numeric_limits::infinity()); if( max_neighbors<0 ) { - KdTreeCudaPrivate::nearestKernel<<>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), + KdTreeCudaPrivate::nearestKernel<<gpu_stream>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])), @@ -517,7 +523,7 @@ int KDTreeCuda3dIndex::radiusSearchGpu(const Matrix& que } else { if( use_heap ) { - KdTreeCudaPrivate::nearestKernel<<>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), + KdTreeCudaPrivate::nearestKernel<<gpu_stream>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])), @@ -531,7 +537,7 @@ int KDTreeCuda3dIndex::radiusSearchGpu(const Matrix& que queries.rows, flann::cuda::RadiusKnnResultSet(radius,max_neighbors, thrust::raw_pointer_cast(&countsDev[0]),sorted), distance); } else { - KdTreeCudaPrivate::nearestKernel<<>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), + KdTreeCudaPrivate::nearestKernel<<gpu_stream>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])), 
thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])), @@ -545,9 +551,11 @@ int KDTreeCuda3dIndex::radiusSearchGpu(const Matrix& que queries.rows, flann::cuda::RadiusKnnResultSet(radius,max_neighbors, thrust::raw_pointer_cast(&countsDev[0]),sorted), distance); } } - thrust::transform(indicesDev.begin(), indicesDev.end(), indicesDev.begin(), map_indices(thrust::raw_pointer_cast( &((*gpu_helper_->gpu_vind_))[0]) )); - thrust::host_vector indices_temp = indicesDev; - thrust::host_vector dists_temp = distsDev; + thrust::transform(thrust::cuda::par.on(gpu_helper_->gpu_stream), indicesDev.begin(), indicesDev.end(), indicesDev.begin(), map_indices(thrust::raw_pointer_cast( &((*gpu_helper_->gpu_vind_))[0]) )); + thrust::host_vector indices_temp(indicesDev.size()); + thrust::detail::two_system_copy(thrust::cuda::par.on(gpu_helper_->gpu_stream), thrust::host, indicesDev.begin(), indicesDev.end(), indices_temp.begin()); + thrust::host_vector dists_temp(distsDev.size()); + thrust::detail::two_system_copy(thrust::cuda::par.on(gpu_helper_->gpu_stream), thrust::host, distsDev.begin(), distsDev.end(), dists_temp.begin()); int buffer_index=0; for( size_t i=0; i::radiusSearchGpu(const Matrix& qu if( max_neighbors<0 ) max_neighbors=indices.cols; if( !matrices_on_gpu ) { - thrust::device_vector queriesDev(istride* queries.rows,0); - thrust::copy( queries.ptr(), queries.ptr()+istride*queries.rows, queriesDev.begin() ); + flann::cuda::device_vector_noinit queriesDev(thrust::cuda::par.on(gpu_helper_->gpu_stream), istride* queries.rows,0); + thrust::detail::two_system_copy(thrust::host, thrust::cuda::par.on(gpu_helper_->gpu_stream), queries.ptr(), queries.ptr()+istride*queries.rows, queriesDev.begin() ); typename GpuDistance::type distance; int threadsPerBlock = 128; int blocksPerGrid=(queries.rows+threadsPerBlock-1)/threadsPerBlock; if( max_neighbors== 0 ) { - thrust::device_vector indicesDev(queries.rows* ostride); - KdTreeCudaPrivate::nearestKernel<<>> 
(thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), + flann::cuda::device_vector_noinit indicesDev(queries.rows* ostride); + KdTreeCudaPrivate::nearestKernel<<gpu_stream>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])), @@ -612,16 +620,16 @@ int KDTreeCuda3dIndex< Distance >::radiusSearchGpu(const Matrix& qu queries.rows, flann::cuda::CountingRadiusResultSet(radius,-1), distance ); - thrust::copy( indicesDev.begin(), indicesDev.end(), indices.ptr() ); - return thrust::reduce(indicesDev.begin(), indicesDev.end() ); + thrust::detail::two_system_copy(thrust::cuda::par.on(gpu_helper_->gpu_stream), thrust::host, indicesDev.begin(), indicesDev.end(), indices.ptr() ); + return thrust::reduce(thrust::cuda::par.on(gpu_helper_->gpu_stream), indicesDev.begin(), indicesDev.end() ); } - thrust::device_vector distsDev(queries.rows* max_neighbors); - thrust::device_vector indicesDev(queries.rows* max_neighbors); + flann::cuda::device_vector_noinit distsDev(queries.rows* max_neighbors); + flann::cuda::device_vector_noinit indicesDev(queries.rows* max_neighbors); if( use_heap ) { - KdTreeCudaPrivate::nearestKernel<<>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), + KdTreeCudaPrivate::nearestKernel<<gpu_stream>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])), @@ -635,7 +643,7 @@ int KDTreeCuda3dIndex< Distance >::radiusSearchGpu(const Matrix& qu queries.rows, flann::cuda::KnnRadiusResultSet(max_neighbors,sorted,epsError, radius), distance); } else { - KdTreeCudaPrivate::nearestKernel<<>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), + 
KdTreeCudaPrivate::nearestKernel<<gpu_stream>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])), @@ -649,11 +657,11 @@ int KDTreeCuda3dIndex< Distance >::radiusSearchGpu(const Matrix& qu queries.rows, flann::cuda::KnnRadiusResultSet(max_neighbors,sorted,epsError, radius), distance); } - thrust::copy( distsDev.begin(), distsDev.end(), dists.ptr() ); - thrust::transform(indicesDev.begin(), indicesDev.end(), indicesDev.begin(), map_indices(thrust::raw_pointer_cast( &((*gpu_helper_->gpu_vind_))[0]) )); - thrust::copy( indicesDev.begin(), indicesDev.end(), indices.ptr() ); + thrust::detail::two_system_copy(thrust::cuda::par.on(gpu_helper_->gpu_stream), thrust::host, distsDev.begin(), distsDev.end(), dists.ptr() ); + thrust::transform(thrust::cuda::par.on(gpu_helper_->gpu_stream), indicesDev.begin(), indicesDev.end(), indicesDev.begin(), map_indices(thrust::raw_pointer_cast( &((*gpu_helper_->gpu_vind_))[0]) )); + thrust::detail::two_system_copy(thrust::cuda::par.on(gpu_helper_->gpu_stream), thrust::host, indicesDev.begin(), indicesDev.end(), indices.ptr() ); - return thrust::count_if(indicesDev.begin(), indicesDev.end(), isNotMinusOne() ); + return thrust::count_if(thrust::cuda::par.on(gpu_helper_->gpu_stream), indicesDev.begin(), indicesDev.end(), isNotMinusOne() ); } else { @@ -665,8 +673,8 @@ int KDTreeCuda3dIndex< Distance >::radiusSearchGpu(const Matrix& qu int blocksPerGrid=(queries.rows+threadsPerBlock-1)/threadsPerBlock; if( max_neighbors== 0 ) { - thrust::device_vector indicesDev(queries.rows* indices.stride); - KdTreeCudaPrivate::nearestKernel<<>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), + flann::cuda::device_vector_noinit indicesDev(queries.rows* indices.stride); + KdTreeCudaPrivate::nearestKernel<<gpu_stream>>> 
(thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])), @@ -680,12 +688,12 @@ int KDTreeCuda3dIndex< Distance >::radiusSearchGpu(const Matrix& qu queries.rows, flann::cuda::CountingRadiusResultSet(radius,-1), distance ); - thrust::copy( indicesDev.begin(), indicesDev.end(), indices.ptr() ); - return thrust::reduce(indicesDev.begin(), indicesDev.end() ); + thrust::copy(thrust::cuda::par.on(gpu_helper_->gpu_stream), indicesDev.begin(), indicesDev.end(), indices.ptr() ); + return thrust::reduce(thrust::cuda::par.on(gpu_helper_->gpu_stream), indicesDev.begin(), indicesDev.end() ); } if( use_heap ) { - KdTreeCudaPrivate::nearestKernel<<>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), + KdTreeCudaPrivate::nearestKernel<<gpu_stream>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])), @@ -699,7 +707,7 @@ int KDTreeCuda3dIndex< Distance >::radiusSearchGpu(const Matrix& qu queries.rows, flann::cuda::KnnRadiusResultSet(max_neighbors,sorted,epsError, radius), distance); } else { - KdTreeCudaPrivate::nearestKernel<<>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), + KdTreeCudaPrivate::nearestKernel<<gpu_stream>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])), @@ -713,9 +721,9 @@ int KDTreeCuda3dIndex< Distance >::radiusSearchGpu(const Matrix& qu queries.rows, flann::cuda::KnnRadiusResultSet(max_neighbors,sorted,epsError, radius), distance); } - thrust::transform(id, 
id+max_neighbors*queries.rows, id, map_indices(thrust::raw_pointer_cast( &((*gpu_helper_->gpu_vind_))[0]) )); + thrust::transform(thrust::cuda::par.on(gpu_helper_->gpu_stream), id, id+max_neighbors*queries.rows, id, map_indices(thrust::raw_pointer_cast( &((*gpu_helper_->gpu_vind_))[0]) )); - return thrust::count_if(id, id+max_neighbors*queries.rows, isNotMinusOne() ); + return thrust::count_if(thrust::cuda::par.on(gpu_helper_->gpu_stream), id, id+max_neighbors*queries.rows, isNotMinusOne() ); } } @@ -728,11 +736,11 @@ void KDTreeCuda3dIndex::uploadTreeToGpu() // assert( sizeof( KdTreeCudaPrivate::GpuNode)==sizeof( Node ) ); delete gpu_helper_; gpu_helper_ = new GpuHelper; - gpu_helper_->gpu_points_=new thrust::device_vector(size_); - thrust::device_vector tmp(size_); + gpu_helper_->gpu_points_=new flann::cuda::device_vector_noinit(size_); + flann::cuda::device_vector_noinit tmp(size_); if( get_param(index_params_,"input_is_gpu_float4",false) ) { assert( dataset_.cols == 3 && dataset_.stride==4*sizeof(float)); - thrust::copy( thrust::device_pointer_cast((float4*)dataset_.ptr()),thrust::device_pointer_cast((float4*)(dataset_.ptr()))+size_,tmp.begin()); + thrust::copy(thrust::cuda::par.on(gpu_helper_->gpu_stream), thrust::device_pointer_cast((float4*)dataset_.ptr()),thrust::device_pointer_cast((float4*)(dataset_.ptr()))+size_,tmp.begin()); } else { @@ -750,24 +758,24 @@ void KDTreeCuda3dIndex::uploadTreeToGpu() data_[i][j] = 0; } } - thrust::copy((float4*)data_.ptr(),(float4*)(data_.ptr())+size_,tmp.begin()); + thrust::detail::two_system_copy(thrust::host, thrust::cuda::par.on(gpu_helper_->gpu_stream), (float4*)data_.ptr(),(float4*)(data_.ptr())+size_,tmp.begin()); } - CudaKdTreeBuilder builder( tmp, leaf_max_size_ ); + CudaKdTreeBuilder builder( tmp, leaf_max_size_, gpu_helper_->gpu_stream ); builder.buildTree(); - gpu_helper_->gpu_splits_ = builder.splits_; - gpu_helper_->gpu_aabb_min_ = builder.aabb_min_; + gpu_helper_->gpu_splits_ = builder.splits_; + 
gpu_helper_->gpu_aabb_min_ = builder.aabb_min_; gpu_helper_->gpu_aabb_max_ = builder.aabb_max_; gpu_helper_->gpu_child1_ = builder.child1_; gpu_helper_->gpu_parent_=builder.parent_; gpu_helper_->gpu_vind_=builder.index_x_; - thrust::gather( builder.index_x_->begin(), builder.index_x_->end(), tmp.begin(), gpu_helper_->gpu_points_->begin()); + thrust::gather(thrust::cuda::par.on(gpu_helper_->gpu_stream), builder.index_x_->begin(), builder.index_x_->end(), tmp.begin(), gpu_helper_->gpu_points_->begin()); - // gpu_helper_->gpu_nodes_=new thrust::device_vector(node_count_); + // gpu_helper_->gpu_nodes_=new flann::cuda::device_vector_noinit(node_count_); - // gpu_helper_->gpu_vind_=new thrust::device_vector(size_); + // gpu_helper_->gpu_vind_=new flann::cuda::device_vector_noinit(size_); // thrust::copy( (KdTreeCudaPrivate::GpuNode*)&(tree_[0]), ((KdTreeCudaPrivate::GpuNode*)&(tree_[0]))+tree_.size(), gpu_helper_->gpu_nodes_->begin()); // thrust::copy(vind_.begin(),vind_.end(),gpu_helper_->gpu_vind_->begin()); diff --git a/src/cpp/flann/algorithms/kdtree_cuda_builder.h b/src/cpp/flann/algorithms/kdtree_cuda_builder.h index 8bb4e838..24ddf0db 100644 --- a/src/cpp/flann/algorithms/kdtree_cuda_builder.h +++ b/src/cpp/flann/algorithms/kdtree_cuda_builder.h @@ -35,6 +35,7 @@ #include #include #include +#include #include #include @@ -43,7 +44,7 @@ namespace flann { // template< typename T > -// void print_vector( const thrust::device_vector& v ) +// void print_vector( const flann::cuda::device_vector_noinit& v ) // { // for( int i=0; i< v.size(); i++ ) // { @@ -52,7 +53,7 @@ namespace flann // } // // template< typename T1, typename T2 > -// void print_vector( const thrust::device_vector& v1, const thrust::device_vector& v2 ) +// void print_vector( const flann::cuda::device_vector_noinit& v1, const flann::cuda::device_vector_noinit& v2 ) // { // for( int i=0; i< v1.size(); i++ ) // { @@ -61,7 +62,7 @@ namespace flann // } // // template< typename T1, typename T2, typename T3 > 
-// void print_vector( const thrust::device_vector& v1, const thrust::device_vector& v2, const thrust::device_vector& v3 ) +// void print_vector( const flann::cuda::device_vector_noinit& v1, const flann::cuda::device_vector_noinit& v2, const flann::cuda::device_vector_noinit& v3 ) // { // for( int i=0; i< v1.size(); i++ ) // { @@ -70,7 +71,7 @@ namespace flann // } // // template< typename T > -// void print_vector_by_index( const thrust::device_vector& v,const thrust::device_vector& ind ) +// void print_vector_by_index( const flann::cuda::device_vector_noinit& v,const flann::cuda::device_vector_noinit& ind ) // { // for( int i=0; i< v.size(); i++ ) // { @@ -95,6 +96,108 @@ namespace flann // } namespace cuda { + // flann::cuda::device_vector_noinit is used in place of thrust::device_vector, + // as thrust::device_vector always uses the default stream, and always performs a fill on construction + template + class device_vector_noinit + { + private: + thrust::device_ptr m_ptr; + size_t m_size; + + public: + device_vector_noinit() + { + m_size = 0; + } + + ~device_vector_noinit() + { + if (m_size) + thrust::device_free(m_ptr); + m_size = 0; + } + + device_vector_noinit(size_t s) + { + m_size = s; + if (s) + m_ptr = thrust::device_malloc(s); + } + + template + device_vector_noinit(const thrust::detail::execution_policy_base &exec, size_t s, T t) + { + m_size = s; + if (s) + { + m_ptr = thrust::device_malloc(s); + thrust::fill(exec, m_ptr, m_ptr + m_size, t); + } + } + + size_t size() const + { + return m_size; + } + + void resize(size_t s) + { + if (m_size) + thrust::device_free(m_ptr); + m_size = s; + if (s) + m_ptr = thrust::device_malloc(s); + } + + template + void append(const thrust::detail::execution_policy_base &exec, size_t n, T t) + { + if (n == 0) + return; + + if (m_size) + { + thrust::device_ptr new_ptr = thrust::device_malloc(m_size + n); + + thrust::copy(exec, m_ptr, m_ptr + m_size, new_ptr); + thrust::fill(exec, new_ptr + m_size, new_ptr + 
m_size + n, t); + + thrust::device_free(m_ptr); + m_ptr = new_ptr; + + m_size = m_size + n; + } + else + { + resize(n); + thrust::fill(exec, m_ptr, m_ptr + n, t); + m_size = n; + } + } + + thrust::device_ptr begin() const + { + return m_ptr; + } + + thrust::device_ptr end() const + { + return m_ptr + m_size; + } + + auto back() const + { + return m_ptr[m_size - 1]; + } + + auto operator [] (const size_t i) const + { + return m_ptr[i]; + } + }; + + namespace kd_tree_builder_detail { //! normal node: contains the split dimension and value @@ -401,48 +504,52 @@ std::ostream& operator <<(std::ostream& stream, const cuda::kd_tree_builder_deta class CudaKdTreeBuilder { public: - CudaKdTreeBuilder( const thrust::device_vector& points, int max_leaf_size ) : /*out_of_space_(1,0),node_count_(1,1),*/ max_leaf_size_(max_leaf_size) + CudaKdTreeBuilder( const flann::cuda::device_vector_noinit& points, int max_leaf_size, cudaStream_t stream ) : /*out_of_space_(1,0),node_count_(1,1),*/ max_leaf_size_(max_leaf_size) { points_=&points; + gpu_stream = stream; int prealloc = points.size()/max_leaf_size_*16; - allocation_info_.resize(3); - allocation_info_[NodeCount]=1; - allocation_info_[NodesAllocated]=prealloc; - allocation_info_[OutOfSpace]=0; + thrust::host_vector alloc_info(3); + alloc_info[0] = 1; + alloc_info[1] = prealloc; + alloc_info[2] = 0; + allocation_info_.resize(3); + thrust::detail::two_system_copy(thrust::host, thrust::cuda::par.on(gpu_stream), alloc_info.begin(), alloc_info.end(), allocation_info_.begin()); // std::cout<size()<(prealloc,-1); - parent_=new thrust::device_vector(prealloc,-1); - cuda::kd_tree_builder_detail::SplitInfo s; + child1_ = new flann::cuda::device_vector_noinit(thrust::cuda::par.on(gpu_stream), prealloc, -1); + parent_ = new flann::cuda::device_vector_noinit(thrust::cuda::par.on(gpu_stream), prealloc, -1); + cuda::kd_tree_builder_detail::SplitInfo s; s.left=0; s.right=0; - splits_=new thrust::device_vector(prealloc,s); - s.right=points.size(); 
- (*splits_)[0]=s; + splits_ = new flann::cuda::device_vector_noinit(thrust::cuda::par.on(gpu_stream), prealloc, s); + s.right=points.size(); + //(*splits_)[0]=s; + thrust::detail::two_system_copy(thrust::host, thrust::cuda::par.on(gpu_stream), &s, &s + 1, splits_->begin()); - aabb_min_=new thrust::device_vector(prealloc); - aabb_max_=new thrust::device_vector(prealloc); + aabb_min_=new flann::cuda::device_vector_noinit(prealloc); + aabb_max_=new flann::cuda::device_vector_noinit(prealloc); - index_x_=new thrust::device_vector(points_->size()); - index_y_=new thrust::device_vector(points_->size()); - index_z_=new thrust::device_vector(points_->size()); + index_x_=new flann::cuda::device_vector_noinit(points_->size()); + index_y_=new flann::cuda::device_vector_noinit(points_->size()); + index_z_=new flann::cuda::device_vector_noinit(points_->size()); - owners_x_=new thrust::device_vector(points_->size(),0); - owners_y_=new thrust::device_vector(points_->size(),0); - owners_z_=new thrust::device_vector(points_->size(),0); + owners_x_=new flann::cuda::device_vector_noinit(thrust::cuda::par.on(gpu_stream), points_->size(),0); + owners_y_=new flann::cuda::device_vector_noinit(thrust::cuda::par.on(gpu_stream), points_->size(),0); + owners_z_=new flann::cuda::device_vector_noinit(thrust::cuda::par.on(gpu_stream), points_->size(),0); - leftright_x_ = new thrust::device_vector(points_->size(),0); - leftright_y_ = new thrust::device_vector(points_->size(),0); - leftright_z_ = new thrust::device_vector(points_->size(),0); + leftright_x_ = new flann::cuda::device_vector_noinit(thrust::cuda::par.on(gpu_stream), points_->size(),0); + leftright_y_ = new flann::cuda::device_vector_noinit(thrust::cuda::par.on(gpu_stream), points_->size(),0); + leftright_z_ = new flann::cuda::device_vector_noinit(thrust::cuda::par.on(gpu_stream), points_->size(),0); - tmp_index_=new thrust::device_vector(points_->size()); - tmp_owners_=new thrust::device_vector(points_->size()); - tmp_misc_=new 
thrust::device_vector(points_->size()); + tmp_index_=new flann::cuda::device_vector_noinit(points_->size()); + tmp_owners_=new flann::cuda::device_vector_noinit(points_->size()); + tmp_misc_=new flann::cuda::device_vector_noinit(points_->size()); - points_x_=new thrust::device_vector(points_->size()); - points_y_=new thrust::device_vector(points_->size()); - points_z_=new thrust::device_vector(points_->size()); + points_x_=new flann::cuda::device_vector_noinit(points_->size()); + points_y_=new flann::cuda::device_vector_noinit(points_->size()); + points_z_=new flann::cuda::device_vector_noinit(points_->size()); delete_node_info_=false; } @@ -455,7 +562,7 @@ class CudaKdTreeBuilder delete aabb_min_; delete aabb_max_; delete index_x_; - } + } delete index_y_; delete index_z_; @@ -486,28 +593,49 @@ class CudaKdTreeBuilder // std::cout<<"buildTree()"<begin(), points_->end(), thrust::make_zip_iterator(thrust::make_tuple(points_x_->begin(), points_y_->begin(),points_z_->begin()) ), cuda::kd_tree_builder_detail::pointxyz_to_px_py_pz() ); + thrust::transform(thrust::cuda::par.on(gpu_stream), points_->begin(), points_->end(), thrust::make_zip_iterator(thrust::make_tuple(points_x_->begin(), points_y_->begin(),points_z_->begin()) ), cuda::kd_tree_builder_detail::pointxyz_to_px_py_pz() ); thrust::counting_iterator it(0); - thrust::copy( it, it+points_->size(), index_x_->begin() ); + thrust::copy(thrust::cuda::par.on(gpu_stream), it, it+points_->size(), index_x_->begin() ); - thrust::copy( index_x_->begin(), index_x_->end(), index_y_->begin() ); - thrust::copy( index_x_->begin(), index_x_->end(), index_z_->begin() ); + thrust::copy(thrust::cuda::par.on(gpu_stream), index_x_->begin(), index_x_->end(), index_y_->begin() ); + thrust::copy(thrust::cuda::par.on(gpu_stream), index_x_->begin(), index_x_->end(), index_z_->begin() ); - thrust::device_vector tmpv(points_->size()); + flann::cuda::device_vector_noinit tmpv(points_->size()); // create sorted index list -> can be used to 
compute AABBs in O(1) - thrust::copy(points_x_->begin(), points_x_->end(), tmpv.begin()); - thrust::sort_by_key( tmpv.begin(), tmpv.end(), index_x_->begin() ); - thrust::copy(points_y_->begin(), points_y_->end(), tmpv.begin()); - thrust::sort_by_key( tmpv.begin(), tmpv.end(), index_y_->begin() ); - thrust::copy(points_z_->begin(), points_z_->end(), tmpv.begin()); - thrust::sort_by_key( tmpv.begin(), tmpv.end(), index_z_->begin() ); - - - (*aabb_min_)[0]=make_float4((*points_x_)[(*index_x_)[0]],(*points_y_)[(*index_y_)[0]],(*points_z_)[(*index_z_)[0]],0); - - (*aabb_max_)[0]=make_float4((*points_x_)[(*index_x_)[points_->size()-1]],(*points_y_)[(*index_y_)[points_->size()-1]],(*points_z_)[(*index_z_)[points_->size()-1]],0); + thrust::copy(thrust::cuda::par.on(gpu_stream), points_x_->begin(), points_x_->end(), tmpv.begin()); + thrust::sort_by_key(thrust::cuda::par.on(gpu_stream), tmpv.begin(), tmpv.end(), index_x_->begin() ); + thrust::copy(thrust::cuda::par.on(gpu_stream), points_y_->begin(), points_y_->end(), tmpv.begin()); + thrust::sort_by_key(thrust::cuda::par.on(gpu_stream), tmpv.begin(), tmpv.end(), index_y_->begin() ); + thrust::copy(thrust::cuda::par.on(gpu_stream), points_z_->begin(), points_z_->end(), tmpv.begin()); + thrust::sort_by_key(thrust::cuda::par.on(gpu_stream), tmpv.begin(), tmpv.end(), index_z_->begin() ); + + int idxx, idxy, idxz; + float xx, yy, zz; + float4 xyzw; + + thrust::detail::two_system_copy(thrust::cuda::par.on(gpu_stream), thrust::host, index_x_->begin(), index_x_->begin() + 1, &idxx); + thrust::detail::two_system_copy(thrust::cuda::par.on(gpu_stream), thrust::host, index_y_->begin(), index_y_->begin() + 1, &idxy); + thrust::detail::two_system_copy(thrust::cuda::par.on(gpu_stream), thrust::host, index_z_->begin(), index_z_->begin() + 1, &idxz); + thrust::detail::two_system_copy(thrust::cuda::par.on(gpu_stream), thrust::host, points_x_->begin() + idxx, points_x_->begin() + idxx + 1, &xx); + 
thrust::detail::two_system_copy(thrust::cuda::par.on(gpu_stream), thrust::host, points_y_->begin() + idxy, points_y_->begin() + idxy + 1, &yy); + thrust::detail::two_system_copy(thrust::cuda::par.on(gpu_stream), thrust::host, points_z_->begin() + idxz, points_z_->begin() + idxz + 1, &zz); + xyzw = make_float4(xx, yy, zz, 0); + thrust::detail::two_system_copy(thrust::host, thrust::cuda::par.on(gpu_stream), &xyzw, &xyzw + 1, aabb_min_->begin()); + + thrust::detail::two_system_copy(thrust::cuda::par.on(gpu_stream), thrust::host, index_x_->end() - 1, index_x_->end(), &idxx); + thrust::detail::two_system_copy(thrust::cuda::par.on(gpu_stream), thrust::host, index_y_->end() - 1, index_y_->end(), &idxy); + thrust::detail::two_system_copy(thrust::cuda::par.on(gpu_stream), thrust::host, index_z_->end() - 1, index_z_->end(), &idxz); + thrust::detail::two_system_copy(thrust::cuda::par.on(gpu_stream), thrust::host, points_x_->begin() + idxx, points_x_->begin() + idxx + 1, &xx); + thrust::detail::two_system_copy(thrust::cuda::par.on(gpu_stream), thrust::host, points_y_->begin() + idxy, points_y_->begin() + idxy + 1, &yy); + thrust::detail::two_system_copy(thrust::cuda::par.on(gpu_stream), thrust::host, points_z_->begin() + idxz, points_z_->begin() + idxz + 1, &zz); + xyzw = make_float4(xx, yy, zz, 0); + thrust::detail::two_system_copy(thrust::host, thrust::cuda::par.on(gpu_stream), &xyzw, &xyzw + 1, aabb_max_->begin()); + + //(*aabb_min_)[0]=make_float4((*points_x_)[(*index_x_)[0]],(*points_y_)[(*index_y_)[0]],(*points_z_)[(*index_z_)[0]],0); + + //(*aabb_max_)[0]=make_float4((*points_x_)[(*index_x_)[points_->size()-1]],(*points_y_)[(*index_y_)[points_->size()-1]],(*points_z_)[(*index_z_)[points_->size()-1]],0); #ifdef PRINT_DEBUG_TIMING cudaDeviceSynchronize(); std::cout<<" initial stuff:"< cit(0); - thrust::for_each( thrust::make_zip_iterator(thrust::make_tuple( parent_->begin(), child1_->begin(), splits_->begin(), aabb_min_->begin(), aabb_max_->begin(), cit )), + 
thrust::for_each(thrust::cuda::par.on(gpu_stream), thrust::make_zip_iterator(thrust::make_tuple( parent_->begin(), child1_->begin(), splits_->begin(), aabb_min_->begin(), aabb_max_->begin(), cit )), thrust::make_zip_iterator(thrust::make_tuple( parent_->begin()+last_node_count, child1_->begin()+last_node_count,splits_->begin()+last_node_count, aabb_min_->begin()+last_node_count, aabb_max_->begin()+last_node_count,cit+last_node_count )), sn ); // copy allocation info to host - thrust::host_vector alloc_info = allocation_info_; + thrust::host_vector alloc_info(allocation_info_.size()); + thrust::detail::two_system_copy(thrust::cuda::par.on(gpu_stream), thrust::host, allocation_info_.begin(), allocation_info_.end(), alloc_info.begin()); if( last_node_count == alloc_info[NodeCount] ) { // no more nodes were split -> done break; @@ -542,7 +671,8 @@ class CudaKdTreeBuilder resize_node_vectors(alloc_info[NodesAllocated]*2); alloc_info[OutOfSpace]=0; alloc_info[NodesAllocated]*=2; - allocation_info_=alloc_info; + thrust::detail::two_system_copy(thrust::host, thrust::cuda::par.on(gpu_stream), alloc_info.begin(), alloc_info.end(), allocation_info_.begin()); + //allocation_info_=alloc_info; } #ifdef PRINT_DEBUG_TIMING cudaDeviceSynchronize(); @@ -564,7 +694,7 @@ class CudaKdTreeBuilder thrust::raw_pointer_cast(&(*leftright_z_)[0]) ); thrust::counting_iterator ci0(0); - thrust::for_each( thrust::make_zip_iterator( thrust::make_tuple( ci0, index_x_->begin(), index_y_->begin(), index_z_->begin()) ), + thrust::for_each(thrust::cuda::par.on(gpu_stream), thrust::make_zip_iterator( thrust::make_tuple( ci0, index_x_->begin(), index_y_->begin(), index_z_->begin()) ), thrust::make_zip_iterator( thrust::make_tuple( ci0+points_->size(), index_x_->end(), index_y_->end(), index_z_->end()) ),sno ); #ifdef PRINT_DEBUG_TIMING @@ -607,17 +737,17 @@ class CudaKdTreeBuilder //! 
takes the partitioned nodes, and sets the left-/right info of leaf nodes, as well as the AABBs void - update_leftright_and_aabb( const thrust::device_vector& x, const thrust::device_vector& y,const thrust::device_vector& z, - const thrust::device_vector& ix, const thrust::device_vector& iy,const thrust::device_vector& iz, - const thrust::device_vector& owners, - thrust::device_vector& splits, thrust::device_vector& aabbMin,thrust::device_vector& aabbMax) + update_leftright_and_aabb( const flann::cuda::device_vector_noinit& x, const flann::cuda::device_vector_noinit& y,const flann::cuda::device_vector_noinit& z, + const flann::cuda::device_vector_noinit& ix, const flann::cuda::device_vector_noinit& iy,const flann::cuda::device_vector_noinit& iz, + const flann::cuda::device_vector_noinit& owners, + flann::cuda::device_vector_noinit& splits, flann::cuda::device_vector_noinit& aabbMin,flann::cuda::device_vector_noinit& aabbMax) { - thrust::device_vector* labelsUnique=tmp_owners_; - thrust::device_vector* countsUnique=tmp_index_; + flann::cuda::device_vector_noinit* labelsUnique=tmp_owners_; + flann::cuda::device_vector_noinit* countsUnique=tmp_index_; // assume: points of each node are continuous in the array // find which nodes are here, and where each node's points begin and end - int unique_labels = thrust::unique_by_key_copy( owners.begin(), owners.end(), thrust::counting_iterator(0), labelsUnique->begin(), countsUnique->begin()).first - labelsUnique->begin(); + int unique_labels = thrust::unique_by_key_copy(thrust::cuda::par.on(gpu_stream), owners.begin(), owners.end(), thrust::counting_iterator(0), labelsUnique->begin(), countsUnique->begin()).first - labelsUnique->begin(); // update the info cuda::kd_tree_builder_detail::SetLeftAndRightAndAABB s; @@ -636,7 +766,7 @@ class CudaKdTreeBuilder s.aabbMax=thrust::raw_pointer_cast(&aabbMax[0]); thrust::counting_iterator it(0); - thrust::for_each(it, it+unique_labels, s); + 
thrust::for_each(thrust::cuda::par.on(gpu_stream), it, it+unique_labels, s); } //! Separates the left and right children of each node into continuous parts of the array. @@ -646,12 +776,12 @@ class CudaKdTreeBuilder //! for all the single nodes. //! (basically the split primitive according to sengupta et al) //! about twice as fast as thrust::partition - void separate_left_and_right_children( thrust::device_vector& key_in, thrust::device_vector& val_in, thrust::device_vector& key_out, thrust::device_vector& val_out, thrust::device_vector& left_right_marks, bool scatter_val_out=true ) + void separate_left_and_right_children( flann::cuda::device_vector_noinit& key_in, flann::cuda::device_vector_noinit& val_in, flann::cuda::device_vector_noinit& key_out, flann::cuda::device_vector_noinit& val_out, flann::cuda::device_vector_noinit& left_right_marks, bool scatter_val_out=true ) { - thrust::device_vector* f_tmp = &val_out; - thrust::device_vector* addr_tmp = tmp_misc_; + flann::cuda::device_vector_noinit* f_tmp = &val_out; + flann::cuda::device_vector_noinit* addr_tmp = tmp_misc_; - thrust::exclusive_scan( /*thrust::make_transform_iterator(*/ left_right_marks.begin() /*,cuda::kd_tree_builder_detail::IsEven*/ + thrust::exclusive_scan(thrust::cuda::par.on(gpu_stream), /*thrust::make_transform_iterator(*/ left_right_marks.begin() /*,cuda::kd_tree_builder_detail::IsEven*/ /*())*/, /*thrust::make_transform_iterator(*/ left_right_marks.end() /*,cuda::kd_tree_builder_detail::IsEven*/ /*())*/, f_tmp->begin() ); cuda::kd_tree_builder_detail::set_addr3 sa; @@ -659,10 +789,10 @@ class CudaKdTreeBuilder sa.f_=thrust::raw_pointer_cast(&(*f_tmp)[0]); sa.npoints_=key_in.size(); thrust::counting_iterator it(0); - thrust::transform(it, it+val_in.size(), addr_tmp->begin(), sa); + thrust::transform(thrust::cuda::par.on(gpu_stream), it, it+val_in.size(), addr_tmp->begin(), sa); - thrust::scatter(key_in.begin(), key_in.end(), addr_tmp->begin(), key_out.begin()); - if( scatter_val_out ) 
thrust::scatter(val_in.begin(), val_in.end(), addr_tmp->begin(), val_out.begin()); + thrust::scatter(thrust::cuda::par.on(gpu_stream), key_in.begin(), key_in.end(), addr_tmp->begin(), key_out.begin()); + if( scatter_val_out ) thrust::scatter(thrust::cuda::par.on(gpu_stream), val_in.begin(), val_in.end(), addr_tmp->begin(), val_out.begin()); } //! allocates additional space in all the node-related vectors. @@ -670,33 +800,34 @@ class CudaKdTreeBuilder void resize_node_vectors( size_t new_size ) { size_t add = new_size - child1_->size(); - child1_->insert(child1_->end(), add, -1); - parent_->insert(parent_->end(), add, -1); + child1_->append(thrust::cuda::par.on(gpu_stream), add, -1); + parent_->append(thrust::cuda::par.on(gpu_stream), add, -1); cuda::kd_tree_builder_detail::SplitInfo s; s.left=0; s.right=0; - splits_->insert(splits_->end(), add, s); + splits_->append(thrust::cuda::par.on(gpu_stream), add, s); float4 f; - aabb_min_->insert(aabb_min_->end(), add, f); - aabb_max_->insert(aabb_max_->end(), add, f); + aabb_min_->append(thrust::cuda::par.on(gpu_stream), add, f); + aabb_max_->append(thrust::cuda::par.on(gpu_stream), add, f); } + cudaStream_t gpu_stream; - const thrust::device_vector* points_; + const flann::cuda::device_vector_noinit* points_; // tree data, those are stored per-node //! left child of each node. (right child==left child + 1, due to the alloc mechanism) //! child1_[node]==-1 if node is a leaf node - thrust::device_vector* child1_; + flann::cuda::device_vector_noinit* child1_; //! parent node of each node - thrust::device_vector* parent_; + flann::cuda::device_vector_noinit* parent_; //! split info (dim/value or left/right pointers) - thrust::device_vector* splits_; + flann::cuda::device_vector_noinit* splits_; //! min aabb value of each node - thrust::device_vector* aabb_min_; + flann::cuda::device_vector_noinit* aabb_min_; //! 
max aabb value of each node - thrust::device_vector* aabb_max_; + flann::cuda::device_vector_noinit* aabb_max_; enum AllocationInfo { @@ -705,22 +836,22 @@ class CudaKdTreeBuilder OutOfSpace=2 }; // those were put into a single vector of 3 elements so that only one mem transfer will be needed for all three of them - // thrust::device_vector out_of_space_; - // thrust::device_vector node_count_; - // thrust::device_vector nodes_allocated_; - thrust::device_vector allocation_info_; + // flann::cuda::device_vector_noinit out_of_space_; + // flann::cuda::device_vector_noinit node_count_; + // flann::cuda::device_vector_noinit nodes_allocated_; + flann::cuda::device_vector_noinit allocation_info_; int max_leaf_size_; // coordinate values of the points - thrust::device_vector* points_x_, * points_y_, * points_z_; + flann::cuda::device_vector_noinit* points_x_, * points_y_, * points_z_; // indices - thrust::device_vector* index_x_, * index_y_, * index_z_; + flann::cuda::device_vector_noinit* index_x_, * index_y_, * index_z_; // owner node - thrust::device_vector* owners_x_, * owners_y_, * owners_z_; + flann::cuda::device_vector_noinit* owners_x_, * owners_y_, * owners_z_; // contains info about whether a point was partitioned to the left or right child after a split - thrust::device_vector* leftright_x_, * leftright_y_, * leftright_z_; - thrust::device_vector* tmp_index_, * tmp_owners_, * tmp_misc_; + flann::cuda::device_vector_noinit* leftright_x_, * leftright_y_, * leftright_z_; + flann::cuda::device_vector_noinit* tmp_index_, * tmp_owners_, * tmp_misc_; bool delete_node_info_; }; From 89b9fbb39922175eb7d53ecde4fb514150fa4545 Mon Sep 17 00:00:00 2001 From: yuyi1005 Date: Sun, 13 May 2018 16:02:53 +0800 Subject: [PATCH 3/7] create cuda stream for each index fix errors in comment --- src/cpp/flann/algorithms/kdtree_cuda_builder.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cpp/flann/algorithms/kdtree_cuda_builder.h 
b/src/cpp/flann/algorithms/kdtree_cuda_builder.h index 24ddf0db..55f381c1 100644 --- a/src/cpp/flann/algorithms/kdtree_cuda_builder.h +++ b/src/cpp/flann/algorithms/kdtree_cuda_builder.h @@ -96,8 +96,8 @@ namespace flann // } namespace cuda { - // flann::cuda::device_vector_noinit is used to take place of flann::cuda::device_vector_noinit - // as flann::cuda::device_vector_noinit always uses default stream, and always contains a fill + // flann::cuda::device_vector_noinit is used to take place of thrust::device_vector + // as thrust::device_vector always uses default stream, and always contains a fill template class device_vector_noinit { From 3111a42fc5c12b3aa3a05e3211e0558fa8deb2ac Mon Sep 17 00:00:00 2001 From: yuyi1005 Date: Sun, 13 May 2018 20:10:13 +0800 Subject: [PATCH 4/7] add virtual keyword knnSearch should be overridden. If not, an Index initialized with KDTreeCuda3dIndexParams will call the default knnSearch, not the gpu one --- src/cpp/flann/algorithms/nn_index.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/cpp/flann/algorithms/nn_index.h b/src/cpp/flann/algorithms/nn_index.h index 245d258c..844bc97b 100644 --- a/src/cpp/flann/algorithms/nn_index.h +++ b/src/cpp/flann/algorithms/nn_index.h @@ -365,7 +365,7 @@ class NNIndex : public IndexBase * @param params * @return */ - int knnSearch(const Matrix& queries, + virtual int knnSearch(const Matrix& queries, Matrix& indices, Matrix& dists, size_t knn, @@ -392,7 +392,7 @@ class NNIndex : public IndexBase * @param[in] knn Number of nearest neighbors to return * @param[in] params Search parameters */ - int knnSearch(const Matrix& queries, + virtual int knnSearch(const Matrix& queries, std::vector< std::vector >& indices, std::vector >&
dists, size_t knn, @@ -488,7 +488,7 @@ class NNIndex : public IndexBase * @param[in] params Search parameters * @return Number of neighbors found */ - int radiusSearch(const Matrix& queries, + virtual int radiusSearch(const Matrix& queries, Matrix& indices, Matrix& dists, float radius, @@ -571,7 +571,7 @@ class NNIndex : public IndexBase * @param params * @return */ - int radiusSearch(const Matrix& queries, + virtual int radiusSearch(const Matrix& queries, Matrix& indices, Matrix& dists, float radius, @@ -598,7 +598,7 @@ class NNIndex : public IndexBase * @param[in] params Search parameters * @return Number of neighbors found */ - int radiusSearch(const Matrix& queries, + virtual int radiusSearch(const Matrix& queries, std::vector< std::vector >& indices, std::vector >& dists, float radius, @@ -677,7 +677,7 @@ class NNIndex : public IndexBase * @param params * @return */ - int radiusSearch(const Matrix& queries, + virtual int radiusSearch(const Matrix& queries, std::vector< std::vector >& indices, std::vector >& dists, float radius, From c577292e05621ff54e7586e4435feaa3a2270d6e Mon Sep 17 00:00:00 2001 From: yuyi1005 Date: Tue, 15 May 2018 12:58:24 +0800 Subject: [PATCH 5/7] add external gpu stream support flann::KDTreeCuda3dIndexParams params; params["gpu_stream"] = mystream;//cudaStream_t mystream params["gpu_stream"] = cudaStreamLegacy;//use default stream params["gpu_stream"] = cudaStreamPerThread;//one stream per thread if params["gpu_stream"] is not set, stream will be created each time calling buildIndex() note that there is no internal synchronization, that is, cudaStreamSynchronize() should be called manually if another stream is used to process the matrices (only in case that matrices_in_gpu_ram==true) --- .../flann/algorithms/kdtree_cuda_3d_index.cu | 22 ++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/src/cpp/flann/algorithms/kdtree_cuda_3d_index.cu b/src/cpp/flann/algorithms/kdtree_cuda_3d_index.cu index 
70b91ece..4feb0a5c 100644 --- a/src/cpp/flann/algorithms/kdtree_cuda_3d_index.cu +++ b/src/cpp/flann/algorithms/kdtree_cuda_3d_index.cu @@ -162,6 +162,7 @@ template struct KDTreeCuda3dIndex::GpuHelper { cudaStream_t gpu_stream; + bool use_ext_stream; flann::cuda::device_vector_noinit< cuda::kd_tree_builder_detail::SplitInfo >* gpu_splits_; flann::cuda::device_vector_noinit< int >* gpu_parent_; flann::cuda::device_vector_noinit< int >* gpu_child1_; @@ -169,9 +170,18 @@ struct KDTreeCuda3dIndex::GpuHelper flann::cuda::device_vector_noinit< float4 >* gpu_aabb_max_; flann::cuda::device_vector_noinit* gpu_points_; flann::cuda::device_vector_noinit* gpu_vind_; - GpuHelper() : gpu_splits_(0), gpu_parent_(0), gpu_child1_(0), gpu_aabb_min_(0), gpu_aabb_max_(0), gpu_points_(0), gpu_vind_(0) + GpuHelper(cudaStream_t s = (cudaStream_t)0) : gpu_splits_(0), gpu_parent_(0), gpu_child1_(0), gpu_aabb_min_(0), gpu_aabb_max_(0), gpu_points_(0), gpu_vind_(0) { - cudaStreamCreate(&gpu_stream); + if (s == (cudaStream_t)0) + { + cudaStreamCreate(&gpu_stream); + use_ext_stream = false; + } + else + { + gpu_stream = s; + use_ext_stream = true; + } } ~GpuHelper() { @@ -191,7 +201,8 @@ struct KDTreeCuda3dIndex::GpuHelper delete gpu_points_; gpu_points_=0; - cudaStreamDestroy(gpu_stream); + if (use_ext_stream == false) + cudaStreamDestroy(gpu_stream); } }; @@ -735,8 +746,9 @@ void KDTreeCuda3dIndex::uploadTreeToGpu() // (I would make this a (boost) static assertion, but so far flann seems to avoid boost // assert( sizeof( KdTreeCudaPrivate::GpuNode)==sizeof( Node ) ); delete gpu_helper_; - gpu_helper_ = new GpuHelper; - gpu_helper_->gpu_points_=new flann::cuda::device_vector_noinit(size_); + cudaStream_t s = get_param(index_params_, "gpu_stream", (cudaStream_t)0); + gpu_helper_ = new GpuHelper(s); + gpu_helper_->gpu_points_=new flann::cuda::device_vector_noinit(size_); flann::cuda::device_vector_noinit tmp(size_); if( get_param(index_params_,"input_is_gpu_float4",false) ) { assert( 
dataset_.cols == 3 && dataset_.stride==4*sizeof(float)); From b8e3da518a2e7515316b8fbb6a4ecb762729cbfe Mon Sep 17 00:00:00 2001 From: yuyi1005 Date: Mon, 21 May 2018 18:27:16 +0800 Subject: [PATCH 6/7] Add nvcc compiler flag to specify gpu arch and code -gencode=arch=compute_52,code=sm_52 --- src/cpp/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpp/CMakeLists.txt b/src/cpp/CMakeLists.txt index b44a7351..4ce34924 100644 --- a/src/cpp/CMakeLists.txt +++ b/src/cpp/CMakeLists.txt @@ -15,7 +15,7 @@ endif() set_property(TARGET flann_cpp_s PROPERTY COMPILE_DEFINITIONS FLANN_STATIC FLANN_USE_CUDA) if (BUILD_CUDA_LIB) - SET(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-DFLANN_USE_CUDA") + SET(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-DFLANN_USE_CUDA;-gencode=arch=compute_52,code=sm_52") if(CMAKE_COMPILER_IS_GNUCC) set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-Xcompiler;-fPIC;" ) if (NVCC_COMPILER_BINDIR) From 5fc7f2443f71b096e9ae1e9dc1f847ba215e5297 Mon Sep 17 00:00:00 2001 From: yuyi1005 Date: Mon, 28 May 2018 11:17:46 +0800 Subject: [PATCH 7/7] Change gpu arch and code -gencode=arch=compute_52,code=\"sm_52,compute_52\"; -gencode=arch=compute_61,code=\"sm_61,compute_61\" --- src/cpp/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpp/CMakeLists.txt b/src/cpp/CMakeLists.txt index 4ce34924..ffdd2394 100644 --- a/src/cpp/CMakeLists.txt +++ b/src/cpp/CMakeLists.txt @@ -15,7 +15,7 @@ endif() set_property(TARGET flann_cpp_s PROPERTY COMPILE_DEFINITIONS FLANN_STATIC FLANN_USE_CUDA) if (BUILD_CUDA_LIB) - SET(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-DFLANN_USE_CUDA;-gencode=arch=compute_52,code=sm_52") + SET(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-DFLANN_USE_CUDA;-gencode=arch=compute_52,code=\"sm_52,compute_52\";-gencode=arch=compute_61,code=\"sm_61,compute_61\"") if(CMAKE_COMPILER_IS_GNUCC) set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-Xcompiler;-fPIC;" ) if (NVCC_COMPILER_BINDIR)