From ce8f56a2400725a8d7ae9778e403e6bda01143fd Mon Sep 17 00:00:00 2001 From: yuyi1005 Date: Wed, 9 May 2018 19:34:35 +0800 Subject: [PATCH 1/7] Add files via upload fix bug for knnSearch when knn!=ostride --- src/cpp/flann/algorithms/kdtree_cuda_3d_index.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpp/flann/algorithms/kdtree_cuda_3d_index.cu b/src/cpp/flann/algorithms/kdtree_cuda_3d_index.cu index 8465322e..813d0f94 100644 --- a/src/cpp/flann/algorithms/kdtree_cuda_3d_index.cu +++ b/src/cpp/flann/algorithms/kdtree_cuda_3d_index.cu @@ -430,7 +430,7 @@ void KDTreeCuda3dIndex::knnSearchGpu(const Matrix& querie ); } } - thrust::transform(id, id+knn*queries.rows, id, map_indices(thrust::raw_pointer_cast( &((*gpu_helper_->gpu_vind_))[0]) )); + thrust::transform(id, id+ostride*queries.rows, id, map_indices(thrust::raw_pointer_cast( &((*gpu_helper_->gpu_vind_))[0]) )); } } From c0843304fe3797a9a6c7e7082ecfdb54346d75bc Mon Sep 17 00:00:00 2001 From: yuyi1005 Date: Sun, 13 May 2018 15:51:10 +0800 Subject: [PATCH 2/7] create cuda stream for each index Most NVIDIA GPUs now support kernel concurrency, but FLANN always uses the default stream, which means the GPU cannot process two indices simultaneously when we use multithreading. Creating a CUDA stream for each index fixes this problem. 
--- .../flann/algorithms/kdtree_cuda_3d_index.cu | 148 +++++---- .../flann/algorithms/kdtree_cuda_builder.h | 307 +++++++++++++----- 2 files changed, 297 insertions(+), 158 deletions(-) diff --git a/src/cpp/flann/algorithms/kdtree_cuda_3d_index.cu b/src/cpp/flann/algorithms/kdtree_cuda_3d_index.cu index 813d0f94..70b91ece 100644 --- a/src/cpp/flann/algorithms/kdtree_cuda_3d_index.cu +++ b/src/cpp/flann/algorithms/kdtree_cuda_3d_index.cu @@ -161,14 +161,17 @@ void nearestKernel(const cuda::kd_tree_builder_detail::SplitInfo* splits, template struct KDTreeCuda3dIndex::GpuHelper { - thrust::device_vector< cuda::kd_tree_builder_detail::SplitInfo >* gpu_splits_; - thrust::device_vector< int >* gpu_parent_; - thrust::device_vector< int >* gpu_child1_; - thrust::device_vector< float4 >* gpu_aabb_min_; - thrust::device_vector< float4 >* gpu_aabb_max_; - thrust::device_vector* gpu_points_; - thrust::device_vector* gpu_vind_; - GpuHelper() : gpu_splits_(0), gpu_parent_(0), gpu_child1_(0), gpu_aabb_min_(0), gpu_aabb_max_(0), gpu_points_(0), gpu_vind_(0){ + cudaStream_t gpu_stream; + flann::cuda::device_vector_noinit< cuda::kd_tree_builder_detail::SplitInfo >* gpu_splits_; + flann::cuda::device_vector_noinit< int >* gpu_parent_; + flann::cuda::device_vector_noinit< int >* gpu_child1_; + flann::cuda::device_vector_noinit< float4 >* gpu_aabb_min_; + flann::cuda::device_vector_noinit< float4 >* gpu_aabb_max_; + flann::cuda::device_vector_noinit* gpu_points_; + flann::cuda::device_vector_noinit* gpu_vind_; + GpuHelper() : gpu_splits_(0), gpu_parent_(0), gpu_child1_(0), gpu_aabb_min_(0), gpu_aabb_max_(0), gpu_points_(0), gpu_vind_(0) + { + cudaStreamCreate(&gpu_stream); } ~GpuHelper() { @@ -187,6 +190,8 @@ struct KDTreeCuda3dIndex::GpuHelper delete gpu_points_; gpu_points_=0; + + cudaStreamDestroy(gpu_stream); } }; @@ -302,15 +307,15 @@ void KDTreeCuda3dIndex::knnSearchGpu(const Matrix& querie // std::cout<<" knn:"< queriesDev(istride* queries.rows,0); - thrust::copy( queries.ptr(), 
queries.ptr()+istride*queries.rows, queriesDev.begin() ); - thrust::device_vector distsDev(queries.rows* ostride); - thrust::device_vector indicesDev(queries.rows* ostride); + flann::cuda::device_vector_noinit queriesDev(thrust::cuda::par.on(gpu_helper_->gpu_stream), istride* queries.rows,0); + thrust::detail::two_system_copy(thrust::host, thrust::cuda::par.on(gpu_helper_->gpu_stream), queries.ptr(), queries.ptr()+istride*queries.rows, queriesDev.begin() ); + flann::cuda::device_vector_noinit distsDev(queries.rows* ostride); + flann::cuda::device_vector_noinit indicesDev(queries.rows* ostride); if( knn==1 ) { - KdTreeCudaPrivate::nearestKernel<<>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), + KdTreeCudaPrivate::nearestKernel<<gpu_stream>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])), @@ -322,7 +327,7 @@ void KDTreeCuda3dIndex::knnSearchGpu(const Matrix& querie thrust::raw_pointer_cast(&indicesDev[0]), thrust::raw_pointer_cast(&distsDev[0]), queries.rows, flann::cuda::SingleResultSet(epsError),distance); - // KdTreeCudaPrivate::nearestKernel<<>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_nodes_)[0])), + // KdTreeCudaPrivate::nearestKernel<<gpu_stream>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_nodes_)[0])), // thrust::raw_pointer_cast( &((*gpu_helper_->gpu_points_)[0]) ), // thrust::raw_pointer_cast(&queriesDev[0]), // queries.stride, @@ -333,7 +338,7 @@ void KDTreeCuda3dIndex::knnSearchGpu(const Matrix& querie } else { if( use_heap ) { - KdTreeCudaPrivate::nearestKernel<<>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), + KdTreeCudaPrivate::nearestKernel<<gpu_stream>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])), 
thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])), @@ -348,7 +353,7 @@ void KDTreeCuda3dIndex::knnSearchGpu(const Matrix& querie , distance); } else { - KdTreeCudaPrivate::nearestKernel<<>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), + KdTreeCudaPrivate::nearestKernel<<gpu_stream>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])), @@ -364,9 +369,9 @@ void KDTreeCuda3dIndex::knnSearchGpu(const Matrix& querie ); } } - thrust::copy( distsDev.begin(), distsDev.end(), dists.ptr() ); - thrust::transform(indicesDev.begin(), indicesDev.end(), indicesDev.begin(), map_indices(thrust::raw_pointer_cast( &((*gpu_helper_->gpu_vind_))[0]) )); - thrust::copy( indicesDev.begin(), indicesDev.end(), indices.ptr() ); + thrust::detail::two_system_copy(thrust::cuda::par.on(gpu_helper_->gpu_stream), thrust::host, distsDev.begin(), distsDev.end(), dists.ptr() ); + thrust::transform(thrust::cuda::par.on(gpu_helper_->gpu_stream), indicesDev.begin(), indicesDev.end(), indicesDev.begin(), map_indices(thrust::raw_pointer_cast( &((*gpu_helper_->gpu_vind_))[0]) )); + thrust::detail::two_system_copy(thrust::cuda::par.on(gpu_helper_->gpu_stream), thrust::host, indicesDev.begin(), indicesDev.end(), indices.ptr() ); } else { thrust::device_ptr qd = thrust::device_pointer_cast(queries.ptr()); @@ -376,7 +381,7 @@ void KDTreeCuda3dIndex::knnSearchGpu(const Matrix& querie if( knn==1 ) { - KdTreeCudaPrivate::nearestKernel<<>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), + KdTreeCudaPrivate::nearestKernel<<gpu_stream>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])), 
thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])), @@ -388,7 +393,7 @@ void KDTreeCuda3dIndex::knnSearchGpu(const Matrix& querie id.get(), dd.get(), queries.rows, flann::cuda::SingleResultSet(epsError),distance); - // KdTreeCudaPrivate::nearestKernel<<>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_nodes_)[0])), + // KdTreeCudaPrivate::nearestKernel<<gpu_stream>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_nodes_)[0])), // thrust::raw_pointer_cast( &((*gpu_helper_->gpu_points_)[0]) ), // thrust::raw_pointer_cast(&queriesDev[0]), // queries.stride, @@ -399,7 +404,7 @@ void KDTreeCuda3dIndex::knnSearchGpu(const Matrix& querie } else { if( use_heap ) { - KdTreeCudaPrivate::nearestKernel<<>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), + KdTreeCudaPrivate::nearestKernel<<gpu_stream>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])), @@ -414,7 +419,7 @@ void KDTreeCuda3dIndex::knnSearchGpu(const Matrix& querie , distance); } else { - KdTreeCudaPrivate::nearestKernel<<>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), + KdTreeCudaPrivate::nearestKernel<<gpu_stream>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])), @@ -430,7 +435,7 @@ void KDTreeCuda3dIndex::knnSearchGpu(const Matrix& querie ); } } - thrust::transform(id, id+ostride*queries.rows, id, map_indices(thrust::raw_pointer_cast( &((*gpu_helper_->gpu_vind_))[0]) )); + thrust::transform(thrust::cuda::par.on(gpu_helper_->gpu_stream), id, id+ostride*queries.rows, id, map_indices(thrust::raw_pointer_cast( &((*gpu_helper_->gpu_vind_))[0]) )); } } @@ -450,9 +455,9 @@ int 
KDTreeCuda3dIndex::radiusSearchGpu(const Matrix& que int istride=queries.stride/sizeof(ElementType); - thrust::device_vector queriesDev(istride* queries.rows,0); - thrust::copy( queries.ptr(), queries.ptr()+istride*queries.rows, queriesDev.begin() ); - thrust::device_vector countsDev(queries.rows); + flann::cuda::device_vector_noinit queriesDev(thrust::cuda::par.on(gpu_helper_->gpu_stream), istride* queries.rows,0); + thrust::detail::two_system_copy(thrust::host, thrust::cuda::par.on(gpu_helper_->gpu_stream), queries.ptr(), queries.ptr()+istride*queries.rows, queriesDev.begin() ); + flann::cuda::device_vector_noinit countsDev(queries.rows); typename GpuDistance::type distance; @@ -460,7 +465,7 @@ int KDTreeCuda3dIndex::radiusSearchGpu(const Matrix& que int blocksPerGrid=(queries.rows+threadsPerBlock-1)/threadsPerBlock; - KdTreeCudaPrivate::nearestKernel<<>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), + KdTreeCudaPrivate::nearestKernel<<gpu_stream>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])), @@ -475,7 +480,8 @@ int KDTreeCuda3dIndex::radiusSearchGpu(const Matrix& que distance ); - thrust::host_vector counts_host=countsDev; + thrust::host_vector counts_host(countsDev.size()); + thrust::detail::two_system_copy(thrust::cuda::par.on(gpu_helper_->gpu_stream), thrust::host, countsDev.begin(), countsDev.end(), counts_host.begin()); if( max_neighbors!=0 ) { // we'll need this later, but the exclusive_scan will change the array for( size_t i=0; i::radiusSearchGpu(const Matrix& que } int neighbors_last_elem = countsDev.back(); - thrust::exclusive_scan( countsDev.begin(), countsDev.end(), countsDev.begin() ); + thrust::exclusive_scan(thrust::cuda::par.on(gpu_helper_->gpu_stream), countsDev.begin(), countsDev.end(), countsDev.begin() ); size_t 
total_neighbors=neighbors_last_elem+countsDev.back(); if( max_neighbors==0 ) return total_neighbors; - thrust::device_vector indicesDev(total_neighbors,-1); - thrust::device_vector distsDev(total_neighbors,std::numeric_limits::infinity()); + flann::cuda::device_vector_noinit indicesDev(thrust::cuda::par.on(gpu_helper_->gpu_stream), total_neighbors,-1); + flann::cuda::device_vector_noinit distsDev(thrust::cuda::par.on(gpu_helper_->gpu_stream), total_neighbors,std::numeric_limits::infinity()); if( max_neighbors<0 ) { - KdTreeCudaPrivate::nearestKernel<<>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), + KdTreeCudaPrivate::nearestKernel<<gpu_stream>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])), @@ -517,7 +523,7 @@ int KDTreeCuda3dIndex::radiusSearchGpu(const Matrix& que } else { if( use_heap ) { - KdTreeCudaPrivate::nearestKernel<<>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), + KdTreeCudaPrivate::nearestKernel<<gpu_stream>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])), @@ -531,7 +537,7 @@ int KDTreeCuda3dIndex::radiusSearchGpu(const Matrix& que queries.rows, flann::cuda::RadiusKnnResultSet(radius,max_neighbors, thrust::raw_pointer_cast(&countsDev[0]),sorted), distance); } else { - KdTreeCudaPrivate::nearestKernel<<>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), + KdTreeCudaPrivate::nearestKernel<<gpu_stream>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])), 
thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])), @@ -545,9 +551,11 @@ int KDTreeCuda3dIndex::radiusSearchGpu(const Matrix& que queries.rows, flann::cuda::RadiusKnnResultSet(radius,max_neighbors, thrust::raw_pointer_cast(&countsDev[0]),sorted), distance); } } - thrust::transform(indicesDev.begin(), indicesDev.end(), indicesDev.begin(), map_indices(thrust::raw_pointer_cast( &((*gpu_helper_->gpu_vind_))[0]) )); - thrust::host_vector indices_temp = indicesDev; - thrust::host_vector dists_temp = distsDev; + thrust::transform(thrust::cuda::par.on(gpu_helper_->gpu_stream), indicesDev.begin(), indicesDev.end(), indicesDev.begin(), map_indices(thrust::raw_pointer_cast( &((*gpu_helper_->gpu_vind_))[0]) )); + thrust::host_vector indices_temp(indicesDev.size()); + thrust::detail::two_system_copy(thrust::cuda::par.on(gpu_helper_->gpu_stream), thrust::host, indicesDev.begin(), indicesDev.end(), indices_temp.begin()); + thrust::host_vector dists_temp(distsDev.size()); + thrust::detail::two_system_copy(thrust::cuda::par.on(gpu_helper_->gpu_stream), thrust::host, distsDev.begin(), distsDev.end(), dists_temp.begin()); int buffer_index=0; for( size_t i=0; i::radiusSearchGpu(const Matrix& qu if( max_neighbors<0 ) max_neighbors=indices.cols; if( !matrices_on_gpu ) { - thrust::device_vector queriesDev(istride* queries.rows,0); - thrust::copy( queries.ptr(), queries.ptr()+istride*queries.rows, queriesDev.begin() ); + flann::cuda::device_vector_noinit queriesDev(thrust::cuda::par.on(gpu_helper_->gpu_stream), istride* queries.rows,0); + thrust::detail::two_system_copy(thrust::host, thrust::cuda::par.on(gpu_helper_->gpu_stream), queries.ptr(), queries.ptr()+istride*queries.rows, queriesDev.begin() ); typename GpuDistance::type distance; int threadsPerBlock = 128; int blocksPerGrid=(queries.rows+threadsPerBlock-1)/threadsPerBlock; if( max_neighbors== 0 ) { - thrust::device_vector indicesDev(queries.rows* ostride); - KdTreeCudaPrivate::nearestKernel<<>> 
(thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), + flann::cuda::device_vector_noinit indicesDev(queries.rows* ostride); + KdTreeCudaPrivate::nearestKernel<<gpu_stream>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])), @@ -612,16 +620,16 @@ int KDTreeCuda3dIndex< Distance >::radiusSearchGpu(const Matrix& qu queries.rows, flann::cuda::CountingRadiusResultSet(radius,-1), distance ); - thrust::copy( indicesDev.begin(), indicesDev.end(), indices.ptr() ); - return thrust::reduce(indicesDev.begin(), indicesDev.end() ); + thrust::detail::two_system_copy(thrust::cuda::par.on(gpu_helper_->gpu_stream), thrust::host, indicesDev.begin(), indicesDev.end(), indices.ptr() ); + return thrust::reduce(thrust::cuda::par.on(gpu_helper_->gpu_stream), indicesDev.begin(), indicesDev.end() ); } - thrust::device_vector distsDev(queries.rows* max_neighbors); - thrust::device_vector indicesDev(queries.rows* max_neighbors); + flann::cuda::device_vector_noinit distsDev(queries.rows* max_neighbors); + flann::cuda::device_vector_noinit indicesDev(queries.rows* max_neighbors); if( use_heap ) { - KdTreeCudaPrivate::nearestKernel<<>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), + KdTreeCudaPrivate::nearestKernel<<gpu_stream>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])), @@ -635,7 +643,7 @@ int KDTreeCuda3dIndex< Distance >::radiusSearchGpu(const Matrix& qu queries.rows, flann::cuda::KnnRadiusResultSet(max_neighbors,sorted,epsError, radius), distance); } else { - KdTreeCudaPrivate::nearestKernel<<>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), + 
KdTreeCudaPrivate::nearestKernel<<gpu_stream>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])), @@ -649,11 +657,11 @@ int KDTreeCuda3dIndex< Distance >::radiusSearchGpu(const Matrix& qu queries.rows, flann::cuda::KnnRadiusResultSet(max_neighbors,sorted,epsError, radius), distance); } - thrust::copy( distsDev.begin(), distsDev.end(), dists.ptr() ); - thrust::transform(indicesDev.begin(), indicesDev.end(), indicesDev.begin(), map_indices(thrust::raw_pointer_cast( &((*gpu_helper_->gpu_vind_))[0]) )); - thrust::copy( indicesDev.begin(), indicesDev.end(), indices.ptr() ); + thrust::detail::two_system_copy(thrust::cuda::par.on(gpu_helper_->gpu_stream), thrust::host, distsDev.begin(), distsDev.end(), dists.ptr() ); + thrust::transform(thrust::cuda::par.on(gpu_helper_->gpu_stream), indicesDev.begin(), indicesDev.end(), indicesDev.begin(), map_indices(thrust::raw_pointer_cast( &((*gpu_helper_->gpu_vind_))[0]) )); + thrust::detail::two_system_copy(thrust::cuda::par.on(gpu_helper_->gpu_stream), thrust::host, indicesDev.begin(), indicesDev.end(), indices.ptr() ); - return thrust::count_if(indicesDev.begin(), indicesDev.end(), isNotMinusOne() ); + return thrust::count_if(thrust::cuda::par.on(gpu_helper_->gpu_stream), indicesDev.begin(), indicesDev.end(), isNotMinusOne() ); } else { @@ -665,8 +673,8 @@ int KDTreeCuda3dIndex< Distance >::radiusSearchGpu(const Matrix& qu int blocksPerGrid=(queries.rows+threadsPerBlock-1)/threadsPerBlock; if( max_neighbors== 0 ) { - thrust::device_vector indicesDev(queries.rows* indices.stride); - KdTreeCudaPrivate::nearestKernel<<>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), + flann::cuda::device_vector_noinit indicesDev(queries.rows* indices.stride); + KdTreeCudaPrivate::nearestKernel<<gpu_stream>>> 
(thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])), @@ -680,12 +688,12 @@ int KDTreeCuda3dIndex< Distance >::radiusSearchGpu(const Matrix& qu queries.rows, flann::cuda::CountingRadiusResultSet(radius,-1), distance ); - thrust::copy( indicesDev.begin(), indicesDev.end(), indices.ptr() ); - return thrust::reduce(indicesDev.begin(), indicesDev.end() ); + thrust::copy(thrust::cuda::par.on(gpu_helper_->gpu_stream), indicesDev.begin(), indicesDev.end(), indices.ptr() ); + return thrust::reduce(thrust::cuda::par.on(gpu_helper_->gpu_stream), indicesDev.begin(), indicesDev.end() ); } if( use_heap ) { - KdTreeCudaPrivate::nearestKernel<<>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), + KdTreeCudaPrivate::nearestKernel<<gpu_stream>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])), @@ -699,7 +707,7 @@ int KDTreeCuda3dIndex< Distance >::radiusSearchGpu(const Matrix& qu queries.rows, flann::cuda::KnnRadiusResultSet(max_neighbors,sorted,epsError, radius), distance); } else { - KdTreeCudaPrivate::nearestKernel<<>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), + KdTreeCudaPrivate::nearestKernel<<gpu_stream>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])), @@ -713,9 +721,9 @@ int KDTreeCuda3dIndex< Distance >::radiusSearchGpu(const Matrix& qu queries.rows, flann::cuda::KnnRadiusResultSet(max_neighbors,sorted,epsError, radius), distance); } - thrust::transform(id, 
id+max_neighbors*queries.rows, id, map_indices(thrust::raw_pointer_cast( &((*gpu_helper_->gpu_vind_))[0]) )); + thrust::transform(thrust::cuda::par.on(gpu_helper_->gpu_stream), id, id+max_neighbors*queries.rows, id, map_indices(thrust::raw_pointer_cast( &((*gpu_helper_->gpu_vind_))[0]) )); - return thrust::count_if(id, id+max_neighbors*queries.rows, isNotMinusOne() ); + return thrust::count_if(thrust::cuda::par.on(gpu_helper_->gpu_stream), id, id+max_neighbors*queries.rows, isNotMinusOne() ); } } @@ -728,11 +736,11 @@ void KDTreeCuda3dIndex::uploadTreeToGpu() // assert( sizeof( KdTreeCudaPrivate::GpuNode)==sizeof( Node ) ); delete gpu_helper_; gpu_helper_ = new GpuHelper; - gpu_helper_->gpu_points_=new thrust::device_vector(size_); - thrust::device_vector tmp(size_); + gpu_helper_->gpu_points_=new flann::cuda::device_vector_noinit(size_); + flann::cuda::device_vector_noinit tmp(size_); if( get_param(index_params_,"input_is_gpu_float4",false) ) { assert( dataset_.cols == 3 && dataset_.stride==4*sizeof(float)); - thrust::copy( thrust::device_pointer_cast((float4*)dataset_.ptr()),thrust::device_pointer_cast((float4*)(dataset_.ptr()))+size_,tmp.begin()); + thrust::copy(thrust::cuda::par.on(gpu_helper_->gpu_stream), thrust::device_pointer_cast((float4*)dataset_.ptr()),thrust::device_pointer_cast((float4*)(dataset_.ptr()))+size_,tmp.begin()); } else { @@ -750,24 +758,24 @@ void KDTreeCuda3dIndex::uploadTreeToGpu() data_[i][j] = 0; } } - thrust::copy((float4*)data_.ptr(),(float4*)(data_.ptr())+size_,tmp.begin()); + thrust::detail::two_system_copy(thrust::host, thrust::cuda::par.on(gpu_helper_->gpu_stream), (float4*)data_.ptr(),(float4*)(data_.ptr())+size_,tmp.begin()); } - CudaKdTreeBuilder builder( tmp, leaf_max_size_ ); + CudaKdTreeBuilder builder( tmp, leaf_max_size_, gpu_helper_->gpu_stream ); builder.buildTree(); - gpu_helper_->gpu_splits_ = builder.splits_; - gpu_helper_->gpu_aabb_min_ = builder.aabb_min_; + gpu_helper_->gpu_splits_ = builder.splits_; + 
gpu_helper_->gpu_aabb_min_ = builder.aabb_min_; gpu_helper_->gpu_aabb_max_ = builder.aabb_max_; gpu_helper_->gpu_child1_ = builder.child1_; gpu_helper_->gpu_parent_=builder.parent_; gpu_helper_->gpu_vind_=builder.index_x_; - thrust::gather( builder.index_x_->begin(), builder.index_x_->end(), tmp.begin(), gpu_helper_->gpu_points_->begin()); + thrust::gather(thrust::cuda::par.on(gpu_helper_->gpu_stream), builder.index_x_->begin(), builder.index_x_->end(), tmp.begin(), gpu_helper_->gpu_points_->begin()); - // gpu_helper_->gpu_nodes_=new thrust::device_vector(node_count_); + // gpu_helper_->gpu_nodes_=new flann::cuda::device_vector_noinit(node_count_); - // gpu_helper_->gpu_vind_=new thrust::device_vector(size_); + // gpu_helper_->gpu_vind_=new flann::cuda::device_vector_noinit(size_); // thrust::copy( (KdTreeCudaPrivate::GpuNode*)&(tree_[0]), ((KdTreeCudaPrivate::GpuNode*)&(tree_[0]))+tree_.size(), gpu_helper_->gpu_nodes_->begin()); // thrust::copy(vind_.begin(),vind_.end(),gpu_helper_->gpu_vind_->begin()); diff --git a/src/cpp/flann/algorithms/kdtree_cuda_builder.h b/src/cpp/flann/algorithms/kdtree_cuda_builder.h index 8bb4e838..24ddf0db 100644 --- a/src/cpp/flann/algorithms/kdtree_cuda_builder.h +++ b/src/cpp/flann/algorithms/kdtree_cuda_builder.h @@ -35,6 +35,7 @@ #include #include #include +#include #include #include @@ -43,7 +44,7 @@ namespace flann { // template< typename T > -// void print_vector( const thrust::device_vector& v ) +// void print_vector( const flann::cuda::device_vector_noinit& v ) // { // for( int i=0; i< v.size(); i++ ) // { @@ -52,7 +53,7 @@ namespace flann // } // // template< typename T1, typename T2 > -// void print_vector( const thrust::device_vector& v1, const thrust::device_vector& v2 ) +// void print_vector( const flann::cuda::device_vector_noinit& v1, const flann::cuda::device_vector_noinit& v2 ) // { // for( int i=0; i< v1.size(); i++ ) // { @@ -61,7 +62,7 @@ namespace flann // } // // template< typename T1, typename T2, typename T3 > 
-// void print_vector( const thrust::device_vector& v1, const thrust::device_vector& v2, const thrust::device_vector& v3 ) +// void print_vector( const flann::cuda::device_vector_noinit& v1, const flann::cuda::device_vector_noinit& v2, const flann::cuda::device_vector_noinit& v3 ) // { // for( int i=0; i< v1.size(); i++ ) // { @@ -70,7 +71,7 @@ namespace flann // } // // template< typename T > -// void print_vector_by_index( const thrust::device_vector& v,const thrust::device_vector& ind ) +// void print_vector_by_index( const flann::cuda::device_vector_noinit& v,const flann::cuda::device_vector_noinit& ind ) // { // for( int i=0; i< v.size(); i++ ) // { @@ -95,6 +96,108 @@ namespace flann // } namespace cuda { + // flann::cuda::device_vector_noinit is used in place of thrust::device_vector, + // as thrust::device_vector always uses the default stream, and always performs a fill on construction + template + class device_vector_noinit + { + private: + thrust::device_ptr m_ptr; + size_t m_size; + + public: + device_vector_noinit() + { + m_size = 0; + } + + ~device_vector_noinit() + { + if (m_size) + thrust::device_free(m_ptr); + m_size = 0; + } + + device_vector_noinit(size_t s) + { + m_size = s; + if (s) + m_ptr = thrust::device_malloc(s); + } + + template + device_vector_noinit(const thrust::detail::execution_policy_base &exec, size_t s, T t) + { + m_size = s; + if (s) + { + m_ptr = thrust::device_malloc(s); + thrust::fill(exec, m_ptr, m_ptr + m_size, t); + } + } + + size_t size() const + { + return m_size; + } + + void resize(size_t s) + { + if (m_size) + thrust::device_free(m_ptr); + m_size = s; + if (s) + m_ptr = thrust::device_malloc(s); + } + + template + void append(const thrust::detail::execution_policy_base &exec, size_t n, T t) + { + if (n == 0) + return; + + if (m_size) + { + thrust::device_ptr new_ptr = thrust::device_malloc(m_size + n); + + thrust::copy(exec, m_ptr, m_ptr + m_size, new_ptr); + thrust::fill(exec, new_ptr + m_size, new_ptr + 
m_size + n, t); + + thrust::device_free(m_ptr); + m_ptr = new_ptr; + + m_size = m_size + n; + } + else + { + resize(n); + thrust::fill(exec, m_ptr, m_ptr + n, t); + m_size = n; + } + } + + thrust::device_ptr begin() const + { + return m_ptr; + } + + thrust::device_ptr end() const + { + return m_ptr + m_size; + } + + auto back() const + { + return m_ptr[m_size - 1]; + } + + auto operator [] (const size_t i) const + { + return m_ptr[i]; + } + }; + + namespace kd_tree_builder_detail { //! normal node: contains the split dimension and value @@ -401,48 +504,52 @@ std::ostream& operator <<(std::ostream& stream, const cuda::kd_tree_builder_deta class CudaKdTreeBuilder { public: - CudaKdTreeBuilder( const thrust::device_vector& points, int max_leaf_size ) : /*out_of_space_(1,0),node_count_(1,1),*/ max_leaf_size_(max_leaf_size) + CudaKdTreeBuilder( const flann::cuda::device_vector_noinit& points, int max_leaf_size, cudaStream_t stream ) : /*out_of_space_(1,0),node_count_(1,1),*/ max_leaf_size_(max_leaf_size) { points_=&points; + gpu_stream = stream; int prealloc = points.size()/max_leaf_size_*16; - allocation_info_.resize(3); - allocation_info_[NodeCount]=1; - allocation_info_[NodesAllocated]=prealloc; - allocation_info_[OutOfSpace]=0; + thrust::host_vector alloc_info(3); + alloc_info[0] = 1; + alloc_info[1] = prealloc; + alloc_info[2] = 0; + allocation_info_.resize(3); + thrust::detail::two_system_copy(thrust::host, thrust::cuda::par.on(gpu_stream), alloc_info.begin(), alloc_info.end(), allocation_info_.begin()); // std::cout<size()<(prealloc,-1); - parent_=new thrust::device_vector(prealloc,-1); - cuda::kd_tree_builder_detail::SplitInfo s; + child1_ = new flann::cuda::device_vector_noinit(thrust::cuda::par.on(gpu_stream), prealloc, -1); + parent_ = new flann::cuda::device_vector_noinit(thrust::cuda::par.on(gpu_stream), prealloc, -1); + cuda::kd_tree_builder_detail::SplitInfo s; s.left=0; s.right=0; - splits_=new thrust::device_vector(prealloc,s); - s.right=points.size(); 
- (*splits_)[0]=s; + splits_ = new flann::cuda::device_vector_noinit(thrust::cuda::par.on(gpu_stream), prealloc, s); + s.right=points.size(); + //(*splits_)[0]=s; + thrust::detail::two_system_copy(thrust::host, thrust::cuda::par.on(gpu_stream), &s, &s + 1, splits_->begin()); - aabb_min_=new thrust::device_vector(prealloc); - aabb_max_=new thrust::device_vector(prealloc); + aabb_min_=new flann::cuda::device_vector_noinit(prealloc); + aabb_max_=new flann::cuda::device_vector_noinit(prealloc); - index_x_=new thrust::device_vector(points_->size()); - index_y_=new thrust::device_vector(points_->size()); - index_z_=new thrust::device_vector(points_->size()); + index_x_=new flann::cuda::device_vector_noinit(points_->size()); + index_y_=new flann::cuda::device_vector_noinit(points_->size()); + index_z_=new flann::cuda::device_vector_noinit(points_->size()); - owners_x_=new thrust::device_vector(points_->size(),0); - owners_y_=new thrust::device_vector(points_->size(),0); - owners_z_=new thrust::device_vector(points_->size(),0); + owners_x_=new flann::cuda::device_vector_noinit(thrust::cuda::par.on(gpu_stream), points_->size(),0); + owners_y_=new flann::cuda::device_vector_noinit(thrust::cuda::par.on(gpu_stream), points_->size(),0); + owners_z_=new flann::cuda::device_vector_noinit(thrust::cuda::par.on(gpu_stream), points_->size(),0); - leftright_x_ = new thrust::device_vector(points_->size(),0); - leftright_y_ = new thrust::device_vector(points_->size(),0); - leftright_z_ = new thrust::device_vector(points_->size(),0); + leftright_x_ = new flann::cuda::device_vector_noinit(thrust::cuda::par.on(gpu_stream), points_->size(),0); + leftright_y_ = new flann::cuda::device_vector_noinit(thrust::cuda::par.on(gpu_stream), points_->size(),0); + leftright_z_ = new flann::cuda::device_vector_noinit(thrust::cuda::par.on(gpu_stream), points_->size(),0); - tmp_index_=new thrust::device_vector(points_->size()); - tmp_owners_=new thrust::device_vector(points_->size()); - tmp_misc_=new 
thrust::device_vector(points_->size()); + tmp_index_=new flann::cuda::device_vector_noinit(points_->size()); + tmp_owners_=new flann::cuda::device_vector_noinit(points_->size()); + tmp_misc_=new flann::cuda::device_vector_noinit(points_->size()); - points_x_=new thrust::device_vector(points_->size()); - points_y_=new thrust::device_vector(points_->size()); - points_z_=new thrust::device_vector(points_->size()); + points_x_=new flann::cuda::device_vector_noinit(points_->size()); + points_y_=new flann::cuda::device_vector_noinit(points_->size()); + points_z_=new flann::cuda::device_vector_noinit(points_->size()); delete_node_info_=false; } @@ -455,7 +562,7 @@ class CudaKdTreeBuilder delete aabb_min_; delete aabb_max_; delete index_x_; - } + } delete index_y_; delete index_z_; @@ -486,28 +593,49 @@ class CudaKdTreeBuilder // std::cout<<"buildTree()"<begin(), points_->end(), thrust::make_zip_iterator(thrust::make_tuple(points_x_->begin(), points_y_->begin(),points_z_->begin()) ), cuda::kd_tree_builder_detail::pointxyz_to_px_py_pz() ); + thrust::transform(thrust::cuda::par.on(gpu_stream), points_->begin(), points_->end(), thrust::make_zip_iterator(thrust::make_tuple(points_x_->begin(), points_y_->begin(),points_z_->begin()) ), cuda::kd_tree_builder_detail::pointxyz_to_px_py_pz() ); thrust::counting_iterator it(0); - thrust::copy( it, it+points_->size(), index_x_->begin() ); + thrust::copy(thrust::cuda::par.on(gpu_stream), it, it+points_->size(), index_x_->begin() ); - thrust::copy( index_x_->begin(), index_x_->end(), index_y_->begin() ); - thrust::copy( index_x_->begin(), index_x_->end(), index_z_->begin() ); + thrust::copy(thrust::cuda::par.on(gpu_stream), index_x_->begin(), index_x_->end(), index_y_->begin() ); + thrust::copy(thrust::cuda::par.on(gpu_stream), index_x_->begin(), index_x_->end(), index_z_->begin() ); - thrust::device_vector tmpv(points_->size()); + flann::cuda::device_vector_noinit tmpv(points_->size()); // create sorted index list -> can be used to 
compute AABBs in O(1) - thrust::copy(points_x_->begin(), points_x_->end(), tmpv.begin()); - thrust::sort_by_key( tmpv.begin(), tmpv.end(), index_x_->begin() ); - thrust::copy(points_y_->begin(), points_y_->end(), tmpv.begin()); - thrust::sort_by_key( tmpv.begin(), tmpv.end(), index_y_->begin() ); - thrust::copy(points_z_->begin(), points_z_->end(), tmpv.begin()); - thrust::sort_by_key( tmpv.begin(), tmpv.end(), index_z_->begin() ); - - - (*aabb_min_)[0]=make_float4((*points_x_)[(*index_x_)[0]],(*points_y_)[(*index_y_)[0]],(*points_z_)[(*index_z_)[0]],0); - - (*aabb_max_)[0]=make_float4((*points_x_)[(*index_x_)[points_->size()-1]],(*points_y_)[(*index_y_)[points_->size()-1]],(*points_z_)[(*index_z_)[points_->size()-1]],0); + thrust::copy(thrust::cuda::par.on(gpu_stream), points_x_->begin(), points_x_->end(), tmpv.begin()); + thrust::sort_by_key(thrust::cuda::par.on(gpu_stream), tmpv.begin(), tmpv.end(), index_x_->begin() ); + thrust::copy(thrust::cuda::par.on(gpu_stream), points_y_->begin(), points_y_->end(), tmpv.begin()); + thrust::sort_by_key(thrust::cuda::par.on(gpu_stream), tmpv.begin(), tmpv.end(), index_y_->begin() ); + thrust::copy(thrust::cuda::par.on(gpu_stream), points_z_->begin(), points_z_->end(), tmpv.begin()); + thrust::sort_by_key(thrust::cuda::par.on(gpu_stream), tmpv.begin(), tmpv.end(), index_z_->begin() ); + + int idxx, idxy, idxz; + float xx, yy, zz; + float4 xyzw; + + thrust::detail::two_system_copy(thrust::cuda::par.on(gpu_stream), thrust::host, index_x_->begin(), index_x_->begin() + 1, &idxx); + thrust::detail::two_system_copy(thrust::cuda::par.on(gpu_stream), thrust::host, index_y_->begin(), index_y_->begin() + 1, &idxy); + thrust::detail::two_system_copy(thrust::cuda::par.on(gpu_stream), thrust::host, index_z_->begin(), index_z_->begin() + 1, &idxz); + thrust::detail::two_system_copy(thrust::cuda::par.on(gpu_stream), thrust::host, points_x_->begin() + idxx, points_x_->begin() + idxx + 1, &xx); + 
thrust::detail::two_system_copy(thrust::cuda::par.on(gpu_stream), thrust::host, points_y_->begin() + idxy, points_y_->begin() + idxy + 1, &yy); + thrust::detail::two_system_copy(thrust::cuda::par.on(gpu_stream), thrust::host, points_z_->begin() + idxz, points_z_->begin() + idxz + 1, &zz); + xyzw = make_float4(xx, yy, zz, 0); + thrust::detail::two_system_copy(thrust::host, thrust::cuda::par.on(gpu_stream), &xyzw, &xyzw + 1, aabb_min_->begin()); + + thrust::detail::two_system_copy(thrust::cuda::par.on(gpu_stream), thrust::host, index_x_->end() - 1, index_x_->end(), &idxx); + thrust::detail::two_system_copy(thrust::cuda::par.on(gpu_stream), thrust::host, index_y_->end() - 1, index_y_->end(), &idxy); + thrust::detail::two_system_copy(thrust::cuda::par.on(gpu_stream), thrust::host, index_z_->end() - 1, index_z_->end(), &idxz); + thrust::detail::two_system_copy(thrust::cuda::par.on(gpu_stream), thrust::host, points_x_->begin() + idxx, points_x_->begin() + idxx + 1, &xx); + thrust::detail::two_system_copy(thrust::cuda::par.on(gpu_stream), thrust::host, points_y_->begin() + idxy, points_y_->begin() + idxy + 1, &yy); + thrust::detail::two_system_copy(thrust::cuda::par.on(gpu_stream), thrust::host, points_z_->begin() + idxz, points_z_->begin() + idxz + 1, &zz); + xyzw = make_float4(xx, yy, zz, 0); + thrust::detail::two_system_copy(thrust::host, thrust::cuda::par.on(gpu_stream), &xyzw, &xyzw + 1, aabb_max_->begin()); + + //(*aabb_min_)[0]=make_float4((*points_x_)[(*index_x_)[0]],(*points_y_)[(*index_y_)[0]],(*points_z_)[(*index_z_)[0]],0); + + //(*aabb_max_)[0]=make_float4((*points_x_)[(*index_x_)[points_->size()-1]],(*points_y_)[(*index_y_)[points_->size()-1]],(*points_z_)[(*index_z_)[points_->size()-1]],0); #ifdef PRINT_DEBUG_TIMING cudaDeviceSynchronize(); std::cout<<" initial stuff:"< cit(0); - thrust::for_each( thrust::make_zip_iterator(thrust::make_tuple( parent_->begin(), child1_->begin(), splits_->begin(), aabb_min_->begin(), aabb_max_->begin(), cit )), + 
thrust::for_each(thrust::cuda::par.on(gpu_stream), thrust::make_zip_iterator(thrust::make_tuple( parent_->begin(), child1_->begin(), splits_->begin(), aabb_min_->begin(), aabb_max_->begin(), cit )), thrust::make_zip_iterator(thrust::make_tuple( parent_->begin()+last_node_count, child1_->begin()+last_node_count,splits_->begin()+last_node_count, aabb_min_->begin()+last_node_count, aabb_max_->begin()+last_node_count,cit+last_node_count )), sn ); // copy allocation info to host - thrust::host_vector alloc_info = allocation_info_; + thrust::host_vector alloc_info(allocation_info_.size()); + thrust::detail::two_system_copy(thrust::cuda::par.on(gpu_stream), thrust::host, allocation_info_.begin(), allocation_info_.end(), alloc_info.begin()); if( last_node_count == alloc_info[NodeCount] ) { // no more nodes were split -> done break; @@ -542,7 +671,8 @@ class CudaKdTreeBuilder resize_node_vectors(alloc_info[NodesAllocated]*2); alloc_info[OutOfSpace]=0; alloc_info[NodesAllocated]*=2; - allocation_info_=alloc_info; + thrust::detail::two_system_copy(thrust::host, thrust::cuda::par.on(gpu_stream), alloc_info.begin(), alloc_info.end(), allocation_info_.begin()); + //allocation_info_=alloc_info; } #ifdef PRINT_DEBUG_TIMING cudaDeviceSynchronize(); @@ -564,7 +694,7 @@ class CudaKdTreeBuilder thrust::raw_pointer_cast(&(*leftright_z_)[0]) ); thrust::counting_iterator ci0(0); - thrust::for_each( thrust::make_zip_iterator( thrust::make_tuple( ci0, index_x_->begin(), index_y_->begin(), index_z_->begin()) ), + thrust::for_each(thrust::cuda::par.on(gpu_stream), thrust::make_zip_iterator( thrust::make_tuple( ci0, index_x_->begin(), index_y_->begin(), index_z_->begin()) ), thrust::make_zip_iterator( thrust::make_tuple( ci0+points_->size(), index_x_->end(), index_y_->end(), index_z_->end()) ),sno ); #ifdef PRINT_DEBUG_TIMING @@ -607,17 +737,17 @@ class CudaKdTreeBuilder //! 
takes the partitioned nodes, and sets the left-/right info of leaf nodes, as well as the AABBs void - update_leftright_and_aabb( const thrust::device_vector& x, const thrust::device_vector& y,const thrust::device_vector& z, - const thrust::device_vector& ix, const thrust::device_vector& iy,const thrust::device_vector& iz, - const thrust::device_vector& owners, - thrust::device_vector& splits, thrust::device_vector& aabbMin,thrust::device_vector& aabbMax) + update_leftright_and_aabb( const flann::cuda::device_vector_noinit& x, const flann::cuda::device_vector_noinit& y,const flann::cuda::device_vector_noinit& z, + const flann::cuda::device_vector_noinit& ix, const flann::cuda::device_vector_noinit& iy,const flann::cuda::device_vector_noinit& iz, + const flann::cuda::device_vector_noinit& owners, + flann::cuda::device_vector_noinit& splits, flann::cuda::device_vector_noinit& aabbMin,flann::cuda::device_vector_noinit& aabbMax) { - thrust::device_vector* labelsUnique=tmp_owners_; - thrust::device_vector* countsUnique=tmp_index_; + flann::cuda::device_vector_noinit* labelsUnique=tmp_owners_; + flann::cuda::device_vector_noinit* countsUnique=tmp_index_; // assume: points of each node are continuous in the array // find which nodes are here, and where each node's points begin and end - int unique_labels = thrust::unique_by_key_copy( owners.begin(), owners.end(), thrust::counting_iterator(0), labelsUnique->begin(), countsUnique->begin()).first - labelsUnique->begin(); + int unique_labels = thrust::unique_by_key_copy(thrust::cuda::par.on(gpu_stream), owners.begin(), owners.end(), thrust::counting_iterator(0), labelsUnique->begin(), countsUnique->begin()).first - labelsUnique->begin(); // update the info cuda::kd_tree_builder_detail::SetLeftAndRightAndAABB s; @@ -636,7 +766,7 @@ class CudaKdTreeBuilder s.aabbMax=thrust::raw_pointer_cast(&aabbMax[0]); thrust::counting_iterator it(0); - thrust::for_each(it, it+unique_labels, s); + 
thrust::for_each(thrust::cuda::par.on(gpu_stream), it, it+unique_labels, s); } //! Separates the left and right children of each node into continuous parts of the array. @@ -646,12 +776,12 @@ class CudaKdTreeBuilder //! for all the single nodes. //! (basically the split primitive according to sengupta et al) //! about twice as fast as thrust::partition - void separate_left_and_right_children( thrust::device_vector& key_in, thrust::device_vector& val_in, thrust::device_vector& key_out, thrust::device_vector& val_out, thrust::device_vector& left_right_marks, bool scatter_val_out=true ) + void separate_left_and_right_children( flann::cuda::device_vector_noinit& key_in, flann::cuda::device_vector_noinit& val_in, flann::cuda::device_vector_noinit& key_out, flann::cuda::device_vector_noinit& val_out, flann::cuda::device_vector_noinit& left_right_marks, bool scatter_val_out=true ) { - thrust::device_vector* f_tmp = &val_out; - thrust::device_vector* addr_tmp = tmp_misc_; + flann::cuda::device_vector_noinit* f_tmp = &val_out; + flann::cuda::device_vector_noinit* addr_tmp = tmp_misc_; - thrust::exclusive_scan( /*thrust::make_transform_iterator(*/ left_right_marks.begin() /*,cuda::kd_tree_builder_detail::IsEven*/ + thrust::exclusive_scan(thrust::cuda::par.on(gpu_stream), /*thrust::make_transform_iterator(*/ left_right_marks.begin() /*,cuda::kd_tree_builder_detail::IsEven*/ /*())*/, /*thrust::make_transform_iterator(*/ left_right_marks.end() /*,cuda::kd_tree_builder_detail::IsEven*/ /*())*/, f_tmp->begin() ); cuda::kd_tree_builder_detail::set_addr3 sa; @@ -659,10 +789,10 @@ class CudaKdTreeBuilder sa.f_=thrust::raw_pointer_cast(&(*f_tmp)[0]); sa.npoints_=key_in.size(); thrust::counting_iterator it(0); - thrust::transform(it, it+val_in.size(), addr_tmp->begin(), sa); + thrust::transform(thrust::cuda::par.on(gpu_stream), it, it+val_in.size(), addr_tmp->begin(), sa); - thrust::scatter(key_in.begin(), key_in.end(), addr_tmp->begin(), key_out.begin()); - if( scatter_val_out ) 
thrust::scatter(val_in.begin(), val_in.end(), addr_tmp->begin(), val_out.begin()); + thrust::scatter(thrust::cuda::par.on(gpu_stream), key_in.begin(), key_in.end(), addr_tmp->begin(), key_out.begin()); + if( scatter_val_out ) thrust::scatter(thrust::cuda::par.on(gpu_stream), val_in.begin(), val_in.end(), addr_tmp->begin(), val_out.begin()); } //! allocates additional space in all the node-related vectors. @@ -670,33 +800,34 @@ class CudaKdTreeBuilder void resize_node_vectors( size_t new_size ) { size_t add = new_size - child1_->size(); - child1_->insert(child1_->end(), add, -1); - parent_->insert(parent_->end(), add, -1); + child1_->append(thrust::cuda::par.on(gpu_stream), add, -1); + parent_->append(thrust::cuda::par.on(gpu_stream), add, -1); cuda::kd_tree_builder_detail::SplitInfo s; s.left=0; s.right=0; - splits_->insert(splits_->end(), add, s); + splits_->append(thrust::cuda::par.on(gpu_stream), add, s); float4 f; - aabb_min_->insert(aabb_min_->end(), add, f); - aabb_max_->insert(aabb_max_->end(), add, f); + aabb_min_->append(thrust::cuda::par.on(gpu_stream), add, f); + aabb_max_->append(thrust::cuda::par.on(gpu_stream), add, f); } + cudaStream_t gpu_stream; - const thrust::device_vector* points_; + const flann::cuda::device_vector_noinit* points_; // tree data, those are stored per-node //! left child of each node. (right child==left child + 1, due to the alloc mechanism) //! child1_[node]==-1 if node is a leaf node - thrust::device_vector* child1_; + flann::cuda::device_vector_noinit* child1_; //! parent node of each node - thrust::device_vector* parent_; + flann::cuda::device_vector_noinit* parent_; //! split info (dim/value or left/right pointers) - thrust::device_vector* splits_; + flann::cuda::device_vector_noinit* splits_; //! min aabb value of each node - thrust::device_vector* aabb_min_; + flann::cuda::device_vector_noinit* aabb_min_; //! 
max aabb value of each node - thrust::device_vector* aabb_max_; + flann::cuda::device_vector_noinit* aabb_max_; enum AllocationInfo { @@ -705,22 +836,22 @@ class CudaKdTreeBuilder OutOfSpace=2 }; // those were put into a single vector of 3 elements so that only one mem transfer will be needed for all three of them - // thrust::device_vector out_of_space_; - // thrust::device_vector node_count_; - // thrust::device_vector nodes_allocated_; - thrust::device_vector allocation_info_; + // flann::cuda::device_vector_noinit out_of_space_; + // flann::cuda::device_vector_noinit node_count_; + // flann::cuda::device_vector_noinit nodes_allocated_; + flann::cuda::device_vector_noinit allocation_info_; int max_leaf_size_; // coordinate values of the points - thrust::device_vector* points_x_, * points_y_, * points_z_; + flann::cuda::device_vector_noinit* points_x_, * points_y_, * points_z_; // indices - thrust::device_vector* index_x_, * index_y_, * index_z_; + flann::cuda::device_vector_noinit* index_x_, * index_y_, * index_z_; // owner node - thrust::device_vector* owners_x_, * owners_y_, * owners_z_; + flann::cuda::device_vector_noinit* owners_x_, * owners_y_, * owners_z_; // contains info about whether a point was partitioned to the left or right child after a split - thrust::device_vector* leftright_x_, * leftright_y_, * leftright_z_; - thrust::device_vector* tmp_index_, * tmp_owners_, * tmp_misc_; + flann::cuda::device_vector_noinit* leftright_x_, * leftright_y_, * leftright_z_; + flann::cuda::device_vector_noinit* tmp_index_, * tmp_owners_, * tmp_misc_; bool delete_node_info_; }; From 89b9fbb39922175eb7d53ecde4fb514150fa4545 Mon Sep 17 00:00:00 2001 From: yuyi1005 Date: Sun, 13 May 2018 16:02:53 +0800 Subject: [PATCH 3/7] create cuda stream for each index fix errors in comment --- src/cpp/flann/algorithms/kdtree_cuda_builder.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cpp/flann/algorithms/kdtree_cuda_builder.h 
b/src/cpp/flann/algorithms/kdtree_cuda_builder.h index 24ddf0db..55f381c1 100644 --- a/src/cpp/flann/algorithms/kdtree_cuda_builder.h +++ b/src/cpp/flann/algorithms/kdtree_cuda_builder.h @@ -96,8 +96,8 @@ namespace flann // } namespace cuda { - // flann::cuda::device_vector_noinit is used to take place of flann::cuda::device_vector_noinit - // as flann::cuda::device_vector_noinit always uses default stream, and always contains a fill + // flann::cuda::device_vector_noinit is used to take place of thrust::device_vector + // as thrust::device_vector always uses default stream, and always contains a fill template class device_vector_noinit { From 3111a42fc5c12b3aa3a05e3211e0558fa8deb2ac Mon Sep 17 00:00:00 2001 From: yuyi1005 Date: Sun, 13 May 2018 20:10:13 +0800 Subject: [PATCH 4/7] add virtual keyword knnSearch should be overridden. If not, an Index initialized with KDTreeCuda3dIndexParams will call the default knnSearch, not the gpu one --- src/cpp/flann/algorithms/nn_index.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/cpp/flann/algorithms/nn_index.h b/src/cpp/flann/algorithms/nn_index.h index 245d258c..844bc97b 100644 --- a/src/cpp/flann/algorithms/nn_index.h +++ b/src/cpp/flann/algorithms/nn_index.h @@ -365,7 +365,7 @@ class NNIndex : public IndexBase * @param params * @return */ - int knnSearch(const Matrix& queries, + virtual int knnSearch(const Matrix& queries, Matrix& indices, Matrix& dists, size_t knn, @@ -392,7 +392,7 @@ class NNIndex : public IndexBase * @param[in] knn Number of nearest neighbors to return * @param[in] params Search parameters */ - int knnSearch(const Matrix& queries, + virtual int knnSearch(const Matrix& queries, std::vector< std::vector >& indices, std::vector >&
dists, size_t knn, @@ -488,7 +488,7 @@ class NNIndex : public IndexBase * @param[in] params Search parameters * @return Number of neighbors found */ - int radiusSearch(const Matrix& queries, + virtual int radiusSearch(const Matrix& queries, Matrix& indices, Matrix& dists, float radius, @@ -571,7 +571,7 @@ class NNIndex : public IndexBase * @param params * @return */ - int radiusSearch(const Matrix& queries, + virtual int radiusSearch(const Matrix& queries, Matrix& indices, Matrix& dists, float radius, @@ -598,7 +598,7 @@ class NNIndex : public IndexBase * @param[in] params Search parameters * @return Number of neighbors found */ - int radiusSearch(const Matrix& queries, + virtual int radiusSearch(const Matrix& queries, std::vector< std::vector >& indices, std::vector >& dists, float radius, @@ -677,7 +677,7 @@ class NNIndex : public IndexBase * @param params * @return */ - int radiusSearch(const Matrix& queries, + virtual int radiusSearch(const Matrix& queries, std::vector< std::vector >& indices, std::vector >& dists, float radius, From c577292e05621ff54e7586e4435feaa3a2270d6e Mon Sep 17 00:00:00 2001 From: yuyi1005 Date: Tue, 15 May 2018 12:58:24 +0800 Subject: [PATCH 5/7] add external gpu stream support flann::KDTreeCuda3dIndexParams params; params["gpu_stream"] = mystream;//cudaStream_t mystream params["gpu_stream"] = cudaStreamLegacy;//use default stream params["gpu_stream"] = cudaStreamPerThread;//one stream per thread if params["gpu_stream"] is not set, stream will be created each time calling buildIndex() note that there is no internal synchronization, that is, cudaStreamSynchronize() should be called manually if another stream is used to process the matrices (only in case that matrices_in_gpu_ram==true) --- .../flann/algorithms/kdtree_cuda_3d_index.cu | 22 ++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/src/cpp/flann/algorithms/kdtree_cuda_3d_index.cu b/src/cpp/flann/algorithms/kdtree_cuda_3d_index.cu index 
70b91ece..4feb0a5c 100644 --- a/src/cpp/flann/algorithms/kdtree_cuda_3d_index.cu +++ b/src/cpp/flann/algorithms/kdtree_cuda_3d_index.cu @@ -162,6 +162,7 @@ template struct KDTreeCuda3dIndex::GpuHelper { cudaStream_t gpu_stream; + bool use_ext_stream; flann::cuda::device_vector_noinit< cuda::kd_tree_builder_detail::SplitInfo >* gpu_splits_; flann::cuda::device_vector_noinit< int >* gpu_parent_; flann::cuda::device_vector_noinit< int >* gpu_child1_; @@ -169,9 +170,18 @@ struct KDTreeCuda3dIndex::GpuHelper flann::cuda::device_vector_noinit< float4 >* gpu_aabb_max_; flann::cuda::device_vector_noinit* gpu_points_; flann::cuda::device_vector_noinit* gpu_vind_; - GpuHelper() : gpu_splits_(0), gpu_parent_(0), gpu_child1_(0), gpu_aabb_min_(0), gpu_aabb_max_(0), gpu_points_(0), gpu_vind_(0) + GpuHelper(cudaStream_t s = (cudaStream_t)0) : gpu_splits_(0), gpu_parent_(0), gpu_child1_(0), gpu_aabb_min_(0), gpu_aabb_max_(0), gpu_points_(0), gpu_vind_(0) { - cudaStreamCreate(&gpu_stream); + if (s == (cudaStream_t)0) + { + cudaStreamCreate(&gpu_stream); + use_ext_stream = false; + } + else + { + gpu_stream = s; + use_ext_stream = true; + } } ~GpuHelper() { @@ -191,7 +201,8 @@ struct KDTreeCuda3dIndex::GpuHelper delete gpu_points_; gpu_points_=0; - cudaStreamDestroy(gpu_stream); + if (use_ext_stream == false) + cudaStreamDestroy(gpu_stream); } }; @@ -735,8 +746,9 @@ void KDTreeCuda3dIndex::uploadTreeToGpu() // (I would make this a (boost) static assertion, but so far flann seems to avoid boost // assert( sizeof( KdTreeCudaPrivate::GpuNode)==sizeof( Node ) ); delete gpu_helper_; - gpu_helper_ = new GpuHelper; - gpu_helper_->gpu_points_=new flann::cuda::device_vector_noinit(size_); + cudaStream_t s = get_param(index_params_, "gpu_stream", (cudaStream_t)0); + gpu_helper_ = new GpuHelper(s); + gpu_helper_->gpu_points_=new flann::cuda::device_vector_noinit(size_); flann::cuda::device_vector_noinit tmp(size_); if( get_param(index_params_,"input_is_gpu_float4",false) ) { assert( 
dataset_.cols == 3 && dataset_.stride==4*sizeof(float)); From b8e3da518a2e7515316b8fbb6a4ecb762729cbfe Mon Sep 17 00:00:00 2001 From: yuyi1005 Date: Mon, 21 May 2018 18:27:16 +0800 Subject: [PATCH 6/7] Add nvcc compiler flag to specify gpu arch and code -gencode=arch=compute_52,code=sm_52 --- src/cpp/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpp/CMakeLists.txt b/src/cpp/CMakeLists.txt index b44a7351..4ce34924 100644 --- a/src/cpp/CMakeLists.txt +++ b/src/cpp/CMakeLists.txt @@ -15,7 +15,7 @@ endif() set_property(TARGET flann_cpp_s PROPERTY COMPILE_DEFINITIONS FLANN_STATIC FLANN_USE_CUDA) if (BUILD_CUDA_LIB) - SET(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-DFLANN_USE_CUDA") + SET(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-DFLANN_USE_CUDA;-gencode=arch=compute_52,code=sm_52") if(CMAKE_COMPILER_IS_GNUCC) set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-Xcompiler;-fPIC;" ) if (NVCC_COMPILER_BINDIR) From 5fc7f2443f71b096e9ae1e9dc1f847ba215e5297 Mon Sep 17 00:00:00 2001 From: yuyi1005 Date: Mon, 28 May 2018 11:17:46 +0800 Subject: [PATCH 7/7] Change gpu arch and code -gencode=arch=compute_52,code=\"sm_52,compute_52\"; -gencode=arch=compute_61,code=\"sm_61,compute_61\" --- src/cpp/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpp/CMakeLists.txt b/src/cpp/CMakeLists.txt index 4ce34924..ffdd2394 100644 --- a/src/cpp/CMakeLists.txt +++ b/src/cpp/CMakeLists.txt @@ -15,7 +15,7 @@ endif() set_property(TARGET flann_cpp_s PROPERTY COMPILE_DEFINITIONS FLANN_STATIC FLANN_USE_CUDA) if (BUILD_CUDA_LIB) - SET(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-DFLANN_USE_CUDA;-gencode=arch=compute_52,code=sm_52") + SET(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-DFLANN_USE_CUDA;-gencode=arch=compute_52,code=\"sm_52,compute_52\";-gencode=arch=compute_61,code=\"sm_61,compute_61\"") if(CMAKE_COMPILER_IS_GNUCC) set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-Xcompiler;-fPIC;" ) if (NVCC_COMPILER_BINDIR)