Commit 4ee22d9

PR #12687 from invisible-ai/invisibleai: Replace calls to cudaDeviceSynchronize with calls to only synchronize the default CUDA stream
Nir-Az authored Mar 4, 2024
2 parents 1f4f630 + d26c653 commit 4ee22d9
Showing 2 changed files with 7 additions and 7 deletions.
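
For context (not part of the commit itself): cudaDeviceSynchronize() blocks the host until all outstanding work on every stream of the device has finished, while cudaStreamSynchronize(0) waits only for the default (legacy) stream, which is where every kernel below is launched. A minimal sketch of the difference follows; the kernel, variables, and function name are illustrative and not taken from librealsense.

    #include <cassert>
    #include <cuda_runtime.h>

    // Illustrative kernel, not part of the PR.
    __global__ void toy_kernel(int* out)
    {
        *out = 1;
    }

    void sync_difference_sketch()
    {
        int *d_a = nullptr, *d_b = nullptr;
        cudaMalloc(&d_a, sizeof(int));
        cudaMalloc(&d_b, sizeof(int));

        // A non-blocking stream stands in for unrelated GPU work queued
        // by other parts of the application.
        cudaStream_t side_stream;
        cudaStreamCreateWithFlags(&side_stream, cudaStreamNonBlocking);
        toy_kernel<<<1, 1, 0, side_stream>>>(d_b);

        // Work on the default stream, as in the helpers touched by this PR.
        toy_kernel<<<1, 1>>>(d_a);

        // Waits only for the default-stream kernel; the side_stream kernel
        // may still be running. cudaDeviceSynchronize() would wait for both.
        cudaStreamSynchronize(0);

        int h_a = 0;
        cudaMemcpy(&h_a, d_a, sizeof(int), cudaMemcpyDeviceToHost);
        assert(h_a == 1);

        cudaStreamSynchronize(side_stream);
        cudaStreamDestroy(side_stream);
        cudaFree(d_a);
        cudaFree(d_b);
    }

The sketch relies on the side stream being created as non-blocking; with the legacy default stream, ordinary blocking streams still synchronize implicitly with stream 0, so the distinction matters most for streams created with cudaStreamNonBlocking.
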
10 changes: 5 additions & 5 deletions src/cuda/cuda-conversion.cu
@@ -282,7 +282,7 @@ void rscuda::unpack_yuy2_cuda_helper(const uint8_t* h_src, uint8_t* h_dst, int n
result = cudaGetLastError();
assert(result == cudaSuccess);

- cudaDeviceSynchronize();
+ cudaStreamSynchronize(0);

result = cudaMemcpy(h_dst, d_dst.get(), n * sizeof(uint8_t) * size, cudaMemcpyDeviceToHost);
assert(result == cudaSuccess);
@@ -325,7 +325,7 @@ void rscuda::y8_y8_from_y8i_cuda_helper(uint8_t* const dest[], int count, const
assert(result == cudaSuccess);

kernel_split_frame_y8_y8_from_y8i_cuda << <numBlocks, RS2_CUDA_THREADS_PER_BLOCK >> > (d_dst_0.get(), d_dst_1.get(), count, d_src.get());
- cudaDeviceSynchronize();
+ cudaStreamSynchronize(0);

result = cudaGetLastError();
assert(result == cudaSuccess);
@@ -377,7 +377,7 @@ void rscuda::y16_y16_from_y12i_10_cuda_helper(uint8_t* const dest[], int count,
assert(result == cudaSuccess);

kernel_split_frame_y16_y16_from_y12i_cuda <<<numBlocks, RS2_CUDA_THREADS_PER_BLOCK>>> (d_dst_0.get(), d_dst_1.get(), count, d_src.get());
- cudaDeviceSynchronize();
+ cudaStreamSynchronize(0);

result = cudaGetLastError();
assert(result == cudaSuccess);
@@ -423,7 +423,7 @@ void rscuda::unpack_z16_y8_from_sr300_inzi_cuda(uint8_t * const dest, const uint
assert(result == cudaSuccess);

kernel_z16_y8_from_sr300_inzi_cuda <<<numBlocks, RS2_CUDA_THREADS_PER_BLOCK >>> (d_src.get(), d_dst.get(), count);
- cudaDeviceSynchronize();
+ cudaStreamSynchronize(0);

result = cudaMemcpy(dest, d_dst.get(), count * sizeof(uint8_t), cudaMemcpyDeviceToHost);
assert(result == cudaSuccess);
@@ -461,7 +461,7 @@ void rscuda::unpack_z16_y16_from_sr300_inzi_cuda(uint16_t * const dest, const ui
assert(result == cudaSuccess);

kernel_z16_y16_from_sr300_inzi_cuda << <numBlocks, RS2_CUDA_THREADS_PER_BLOCK >> > (d_src.get(), d_dst.get(), count);
- cudaDeviceSynchronize();
+ cudaStreamSynchronize(0);

result = cudaMemcpy(dest, d_dst.get(), count * sizeof(uint16_t), cudaMemcpyDeviceToHost);
assert(result == cudaSuccess);
4 changes: 2 additions & 2 deletions src/proc/cuda/cuda-align.cu
@@ -179,7 +179,7 @@ void align_cuda_helper::align_other_to_depth(unsigned char* h_aligned_out, const
case 4: kernel_other_to_depth<4> <<<depth_blocks,threads>>> (_d_aligned_out.get(), _d_other_in.get(), _d_pixel_map.get(), _d_depth_intrinsics.get(), _d_other_intrinsics.get()); break;
}

- cudaDeviceSynchronize();
+ cudaStreamSynchronize(0);

cudaMemcpy(h_aligned_out, _d_aligned_out.get(), aligned_size, cudaMemcpyDeviceToHost);
}
@@ -222,7 +222,7 @@ void align_cuda_helper::align_depth_to_other(unsigned char* h_aligned_out, const

kernel_replace_to_zero <<<other_blocks, threads>>> ((uint16_t*)_d_aligned_out.get(), _d_other_intrinsics.get());

- cudaDeviceSynchronize();
+ cudaStreamSynchronize(0);

cudaMemcpy(h_aligned_out, _d_aligned_out.get(), aligned_pixel_count * 2, cudaMemcpyDeviceToHost);
}
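
Both files follow the same host-side pattern after the change: kernels are launched on the default stream, cudaStreamSynchronize(0) waits for that stream, and a blocking cudaMemcpy then copies the result back to the host. Waiting on the default stream alone is enough for the copy to observe the kernels' output, and unrelated work that an application may have queued on other streams no longer delays these helpers.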
