From d26c6532db827e3bb17ffc518f32a4b062c5bad2 Mon Sep 17 00:00:00 2001
From: Michael Mead
Date: Sat, 17 Feb 2024 09:02:38 -0800
Subject: [PATCH] fix: use cudaStreamSynchronize to avoid synchronizing user
 streams

The CUDA Programming Guide states that calling `cudaDeviceSynchronize`
has the following synchronization behavior: "cudaDeviceSynchronize()
waits until all preceding commands in all streams of all host threads
have completed." (source:
https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#explicit-synchronization)

An application may be running its own CUDA code in separate streams, and
the SDK's `cudaDeviceSynchronize` calls will also wait for that work to
finish if it is executing concurrently.

The solution taken in this commit is to replace calls to
`cudaDeviceSynchronize` with `cudaStreamSynchronize(0)`, where `0`
denotes the default CUDA stream. The SDK then waits only for the default
stream to synchronize, rather than for every stream executing CUDA code.

Signed-off-by: Michael Mead
---
 src/cuda/cuda-conversion.cu | 10 +++++-----
 src/proc/cuda/cuda-align.cu |  4 ++--
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/cuda/cuda-conversion.cu b/src/cuda/cuda-conversion.cu
index 98e41879db..1c79e16c56 100644
--- a/src/cuda/cuda-conversion.cu
+++ b/src/cuda/cuda-conversion.cu
@@ -282,7 +282,7 @@ void rscuda::unpack_yuy2_cuda_helper(const uint8_t* h_src, uint8_t* h_dst, int n
     result = cudaGetLastError();
     assert(result == cudaSuccess);
 
-    cudaDeviceSynchronize();
+    cudaStreamSynchronize(0);
 
     result = cudaMemcpy(h_dst, d_dst.get(), n * sizeof(uint8_t) * size, cudaMemcpyDeviceToHost);
     assert(result == cudaSuccess);
@@ -325,7 +325,7 @@ void rscuda::y8_y8_from_y8i_cuda_helper(uint8_t* const dest[], int count, const
     assert(result == cudaSuccess);
 
     kernel_split_frame_y8_y8_from_y8i_cuda << > > (d_dst_0.get(), d_dst_1.get(), count, d_src.get());
-    cudaDeviceSynchronize();
+    cudaStreamSynchronize(0);
 
     result = cudaGetLastError();
     assert(result == cudaSuccess);
@@ -377,7 +377,7 @@ void rscuda::y16_y16_from_y12i_10_cuda_helper(uint8_t* const dest[], int count,
     assert(result == cudaSuccess);
 
     kernel_split_frame_y16_y16_from_y12i_cuda <<>> (d_dst_0.get(), d_dst_1.get(), count, d_src.get());
-    cudaDeviceSynchronize();
+    cudaStreamSynchronize(0);
 
     result = cudaGetLastError();
     assert(result == cudaSuccess);
@@ -423,7 +423,7 @@ void rscuda::unpack_z16_y8_from_sr300_inzi_cuda(uint8_t * const dest, const uint
     assert(result == cudaSuccess);
 
     kernel_z16_y8_from_sr300_inzi_cuda <<>> (d_src.get(), d_dst.get(), count);
-    cudaDeviceSynchronize();
+    cudaStreamSynchronize(0);
 
     result = cudaMemcpy(dest, d_dst.get(), count * sizeof(uint8_t), cudaMemcpyDeviceToHost);
     assert(result == cudaSuccess);
@@ -461,7 +461,7 @@ void rscuda::unpack_z16_y16_from_sr300_inzi_cuda(uint16_t * const dest, const ui
     assert(result == cudaSuccess);
 
     kernel_z16_y16_from_sr300_inzi_cuda << > > (d_src.get(), d_dst.get(), count);
-    cudaDeviceSynchronize();
+    cudaStreamSynchronize(0);
 
     result = cudaMemcpy(dest, d_dst.get(), count * sizeof(uint16_t), cudaMemcpyDeviceToHost);
     assert(result == cudaSuccess);
diff --git a/src/proc/cuda/cuda-align.cu b/src/proc/cuda/cuda-align.cu
index b50ce2b0e2..dca3cda86f 100644
--- a/src/proc/cuda/cuda-align.cu
+++ b/src/proc/cuda/cuda-align.cu
@@ -179,7 +179,7 @@ void align_cuda_helper::align_other_to_depth(unsigned char* h_aligned_out, const
     case 4: kernel_other_to_depth<4> <<>> (_d_aligned_out.get(), _d_other_in.get(), _d_pixel_map.get(),
                 _d_depth_intrinsics.get(), _d_other_intrinsics.get()); break;
     }
-    cudaDeviceSynchronize();
+    cudaStreamSynchronize(0);
 
     cudaMemcpy(h_aligned_out, _d_aligned_out.get(), aligned_size, cudaMemcpyDeviceToHost);
 }
@@ -222,7 +222,7 @@ void align_cuda_helper::align_depth_to_other(unsigned char* h_aligned_out, const
 
     kernel_replace_to_zero <<>> ((uint16_t*)_d_aligned_out.get(), _d_other_intrinsics.get());
 
-    cudaDeviceSynchronize();
+    cudaStreamSynchronize(0);
 
     cudaMemcpy(h_aligned_out, _d_aligned_out.get(), aligned_pixel_count * 2, cudaMemcpyDeviceToHost);
 }
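
Below is a minimal, self-contained sketch of the behavior difference described in the
commit message. It is not librealsense code and is not part of the patch; the kernel
names, the non-blocking user stream, and the launch configurations are illustrative
assumptions.

    // sketch.cu -- illustrative only
    #include <cuda_runtime.h>
    #include <cassert>

    __global__ void user_kernel() { /* long-running application work */ }
    __global__ void sdk_kernel()  { /* stands in for an SDK conversion kernel */ }

    int main()
    {
        // Non-blocking stream, so the legacy default (NULL) stream does not
        // implicitly synchronize with it.
        cudaStream_t user_stream;
        cudaError_t result = cudaStreamCreateWithFlags(&user_stream, cudaStreamNonBlocking);
        assert(result == cudaSuccess);

        user_kernel<<<1, 1, 0, user_stream>>>(); // application's own stream
        sdk_kernel<<<1, 1>>>();                  // default (NULL) stream

        // cudaDeviceSynchronize() would block until BOTH kernels finish;
        // cudaStreamSynchronize(0) waits only for the default-stream kernel.
        result = cudaStreamSynchronize(0);
        assert(result == cudaSuccess);

        result = cudaStreamSynchronize(user_stream);
        assert(result == cudaSuccess);
        result = cudaStreamDestroy(user_stream);
        assert(result == cudaSuccess);
        return 0;
    }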