From d26c6532db827e3bb17ffc518f32a4b062c5bad2 Mon Sep 17 00:00:00 2001
From: Michael Mead
Date: Sat, 17 Feb 2024 09:02:38 -0800
Subject: [PATCH] fix: use cudaStreamSynchronize to avoid synchronizing user
 streams

The CUDA Programming Guide states that calling `cudaDeviceSynchronize`
has the following synchronization behavior: "cudaDeviceSynchronize()
waits until all preceding commands in all streams of all host threads
have completed." (source:
https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#explicit-synchronization)

An application may be running its own CUDA code in separate streams, and
the SDK's `cudaDeviceSynchronize` calls will also wait for that work to
finish if it is executing concurrently.

The solution taken in this commit is to replace calls to
`cudaDeviceSynchronize` with `cudaStreamSynchronize(0)`, where `0`
denotes the default CUDA stream. The SDK then waits only for the default
stream to synchronize, rather than for every stream executing CUDA code.

Signed-off-by: Michael Mead
---
 src/cuda/cuda-conversion.cu | 10 +++++-----
 src/proc/cuda/cuda-align.cu |  4 ++--
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/cuda/cuda-conversion.cu b/src/cuda/cuda-conversion.cu
index 98e41879db..1c79e16c56 100644
--- a/src/cuda/cuda-conversion.cu
+++ b/src/cuda/cuda-conversion.cu
@@ -282,7 +282,7 @@ void rscuda::unpack_yuy2_cuda_helper(const uint8_t* h_src, uint8_t* h_dst, int n
     result = cudaGetLastError();
     assert(result == cudaSuccess);
 
-    cudaDeviceSynchronize();
+    cudaStreamSynchronize(0);
 
     result = cudaMemcpy(h_dst, d_dst.get(), n * sizeof(uint8_t) * size, cudaMemcpyDeviceToHost);
     assert(result == cudaSuccess);
@@ -325,7 +325,7 @@ void rscuda::y8_y8_from_y8i_cuda_helper(uint8_t* const dest[], int count, const
     assert(result == cudaSuccess);
 
     kernel_split_frame_y8_y8_from_y8i_cuda << > > (d_dst_0.get(), d_dst_1.get(), count, d_src.get());
-    cudaDeviceSynchronize();
+    cudaStreamSynchronize(0);
 
     result = cudaGetLastError();
     assert(result == cudaSuccess);
@@ -377,7 +377,7 @@ void rscuda::y16_y16_from_y12i_10_cuda_helper(uint8_t* const dest[], int count,
     assert(result == cudaSuccess);
 
     kernel_split_frame_y16_y16_from_y12i_cuda <<>> (d_dst_0.get(), d_dst_1.get(), count, d_src.get());
-    cudaDeviceSynchronize();
+    cudaStreamSynchronize(0);
 
     result = cudaGetLastError();
     assert(result == cudaSuccess);
@@ -423,7 +423,7 @@ void rscuda::unpack_z16_y8_from_sr300_inzi_cuda(uint8_t * const dest, const uint
     assert(result == cudaSuccess);
 
     kernel_z16_y8_from_sr300_inzi_cuda <<>> (d_src.get(), d_dst.get(), count);
-    cudaDeviceSynchronize();
+    cudaStreamSynchronize(0);
 
     result = cudaMemcpy(dest, d_dst.get(), count * sizeof(uint8_t), cudaMemcpyDeviceToHost);
     assert(result == cudaSuccess);
@@ -461,7 +461,7 @@ void rscuda::unpack_z16_y16_from_sr300_inzi_cuda(uint16_t * const dest, const ui
     assert(result == cudaSuccess);
 
     kernel_z16_y16_from_sr300_inzi_cuda << > > (d_src.get(), d_dst.get(), count);
-    cudaDeviceSynchronize();
+    cudaStreamSynchronize(0);
 
     result = cudaMemcpy(dest, d_dst.get(), count * sizeof(uint16_t), cudaMemcpyDeviceToHost);
     assert(result == cudaSuccess);
diff --git a/src/proc/cuda/cuda-align.cu b/src/proc/cuda/cuda-align.cu
index b50ce2b0e2..dca3cda86f 100644
--- a/src/proc/cuda/cuda-align.cu
+++ b/src/proc/cuda/cuda-align.cu
@@ -179,7 +179,7 @@ void align_cuda_helper::align_other_to_depth(unsigned char* h_aligned_out, const
     case 4: kernel_other_to_depth<4> <<>> (_d_aligned_out.get(), _d_other_in.get(), _d_pixel_map.get(),
                 _d_depth_intrinsics.get(), _d_other_intrinsics.get()); break;
     }
-    cudaDeviceSynchronize();
+    cudaStreamSynchronize(0);
 
     cudaMemcpy(h_aligned_out, _d_aligned_out.get(), aligned_size, cudaMemcpyDeviceToHost);
 }
@@ -222,7 +222,7 @@ void align_cuda_helper::align_depth_to_other(unsigned char* h_aligned_out, const
 
     kernel_replace_to_zero <<>> ((uint16_t*)_d_aligned_out.get(), _d_other_intrinsics.get());
 
-    cudaDeviceSynchronize();
+    cudaStreamSynchronize(0);
 
     cudaMemcpy(h_aligned_out, _d_aligned_out.get(), aligned_pixel_count * 2, cudaMemcpyDeviceToHost);
 }
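
Below is a minimal, self-contained sketch of the behavior difference described in the
commit message. It is not librealsense code and is not part of the patch; the kernel
names, the non-blocking user stream, and the launch configurations are illustrative
assumptions.

    // sketch.cu -- illustrative only
    #include <cuda_runtime.h>
    #include <cassert>

    __global__ void user_kernel() { /* long-running application work */ }
    __global__ void sdk_kernel()  { /* stands in for an SDK conversion kernel */ }

    int main()
    {
        // Non-blocking stream, so the legacy default (NULL) stream does not
        // implicitly synchronize with it.
        cudaStream_t user_stream;
        cudaError_t result = cudaStreamCreateWithFlags(&user_stream, cudaStreamNonBlocking);
        assert(result == cudaSuccess);

        user_kernel<<<1, 1, 0, user_stream>>>(); // application's own stream
        sdk_kernel<<<1, 1>>>();                  // default (NULL) stream

        // cudaDeviceSynchronize() would block until BOTH kernels finish;
        // cudaStreamSynchronize(0) waits only for the default-stream kernel.
        result = cudaStreamSynchronize(0);
        assert(result == cudaSuccess);

        result = cudaStreamSynchronize(user_stream);
        assert(result == cudaSuccess);
        result = cudaStreamDestroy(user_stream);
        assert(result == cudaSuccess);
        return 0;
    }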