Commit 4ee22d9

PR #12687 from invisible-ai/invisibleai: Replace calls to cudaDeviceSynchronize with calls to only synchronize the default CUDA stream
Nir-Az authored Mar 4, 2024
2 parents 1f4f630 + d26c653 commit 4ee22d9
Showing 2 changed files with 7 additions and 7 deletions.
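
For context (not part of the commit itself): cudaDeviceSynchronize() blocks the host until all outstanding work on every stream of the device has finished, while cudaStreamSynchronize(0) waits only for the default (legacy) stream, which is where every kernel below is launched. A minimal sketch of the difference follows; the kernel, variables, and function name are illustrative and not taken from librealsense.

    #include <cassert>
    #include <cuda_runtime.h>

    // Illustrative kernel, not part of the PR.
    __global__ void toy_kernel(int* out)
    {
        *out = 1;
    }

    void sync_difference_sketch()
    {
        int *d_a = nullptr, *d_b = nullptr;
        cudaMalloc(&d_a, sizeof(int));
        cudaMalloc(&d_b, sizeof(int));

        // A non-blocking stream stands in for unrelated GPU work queued
        // by other parts of the application.
        cudaStream_t side_stream;
        cudaStreamCreateWithFlags(&side_stream, cudaStreamNonBlocking);
        toy_kernel<<<1, 1, 0, side_stream>>>(d_b);

        // Work on the default stream, as in the helpers touched by this PR.
        toy_kernel<<<1, 1>>>(d_a);

        // Waits only for the default-stream kernel; the side_stream kernel
        // may still be running. cudaDeviceSynchronize() would wait for both.
        cudaStreamSynchronize(0);

        int h_a = 0;
        cudaMemcpy(&h_a, d_a, sizeof(int), cudaMemcpyDeviceToHost);
        assert(h_a == 1);

        cudaStreamSynchronize(side_stream);
        cudaStreamDestroy(side_stream);
        cudaFree(d_a);
        cudaFree(d_b);
    }

The sketch relies on the side stream being created as non-blocking; with the legacy default stream, ordinary blocking streams still synchronize implicitly with stream 0, so the distinction matters most for streams created with cudaStreamNonBlocking.
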
10 changes: 5 additions & 5 deletions src/cuda/cuda-conversion.cu
@@ -282,7 +282,7 @@ void rscuda::unpack_yuy2_cuda_helper(const uint8_t* h_src, uint8_t* h_dst, int n
result = cudaGetLastError();
assert(result == cudaSuccess);

- cudaDeviceSynchronize();
+ cudaStreamSynchronize(0);

result = cudaMemcpy(h_dst, d_dst.get(), n * sizeof(uint8_t) * size, cudaMemcpyDeviceToHost);
assert(result == cudaSuccess);
@@ -325,7 +325,7 @@ void rscuda::y8_y8_from_y8i_cuda_helper(uint8_t* const dest[], int count, const
assert(result == cudaSuccess);

kernel_split_frame_y8_y8_from_y8i_cuda << <numBlocks, RS2_CUDA_THREADS_PER_BLOCK >> > (d_dst_0.get(), d_dst_1.get(), count, d_src.get());
- cudaDeviceSynchronize();
+ cudaStreamSynchronize(0);

result = cudaGetLastError();
assert(result == cudaSuccess);
@@ -377,7 +377,7 @@ void rscuda::y16_y16_from_y12i_10_cuda_helper(uint8_t* const dest[], int count,
assert(result == cudaSuccess);

kernel_split_frame_y16_y16_from_y12i_cuda <<<numBlocks, RS2_CUDA_THREADS_PER_BLOCK>>> (d_dst_0.get(), d_dst_1.get(), count, d_src.get());
- cudaDeviceSynchronize();
+ cudaStreamSynchronize(0);

result = cudaGetLastError();
assert(result == cudaSuccess);
@@ -423,7 +423,7 @@ void rscuda::unpack_z16_y8_from_sr300_inzi_cuda(uint8_t * const dest, const uint
assert(result == cudaSuccess);

kernel_z16_y8_from_sr300_inzi_cuda <<<numBlocks, RS2_CUDA_THREADS_PER_BLOCK >>> (d_src.get(), d_dst.get(), count);
- cudaDeviceSynchronize();
+ cudaStreamSynchronize(0);

result = cudaMemcpy(dest, d_dst.get(), count * sizeof(uint8_t), cudaMemcpyDeviceToHost);
assert(result == cudaSuccess);
@@ -461,7 +461,7 @@ void rscuda::unpack_z16_y16_from_sr300_inzi_cuda(uint16_t * const dest, const ui
assert(result == cudaSuccess);

kernel_z16_y16_from_sr300_inzi_cuda << <numBlocks, RS2_CUDA_THREADS_PER_BLOCK >> > (d_src.get(), d_dst.get(), count);
- cudaDeviceSynchronize();
+ cudaStreamSynchronize(0);

result = cudaMemcpy(dest, d_dst.get(), count * sizeof(uint16_t), cudaMemcpyDeviceToHost);
assert(result == cudaSuccess);
4 changes: 2 additions & 2 deletions src/proc/cuda/cuda-align.cu
@@ -179,7 +179,7 @@ void align_cuda_helper::align_other_to_depth(unsigned char* h_aligned_out, const
case 4: kernel_other_to_depth<4> <<<depth_blocks,threads>>> (_d_aligned_out.get(), _d_other_in.get(), _d_pixel_map.get(), _d_depth_intrinsics.get(), _d_other_intrinsics.get()); break;
}

- cudaDeviceSynchronize();
+ cudaStreamSynchronize(0);

cudaMemcpy(h_aligned_out, _d_aligned_out.get(), aligned_size, cudaMemcpyDeviceToHost);
}
@@ -222,7 +222,7 @@ void align_cuda_helper::align_depth_to_other(unsigned char* h_aligned_out, const

kernel_replace_to_zero <<<other_blocks, threads>>> ((uint16_t*)_d_aligned_out.get(), _d_other_intrinsics.get());

- cudaDeviceSynchronize();
+ cudaStreamSynchronize(0);

cudaMemcpy(h_aligned_out, _d_aligned_out.get(), aligned_pixel_count * 2, cudaMemcpyDeviceToHost);
}
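
Both files follow the same host-side pattern after the change: kernels are launched on the default stream, cudaStreamSynchronize(0) waits for that stream, and a blocking cudaMemcpy then copies the result back to the host. Waiting on the default stream alone is enough for the copy to observe the kernels' output, and unrelated work that an application may have queued on other streams no longer delays these helpers.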
