From 261b0052a45a3f2adefd4416c46574edc36d6be7 Mon Sep 17 00:00:00 2001 From: Oleg Abramov Date: Sun, 10 Dec 2023 16:23:58 +0600 Subject: [PATCH] Add caching for numba and cupy functions (#591) --- docker/Dockerfile.deepstream | 4 +++- samples/yolov8_seg/docker-compose.l4t.yml | 2 -- samples/yolov8_seg/docker-compose.x86.yml | 2 -- samples/yolov8_seg/module/converter.py | 5 +++-- savant/selector/detector.py | 4 ++-- savant/utils/nms.py | 4 ++-- 6 files changed, 10 insertions(+), 11 deletions(-) diff --git a/docker/Dockerfile.deepstream b/docker/Dockerfile.deepstream index 037122ba..3668ff28 100644 --- a/docker/Dockerfile.deepstream +++ b/docker/Dockerfile.deepstream @@ -111,7 +111,9 @@ RUN apt update \ ARG PROJECT_PATH=/opt/savant ENV PYTHONUNBUFFERED=1 \ PROJECT_PATH=$PROJECT_PATH \ - PYTHONPATH=$PROJECT_PATH + PYTHONPATH=$PROJECT_PATH \ + NUMBA_CACHE_DIR=/cache/numba \ + CUPY_CACHE_DIR=/cache/cupy WORKDIR $PROJECT_PATH # add deepstream libs to path, so that ctypes can load them diff --git a/samples/yolov8_seg/docker-compose.l4t.yml b/samples/yolov8_seg/docker-compose.l4t.yml index bc8347ea..56158cdd 100644 --- a/samples/yolov8_seg/docker-compose.l4t.yml +++ b/samples/yolov8_seg/docker-compose.l4t.yml @@ -29,8 +29,6 @@ services: environment: - MODEL_PATH=/cache/models/yolov8_seg - DOWNLOAD_PATH=/cache/downloads/yolov8_seg - - CUPY_CACHE_DIR=/cache/cupy - - NUMBA_CACHE_DIR=/cache/numba - ZMQ_SRC_ENDPOINT=sub+bind:ipc:///tmp/zmq-sockets/input-video.ipc - ZMQ_SINK_ENDPOINT=pub+bind:ipc:///tmp/zmq-sockets/output-video.ipc - METRICS_FRAME_PERIOD=1000 diff --git a/samples/yolov8_seg/docker-compose.x86.yml b/samples/yolov8_seg/docker-compose.x86.yml index 19cf7153..c6a03eb5 100644 --- a/samples/yolov8_seg/docker-compose.x86.yml +++ b/samples/yolov8_seg/docker-compose.x86.yml @@ -30,8 +30,6 @@ services: - MODEL_OUTPUT_CONVERTER=gpu_converter - MODEL_PATH=/cache/models/yolov8_seg - DOWNLOAD_PATH=/cache/downloads/yolov8_seg - - CUPY_CACHE_DIR=/cache/cupy - - NUMBA_CACHE_DIR=/cache/numba - ZMQ_SRC_ENDPOINT=sub+bind:ipc:///tmp/zmq-sockets/input-video.ipc - ZMQ_SINK_ENDPOINT=pub+bind:ipc:///tmp/zmq-sockets/output-video.ipc - METRICS_FRAME_PERIOD=1000 diff --git a/samples/yolov8_seg/module/converter.py b/samples/yolov8_seg/module/converter.py index 0e70d52e..b5bf3b62 100644 --- a/samples/yolov8_seg/module/converter.py +++ b/samples/yolov8_seg/module/converter.py @@ -108,7 +108,7 @@ def __call__( return tensors, mask_list -@nb.njit('Tuple((u2[:], f4[:]))(f4[:, :])', nogil=True) +@nb.njit('Tuple((u2[:], f4[:]))(f4[:, :])', nogil=True, cache=True) def _parse_scores(scores: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: class_ids = np.empty(scores.shape[0], dtype=np.uint16) confidences = np.empty(scores.shape[0], dtype=scores.dtype) @@ -118,7 +118,7 @@ def _parse_scores(scores: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: return class_ids, confidences -@nb.njit('f4[:, ::1](f4[:, :])', nogil=True) +@nb.njit('f4[:, ::1](f4[:, :])', nogil=True, cache=True) def sigmoid(a: np.ndarray) -> np.ndarray: ones = np.ones(a.shape, dtype=np.float32) return np.divide(ones, (ones + np.exp(-a))) @@ -127,6 +127,7 @@ def sigmoid(a: np.ndarray) -> np.ndarray: @nb.njit( 'Tuple((f4[:, ::1], f4[:, :, ::1]))(f4[:, ::1], f4[:, :, ::1], u2, f4, f4, u2)', nogil=True, + cache=True, ) def _postproc( output: np.ndarray, diff --git a/savant/selector/detector.py b/savant/selector/detector.py index 43e72984..1fa2e0e6 100644 --- a/savant/selector/detector.py +++ b/savant/selector/detector.py @@ -6,7 +6,7 @@ from savant.utils.nms import nms_cpu -@nb.njit('f4[:, :](f4[:, :], u2, u2, u2, u2)', nogil=True) +@nb.njit('f4[:, :](f4[:, :], u2, u2, u2, u2)', nogil=True, cache=True) def min_max_bbox_size_selector( bbox_tensor: np.ndarray, min_width: int = 0, @@ -86,7 +86,7 @@ def __call__(self, bbox_tensor: np.ndarray) -> np.ndarray: ) -@nb.njit('f4[:, :](f4[:, :], f4, f4, u2, u2, u2, u2)', nogil=True) +@nb.njit('f4[:, :](f4[:, :], f4, f4, u2, u2, u2, u2)', nogil=True, cache=True) def default_selector( bbox_tensor: np.ndarray, confidence_threshold: float = 0.0, diff --git a/savant/utils/nms.py b/savant/utils/nms.py index 9cf27690..9bfd85af 100644 --- a/savant/utils/nms.py +++ b/savant/utils/nms.py @@ -6,7 +6,7 @@ __all__ = ['nms_cpu', 'nms_gpu'] -@nb.njit('u4[:](f4[:, :], f4[:], f4, u2)', nogil=True) +@nb.njit('u4[:](f4[:, :], f4[:], f4, u2)', nogil=True, cache=True) def nms_cpu( bboxes: np.ndarray, confidences: np.ndarray, threshold: float, top_k: int = 300 ) -> np.ndarray: @@ -172,7 +172,7 @@ def _call_nms_kernel(bboxes: cp.ndarray, threshold: float) -> cp.ndarray: return _nms_gpu_post(mask.get(), n_bbox, threads_per_block, col_blocks) -@nb.njit('u4[:](u8[:], u2, u2, u2)', nogil=True) +@nb.njit('u4[:](u8[:], u2, u2, u2)', nogil=True, cache=True) def _nms_gpu_post( mask: np.ndarray, n_bbox: int, threads_per_block: int, col_blocks: int ) -> np.ndarray: