Add caching for numba and cupy functions (#591)
abramov-oleg authored Dec 10, 2023
1 parent 2b50d3a commit 261b005
Showing 6 changed files with 10 additions and 11 deletions.
4 changes: 3 additions & 1 deletion docker/Dockerfile.deepstream
@@ -111,7 +111,9 @@ RUN apt update \
ARG PROJECT_PATH=/opt/savant
ENV PYTHONUNBUFFERED=1 \
PROJECT_PATH=$PROJECT_PATH \
PYTHONPATH=$PROJECT_PATH
PYTHONPATH=$PROJECT_PATH \
NUMBA_CACHE_DIR=/cache/numba \
CUPY_CACHE_DIR=/cache/cupy
WORKDIR $PROJECT_PATH

# add deepstream libs to path, so that ctypes can load them
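Note (editorial, not part of the commit): NUMBA_CACHE_DIR tells Numba where functions compiled with cache=True keep their on-disk cache, and CUPY_CACHE_DIR sets the directory CuPy uses for compiled kernel binaries. Setting both in the image and pointing them at /cache (the same tree the sample compose files already use for models and downloads) lets the JIT artifacts persist across container restarts whenever /cache is backed by a mounted volume, which is presumably why the per-service copies are dropped from the compose files below. A minimal sketch of the mechanism, assuming Numba and NumPy are installed and the chosen cache directory is writable:

import os

# Mirror the values baked into Dockerfile.deepstream; outside the container,
# point these at any writable directory before importing the libraries.
os.environ.setdefault('NUMBA_CACHE_DIR', '/cache/numba')
os.environ.setdefault('CUPY_CACHE_DIR', '/cache/cupy')

import numba as nb
import numpy as np


@nb.njit('f4(f4[:])', nogil=True, cache=True)
def total(values):
    # Compiled eagerly because the signature is explicit; with cache=True the
    # compiled code is written under NUMBA_CACHE_DIR and reused by later
    # processes instead of being recompiled.
    acc = np.float32(0.0)
    for v in values:
        acc += v
    return acc


print(total(np.ones(16, dtype=np.float32)))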
2 changes: 0 additions & 2 deletions samples/yolov8_seg/docker-compose.l4t.yml
@@ -29,8 +29,6 @@ services:
environment:
- MODEL_PATH=/cache/models/yolov8_seg
- DOWNLOAD_PATH=/cache/downloads/yolov8_seg
- CUPY_CACHE_DIR=/cache/cupy
- NUMBA_CACHE_DIR=/cache/numba
- ZMQ_SRC_ENDPOINT=sub+bind:ipc:///tmp/zmq-sockets/input-video.ipc
- ZMQ_SINK_ENDPOINT=pub+bind:ipc:///tmp/zmq-sockets/output-video.ipc
- METRICS_FRAME_PERIOD=1000
2 changes: 0 additions & 2 deletions samples/yolov8_seg/docker-compose.x86.yml
@@ -30,8 +30,6 @@ services:
- MODEL_OUTPUT_CONVERTER=gpu_converter
- MODEL_PATH=/cache/models/yolov8_seg
- DOWNLOAD_PATH=/cache/downloads/yolov8_seg
- CUPY_CACHE_DIR=/cache/cupy
- NUMBA_CACHE_DIR=/cache/numba
- ZMQ_SRC_ENDPOINT=sub+bind:ipc:///tmp/zmq-sockets/input-video.ipc
- ZMQ_SINK_ENDPOINT=pub+bind:ipc:///tmp/zmq-sockets/output-video.ipc
- METRICS_FRAME_PERIOD=1000
5 changes: 3 additions & 2 deletions samples/yolov8_seg/module/converter.py
@@ -108,7 +108,7 @@ def __call__(
return tensors, mask_list


@nb.njit('Tuple((u2[:], f4[:]))(f4[:, :])', nogil=True)
@nb.njit('Tuple((u2[:], f4[:]))(f4[:, :])', nogil=True, cache=True)
def _parse_scores(scores: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
class_ids = np.empty(scores.shape[0], dtype=np.uint16)
confidences = np.empty(scores.shape[0], dtype=scores.dtype)
@@ -118,7 +118,7 @@ def _parse_scores(scores: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
return class_ids, confidences


@nb.njit('f4[:, ::1](f4[:, :])', nogil=True)
@nb.njit('f4[:, ::1](f4[:, :])', nogil=True, cache=True)
def sigmoid(a: np.ndarray) -> np.ndarray:
ones = np.ones(a.shape, dtype=np.float32)
return np.divide(ones, (ones + np.exp(-a)))
@@ -127,6 +127,7 @@ def sigmoid(a: np.ndarray) -> np.ndarray:
@nb.njit(
'Tuple((f4[:, ::1], f4[:, :, ::1]))(f4[:, ::1], f4[:, :, ::1], u2, f4, f4, u2)',
nogil=True,
cache=True,
)
def _postproc(
output: np.ndarray,
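Note (editorial, not part of the commit): because these njit functions declare explicit signatures, Numba compiles them eagerly at import time, and cache=True persists that work to NUMBA_CACHE_DIR, so only the first run against an empty cache pays the compilation cost. A rough way to observe the effect, as a sketch that reuses the sigmoid function above (run it twice and compare the reported times):

import time

import numba as nb
import numpy as np

start = time.perf_counter()


# Same function as in converter.py: the explicit signature forces compilation
# right here, and cache=True lets a second run load the result from disk.
@nb.njit('f4[:, ::1](f4[:, :])', nogil=True, cache=True)
def sigmoid(a: np.ndarray) -> np.ndarray:
    ones = np.ones(a.shape, dtype=np.float32)
    return np.divide(ones, (ones + np.exp(-a)))


print(f'compile-or-load took {time.perf_counter() - start:.3f} s')
print(sigmoid(np.zeros((2, 3), dtype=np.float32)))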
4 changes: 2 additions & 2 deletions savant/selector/detector.py
@@ -6,7 +6,7 @@
from savant.utils.nms import nms_cpu


@nb.njit('f4[:, :](f4[:, :], u2, u2, u2, u2)', nogil=True)
@nb.njit('f4[:, :](f4[:, :], u2, u2, u2, u2)', nogil=True, cache=True)
def min_max_bbox_size_selector(
bbox_tensor: np.ndarray,
min_width: int = 0,
@@ -86,7 +86,7 @@ def __call__(self, bbox_tensor: np.ndarray) -> np.ndarray:
)


@nb.njit('f4[:, :](f4[:, :], f4, f4, u2, u2, u2, u2)', nogil=True)
@nb.njit('f4[:, :](f4[:, :], f4, f4, u2, u2, u2, u2)', nogil=True, cache=True)
def default_selector(
bbox_tensor: np.ndarray,
confidence_threshold: float = 0.0,
4 changes: 2 additions & 2 deletions savant/utils/nms.py
@@ -6,7 +6,7 @@
__all__ = ['nms_cpu', 'nms_gpu']


@nb.njit('u4[:](f4[:, :], f4[:], f4, u2)', nogil=True)
@nb.njit('u4[:](f4[:, :], f4[:], f4, u2)', nogil=True, cache=True)
def nms_cpu(
bboxes: np.ndarray, confidences: np.ndarray, threshold: float, top_k: int = 300
) -> np.ndarray:
@@ -172,7 +172,7 @@ def _call_nms_kernel(bboxes: cp.ndarray, threshold: float) -> cp.ndarray:
return _nms_gpu_post(mask.get(), n_bbox, threads_per_block, col_blocks)


@nb.njit('u4[:](u8[:], u2, u2, u2)', nogil=True)
@nb.njit('u4[:](u8[:], u2, u2, u2)', nogil=True, cache=True)
def _nms_gpu_post(
mask: np.ndarray, n_bbox: int, threads_per_block: int, col_blocks: int
) -> np.ndarray:
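Note (editorial, not part of the commit): the GPU NMS helper above operates on CuPy arrays, and CuPy compiles user-defined kernels with NVRTC on their first launch, storing the resulting binaries in its kernel cache; CUPY_CACHE_DIR overrides the cache location, so pointing it at /cache/cupy lets a recreated container reuse already-compiled kernels instead of invoking NVRTC again. An illustrative sketch with a hypothetical RawKernel (not the repository's NMS kernel), assuming CuPy and a CUDA device are available:

import cupy as cp

# CuPy compiles this source with NVRTC on the first launch and caches the
# binary under CUPY_CACHE_DIR (/cache/cupy in the image), so later processes
# that share the cache skip the compilation step.
_scale = cp.RawKernel(r'''
extern "C" __global__
void scale(const float* x, float* y, const float s, const int n) {
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n) {
        y[i] = s * x[i];
    }
}
''', 'scale')

x = cp.arange(16, dtype=cp.float32)
y = cp.empty_like(x)
_scale((1,), (16,), (x, y, cp.float32(2.0), cp.int32(x.size)))
print(y)  # expected: 0, 2, 4, ... 30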
