Deci-AI · BloodAxe · Nov 15, 2023 · Nov 7, 2023 · Nov 8, 2023 · Nov 8, 2023
@@ -25,7 +25,7 @@
     ClassificationPrediction,
 )
 from super_gradients.training.utils.utils import generate_batch, infer_model_device, resolve_torch_device
-from super_gradients.training.utils.media.video import load_video, includes_video_extension
+from super_gradients.training.utils.media.video import load_video, includes_video_extension, lazy_load_video
 from super_gradients.training.utils.media.image import ImageSource, check_image_typing
 from super_gradients.training.utils.media.stream import WebcamStreaming
 from super_gradients.training.utils.detection_utils import DetectionPostPredictionCallback
@@ -133,9 +133,10 @@ def predict_video(self, video_path: str, batch_size: Optional[int] = 32) -> Vide
         :param batch_size:  The size of each batch.
         :return:            Results of the prediction.
         """
-        video_frames, fps = load_video(file_path=video_path)
+        video_frames, fps, num_frames = lazy_load_video(file_path=video_path)
         result_generator = self._generate_prediction_result(images=video_frames, batch_size=batch_size)
-        return self._combine_image_prediction_to_video(result_generator, fps=fps, n_images=len(video_frames))
+        return self._combine_image_prediction_to_video(result_generator, fps=fps, n_images=num_frames)
+        # return self._combine_image_prediction_to_video(result_generator, fps=fps, n_images=len(video_frames))
 
     def predict_webcam(self) -> None:
         """Predict using webcam"""
@@ -317,8 +318,8 @@ def _combine_image_prediction_to_images(
     def _combine_image_prediction_to_video(
         self, images_predictions: Iterable[ImageDetectionPrediction], fps: float, n_images: Optional[int] = None
     ) -> VideoDetectionPrediction:
-        images_predictions = [image_predictions for image_predictions in tqdm(images_predictions, total=n_images, desc="Predicting Video")]
-        return VideoDetectionPrediction(_images_prediction_lst=images_predictions, fps=fps)
+        """Wrap the per-frame predictions into a `VideoDetectionPrediction` without iterating over them.
+
+        :param images_predictions:  Iterable of the per-frame predictions. It is passed on as-is, so a generator is not consumed here.
+        :param fps:                 Frames per second of the video.
+        :param n_images:            Number of frames, forwarded as `n_frames`. May be None.
+        :return:                    Video prediction holding the predictions, `fps` and `n_frames`.
+        """
+        return VideoDetectionPrediction(_images_prediction_gen=images_predictions, fps=fps, n_frames=n_images)
 
 
 class PoseEstimationPipeline(Pipeline):

@@ -1,4 +1,4 @@
-from typing import List, Optional, Tuple
+from typing import List, Optional, Tuple, Generator
 import cv2
 import PIL
 
@@ -30,6 +30,15 @@ def load_video(file_path: str, max_frames: Optional[int] = None) -> Tuple[List[n
     return frames, fps
 
 
+def lazy_load_video(file_path: str, max_frames: Optional[int] = None) -> Tuple[Generator[np.ndarray, None, None], float, int]:
+    """Open a video file and return a generator over its frames instead of loading them all into memory.
+
+    The capture is opened immediately, but frames are only decoded while the returned generator is consumed.
+    The generator is responsible for releasing the capture, so it should be iterated to completion.
+
+    :param file_path:   Path to the video file.
+    :param max_frames:  Optional maximum number of frames to extract. If None, all the frames are extracted.
+    :return:
+        - Generator yielding the frames of the video one by one, each in RGB.
+        - Frames per second of the video, as reported by OpenCV.
+        - Number of frames the generator is expected to yield. It is derived from the frame count reported by OpenCV
+          (capped by `max_frames`), so it may differ from the number of frames that can actually be decoded.
+    """
+    cap = _open_video(file_path)
+    fps = cap.get(cv2.CAP_PROP_FPS)
+    num_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+    if max_frames is not None and max_frames >= 0:
+        # The generator stops after `max_frames` frames, so the reported total must not exceed it.
+        num_frames = min(num_frames, max_frames)
+    frames = _lazy_extract_frames(cap, max_frames)
+    return frames, fps, num_frames
+
+
 def _open_video(file_path: str) -> cv2.VideoCapture:
     """Open a video file.
 
@@ -61,6 +70,20 @@ def _extract_frames(cap: cv2.VideoCapture, max_frames: Optional[int] = None) ->
     return frames
 
 
+def _lazy_extract_frames(cap: cv2.VideoCapture, max_frames: Optional[int] = None) -> Generator[np.ndarray, None, None]:
+    """Lazily read frames from an opened video capture, converting each one from BGR to RGB.
+
+    The capture is released once iteration has started and then ends for any reason: the video is exhausted,
+    `max_frames` is reached, the consumer closes the generator early, or an error is raised while reading.
+
+    :param cap:         Opened video capture object.
+    :param max_frames:  Optional maximum number of frames to yield. If None, frames are read until the video ends.
+    :return:            Generator yielding the frames one by one, each in RGB.
+    """
+    frames_counter = 0
+
+    try:
+        while frames_counter != max_frames:
+            frame_read_success, frame = cap.read()
+            if not frame_read_success:
+                break
+
+            frames_counter += 1
+            yield cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+    finally:
+        # `finally` also runs when the generator is closed before exhaustion (GeneratorExit is raised at the `yield`),
+        # so the capture is not leaked when the consumer stops iterating early.
+        cap.release()
+
+
 def save_video(output_path: str, frames: List[np.ndarray], fps: int) -> None:
     """Save a video locally. Depending on the extension, the video will be saved as a .mp4 file or as a .gif file.
 
@@ -75,7 +98,7 @@ def save_video(output_path: str, frames: List[np.ndarray], fps: int) -> None:
     if check_is_gif(output_path):
         save_gif(output_path, frames, fps)
     else:
-        save_mp4(output_path, frames, fps)
+        lazy_save_mp4(output_path, frames, fps)
 
 
 def save_gif(output_path: str, frames: List[np.ndarray], fps: int) -> None:
@@ -113,6 +136,24 @@ def save_mp4(output_path: str, frames: List[np.ndarray], fps: int) -> None:
     video_writer.release()
 
 
+def lazy_save_mp4(output_path: str, frames, fps: float) -> None:
+    """Save a video locally in .mp4 format, writing the frames one at a time as they are produced.
+
+    The video writer is created from the resolution of the first frame, and every frame is validated against it.
+    If `frames` yields nothing, no writer is created and nothing is written.
+
+    :param output_path: Where the video will be saved.
+    :param frames:      Iterable (e.g. a list or a generator) of frames, each in (H, W, C), RGB.
+                        All the frames are expected to have the same shape.
+    :param fps:         Frames per second of the output video.
+    :raises RuntimeError: If a frame fails `_validate_frame` (e.g. its resolution differs from the first frame).
+    """
+    video_height, video_width, video_writer = None, None, None
+
+    try:
+        for frame in frames:
+            if video_writer is None:
+                # The output resolution is only known once the first frame has been produced.
+                video_height, video_width = frame.shape[:2]
+                video_writer = cv2.VideoWriter(
+                    output_path,
+                    cv2.VideoWriter_fourcc(*"mp4v"),
+                    fps,
+                    (video_width, video_height),
+                )
+            _validate_frame(frame, video_height, video_width)
+            video_writer.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
+    finally:
+        # Release even when validation/writing fails midway; the writer does not exist if no frame was produced.
+        if video_writer is not None:
+            video_writer.release()
+
+
 def _validate_frames(frames: List[np.ndarray]) -> Tuple[float, float]:
     """Validate the frames to make sure that every frame has the same size and includes the channel dimension. (i.e. (H, W, C))
 
@@ -137,6 +178,24 @@ def _validate_frames(frames: List[np.ndarray]) -> Tuple[float, float]:
     return max_height, max_width
 
 
+def _validate_frame(frame: np.ndarray, control_height: int, control_width: int) -> None:
+    """Validate a single frame: it must be 3-dimensional (H, W, C) and match the expected resolution.
+
+    :param frame:           Frame to validate, expected in (H, W, C).
+    :param control_height:  Height that every frame of the video is expected to have.
+    :param control_width:   Width that every frame of the video is expected to have.
+    :raises RuntimeError:   If the frame is not 3-dimensional, or if its resolution differs from the expected one.
+    """
+    # Check the rank first: unpacking `shape[:2]` of a 1-dimensional frame would fail with an unrelated ValueError.
+    if frame.ndim != 3:
+        raise RuntimeError("Your frames must include 3 channels.")
+
+    height, width = frame.shape[:2]
+
+    if (height, width) != (control_height, control_width):
+        raise RuntimeError(
+            f"Current frame has resolution {height}x{width} but {control_height}x{control_width} is expected! "
+            "Please make sure that all the frames have the same shape."
+        )
+
+
 def show_video_from_disk(video_path: str, window_name: str = "Prediction"):
     """Display a video from disk using OpenCV.
 

@@ -1,7 +1,7 @@
 import os
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
-from typing import List, Optional, Tuple, Iterator, Union
+from typing import List, Optional, Tuple, Iterator, Union, Generator
 
 import cv2
 import numpy as np
@@ -16,6 +16,8 @@
 from .predictions import Prediction, DetectionPrediction, ClassificationPrediction
 from ...datasets.data_formats.bbox_formats import convert_bboxes
 
+from tqdm import tqdm
+
 
 @dataclass
 class ImagePrediction(ABC):
@@ -325,15 +327,16 @@ def save(self, *args, **kwargs) -> None:
 
 
 @dataclass
-class VideoPredictions(ImagesPredictions, ABC):
+class VideoPredictions(ABC):
     """Object wrapping the list of image predictions as a Video.
 
-    :attr _images_prediction_lst:   List of results of the run
+    :attr _images_prediction_gen:   List of results of the run
     :att fps:                       Frames per second of the video
     """
 
-    _images_prediction_lst: List[ImagePrediction]
+    _images_prediction_gen: Generator
     fps: float
+    n_frames: int
 
     @abstractmethod
     def show(self, *args, **kwargs) -> None:
@@ -504,20 +507,21 @@ def save(
 class VideoDetectionPrediction(VideoPredictions):
     """Object wrapping the list of image detection predictions as a Video.
 
-    :attr _images_prediction_lst:   List of the predictions results
+    :attr _images_prediction_gen:   List of the predictions results
     :att fps:                       Frames per second of the video
     """
 
-    _images_prediction_lst: List[ImageDetectionPrediction]
+    _images_prediction_gen: Generator
     fps: int
+    n_frames: int
 
     def draw(
         self,
         box_thickness: int = 2,
         show_confidence: bool = True,
         color_mapping: Optional[List[Tuple[int, int, int]]] = None,
         class_names: Optional[List[str]] = None,
-    ) -> List[np.ndarray]:
+    ) -> Generator:
         """Draw the predicted bboxes on the images.
 
         :param box_thickness:   Thickness of bounding boxes.
@@ -527,16 +531,14 @@ def draw(
         :param class_names:     List of class names to show. By default, is None which shows all classes using during training.
         :return:                List of images with predicted bboxes. Note that this does not modify the original image.
         """
-        frames_with_bbox = [
-            result.draw(
+
+        for result in tqdm(self._images_prediction_gen, total=self.n_frames, desc="Processing Video"):
+            yield result.draw(
                 box_thickness=box_thickness,
                 show_confidence=show_confidence,
                 color_mapping=color_mapping,
                 class_names=class_names,
             )
-            for result in self._images_prediction_lst
-        ]
-        return frames_with_bbox
 
     def show(
         self,