From 45335b7d1b5ae027875f72496796ba8f8df7e229 Mon Sep 17 00:00:00 2001
From: Vasilis Vryniotis
Date: Fri, 20 May 2022 13:02:28 +0100
Subject: [PATCH 1/6] Remove `(N, T, H, W, C) => (N, T, C, H, W)` conversion on
 presets

---
 torchvision/transforms/_presets.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/torchvision/transforms/_presets.py b/torchvision/transforms/_presets.py
index 765ae8ec3c4..93b2d0a8f85 100644
--- a/torchvision/transforms/_presets.py
+++ b/torchvision/transforms/_presets.py
@@ -99,7 +99,6 @@ def forward(self, vid: Tensor) -> Tensor:
             vid = vid.unsqueeze(dim=0)
             need_squeeze = True
 
-        vid = vid.permute(0, 1, 4, 2, 3)  # (N, T, H, W, C) => (N, T, C, H, W)
         N, T, C, H, W = vid.shape
         vid = vid.view(-1, C, H, W)
         vid = F.resize(vid, self.resize_size, interpolation=self.interpolation)

From c04fc3f54f4fe60cc273492af43b1c7dc0aa451e Mon Sep 17 00:00:00 2001
From: Vasilis Vryniotis
Date: Fri, 20 May 2022 13:02:54 +0100
Subject: [PATCH 2/6] Update docs.

---
 docs/source/models.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/source/models.rst b/docs/source/models.rst
index ea3c57bb62b..982867da981 100644
--- a/docs/source/models.rst
+++ b/docs/source/models.rst
@@ -472,6 +472,7 @@ Here is an example of how to use the pre-trained video classification models:
     from torchvision.models.video import r3d_18, R3D_18_Weights
 
     vid, _, _ = read_video("test/assets/videos/v_SoccerJuggling_g23_c01.avi")
+    vid = vid.permute(0, 3, 1, 2)  # (T, H, W, C) -> (T, C, H, W)
     vid = vid[:32]  # optionally shorten duration
 
     # Step 1: Initialize model with the best available weights

From 4aa5a9a79b10cbffb9bccbb8c32a149d5e69f3e2 Mon Sep 17 00:00:00 2001
From: Vasilis Vryniotis
Date: Fri, 20 May 2022 13:20:09 +0100
Subject: [PATCH 3/6] Fix the tests

---
 test/test_extended_models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_extended_models.py b/test/test_extended_models.py
index 408a8c0514c..396e79c3f6d 100644
--- a/test/test_extended_models.py
+++ b/test/test_extended_models.py
@@ -180,7 +180,7 @@ def test_transforms_jit(model_fn):
         "input_shape": (1, 3, 520, 520),
     },
     "video": {
-        "input_shape": (1, 4, 112, 112, 3),
+        "input_shape": (1, 4, 3, 112, 112),
     },
     "optical_flow": {
         "input_shape": (1, 3, 128, 128),

From e61bc7db183e26798ef2a00b3a45176e0218bef0 Mon Sep 17 00:00:00 2001
From: Vasilis Vryniotis
Date: Mon, 23 May 2022 12:44:33 +0100
Subject: [PATCH 4/6] Use `output_format` for `read_video()`

---
 docs/source/models.rst       | 3 +--
 gallery/plot_optical_flow.py | 3 +--
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/docs/source/models.rst b/docs/source/models.rst
index 982867da981..b549c25bf94 100644
--- a/docs/source/models.rst
+++ b/docs/source/models.rst
@@ -471,8 +471,7 @@ Here is an example of how to use the pre-trained video classification models:
     from torchvision.io.video import read_video
     from torchvision.models.video import r3d_18, R3D_18_Weights
 
-    vid, _, _ = read_video("test/assets/videos/v_SoccerJuggling_g23_c01.avi")
-    vid = vid.permute(0, 3, 1, 2)  # (T, H, W, C) -> (T, C, H, W)
+    vid, _, _ = read_video("test/assets/videos/v_SoccerJuggling_g23_c01.avi", output_format="TCHW")
     vid = vid[:32]  # optionally shorten duration
 
     # Step 1: Initialize model with the best available weights

diff --git a/gallery/plot_optical_flow.py b/gallery/plot_optical_flow.py
index 495422c1f3e..9e8d0006f1a 100644
--- a/gallery/plot_optical_flow.py
+++ b/gallery/plot_optical_flow.py
@@ -72,8 +72,7 @@ def plot(imgs, **imshow_kwargs):
 # single model input.
 
 from torchvision.io import read_video
 
-frames, _, _ = read_video(str(video_path))
-frames = frames.permute(0, 3, 1, 2)  # (N, H, W, C) -> (N, C, H, W)
+frames, _, _ = read_video(str(video_path), output_format="TCHW")
 img1_batch = torch.stack([frames[100], frames[150]])
 img2_batch = torch.stack([frames[101], frames[151]])
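Taken together, patches 1-4 replace the manual layout permute with `read_video()`'s `output_format` argument. For readers unfamiliar with that argument, a minimal sketch of the equivalence follows; the clip path is a placeholder, not a repository asset:

    import torch
    from torchvision.io import read_video

    # The default output layout is (T, H, W, C): time, height, width, channels.
    frames_thwc, _, _ = read_video("video.mp4")  # placeholder path

    # output_format="TCHW" returns (T, C, H, W) directly...
    frames_tchw, _, _ = read_video("video.mp4", output_format="TCHW")

    # ...which matches permuting the default layout by hand.
    assert torch.equal(frames_tchw, frames_thwc.permute(0, 3, 1, 2))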
From 095f07a30449c76ad69aec853a56038ed1a90e59 Mon Sep 17 00:00:00 2001
From: Vasilis Vryniotis
Date: Mon, 23 May 2022 12:49:12 +0100
Subject: [PATCH 5/6] Use `output_format` for `Kinetics()`

---
 references/video_classification/train.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/references/video_classification/train.py b/references/video_classification/train.py
index c7ac9e8c133..e1df08cbe4a 100644
--- a/references/video_classification/train.py
+++ b/references/video_classification/train.py
@@ -157,6 +157,7 @@ def main(args):
                 "avi",
                 "mp4",
             ),
+            output_format="TCHW",
         )
         if args.cache_dataset:
             print(f"Saving dataset_train to {cache_path}")
@@ -193,6 +194,7 @@ def main(args):
                 "avi",
                 "mp4",
             ),
+            output_format="TCHW",
         )
         if args.cache_dataset:
             print(f"Saving dataset_test to {cache_path}")
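Patch 5 threads the same argument through the `Kinetics` dataset, so the training reference receives `(T, C, H, W)` clips without an extra transform. A minimal usage sketch under assumed placeholders (the root directory and clip length are illustrative, not from the patch):

    from torchvision.datasets import Kinetics

    dataset = Kinetics(
        "datasets/kinetics400",  # placeholder root directory
        frames_per_clip=16,
        num_classes="400",
        split="val",
        output_format="TCHW",    # each clip is returned as (T, C, H, W)
    )

    video, audio, label = dataset[0]
    print(video.shape)  # torch.Size([16, 3, H, W]); H and W depend on the source clip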
" f"The images are resized to ``resize_size={self.resize_size}`` using ``interpolation={self.interpolation}``. " f"Finally the values are first rescaled to ``[0.0, 1.0]`` and then normalized using ``mean={self.mean}`` and " f"``std={self.std}``." @@ -195,4 +202,7 @@ def __repr__(self) -> str: return self.__class__.__name__ + "()" def describe(self) -> str: - return "The images are rescaled to ``[-1.0, 1.0]``." + return ( + "Accepts ``PIL.Image``, batched ``(B, C, H, W)`` and single ``(C, H, W)`` image ``torch.Tensor`` objects. " + "The images are rescaled to ``[-1.0, 1.0]``." + )