From a98c0568cebf60c2a17548b4af8db226a8d88899 Mon Sep 17 00:00:00 2001 From: Bruno Korbar Date: Thu, 5 May 2022 10:44:43 +0000 Subject: [PATCH 01/12] Dataset creation now supports "new" version of Kinetics dataset --- references/video_classification/train.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/references/video_classification/train.py b/references/video_classification/train.py index 26c856da878..a0ba7420a4e 100644 --- a/references/video_classification/train.py +++ b/references/video_classification/train.py @@ -145,9 +145,11 @@ def main(args): else: if args.distributed: print("It is recommended to pre-compute the dataset cache on a single-gpu first, as it will be faster") - dataset = torchvision.datasets.Kinetics400( - traindir, + dataset = torchvision.datasets.Kinetics( + args.data_path, frames_per_clip=args.clip_len, + num_classes="400", + split="val", step_between_clips=1, transform=transform_train, frame_rate=15, @@ -179,9 +181,11 @@ def main(args): else: if args.distributed: print("It is recommended to pre-compute the dataset cache on a single-gpu first, as it will be faster") - dataset_test = torchvision.datasets.Kinetics400( - valdir, + dataset_test = torchvision.datasets.Kinetics( + args.data_path, frames_per_clip=args.clip_len, + num_classes="400", + split="val", step_between_clips=1, transform=transform_test, frame_rate=15, From c5c36d85e53368512c927c170647b79f64322c00 Mon Sep 17 00:00:00 2001 From: Bruno Korbar Date: Thu, 5 May 2022 10:44:56 +0000 Subject: [PATCH 02/12] remove unnecessary warning for now --- torchvision/io/video.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/torchvision/io/video.py b/torchvision/io/video.py index d026e754546..ad87633fa0c 100644 --- a/torchvision/io/video.py +++ b/torchvision/io/video.py @@ -153,14 +153,11 @@ def _read_from_stream( gc.collect() if pts_unit == "sec": + # TODO: we should change all of this from ground up to simply take + # sec and convert 
to MS in C++ start_offset = int(math.floor(start_offset * (1 / stream.time_base))) if end_offset != float("inf"): end_offset = int(math.ceil(end_offset * (1 / stream.time_base))) - else: - warnings.warn( - "The pts_unit 'pts' gives wrong results and will be removed in a " - + "follow-up version. Please use pts_unit 'sec'." - ) frames = {} should_buffer = True From 2ee355abfe6d61d62ad5433521795e00bb726507 Mon Sep 17 00:00:00 2001 From: Bruno Korbar Date: Thu, 5 May 2022 10:50:51 +0000 Subject: [PATCH 03/12] provide kinetics option --- references/video_classification/train.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/references/video_classification/train.py b/references/video_classification/train.py index a0ba7420a4e..a8fc5d2e8c2 100644 --- a/references/video_classification/train.py +++ b/references/video_classification/train.py @@ -11,7 +11,7 @@ import utils from torch import nn from torch.utils.data.dataloader import default_collate -from torchvision.datasets.samplers import DistributedSampler, UniformClipSampler, RandomClipSampler +from torchvision.datasets.samplers import DistributedSampler, RandomClipSampler, UniformClipSampler def train_one_epoch(model, criterion, optimizer, lr_scheduler, data_loader, device, epoch, print_freq, scaler=None): @@ -148,7 +148,7 @@ def main(args): dataset = torchvision.datasets.Kinetics( args.data_path, frames_per_clip=args.clip_len, - num_classes="400", + num_classes=args.kinetics_version, split="val", step_between_clips=1, transform=transform_train, @@ -184,7 +184,7 @@ def main(args): dataset_test = torchvision.datasets.Kinetics( args.data_path, frames_per_clip=args.clip_len, - num_classes="400", + num_classes=args.kinetics_version, split="val", step_between_clips=1, transform=transform_test, @@ -318,6 +318,9 @@ def parse_args(): parser.add_argument("--data-path", default="/datasets01_101/kinetics/070618/", type=str, help="dataset path") parser.add_argument("--train-dir", default="train_avi-480p", 
type=str, help="name of train dir") parser.add_argument("--val-dir", default="val_avi-480p", type=str, help="name of val dir") + parser.add_argument( + "--kinetics-version", default="400", type=str, choices=["400", "600"], help="Select kinetics version" + ) parser.add_argument("--model", default="r2plus1d_18", type=str, help="model name") parser.add_argument("--device", default="cuda", type=str, help="device (Use cuda or cpu Default: cuda)") parser.add_argument("--clip-len", default=16, type=int, metavar="N", help="number of frames per clip") From 5f90f09c490d655d2fea31da6a151a1f892c186e Mon Sep 17 00:00:00 2001 From: Bruno Korbar Date: Thu, 5 May 2022 10:59:57 +0000 Subject: [PATCH 04/12] new reading somehow doesn't need BHWC to BCHW transform --- references/video_classification/presets.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/references/video_classification/presets.py b/references/video_classification/presets.py index c12d00a022b..ef774052257 100644 --- a/references/video_classification/presets.py +++ b/references/video_classification/presets.py @@ -1,6 +1,6 @@ import torch from torchvision.transforms import transforms -from transforms import ConvertBHWCtoBCHW, ConvertBCHWtoCBHW +from transforms import ConvertBCHWtoCBHW class VideoClassificationPresetTrain: @@ -14,7 +14,6 @@ def __init__( hflip_prob=0.5, ): trans = [ - ConvertBHWCtoBCHW(), transforms.ConvertImageDtype(torch.float32), transforms.Resize(resize_size), ] @@ -31,7 +30,6 @@ class VideoClassificationPresetEval: def __init__(self, *, crop_size, resize_size, mean=(0.43216, 0.394666, 0.37645), std=(0.22803, 0.22145, 0.216989)): self.transforms = transforms.Compose( [ - ConvertBHWCtoBCHW(), transforms.ConvertImageDtype(torch.float32), transforms.Resize(resize_size), transforms.Normalize(mean=mean, std=std), From 55595935cf2c48135666e212656b55e7aacd255c Mon Sep 17 00:00:00 2001 From: Bruno Korbar Date: Thu, 12 May 2022 12:14:34 +0000 Subject: [PATCH 05/12] Addressing minor 
comments --- references/video_classification/train.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/references/video_classification/train.py b/references/video_classification/train.py index a8fc5d2e8c2..c7ac9e8c133 100644 --- a/references/video_classification/train.py +++ b/references/video_classification/train.py @@ -11,7 +11,7 @@ import utils from torch import nn from torch.utils.data.dataloader import default_collate -from torchvision.datasets.samplers import DistributedSampler, RandomClipSampler, UniformClipSampler +from torchvision.datasets.samplers import DistributedSampler, UniformClipSampler, RandomClipSampler def train_one_epoch(model, criterion, optimizer, lr_scheduler, data_loader, device, epoch, print_freq, scaler=None): @@ -130,8 +130,8 @@ def main(args): # Data loading code print("Loading data") - traindir = os.path.join(args.data_path, args.train_dir) - valdir = os.path.join(args.data_path, args.val_dir) + traindir = os.path.join(args.data_path, "train") + valdir = os.path.join(args.data_path, "val") print("Loading training data") st = time.time() @@ -149,7 +149,7 @@ def main(args): args.data_path, frames_per_clip=args.clip_len, num_classes=args.kinetics_version, - split="val", + split="train", step_between_clips=1, transform=transform_train, frame_rate=15, @@ -316,8 +316,6 @@ def parse_args(): parser = argparse.ArgumentParser(description="PyTorch Video Classification Training") parser.add_argument("--data-path", default="/datasets01_101/kinetics/070618/", type=str, help="dataset path") - parser.add_argument("--train-dir", default="train_avi-480p", type=str, help="name of train dir") - parser.add_argument("--val-dir", default="val_avi-480p", type=str, help="name of val dir") parser.add_argument( "--kinetics-version", default="400", type=str, choices=["400", "600"], help="Select kinetics version" ) From 01b0d9bf8185755db79d9476aa22d68b85be5846 Mon Sep 17 00:00:00 2001 From: Bruno Korbar Date: Thu, 12 May 2022 12:17:51 
+0000 Subject: [PATCH 06/12] Adding kinetics deprecation warning for the old Kinetics400 class --- torchvision/datasets/kinetics.py | 1 + 1 file changed, 1 insertion(+) diff --git a/torchvision/datasets/kinetics.py b/torchvision/datasets/kinetics.py index 651dbdc158f..a1322c7bf7f 100644 --- a/torchvision/datasets/kinetics.py +++ b/torchvision/datasets/kinetics.py @@ -308,6 +308,7 @@ def __init__( warnings.warn( "The Kinetics400 class is deprecated since 0.12 and will be removed in 0.14." "Please use Kinetics(..., num_classes='400') instead." + "Note that Kinetics(..., num_classes='400') returns video in a more logicalTensor[T, C, H, W] format." ) if any(value is not None for value in (num_classes, split, download, num_download_workers)): raise RuntimeError( From a84b8075d444121c0b437c903efd8f1d14867bf6 Mon Sep 17 00:00:00 2001 From: Bruno Korbar Date: Thu, 12 May 2022 12:51:43 +0000 Subject: [PATCH 07/12] lint error --- torchvision/io/video.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/torchvision/io/video.py b/torchvision/io/video.py index ad87633fa0c..8d1c3e7e0fa 100644 --- a/torchvision/io/video.py +++ b/torchvision/io/video.py @@ -154,7 +154,7 @@ def _read_from_stream( if pts_unit == "sec": # TODO: we should change all of this from ground up to simply take - # sec and convert to MS in C++ + # sec and convert to MS in C++ start_offset = int(math.floor(start_offset * (1 / stream.time_base))) if end_offset != float("inf"): end_offset = int(math.ceil(end_offset * (1 / stream.time_base))) @@ -173,9 +173,9 @@ def _read_from_stream( # can't use regex directly because of some weird characters sometimes... 
pos = extradata.find(b"DivX") d = extradata[pos:] - o = re.search(br"DivX(\d+)Build(\d+)(\w)", d) + o = re.search(rb"DivX(\d+)Build(\d+)(\w)", d) if o is None: - o = re.search(br"DivX(\d+)b(\d+)(\w)", d) + o = re.search(rb"DivX(\d+)b(\d+)(\w)", d) if o is not None: should_buffer = o.group(3) == b"p" seek_offset = start_offset From 2d2841f0ff6c71353b8e6dc836ac61e92c8eff64 Mon Sep 17 00:00:00 2001 From: Bruno Korbar Date: Fri, 20 May 2022 14:57:49 +0100 Subject: [PATCH 08/12] Update torchvision/datasets/kinetics.py Co-authored-by: Nicolas Hug --- torchvision/datasets/kinetics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchvision/datasets/kinetics.py b/torchvision/datasets/kinetics.py index a1322c7bf7f..937cee495e0 100644 --- a/torchvision/datasets/kinetics.py +++ b/torchvision/datasets/kinetics.py @@ -308,7 +308,7 @@ def __init__( warnings.warn( "The Kinetics400 class is deprecated since 0.12 and will be removed in 0.14." "Please use Kinetics(..., num_classes='400') instead." - "Note that Kinetics(..., num_classes='400') returns video in a more logicalTensor[T, C, H, W] format." + "Note that Kinetics(..., num_classes='400') returns video in a more logical Tensor[T, C, H, W] format." 
) if any(value is not None for value in (num_classes, split, download, num_download_workers)): raise RuntimeError( From e8401c1d6f19812d00fe798974aad8f454f1058f Mon Sep 17 00:00:00 2001 From: Bruno Korbar Date: Fri, 20 May 2022 15:26:45 +0100 Subject: [PATCH 09/12] Updating README --- references/video_classification/README.md | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/references/video_classification/README.md b/references/video_classification/README.md index 204bfda5ba5..418f382edec 100644 --- a/references/video_classification/README.md +++ b/references/video_classification/README.md @@ -18,7 +18,7 @@ We assume the training and validation AVI videos are stored at `/data/kinectics4 Run the training on a single node with 8 GPUs: ```bash -torchrun --nproc_per_node=8 train.py --data-path=/data/kinectics400 --train-dir=train --val-dir=val --batch-size=16 --cache-dataset --sync-bn --amp +torchrun --nproc_per_node=8 train.py --data-path=/data/kinectics400 --kinetics-version="400" --batch-size=16 --cache-dataset --sync-bn --amp ``` **Note:** all our models were trained on 8 nodes with 8 V100 GPUs each for a total of 64 GPUs. Expected training time for 64 GPUs is 24 hours, depending on the storage solution. @@ -30,5 +30,13 @@ torchrun --nproc_per_node=8 train.py --data-path=/data/kinectics400 --train-dir= ```bash -python train.py --data-path=/data/kinectics400 --train-dir=train --val-dir=val --batch-size=8 --cache-dataset +python train.py --data-path=/data/kinectics400 --kinetics-version="400" --batch-size=8 --cache-dataset ``` + + +### Additional Kinetics versions + +Since the original release, additional versions of Kinetics dataset became available (Kinetics 600). +Our training scripts support these versions of dataset as well by setting the `--kinetics-version` parameter to `"600"`. + +**Note:** training on Kinetics 600 requires a different set of hyperparameters for optimal performance. 
We do not provide Kinetics 600 pretrained models. \ No newline at end of file From 8c05f0f39ec47e4a8f6217af8ce315e4159bafb8 Mon Sep 17 00:00:00 2001 From: Bruno Korbar Date: Fri, 20 May 2022 15:33:23 +0100 Subject: [PATCH 10/12] Remove BHWC to BCHW --- references/video_classification/transforms.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/references/video_classification/transforms.py b/references/video_classification/transforms.py index a0ce691bae7..2a7cc2a4a66 100644 --- a/references/video_classification/transforms.py +++ b/references/video_classification/transforms.py @@ -2,13 +2,6 @@ import torch.nn as nn -class ConvertBHWCtoBCHW(nn.Module): - """Convert tensor from (B, H, W, C) to (B, C, H, W)""" - - def forward(self, vid: torch.Tensor) -> torch.Tensor: - return vid.permute(0, 3, 1, 2) - - class ConvertBCHWtoCBHW(nn.Module): """Convert tensor from (B, C, H, W) to (C, B, H, W)""" From ab29225e8c58216a9117f6f38347246647edc023 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 20 May 2022 16:37:37 +0100 Subject: [PATCH 11/12] Put warning back --- torchvision/io/video.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/torchvision/io/video.py b/torchvision/io/video.py index 8d1c3e7e0fa..d94d2b0157e 100644 --- a/torchvision/io/video.py +++ b/torchvision/io/video.py @@ -158,6 +158,10 @@ def _read_from_stream( start_offset = int(math.floor(start_offset * (1 / stream.time_base))) if end_offset != float("inf"): end_offset = int(math.ceil(end_offset * (1 / stream.time_base))) + else: + warnings.warn( + "The pts_unit 'pts' gives wrong results. Please use pts_unit 'sec'." 
+ ) frames = {} should_buffer = True From af4e30c7d9faef359966609b309094d1ba9f6d6c Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 20 May 2022 16:40:29 +0100 Subject: [PATCH 12/12] formatting --- references/video_classification/README.md | 2 +- torchvision/io/video.py | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/references/video_classification/README.md b/references/video_classification/README.md index 418f382edec..c387e2e7158 100644 --- a/references/video_classification/README.md +++ b/references/video_classification/README.md @@ -39,4 +39,4 @@ python train.py --data-path=/data/kinectics400 --kinetics-version="400" --batch- Since the original release, additional versions of Kinetics dataset became available (Kinetics 600). Our training scripts support these versions of dataset as well by setting the `--kinetics-version` parameter to `"600"`. -**Note:** training on Kinetics 600 requires a different set of hyperparameters for optimal performance. We do not provide Kinetics 600 pretrained models. \ No newline at end of file +**Note:** training on Kinetics 600 requires a different set of hyperparameters for optimal performance. We do not provide Kinetics 600 pretrained models. diff --git a/torchvision/io/video.py b/torchvision/io/video.py index d94d2b0157e..1c758661164 100644 --- a/torchvision/io/video.py +++ b/torchvision/io/video.py @@ -159,9 +159,7 @@ def _read_from_stream( if end_offset != float("inf"): end_offset = int(math.ceil(end_offset * (1 / stream.time_base))) else: - warnings.warn( - "The pts_unit 'pts' gives wrong results. Please use pts_unit 'sec'." - ) + warnings.warn("The pts_unit 'pts' gives wrong results. Please use pts_unit 'sec'.") frames = {} should_buffer = True