
Use Kinetics instead of Kinetics400 in references (#5787) #5952

Merged
merged 15 commits on May 20, 2022
12 changes: 10 additions & 2 deletions references/video_classification/README.md
@@ -18,7 +18,7 @@ We assume the training and validation AVI videos are stored at `/data/kinectics4

Run the training on a single node with 8 GPUs:
```bash
torchrun --nproc_per_node=8 train.py --data-path=/data/kinectics400 --train-dir=train --val-dir=val --batch-size=16 --cache-dataset --sync-bn --amp
torchrun --nproc_per_node=8 train.py --data-path=/data/kinectics400 --kinetics-version="400" --batch-size=16 --cache-dataset --sync-bn --amp
```

**Note:** all our models were trained on 8 nodes with 8 V100 GPUs each for a total of 64 GPUs. Expected training time for 64 GPUs is 24 hours, depending on the storage solution.
@@ -30,5 +30,13 @@ torchrun --nproc_per_node=8 train.py --data-path=/data/kinectics400 --train-dir=


```bash
python train.py --data-path=/data/kinectics400 --train-dir=train --val-dir=val --batch-size=8 --cache-dataset
python train.py --data-path=/data/kinectics400 --kinetics-version="400" --batch-size=8 --cache-dataset
```


### Additional Kinetics versions

Since the original release, additional versions of the Kinetics dataset have become available (Kinetics 600).
Our training scripts support these versions as well: set the `--kinetics-version` parameter to `"600"`.

**Note:** training on Kinetics 600 requires a different set of hyperparameters for optimal performance. We do not provide Kinetics 600 pretrained models.
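For example, a single-node Kinetics 600 run could look like the following sketch, adapted from the Kinetics 400 command above (the dataset path `/data/kinetics600` is illustrative):

```bash
torchrun --nproc_per_node=8 train.py --data-path=/data/kinetics600 --kinetics-version="600" --batch-size=16 --cache-dataset --sync-bn --amp
```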
4 changes: 1 addition & 3 deletions references/video_classification/presets.py
@@ -1,6 +1,6 @@
import torch
from torchvision.transforms import transforms
from transforms import ConvertBHWCtoBCHW, ConvertBCHWtoCBHW
from transforms import ConvertBCHWtoCBHW


class VideoClassificationPresetTrain:
@@ -14,7 +14,6 @@ def __init__(
hflip_prob=0.5,
):
trans = [
ConvertBHWCtoBCHW(),
transforms.ConvertImageDtype(torch.float32),
transforms.Resize(resize_size),
]
@@ -31,7 +30,6 @@ class VideoClassificationPresetEval:
def __init__(self, *, crop_size, resize_size, mean=(0.43216, 0.394666, 0.37645), std=(0.22803, 0.22145, 0.216989)):
self.transforms = transforms.Compose(
[
ConvertBHWCtoBCHW(),
Collaborator:
Why can we simply remove this? It seems something is off here. The old Kinetics400 states

- video (Tensor[T, H, W, C]): the `T` video frames

whereas the new Kinetics states

- video (Tensor[T, C, H, W]): the `T` video frames in torch.uint8 tensor

Was this intentional or did we botch this during the introduction of the new class? In any case, we need to adapt our deprecation warnings accordingly:

.. warning::
    This class was deprecated in ``0.12`` and will be removed in ``0.14``. Please use
    ``Kinetics(..., num_classes='400')`` instead.

warnings.warn(
    "The Kinetics400 class is deprecated since 0.12 and will be removed in 0.14."
    "Please use Kinetics(..., num_classes='400') instead."
)

Contributor Author:
This was intentional: the first iteration was convenient because that's the raw output from FFMPEG/pyav, but torchvision ops like transforms and conv operators work on Tensor[T, C, H, W] (as you've seen, you'd otherwise need a transform to deal with this). The change makes sense, and I'll update the deprecation warning. How does that sound?
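A minimal sketch of the layout difference being discussed, with shapes assumed purely for illustration:

```python
import torch

# Old Kinetics400 layout: T, H, W, C (the raw FFMPEG/pyav frame order)
clip_old = torch.randint(0, 256, (16, 112, 112, 3), dtype=torch.uint8)

# New Kinetics layout: T, C, H, W (what torchvision transforms and conv ops expect)
new_shape = (16, 3, 112, 112)

# ConvertBHWCtoBCHW was just this permute, which is why it can now be dropped:
assert clip_old.permute(0, 3, 1, 2).shape == torch.Size(new_shape)
```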

transforms.ConvertImageDtype(torch.float32),
transforms.Resize(resize_size),
transforms.Normalize(mean=mean, std=std),
21 changes: 13 additions & 8 deletions references/video_classification/train.py
@@ -130,8 +130,8 @@ def main(args):

# Data loading code
print("Loading data")
traindir = os.path.join(args.data_path, args.train_dir)
valdir = os.path.join(args.data_path, args.val_dir)
traindir = os.path.join(args.data_path, "train")
valdir = os.path.join(args.data_path, "val")

print("Loading training data")
st = time.time()
@@ -145,9 +145,11 @@
else:
if args.distributed:
print("It is recommended to pre-compute the dataset cache on a single-gpu first, as it will be faster")
dataset = torchvision.datasets.Kinetics400(
traindir,
dataset = torchvision.datasets.Kinetics(
args.data_path,
Collaborator:
I'm guessing this is done because we can now select the split through the dataset class rather than using a different root directory, correct? If so, can we remove the traindir and valdir variables above, and the corresponding options from the CLI?

Contributor Author:
Yup; and done

Contributor:
@pmeier @bjuncek I checked on the servers, and I believe there is a reason why these parameters are there. The datasets contain the training/val data at different preprocessed sizes. Example:

/datasets01/kinetics/070618/400$ ls 
README  list  list_cvt  scripts  test  train  train_avi-160p  train_avi-288p  val  val_avi-160p  val_avi-288p

Can we please restore this?

Member:
As discussed with @datumbox:

The different pre-processed sizes don't seem to be part of the original downloadable dataset. It looks more like internal pre-processing that someone did.

If someone was relying on this originally, they should have gotten a deprecation warning telling them to use Kinetics, they should have noticed that this wasn't supported anymore, and they should have told us.

This never happened, so we can assume that support for the different sizes isn't actually needed. If it is, we'll know soon enough.

So we're OK to remove that.

frames_per_clip=args.clip_len,
num_classes=args.kinetics_version,
split="train",
step_between_clips=1,
transform=transform_train,
frame_rate=15,
@@ -179,9 +181,11 @@
else:
if args.distributed:
print("It is recommended to pre-compute the dataset cache on a single-gpu first, as it will be faster")
dataset_test = torchvision.datasets.Kinetics400(
valdir,
dataset_test = torchvision.datasets.Kinetics(
args.data_path,
frames_per_clip=args.clip_len,
num_classes=args.kinetics_version,
split="val",
step_between_clips=1,
transform=transform_test,
frame_rate=15,
@@ -312,8 +316,9 @@ def parse_args():
parser = argparse.ArgumentParser(description="PyTorch Video Classification Training")

parser.add_argument("--data-path", default="/datasets01_101/kinetics/070618/", type=str, help="dataset path")
parser.add_argument("--train-dir", default="train_avi-480p", type=str, help="name of train dir")
parser.add_argument("--val-dir", default="val_avi-480p", type=str, help="name of val dir")
parser.add_argument(
"--kinetics-version", default="400", type=str, choices=["400", "600"], help="Select kinetics version"
Contributor:
You need to update the commands shown in README.md.

Contributor Author:
Done.

)
parser.add_argument("--model", default="r2plus1d_18", type=str, help="model name")
parser.add_argument("--device", default="cuda", type=str, help="device (Use cuda or cpu Default: cuda)")
parser.add_argument("--clip-len", default=16, type=int, metavar="N", help="number of frames per clip")
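For reference, after this change the datasets are constructed roughly as follows; this is a sketch with an illustrative root path and clip settings, and it omits the caching and transform handling from the script:

```python
import torchvision

# Split selection now happens through the Kinetics class itself rather than
# through separate train/val root directories (root path below is illustrative).
dataset_train = torchvision.datasets.Kinetics(
    "/data/kinectics400",
    frames_per_clip=16,
    num_classes="400",      # matches --kinetics-version ("400" or "600")
    split="train",
    step_between_clips=1,
    frame_rate=15,
)

dataset_val = torchvision.datasets.Kinetics(
    "/data/kinectics400",
    frames_per_clip=16,
    num_classes="400",
    split="val",
    step_between_clips=1,
    frame_rate=15,
)
```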
7 changes: 0 additions & 7 deletions references/video_classification/transforms.py
@@ -2,13 +2,6 @@
import torch.nn as nn


class ConvertBHWCtoBCHW(nn.Module):
"""Convert tensor from (B, H, W, C) to (B, C, H, W)"""

def forward(self, vid: torch.Tensor) -> torch.Tensor:
return vid.permute(0, 3, 1, 2)


class ConvertBCHWtoCBHW(nn.Module):
"""Convert tensor from (B, C, H, W) to (C, B, H, W)"""

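The remaining `ConvertBCHWtoCBHW` transform still reorders each clip into the channels-first layout the video models consume; a minimal sketch with assumed shapes:

```python
import torch

clip_tchw = torch.rand(16, 3, 112, 112)    # T, C, H, W, as produced by the presets
clip_cthw = clip_tchw.permute(1, 0, 2, 3)  # C, T, H, W, what ConvertBCHWtoCBHW returns
print(clip_cthw.shape)                     # torch.Size([3, 16, 112, 112])
```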
1 change: 1 addition & 0 deletions torchvision/datasets/kinetics.py
@@ -308,6 +308,7 @@ def __init__(
warnings.warn(
"The Kinetics400 class is deprecated since 0.12 and will be removed in 0.14."
"Please use Kinetics(..., num_classes='400') instead."
"Note that Kinetics(..., num_classes='400') returns video in a more logical Tensor[T, C, H, W] format."
)
if any(value is not None for value in (num_classes, split, download, num_download_workers)):
raise RuntimeError(
9 changes: 5 additions & 4 deletions torchvision/io/video.py
@@ -153,13 +153,14 @@ def _read_from_stream(
gc.collect()

if pts_unit == "sec":
# TODO: we should change all of this from ground up to simply take
# sec and convert to MS in C++
start_offset = int(math.floor(start_offset * (1 / stream.time_base)))
if end_offset != float("inf"):
end_offset = int(math.ceil(end_offset * (1 / stream.time_base)))
else:
warnings.warn(
"The pts_unit 'pts' gives wrong results and will be removed in a "
+ "follow-up version. Please use pts_unit 'sec'."
"The pts_unit 'pts' gives wrong results. Please use pts_unit 'sec'."
)

frames = {}
@@ -176,9 +177,9 @@
# can't use regex directly because of some weird characters sometimes...
pos = extradata.find(b"DivX")
d = extradata[pos:]
o = re.search(br"DivX(\d+)Build(\d+)(\w)", d)
o = re.search(rb"DivX(\d+)Build(\d+)(\w)", d)
if o is None:
o = re.search(br"DivX(\d+)b(\d+)(\w)", d)
o = re.search(rb"DivX(\d+)b(\d+)(\w)", d)
if o is not None:
should_buffer = o.group(3) == b"p"
seek_offset = start_offset
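The `pts_unit="sec"` branch above converts second offsets into stream timestamps by dividing by the stream's `time_base`; a minimal sketch with an assumed time base:

```python
import math
from fractions import Fraction

# Illustrative 90 kHz time base; the real value comes from the PyAV stream.
time_base = Fraction(1, 90000)
start_sec, end_sec = 1.5, 3.0

start_pts = int(math.floor(start_sec * (1 / time_base)))  # 135000
end_pts = int(math.ceil(end_sec * (1 / time_base)))       # 270000
print(start_pts, end_pts)
```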