From 08632964d83374465c5b8a0347d6f30fd312ced6 Mon Sep 17 00:00:00 2001
From: YANYI ZHANG
Date: Wed, 11 Nov 2020 13:48:37 -0500
Subject: [PATCH] add ir-csn-152 into torchvideo model zoo (#1515)

---
 .../action_recognition/ir_CSN_152.py | 196 ++++++++++++++++++
 1 file changed, 196 insertions(+)
 create mode 100644 gluoncv/torch/model_zoo/action_recognition/ir_CSN_152.py

diff --git a/gluoncv/torch/model_zoo/action_recognition/ir_CSN_152.py b/gluoncv/torch/model_zoo/action_recognition/ir_CSN_152.py
new file mode 100644
index 0000000000..04ac652ce0
--- /dev/null
+++ b/gluoncv/torch/model_zoo/action_recognition/ir_CSN_152.py
@@ -0,0 +1,196 @@
+"""
+Video Classification with Channel-Separated Convolutional Networks
+ICCV 2019, https://arxiv.org/abs/1904.02811
+"""
+
+import torch
+import torch.nn as nn
+
+
+__all__ = ['ir_csn_resnet152_kinetics400']
+
+
+eps = 1e-3
+bn_mmt = 0.1
+
+
+class Affine(nn.Module):
+    def __init__(self, feature_in):
+        super(Affine, self).__init__()
+        self.weight = nn.Parameter(torch.randn(feature_in, 1, 1, 1))
+        self.bias = nn.Parameter(torch.randn(feature_in, 1, 1, 1))
+        self.weight.requires_grad = False
+        self.bias.requires_grad = False
+
+    def forward(self, x):
+        x = x * self.weight + self.bias
+        return x
+
+
+class ResNeXtBottleneck(nn.Module):
+    # ir-CSN bottleneck: 1x1x1 conv -> depthwise 3x3x3 conv -> 1x1x1 conv
+
+    def __init__(self, in_planes, planes, stride=1, temporal_stride=1,
+                 down_sample=None, expansion=2, temporal_kernel=3, use_affine=True):
+
+        super(ResNeXtBottleneck, self).__init__()
+        self.expansion = expansion
+        self.conv1 = nn.Conv3d(in_planes, planes, kernel_size=(1, 1, 1), bias=False, stride=(1, 1, 1))
+
+        if use_affine:
+            self.bn1 = Affine(planes)
+        else:
+            self.bn1 = nn.BatchNorm3d(planes, track_running_stats=True, eps=eps, momentum=bn_mmt)
+
+        self.conv3 = nn.Conv3d(planes, planes, kernel_size=(3, 3, 3), bias=False,
+                               stride=(temporal_stride, stride, stride),
+                               padding=((temporal_kernel - 1) // 2, 1, 1),
+                               groups=planes)  # groups=planes: channel-separated (depthwise) conv
+
+        if use_affine:
+            self.bn3 = Affine(planes)
+        else:
+            self.bn3 = nn.BatchNorm3d(planes, track_running_stats=True, eps=eps, momentum=bn_mmt)
+
+        self.conv4 = nn.Conv3d(
+            planes, planes * self.expansion, kernel_size=1, bias=False)
+
+        if use_affine:
+            self.bn4 = Affine(planes * self.expansion)
+        else:
+            self.bn4 = nn.BatchNorm3d(planes * self.expansion, track_running_stats=True, eps=eps, momentum=bn_mmt)
+
+        self.relu = nn.ReLU(inplace=True)
+        self.down_sample = down_sample
+        self.stride = stride
+
+    def forward(self, x):
+        residual = x
+
+        out = self.conv1(x)
+        out = self.bn1(out)
+        out = self.relu(out)
+
+        out = self.conv3(out)
+        out = self.bn3(out)
+        out = self.relu(out)
+
+        out = self.conv4(out)
+        out = self.bn4(out)
+
+        if self.down_sample is not None:
+            residual = self.down_sample(x)
+
+        out += residual
+        out = self.relu(out)
+
+        return out
+
+
+class ResNeXt(nn.Module):
+    def __init__(self,
+                 block,
+                 block_nums,
+                 num_classes=400,
+                 use_affine=True):
+
+        self.use_affine = use_affine
+        self.in_planes = 64
+        self.num_classes = num_classes
+
+        super(ResNeXt, self).__init__()
+
+        self.conv1 = nn.Conv3d(
+            3,
+            64,
+            kernel_size=(3, 7, 7),
+            stride=(1, 2, 2),
+            padding=(1, 3, 3),
+            bias=False)
+        if use_affine:
+            self.bn1 = Affine(64)
+        else:
+            self.bn1 = nn.BatchNorm3d(64, track_running_stats=True, eps=eps, momentum=bn_mmt)
+
+        self.relu = nn.ReLU(inplace=True)
+        self.maxpool = nn.MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1))
+
+        self.layer1 = self._make_layer(block, in_planes=64, planes=64, blocks=block_nums[0],
+                                       stride=1, expansion=4)
+
+        self.layer2 = self._make_layer(block, in_planes=256, planes=128, blocks=block_nums[1],
+                                       stride=2, temporal_stride=2, expansion=4)
+
+        self.layer3 = self._make_layer(block, in_planes=512, planes=256, blocks=block_nums[2],
+                                       stride=2, temporal_stride=2, expansion=4)
+
+        self.layer4 = self._make_layer(block, in_planes=1024, planes=512, blocks=block_nums[3],
+                                       stride=2, temporal_stride=2, expansion=4)
+
+        self.avgpool = nn.AdaptiveAvgPool3d(output_size=(1, 1, 1))
+
+        self.out_fc = nn.Linear(in_features=2048, out_features=num_classes)
+
+    def _make_layer(self,
+                    block,
+                    in_planes,
+                    planes,
+                    blocks,
+                    stride=1,
+                    temporal_stride=1,
+                    expansion=4):
+
+        if self.use_affine:
+            down_bn = Affine(planes * expansion)
+        else:
+            down_bn = nn.BatchNorm3d(planes * expansion, track_running_stats=True, eps=eps, momentum=bn_mmt)
+        down_sample = nn.Sequential(
+            nn.Conv3d(
+                in_planes,
+                planes * expansion,
+                kernel_size=1,
+                stride=(temporal_stride, stride, stride),
+                bias=False), down_bn)
+        layers = []
+        layers.append(
+            block(in_planes, planes, stride, temporal_stride, down_sample, expansion,
+                  temporal_kernel=3, use_affine=self.use_affine))
+        for _ in range(1, blocks):
+            layers.append(block(planes * expansion, planes, expansion=expansion,
+                                temporal_kernel=3, use_affine=self.use_affine))
+
+        return nn.Sequential(*layers)
+
+    def forward(self, x):
+
+        bs, _, _, _, _ = x.size()
+
+        x = self.conv1(x)
+        x = self.bn1(x)
+        x = self.relu(x)
+        x = self.maxpool(x)
+
+        x = self.layer1(x)
+        x = self.layer2(x)
+        x = self.layer3(x)
+        x = self.layer4(x)
+
+        x = self.avgpool(x)
+        x = x.view(bs, -1)  # flatten pooled features to (batch, 2048)
+        logits = self.out_fc(x)
+
+        return logits
+
+
+def ir_csn_resnet152_kinetics400(cfg):
+    model = ResNeXt(ResNeXtBottleneck,
+                    num_classes=cfg.CONFIG.DATA.NUM_CLASSES,
+                    block_nums=[3, 8, 36, 3],
+                    use_affine=cfg.CONFIG.MODEL.USE_AFFINE)
+
+    if cfg.CONFIG.MODEL.PRETRAINED:
+        from ..model_store import get_model_file
+        model.load_state_dict(torch.load(get_model_file('ir_csn_resnet152_kinetics400',
+                                                        tag=cfg.CONFIG.MODEL.PRETRAINED)))
+
+    return model
\ No newline at end of file
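Usage note: the entry point ir_csn_resnet152_kinetics400(cfg) expects the project's YAML-backed config object (cfg.CONFIG.DATA.NUM_CLASSES, cfg.CONFIG.MODEL.USE_AFFINE, cfg.CONFIG.MODEL.PRETRAINED). The sketch below is a minimal smoke test that sidesteps that config and builds the backbone added in this patch directly; the batch size, 8-frame clip length, 112-pixel crop, and use_affine=False choice are illustrative assumptions, not values taken from this PR, and the import path assumes the patched gluoncv is installed.

# Minimal smoke test for the ir-CSN-152 backbone added in this patch.
# Random weights, no cfg object; shapes here are illustrative assumptions.
import torch

from gluoncv.torch.model_zoo.action_recognition.ir_CSN_152 import (
    ResNeXt, ResNeXtBottleneck)

# ir-CSN-152 depth: [3, 8, 36, 3] bottleneck blocks per stage, 400 Kinetics classes.
model = ResNeXt(ResNeXtBottleneck, block_nums=[3, 8, 36, 3],
                num_classes=400, use_affine=False).eval()

# Dummy clip laid out as (batch, channels, frames, height, width),
# matching the Conv3d input convention used by conv1 above.
clip = torch.randn(1, 3, 8, 112, 112)

with torch.no_grad():
    logits = model(clip)

print(logits.shape)  # expected: torch.Size([1, 400])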