From c63915fa50f265970e26fb1aa944f87beda5adf4 Mon Sep 17 00:00:00 2001 From: raghavanone Date: Thu, 5 Oct 2023 05:42:14 +0530 Subject: [PATCH 001/152] WIP --- .../models/fast/configuration_fast.py | 78 ++ src/transformers/models/fast/modeling_fast.py | 919 ++++++++++++++++++ 2 files changed, 997 insertions(+) create mode 100644 src/transformers/models/fast/configuration_fast.py create mode 100644 src/transformers/models/fast/modeling_fast.py diff --git a/src/transformers/models/fast/configuration_fast.py b/src/transformers/models/fast/configuration_fast.py new file mode 100644 index 000000000000..53a3791a48ae --- /dev/null +++ b/src/transformers/models/fast/configuration_fast.py @@ -0,0 +1,78 @@ +from transformers import PretrainedConfig + + +class FastConfig(PretrainedConfig): + + def __init__(self, + backbone_config=None, + backbone_stage1_in_channels=[64, 64, 64, 64, 64, 64, 64, 64, 64, 64], + backbone_stage1_out_channels=[64, 64, 64, 64, 64, 64, 64, 64, 64, 64], + backbone_stage1_kernel_size=[(3, 3), (3, 3), (3, 1), (3, 3), (3, 1), (3, 3), (3, 3), (1, 3), (3, 3), + (3, 3)], + backbone_stage1_stride=[1, 2, 1, 1, 1, 1, 1, 1, 1, 1], + backbone_stage1_dilation=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + backbone_stage1_groups=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + + backbone_stage2_in_channels=[64, 128, 128, 128, 128, 128, 128, 128, 128, 128], + backbone_stage2_out_channels=[128, 128, 128, 128, 128, 128, 128, 128, 128, 128], + backbone_stage2_kernel_size=[(3, 3), (3, 3), (3, 1), (3, 3), (3, 1), (3, 3), (3, 3), (1, 3), (3, 3), + (3, 3)], + backbone_stage2_stride=[1, 2, 1, 1, 1, 1, 1, 1, 1, 1], + backbone_stage2_dilation=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + backbone_stage2_groups=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + + backbone_stage3_in_channels=[64, 64, 64, 64, 64, 64, 64, 64, 64, 64], + backbone_stage3_out_channels=[64, 64, 64, 64, 64, 64, 64, 64, 64, 64], + backbone_stage3_kernel_size=[(3, 3), (3, 3), (3, 1), (3, 3), (3, 1), (3, 3), (3, 3), (1, 3), (3, 3), + (3, 3)], + backbone_stage3_stride=[1, 2, 1, 1, 1, 1, 1, 1, 1, 1], + backbone_stage3_dilation=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + backbone_stage3_groups=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + + backbone_stage4_in_channels=[64, 64, 64, 64, 64, 64, 64, 64, 64, 64], + backbone_stage4_out_channels=[64, 64, 64, 64, 64, 64, 64, 64, 64, 64], + backbone_stage4_kernel_size=[(3, 3), (3, 3), (3, 1), (3, 3), (3, 1), (3, 3), (3, 3), (1, 3), (3, 3), + (3, 3)], + backbone_stage4_stride=[1, 2, 1, 1, 1, 1, 1, 1, 1, 1], + backbone_stage4_dilation=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + backbone_stage4_groups=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + + neck_in_channels=[64, 128, 256, 512], + neck_out_channels=[128, 128, 128, 128], + neck_kernel_size=[(3, 3), (3, 3), (3, 3), (3, 3)], + neck_stride=[1, 1, 1, 1], + neck_dilation=[1, 1, 1, 1], + neck_groups=[1, 1, 1, 1], + **kwargs + ): + self.backbone_config = { + "kernel_size": 3, + "stride": 2, + "dilation": 1, + "groups": 1, + "bias": False, + "has_shuffle": False, + "in_channels": 3, + "out_channels": 64, + "use_bn": True, + "act_func": "relu", + "dropout_rate": 0, + "ops_order": "weight_bn_act" + } + super.__init__(**kwargs) + if backbone_config is not None: + self.backbone_config.update(backbone_config) + + self.backbone_stage1_in_channels = backbone_stage1_in_channels + self.backbone_stage1_out_channels = backbone_stage1_out_channels + self.backbone_stage1_kernel_size = backbone_stage1_kernel_size, + self.backbone_stage1_stride = backbone_stage1_stride, + self.backbone_stage1_dilation = backbone_stage1_dilation, + self.backbone_stage1_groups 
= backbone_stage1_groups, + + self.neck_in_channels = neck_in_channels, + self.neck_out_channels = neck_out_channels, + self.neck_kernel_size_channels = neck_kernel_size, + self.neck_stride_channels = neck_stride, + self.neck_dilation_channels = neck_dilation, + self.neck_groups_channels = neck_groups, diff --git a/src/transformers/models/fast/modeling_fast.py b/src/transformers/models/fast/modeling_fast.py new file mode 100644 index 000000000000..4684378ce8eb --- /dev/null +++ b/src/transformers/models/fast/modeling_fast.py @@ -0,0 +1,919 @@ +import math +from collections import OrderedDict + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import time +import cv2 + +class DiceLoss(nn.Module): + def __init__(self, loss_weight=1.0): + super(DiceLoss, self).__init__() + self.loss_weight = loss_weight + + def forward(self, input, target, mask, reduce=True): + batch_size = input.size(0) + input = torch.sigmoid(input) + + input = input.contiguous().view(batch_size, -1) + target = target.contiguous().view(batch_size, -1).float() + mask = mask.contiguous().view(batch_size, -1).float() + + input = input * mask + target = target * mask + + a = torch.sum(input * target, dim=1) + b = torch.sum(input * input, dim=1) + 0.001 + c = torch.sum(target * target, dim=1) + 0.001 + d = (2 * a) / (b + c) + loss = 1 - d + + loss = self.loss_weight * loss + + if reduce: + loss = torch.mean(loss) + + return loss + + +class EmbLoss_v1(nn.Module): + def __init__(self, feature_dim=4, loss_weight=1.0): + super(EmbLoss_v1, self).__init__() + self.feature_dim = feature_dim + self.loss_weight = loss_weight + self.delta_v = 0.5 + self.delta_d = 1.5 + self.weights = (1.0, 1.0) + + def forward_single(self, emb, instance, kernel, training_mask): + training_mask = (training_mask > 0.5).long() + kernel = (kernel > 0.5).long() + instance = instance * training_mask + instance_kernel = (instance * kernel).view(-1) + instance = instance.view(-1) + emb = emb.view(self.feature_dim, -1) + + unique_labels, unique_ids = torch.unique(instance_kernel, sorted=True, return_inverse=True) + num_instance = unique_labels.size(0) + if num_instance <= 1: + return 0 + + emb_mean = emb.new_zeros((self.feature_dim, num_instance), dtype=torch.float32) + for i, lb in enumerate(unique_labels): + if lb == 0: + continue + ind_k = instance_kernel == lb + emb_mean[:, i] = torch.mean(emb[:, ind_k], dim=1) + + l_agg = emb.new_zeros(num_instance, dtype=torch.float32) # bug + for i, lb in enumerate(unique_labels): + if lb == 0: + continue + ind = instance == lb + emb_ = emb[:, ind] + dist = (emb_ - emb_mean[:, i:i + 1]).norm(p=2, dim=0) + dist = F.relu(dist - self.delta_v) ** 2 + l_agg[i] = torch.mean(torch.log(dist + 1.0)) + l_agg = torch.mean(l_agg[1:]) + + if num_instance > 2: + emb_interleave = emb_mean.permute(1, 0).repeat(num_instance, 1) + emb_band = emb_mean.permute(1, 0).repeat(1, num_instance).view(-1, self.feature_dim) + # print(seg_band) + + mask = (1 - torch.eye(num_instance, dtype=torch.int8)).view(-1, 1).repeat(1, self.feature_dim) + mask = mask.view(num_instance, num_instance, -1) + mask[0, :, :] = 0 + mask[:, 0, :] = 0 + mask = mask.view(num_instance * num_instance, -1) + # print(mask) + + dist = emb_interleave - emb_band + dist = dist[mask > 0].view(-1, self.feature_dim).norm(p=2, dim=1) + dist = F.relu(2 * self.delta_d - dist) ** 2 + l_dis = torch.mean(torch.log(dist + 1.0)) + else: + l_dis = 0 + + l_agg = self.weights[0] * l_agg + l_dis = self.weights[1] * l_dis + l_reg = 
torch.mean(torch.log(torch.norm(emb_mean, 2, 0) + 1.0)) * 0.001 + loss = l_agg + l_dis + l_reg + return loss + + def forward(self, emb, instance, kernel, training_mask, reduce=True): + loss_batch = emb.new_zeros((emb.size(0)), dtype=torch.float32) + + for i in range(loss_batch.size(0)): + loss_batch[i] = self.forward_single(emb[i], instance[i], kernel[i], training_mask[i]) + + loss_batch = self.loss_weight * loss_batch + + if reduce: + loss_batch = torch.mean(loss_batch) + + return loss_batch + + +class EmbLoss_v2(nn.Module): + def __init__(self, feature_dim=4, loss_weight=1.0): + super(EmbLoss_v2, self).__init__() + self.feature_dim = feature_dim + self.loss_weight = loss_weight + self.delta_v = 0.5 + self.delta_d = 1.5 + self.weights = (1.0, 1.0) + + def forward_single(self, emb, instance, kernel, training_mask): + training_mask = (training_mask > 0.5).long() + kernel = (kernel > 0.5).long() + instance = instance * training_mask + instance_kernel = (instance * kernel).view(-1) + instance = instance.view(-1) + emb = emb.view(self.feature_dim, -1) + + unique_labels, unique_ids = torch.unique(instance_kernel, sorted=True, return_inverse=True) + num_instance = unique_labels.size(0) + if num_instance <= 1: + return 0 + + emb_mean = emb.new_zeros((self.feature_dim, num_instance), dtype=torch.float32) + for i, lb in enumerate(unique_labels): + if lb == 0: + continue + ind_k = instance_kernel == lb + emb_mean[:, i] = torch.mean(emb[:, ind_k], dim=1) + + l_agg = emb.new_zeros(num_instance, dtype=torch.float32) # bug + for i, lb in enumerate(unique_labels): + if lb == 0: + continue + ind = instance == lb + emb_ = emb[:, ind] + dist = (emb_ - emb_mean[:, i:i + 1]).norm(p=2, dim=0) + dist = F.relu(dist - self.delta_v) ** 2 + l_agg[i] = torch.mean(torch.log(dist + 1.0)) + l_agg = torch.mean(l_agg[1:]) + + if num_instance > 2: + emb_interleave = emb_mean.permute(1, 0).repeat(num_instance, 1) + emb_band = emb_mean.permute(1, 0).repeat(1, num_instance).view(-1, self.feature_dim) + # print(seg_band) + + mask = (1 - torch.eye(num_instance, dtype=torch.int8)).view(-1, 1).repeat(1, self.feature_dim) + mask = mask.view(num_instance, num_instance, -1) + mask[0, :, :] = 0 + mask[:, 0, :] = 0 + mask = mask.view(num_instance * num_instance, -1) + # print(mask) + + dist = emb_interleave - emb_band + dist = dist[mask > 0].view(-1, self.feature_dim).norm(p=2, dim=1) + dist = F.relu(2 * self.delta_d - dist) ** 2 + # l_dis = torch.mean(torch.log(dist + 1.0)) + + l_dis = [torch.log(dist + 1.0)] + emb_bg = emb[:, instance == 0].view(self.feature_dim, -1) + if emb_bg.size(1) > 100: + rand_ind = np.random.permutation(emb_bg.size(1))[:100] + emb_bg = emb_bg[:, rand_ind] + if emb_bg.size(1) > 0: + for i, lb in enumerate(unique_labels): + if lb == 0: + continue + dist = (emb_bg - emb_mean[:, i:i + 1]).norm(p=2, dim=0) + dist = F.relu(2 * self.delta_d - dist) ** 2 + l_dis_bg = torch.mean(torch.log(dist + 1.0), 0, keepdim=True) + l_dis.append(l_dis_bg) + l_dis = torch.mean(torch.cat(l_dis)) + else: + l_dis = 0 + + l_agg = self.weights[0] * l_agg + l_dis = self.weights[1] * l_dis + l_reg = torch.mean(torch.log(torch.norm(emb_mean, 2, 0) + 1.0)) * 0.001 + loss = l_agg + l_dis + l_reg + return loss + + def forward(self, emb, instance, kernel, training_mask, reduce=True): + loss_batch = emb.new_zeros((emb.size(0)), dtype=torch.float32) + + for i in range(loss_batch.size(0)): + loss_batch[i] = self.forward_single(emb[i], instance[i], kernel[i], training_mask[i]) + + loss_batch = self.loss_weight * loss_batch + + if reduce: + 
loss_batch = torch.mean(loss_batch) + + return loss_batch + + +def set_layer_from_config(layer_config): + if layer_config is None: + return None + + name2layer = { + ConvLayer.__name__: ConvLayer, + RepConvLayer.__name__: RepConvLayer + } + + layer_name = layer_config.pop('name') + layer = name2layer[layer_name] + return layer.build_from_config(layer_config) + + +def get_same_padding(kernel_size): + if isinstance(kernel_size, tuple): + assert len(kernel_size) == 2, 'invalid kernel size: %s' % kernel_size + p1 = get_same_padding(kernel_size[0]) + p2 = get_same_padding(kernel_size[1]) + return p1, p2 + assert isinstance(kernel_size, int), 'kernel size should be either `int` or `tuple`' + assert kernel_size % 2 > 0, 'kernel size should be odd number' + return kernel_size // 2 + + +class My2DLayer(nn.Module): + + def __init__(self, in_channels, out_channels, + use_bn=True, act_func='relu', dropout_rate=0, ops_order='weight_bn_act'): + super(My2DLayer, self).__init__() + self.in_channels = in_channels + self.out_channels = out_channels + + self.use_bn = use_bn + self.act_func = act_func + self.dropout_rate = dropout_rate + self.ops_order = ops_order + + """ modules """ + modules = {} + # batch norm + if self.use_bn: + if self.bn_before_weight: + modules['bn'] = nn.BatchNorm2d(in_channels) + else: + modules['bn'] = nn.BatchNorm2d(out_channels) + else: + modules['bn'] = None + # activation + modules['act'] = build_activation(self.act_func, self.ops_list[0] != 'act') + # dropout + if self.dropout_rate > 0: + modules['dropout'] = nn.Dropout2d(self.dropout_rate, inplace=True) + else: + modules['dropout'] = None + # weight + modules['weight'] = self.weight_op() + + # add modules + for op in self.ops_list: + if modules[op] is None: + continue + elif op == 'weight': + if modules['dropout'] is not None: + self.add_module('dropout', modules['dropout']) + for key in modules['weight']: + self.add_module(key, modules['weight'][key]) + else: + self.add_module(op, modules[op]) + + @property + def ops_list(self): + return self.ops_order.split('_') + + @property + def bn_before_weight(self): + for op in self.ops_list: + if op == 'bn': + return True + elif op == 'weight': + return False + raise ValueError('Invalid ops_order: %s' % self.ops_order) + + def weight_op(self): + raise NotImplementedError + + """ Methods defined in MyModule """ + + def forward(self, x): + for module in self._modules.values(): + x = module(x) + return x + + @property + def module_str(self): + raise NotImplementedError + + @property + def config(self): + return { + 'in_channels': self.in_channels, + 'out_channels': self.out_channels, + 'use_bn': self.use_bn, + 'act_func': self.act_func, + 'dropout_rate': self.dropout_rate, + 'ops_order': self.ops_order, + } + + @staticmethod + def build_from_config(config): + raise NotImplementedError + + def get_flops(self, x): + raise NotImplementedError + + @staticmethod + def is_zero_layer(): + return False + + +def generate_bbox(keys, label, score, scales, cfg): + label_num = len(keys) + bboxes = [] + scores = [] + for index in range(1, label_num): + i = keys[index] + ind = (label == i) + ind_np = ind.data.cpu().numpy() + points = np.array(np.where(ind_np)).transpose((1, 0)) + if points.shape[0] < cfg.test_cfg.min_area: + label[ind] = 0 + continue + score_i = score[ind].mean().item() + if score_i < cfg.test_cfg.min_score: + label[ind] = 0 + continue + + if cfg.test_cfg.bbox_type == 'rect': + rect = cv2.minAreaRect(points[:, ::-1]) + alpha = math.sqrt(math.sqrt(points.shape[0] / (rect[1][0] * 
rect[1][1]))) + rect = (rect[0], (rect[1][0] * alpha, rect[1][1] * alpha), rect[2]) + bbox = cv2.boxPoints(rect) * scales + + elif cfg.test_cfg.bbox_type == 'poly': + binary = np.zeros(label.shape, dtype='uint8') + binary[ind_np] = 1 + contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + bbox = contours[0] * scales + bbox = bbox.astype('int32') + bboxes.append(bbox.reshape(-1).tolist()) + scores.append(score_i) + return bboxes, scores + + +class ConvLayer(My2DLayer): + + def __init__(self, in_channels, out_channels, + kernel_size=3, stride=1, dilation=1, groups=1, bias=False, has_shuffle=False, + use_bn=True, act_func='relu', dropout_rate=0, ops_order='weight_bn_act'): + self.kernel_size = kernel_size + self.stride = stride + self.dilation = dilation + self.groups = groups + self.bias = bias + self.has_shuffle = has_shuffle + + super(ConvLayer, self).__init__(in_channels, out_channels, use_bn, act_func, dropout_rate, ops_order) + + def weight_op(self): + padding = get_same_padding(self.kernel_size) + if isinstance(padding, int): + padding *= self.dilation + else: + padding[0] *= self.dilation + padding[1] *= self.dilation + + weight_dict = OrderedDict() + weight_dict['conv'] = nn.Conv2d( + self.in_channels, self.out_channels, kernel_size=self.kernel_size, stride=self.stride, padding=padding, + dilation=self.dilation, groups=self.groups, bias=self.bias + ) + + return weight_dict + + @staticmethod + def build_from_config(config): + return ConvLayer(**config) + + +class RepConvLayer(nn.Module): + + def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, groups=1, deploy=False): + super(RepConvLayer, self).__init__() + + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.stride = stride + self.dilation = dilation + self.groups = groups + self.deploy = deploy + + assert len(kernel_size) == 2 + padding = (int(((kernel_size[0] - 1) * dilation) / 2), + int(((kernel_size[1] - 1) * dilation) / 2)) + + self.nonlinearity = nn.ReLU(inplace=True) + + if deploy: + self.fused_conv = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, + kernel_size=kernel_size, stride=stride, padding=padding, + dilation=dilation, groups=groups, bias=True) + else: + self.main_conv = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, + kernel_size=kernel_size, stride=stride, padding=padding, + dilation=dilation, groups=groups, bias=False) + self.main_bn = nn.BatchNorm2d(num_features=out_channels) + + ver_pad = (int(((kernel_size[0] - 1) * dilation) / 2), 0) + hor_pad = (0, int(((kernel_size[1] - 1) * dilation) / 2)) + + if kernel_size[1] != 1: # 卷积核的宽大于1 -> 有垂直卷积 + self.ver_conv = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, + kernel_size=(kernel_size[0], 1), + stride=stride, padding=ver_pad, + dilation=dilation, groups=groups, bias=False) + self.ver_bn = nn.BatchNorm2d(num_features=out_channels) + else: + self.ver_conv, self.ver_bn = None, None + + if kernel_size[0] != 1: # 卷积核的高大于1 -> 有水平卷积 + self.hor_conv = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, + kernel_size=(1, kernel_size[1]), + stride=stride, padding=hor_pad, + dilation=dilation, groups=groups, bias=False) + self.hor_bn = nn.BatchNorm2d(num_features=out_channels) + else: + self.hor_conv, self.hor_bn = None, None + + self.rbr_identity = nn.BatchNorm2d( + num_features=in_channels) if out_channels == in_channels and stride == 1 else None + + def forward(self, input): + if hasattr(self, 'fused_conv'): 
+ return self.nonlinearity(self.fused_conv(input)) + else: + main_outputs = self.main_conv(input) + main_outputs = self.main_bn(main_outputs) + if self.ver_conv is not None: + vertical_outputs = self.ver_conv(input) + vertical_outputs = self.ver_bn(vertical_outputs) + else: + vertical_outputs = 0 + + if self.hor_conv is not None: + horizontal_outputs = self.hor_conv(input) + horizontal_outputs = self.hor_bn(horizontal_outputs) + else: + horizontal_outputs = 0 + + if self.rbr_identity is None: + id_out = 0 + else: + id_out = self.rbr_identity(input) + + return self.nonlinearity(main_outputs + vertical_outputs + horizontal_outputs + id_out) + + def _identity_to_conv(self, identity): + if identity is None: + return 0, 0 + assert isinstance(identity, nn.BatchNorm2d) + if not hasattr(self, 'id_tensor'): + input_dim = self.in_channels // self.groups + kernel_value = np.zeros((self.in_channels, input_dim, 1, 1), dtype=np.float32) + for i in range(self.in_channels): + kernel_value[i, i % input_dim, 0, 0] = 1 + id_tensor = torch.from_numpy(kernel_value).to(identity.weight.device) + self.id_tensor = self._pad_to_mxn_tensor(id_tensor) + kernel = self.id_tensor + running_mean = identity.running_mean + running_var = identity.running_var + gamma = identity.weight + beta = identity.bias + eps = identity.eps + std = (running_var + eps).sqrt() + t = (gamma / std).reshape(-1, 1, 1, 1) + return kernel * t, beta - running_mean * gamma / std + + def _fuse_bn_tensor(self, conv, bn): + kernel = conv.weight + kernel = self._pad_to_mxn_tensor(kernel) + running_mean = bn.running_mean + running_var = bn.running_var + gamma = bn.weight + beta = bn.bias + eps = bn.eps + std = (running_var + eps).sqrt() + t = (gamma / std).reshape(-1, 1, 1, 1) + return kernel * t, beta - running_mean * gamma / std + + def get_equivalent_kernel_bias(self): + kernel_mxn, bias_mxn = self._fuse_bn_tensor(self.main_conv, self.main_bn) + if self.ver_conv is not None: + kernel_mx1, bias_mx1 = self._fuse_bn_tensor(self.ver_conv, self.ver_bn) + else: + kernel_mx1, bias_mx1 = 0, 0 + if self.hor_conv is not None: + kernel_1xn, bias_1xn = self._fuse_bn_tensor(self.hor_conv, self.hor_bn) + else: + kernel_1xn, bias_1xn = 0, 0 + kernel_id, bias_id = self._identity_to_conv(self.rbr_identity) + kernel_mxn = kernel_mxn + kernel_mx1 + kernel_1xn + kernel_id + bias_mxn = bias_mxn + bias_mx1 + bias_1xn + bias_id + return kernel_mxn, bias_mxn + + def _pad_to_mxn_tensor(self, kernel): + kernel_height, kernel_width = self.kernel_size + height, width = kernel.shape[2:] + pad_left_right = (kernel_width - width) // 2 + pad_top_down = (kernel_height - height) // 2 + return torch.nn.functional.pad(kernel, [pad_left_right, pad_left_right, + pad_top_down, pad_top_down]) + + def switch_to_deploy(self): + if hasattr(self, 'fused_conv'): + return + kernel, bias = self.get_equivalent_kernel_bias() + self.fused_conv = nn.Conv2d(in_channels=self.main_conv.in_channels, + out_channels=self.main_conv.out_channels, + kernel_size=self.main_conv.kernel_size, stride=self.main_conv.stride, + padding=self.main_conv.padding, dilation=self.main_conv.dilation, + groups=self.main_conv.groups, bias=True) + self.fused_conv.weight.data = kernel + self.fused_conv.bias.data = bias + self.deploy = True + for para in self.parameters(): + para.detach_() + for attr in ['main_conv', 'main_bn', 'ver_conv', 'ver_bn', 'hor_conv', 'hor_bn']: + if hasattr(self, attr): + self.__delattr__(attr) + + if hasattr(self, 'rbr_identity'): + self.__delattr__('rbr_identity') + + def switch_to_test(self): + 
kernel, bias = self.get_equivalent_kernel_bias() + self.fused_conv = nn.Conv2d(in_channels=self.main_conv.in_channels, + out_channels=self.main_conv.out_channels, + kernel_size=self.main_conv.kernel_size, stride=self.main_conv.stride, + padding=self.main_conv.padding, dilation=self.main_conv.dilation, + groups=self.main_conv.groups, bias=True) + self.fused_conv.weight.data = kernel + self.fused_conv.bias.data = bias + for para in self.fused_conv.parameters(): + para.detach_() + self.deploy = True + + def switch_to_train(self): + if hasattr(self, 'fused_conv'): + self.__delattr__('fused_conv') + self.deploy = False + + @staticmethod + def is_zero_layer(): + return False + + @property + def module_str(self): + return 'Rep_%dx%d' % (self.kernel_size[0], self.kernel_size[1]) + + @property + def config(self): + return {'name': RepConvLayer.__name__, + 'in_channels': self.in_channels, + 'out_channels': self.out_channels, + 'kernel_size': self.kernel_size, + 'stride': self.stride, + 'dilation': self.dilation, + 'groups': self.groups} + + @staticmethod + def build_from_config(config): + return RepConvLayer(**config) + + +class TextNet(nn.Module): + + def __init__(self, first_conv, stage1, stage2, stage3, stage4): + super(TextNet, self).__init__() + + self.first_conv = first_conv + self.stage1 = nn.ModuleList(stage1) + self.stage2 = nn.ModuleList(stage2) + self.stage3 = nn.ModuleList(stage3) + self.stage4 = nn.ModuleList(stage4) + + self._initialize_weights() + + def _initialize_weights(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight) + elif isinstance(m, nn.BatchNorm2d): + m.weight.data.fill_(1) + m.bias.data.zero_() + + def forward(self, x): + x = self.first_conv(x) + output = list() + + for block in self.stage1: + x = block(x) + output.append(x) + + for block in self.stage2: + x = block(x) + output.append(x) + + for block in self.stage3: + x = block(x) + output.append(x) + + for block in self.stage4: + x = block(x) + output.append(x) + + return output + + @staticmethod + def build_from_config(config): + first_conv = set_layer_from_config(config['first_conv']) + stage1, stage2, stage3, stage4 = [], [], [], [] + for block_config in config['stage1']: + stage1.append(set_layer_from_config(block_config)) + for block_config in config['stage2']: + stage2.append(set_layer_from_config(block_config)) + for block_config in config['stage3']: + stage3.append(set_layer_from_config(block_config)) + for block_config in config['stage4']: + stage4.append(set_layer_from_config(block_config)) + + net = TextNet(first_conv, stage1, stage2, stage3, stage4) + + return net + + +class FASTNeck(nn.Module): + def __init__(self, reduce_layer1, reduce_layer2, reduce_layer3, reduce_layer4): + super(FASTNeck, self).__init__() + self.reduce_layer1 = reduce_layer1 + self.reduce_layer2 = reduce_layer2 + self.reduce_layer3 = reduce_layer3 + self.reduce_layer4 = reduce_layer4 + + self._initialize_weights() + + def _initialize_weights(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight) + elif isinstance(m, nn.BatchNorm2d): + m.weight.data.fill_(1) + m.bias.data.zero_() + + def _upsample(self, x, y): + _, _, H, W = y.size() + return F.upsample(x, size=(H, W), mode='bilinear') + + def forward(self, x): + f1, f2, f3, f4 = x + f1 = self.reduce_layer1(f1) + f2 = self.reduce_layer2(f2) + f3 = self.reduce_layer3(f3) + f4 = self.reduce_layer4(f4) + + f2 = self._upsample(f2, f1) + f3 = self._upsample(f3, f1) + f4 = self._upsample(f4, f1) + f 
= torch.cat((f1, f2, f3, f4), 1) + return f + + @staticmethod + def build_from_config(config): + reduce_layer1 = set_layer_from_config(config['reduce_layer1']) + reduce_layer2 = set_layer_from_config(config['reduce_layer2']) + reduce_layer3 = set_layer_from_config(config['reduce_layer3']) + reduce_layer4 = set_layer_from_config(config['reduce_layer4']) + return FASTNeck(reduce_layer1, reduce_layer2, reduce_layer3, reduce_layer4) + + +class FASTHead(nn.Module): + def __init__(self, conv, blocks, final, pooling_size, + loss_text, loss_kernel, loss_emb, dropout_ratio=0): + super(FASTHead, self).__init__() + self.conv = conv + if blocks is not None: + self.blocks = nn.ModuleList(blocks) + else: + self.blocks = None + self.final = final + + # self.text_loss = build_loss(loss_text) + # self.kernel_loss = build_loss(loss_kernel) + # self.emb_loss = build_loss(loss_emb) + + self.pooling_size = pooling_size + + self.pooling_1s = nn.MaxPool2d(kernel_size=self.pooling_size, stride=1, + padding=(self.pooling_size - 1) // 2) + self.pooling_2s = nn.MaxPool2d(kernel_size=self.pooling_size // 2 + 1, stride=1, + padding=(self.pooling_size // 2) // 2) + + if dropout_ratio > 0: + self.dropout = nn.Dropout2d(dropout_ratio) + else: + self.dropout = None + + self._initialize_weights() + + def _initialize_weights(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight) + elif isinstance(m, nn.BatchNorm2d): + m.weight.data.fill_(1) + m.bias.data.zero_() + + def forward(self, x): + x = self.conv(x) + if self.blocks is not None: + for block in self.blocks: + x = block(x) + if self.dropout is not None: + x = self.dropout(x) + x = self.final(x) + return x + + def get_results(self, out, img_meta, cfg, scale=2): + + if not self.training: + torch.cuda.synchronize() + start = time.time() + + org_img_size = img_meta['org_img_size'][0] + img_size = img_meta['img_size'][0] # 640*640 + batch_size = out.size(0) + outputs = dict() + + texts = F.interpolate(out[:, 0:1, :, :], size=(img_size[0] // scale, img_size[1] // scale), + mode='nearest') # B*1*320*320 + texts = self._max_pooling(texts, scale=scale) # B*1*320*320 + score_maps = torch.sigmoid_(texts) # B*1*320*320 + score_maps = F.interpolate(score_maps, size=(img_size[0], img_size[1]), mode='nearest') # B*1*640*640 + score_maps = score_maps.squeeze(1) # B*640*640 + + kernels = (out[:, 0, :, :] > 0).to(torch.uint8) # B*160*160 + if kernels.is_cuda: + labels_ = ccl_cuda.ccl_batch(kernels) # B*160*160 + else: + labels_ = [] + for kernel in kernels.numpy(): + ret, label_ = cv2.connectedComponents(kernel) + labels_.append(label_) + labels_ = np.array(labels_) + labels_ = torch.from_numpy(labels_) + labels = labels_.unsqueeze(1).to(torch.float32) # B*1*160*160 + labels = F.interpolate(labels, size=(img_size[0] // scale, img_size[1] // scale), mode='nearest') # B*1*320*320 + labels = self._max_pooling(labels, scale=scale) + labels = F.interpolate(labels, size=(img_size[0], img_size[1]), mode='nearest') # B*1*640*640 + labels = labels.squeeze(1).to(torch.int32) # B*640*640 + + keys = [torch.unique(labels_[i], sorted=True) for i in range(batch_size)] + + if not self.training: + torch.cuda.synchronize() + outputs.update(dict( + post_time=time.time() - start + )) + + outputs.update(dict(kernels=kernels.data.cpu())) + + scales = (float(org_img_size[1]) / float(img_size[1]), + float(org_img_size[0]) / float(img_size[0])) + + results = [] + for i in range(batch_size): + bboxes, scores = generate_bbox(keys[i], labels[i], score_maps[i], scales, 
cfg) + results.append(dict( + bboxes=bboxes, + scores=scores + )) + outputs.update(dict(results=results)) + + return outputs + + def _max_pooling(self, x, scale=1): + if scale == 1: + x = self.pooling_1s(x) + elif scale == 2: + x = self.pooling_2s(x) + return x + + # def loss(self, out, gt_texts, gt_kernels, training_masks, gt_instances): + # # output + # kernels = out[:, 0, :, :] # 4*640*640 + # texts = self._max_pooling(kernels, scale=1) # 4*640*640 + # embs = out[:, 1:, :, :] # 4*4*640*640 + # + # # text loss + # selected_masks = ohem_batch(texts, gt_texts, training_masks) + # loss_text = self.text_loss(texts, gt_texts, selected_masks, reduce=False) + # iou_text = iou((texts > 0).long(), gt_texts, training_masks, reduce=False) + # losses = dict( + # loss_text=loss_text, + # iou_text=iou_text + # ) + # + # # kernel loss + # selected_masks = gt_texts * training_masks + # loss_kernel = self.kernel_loss(kernels, gt_kernels, selected_masks, reduce=False) + # loss_kernel = torch.mean(loss_kernel, dim=0) + # iou_kernel = iou((kernels > 0).long(), gt_kernels, selected_masks, reduce=False) + # losses.update(dict( + # loss_kernels=loss_kernel, + # iou_kernel=iou_kernel + # )) + # + # # auxiliary loss + # loss_emb = self.emb_loss(embs, gt_instances, gt_kernels, training_masks, reduce=False) + # losses.update(dict( + # loss_emb=loss_emb + # )) + # + # return losses + + @staticmethod + def build_from_config(config, **kwargs): + conv = set_layer_from_config(config['conv']) + final = set_layer_from_config(config['final']) + try: + blocks = [] + for block_config in config['blocks']: + blocks.append(set_layer_from_config(block_config)) + return FASTHead(conv, blocks, final, **kwargs) + except: + return FASTHead(conv, None, final, **kwargs) + + +class FAST(nn.Module): + def __init__(self, backbone, neck, detection_head): + super(FAST, self).__init__() + self.backbone = TextNet.build_from_config(backbone) + self.neck = FASTNeck.build_from_config(neck) + self.det_head = FASTHead.build_from_config(detection_head) + + def _upsample(self, x, size, scale=1): + _, _, H, W = size + return F.interpolate(x, size=(H // scale, W // scale), mode='bilinear') + + def forward(self, imgs, gt_texts=None, gt_kernels=None, training_masks=None, + gt_instances=None, img_metas=None, cfg=None): + outputs = dict() + + if not self.training: + torch.cuda.synchronize() + start = time.time() + + # backbone + f = self.backbone(imgs) + + if not self.training: + torch.cuda.synchronize() + outputs.update(dict( + backbone_time=time.time() - start + )) + start = time.time() + + # reduce channel + f = self.neck(f) + + if not self.training: + torch.cuda.synchronize() + outputs.update(dict( + neck_time=time.time() - start + )) + start = time.time() + + # detection + det_out = self.det_head(f) + + if not self.training: + torch.cuda.synchronize() + outputs.update(dict( + det_head_time=time.time() - start + )) + + if self.training: + det_out = self._upsample(det_out, imgs.size(), scale=1) + det_loss = self.det_head.loss(det_out, gt_texts, gt_kernels, training_masks, gt_instances) + outputs.update(det_loss) + else: + det_out = self._upsample(det_out, imgs.size(), scale=4) + det_res = self.det_head.get_results(det_out, img_metas, cfg, scale=2) + outputs.update(det_res) + + return outputs From d8e1bc6eea3398efc1cb0f39402b0545222e826b Mon Sep 17 00:00:00 2001 From: raghavanone Date: Sat, 7 Oct 2023 19:44:37 +0530 Subject: [PATCH 002/152] Add config and modeling for Fast model --- src/transformers/models/fast/__init__.py | 0 
.../models/fast/configuration_fast.py | 199 ++++-- src/transformers/models/fast/modeling_fast.py | 603 +++++------------- 3 files changed, 293 insertions(+), 509 deletions(-) create mode 100644 src/transformers/models/fast/__init__.py diff --git a/src/transformers/models/fast/__init__.py b/src/transformers/models/fast/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/src/transformers/models/fast/configuration_fast.py b/src/transformers/models/fast/configuration_fast.py index 53a3791a48ae..aab305edb5de 100644 --- a/src/transformers/models/fast/configuration_fast.py +++ b/src/transformers/models/fast/configuration_fast.py @@ -3,65 +3,94 @@ class FastConfig(PretrainedConfig): - def __init__(self, - backbone_config=None, - backbone_stage1_in_channels=[64, 64, 64, 64, 64, 64, 64, 64, 64, 64], - backbone_stage1_out_channels=[64, 64, 64, 64, 64, 64, 64, 64, 64, 64], - backbone_stage1_kernel_size=[(3, 3), (3, 3), (3, 1), (3, 3), (3, 1), (3, 3), (3, 3), (1, 3), (3, 3), - (3, 3)], - backbone_stage1_stride=[1, 2, 1, 1, 1, 1, 1, 1, 1, 1], - backbone_stage1_dilation=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - backbone_stage1_groups=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - - backbone_stage2_in_channels=[64, 128, 128, 128, 128, 128, 128, 128, 128, 128], - backbone_stage2_out_channels=[128, 128, 128, 128, 128, 128, 128, 128, 128, 128], - backbone_stage2_kernel_size=[(3, 3), (3, 3), (3, 1), (3, 3), (3, 1), (3, 3), (3, 3), (1, 3), (3, 3), - (3, 3)], - backbone_stage2_stride=[1, 2, 1, 1, 1, 1, 1, 1, 1, 1], - backbone_stage2_dilation=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - backbone_stage2_groups=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - - backbone_stage3_in_channels=[64, 64, 64, 64, 64, 64, 64, 64, 64, 64], - backbone_stage3_out_channels=[64, 64, 64, 64, 64, 64, 64, 64, 64, 64], - backbone_stage3_kernel_size=[(3, 3), (3, 3), (3, 1), (3, 3), (3, 1), (3, 3), (3, 3), (1, 3), (3, 3), - (3, 3)], - backbone_stage3_stride=[1, 2, 1, 1, 1, 1, 1, 1, 1, 1], - backbone_stage3_dilation=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - backbone_stage3_groups=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - - backbone_stage4_in_channels=[64, 64, 64, 64, 64, 64, 64, 64, 64, 64], - backbone_stage4_out_channels=[64, 64, 64, 64, 64, 64, 64, 64, 64, 64], - backbone_stage4_kernel_size=[(3, 3), (3, 3), (3, 1), (3, 3), (3, 1), (3, 3), (3, 3), (1, 3), (3, 3), - (3, 3)], - backbone_stage4_stride=[1, 2, 1, 1, 1, 1, 1, 1, 1, 1], - backbone_stage4_dilation=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - backbone_stage4_groups=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - - neck_in_channels=[64, 128, 256, 512], - neck_out_channels=[128, 128, 128, 128], - neck_kernel_size=[(3, 3), (3, 3), (3, 3), (3, 3)], - neck_stride=[1, 1, 1, 1], - neck_dilation=[1, 1, 1, 1], - neck_groups=[1, 1, 1, 1], - **kwargs - ): - self.backbone_config = { - "kernel_size": 3, - "stride": 2, - "dilation": 1, - "groups": 1, - "bias": False, - "has_shuffle": False, - "in_channels": 3, - "out_channels": 64, - "use_bn": True, - "act_func": "relu", - "dropout_rate": 0, - "ops_order": "weight_bn_act" - } - super.__init__(**kwargs) - if backbone_config is not None: - self.backbone_config.update(backbone_config) + def __init__( + self, + backbone_kernel_size=3, + backbone_stride=2, + backbone_dilation=1, + backbone_groups=1, + backbone_bias=False, + backbone_has_shuffle=False, + backbone_in_channels=3, + backbone_out_channels=64, + backbone_use_bn=True, + backbone_act_func="relu", + backbone_dropout_rate=0, + backbone_ops_order="weight_bn_act", + + backbone_stage1_in_channels=(64, 64, 64), + backbone_stage1_out_channels=(64, 64, 
64), + backbone_stage1_kernel_size=((3, 3), (3, 3), (3, 3)), + backbone_stage1_stride=(1, 2, 1), + backbone_stage1_dilation=(1, 1, 1), + backbone_stage1_groups=(1, 1, 1), + + backbone_stage2_in_channels=(64, 128, 128, 128), + backbone_stage2_out_channels=(128, 128, 128, 128), + backbone_stage2_kernel_size=((3, 3), (1, 3), (3, 3), (3, 1)), + backbone_stage2_stride=(2, 1, 1, 1), + backbone_stage2_dilation=(1, 1, 1, 1), + backbone_stage2_groups=(1, 1, 1, 1), + + backbone_stage3_in_channels=(128, 256, 256, 256), + backbone_stage3_out_channels=(256, 256, 256, 256), + backbone_stage3_kernel_size=((3, 3), (3, 3), (3, 1), (1, 3)), + backbone_stage3_stride=(2, 1, 1, 1), + backbone_stage3_dilation=(1, 1, 1, 1), + backbone_stage3_groups=(1, 1, 1, 1), + + backbone_stage4_in_channels=(256, 512, 512, 512), + backbone_stage4_out_channels=(512, 512, 512, 512), + backbone_stage4_kernel_size=((3, 3), (3, 1), (1, 3), (3, 3)), + backbone_stage4_stride=(2, 1, 1, 1), + backbone_stage4_dilation=(1, 1, 1, 1), + backbone_stage4_groups=(1, 1, 1, 1), + + neck_in_channels=(64, 128, 256, 512), + neck_out_channels=(128, 128, 128, 128), + neck_kernel_size=((3, 3), (3, 3), (3, 3), (3, 3)), + neck_stride=(1, 1, 1, 1), + neck_dilation=(1, 1, 1, 1), + neck_groups=(1, 1, 1, 1), + + head_pooling_size=9, + head_dropout_ratio=0.1, + + head_conv_in_channels=512, + head_conv_out_channels=128, + head_conv_kernel_size=(3, 3), + head_conv_stride=1, + head_conv_dilation=1, + head_conv_groups=1, + + head_final_kernel_size=1, + head_final_stride=1, + head_final_dilation=1, + head_final_groups=1, + head_final_bias=False, + head_final_has_shuffle=False, + head_final_in_channels=128, + head_final_out_channels=5, + head_final_use_bn=False, + head_final_act_func=None, + head_final_dropout_rate=0, + head_final_ops_order="weight", + **kwargs + ): + super().__init__(**kwargs) + + self.backbone_kernel_size = backbone_kernel_size + self.backbone_stride = backbone_stride + self.backbone_dilation = backbone_dilation + self.backbone_groups = backbone_groups + self.backbone_bias = backbone_bias + self.backbone_has_shuffle = backbone_has_shuffle + self.backbone_in_channels = backbone_in_channels + self.backbone_out_channels = backbone_out_channels + self.backbone_use_bn = backbone_use_bn + self.backbone_act_func = backbone_act_func + self.backbone_dropout_rate = backbone_dropout_rate + self.backbone_ops_order = backbone_ops_order self.backbone_stage1_in_channels = backbone_stage1_in_channels self.backbone_stage1_out_channels = backbone_stage1_out_channels @@ -70,9 +99,53 @@ def __init__(self, self.backbone_stage1_dilation = backbone_stage1_dilation, self.backbone_stage1_groups = backbone_stage1_groups, + self.backbone_stage2_in_channels = backbone_stage2_in_channels + self.backbone_stage2_out_channels = backbone_stage2_out_channels + self.backbone_stage2_kernel_size = backbone_stage2_kernel_size, + self.backbone_stage2_stride = backbone_stage2_stride, + self.backbone_stage2_dilation = backbone_stage2_dilation, + self.backbone_stage2_groups = backbone_stage2_groups, + + self.backbone_stage3_in_channels = backbone_stage3_in_channels + self.backbone_stage3_out_channels = backbone_stage3_out_channels + self.backbone_stage3_kernel_size = backbone_stage3_kernel_size, + self.backbone_stage3_stride = backbone_stage3_stride, + self.backbone_stage3_dilation = backbone_stage3_dilation, + self.backbone_stage3_groups = backbone_stage3_groups, + + self.backbone_stage4_in_channels = backbone_stage4_in_channels + self.backbone_stage4_out_channels = 
backbone_stage4_out_channels + self.backbone_stage4_kernel_size = backbone_stage4_kernel_size, + self.backbone_stage4_stride = backbone_stage4_stride, + self.backbone_stage4_dilation = backbone_stage4_dilation, + self.backbone_stage4_groups = backbone_stage4_groups, + self.neck_in_channels = neck_in_channels, self.neck_out_channels = neck_out_channels, - self.neck_kernel_size_channels = neck_kernel_size, - self.neck_stride_channels = neck_stride, - self.neck_dilation_channels = neck_dilation, - self.neck_groups_channels = neck_groups, + self.neck_kernel_size = neck_kernel_size, + self.neck_stride = neck_stride, + self.neck_dilation = neck_dilation, + self.neck_groups = neck_groups, + + self.head_pooling_size = head_pooling_size, + self.head_dropout_ratio = head_dropout_ratio, + + self.head_conv_in_channels = head_conv_in_channels + self.head_conv_out_channels = head_conv_out_channels + self.head_conv_kernel_size = head_conv_kernel_size + self.head_conv_stride = head_conv_stride + self.head_conv_dilation = head_conv_dilation + self.head_conv_groups = head_conv_groups + + self.head_final_kernel_size = head_final_kernel_size, + self.head_final_stride = head_final_stride, + self.head_final_dilation = head_final_dilation, + self.head_final_groups = head_final_groups, + self.head_final_bias = head_final_bias, + self.head_final_has_shuffle = head_final_has_shuffle, + self.head_final_in_channels = head_final_in_channels, + self.head_final_out_channels = head_final_out_channels, + self.head_final_use_bn = head_final_use_bn, + self.head_final_act_func = head_final_act_func, + self.head_final_dropout_rate = head_final_dropout_rate, + self.head_final_ops_order = head_final_ops_order diff --git a/src/transformers/models/fast/modeling_fast.py b/src/transformers/models/fast/modeling_fast.py index 4684378ce8eb..dc415b76a0b5 100644 --- a/src/transformers/models/fast/modeling_fast.py +++ b/src/transformers/models/fast/modeling_fast.py @@ -1,228 +1,13 @@ import math from collections import OrderedDict +import cv2 import numpy as np import torch import torch.nn as nn import torch.nn.functional as F -import time -import cv2 - -class DiceLoss(nn.Module): - def __init__(self, loss_weight=1.0): - super(DiceLoss, self).__init__() - self.loss_weight = loss_weight - - def forward(self, input, target, mask, reduce=True): - batch_size = input.size(0) - input = torch.sigmoid(input) - - input = input.contiguous().view(batch_size, -1) - target = target.contiguous().view(batch_size, -1).float() - mask = mask.contiguous().view(batch_size, -1).float() - - input = input * mask - target = target * mask - - a = torch.sum(input * target, dim=1) - b = torch.sum(input * input, dim=1) + 0.001 - c = torch.sum(target * target, dim=1) + 0.001 - d = (2 * a) / (b + c) - loss = 1 - d - - loss = self.loss_weight * loss - - if reduce: - loss = torch.mean(loss) - - return loss - - -class EmbLoss_v1(nn.Module): - def __init__(self, feature_dim=4, loss_weight=1.0): - super(EmbLoss_v1, self).__init__() - self.feature_dim = feature_dim - self.loss_weight = loss_weight - self.delta_v = 0.5 - self.delta_d = 1.5 - self.weights = (1.0, 1.0) - - def forward_single(self, emb, instance, kernel, training_mask): - training_mask = (training_mask > 0.5).long() - kernel = (kernel > 0.5).long() - instance = instance * training_mask - instance_kernel = (instance * kernel).view(-1) - instance = instance.view(-1) - emb = emb.view(self.feature_dim, -1) - - unique_labels, unique_ids = torch.unique(instance_kernel, sorted=True, return_inverse=True) - 
num_instance = unique_labels.size(0) - if num_instance <= 1: - return 0 - - emb_mean = emb.new_zeros((self.feature_dim, num_instance), dtype=torch.float32) - for i, lb in enumerate(unique_labels): - if lb == 0: - continue - ind_k = instance_kernel == lb - emb_mean[:, i] = torch.mean(emb[:, ind_k], dim=1) - - l_agg = emb.new_zeros(num_instance, dtype=torch.float32) # bug - for i, lb in enumerate(unique_labels): - if lb == 0: - continue - ind = instance == lb - emb_ = emb[:, ind] - dist = (emb_ - emb_mean[:, i:i + 1]).norm(p=2, dim=0) - dist = F.relu(dist - self.delta_v) ** 2 - l_agg[i] = torch.mean(torch.log(dist + 1.0)) - l_agg = torch.mean(l_agg[1:]) - - if num_instance > 2: - emb_interleave = emb_mean.permute(1, 0).repeat(num_instance, 1) - emb_band = emb_mean.permute(1, 0).repeat(1, num_instance).view(-1, self.feature_dim) - # print(seg_band) - - mask = (1 - torch.eye(num_instance, dtype=torch.int8)).view(-1, 1).repeat(1, self.feature_dim) - mask = mask.view(num_instance, num_instance, -1) - mask[0, :, :] = 0 - mask[:, 0, :] = 0 - mask = mask.view(num_instance * num_instance, -1) - # print(mask) - - dist = emb_interleave - emb_band - dist = dist[mask > 0].view(-1, self.feature_dim).norm(p=2, dim=1) - dist = F.relu(2 * self.delta_d - dist) ** 2 - l_dis = torch.mean(torch.log(dist + 1.0)) - else: - l_dis = 0 - - l_agg = self.weights[0] * l_agg - l_dis = self.weights[1] * l_dis - l_reg = torch.mean(torch.log(torch.norm(emb_mean, 2, 0) + 1.0)) * 0.001 - loss = l_agg + l_dis + l_reg - return loss - - def forward(self, emb, instance, kernel, training_mask, reduce=True): - loss_batch = emb.new_zeros((emb.size(0)), dtype=torch.float32) - - for i in range(loss_batch.size(0)): - loss_batch[i] = self.forward_single(emb[i], instance[i], kernel[i], training_mask[i]) - - loss_batch = self.loss_weight * loss_batch - - if reduce: - loss_batch = torch.mean(loss_batch) - - return loss_batch - - -class EmbLoss_v2(nn.Module): - def __init__(self, feature_dim=4, loss_weight=1.0): - super(EmbLoss_v2, self).__init__() - self.feature_dim = feature_dim - self.loss_weight = loss_weight - self.delta_v = 0.5 - self.delta_d = 1.5 - self.weights = (1.0, 1.0) - - def forward_single(self, emb, instance, kernel, training_mask): - training_mask = (training_mask > 0.5).long() - kernel = (kernel > 0.5).long() - instance = instance * training_mask - instance_kernel = (instance * kernel).view(-1) - instance = instance.view(-1) - emb = emb.view(self.feature_dim, -1) - - unique_labels, unique_ids = torch.unique(instance_kernel, sorted=True, return_inverse=True) - num_instance = unique_labels.size(0) - if num_instance <= 1: - return 0 - - emb_mean = emb.new_zeros((self.feature_dim, num_instance), dtype=torch.float32) - for i, lb in enumerate(unique_labels): - if lb == 0: - continue - ind_k = instance_kernel == lb - emb_mean[:, i] = torch.mean(emb[:, ind_k], dim=1) - - l_agg = emb.new_zeros(num_instance, dtype=torch.float32) # bug - for i, lb in enumerate(unique_labels): - if lb == 0: - continue - ind = instance == lb - emb_ = emb[:, ind] - dist = (emb_ - emb_mean[:, i:i + 1]).norm(p=2, dim=0) - dist = F.relu(dist - self.delta_v) ** 2 - l_agg[i] = torch.mean(torch.log(dist + 1.0)) - l_agg = torch.mean(l_agg[1:]) - - if num_instance > 2: - emb_interleave = emb_mean.permute(1, 0).repeat(num_instance, 1) - emb_band = emb_mean.permute(1, 0).repeat(1, num_instance).view(-1, self.feature_dim) - # print(seg_band) - - mask = (1 - torch.eye(num_instance, dtype=torch.int8)).view(-1, 1).repeat(1, self.feature_dim) - mask = 
mask.view(num_instance, num_instance, -1) - mask[0, :, :] = 0 - mask[:, 0, :] = 0 - mask = mask.view(num_instance * num_instance, -1) - # print(mask) - - dist = emb_interleave - emb_band - dist = dist[mask > 0].view(-1, self.feature_dim).norm(p=2, dim=1) - dist = F.relu(2 * self.delta_d - dist) ** 2 - # l_dis = torch.mean(torch.log(dist + 1.0)) - - l_dis = [torch.log(dist + 1.0)] - emb_bg = emb[:, instance == 0].view(self.feature_dim, -1) - if emb_bg.size(1) > 100: - rand_ind = np.random.permutation(emb_bg.size(1))[:100] - emb_bg = emb_bg[:, rand_ind] - if emb_bg.size(1) > 0: - for i, lb in enumerate(unique_labels): - if lb == 0: - continue - dist = (emb_bg - emb_mean[:, i:i + 1]).norm(p=2, dim=0) - dist = F.relu(2 * self.delta_d - dist) ** 2 - l_dis_bg = torch.mean(torch.log(dist + 1.0), 0, keepdim=True) - l_dis.append(l_dis_bg) - l_dis = torch.mean(torch.cat(l_dis)) - else: - l_dis = 0 - - l_agg = self.weights[0] * l_agg - l_dis = self.weights[1] * l_dis - l_reg = torch.mean(torch.log(torch.norm(emb_mean, 2, 0) + 1.0)) * 0.001 - loss = l_agg + l_dis + l_reg - return loss - def forward(self, emb, instance, kernel, training_mask, reduce=True): - loss_batch = emb.new_zeros((emb.size(0)), dtype=torch.float32) - - for i in range(loss_batch.size(0)): - loss_batch[i] = self.forward_single(emb[i], instance[i], kernel[i], training_mask[i]) - - loss_batch = self.loss_weight * loss_batch - - if reduce: - loss_batch = torch.mean(loss_batch) - - return loss_batch - - -def set_layer_from_config(layer_config): - if layer_config is None: - return None - - name2layer = { - ConvLayer.__name__: ConvLayer, - RepConvLayer.__name__: RepConvLayer - } - - layer_name = layer_config.pop('name') - layer = name2layer[layer_name] - return layer.build_from_config(layer_config) +from transformers import PreTrainedModel def get_same_padding(kernel_size): @@ -236,6 +21,21 @@ def get_same_padding(kernel_size): return kernel_size // 2 +def build_activation(act_func, inplace=True): + if act_func == 'relu': + return nn.ReLU(inplace=inplace) + elif act_func == 'relu6': + return nn.ReLU6(inplace=inplace) + elif act_func == 'tanh': + return nn.Tanh() + elif act_func == 'sigmoid': + return nn.Sigmoid() + elif act_func is None: + return None + else: + raise ValueError('do not support: %s' % act_func) + + class My2DLayer(nn.Module): def __init__(self, in_channels, out_channels, @@ -365,6 +165,10 @@ def generate_bbox(keys, label, score, scales, cfg): return bboxes, scores +class FalsePreTrainedModel(PreTrainedModel): + pass + + class ConvLayer(My2DLayer): def __init__(self, in_channels, out_channels, @@ -395,10 +199,6 @@ def weight_op(self): return weight_dict - @staticmethod - def build_from_config(config): - return ConvLayer(**config) - class RepConvLayer(nn.Module): @@ -534,77 +334,104 @@ def _pad_to_mxn_tensor(self, kernel): return torch.nn.functional.pad(kernel, [pad_left_right, pad_left_right, pad_top_down, pad_top_down]) - def switch_to_deploy(self): - if hasattr(self, 'fused_conv'): - return - kernel, bias = self.get_equivalent_kernel_bias() - self.fused_conv = nn.Conv2d(in_channels=self.main_conv.in_channels, - out_channels=self.main_conv.out_channels, - kernel_size=self.main_conv.kernel_size, stride=self.main_conv.stride, - padding=self.main_conv.padding, dilation=self.main_conv.dilation, - groups=self.main_conv.groups, bias=True) - self.fused_conv.weight.data = kernel - self.fused_conv.bias.data = bias - self.deploy = True - for para in self.parameters(): - para.detach_() - for attr in ['main_conv', 'main_bn', 
'ver_conv', 'ver_bn', 'hor_conv', 'hor_bn']: - if hasattr(self, attr): - self.__delattr__(attr) - - if hasattr(self, 'rbr_identity'): - self.__delattr__('rbr_identity') - - def switch_to_test(self): - kernel, bias = self.get_equivalent_kernel_bias() - self.fused_conv = nn.Conv2d(in_channels=self.main_conv.in_channels, - out_channels=self.main_conv.out_channels, - kernel_size=self.main_conv.kernel_size, stride=self.main_conv.stride, - padding=self.main_conv.padding, dilation=self.main_conv.dilation, - groups=self.main_conv.groups, bias=True) - self.fused_conv.weight.data = kernel - self.fused_conv.bias.data = bias - for para in self.fused_conv.parameters(): - para.detach_() - self.deploy = True - - def switch_to_train(self): - if hasattr(self, 'fused_conv'): - self.__delattr__('fused_conv') - self.deploy = False - - @staticmethod - def is_zero_layer(): - return False - - @property - def module_str(self): - return 'Rep_%dx%d' % (self.kernel_size[0], self.kernel_size[1]) - - @property - def config(self): - return {'name': RepConvLayer.__name__, - 'in_channels': self.in_channels, - 'out_channels': self.out_channels, - 'kernel_size': self.kernel_size, - 'stride': self.stride, - 'dilation': self.dilation, - 'groups': self.groups} - - @staticmethod - def build_from_config(config): - return RepConvLayer(**config) - - -class TextNet(nn.Module): - - def __init__(self, first_conv, stage1, stage2, stage3, stage4): - super(TextNet, self).__init__() - - self.first_conv = first_conv + # def switch_to_deploy(self): + # if hasattr(self, 'fused_conv'): + # return + # kernel, bias = self.get_equivalent_kernel_bias() + # self.fused_conv = nn.Conv2d(in_channels=self.main_conv.in_channels, + # out_channels=self.main_conv.out_channels, + # kernel_size=self.main_conv.kernel_size, stride=self.main_conv.stride, + # padding=self.main_conv.padding, dilation=self.main_conv.dilation, + # groups=self.main_conv.groups, bias=True) + # self.fused_conv.weight.data = kernel + # self.fused_conv.bias.data = bias + # self.deploy = True + # for para in self.parameters(): + # para.detach_() + # for attr in ['main_conv', 'main_bn', 'ver_conv', 'ver_bn', 'hor_conv', 'hor_bn']: + # if hasattr(self, attr): + # self.__delattr__(attr) + # + # if hasattr(self, 'rbr_identity'): + # self.__delattr__('rbr_identity') + + # def switch_to_test(self): + # kernel, bias = self.get_equivalent_kernel_bias() + # self.fused_conv = nn.Conv2d(in_channels=self.main_conv.in_channels, + # out_channels=self.main_conv.out_channels, + # kernel_size=self.main_conv.kernel_size, stride=self.main_conv.stride, + # padding=self.main_conv.padding, dilation=self.main_conv.dilation, + # groups=self.main_conv.groups, bias=True) + # self.fused_conv.weight.data = kernel + # self.fused_conv.bias.data = bias + # for para in self.fused_conv.parameters(): + # para.detach_() + # self.deploy = True + + # def switch_to_train(self): + # if hasattr(self, 'fused_conv'): + # self.__delattr__('fused_conv') + # self.deploy = False + + # @staticmethod + # def is_zero_layer(): + # return False + + # @property + # def module_str(self): + # return 'Rep_%dx%d' % (self.kernel_size[0], self.kernel_size[1]) + + # @property + # def config(self): + # return {'name': RepConvLayer.__name__, + # 'in_channels': self.in_channels, + # 'out_channels': self.out_channels, + # 'kernel_size': self.kernel_size, + # 'stride': self.stride, + # 'dilation': self.dilation, + # 'groups': self.groups} + + # @staticmethod + # def build_from_config(config): + # return RepConvLayer(**config) + + +class 
TextNet(PreTrainedModel): + + def __init__(self, config): + super().__init__(config) + self.first_conv = ConvLayer(config.backbone_in_channels, config.backbone_out_channels, + config.backbone_kernel_size, config.backbone_stride, config.backbone_dilation, + config.backbone_groups, config.backbone_bias, config.backbone_has_shuffle, + config.backbone_use_bn, config.backbone_act_func, config.backbone_dropout_rate, + config.backbone_ops_order) + + stage1 = [] + for stage_config in zip(config.backbone_stage1_in_channels, config.backbone_stage1_out_channels, + config.backbone_stage1_kernel_size[0], config.backbone_stage1_stride[0], + config.backbone_stage1_dilation[0], config.backbone_stage1_groups[0]): + stage1.append(RepConvLayer(*stage_config)) self.stage1 = nn.ModuleList(stage1) + + stage2 = [] + for stage_config in zip(config.backbone_stage2_in_channels, config.backbone_stage2_out_channels, + config.backbone_stage2_kernel_size[0], config.backbone_stage2_stride[0], + config.backbone_stage2_dilation[0], config.backbone_stage2_groups[0]): + stage2.append(RepConvLayer(*stage_config)) self.stage2 = nn.ModuleList(stage2) + + stage3 = [] + for stage_config in zip(config.backbone_stage3_in_channels, config.backbone_stage3_out_channels, + config.backbone_stage3_kernel_size[0], config.backbone_stage3_stride[0], + config.backbone_stage3_dilation[0], config.backbone_stage3_groups[0]): + stage3.append(RepConvLayer(*stage_config)) self.stage3 = nn.ModuleList(stage3) + + stage4 = [] + for stage_config in zip(config.backbone_stage4_in_channels, config.backbone_stage4_out_channels, + config.backbone_stage4_kernel_size[0], config.backbone_stage4_stride[0], + config.backbone_stage4_dilation[0], config.backbone_stage4_groups[0]): + stage4.append(RepConvLayer(*stage_config)) self.stage4 = nn.ModuleList(stage4) self._initialize_weights() @@ -639,31 +466,18 @@ def forward(self, x): return output - @staticmethod - def build_from_config(config): - first_conv = set_layer_from_config(config['first_conv']) - stage1, stage2, stage3, stage4 = [], [], [], [] - for block_config in config['stage1']: - stage1.append(set_layer_from_config(block_config)) - for block_config in config['stage2']: - stage2.append(set_layer_from_config(block_config)) - for block_config in config['stage3']: - stage3.append(set_layer_from_config(block_config)) - for block_config in config['stage4']: - stage4.append(set_layer_from_config(block_config)) - - net = TextNet(first_conv, stage1, stage2, stage3, stage4) - - return net - - -class FASTNeck(nn.Module): - def __init__(self, reduce_layer1, reduce_layer2, reduce_layer3, reduce_layer4): - super(FASTNeck, self).__init__() - self.reduce_layer1 = reduce_layer1 - self.reduce_layer2 = reduce_layer2 - self.reduce_layer3 = reduce_layer3 - self.reduce_layer4 = reduce_layer4 + +class FASTNeck(PreTrainedModel): + + def __init__(self, config): + super().__init__(config) + reduce_layer_configs = list(zip(config.neck_in_channels[0], config.neck_out_channels[0], config.neck_kernel_size[0], + config.neck_stride[0], config.neck_dilation[0], config.neck_groups[0])) + + self.reduce_layer1 = RepConvLayer(*reduce_layer_configs[0]) + self.reduce_layer2 = RepConvLayer(*reduce_layer_configs[1]) + self.reduce_layer3 = RepConvLayer(*reduce_layer_configs[2]) + self.reduce_layer4 = RepConvLayer(*reduce_layer_configs[3]) self._initialize_weights() @@ -692,39 +506,30 @@ def forward(self, x): f = torch.cat((f1, f2, f3, f4), 1) return f - @staticmethod - def build_from_config(config): - reduce_layer1 = 
set_layer_from_config(config['reduce_layer1']) - reduce_layer2 = set_layer_from_config(config['reduce_layer2']) - reduce_layer3 = set_layer_from_config(config['reduce_layer3']) - reduce_layer4 = set_layer_from_config(config['reduce_layer4']) - return FASTNeck(reduce_layer1, reduce_layer2, reduce_layer3, reduce_layer4) - class FASTHead(nn.Module): - def __init__(self, conv, blocks, final, pooling_size, - loss_text, loss_kernel, loss_emb, dropout_ratio=0): + + def __init__(self, config): super(FASTHead, self).__init__() - self.conv = conv - if blocks is not None: - self.blocks = nn.ModuleList(blocks) - else: - self.blocks = None - self.final = final + self.conv = RepConvLayer(config.head_conv_in_channels, config.head_conv_out_channels, + config.head_conv_kernel_size, config.head_conv_stride, config.head_conv_dilation, + config.head_conv_groups) - # self.text_loss = build_loss(loss_text) - # self.kernel_loss = build_loss(loss_kernel) - # self.emb_loss = build_loss(loss_emb) + self.final = ConvLayer(config.head_final_in_channels[0], config.head_final_out_channels[0], + config.head_final_kernel_size[0], config.head_final_stride[0], config.head_final_dilation[0], + config.head_final_groups[0], config.head_final_bias[0], config.head_final_has_shuffle[0], + config.head_final_use_bn[0], config.head_final_act_func[0], config.head_final_dropout_rate[0], + config.head_final_ops_order) - self.pooling_size = pooling_size + self.pooling_size = config.head_pooling_size[0] self.pooling_1s = nn.MaxPool2d(kernel_size=self.pooling_size, stride=1, padding=(self.pooling_size - 1) // 2) self.pooling_2s = nn.MaxPool2d(kernel_size=self.pooling_size // 2 + 1, stride=1, padding=(self.pooling_size // 2) // 2) - if dropout_ratio > 0: - self.dropout = nn.Dropout2d(dropout_ratio) + if config.head_dropout_ratio[0] > 0: + self.dropout = nn.Dropout2d(config.head_dropout_ratio[0]) else: self.dropout = None @@ -740,9 +545,6 @@ def _initialize_weights(self): def forward(self, x): x = self.conv(x) - if self.blocks is not None: - for block in self.blocks: - x = block(x) if self.dropout is not None: x = self.dropout(x) x = self.final(x) @@ -750,10 +552,6 @@ def forward(self, x): def get_results(self, out, img_meta, cfg, scale=2): - if not self.training: - torch.cuda.synchronize() - start = time.time() - org_img_size = img_meta['org_img_size'][0] img_size = img_meta['img_size'][0] # 640*640 batch_size = out.size(0) @@ -767,15 +565,12 @@ def get_results(self, out, img_meta, cfg, scale=2): score_maps = score_maps.squeeze(1) # B*640*640 kernels = (out[:, 0, :, :] > 0).to(torch.uint8) # B*160*160 - if kernels.is_cuda: - labels_ = ccl_cuda.ccl_batch(kernels) # B*160*160 - else: - labels_ = [] - for kernel in kernels.numpy(): - ret, label_ = cv2.connectedComponents(kernel) - labels_.append(label_) - labels_ = np.array(labels_) - labels_ = torch.from_numpy(labels_) + labels_ = [] + for kernel in kernels.numpy(): + ret, label_ = cv2.connectedComponents(kernel) + labels_.append(label_) + labels_ = np.array(labels_) + labels_ = torch.from_numpy(labels_) labels = labels_.unsqueeze(1).to(torch.float32) # B*1*160*160 labels = F.interpolate(labels, size=(img_size[0] // scale, img_size[1] // scale), mode='nearest') # B*1*320*320 labels = self._max_pooling(labels, scale=scale) @@ -784,12 +579,6 @@ def get_results(self, out, img_meta, cfg, scale=2): keys = [torch.unique(labels_[i], sorted=True) for i in range(batch_size)] - if not self.training: - torch.cuda.synchronize() - outputs.update(dict( - post_time=time.time() - start - )) - 
outputs.update(dict(kernels=kernels.data.cpu())) scales = (float(org_img_size[1]) / float(img_size[1]), @@ -813,107 +602,29 @@ def _max_pooling(self, x, scale=1): x = self.pooling_2s(x) return x - # def loss(self, out, gt_texts, gt_kernels, training_masks, gt_instances): - # # output - # kernels = out[:, 0, :, :] # 4*640*640 - # texts = self._max_pooling(kernels, scale=1) # 4*640*640 - # embs = out[:, 1:, :, :] # 4*4*640*640 - # - # # text loss - # selected_masks = ohem_batch(texts, gt_texts, training_masks) - # loss_text = self.text_loss(texts, gt_texts, selected_masks, reduce=False) - # iou_text = iou((texts > 0).long(), gt_texts, training_masks, reduce=False) - # losses = dict( - # loss_text=loss_text, - # iou_text=iou_text - # ) - # - # # kernel loss - # selected_masks = gt_texts * training_masks - # loss_kernel = self.kernel_loss(kernels, gt_kernels, selected_masks, reduce=False) - # loss_kernel = torch.mean(loss_kernel, dim=0) - # iou_kernel = iou((kernels > 0).long(), gt_kernels, selected_masks, reduce=False) - # losses.update(dict( - # loss_kernels=loss_kernel, - # iou_kernel=iou_kernel - # )) - # - # # auxiliary loss - # loss_emb = self.emb_loss(embs, gt_instances, gt_kernels, training_masks, reduce=False) - # losses.update(dict( - # loss_emb=loss_emb - # )) - # - # return losses - @staticmethod - def build_from_config(config, **kwargs): - conv = set_layer_from_config(config['conv']) - final = set_layer_from_config(config['final']) - try: - blocks = [] - for block_config in config['blocks']: - blocks.append(set_layer_from_config(block_config)) - return FASTHead(conv, blocks, final, **kwargs) - except: - return FASTHead(conv, None, final, **kwargs) - - -class FAST(nn.Module): - def __init__(self, backbone, neck, detection_head): - super(FAST, self).__init__() - self.backbone = TextNet.build_from_config(backbone) - self.neck = FASTNeck.build_from_config(neck) - self.det_head = FASTHead.build_from_config(detection_head) +class FASTForImageCaptioning(nn.Module): + def __init__(self, config): + super().__init__() + self.backbone = TextNet(config=config) + self.neck = FASTNeck(config=config) + self.det_head = FASTHead(config=config) def _upsample(self, x, size, scale=1): _, _, H, W = size return F.interpolate(x, size=(H // scale, W // scale), mode='bilinear') - def forward(self, imgs, gt_texts=None, gt_kernels=None, training_masks=None, - gt_instances=None, img_metas=None, cfg=None): + def forward(self, imgs, img_metas=None, cfg=None): outputs = dict() - if not self.training: - torch.cuda.synchronize() - start = time.time() - - # backbone f = self.backbone(imgs) - if not self.training: - torch.cuda.synchronize() - outputs.update(dict( - backbone_time=time.time() - start - )) - start = time.time() - - # reduce channel f = self.neck(f) - if not self.training: - torch.cuda.synchronize() - outputs.update(dict( - neck_time=time.time() - start - )) - start = time.time() - - # detection det_out = self.det_head(f) - if not self.training: - torch.cuda.synchronize() - outputs.update(dict( - det_head_time=time.time() - start - )) - - if self.training: - det_out = self._upsample(det_out, imgs.size(), scale=1) - det_loss = self.det_head.loss(det_out, gt_texts, gt_kernels, training_masks, gt_instances) - outputs.update(det_loss) - else: - det_out = self._upsample(det_out, imgs.size(), scale=4) - det_res = self.det_head.get_results(det_out, img_metas, cfg, scale=2) - outputs.update(det_res) + det_out = self._upsample(det_out, imgs.size(), scale=4) + det_res = self.det_head.get_results(det_out, 
img_metas, cfg, scale=2) + outputs.update(det_res) return outputs From 185603e351fae06142700b62399f3a31e198f294 Mon Sep 17 00:00:00 2001 From: raghavanone Date: Sun, 8 Oct 2023 09:24:36 +0530 Subject: [PATCH 003/152] Refactor modeling and add tests --- src/transformers/__init__.py | 13 + src/transformers/models/fast/__init__.py | 54 +++ .../models/fast/configuration_fast.py | 207 +++++------ src/transformers/models/fast/modeling_fast.py | 342 +++++++++++------- tests/models/fast/__init__.py | 0 tests/models/fast/test_modeling_fast.py | 256 +++++++++++++ 6 files changed, 637 insertions(+), 235 deletions(-) create mode 100644 tests/models/fast/__init__.py create mode 100644 tests/models/fast/test_modeling_fast.py diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 4941d724455d..280e824efb89 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -424,6 +424,7 @@ "models.ernie_m": ["ERNIE_M_PRETRAINED_CONFIG_ARCHIVE_MAP", "ErnieMConfig"], "models.esm": ["ESM_PRETRAINED_CONFIG_ARCHIVE_MAP", "EsmConfig", "EsmTokenizer"], "models.falcon": ["FALCON_PRETRAINED_CONFIG_ARCHIVE_MAP", "FalconConfig"], +<<<<<<< HEAD "models.fastspeech2_conformer": [ "FASTSPEECH2_CONFORMER_HIFIGAN_PRETRAINED_CONFIG_ARCHIVE_MAP", "FASTSPEECH2_CONFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", @@ -433,6 +434,9 @@ "FastSpeech2ConformerTokenizer", "FastSpeech2ConformerWithHifiGanConfig", ], +======= + "models.fast": ["FastConfig"], +>>>>>>> 67fec5b40 (Refactor modeling and add tests) "models.flaubert": ["FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "FlaubertConfig", "FlaubertTokenizer"], "models.flava": [ "FLAVA_PRETRAINED_CONFIG_ARCHIVE_MAP", @@ -5113,6 +5117,7 @@ from .models.ernie_m import ERNIE_M_PRETRAINED_CONFIG_ARCHIVE_MAP, ErnieMConfig from .models.esm import ESM_PRETRAINED_CONFIG_ARCHIVE_MAP, EsmConfig, EsmTokenizer from .models.falcon import FALCON_PRETRAINED_CONFIG_ARCHIVE_MAP, FalconConfig +<<<<<<< HEAD from .models.fastspeech2_conformer import ( FASTSPEECH2_CONFORMER_HIFIGAN_PRETRAINED_CONFIG_ARCHIVE_MAP, FASTSPEECH2_CONFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, @@ -5122,6 +5127,9 @@ FastSpeech2ConformerTokenizer, FastSpeech2ConformerWithHifiGanConfig, ) +======= + from .models.fast import FastConfig +>>>>>>> 67fec5b40 (Refactor modeling and add tests) from .models.flaubert import FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, FlaubertConfig, FlaubertTokenizer from .models.flava import ( FLAVA_PRETRAINED_CONFIG_ARCHIVE_MAP, @@ -6698,12 +6706,17 @@ FalconModel, FalconPreTrainedModel, ) +<<<<<<< HEAD from .models.fastspeech2_conformer import ( FASTSPEECH2_CONFORMER_PRETRAINED_MODEL_ARCHIVE_LIST, FastSpeech2ConformerHifiGan, FastSpeech2ConformerModel, FastSpeech2ConformerPreTrainedModel, FastSpeech2ConformerWithHifiGan, +======= + from .models.fast import ( + FASTForImageCaptioning, +>>>>>>> 67fec5b40 (Refactor modeling and add tests) ) from .models.flaubert import ( FLAUBERT_PRETRAINED_MODEL_ARCHIVE_LIST, diff --git a/src/transformers/models/fast/__init__.py b/src/transformers/models/fast/__init__.py index e69de29bb2d1..6fad75850bba 100644 --- a/src/transformers/models/fast/__init__.py +++ b/src/transformers/models/fast/__init__.py @@ -0,0 +1,54 @@ +# coding=utf-8 +# Copyright 2023 the Fast authors and HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
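For context, once these exports are registered the new classes are meant to be used like any other Transformers model; a minimal construction sketch using only names introduced by this patch series (no pretrained checkpoint is assumed):

    from transformers import FastConfig, FASTForImageCaptioning

    config = FastConfig()                   # defaults mirror the TextNet backbone, neck and head defined above
    model = FASTForImageCaptioning(config)  # randomly initialised text detector (backbone + neck + head)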
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import ( + OptionalDependencyNotAvailable, + _LazyModule, + is_torch_available, +) + +_import_structure = { + "configuration_fast": ["FastConfig"], +} + +try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_fast"] = [ + "FASTForImageCaptioning" + ] + +if TYPE_CHECKING: + from .configuration_fast import FastConfig + + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_fast import ( + FASTForImageCaptioning + ) + + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/src/transformers/models/fast/configuration_fast.py b/src/transformers/models/fast/configuration_fast.py index aab305edb5de..914bcda0567f 100644 --- a/src/transformers/models/fast/configuration_fast.py +++ b/src/transformers/models/fast/configuration_fast.py @@ -2,80 +2,71 @@ class FastConfig(PretrainedConfig): - def __init__( - self, - backbone_kernel_size=3, - backbone_stride=2, - backbone_dilation=1, - backbone_groups=1, - backbone_bias=False, - backbone_has_shuffle=False, - backbone_in_channels=3, - backbone_out_channels=64, - backbone_use_bn=True, - backbone_act_func="relu", - backbone_dropout_rate=0, - backbone_ops_order="weight_bn_act", - - backbone_stage1_in_channels=(64, 64, 64), - backbone_stage1_out_channels=(64, 64, 64), - backbone_stage1_kernel_size=((3, 3), (3, 3), (3, 3)), - backbone_stage1_stride=(1, 2, 1), - backbone_stage1_dilation=(1, 1, 1), - backbone_stage1_groups=(1, 1, 1), - - backbone_stage2_in_channels=(64, 128, 128, 128), - backbone_stage2_out_channels=(128, 128, 128, 128), - backbone_stage2_kernel_size=((3, 3), (1, 3), (3, 3), (3, 1)), - backbone_stage2_stride=(2, 1, 1, 1), - backbone_stage2_dilation=(1, 1, 1, 1), - backbone_stage2_groups=(1, 1, 1, 1), - - backbone_stage3_in_channels=(128, 256, 256, 256), - backbone_stage3_out_channels=(256, 256, 256, 256), - backbone_stage3_kernel_size=((3, 3), (3, 3), (3, 1), (1, 3)), - backbone_stage3_stride=(2, 1, 1, 1), - backbone_stage3_dilation=(1, 1, 1, 1), - backbone_stage3_groups=(1, 1, 1, 1), - - backbone_stage4_in_channels=(256, 512, 512, 512), - backbone_stage4_out_channels=(512, 512, 512, 512), - backbone_stage4_kernel_size=((3, 3), (3, 1), (1, 3), (3, 3)), - backbone_stage4_stride=(2, 1, 1, 1), - backbone_stage4_dilation=(1, 1, 1, 1), - backbone_stage4_groups=(1, 1, 1, 1), - - neck_in_channels=(64, 128, 256, 512), - neck_out_channels=(128, 128, 128, 128), - neck_kernel_size=((3, 3), (3, 3), (3, 3), (3, 3)), - neck_stride=(1, 1, 1, 1), - neck_dilation=(1, 1, 1, 1), - neck_groups=(1, 1, 1, 1), - - head_pooling_size=9, - head_dropout_ratio=0.1, - - head_conv_in_channels=512, - head_conv_out_channels=128, - head_conv_kernel_size=(3, 3), - head_conv_stride=1, - head_conv_dilation=1, - head_conv_groups=1, - - head_final_kernel_size=1, - head_final_stride=1, - head_final_dilation=1, - head_final_groups=1, - 
head_final_bias=False, - head_final_has_shuffle=False, - head_final_in_channels=128, - head_final_out_channels=5, - head_final_use_bn=False, - head_final_act_func=None, - head_final_dropout_rate=0, - head_final_ops_order="weight", - **kwargs + self, + backbone_kernel_size=3, + backbone_stride=2, + backbone_dilation=1, + backbone_groups=1, + backbone_bias=False, + backbone_has_shuffle=False, + backbone_in_channels=3, + backbone_out_channels=64, + backbone_use_bn=True, + backbone_act_func="relu", + backbone_dropout_rate=0, + backbone_ops_order="weight_bn_act", + backbone_stage1_in_channels=(64, 64, 64), + backbone_stage1_out_channels=(64, 64, 64), + backbone_stage1_kernel_size=((3, 3), (3, 3), (3, 3)), + backbone_stage1_stride=(1, 2, 1), + backbone_stage1_dilation=(1, 1, 1), + backbone_stage1_groups=(1, 1, 1), + backbone_stage2_in_channels=(64, 128, 128, 128), + backbone_stage2_out_channels=(128, 128, 128, 128), + backbone_stage2_kernel_size=((3, 3), (1, 3), (3, 3), (3, 1)), + backbone_stage2_stride=(2, 1, 1, 1), + backbone_stage2_dilation=(1, 1, 1, 1), + backbone_stage2_groups=(1, 1, 1, 1), + backbone_stage3_in_channels=(128, 256, 256, 256), + backbone_stage3_out_channels=(256, 256, 256, 256), + backbone_stage3_kernel_size=((3, 3), (3, 3), (3, 1), (1, 3)), + backbone_stage3_stride=(2, 1, 1, 1), + backbone_stage3_dilation=(1, 1, 1, 1), + backbone_stage3_groups=(1, 1, 1, 1), + backbone_stage4_in_channels=(256, 512, 512, 512), + backbone_stage4_out_channels=(512, 512, 512, 512), + backbone_stage4_kernel_size=((3, 3), (3, 1), (1, 3), (3, 3)), + backbone_stage4_stride=(2, 1, 1, 1), + backbone_stage4_dilation=(1, 1, 1, 1), + backbone_stage4_groups=(1, 1, 1, 1), + neck_in_channels=(64, 128, 256, 512), + neck_out_channels=(128, 128, 128, 128), + neck_kernel_size=((3, 3), (3, 3), (3, 3), (3, 3)), + neck_stride=(1, 1, 1, 1), + neck_dilation=(1, 1, 1, 1), + neck_groups=(1, 1, 1, 1), + head_pooling_size=9, + head_dropout_ratio=0.1, + head_conv_in_channels=512, + head_conv_out_channels=128, + head_conv_kernel_size=(3, 3), + head_conv_stride=1, + head_conv_dilation=1, + head_conv_groups=1, + head_final_kernel_size=1, + head_final_stride=1, + head_final_dilation=1, + head_final_groups=1, + head_final_bias=False, + head_final_has_shuffle=False, + head_final_in_channels=128, + head_final_out_channels=5, + head_final_use_bn=False, + head_final_act_func=None, + head_final_dropout_rate=0, + head_final_ops_order="weight", + **kwargs, ): super().__init__(**kwargs) @@ -94,41 +85,41 @@ def __init__( self.backbone_stage1_in_channels = backbone_stage1_in_channels self.backbone_stage1_out_channels = backbone_stage1_out_channels - self.backbone_stage1_kernel_size = backbone_stage1_kernel_size, - self.backbone_stage1_stride = backbone_stage1_stride, - self.backbone_stage1_dilation = backbone_stage1_dilation, - self.backbone_stage1_groups = backbone_stage1_groups, + self.backbone_stage1_kernel_size = (backbone_stage1_kernel_size,) + self.backbone_stage1_stride = (backbone_stage1_stride,) + self.backbone_stage1_dilation = (backbone_stage1_dilation,) + self.backbone_stage1_groups = (backbone_stage1_groups,) self.backbone_stage2_in_channels = backbone_stage2_in_channels self.backbone_stage2_out_channels = backbone_stage2_out_channels - self.backbone_stage2_kernel_size = backbone_stage2_kernel_size, - self.backbone_stage2_stride = backbone_stage2_stride, - self.backbone_stage2_dilation = backbone_stage2_dilation, - self.backbone_stage2_groups = backbone_stage2_groups, + self.backbone_stage2_kernel_size = 
(backbone_stage2_kernel_size,) + self.backbone_stage2_stride = (backbone_stage2_stride,) + self.backbone_stage2_dilation = (backbone_stage2_dilation,) + self.backbone_stage2_groups = (backbone_stage2_groups,) self.backbone_stage3_in_channels = backbone_stage3_in_channels self.backbone_stage3_out_channels = backbone_stage3_out_channels - self.backbone_stage3_kernel_size = backbone_stage3_kernel_size, - self.backbone_stage3_stride = backbone_stage3_stride, - self.backbone_stage3_dilation = backbone_stage3_dilation, - self.backbone_stage3_groups = backbone_stage3_groups, + self.backbone_stage3_kernel_size = (backbone_stage3_kernel_size,) + self.backbone_stage3_stride = (backbone_stage3_stride,) + self.backbone_stage3_dilation = (backbone_stage3_dilation,) + self.backbone_stage3_groups = (backbone_stage3_groups,) self.backbone_stage4_in_channels = backbone_stage4_in_channels self.backbone_stage4_out_channels = backbone_stage4_out_channels - self.backbone_stage4_kernel_size = backbone_stage4_kernel_size, - self.backbone_stage4_stride = backbone_stage4_stride, - self.backbone_stage4_dilation = backbone_stage4_dilation, - self.backbone_stage4_groups = backbone_stage4_groups, + self.backbone_stage4_kernel_size = (backbone_stage4_kernel_size,) + self.backbone_stage4_stride = (backbone_stage4_stride,) + self.backbone_stage4_dilation = (backbone_stage4_dilation,) + self.backbone_stage4_groups = (backbone_stage4_groups,) - self.neck_in_channels = neck_in_channels, - self.neck_out_channels = neck_out_channels, - self.neck_kernel_size = neck_kernel_size, - self.neck_stride = neck_stride, - self.neck_dilation = neck_dilation, - self.neck_groups = neck_groups, + self.neck_in_channels = (neck_in_channels,) + self.neck_out_channels = (neck_out_channels,) + self.neck_kernel_size = (neck_kernel_size,) + self.neck_stride = (neck_stride,) + self.neck_dilation = (neck_dilation,) + self.neck_groups = (neck_groups,) - self.head_pooling_size = head_pooling_size, - self.head_dropout_ratio = head_dropout_ratio, + self.head_pooling_size = (head_pooling_size,) + self.head_dropout_ratio = (head_dropout_ratio,) self.head_conv_in_channels = head_conv_in_channels self.head_conv_out_channels = head_conv_out_channels @@ -137,15 +128,15 @@ def __init__( self.head_conv_dilation = head_conv_dilation self.head_conv_groups = head_conv_groups - self.head_final_kernel_size = head_final_kernel_size, - self.head_final_stride = head_final_stride, - self.head_final_dilation = head_final_dilation, - self.head_final_groups = head_final_groups, - self.head_final_bias = head_final_bias, - self.head_final_has_shuffle = head_final_has_shuffle, - self.head_final_in_channels = head_final_in_channels, - self.head_final_out_channels = head_final_out_channels, - self.head_final_use_bn = head_final_use_bn, - self.head_final_act_func = head_final_act_func, - self.head_final_dropout_rate = head_final_dropout_rate, + self.head_final_kernel_size = (head_final_kernel_size,) + self.head_final_stride = (head_final_stride,) + self.head_final_dilation = (head_final_dilation,) + self.head_final_groups = (head_final_groups,) + self.head_final_bias = (head_final_bias,) + self.head_final_has_shuffle = (head_final_has_shuffle,) + self.head_final_in_channels = (head_final_in_channels,) + self.head_final_out_channels = (head_final_out_channels,) + self.head_final_use_bn = (head_final_use_bn,) + self.head_final_act_func = (head_final_act_func,) + self.head_final_dropout_rate = (head_final_dropout_rate,) self.head_final_ops_order = head_final_ops_order diff --git 
a/src/transformers/models/fast/modeling_fast.py b/src/transformers/models/fast/modeling_fast.py index dc415b76a0b5..a700902b1fb1 100644 --- a/src/transformers/models/fast/modeling_fast.py +++ b/src/transformers/models/fast/modeling_fast.py @@ -1,4 +1,5 @@ import math +import unittest from collections import OrderedDict import cv2 @@ -12,34 +13,34 @@ def get_same_padding(kernel_size): if isinstance(kernel_size, tuple): - assert len(kernel_size) == 2, 'invalid kernel size: %s' % kernel_size + assert len(kernel_size) == 2, "invalid kernel size: %s" % kernel_size p1 = get_same_padding(kernel_size[0]) p2 = get_same_padding(kernel_size[1]) return p1, p2 - assert isinstance(kernel_size, int), 'kernel size should be either `int` or `tuple`' - assert kernel_size % 2 > 0, 'kernel size should be odd number' + assert isinstance(kernel_size, int), "kernel size should be either `int` or `tuple`" + assert kernel_size % 2 > 0, "kernel size should be odd number" return kernel_size // 2 def build_activation(act_func, inplace=True): - if act_func == 'relu': + if act_func == "relu": return nn.ReLU(inplace=inplace) - elif act_func == 'relu6': + elif act_func == "relu6": return nn.ReLU6(inplace=inplace) - elif act_func == 'tanh': + elif act_func == "tanh": return nn.Tanh() - elif act_func == 'sigmoid': + elif act_func == "sigmoid": return nn.Sigmoid() elif act_func is None: return None else: - raise ValueError('do not support: %s' % act_func) + raise ValueError("do not support: %s" % act_func) class My2DLayer(nn.Module): - - def __init__(self, in_channels, out_channels, - use_bn=True, act_func='relu', dropout_rate=0, ops_order='weight_bn_act'): + def __init__( + self, in_channels, out_channels, use_bn=True, act_func="relu", dropout_rate=0, ops_order="weight_bn_act" + ): super(My2DLayer, self).__init__() self.in_channels = in_channels self.out_channels = out_channels @@ -49,55 +50,55 @@ def __init__(self, in_channels, out_channels, self.dropout_rate = dropout_rate self.ops_order = ops_order - """ modules """ + """ modules""" modules = {} # batch norm if self.use_bn: if self.bn_before_weight: - modules['bn'] = nn.BatchNorm2d(in_channels) + modules["bn"] = nn.BatchNorm2d(in_channels) else: - modules['bn'] = nn.BatchNorm2d(out_channels) + modules["bn"] = nn.BatchNorm2d(out_channels) else: - modules['bn'] = None + modules["bn"] = None # activation - modules['act'] = build_activation(self.act_func, self.ops_list[0] != 'act') + modules["act"] = build_activation(self.act_func, self.ops_list[0] != "act") # dropout if self.dropout_rate > 0: - modules['dropout'] = nn.Dropout2d(self.dropout_rate, inplace=True) + modules["dropout"] = nn.Dropout2d(self.dropout_rate, inplace=True) else: - modules['dropout'] = None + modules["dropout"] = None # weight - modules['weight'] = self.weight_op() + modules["weight"] = self.weight_op() # add modules for op in self.ops_list: if modules[op] is None: continue - elif op == 'weight': - if modules['dropout'] is not None: - self.add_module('dropout', modules['dropout']) - for key in modules['weight']: - self.add_module(key, modules['weight'][key]) + elif op == "weight": + if modules["dropout"] is not None: + self.add_module("dropout", modules["dropout"]) + for key in modules["weight"]: + self.add_module(key, modules["weight"][key]) else: self.add_module(op, modules[op]) @property def ops_list(self): - return self.ops_order.split('_') + return self.ops_order.split("_") @property def bn_before_weight(self): for op in self.ops_list: - if op == 'bn': + if op == "bn": return True - elif op == 
'weight': + elif op == "weight": return False - raise ValueError('Invalid ops_order: %s' % self.ops_order) + raise ValueError("Invalid ops_order: %s" % self.ops_order) def weight_op(self): raise NotImplementedError - """ Methods defined in MyModule """ + """ Methods defined in MyModule""" def forward(self, x): for module in self._modules.values(): @@ -111,12 +112,12 @@ def module_str(self): @property def config(self): return { - 'in_channels': self.in_channels, - 'out_channels': self.out_channels, - 'use_bn': self.use_bn, - 'act_func': self.act_func, - 'dropout_rate': self.dropout_rate, - 'ops_order': self.ops_order, + "in_channels": self.in_channels, + "out_channels": self.out_channels, + "use_bn": self.use_bn, + "act_func": self.act_func, + "dropout_rate": self.dropout_rate, + "ops_order": self.ops_order, } @staticmethod @@ -137,7 +138,7 @@ def generate_bbox(keys, label, score, scales, cfg): scores = [] for index in range(1, label_num): i = keys[index] - ind = (label == i) + ind = label == i ind_np = ind.data.cpu().numpy() points = np.array(np.where(ind_np)).transpose((1, 0)) if points.shape[0] < cfg.test_cfg.min_area: @@ -148,18 +149,18 @@ def generate_bbox(keys, label, score, scales, cfg): label[ind] = 0 continue - if cfg.test_cfg.bbox_type == 'rect': + if cfg.test_cfg.bbox_type == "rect": rect = cv2.minAreaRect(points[:, ::-1]) alpha = math.sqrt(math.sqrt(points.shape[0] / (rect[1][0] * rect[1][1]))) rect = (rect[0], (rect[1][0] * alpha, rect[1][1] * alpha), rect[2]) bbox = cv2.boxPoints(rect) * scales - elif cfg.test_cfg.bbox_type == 'poly': - binary = np.zeros(label.shape, dtype='uint8') + elif cfg.test_cfg.bbox_type == "poly": + binary = np.zeros(label.shape, dtype="uint8") binary[ind_np] = 1 contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) bbox = contours[0] * scales - bbox = bbox.astype('int32') + bbox = bbox.astype("int32") bboxes.append(bbox.reshape(-1).tolist()) scores.append(score_i) return bboxes, scores @@ -170,10 +171,21 @@ class FalsePreTrainedModel(PreTrainedModel): class ConvLayer(My2DLayer): - - def __init__(self, in_channels, out_channels, - kernel_size=3, stride=1, dilation=1, groups=1, bias=False, has_shuffle=False, - use_bn=True, act_func='relu', dropout_rate=0, ops_order='weight_bn_act'): + def __init__( + self, + in_channels, + out_channels, + kernel_size=3, + stride=1, + dilation=1, + groups=1, + bias=False, + has_shuffle=False, + use_bn=True, + act_func="relu", + dropout_rate=0, + ops_order="weight_bn_act", + ): self.kernel_size = kernel_size self.stride = stride self.dilation = dilation @@ -192,16 +204,21 @@ def weight_op(self): padding[1] *= self.dilation weight_dict = OrderedDict() - weight_dict['conv'] = nn.Conv2d( - self.in_channels, self.out_channels, kernel_size=self.kernel_size, stride=self.stride, padding=padding, - dilation=self.dilation, groups=self.groups, bias=self.bias + weight_dict["conv"] = nn.Conv2d( + self.in_channels, + self.out_channels, + kernel_size=self.kernel_size, + stride=self.stride, + padding=padding, + dilation=self.dilation, + groups=self.groups, + bias=self.bias, ) return weight_dict class RepConvLayer(nn.Module): - def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, groups=1, deploy=False): super(RepConvLayer, self).__init__() @@ -214,47 +231,73 @@ def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, self.deploy = deploy assert len(kernel_size) == 2 - padding = (int(((kernel_size[0] - 1) * dilation) / 2), - int(((kernel_size[1] - 1) * 
dilation) / 2)) + padding = (int(((kernel_size[0] - 1) * dilation) / 2), int(((kernel_size[1] - 1) * dilation) / 2)) self.nonlinearity = nn.ReLU(inplace=True) if deploy: - self.fused_conv = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, - kernel_size=kernel_size, stride=stride, padding=padding, - dilation=dilation, groups=groups, bias=True) + self.fused_conv = nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + bias=True, + ) else: - self.main_conv = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, - kernel_size=kernel_size, stride=stride, padding=padding, - dilation=dilation, groups=groups, bias=False) + self.main_conv = nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + bias=False, + ) self.main_bn = nn.BatchNorm2d(num_features=out_channels) ver_pad = (int(((kernel_size[0] - 1) * dilation) / 2), 0) hor_pad = (0, int(((kernel_size[1] - 1) * dilation) / 2)) if kernel_size[1] != 1: # 卷积核的宽大于1 -> 有垂直卷积 - self.ver_conv = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, - kernel_size=(kernel_size[0], 1), - stride=stride, padding=ver_pad, - dilation=dilation, groups=groups, bias=False) + self.ver_conv = nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=(kernel_size[0], 1), + stride=stride, + padding=ver_pad, + dilation=dilation, + groups=groups, + bias=False, + ) self.ver_bn = nn.BatchNorm2d(num_features=out_channels) else: self.ver_conv, self.ver_bn = None, None if kernel_size[0] != 1: # 卷积核的高大于1 -> 有水平卷积 - self.hor_conv = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, - kernel_size=(1, kernel_size[1]), - stride=stride, padding=hor_pad, - dilation=dilation, groups=groups, bias=False) + self.hor_conv = nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=(1, kernel_size[1]), + stride=stride, + padding=hor_pad, + dilation=dilation, + groups=groups, + bias=False, + ) self.hor_bn = nn.BatchNorm2d(num_features=out_channels) else: self.hor_conv, self.hor_bn = None, None - self.rbr_identity = nn.BatchNorm2d( - num_features=in_channels) if out_channels == in_channels and stride == 1 else None + self.rbr_identity = ( + nn.BatchNorm2d(num_features=in_channels) if out_channels == in_channels and stride == 1 else None + ) def forward(self, input): - if hasattr(self, 'fused_conv'): + if hasattr(self, "fused_conv"): return self.nonlinearity(self.fused_conv(input)) else: main_outputs = self.main_conv(input) @@ -282,7 +325,7 @@ def _identity_to_conv(self, identity): if identity is None: return 0, 0 assert isinstance(identity, nn.BatchNorm2d) - if not hasattr(self, 'id_tensor'): + if not hasattr(self, "id_tensor"): input_dim = self.in_channels // self.groups kernel_value = np.zeros((self.in_channels, input_dim, 1, 1), dtype=np.float32) for i in range(self.in_channels): @@ -331,8 +374,7 @@ def _pad_to_mxn_tensor(self, kernel): height, width = kernel.shape[2:] pad_left_right = (kernel_width - width) // 2 pad_top_down = (kernel_height - height) // 2 - return torch.nn.functional.pad(kernel, [pad_left_right, pad_left_right, - pad_top_down, pad_top_down]) + return torch.nn.functional.pad(kernel, [pad_left_right, pad_left_right, pad_top_down, pad_top_down]) # def switch_to_deploy(self): # if hasattr(self, 'fused_conv'): @@ -397,40 +439,68 @@ def _pad_to_mxn_tensor(self, 
kernel): class TextNet(PreTrainedModel): - def __init__(self, config): super().__init__(config) - self.first_conv = ConvLayer(config.backbone_in_channels, config.backbone_out_channels, - config.backbone_kernel_size, config.backbone_stride, config.backbone_dilation, - config.backbone_groups, config.backbone_bias, config.backbone_has_shuffle, - config.backbone_use_bn, config.backbone_act_func, config.backbone_dropout_rate, - config.backbone_ops_order) + self.first_conv = ConvLayer( + config.backbone_in_channels, + config.backbone_out_channels, + config.backbone_kernel_size, + config.backbone_stride, + config.backbone_dilation, + config.backbone_groups, + config.backbone_bias, + config.backbone_has_shuffle, + config.backbone_use_bn, + config.backbone_act_func, + config.backbone_dropout_rate, + config.backbone_ops_order, + ) stage1 = [] - for stage_config in zip(config.backbone_stage1_in_channels, config.backbone_stage1_out_channels, - config.backbone_stage1_kernel_size[0], config.backbone_stage1_stride[0], - config.backbone_stage1_dilation[0], config.backbone_stage1_groups[0]): + for stage_config in zip( + config.backbone_stage1_in_channels, + config.backbone_stage1_out_channels, + config.backbone_stage1_kernel_size[0], + config.backbone_stage1_stride[0], + config.backbone_stage1_dilation[0], + config.backbone_stage1_groups[0], + ): stage1.append(RepConvLayer(*stage_config)) self.stage1 = nn.ModuleList(stage1) stage2 = [] - for stage_config in zip(config.backbone_stage2_in_channels, config.backbone_stage2_out_channels, - config.backbone_stage2_kernel_size[0], config.backbone_stage2_stride[0], - config.backbone_stage2_dilation[0], config.backbone_stage2_groups[0]): + for stage_config in zip( + config.backbone_stage2_in_channels, + config.backbone_stage2_out_channels, + config.backbone_stage2_kernel_size[0], + config.backbone_stage2_stride[0], + config.backbone_stage2_dilation[0], + config.backbone_stage2_groups[0], + ): stage2.append(RepConvLayer(*stage_config)) self.stage2 = nn.ModuleList(stage2) stage3 = [] - for stage_config in zip(config.backbone_stage3_in_channels, config.backbone_stage3_out_channels, - config.backbone_stage3_kernel_size[0], config.backbone_stage3_stride[0], - config.backbone_stage3_dilation[0], config.backbone_stage3_groups[0]): + for stage_config in zip( + config.backbone_stage3_in_channels, + config.backbone_stage3_out_channels, + config.backbone_stage3_kernel_size[0], + config.backbone_stage3_stride[0], + config.backbone_stage3_dilation[0], + config.backbone_stage3_groups[0], + ): stage3.append(RepConvLayer(*stage_config)) self.stage3 = nn.ModuleList(stage3) stage4 = [] - for stage_config in zip(config.backbone_stage4_in_channels, config.backbone_stage4_out_channels, - config.backbone_stage4_kernel_size[0], config.backbone_stage4_stride[0], - config.backbone_stage4_dilation[0], config.backbone_stage4_groups[0]): + for stage_config in zip( + config.backbone_stage4_in_channels, + config.backbone_stage4_out_channels, + config.backbone_stage4_kernel_size[0], + config.backbone_stage4_stride[0], + config.backbone_stage4_dilation[0], + config.backbone_stage4_groups[0], + ): stage4.append(RepConvLayer(*stage_config)) self.stage4 = nn.ModuleList(stage4) @@ -446,7 +516,7 @@ def _initialize_weights(self): def forward(self, x): x = self.first_conv(x) - output = list() + output = [] for block in self.stage1: x = block(x) @@ -468,11 +538,18 @@ def forward(self, x): class FASTNeck(PreTrainedModel): - def __init__(self, config): super().__init__(config) - reduce_layer_configs = 
list(zip(config.neck_in_channels[0], config.neck_out_channels[0], config.neck_kernel_size[0], - config.neck_stride[0], config.neck_dilation[0], config.neck_groups[0])) + reduce_layer_configs = list( + zip( + config.neck_in_channels[0], + config.neck_out_channels[0], + config.neck_kernel_size[0], + config.neck_stride[0], + config.neck_dilation[0], + config.neck_groups[0], + ) + ) self.reduce_layer1 = RepConvLayer(*reduce_layer_configs[0]) self.reduce_layer2 = RepConvLayer(*reduce_layer_configs[1]) @@ -491,7 +568,7 @@ def _initialize_weights(self): def _upsample(self, x, y): _, _, H, W = y.size() - return F.upsample(x, size=(H, W), mode='bilinear') + return F.upsample(x, size=(H, W), mode="bilinear") def forward(self, x): f1, f2, f3, f4 = x @@ -508,25 +585,38 @@ def forward(self, x): class FASTHead(nn.Module): - def __init__(self, config): super(FASTHead, self).__init__() - self.conv = RepConvLayer(config.head_conv_in_channels, config.head_conv_out_channels, - config.head_conv_kernel_size, config.head_conv_stride, config.head_conv_dilation, - config.head_conv_groups) + self.conv = RepConvLayer( + config.head_conv_in_channels, + config.head_conv_out_channels, + config.head_conv_kernel_size, + config.head_conv_stride, + config.head_conv_dilation, + config.head_conv_groups, + ) - self.final = ConvLayer(config.head_final_in_channels[0], config.head_final_out_channels[0], - config.head_final_kernel_size[0], config.head_final_stride[0], config.head_final_dilation[0], - config.head_final_groups[0], config.head_final_bias[0], config.head_final_has_shuffle[0], - config.head_final_use_bn[0], config.head_final_act_func[0], config.head_final_dropout_rate[0], - config.head_final_ops_order) + self.final = ConvLayer( + config.head_final_in_channels[0], + config.head_final_out_channels[0], + config.head_final_kernel_size[0], + config.head_final_stride[0], + config.head_final_dilation[0], + config.head_final_groups[0], + config.head_final_bias[0], + config.head_final_has_shuffle[0], + config.head_final_use_bn[0], + config.head_final_act_func[0], + config.head_final_dropout_rate[0], + config.head_final_ops_order, + ) self.pooling_size = config.head_pooling_size[0] - self.pooling_1s = nn.MaxPool2d(kernel_size=self.pooling_size, stride=1, - padding=(self.pooling_size - 1) // 2) - self.pooling_2s = nn.MaxPool2d(kernel_size=self.pooling_size // 2 + 1, stride=1, - padding=(self.pooling_size // 2) // 2) + self.pooling_1s = nn.MaxPool2d(kernel_size=self.pooling_size, stride=1, padding=(self.pooling_size - 1) // 2) + self.pooling_2s = nn.MaxPool2d( + kernel_size=self.pooling_size // 2 + 1, stride=1, padding=(self.pooling_size // 2) // 2 + ) if config.head_dropout_ratio[0] > 0: self.dropout = nn.Dropout2d(config.head_dropout_ratio[0]) @@ -551,17 +641,17 @@ def forward(self, x): return x def get_results(self, out, img_meta, cfg, scale=2): - - org_img_size = img_meta['org_img_size'][0] - img_size = img_meta['img_size'][0] # 640*640 + org_img_size = img_meta["org_img_size"][0] + img_size = img_meta["img_size"][0] # 640*640 batch_size = out.size(0) - outputs = dict() + outputs = {} - texts = F.interpolate(out[:, 0:1, :, :], size=(img_size[0] // scale, img_size[1] // scale), - mode='nearest') # B*1*320*320 + texts = F.interpolate( + out[:, 0:1, :, :], size=(img_size[0] // scale, img_size[1] // scale), mode="nearest" + ) # B*1*320*320 texts = self._max_pooling(texts, scale=scale) # B*1*320*320 score_maps = torch.sigmoid_(texts) # B*1*320*320 - score_maps = F.interpolate(score_maps, size=(img_size[0], img_size[1]), 
mode='nearest') # B*1*640*640 + score_maps = F.interpolate(score_maps, size=(img_size[0], img_size[1]), mode="nearest") # B*1*640*640 score_maps = score_maps.squeeze(1) # B*640*640 kernels = (out[:, 0, :, :] > 0).to(torch.uint8) # B*160*160 @@ -572,26 +662,24 @@ def get_results(self, out, img_meta, cfg, scale=2): labels_ = np.array(labels_) labels_ = torch.from_numpy(labels_) labels = labels_.unsqueeze(1).to(torch.float32) # B*1*160*160 - labels = F.interpolate(labels, size=(img_size[0] // scale, img_size[1] // scale), mode='nearest') # B*1*320*320 + labels = F.interpolate( + labels, size=(img_size[0] // scale, img_size[1] // scale), mode="nearest" + ) # B*1*320*320 labels = self._max_pooling(labels, scale=scale) - labels = F.interpolate(labels, size=(img_size[0], img_size[1]), mode='nearest') # B*1*640*640 + labels = F.interpolate(labels, size=(img_size[0], img_size[1]), mode="nearest") # B*1*640*640 labels = labels.squeeze(1).to(torch.int32) # B*640*640 keys = [torch.unique(labels_[i], sorted=True) for i in range(batch_size)] - outputs.update(dict(kernels=kernels.data.cpu())) + outputs.update({"kernels": kernels.data.cpu()}) - scales = (float(org_img_size[1]) / float(img_size[1]), - float(org_img_size[0]) / float(img_size[0])) + scales = (float(org_img_size[1]) / float(img_size[1]), float(org_img_size[0]) / float(img_size[0])) results = [] for i in range(batch_size): bboxes, scores = generate_bbox(keys[i], labels[i], score_maps[i], scales, cfg) - results.append(dict( - bboxes=bboxes, - scores=scores - )) - outputs.update(dict(results=results)) + results.append({"bboxes": bboxes, "scores": scores}) + outputs.update({"results": results}) return outputs @@ -603,19 +691,19 @@ def _max_pooling(self, x, scale=1): return x -class FASTForImageCaptioning(nn.Module): +class FASTForImageCaptioning(PreTrainedModel): def __init__(self, config): - super().__init__() + super().__init__(config) self.backbone = TextNet(config=config) self.neck = FASTNeck(config=config) self.det_head = FASTHead(config=config) def _upsample(self, x, size, scale=1): _, _, H, W = size - return F.interpolate(x, size=(H // scale, W // scale), mode='bilinear') + return F.interpolate(x, size=(H // scale, W // scale), mode="bilinear") def forward(self, imgs, img_metas=None, cfg=None): - outputs = dict() + outputs = {} f = self.backbone(imgs) diff --git a/tests/models/fast/__init__.py b/tests/models/fast/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/models/fast/test_modeling_fast.py b/tests/models/fast/test_modeling_fast.py new file mode 100644 index 000000000000..25fcaffb82a0 --- /dev/null +++ b/tests/models/fast/test_modeling_fast.py @@ -0,0 +1,256 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch Falcon model. 
""" + +import unittest + +from parameterized import parameterized + +from transformers import ( + FastConfig, + is_torch_available, + set_seed, +) +from transformers.testing_utils import CaptureLogger, require_bitsandbytes, require_torch, slow, tooslow, torch_device +from transformers.utils import logging as transformers_logging + +from ...generation.test_utils import GenerationTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask, floats_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + +if is_torch_available(): + import torch + + from transformers import ( + FASTForImageCaptioning, + ) + + +class FastModelTester: + def __init__( + self, + backbone_kernel_size=3, + backbone_stride=2, + backbone_dilation=1, + backbone_groups=1, + backbone_bias=False, + backbone_has_shuffle=False, + backbone_in_channels=3, + backbone_out_channels=64, + backbone_use_bn=True, + backbone_act_func="relu", + backbone_dropout_rate=0, + backbone_ops_order="weight_bn_act", + backbone_stage1_in_channels=(64, 64, 64), + backbone_stage1_out_channels=(64, 64, 64), + backbone_stage1_kernel_size=((3, 3), (3, 3), (3, 3)), + backbone_stage1_stride=(1, 2, 1), + backbone_stage1_dilation=(1, 1, 1), + backbone_stage1_groups=(1, 1, 1), + backbone_stage2_in_channels=(64, 128, 128, 128), + backbone_stage2_out_channels=(128, 128, 128, 128), + backbone_stage2_kernel_size=((3, 3), (1, 3), (3, 3), (3, 1)), + backbone_stage2_stride=(2, 1, 1, 1), + backbone_stage2_dilation=(1, 1, 1, 1), + backbone_stage2_groups=(1, 1, 1, 1), + backbone_stage3_in_channels=(128, 256, 256, 256), + backbone_stage3_out_channels=(256, 256, 256, 256), + backbone_stage3_kernel_size=((3, 3), (3, 3), (3, 1), (1, 3)), + backbone_stage3_stride=(2, 1, 1, 1), + backbone_stage3_dilation=(1, 1, 1, 1), + backbone_stage3_groups=(1, 1, 1, 1), + backbone_stage4_in_channels=(256, 512, 512, 512), + backbone_stage4_out_channels=(512, 512, 512, 512), + backbone_stage4_kernel_size=((3, 3), (3, 1), (1, 3), (3, 3)), + backbone_stage4_stride=(2, 1, 1, 1), + backbone_stage4_dilation=(1, 1, 1, 1), + backbone_stage4_groups=(1, 1, 1, 1), + neck_in_channels=(64, 128, 256, 512), + neck_out_channels=(128, 128, 128, 128), + neck_kernel_size=((3, 3), (3, 3), (3, 3), (3, 3)), + neck_stride=(1, 1, 1, 1), + neck_dilation=(1, 1, 1, 1), + neck_groups=(1, 1, 1, 1), + head_pooling_size=9, + head_dropout_ratio=0.1, + head_conv_in_channels=512, + head_conv_out_channels=128, + head_conv_kernel_size=(3, 3), + head_conv_stride=1, + head_conv_dilation=1, + head_conv_groups=1, + head_final_kernel_size=1, + head_final_stride=1, + head_final_dilation=1, + head_final_groups=1, + head_final_bias=False, + head_final_has_shuffle=False, + head_final_in_channels=128, + head_final_out_channels=5, + head_final_use_bn=False, + head_final_act_func=None, + head_final_dropout_rate=0, + head_final_ops_order="weight", + batch_size=3, + num_channels=3, + image_size=500, + + ): + self.backbone_kernel_size = backbone_kernel_size + self.backbone_stride = backbone_stride + self.backbone_dilation = backbone_dilation + self.backbone_groups = backbone_groups + self.backbone_bias = backbone_bias + self.backbone_has_shuffle = backbone_has_shuffle + self.backbone_in_channels = backbone_in_channels + self.backbone_out_channels = backbone_out_channels + self.backbone_use_bn = backbone_use_bn + self.backbone_act_func = backbone_act_func + self.backbone_dropout_rate = backbone_dropout_rate + self.backbone_ops_order = 
backbone_ops_order + + self.backbone_stage1_in_channels = backbone_stage1_in_channels + self.backbone_stage1_out_channels = backbone_stage1_out_channels + self.backbone_stage1_kernel_size = (backbone_stage1_kernel_size,) + self.backbone_stage1_stride = (backbone_stage1_stride,) + self.backbone_stage1_dilation = (backbone_stage1_dilation,) + self.backbone_stage1_groups = (backbone_stage1_groups,) + + self.backbone_stage2_in_channels = backbone_stage2_in_channels + self.backbone_stage2_out_channels = backbone_stage2_out_channels + self.backbone_stage2_kernel_size = (backbone_stage2_kernel_size,) + self.backbone_stage2_stride = (backbone_stage2_stride,) + self.backbone_stage2_dilation = (backbone_stage2_dilation,) + self.backbone_stage2_groups = (backbone_stage2_groups,) + + self.backbone_stage3_in_channels = backbone_stage3_in_channels + self.backbone_stage3_out_channels = backbone_stage3_out_channels + self.backbone_stage3_kernel_size = (backbone_stage3_kernel_size,) + self.backbone_stage3_stride = (backbone_stage3_stride,) + self.backbone_stage3_dilation = (backbone_stage3_dilation,) + self.backbone_stage3_groups = (backbone_stage3_groups,) + + self.backbone_stage4_in_channels = backbone_stage4_in_channels + self.backbone_stage4_out_channels = backbone_stage4_out_channels + self.backbone_stage4_kernel_size = (backbone_stage4_kernel_size,) + self.backbone_stage4_stride = (backbone_stage4_stride,) + self.backbone_stage4_dilation = (backbone_stage4_dilation,) + self.backbone_stage4_groups = (backbone_stage4_groups,) + + self.neck_in_channels = (neck_in_channels,) + self.neck_out_channels = (neck_out_channels,) + self.neck_kernel_size = (neck_kernel_size,) + self.neck_stride = (neck_stride,) + self.neck_dilation = (neck_dilation,) + self.neck_groups = (neck_groups,) + + self.head_pooling_size = (head_pooling_size,) + self.head_dropout_ratio = (head_dropout_ratio,) + + self.head_conv_in_channels = head_conv_in_channels + self.head_conv_out_channels = head_conv_out_channels + self.head_conv_kernel_size = head_conv_kernel_size + self.head_conv_stride = head_conv_stride + self.head_conv_dilation = head_conv_dilation + self.head_conv_groups = head_conv_groups + + self.head_final_kernel_size = (head_final_kernel_size,) + self.head_final_stride = (head_final_stride,) + self.head_final_dilation = (head_final_dilation,) + self.head_final_groups = (head_final_groups,) + self.head_final_bias = (head_final_bias,) + self.head_final_has_shuffle = (head_final_has_shuffle,) + self.head_final_in_channels = (head_final_in_channels,) + self.head_final_out_channels = (head_final_out_channels,) + self.head_final_use_bn = (head_final_use_bn,) + self.head_final_act_func = (head_final_act_func,) + self.head_final_dropout_rate = (head_final_dropout_rate,) + self.head_final_ops_order = head_final_ops_order + + self.batch_size = batch_size + self.num_channels = num_channels + self.image_size = image_size + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + pixel_values_meta = { + "org_img_size": (500, 500), + "img_size": (500, 500) + } + # labels = None + # if self.use_labels: + # labels = ids_tensor([self.batch_size], self.num_labels) + # + config = self.get_config() + + return config, {"imgs": pixel_values, "img_meta": pixel_values_meta} + + def get_config(self): + return FastConfig() + + def create_and_check_model(self, config, pixel_values): + model = FASTForImageCaptioning(config=config) + model.to(torch_device) + 
model.eval() + result = model(pixel_values) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values = config_and_inputs + inputs_dict = {"imgs": pixel_values} + return config, inputs_dict + + +@require_torch +class FastModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = ( + ( + FASTForImageCaptioning, + ) + if is_torch_available() + else () + ) + + pipeline_model_mapping = {} + test_headmasking = False + test_pruning = False + test_attention_outputs = False + test_resize_embeddings = False + test_head_masking = False + has_attentions = False + + def setUp(self): + self.model_tester = FastModelTester(self) + self.config_tester = ConfigTester(self, config_class=FastConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def create_and_test_config_common_properties(self): + return + + @unittest.skip(reason="Fast does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + @unittest.skip(reason="Fast does not support input and output embeddings") + def test_model_common_attributes(self): + pass From 5d21171ebf039f21fde59288db4ac928c2972be7 Mon Sep 17 00:00:00 2001 From: raghavanone Date: Wed, 11 Oct 2023 08:16:10 +0530 Subject: [PATCH 004/152] More changes --- .../models/fast/configuration_fast.py | 205 ++++++++-------- src/transformers/models/fast/modeling_fast.py | 158 ++++++------- tests/models/fast/test_modeling_fast.py | 219 ++++++++++++------ 3 files changed, 331 insertions(+), 251 deletions(-) diff --git a/src/transformers/models/fast/configuration_fast.py b/src/transformers/models/fast/configuration_fast.py index 914bcda0567f..773dbcb151c7 100644 --- a/src/transformers/models/fast/configuration_fast.py +++ b/src/transformers/models/fast/configuration_fast.py @@ -3,70 +3,73 @@ class FastConfig(PretrainedConfig): def __init__( - self, - backbone_kernel_size=3, - backbone_stride=2, - backbone_dilation=1, - backbone_groups=1, - backbone_bias=False, - backbone_has_shuffle=False, - backbone_in_channels=3, - backbone_out_channels=64, - backbone_use_bn=True, - backbone_act_func="relu", - backbone_dropout_rate=0, - backbone_ops_order="weight_bn_act", - backbone_stage1_in_channels=(64, 64, 64), - backbone_stage1_out_channels=(64, 64, 64), - backbone_stage1_kernel_size=((3, 3), (3, 3), (3, 3)), - backbone_stage1_stride=(1, 2, 1), - backbone_stage1_dilation=(1, 1, 1), - backbone_stage1_groups=(1, 1, 1), - backbone_stage2_in_channels=(64, 128, 128, 128), - backbone_stage2_out_channels=(128, 128, 128, 128), - backbone_stage2_kernel_size=((3, 3), (1, 3), (3, 3), (3, 1)), - backbone_stage2_stride=(2, 1, 1, 1), - backbone_stage2_dilation=(1, 1, 1, 1), - backbone_stage2_groups=(1, 1, 1, 1), - backbone_stage3_in_channels=(128, 256, 256, 256), - backbone_stage3_out_channels=(256, 256, 256, 256), - backbone_stage3_kernel_size=((3, 3), (3, 3), (3, 1), (1, 3)), - backbone_stage3_stride=(2, 1, 1, 1), - backbone_stage3_dilation=(1, 1, 1, 1), - backbone_stage3_groups=(1, 1, 1, 1), - backbone_stage4_in_channels=(256, 512, 512, 512), - backbone_stage4_out_channels=(512, 512, 512, 512), - backbone_stage4_kernel_size=((3, 3), (3, 1), (1, 3), (3, 3)), - 
backbone_stage4_stride=(2, 1, 1, 1), - backbone_stage4_dilation=(1, 1, 1, 1), - backbone_stage4_groups=(1, 1, 1, 1), - neck_in_channels=(64, 128, 256, 512), - neck_out_channels=(128, 128, 128, 128), - neck_kernel_size=((3, 3), (3, 3), (3, 3), (3, 3)), - neck_stride=(1, 1, 1, 1), - neck_dilation=(1, 1, 1, 1), - neck_groups=(1, 1, 1, 1), - head_pooling_size=9, - head_dropout_ratio=0.1, - head_conv_in_channels=512, - head_conv_out_channels=128, - head_conv_kernel_size=(3, 3), - head_conv_stride=1, - head_conv_dilation=1, - head_conv_groups=1, - head_final_kernel_size=1, - head_final_stride=1, - head_final_dilation=1, - head_final_groups=1, - head_final_bias=False, - head_final_has_shuffle=False, - head_final_in_channels=128, - head_final_out_channels=5, - head_final_use_bn=False, - head_final_act_func=None, - head_final_dropout_rate=0, - head_final_ops_order="weight", - **kwargs, + self, + backbone_kernel_size=3, + backbone_stride=2, + backbone_dilation=1, + backbone_groups=1, + backbone_bias=False, + backbone_has_shuffle=False, + backbone_in_channels=3, + backbone_out_channels=64, + backbone_use_bn=True, + backbone_act_func="relu", + backbone_dropout_rate=0, + backbone_ops_order="weight_bn_act", + backbone_stage1_in_channels=[64, 64, 64], + backbone_stage1_out_channels=[64, 64, 64], + backbone_stage1_kernel_size=[[3, 3], [3, 3], [3, 3]], + backbone_stage1_stride=[1, 2, 1], + backbone_stage1_dilation=[1, 1, 1], + backbone_stage1_groups=[1, 1, 1], + backbone_stage2_in_channels=[64, 128, 128, 128], + backbone_stage2_out_channels=[128, 128, 128, 128], + backbone_stage2_kernel_size=[[3, 3], [1, 3], [3, 3], [3, 1]], + backbone_stage2_stride=[2, 1, 1, 1], + backbone_stage2_dilation=[1, 1, 1, 1], + backbone_stage2_groups=[1, 1, 1, 1], + backbone_stage3_in_channels=[128, 256, 256, 256], + backbone_stage3_out_channels=[256, 256, 256, 256], + backbone_stage3_kernel_size=[[3, 3], [3, 3], [3, 1], [1, 3]], + backbone_stage3_stride=[2, 1, 1, 1], + backbone_stage3_dilation=[1, 1, 1, 1], + backbone_stage3_groups=[1, 1, 1, 1], + backbone_stage4_in_channels=[256, 512, 512, 512], + backbone_stage4_out_channels=[512, 512, 512, 512], + backbone_stage4_kernel_size=[[3, 3], [3, 1], [1, 3], [3, 3]], + backbone_stage4_stride=[2, 1, 1, 1], + backbone_stage4_dilation=[1, 1, 1, 1], + backbone_stage4_groups=[1, 1, 1, 1], + neck_in_channels=[64, 128, 256, 512], + neck_out_channels=[128, 128, 128, 128], + neck_kernel_size=[[3, 3], [3, 3], [3, 3], [3, 3]], + neck_stride=[1, 1, 1, 1], + neck_dilation=[1, 1, 1, 1], + neck_groups=[1, 1, 1, 1], + head_pooling_size=9, + head_dropout_ratio=0.1, + head_conv_in_channels=512, + head_conv_out_channels=128, + head_conv_kernel_size=[3, 3], + head_conv_stride=1, + head_conv_dilation=1, + head_conv_groups=1, + head_final_kernel_size=1, + head_final_stride=1, + head_final_dilation=1, + head_final_groups=1, + head_final_bias=False, + head_final_has_shuffle=False, + head_final_in_channels=128, + head_final_out_channels=5, + head_final_use_bn=False, + head_final_act_func=None, + head_final_dropout_rate=0, + head_final_ops_order="weight", + min_area=250, + min_score=0.88, + bbox_type='rect', + **kwargs, ): super().__init__(**kwargs) @@ -85,41 +88,41 @@ def __init__( self.backbone_stage1_in_channels = backbone_stage1_in_channels self.backbone_stage1_out_channels = backbone_stage1_out_channels - self.backbone_stage1_kernel_size = (backbone_stage1_kernel_size,) - self.backbone_stage1_stride = (backbone_stage1_stride,) - self.backbone_stage1_dilation = (backbone_stage1_dilation,) - 
self.backbone_stage1_groups = (backbone_stage1_groups,) + self.backbone_stage1_kernel_size = backbone_stage1_kernel_size + self.backbone_stage1_stride = backbone_stage1_stride + self.backbone_stage1_dilation = backbone_stage1_dilation + self.backbone_stage1_groups = backbone_stage1_groups self.backbone_stage2_in_channels = backbone_stage2_in_channels self.backbone_stage2_out_channels = backbone_stage2_out_channels - self.backbone_stage2_kernel_size = (backbone_stage2_kernel_size,) - self.backbone_stage2_stride = (backbone_stage2_stride,) - self.backbone_stage2_dilation = (backbone_stage2_dilation,) - self.backbone_stage2_groups = (backbone_stage2_groups,) + self.backbone_stage2_kernel_size = backbone_stage2_kernel_size + self.backbone_stage2_stride = backbone_stage2_stride + self.backbone_stage2_dilation = backbone_stage2_dilation + self.backbone_stage2_groups = backbone_stage2_groups self.backbone_stage3_in_channels = backbone_stage3_in_channels self.backbone_stage3_out_channels = backbone_stage3_out_channels - self.backbone_stage3_kernel_size = (backbone_stage3_kernel_size,) - self.backbone_stage3_stride = (backbone_stage3_stride,) - self.backbone_stage3_dilation = (backbone_stage3_dilation,) - self.backbone_stage3_groups = (backbone_stage3_groups,) + self.backbone_stage3_kernel_size = backbone_stage3_kernel_size + self.backbone_stage3_stride = backbone_stage3_stride + self.backbone_stage3_dilation = backbone_stage3_dilation + self.backbone_stage3_groups = backbone_stage3_groups self.backbone_stage4_in_channels = backbone_stage4_in_channels self.backbone_stage4_out_channels = backbone_stage4_out_channels - self.backbone_stage4_kernel_size = (backbone_stage4_kernel_size,) - self.backbone_stage4_stride = (backbone_stage4_stride,) - self.backbone_stage4_dilation = (backbone_stage4_dilation,) - self.backbone_stage4_groups = (backbone_stage4_groups,) + self.backbone_stage4_kernel_size = backbone_stage4_kernel_size + self.backbone_stage4_stride = backbone_stage4_stride + self.backbone_stage4_dilation = backbone_stage4_dilation + self.backbone_stage4_groups = backbone_stage4_groups - self.neck_in_channels = (neck_in_channels,) - self.neck_out_channels = (neck_out_channels,) - self.neck_kernel_size = (neck_kernel_size,) - self.neck_stride = (neck_stride,) - self.neck_dilation = (neck_dilation,) - self.neck_groups = (neck_groups,) + self.neck_in_channels = neck_in_channels + self.neck_out_channels = neck_out_channels + self.neck_kernel_size = neck_kernel_size + self.neck_stride = neck_stride + self.neck_dilation = neck_dilation + self.neck_groups = neck_groups - self.head_pooling_size = (head_pooling_size,) - self.head_dropout_ratio = (head_dropout_ratio,) + self.head_pooling_size = head_pooling_size + self.head_dropout_ratio = head_dropout_ratio self.head_conv_in_channels = head_conv_in_channels self.head_conv_out_channels = head_conv_out_channels @@ -128,15 +131,19 @@ def __init__( self.head_conv_dilation = head_conv_dilation self.head_conv_groups = head_conv_groups - self.head_final_kernel_size = (head_final_kernel_size,) - self.head_final_stride = (head_final_stride,) - self.head_final_dilation = (head_final_dilation,) - self.head_final_groups = (head_final_groups,) - self.head_final_bias = (head_final_bias,) - self.head_final_has_shuffle = (head_final_has_shuffle,) - self.head_final_in_channels = (head_final_in_channels,) - self.head_final_out_channels = (head_final_out_channels,) - self.head_final_use_bn = (head_final_use_bn,) - self.head_final_act_func = (head_final_act_func,) - 
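The assignments rewritten in this hunk stored every value as a 1-element tuple (originally an accidental trailing comma, later made explicit by the formatter), which is why the modeling code had to index each such config attribute with [0]; a quick illustration of the underlying Python behaviour (values are examples only):

    neck_in_channels = [64, 128, 256, 512]
    wrapped = neck_in_channels,    # trailing comma -> ([64, 128, 256, 512],)
    plain = neck_in_channels       # -> [64, 128, 256, 512]
    assert wrapped[0] is plain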
self.head_final_dropout_rate = (head_final_dropout_rate,) + self.head_final_kernel_size = head_final_kernel_size + self.head_final_stride = head_final_stride + self.head_final_dilation = head_final_dilation + self.head_final_groups = head_final_groups + self.head_final_bias = head_final_bias + self.head_final_has_shuffle = head_final_has_shuffle + self.head_final_in_channels = head_final_in_channels + self.head_final_out_channels = head_final_out_channels + self.head_final_use_bn = head_final_use_bn + self.head_final_act_func = head_final_act_func + self.head_final_dropout_rate = head_final_dropout_rate self.head_final_ops_order = head_final_ops_order + + self.min_area = min_area + self.min_score = min_score + self.bbox_type = bbox_type diff --git a/src/transformers/models/fast/modeling_fast.py b/src/transformers/models/fast/modeling_fast.py index a700902b1fb1..255eb2635fcf 100644 --- a/src/transformers/models/fast/modeling_fast.py +++ b/src/transformers/models/fast/modeling_fast.py @@ -1,5 +1,4 @@ import math -import unittest from collections import OrderedDict import cv2 @@ -132,40 +131,6 @@ def is_zero_layer(): return False -def generate_bbox(keys, label, score, scales, cfg): - label_num = len(keys) - bboxes = [] - scores = [] - for index in range(1, label_num): - i = keys[index] - ind = label == i - ind_np = ind.data.cpu().numpy() - points = np.array(np.where(ind_np)).transpose((1, 0)) - if points.shape[0] < cfg.test_cfg.min_area: - label[ind] = 0 - continue - score_i = score[ind].mean().item() - if score_i < cfg.test_cfg.min_score: - label[ind] = 0 - continue - - if cfg.test_cfg.bbox_type == "rect": - rect = cv2.minAreaRect(points[:, ::-1]) - alpha = math.sqrt(math.sqrt(points.shape[0] / (rect[1][0] * rect[1][1]))) - rect = (rect[0], (rect[1][0] * alpha, rect[1][1] * alpha), rect[2]) - bbox = cv2.boxPoints(rect) * scales - - elif cfg.test_cfg.bbox_type == "poly": - binary = np.zeros(label.shape, dtype="uint8") - binary[ind_np] = 1 - contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) - bbox = contours[0] * scales - bbox = bbox.astype("int32") - bboxes.append(bbox.reshape(-1).tolist()) - scores.append(score_i) - return bboxes, scores - - class FalsePreTrainedModel(PreTrainedModel): pass @@ -460,10 +425,10 @@ def __init__(self, config): for stage_config in zip( config.backbone_stage1_in_channels, config.backbone_stage1_out_channels, - config.backbone_stage1_kernel_size[0], - config.backbone_stage1_stride[0], - config.backbone_stage1_dilation[0], - config.backbone_stage1_groups[0], + config.backbone_stage1_kernel_size, + config.backbone_stage1_stride, + config.backbone_stage1_dilation, + config.backbone_stage1_groups, ): stage1.append(RepConvLayer(*stage_config)) self.stage1 = nn.ModuleList(stage1) @@ -472,10 +437,10 @@ def __init__(self, config): for stage_config in zip( config.backbone_stage2_in_channels, config.backbone_stage2_out_channels, - config.backbone_stage2_kernel_size[0], - config.backbone_stage2_stride[0], - config.backbone_stage2_dilation[0], - config.backbone_stage2_groups[0], + config.backbone_stage2_kernel_size, + config.backbone_stage2_stride, + config.backbone_stage2_dilation, + config.backbone_stage2_groups, ): stage2.append(RepConvLayer(*stage_config)) self.stage2 = nn.ModuleList(stage2) @@ -484,10 +449,10 @@ def __init__(self, config): for stage_config in zip( config.backbone_stage3_in_channels, config.backbone_stage3_out_channels, - config.backbone_stage3_kernel_size[0], - config.backbone_stage3_stride[0], - 
config.backbone_stage3_dilation[0], - config.backbone_stage3_groups[0], + config.backbone_stage3_kernel_size, + config.backbone_stage3_stride, + config.backbone_stage3_dilation, + config.backbone_stage3_groups, ): stage3.append(RepConvLayer(*stage_config)) self.stage3 = nn.ModuleList(stage3) @@ -496,10 +461,10 @@ def __init__(self, config): for stage_config in zip( config.backbone_stage4_in_channels, config.backbone_stage4_out_channels, - config.backbone_stage4_kernel_size[0], - config.backbone_stage4_stride[0], - config.backbone_stage4_dilation[0], - config.backbone_stage4_groups[0], + config.backbone_stage4_kernel_size, + config.backbone_stage4_stride, + config.backbone_stage4_dilation, + config.backbone_stage4_groups, ): stage4.append(RepConvLayer(*stage_config)) self.stage4 = nn.ModuleList(stage4) @@ -542,12 +507,12 @@ def __init__(self, config): super().__init__(config) reduce_layer_configs = list( zip( - config.neck_in_channels[0], - config.neck_out_channels[0], - config.neck_kernel_size[0], - config.neck_stride[0], - config.neck_dilation[0], - config.neck_groups[0], + config.neck_in_channels, + config.neck_out_channels, + config.neck_kernel_size, + config.neck_stride, + config.neck_dilation, + config.neck_groups, ) ) @@ -597,29 +562,33 @@ def __init__(self, config): ) self.final = ConvLayer( - config.head_final_in_channels[0], - config.head_final_out_channels[0], - config.head_final_kernel_size[0], - config.head_final_stride[0], - config.head_final_dilation[0], - config.head_final_groups[0], - config.head_final_bias[0], - config.head_final_has_shuffle[0], - config.head_final_use_bn[0], - config.head_final_act_func[0], - config.head_final_dropout_rate[0], + config.head_final_in_channels, + config.head_final_out_channels, + config.head_final_kernel_size, + config.head_final_stride, + config.head_final_dilation, + config.head_final_groups, + config.head_final_bias, + config.head_final_has_shuffle, + config.head_final_use_bn, + config.head_final_act_func, + config.head_final_dropout_rate, config.head_final_ops_order, ) - self.pooling_size = config.head_pooling_size[0] + self.min_area = config.min_area + self.min_score = config.min_score + self.bbox_type = config.bbox_type + + self.pooling_size = config.head_pooling_size self.pooling_1s = nn.MaxPool2d(kernel_size=self.pooling_size, stride=1, padding=(self.pooling_size - 1) // 2) self.pooling_2s = nn.MaxPool2d( kernel_size=self.pooling_size // 2 + 1, stride=1, padding=(self.pooling_size // 2) // 2 ) - if config.head_dropout_ratio[0] > 0: - self.dropout = nn.Dropout2d(config.head_dropout_ratio[0]) + if config.head_dropout_ratio > 0: + self.dropout = nn.Dropout2d(config.head_dropout_ratio) else: self.dropout = None @@ -640,9 +609,9 @@ def forward(self, x): x = self.final(x) return x - def get_results(self, out, img_meta, cfg, scale=2): - org_img_size = img_meta["org_img_size"][0] - img_size = img_meta["img_size"][0] # 640*640 + def get_results(self, out, img_meta, scale=2): + org_img_size = img_meta["org_img_size"] + img_size = img_meta["img_size"] # 640*640 batch_size = out.size(0) outputs = {} @@ -650,7 +619,7 @@ def get_results(self, out, img_meta, cfg, scale=2): out[:, 0:1, :, :], size=(img_size[0] // scale, img_size[1] // scale), mode="nearest" ) # B*1*320*320 texts = self._max_pooling(texts, scale=scale) # B*1*320*320 - score_maps = torch.sigmoid_(texts) # B*1*320*320 + score_maps = torch.sigmoid_(texts) # B*1*320*320~ score_maps = F.interpolate(score_maps, size=(img_size[0], img_size[1]), mode="nearest") # B*1*640*640 score_maps = 
score_maps.squeeze(1) # B*640*640 @@ -677,7 +646,7 @@ def get_results(self, out, img_meta, cfg, scale=2): results = [] for i in range(batch_size): - bboxes, scores = generate_bbox(keys[i], labels[i], score_maps[i], scales, cfg) + bboxes, scores = self.generate_bbox(keys[i], labels[i], score_maps[i], scales) results.append({"bboxes": bboxes, "scores": scores}) outputs.update({"results": results}) @@ -690,6 +659,39 @@ def _max_pooling(self, x, scale=1): x = self.pooling_2s(x) return x + def generate_bbox(self, keys, label, score, scales): + label_num = len(keys) + bboxes = [] + scores = [] + for index in range(1, label_num): + i = keys[index] + ind = label == i + ind_np = ind.data.cpu().numpy() + points = np.array(np.where(ind_np)).transpose((1, 0)) + if points.shape[0] < self.min_area: + label[ind] = 0 + continue + score_i = score[ind].mean().item() + if score_i < self.min_score: + label[ind] = 0 + continue + + if self.bbox_type == "rect": + rect = cv2.minAreaRect(points[:, ::-1]) + alpha = math.sqrt(math.sqrt(points.shape[0] / (rect[1][0] * rect[1][1]))) + rect = (rect[0], (rect[1][0] * alpha, rect[1][1] * alpha), rect[2]) + bbox = cv2.boxPoints(rect) * scales + else: + binary = np.zeros(label.shape, dtype="uint8") + binary[ind_np] = 1 + contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + bbox = contours[0] * scales + + bbox = bbox.astype("int32") + bboxes.append(bbox.reshape(-1).tolist()) + scores.append(score_i) + return bboxes, scores + class FASTForImageCaptioning(PreTrainedModel): def __init__(self, config): @@ -702,7 +704,7 @@ def _upsample(self, x, size, scale=1): _, _, H, W = size return F.interpolate(x, size=(H // scale, W // scale), mode="bilinear") - def forward(self, imgs, img_metas=None, cfg=None): + def forward(self, imgs, img_metas=None): outputs = {} f = self.backbone(imgs) @@ -712,7 +714,7 @@ def forward(self, imgs, img_metas=None, cfg=None): det_out = self.det_head(f) det_out = self._upsample(det_out, imgs.size(), scale=4) - det_res = self.det_head.get_results(det_out, img_metas, cfg, scale=2) + det_res = self.det_head.get_results(det_out, img_metas, scale=2) outputs.update(det_res) return outputs diff --git a/tests/models/fast/test_modeling_fast.py b/tests/models/fast/test_modeling_fast.py index 25fcaffb82a0..26d2fd8e347e 100644 --- a/tests/models/fast/test_modeling_fast.py +++ b/tests/models/fast/test_modeling_fast.py @@ -42,6 +42,7 @@ class FastModelTester: def __init__( self, + parent, backbone_kernel_size=3, backbone_stride=2, backbone_dilation=1, @@ -54,41 +55,41 @@ def __init__( backbone_act_func="relu", backbone_dropout_rate=0, backbone_ops_order="weight_bn_act", - backbone_stage1_in_channels=(64, 64, 64), - backbone_stage1_out_channels=(64, 64, 64), - backbone_stage1_kernel_size=((3, 3), (3, 3), (3, 3)), - backbone_stage1_stride=(1, 2, 1), - backbone_stage1_dilation=(1, 1, 1), - backbone_stage1_groups=(1, 1, 1), - backbone_stage2_in_channels=(64, 128, 128, 128), - backbone_stage2_out_channels=(128, 128, 128, 128), - backbone_stage2_kernel_size=((3, 3), (1, 3), (3, 3), (3, 1)), - backbone_stage2_stride=(2, 1, 1, 1), - backbone_stage2_dilation=(1, 1, 1, 1), - backbone_stage2_groups=(1, 1, 1, 1), - backbone_stage3_in_channels=(128, 256, 256, 256), - backbone_stage3_out_channels=(256, 256, 256, 256), - backbone_stage3_kernel_size=((3, 3), (3, 3), (3, 1), (1, 3)), - backbone_stage3_stride=(2, 1, 1, 1), - backbone_stage3_dilation=(1, 1, 1, 1), - backbone_stage3_groups=(1, 1, 1, 1), - backbone_stage4_in_channels=(256, 512, 512, 512), - 
backbone_stage4_out_channels=(512, 512, 512, 512), - backbone_stage4_kernel_size=((3, 3), (3, 1), (1, 3), (3, 3)), - backbone_stage4_stride=(2, 1, 1, 1), - backbone_stage4_dilation=(1, 1, 1, 1), - backbone_stage4_groups=(1, 1, 1, 1), - neck_in_channels=(64, 128, 256, 512), - neck_out_channels=(128, 128, 128, 128), - neck_kernel_size=((3, 3), (3, 3), (3, 3), (3, 3)), - neck_stride=(1, 1, 1, 1), - neck_dilation=(1, 1, 1, 1), - neck_groups=(1, 1, 1, 1), + backbone_stage1_in_channels=[64, 64, 64], + backbone_stage1_out_channels=[64, 64, 64], + backbone_stage1_kernel_size=[[3, 3], [3, 3], [3, 3]], + backbone_stage1_stride=[1, 2, 1], + backbone_stage1_dilation=[1, 1, 1], + backbone_stage1_groups=[1, 1, 1], + backbone_stage2_in_channels=[64, 128, 128, 128], + backbone_stage2_out_channels=[128, 128, 128, 128], + backbone_stage2_kernel_size=[[3, 3], [1, 3], [3, 3], [3, 1]], + backbone_stage2_stride=[2, 1, 1, 1], + backbone_stage2_dilation=[1, 1, 1, 1], + backbone_stage2_groups=[1, 1, 1, 1], + backbone_stage3_in_channels=[128, 256, 256, 256], + backbone_stage3_out_channels=[256, 256, 256, 256], + backbone_stage3_kernel_size=[[3, 3], [3, 3], [3, 1], [1, 3]], + backbone_stage3_stride=[2, 1, 1, 1], + backbone_stage3_dilation=[1, 1, 1, 1], + backbone_stage3_groups=[1, 1, 1, 1], + backbone_stage4_in_channels=[256, 512, 512, 512], + backbone_stage4_out_channels=[512, 512, 512, 512], + backbone_stage4_kernel_size=[[3, 3], [3, 1], [1, 3], [3, 3]], + backbone_stage4_stride=[2, 1, 1, 1], + backbone_stage4_dilation=[1, 1, 1, 1], + backbone_stage4_groups=[1, 1, 1, 1], + neck_in_channels=[64, 128, 256, 512], + neck_out_channels=[128, 128, 128, 128], + neck_kernel_size=[[3, 3], [3, 3], [3, 3], [3, 3]], + neck_stride=[1, 1, 1, 1], + neck_dilation=[1, 1, 1, 1], + neck_groups=[1, 1, 1, 1], head_pooling_size=9, head_dropout_ratio=0.1, head_conv_in_channels=512, head_conv_out_channels=128, - head_conv_kernel_size=(3, 3), + head_conv_kernel_size=[3, 3], head_conv_stride=1, head_conv_dilation=1, head_conv_groups=1, @@ -107,8 +108,9 @@ def __init__( batch_size=3, num_channels=3, image_size=500, - + is_training=True, ): + self.parent = parent self.backbone_kernel_size = backbone_kernel_size self.backbone_stride = backbone_stride self.backbone_dilation = backbone_dilation @@ -124,41 +126,41 @@ def __init__( self.backbone_stage1_in_channels = backbone_stage1_in_channels self.backbone_stage1_out_channels = backbone_stage1_out_channels - self.backbone_stage1_kernel_size = (backbone_stage1_kernel_size,) - self.backbone_stage1_stride = (backbone_stage1_stride,) - self.backbone_stage1_dilation = (backbone_stage1_dilation,) - self.backbone_stage1_groups = (backbone_stage1_groups,) + self.backbone_stage1_kernel_size = backbone_stage1_kernel_size + self.backbone_stage1_stride = backbone_stage1_stride + self.backbone_stage1_dilation = backbone_stage1_dilation + self.backbone_stage1_groups = backbone_stage1_groups self.backbone_stage2_in_channels = backbone_stage2_in_channels self.backbone_stage2_out_channels = backbone_stage2_out_channels - self.backbone_stage2_kernel_size = (backbone_stage2_kernel_size,) - self.backbone_stage2_stride = (backbone_stage2_stride,) - self.backbone_stage2_dilation = (backbone_stage2_dilation,) - self.backbone_stage2_groups = (backbone_stage2_groups,) + self.backbone_stage2_kernel_size = backbone_stage2_kernel_size + self.backbone_stage2_stride = backbone_stage2_stride + self.backbone_stage2_dilation = backbone_stage2_dilation + self.backbone_stage2_groups = backbone_stage2_groups 
self.backbone_stage3_in_channels = backbone_stage3_in_channels self.backbone_stage3_out_channels = backbone_stage3_out_channels - self.backbone_stage3_kernel_size = (backbone_stage3_kernel_size,) - self.backbone_stage3_stride = (backbone_stage3_stride,) - self.backbone_stage3_dilation = (backbone_stage3_dilation,) - self.backbone_stage3_groups = (backbone_stage3_groups,) + self.backbone_stage3_kernel_size = backbone_stage3_kernel_size + self.backbone_stage3_stride = backbone_stage3_stride + self.backbone_stage3_dilation = backbone_stage3_dilation + self.backbone_stage3_groups = backbone_stage3_groups self.backbone_stage4_in_channels = backbone_stage4_in_channels self.backbone_stage4_out_channels = backbone_stage4_out_channels - self.backbone_stage4_kernel_size = (backbone_stage4_kernel_size,) - self.backbone_stage4_stride = (backbone_stage4_stride,) - self.backbone_stage4_dilation = (backbone_stage4_dilation,) - self.backbone_stage4_groups = (backbone_stage4_groups,) + self.backbone_stage4_kernel_size = backbone_stage4_kernel_size + self.backbone_stage4_stride = backbone_stage4_stride + self.backbone_stage4_dilation = backbone_stage4_dilation + self.backbone_stage4_groups = backbone_stage4_groups - self.neck_in_channels = (neck_in_channels,) - self.neck_out_channels = (neck_out_channels,) - self.neck_kernel_size = (neck_kernel_size,) - self.neck_stride = (neck_stride,) - self.neck_dilation = (neck_dilation,) - self.neck_groups = (neck_groups,) + self.neck_in_channels = neck_in_channels + self.neck_out_channels = neck_out_channels + self.neck_kernel_size = neck_kernel_size + self.neck_stride = neck_stride + self.neck_dilation = neck_dilation + self.neck_groups = neck_groups - self.head_pooling_size = (head_pooling_size,) - self.head_dropout_ratio = (head_dropout_ratio,) + self.head_pooling_size = head_pooling_size + self.head_dropout_ratio = head_dropout_ratio self.head_conv_in_channels = head_conv_in_channels self.head_conv_out_channels = head_conv_out_channels @@ -167,22 +169,23 @@ def __init__( self.head_conv_dilation = head_conv_dilation self.head_conv_groups = head_conv_groups - self.head_final_kernel_size = (head_final_kernel_size,) - self.head_final_stride = (head_final_stride,) - self.head_final_dilation = (head_final_dilation,) - self.head_final_groups = (head_final_groups,) - self.head_final_bias = (head_final_bias,) - self.head_final_has_shuffle = (head_final_has_shuffle,) - self.head_final_in_channels = (head_final_in_channels,) - self.head_final_out_channels = (head_final_out_channels,) - self.head_final_use_bn = (head_final_use_bn,) - self.head_final_act_func = (head_final_act_func,) - self.head_final_dropout_rate = (head_final_dropout_rate,) + self.head_final_kernel_size = head_final_kernel_size + self.head_final_stride = head_final_stride + self.head_final_dilation = head_final_dilation + self.head_final_groups = head_final_groups + self.head_final_bias = head_final_bias + self.head_final_has_shuffle = head_final_has_shuffle + self.head_final_in_channels = head_final_in_channels + self.head_final_out_channels = head_final_out_channels + self.head_final_use_bn = head_final_use_bn + self.head_final_act_func = head_final_act_func + self.head_final_dropout_rate = head_final_dropout_rate self.head_final_ops_order = head_final_ops_order self.batch_size = batch_size self.num_channels = num_channels self.image_size = image_size + self.is_training = is_training def prepare_config_and_inputs(self): pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, 
self.image_size]) @@ -196,22 +199,84 @@ def prepare_config_and_inputs(self): # config = self.get_config() - return config, {"imgs": pixel_values, "img_meta": pixel_values_meta} + return config, {"imgs": pixel_values, "img_metas": pixel_values_meta} def get_config(self): - return FastConfig() + return FastConfig( + backbone_kernel_size=self.backbone_kernel_size, + backbone_stride=self.backbone_stride, + backbone_dilation=self.backbone_dilation, + backbone_groups=self.backbone_groups, + backbone_bias=self.backbone_bias, + backbone_has_shuffle=self.backbone_has_shuffle, + backbone_in_channels=self.backbone_in_channels, + backbone_out_channels=self.backbone_out_channels, + backbone_use_bn=self.backbone_use_bn, + backbone_act_func=self.backbone_act_func, + backbone_dropout_rate=self.backbone_dropout_rate, + backbone_ops_order=self.backbone_ops_order, + backbone_stage1_in_channels=self.backbone_stage1_in_channels, + backbone_stage1_out_channels=self.backbone_stage1_out_channels, + backbone_stage1_kernel_size=self.backbone_stage1_kernel_size, + backbone_stage1_stride=self.backbone_stage1_stride, + backbone_stage1_dilation=self.backbone_stage1_dilation, + backbone_stage1_groups=self.backbone_stage1_groups, + backbone_stage2_in_channels=self.backbone_stage2_in_channels, + backbone_stage2_out_channels=self.backbone_stage2_out_channels, + backbone_stage2_kernel_size=self.backbone_stage2_kernel_size, + backbone_stage2_stride=self.backbone_stage2_stride, + backbone_stage2_dilation=self.backbone_stage2_dilation, + backbone_stage2_groups=self.backbone_stage2_groups, + backbone_stage3_in_channels=self.backbone_stage3_in_channels, + backbone_stage3_out_channels=self.backbone_stage3_out_channels, + backbone_stage3_kernel_size=self.backbone_stage3_kernel_size, + backbone_stage3_stride=self.backbone_stage3_stride, + backbone_stage3_dilation=self.backbone_stage3_dilation, + backbone_stage3_groups=self.backbone_stage3_groups, + backbone_stage4_in_channels=self.backbone_stage4_in_channels, + backbone_stage4_out_channels=self.backbone_stage4_out_channels, + backbone_stage4_kernel_size=self.backbone_stage4_kernel_size, + backbone_stage4_stride=self.backbone_stage4_stride, + backbone_stage4_dilation=self.backbone_stage4_dilation, + backbone_stage4_groups=self.backbone_stage4_groups, + neck_in_channels=self.neck_in_channels, + neck_out_channels=self.neck_out_channels, + neck_kernel_size=self.neck_kernel_size, + neck_stride=self.neck_stride, + neck_dilation=self.neck_dilation, + neck_groups=self.neck_groups, + head_pooling_size=self.head_pooling_size, + head_dropout_ratio=self.head_dropout_ratio, + head_conv_in_channels=self.head_conv_in_channels, + head_conv_out_channels=self.head_conv_out_channels, + head_conv_kernel_size=self.head_conv_kernel_size, + head_conv_stride=self.head_conv_stride, + head_conv_dilation=self.head_conv_dilation, + head_conv_groups=self.head_conv_groups, + head_final_kernel_size=self.head_final_kernel_size, + head_final_stride=self.head_final_stride, + head_final_dilation=self.head_final_dilation, + head_final_groups=self.head_final_groups, + head_final_bias=self.head_final_bias, + head_final_has_shuffle=self.head_final_has_shuffle, + head_final_in_channels=self.head_final_in_channels, + head_final_out_channels=self.head_final_out_channels, + head_final_use_bn=self.head_final_use_bn, + head_final_act_func=self.head_final_act_func, + head_final_dropout_rate=self.head_final_dropout_rate, + head_final_ops_order=self.head_final_ops_order, + ) - def create_and_check_model(self, config, 
pixel_values): + def create_and_check_model(self, config, input): model = FASTForImageCaptioning(config=config) model.to(torch_device) model.eval() - result = model(pixel_values) + result = model(imgs=input['imgs'], imgs_mets=input['img_metas']) self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() - config, pixel_values = config_and_inputs - inputs_dict = {"imgs": pixel_values} + config, inputs_dict = config_and_inputs return config, inputs_dict @@ -238,7 +303,13 @@ def setUp(self): self.config_tester = ConfigTester(self, config_class=FastConfig, hidden_size=37) def test_config(self): - self.config_tester.run_common_tests() + self.create_and_test_config_common_properties() + self.config_tester.create_and_test_config_to_json_string() + self.config_tester.create_and_test_config_to_json_file() + self.config_tester.create_and_test_config_from_and_save_pretrained() + self.config_tester.create_and_test_config_with_num_labels() + self.config_tester.check_config_can_be_init_without_params() + self.config_tester.check_config_arguments_init() def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() From a6e1cfdee13129c06ba4817115484c6acfe5a415 Mon Sep 17 00:00:00 2001 From: raghavanone Date: Fri, 13 Oct 2023 18:50:00 +0530 Subject: [PATCH 005/152] WIP --- src/transformers/models/fast/modeling_fast.py | 154 +++++++++--------- 1 file changed, 77 insertions(+), 77 deletions(-) diff --git a/src/transformers/models/fast/modeling_fast.py b/src/transformers/models/fast/modeling_fast.py index 255eb2635fcf..4f3188819ac3 100644 --- a/src/transformers/models/fast/modeling_fast.py +++ b/src/transformers/models/fast/modeling_fast.py @@ -609,48 +609,48 @@ def forward(self, x): x = self.final(x) return x - def get_results(self, out, img_meta, scale=2): - org_img_size = img_meta["org_img_size"] - img_size = img_meta["img_size"] # 640*640 - batch_size = out.size(0) - outputs = {} - - texts = F.interpolate( - out[:, 0:1, :, :], size=(img_size[0] // scale, img_size[1] // scale), mode="nearest" - ) # B*1*320*320 - texts = self._max_pooling(texts, scale=scale) # B*1*320*320 - score_maps = torch.sigmoid_(texts) # B*1*320*320~ - score_maps = F.interpolate(score_maps, size=(img_size[0], img_size[1]), mode="nearest") # B*1*640*640 - score_maps = score_maps.squeeze(1) # B*640*640 - - kernels = (out[:, 0, :, :] > 0).to(torch.uint8) # B*160*160 - labels_ = [] - for kernel in kernels.numpy(): - ret, label_ = cv2.connectedComponents(kernel) - labels_.append(label_) - labels_ = np.array(labels_) - labels_ = torch.from_numpy(labels_) - labels = labels_.unsqueeze(1).to(torch.float32) # B*1*160*160 - labels = F.interpolate( - labels, size=(img_size[0] // scale, img_size[1] // scale), mode="nearest" - ) # B*1*320*320 - labels = self._max_pooling(labels, scale=scale) - labels = F.interpolate(labels, size=(img_size[0], img_size[1]), mode="nearest") # B*1*640*640 - labels = labels.squeeze(1).to(torch.int32) # B*640*640 - - keys = [torch.unique(labels_[i], sorted=True) for i in range(batch_size)] - - outputs.update({"kernels": kernels.data.cpu()}) - - scales = (float(org_img_size[1]) / float(img_size[1]), float(org_img_size[0]) / float(img_size[0])) - - results = [] - for i in range(batch_size): - bboxes, scores = self.generate_bbox(keys[i], labels[i], score_maps[i], scales) - results.append({"bboxes": bboxes, "scores": scores}) - 
outputs.update({"results": results}) - - return outputs + # def get_results(self, out, img_meta, scale=2): + # org_img_size = img_meta["org_img_size"] + # img_size = img_meta["img_size"] # 640*640 + # batch_size = out.size(0) + # outputs = {} + # + # texts = F.interpolate( + # out[:, 0:1, :, :], size=(img_size[0] // scale, img_size[1] // scale), mode="nearest" + # ) # B*1*320*320 + # texts = self._max_pooling(texts, scale=scale) # B*1*320*320 + # score_maps = torch.sigmoid_(texts) # B*1*320*320~ + # score_maps = F.interpolate(score_maps, size=(img_size[0], img_size[1]), mode="nearest") # B*1*640*640 + # score_maps = score_maps.squeeze(1) # B*640*640 + # + # kernels = (out[:, 0, :, :] > 0).to(torch.uint8) # B*160*160 + # labels_ = [] + # for kernel in kernels.numpy(): + # ret, label_ = cv2.connectedComponents(kernel) + # labels_.append(label_) + # labels_ = np.array(labels_) + # labels_ = torch.from_numpy(labels_) + # labels = labels_.unsqueeze(1).to(torch.float32) # B*1*160*160 + # labels = F.interpolate( + # labels, size=(img_size[0] // scale, img_size[1] // scale), mode="nearest" + # ) # B*1*320*320 + # labels = self._max_pooling(labels, scale=scale) + # labels = F.interpolate(labels, size=(img_size[0], img_size[1]), mode="nearest") # B*1*640*640 + # labels = labels.squeeze(1).to(torch.int32) # B*640*640 + # + # keys = [torch.unique(labels_[i], sorted=True) for i in range(batch_size)] + # + # outputs.update({"kernels": kernels.data.cpu()}) + # + # scales = (float(org_img_size[1]) / float(img_size[1]), float(org_img_size[0]) / float(img_size[0])) + # + # results = [] + # for i in range(batch_size): + # bboxes, scores = self.generate_bbox(keys[i], labels[i], score_maps[i], scales) + # results.append({"bboxes": bboxes, "scores": scores}) + # outputs.update({"results": results}) + # + # return outputs def _max_pooling(self, x, scale=1): if scale == 1: @@ -659,38 +659,38 @@ def _max_pooling(self, x, scale=1): x = self.pooling_2s(x) return x - def generate_bbox(self, keys, label, score, scales): - label_num = len(keys) - bboxes = [] - scores = [] - for index in range(1, label_num): - i = keys[index] - ind = label == i - ind_np = ind.data.cpu().numpy() - points = np.array(np.where(ind_np)).transpose((1, 0)) - if points.shape[0] < self.min_area: - label[ind] = 0 - continue - score_i = score[ind].mean().item() - if score_i < self.min_score: - label[ind] = 0 - continue - - if self.bbox_type == "rect": - rect = cv2.minAreaRect(points[:, ::-1]) - alpha = math.sqrt(math.sqrt(points.shape[0] / (rect[1][0] * rect[1][1]))) - rect = (rect[0], (rect[1][0] * alpha, rect[1][1] * alpha), rect[2]) - bbox = cv2.boxPoints(rect) * scales - else: - binary = np.zeros(label.shape, dtype="uint8") - binary[ind_np] = 1 - contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) - bbox = contours[0] * scales - - bbox = bbox.astype("int32") - bboxes.append(bbox.reshape(-1).tolist()) - scores.append(score_i) - return bboxes, scores + # def generate_bbox(self, keys, label, score, scales): + # label_num = len(keys) + # bboxes = [] + # scores = [] + # for index in range(1, label_num): + # i = keys[index] + # ind = label == i + # ind_np = ind.data.cpu().numpy() + # points = np.array(np.where(ind_np)).transpose((1, 0)) + # if points.shape[0] < self.min_area: + # label[ind] = 0 + # continue + # score_i = score[ind].mean().item() + # if score_i < self.min_score: + # label[ind] = 0 + # continue + # + # if self.bbox_type == "rect": + # rect = cv2.minAreaRect(points[:, ::-1]) + # alpha = 
math.sqrt(math.sqrt(points.shape[0] / (rect[1][0] * rect[1][1]))) + # rect = (rect[0], (rect[1][0] * alpha, rect[1][1] * alpha), rect[2]) + # bbox = cv2.boxPoints(rect) * scales + # else: + # binary = np.zeros(label.shape, dtype="uint8") + # binary[ind_np] = 1 + # contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + # bbox = contours[0] * scales + # + # bbox = bbox.astype("int32") + # bboxes.append(bbox.reshape(-1).tolist()) + # scores.append(score_i) + # return bboxes, scores class FASTForImageCaptioning(PreTrainedModel): @@ -714,7 +714,7 @@ def forward(self, imgs, img_metas=None): det_out = self.det_head(f) det_out = self._upsample(det_out, imgs.size(), scale=4) - det_res = self.det_head.get_results(det_out, img_metas, scale=2) - outputs.update(det_res) + # det_res = self.det_head.get_results(det_out, img_metas, scale=2) + # outputs.update(det_res) - return outputs + return det_out From a8e4320b0666762b7e52a1d4135c293f33ed12fb Mon Sep 17 00:00:00 2001 From: raghavanone Date: Sat, 14 Oct 2023 20:17:16 +0530 Subject: [PATCH 006/152] Add tests --- .../models/fast/configuration_fast.py | 4 + src/transformers/models/fast/modeling_fast.py | 342 ++++++++++++++++-- tests/models/fast/test_modeling_fast.py | 128 +++++-- 3 files changed, 398 insertions(+), 76 deletions(-) diff --git a/src/transformers/models/fast/configuration_fast.py b/src/transformers/models/fast/configuration_fast.py index 773dbcb151c7..5b57ac482a0e 100644 --- a/src/transformers/models/fast/configuration_fast.py +++ b/src/transformers/models/fast/configuration_fast.py @@ -69,6 +69,8 @@ def __init__( min_area=250, min_score=0.88, bbox_type='rect', + loss_bg=False, + initializer_range=0.02, **kwargs, ): super().__init__(**kwargs) @@ -147,3 +149,5 @@ def __init__( self.min_area = min_area self.min_score = min_score self.bbox_type = bbox_type + self.loss_bg = loss_bg + self.initializer_range = initializer_range \ No newline at end of file diff --git a/src/transformers/models/fast/modeling_fast.py b/src/transformers/models/fast/modeling_fast.py index 4f3188819ac3..798ecba93aa2 100644 --- a/src/transformers/models/fast/modeling_fast.py +++ b/src/transformers/models/fast/modeling_fast.py @@ -1,5 +1,7 @@ import math from collections import OrderedDict +from dataclasses import dataclass +from typing import Optional, Dict import cv2 import numpy as np @@ -7,7 +9,8 @@ import torch.nn as nn import torch.nn.functional as F -from transformers import PreTrainedModel +from transformers import PreTrainedModel, FastConfig +from transformers.utils import ModelOutput def get_same_padding(kernel_size): @@ -131,10 +134,6 @@ def is_zero_layer(): return False -class FalsePreTrainedModel(PreTrainedModel): - pass - - class ConvLayer(My2DLayer): def __init__( self, @@ -403,7 +402,24 @@ def _pad_to_mxn_tensor(self, kernel): # return RepConvLayer(**config) -class TextNet(PreTrainedModel): +class FastPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
+ """ + + config_class = FastConfig + base_model_prefix = "fast" + main_input_name = "pixel_values" + + def _init_weights(self, module): + if isinstance(module, (nn.Linear, nn.Conv2d)): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + + +class TextNet(FastPreTrainedModel): def __init__(self, config): super().__init__(config) self.first_conv = ConvLayer( @@ -420,7 +436,7 @@ def __init__(self, config): config.backbone_dropout_rate, config.backbone_ops_order, ) - + self.first_conv.apply(self._init_weights) stage1 = [] for stage_config in zip( config.backbone_stage1_in_channels, @@ -469,15 +485,15 @@ def __init__(self, config): stage4.append(RepConvLayer(*stage_config)) self.stage4 = nn.ModuleList(stage4) - self._initialize_weights() - - def _initialize_weights(self): - for m in self.modules(): - if isinstance(m, nn.Conv2d): - nn.init.kaiming_normal_(m.weight) - elif isinstance(m, nn.BatchNorm2d): - m.weight.data.fill_(1) - m.bias.data.zero_() + # self._initialize_weights() + # + # def _initialize_weights(self): + # for m in self.modules(): + # if isinstance(m, nn.Conv2d): + # nn.init.kaiming_normal_(m.weight) + # elif isinstance(m, nn.BatchNorm2d): + # m.weight.data.fill_(1) + # m.bias.data.zero_() def forward(self, x): x = self.first_conv(x) @@ -502,7 +518,7 @@ def forward(self, x): return output -class FASTNeck(PreTrainedModel): +class FASTNeck(FastPreTrainedModel): def __init__(self, config): super().__init__(config) reduce_layer_configs = list( @@ -515,11 +531,13 @@ def __init__(self, config): config.neck_groups, ) ) - - self.reduce_layer1 = RepConvLayer(*reduce_layer_configs[0]) - self.reduce_layer2 = RepConvLayer(*reduce_layer_configs[1]) - self.reduce_layer3 = RepConvLayer(*reduce_layer_configs[2]) - self.reduce_layer4 = RepConvLayer(*reduce_layer_configs[3]) + self.layers_count = len(reduce_layer_configs) + for layer_ix in range(0, len(reduce_layer_configs)): + setattr(self, f"reduce_layer{layer_ix + 1}", RepConvLayer(*reduce_layer_configs[layer_ix])) + # self.reduce_layer1 = RepConvLayer(*reduce_layer_configs[0]) + # self.reduce_layer2 = RepConvLayer(*reduce_layer_configs[1]) + # self.reduce_layer3 = RepConvLayer(*reduce_layer_configs[2]) + # self.reduce_layer4 = RepConvLayer(*reduce_layer_configs[3]) self._initialize_weights() @@ -536,22 +554,22 @@ def _upsample(self, x, y): return F.upsample(x, size=(H, W), mode="bilinear") def forward(self, x): - f1, f2, f3, f4 = x + f1 = x[0] f1 = self.reduce_layer1(f1) - f2 = self.reduce_layer2(f2) - f3 = self.reduce_layer3(f3) - f4 = self.reduce_layer4(f4) - - f2 = self._upsample(f2, f1) - f3 = self._upsample(f3, f1) - f4 = self._upsample(f4, f1) - f = torch.cat((f1, f2, f3, f4), 1) + output_stages = [f1] + + for layer_ix in range(1, self.layers_count): + layer_out = getattr(self, f"reduce_layer{layer_ix + 1}")(x[layer_ix]) + layer_out = self._upsample(layer_out, f1) + output_stages.append(layer_out) + + f = torch.cat(output_stages, 1) return f -class FASTHead(nn.Module): +class FASTHead(FastPreTrainedModel): def __init__(self, config): - super(FASTHead, self).__init__() + super().__init__(config) self.conv = RepConvLayer( config.head_conv_in_channels, config.head_conv_out_channels, @@ -693,28 +711,274 @@ def _max_pooling(self, x, scale=1): # return bboxes, scores -class FASTForImageCaptioning(PreTrainedModel): +def emb_loss(emb, instance, kernel, training_mask, feature_dim=4, delta_v=0.5, delta_d=1.5, weights=(1.0, 1.0), + bg_sample=False): + training_mask = 
(training_mask > 0.5).long() + kernel = (kernel > 0.5).long() + instance = instance * training_mask + instance_kernel = (instance * kernel).view(-1) + instance = instance.view(-1) + emb = emb.view(feature_dim, -1) + + unique_labels, unique_ids = torch.unique(instance_kernel, sorted=True, return_inverse=True) + num_instance = unique_labels.size(0) + if num_instance <= 1: + return 0 + + emb_mean = emb.new_zeros((feature_dim, num_instance), dtype=torch.float32) + for i, lb in enumerate(unique_labels): + if lb == 0: + continue + ind_k = instance_kernel == lb + emb_mean[:, i] = torch.mean(emb[:, ind_k], dim=1) + + l_agg = emb.new_zeros(num_instance, dtype=torch.float32) # bug + for i, lb in enumerate(unique_labels): + if lb == 0: + continue + ind = instance == lb + emb_ = emb[:, ind] + dist = (emb_ - emb_mean[:, i:i + 1]).norm(p=2, dim=0) + dist = F.relu(dist - delta_v) ** 2 + l_agg[i] = torch.mean(torch.log(dist + 1.0)) + l_agg = torch.mean(l_agg[1:]) + + if num_instance > 2: + emb_interleave = emb_mean.permute(1, 0).repeat(num_instance, 1) + emb_band = emb_mean.permute(1, 0).repeat(1, num_instance).view(-1, feature_dim) + # print(seg_band) + + mask = (1 - torch.eye(num_instance, dtype=torch.int8)).view(-1, 1).repeat(1, feature_dim) + mask = mask.view(num_instance, num_instance, -1) + mask[0, :, :] = 0 + mask[:, 0, :] = 0 + mask = mask.view(num_instance * num_instance, -1) + # print(mask) + + dist = emb_interleave - emb_band + dist = dist[mask > 0].view(-1, feature_dim).norm(p=2, dim=1) + dist = F.relu(2 * delta_d - dist) ** 2 + l_dis = torch.mean(torch.log(dist + 1.0)) + + if bg_sample: + l_dis = [torch.log(dist + 1.0)] + emb_bg = emb[:, instance == 0].view(feature_dim, -1) + if emb_bg.size(1) > 100: + rand_ind = np.random.permutation(emb_bg.size(1))[:100] + emb_bg = emb_bg[:, rand_ind] + if emb_bg.size(1) > 0: + for i, lb in enumerate(unique_labels): + if lb == 0: + continue + dist = (emb_bg - emb_mean[:, i:i + 1]).norm(p=2, dim=0) + dist = F.relu(2 * delta_d - dist) ** 2 + l_dis_bg = torch.mean(torch.log(dist + 1.0), 0, keepdim=True) + l_dis.append(l_dis_bg) + l_dis = torch.mean(torch.cat(l_dis)) + else: + l_dis = 0 + + l_agg = weights[0] * l_agg + l_dis = weights[1] * l_dis + l_reg = torch.mean(torch.log(torch.norm(emb_mean, 2, 0) + 1.0)) * 0.001 + loss = l_agg + l_dis + l_reg + return loss + + +def emb_loss_batch(emb, instance, kernel, training_mask, reduce=True, loss_weight=0.25, bg_sample=False): + loss_batch = emb.new_zeros((emb.size(0)), dtype=torch.float32) + + for i in range(loss_batch.size(0)): + loss_batch[i] = emb_loss(emb[i], instance[i], kernel[i], training_mask[i]) + + loss_batch = loss_weight * loss_batch + + if reduce: + loss_batch = torch.mean(loss_batch) + + return loss_batch + + +def dice_loss_with_masks(input, target, mask, reduce=True): + loss_weight = 0.5 + batch_size = input.size(0) + input = torch.sigmoid(input) + + input = input.contiguous().view(batch_size, -1) + target = target.contiguous().view(batch_size, -1).float() + mask = mask.contiguous().view(batch_size, -1).float() + + input = input * mask + target = target * mask + + a = torch.sum(input * target, dim=1) + b = torch.sum(input * input, dim=1) + 0.001 + c = torch.sum(target * target, dim=1) + 0.001 + d = (2 * a) / (b + c) + loss = 1 - d + + loss = loss_weight * loss + + if reduce: + loss = torch.mean(loss) + + return loss + + +def ohem_single(score, gt_text, training_mask): + pos_num = int(torch.sum(gt_text > 0.5)) - int(torch.sum((gt_text > 0.5) & (training_mask <= 0.5))) + + if pos_num == 0: + # 
selected_mask = gt_text.copy() * 0 # may be not good + selected_mask = training_mask + selected_mask = selected_mask.view(1, selected_mask.shape[0], selected_mask.shape[1]).float() + return selected_mask + + neg_num = int(torch.sum(gt_text <= 0.5)) + neg_num = int(min(pos_num * 3, neg_num)) + + if neg_num == 0: + selected_mask = training_mask + selected_mask = selected_mask.view(1, selected_mask.shape[0], selected_mask.shape[1]).float() + return selected_mask + + neg_score = score[gt_text <= 0.5] + neg_score_sorted, _ = torch.sort(-neg_score) + threshold = -neg_score_sorted[neg_num - 1] + + selected_mask = ((score >= threshold) | (gt_text > 0.5)) & (training_mask > 0.5) + selected_mask = selected_mask.reshape(1, selected_mask.shape[0], selected_mask.shape[1]).float() + return selected_mask + + +def ohem_batch(scores, gt_texts, training_masks): + selected_masks = [] + for i in range(scores.shape[0]): + selected_masks.append(ohem_single(scores[i, :, :], gt_texts[i, :, :], training_masks[i, :, :])) + + selected_masks = torch.cat(selected_masks, 0).float() + return selected_masks + + +def iou_single(a, b, mask, n_class): + EPS = 1e-6 + valid = mask == 1 + a = a[valid] + b = b[valid] + miou = [] + for i in range(n_class): + inter = ((a == i) & (b == i)).float() + union = ((a == i) | (b == i)).float() + + miou.append(torch.sum(inter) / (torch.sum(union) + EPS)) + miou = sum(miou) / len(miou) + return miou + + +def iou(a, b, mask, n_class=2, reduce=True): + batch_size = a.size(0) + + a = a.view(batch_size, -1) + b = b.view(batch_size, -1) + mask = mask.view(batch_size, -1) + + iou = a.new_zeros((batch_size,), dtype=torch.float32) + for i in range(batch_size): + iou[i] = iou_single(a[i], b[i], mask[i], n_class) + + if reduce: + iou = torch.mean(iou) + return iou + + +@dataclass +class FASTForImageCaptioningOutput(ModelOutput): + """ + Adapted from the base class for vision model's outputs that also contains image embeddings of the pooling of the + last hidden states. This class also adds the loss term from the text decoder as well as the image-text similarity + scores. + + Args: + loss (`torch.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Languge modeling loss from the text decoder. + text_hidden (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional*): + The image hidden states. 
+ """ + + loss: Optional[torch.Tensor] = None + hidden_states: Optional[torch.FloatTensor] = None + + +class FASTForImageCaptioning(FastPreTrainedModel): + def __init__(self, config): super().__init__(config) self.backbone = TextNet(config=config) self.neck = FASTNeck(config=config) self.det_head = FASTHead(config=config) + self.loss_bg = config.loss_bg + + self.pooling_1s = nn.MaxPool2d(kernel_size=config.head_pooling_size, stride=1, + padding=(config.head_pooling_size - 1) // 2) + self.pooling_2s = nn.MaxPool2d(kernel_size=config.head_pooling_size // 2 + 1, stride=1, + padding=(config.head_pooling_size // 2) // 2) + self.post_init() def _upsample(self, x, size, scale=1): _, _, H, W = size return F.interpolate(x, size=(H // scale, W // scale), mode="bilinear") - def forward(self, imgs, img_metas=None): - outputs = {} + def _max_pooling(self, x, scale=1): + if scale == 1: + x = self.pooling_1s(x) + elif scale == 2: + x = self.pooling_2s(x) + return x + + def loss(self, hidden, labels): + gt_texts = labels['gt_texts'] + gt_kernels = labels['gt_kernels'] + training_masks = labels['training_masks'] + gt_instances = labels['gt_instances'] + + kernels = hidden[:, 0, :, :] # 4*640*640 + texts = self._max_pooling(kernels, scale=1) # 4*640*640 + embs = hidden[:, 1:, :, :] # 4*4*640*640 + + selected_masks = ohem_batch(texts, gt_texts, training_masks) + loss_text = dice_loss_with_masks(texts, gt_texts, selected_masks, reduce=False) - f = self.backbone(imgs) + selected_masks = gt_texts * training_masks + loss_kernel = dice_loss_with_masks(kernels, gt_kernels, selected_masks, reduce=False) + loss_kernel = torch.mean(loss_kernel, dim=0) + + loss_emb = emb_loss_batch(embs, gt_instances, gt_kernels, training_masks, reduce=False, bg_sample=self.loss_bg) + + return torch.mean(loss_text) + torch.mean(loss_kernel) + torch.mean(loss_emb) + + def forward(self, + pixel_values: torch.FloatTensor, + output_hidden_states: Optional[bool] = True, + return_dict: Optional[bool] = None, + labels: Dict = None + ): + # outputs = {} + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + f = self.backbone(pixel_values) f = self.neck(f) det_out = self.det_head(f) - det_out = self._upsample(det_out, imgs.size(), scale=4) + loss = None + if labels: + out = self._upsample(det_out, pixel_values.size(), scale=1) + loss = self.loss(out, labels) # det_res = self.det_head.get_results(det_out, img_metas, scale=2) # outputs.update(det_res) + det_out = self._upsample(det_out, pixel_values.size(), scale=4) + + if not return_dict: + return (loss, det_out) if loss is not None else (det_out,) - return det_out + return FASTForImageCaptioningOutput(loss, det_out) diff --git a/tests/models/fast/test_modeling_fast.py b/tests/models/fast/test_modeling_fast.py index 26d2fd8e347e..f3790cfb8300 100644 --- a/tests/models/fast/test_modeling_fast.py +++ b/tests/models/fast/test_modeling_fast.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ Testing suite for the PyTorch Falcon model. 
""" - +import inspect import unittest from parameterized import parameterized @@ -55,40 +55,40 @@ def __init__( backbone_act_func="relu", backbone_dropout_rate=0, backbone_ops_order="weight_bn_act", - backbone_stage1_in_channels=[64, 64, 64], - backbone_stage1_out_channels=[64, 64, 64], - backbone_stage1_kernel_size=[[3, 3], [3, 3], [3, 3]], - backbone_stage1_stride=[1, 2, 1], - backbone_stage1_dilation=[1, 1, 1], - backbone_stage1_groups=[1, 1, 1], - backbone_stage2_in_channels=[64, 128, 128, 128], - backbone_stage2_out_channels=[128, 128, 128, 128], - backbone_stage2_kernel_size=[[3, 3], [1, 3], [3, 3], [3, 1]], - backbone_stage2_stride=[2, 1, 1, 1], - backbone_stage2_dilation=[1, 1, 1, 1], - backbone_stage2_groups=[1, 1, 1, 1], - backbone_stage3_in_channels=[128, 256, 256, 256], - backbone_stage3_out_channels=[256, 256, 256, 256], - backbone_stage3_kernel_size=[[3, 3], [3, 3], [3, 1], [1, 3]], - backbone_stage3_stride=[2, 1, 1, 1], - backbone_stage3_dilation=[1, 1, 1, 1], - backbone_stage3_groups=[1, 1, 1, 1], - backbone_stage4_in_channels=[256, 512, 512, 512], - backbone_stage4_out_channels=[512, 512, 512, 512], - backbone_stage4_kernel_size=[[3, 3], [3, 1], [1, 3], [3, 3]], - backbone_stage4_stride=[2, 1, 1, 1], - backbone_stage4_dilation=[1, 1, 1, 1], - backbone_stage4_groups=[1, 1, 1, 1], - neck_in_channels=[64, 128, 256, 512], - neck_out_channels=[128, 128, 128, 128], - neck_kernel_size=[[3, 3], [3, 3], [3, 3], [3, 3]], - neck_stride=[1, 1, 1, 1], - neck_dilation=[1, 1, 1, 1], - neck_groups=[1, 1, 1, 1], + backbone_stage1_in_channels=[64], + backbone_stage1_out_channels=[64], + backbone_stage1_kernel_size=[[3, 3]], + backbone_stage1_stride=[1], + backbone_stage1_dilation=[1], + backbone_stage1_groups=[1], + backbone_stage2_in_channels=[64], + backbone_stage2_out_channels=[128], + backbone_stage2_kernel_size=[ [3, 1]], + backbone_stage2_stride=[2], + backbone_stage2_dilation=[1], + backbone_stage2_groups=[1], + backbone_stage3_in_channels=[128], + backbone_stage3_out_channels=[256], + backbone_stage3_kernel_size=[ [1, 3]], + backbone_stage3_stride=[2], + backbone_stage3_dilation=[1], + backbone_stage3_groups=[1], + backbone_stage4_in_channels=[256], + backbone_stage4_out_channels=[512], + backbone_stage4_kernel_size=[[3, 3]], + backbone_stage4_stride=[2], + backbone_stage4_dilation=[1], + backbone_stage4_groups=[1], + neck_in_channels=[64], + neck_out_channels=[128], + neck_kernel_size=[[3, 3]], + neck_stride=[1], + neck_dilation=[1], + neck_groups=[1], head_pooling_size=9, head_dropout_ratio=0.1, - head_conv_in_channels=512, - head_conv_out_channels=128, + head_conv_in_channels=128, + head_conv_out_channels=4, head_conv_kernel_size=[3, 3], head_conv_stride=1, head_conv_dilation=1, @@ -99,7 +99,7 @@ def __init__( head_final_groups=1, head_final_bias=False, head_final_has_shuffle=False, - head_final_in_channels=128, + head_final_in_channels=4, head_final_out_channels=5, head_final_use_bn=False, head_final_act_func=None, @@ -199,7 +199,7 @@ def prepare_config_and_inputs(self): # config = self.get_config() - return config, {"imgs": pixel_values, "img_metas": pixel_values_meta} + return config, {"pixel_values": pixel_values} def get_config(self): return FastConfig( @@ -271,8 +271,8 @@ def create_and_check_model(self, config, input): model = FASTForImageCaptioning(config=config) model.to(torch_device) model.eval() - result = model(imgs=input['imgs'], imgs_mets=input['img_metas']) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, 
self.hidden_size))
+        result = model(pixel_values=input['pixel_values'])
+        self.parent.assertEqual(result.hidden_states.shape, (self.batch_size, 5, 125, 125))

     def prepare_config_and_inputs_for_common(self):
         config_and_inputs = self.prepare_config_and_inputs()
@@ -325,3 +325,57 @@ def test_inputs_embeds(self):
     @unittest.skip(reason="Fast does not support input and output embeddings")
     def test_model_common_attributes(self):
         pass
+
+    @unittest.skip(reason="Fast is not a generative model")
+    def test_generate_without_input_ids(self):
+        pass
+
+    @unittest.skip(reason="Fast does not have any hidden_states")
+    def test_hidden_states_output(self):
+        pass
+
+    @unittest.skip(reason="Fast does not have any attention")
+    def test_retain_grad_hidden_states_attentions(self):
+        pass
+
+    def test_forward_signature(self):
+        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            signature = inspect.signature(model.forward)
+            # signature.parameters is an OrderedDict => so arg_names order is deterministic
+            arg_names = [*signature.parameters.keys()]
+
+            expected_arg_names = ["pixel_values"]
+            self.assertListEqual(arg_names[:1], expected_arg_names)
+
+    def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
+        to_return = inputs_dict.copy()
+        gt_instances = torch.zeros(self.model_tester.batch_size, self.model_tester.image_size,
+                                   self.model_tester.image_size)
+        gt_kernels = torch.zeros(self.model_tester.batch_size, self.model_tester.image_size,
+                                 self.model_tester.image_size)
+        gt_text = torch.zeros(self.model_tester.batch_size, self.model_tester.image_size, self.model_tester.image_size)
+        training_masks = torch.ones(self.model_tester.batch_size, self.model_tester.image_size,
+                                    self.model_tester.image_size)
+        labels = {}
+        labels["gt_instances"] = gt_instances
+        labels["gt_kernels"] = gt_kernels
+        labels["gt_texts"] = gt_text
+        labels["training_masks"] = training_masks
+
+        to_return["labels"] = labels
+
+        return to_return
+
+    def test_model_is_small(self):
+        # Just a consistency check to make sure we are not running tests on 80M parameter models.
+        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            num_params = model.num_parameters()
+            assert (
+                num_params < 3000000
+            ), f"{model_class} is too big for the common tests ({num_params})! It should have 3M max."
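As a rough usage sketch (not part of the patch series itself), the pieces added so far can be exercised end to end. It assumes the public exports used by the conversion script in the next commit (`FastConfig`, `FASTForImageCaptioning`), the default `FastConfig` hyperparameters, and the labels layout built in `_prepare_for_class` above, so treat the exact output shape as illustrative rather than guaranteed:

    import torch

    from transformers import FastConfig, FASTForImageCaptioning

    config = FastConfig()  # defaults from configuration_fast.py
    model = FASTForImageCaptioning(config)
    model.eval()

    pixel_values = torch.rand(1, 3, 640, 640)
    with torch.no_grad():
        outputs = model(pixel_values=pixel_values)

    # The detection head output is upsampled to a quarter of the input resolution,
    # which is what the (batch_size, 5, 125, 125) assertion above checks for 500x500 inputs.
    print(outputs.hidden_states.shape)

    # Passing the labels dict built as in _prepare_for_class returns the combined
    # text / kernel / embedding loss as outputs.loss.
    labels = {
        "gt_texts": torch.zeros(1, 640, 640),
        "gt_kernels": torch.zeros(1, 640, 640),
        "gt_instances": torch.zeros(1, 640, 640),
        "training_masks": torch.ones(1, 640, 640),
    }
    outputs = model(pixel_values=pixel_values, labels=labels)
    print(outputs.loss)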
From 3b15aa97e791e0beda5cb20a46a4c0dae8caf210 Mon Sep 17 00:00:00 2001 From: raghavanone Date: Sun, 15 Oct 2023 11:49:51 +0530 Subject: [PATCH 007/152] Add conversion script --- .../models/fast/configuration_fast.py | 2 +- .../fast/convert_fast_original_to_pytorch.py | 256 ++++++++++++++++++ 2 files changed, 257 insertions(+), 1 deletion(-) create mode 100644 src/transformers/models/fast/convert_fast_original_to_pytorch.py diff --git a/src/transformers/models/fast/configuration_fast.py b/src/transformers/models/fast/configuration_fast.py index 5b57ac482a0e..ad72054b5cee 100644 --- a/src/transformers/models/fast/configuration_fast.py +++ b/src/transformers/models/fast/configuration_fast.py @@ -47,7 +47,7 @@ def __init__( neck_dilation=[1, 1, 1, 1], neck_groups=[1, 1, 1, 1], head_pooling_size=9, - head_dropout_ratio=0.1, + head_dropout_ratio=0, head_conv_in_channels=512, head_conv_out_channels=128, head_conv_kernel_size=[3, 3], diff --git a/src/transformers/models/fast/convert_fast_original_to_pytorch.py b/src/transformers/models/fast/convert_fast_original_to_pytorch.py new file mode 100644 index 000000000000..24f0c3dd56e0 --- /dev/null +++ b/src/transformers/models/fast/convert_fast_original_to_pytorch.py @@ -0,0 +1,256 @@ +# coding=utf-8 +# Copyright 2023 Microsoft Research and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
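The entry point of this script is `convert_fast_checkpoint`, defined at the bottom of the file and wired to argparse under `__main__`. A hypothetical call is sketched here; the argument names follow the function signature below, the URL and output-path values are placeholders, and at this stage the argparse defaults still point at BEiT checkpoint URLs:

    from transformers.models.fast.convert_fast_original_to_pytorch import convert_fast_checkpoint

    convert_fast_checkpoint(
        checkpoint_url="<URL of the original FAST .pth checkpoint>",
        checkpoint_config_url="<URL of the matching original FAST config .py file>",
        pytorch_dump_folder_path="./fast-converted",
        validate_logits=False,
    )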
+ +import argparse +import copy +import json + +import numpy as np +import pandas as pd +import requests +import torch +from huggingface_hub import hf_hub_download +from PIL import Image + +from transformers import ( + FastConfig, + FASTForImageCaptioning +) +from transformers.utils.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD + +tiny_config_url = "https://raw.githubusercontent.com/czczup/FAST/main/config/fast/nas-configs/fast_tiny.config" +small_config_url = "https://raw.githubusercontent.com/czczup/FAST/main/config/fast/nas-configs/fast_small.config" +base_config_url = "https://raw.githubusercontent.com/czczup/FAST/main/config/fast/nas-configs/fast_base.config" + +rename_key_mappings = { + "head": "classifier", + "text_embed": "text_embedding", + "vision_embed": "vision_embedding", + "k_proj": "key_proj", + "q_proj": "query_proj", + "v_proj": "value_proj", + "A": "text", + "B": "image", + "layer_norm": "fc_norm", + "self_attn_fc_norm": "self_attn_layer_norm", + "final_fc_norm": "final_layer_norm", + "first": "first", +} + + +def prepare_img(): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + im = Image.open(requests.get(url, stream=True).raw) + return im + + +def prepare_config(size_config_url, pooling_size, min_area, min_score, bbox_type, loss_bg): + config_dict = json.loads(requests.get(size_config_url).text) + + backbone_config = {} + for stage_ix in range(1, 5): + stage_config = config_dict[f'stage{stage_ix}'] + + merged_dict = {} + + # Iterate through the list of dictionaries + for layer in stage_config: + for key, value in layer.items(): + if key != "name": + # Check if the key is already in the merged_dict + if key in merged_dict: + merged_dict[key].append(value) + else: + # If the key is not in merged_dict, create a new list with the value + merged_dict[key] = [value] + backbone_config[f'stage{stage_ix}'] = merged_dict + + neck_in_channels = [] + neck_out_channels = [] + neck_kernel_size = [] + neck_stride = [] + neck_dilation = [] + neck_groups = [] + + for i in range(1, 5): + layer_key = f"reduce_layer{i}" + layer_dict = config_dict['neck'].get(layer_key) + + if layer_dict: + # Append values to the corresponding lists + neck_in_channels.append(layer_dict["in_channels"]) + neck_out_channels.append(layer_dict["out_channels"]) + neck_kernel_size.append(layer_dict["kernel_size"]) + neck_stride.append(layer_dict["stride"]) + neck_dilation.append(layer_dict["dilation"]) + neck_groups.append(layer_dict["groups"]) + + return FastConfig( + backbone_kernel_size=config_dict["first_conv"]["kernel_size"], + backbone_stride=config_dict["first_conv"]["stride"], + backbone_dilation=config_dict["first_conv"]["dilation"], + backbone_groups=config_dict["first_conv"]["groups"], + backbone_bias=config_dict["first_conv"]["bias"], + backbone_has_shuffle=config_dict["first_conv"]["has_shuffle"], + backbone_in_channels=config_dict["first_conv"]["in_channels"], + backbone_out_channels=config_dict["first_conv"]["out_channels"], + backbone_use_bn=config_dict["first_conv"]["use_bn"], + backbone_act_func=config_dict["first_conv"]["act_func"], + backbone_dropout_rate=config_dict["first_conv"]["dropout_rate"], + backbone_ops_order=config_dict["first_conv"]["ops_order"], + + backbone_stage1_in_channels=backbone_config['stage1']['in_channels'], + backbone_stage1_out_channels=backbone_config['stage1']['out_channels'], + backbone_stage1_kernel_size=backbone_config['stage1']['kernel_size'], + backbone_stage1_stride=backbone_config['stage1']['stride'], + 
backbone_stage1_dilation=backbone_config['stage1']['dilation'], + backbone_stage1_groups=backbone_config['stage1']['groups'], + + backbone_stage2_in_channels=backbone_config['stage2']['in_channels'], + backbone_stage2_out_channels=backbone_config['stage2']['out_channels'], + backbone_stage2_kernel_size=backbone_config['stage2']['kernel_size'], + backbone_stage2_stride=backbone_config['stage2']['stride'], + backbone_stage2_dilation=backbone_config['stage2']['dilation'], + backbone_stage2_groups=backbone_config['stage2']['groups'], + + backbone_stage3_in_channels=backbone_config['stage3']['in_channels'], + backbone_stage3_out_channels=backbone_config['stage3']['out_channels'], + backbone_stage3_kernel_size=backbone_config['stage3']['kernel_size'], + backbone_stage3_stride=backbone_config['stage3']['stride'], + backbone_stage3_dilation=backbone_config['stage3']['dilation'], + backbone_stage3_groups=backbone_config['stage3']['groups'], + + backbone_stage4_in_channels=backbone_config['stage4']['in_channels'], + backbone_stage4_out_channels=backbone_config['stage4']['out_channels'], + backbone_stage4_kernel_size=backbone_config['stage4']['kernel_size'], + backbone_stage4_stride=backbone_config['stage4']['stride'], + backbone_stage4_dilation=backbone_config['stage4']['dilation'], + backbone_stage4_groups=backbone_config['stage4']['groups'], + + neck_in_channels=neck_in_channels, + neck_out_channels=neck_out_channels, + neck_kernel_size=neck_kernel_size, + neck_stride=neck_stride, + neck_dilation=neck_dilation, + neck_groups=neck_groups, + + head_pooling_size=pooling_size, + head_dropout_ratio=0.1, + head_conv_in_channels=config_dict['head']['conv']['in_channels'], + head_conv_out_channels=config_dict['head']['conv']['out_channels'], + head_conv_kernel_size=config_dict['head']['conv']['kernel_size'], + head_conv_stride=config_dict['head']['conv']['stride'], + head_conv_dilation=config_dict['head']['conv']['dilation'], + head_conv_groups=config_dict['head']['conv']['groups'], + + head_final_kernel_size=config_dict['head']['final']['kernel_size'], + head_final_stride=config_dict['head']['final']['stride'], + head_final_dilation=config_dict['head']['final']['dilation'], + head_final_groups=config_dict['head']['final']['groups'], + head_final_bias=config_dict['head']['final']['bias'], + head_final_has_shuffle=config_dict['head']['final']['has_shuffle'], + head_final_in_channels=config_dict['head']['final']['in_channels'], + head_final_out_channels=config_dict['head']['final']['out_channels'], + head_final_use_bn=config_dict['head']['final']['use_bn'], + head_final_act_func=config_dict['head']['final']['act_func'], + head_final_dropout_rate=config_dict['head']['final']['dropout_rate'], + head_final_ops_order=config_dict['head']['final']['ops_order'], + + min_area=min_area, + min_score=min_score, + bbox_type=bbox_type, + loss_bg=loss_bg, + ) + + +def get_small_model_config(): + pass + + +def get_base_model_config(): + pass + + +def convert_fast_checkpoint(checkpoint_url, checkpoint_config_url, pytorch_dump_folder_path, validate_logits): + response = requests.get(checkpoint_config_url) + content = response.text + + namespace = {} + + exec(content, namespace) + + model_config = namespace.get('model') + test_config = namespace.get('test_cfg', None) + + min_score = 0.88 + min_area = 250 + bbox_type = 'rect' + loss_bg = False + if test_config is not None: + min_area = test_config.get('min_area', min_area) + min_score = test_config.get('min_area', min_score) + bbox_type = test_config.get('min_area', 
bbox_type) + loss_bg = test_config.get('loss_emb', None) == "EmbLoss_v2" + + if 'tiny' in model_config['backbone']['config']: + config = prepare_config(tiny_config_url, model_config['detection_head']['pooling_size'], + min_area, min_score, bbox_type, loss_bg) + elif 'small' in model_config['backbone']['config']: + config = prepare_config(small_config_url, model_config['detection_head']['pooling_size'], + min_area, min_score, bbox_type, loss_bg) + else: + config = prepare_config(base_config_url, model_config['detection_head']['pooling_size'], + min_area, min_score, bbox_type, loss_bg) + + model = FASTForImageCaptioning(config) + state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu", check_hash=True)['ema'] + state_dict_changed = copy.deepcopy(state_dict) + for key in state_dict: + val = state_dict_changed.pop(key) + state_dict_changed[key.replace('module.', '')] = val + model.load_state_dict(state_dict_changed) + + model.save_pretrained(pytorch_dump_folder_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument( + "--checkpoint_url", + default="https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_base_patch16_224_pt22k_ft22kto1k.pth", + type=str, + help="URL to the original PyTorch checkpoint (.pth file).", + ) + parser.add_argument( + "--checkpoint_config_url", + default="https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_base_patch16_224_pt22k_ft22kto1k.pth", + type=str, + help="URL to the original PyTorch checkpoint (.pth file).", + ) + parser.add_argument( + "--pytorch_dump_folder_path", default=None, type=str, help="Path to the folder to output PyTorch model." + ) + parser.add_argument( + "--validate_logits", + default=False, + type=bool, + help="whether to assert logits outputs", + ) + args = parser.parse_args() + + convert_fast_checkpoint( + args.checkpoint_url, args.checkpoint_config_url, args.pytorch_dump_folder_path, args.validate_logits + ) From c565cf334950f8d42a1e5ec61d880c6e0eb46d8b Mon Sep 17 00:00:00 2001 From: raghavanone Date: Fri, 20 Oct 2023 20:47:02 +0530 Subject: [PATCH 008/152] Add conversion scripts, integration tests, image processor --- .../fast/convert_fast_original_to_pytorch.py | 16 +- .../models/fast/image_processing_fast.py | 603 ++++++++++++++++++ src/transformers/models/fast/modeling_fast.py | 236 ++++--- tests/models/fast/test_modeling_fast.py | 66 +- 4 files changed, 789 insertions(+), 132 deletions(-) create mode 100644 src/transformers/models/fast/image_processing_fast.py diff --git a/src/transformers/models/fast/convert_fast_original_to_pytorch.py b/src/transformers/models/fast/convert_fast_original_to_pytorch.py index 24f0c3dd56e0..7ef78a312080 100644 --- a/src/transformers/models/fast/convert_fast_original_to_pytorch.py +++ b/src/transformers/models/fast/convert_fast_original_to_pytorch.py @@ -16,6 +16,7 @@ import argparse import copy import json +import logging import numpy as np import pandas as pd @@ -28,6 +29,7 @@ FastConfig, FASTForImageCaptioning ) +from transformers.models.fast.image_processing_fast import FastImageProcessor from transformers.utils.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD tiny_config_url = "https://raw.githubusercontent.com/czczup/FAST/main/config/fast/nas-configs/fast_tiny.config" @@ -193,6 +195,7 @@ def convert_fast_checkpoint(checkpoint_url, checkpoint_config_url, pytorch_dump_ model_config = namespace.get('model') test_config = namespace.get('test_cfg', None) + data_config = namespace.get('data') 
min_score = 0.88 min_area = 250 @@ -200,8 +203,8 @@ def convert_fast_checkpoint(checkpoint_url, checkpoint_config_url, pytorch_dump_ loss_bg = False if test_config is not None: min_area = test_config.get('min_area', min_area) - min_score = test_config.get('min_area', min_score) - bbox_type = test_config.get('min_area', bbox_type) + min_score = test_config.get('min_score', min_score) + bbox_type = test_config.get('bbox_type', bbox_type) loss_bg = test_config.get('loss_emb', None) == "EmbLoss_v2" if 'tiny' in model_config['backbone']['config']: @@ -213,8 +216,15 @@ def convert_fast_checkpoint(checkpoint_url, checkpoint_config_url, pytorch_dump_ else: config = prepare_config(base_config_url, model_config['detection_head']['pooling_size'], min_area, min_score, bbox_type, loss_bg) + size = 640 + if "train" in data_config: + if "short_size" in data_config['train']: + size = data_config['train']['short_size'] model = FASTForImageCaptioning(config) + fast_image_processor = FastImageProcessor(size={'height': size, 'width': size}, min_score=config.min_score, + min_area=config.min_area, + bbox_type=config.bbox_type, pooling_size=config.head_pooling_size) state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu", check_hash=True)['ema'] state_dict_changed = copy.deepcopy(state_dict) for key in state_dict: @@ -223,6 +233,8 @@ def convert_fast_checkpoint(checkpoint_url, checkpoint_config_url, pytorch_dump_ model.load_state_dict(state_dict_changed) model.save_pretrained(pytorch_dump_folder_path) + fast_image_processor.save_pretrained(pytorch_dump_folder_path) + logging.info("The converted weights are save here : " + pytorch_dump_folder_path) if __name__ == "__main__": diff --git a/src/transformers/models/fast/image_processing_fast.py b/src/transformers/models/fast/image_processing_fast.py new file mode 100644 index 000000000000..637aea38e086 --- /dev/null +++ b/src/transformers/models/fast/image_processing_fast.py @@ -0,0 +1,603 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
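
Once the conversion has run, the saved artifacts can be loaded back and exercised end to end, mirroring the integration tests added later in this series. A minimal sketch, assuming a local dump folder and an arbitrary test image URL (both placeholders):

import requests
import torch
from PIL import Image

from transformers import FASTForImageCaptioning
from transformers.models.fast.image_processing_fast import FastImageProcessor

# "./fast-tiny-converted" stands in for the folder produced by the conversion script.
model = FASTForImageCaptioning.from_pretrained("./fast-tiny-converted")
image_processor = FastImageProcessor.from_pretrained("./fast-tiny-converted")

image = Image.open(requests.get("https://example.com/scene_text.jpg", stream=True).raw).convert("RGB")
inputs = image_processor(image, return_tensors="pt")

with torch.no_grad():
    outputs = model(pixel_values=inputs["pixel_values"])

# get_results maps the text-kernel predictions back to boxes in the coordinate
# system given by target_sizes (here, the original image height and width).
results = image_processor.get_results(outputs, target_sizes=[(image.height, image.width)])
print(results[0]["bboxes"], results[0]["scores"])
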
+"""Image processor class for Beit.""" +import math +import warnings +from typing import Any, Dict, List, Optional, Tuple, Union +import torch.nn.functional as F +import torch.nn as nn + +import cv2 +import numpy as np + +from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +from ...image_transforms import resize, to_channel_dimension_format +from ...image_utils import ( + IMAGENET_STANDARD_MEAN, + IMAGENET_STANDARD_STD, + ChannelDimension, + ImageInput, + PILImageResampling, + infer_channel_dimension_format, + is_scaled_image, + make_list_of_images, + to_numpy_array, + valid_images, +) +from ...utils import TensorType, is_torch_available, is_torch_tensor, is_vision_available, logging, \ + IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD + +if is_vision_available(): + import PIL + +if is_torch_available(): + import torch + +logger = logging.get_logger(__name__) + + +class FastImageProcessor(BaseImageProcessor): + r""" + Constructs a BEiT image processor. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the + `do_resize` parameter in the `preprocess` method. + size (`Dict[str, int]` *optional*, defaults to `{"height": 256, "width": 256}`): + Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess` + method. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): + Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the + `preprocess` method. + do_center_crop (`bool`, *optional*, defaults to `True`): + Whether to center crop the image. If the input size is smaller than `crop_size` along any edge, the image + is padded with 0's and then center cropped. Can be overridden by the `do_center_crop` parameter in the + `preprocess` method. + crop_size (`Dict[str, int]`, *optional*, defaults to `{"height": 224, "width": 224}`): + Desired output size when applying center-cropping. Only has an effect if `do_center_crop` is set to `True`. + Can be overridden by the `crop_size` parameter in the `preprocess` method. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale` + parameter in the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the + `preprocess` method. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess` + method. + image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`): + The mean to use if normalizing the image. This is a float or list of floats of length of the number of + channels of the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`): + The standard deviation to use if normalizing the image. This is a float or list of floats of length of the + number of channels of the image. Can be overridden by the `image_std` parameter in the `preprocess` method. + do_reduce_labels (`bool`, *optional*, defaults to `False`): + Whether or not to reduce all label values of segmentation maps by 1. 
Usually used for datasets where 0 is + used for background, and background itself is not included in all classes of a dataset (e.g. ADE20k). The + background label will be replaced by 255. Can be overridden by the `do_reduce_labels` parameter in the + `preprocess` method. + """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + do_resize: bool = True, + size: Dict[str, int] = None, + resample: PILImageResampling = PILImageResampling.BICUBIC, + do_center_crop: bool = False, + crop_size: Dict[str, int] = None, + rescale_factor: Union[int, float] = 1 / 255, + do_rescale: bool = True, + do_normalize: bool = True, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_reduce_labels: bool = False, + min_area: int = 10, + min_score: float = 0.88, + bbox_type: str = "rect", + pooling_size: int = 9, + **kwargs, + ) -> None: + if "reduce_labels" in kwargs: + warnings.warn( + "The `reduce_labels` parameter is deprecated and will be removed in a future version. Please use" + " `do_reduce_labels` instead.", + FutureWarning, + ) + do_reduce_labels = kwargs.pop("reduce_labels") + super().__init__(**kwargs) + size = size if size is not None else {"height": 640, "width": 640} + size = get_size_dict(size) + crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224} + crop_size = get_size_dict(crop_size, param_name="crop_size") + self.do_resize = do_resize + self.size = size + self.resample = resample + self.do_center_crop = do_center_crop + self.crop_size = crop_size + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN + self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD + self.do_reduce_labels = do_reduce_labels + self.min_area = min_area + self.min_score = min_score + self.bbox_type = bbox_type + self.pooling_size = pooling_size + + @classmethod + def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs): + """ + Overrides the `from_dict` method from the base class to make sure `reduce_labels` is updated if image processor + is created using from_dict and kwargs e.g. `BeitImageProcessor.from_pretrained(checkpoint, reduce_labels=True)` + """ + image_processor_dict = image_processor_dict.copy() + if "reduce_labels" in kwargs: + image_processor_dict["reduce_labels"] = kwargs.pop("reduce_labels") + return super().from_dict(image_processor_dict, **kwargs) + + def resize( + self, + image: np.ndarray, + size: Dict[str, int], + resample: PILImageResampling = PILImageResampling.BICUBIC, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: + """ + Resize an image to (size["height"], size["width"]). + + Args: + image (`np.ndarray`): + Image to resize. + size (`Dict[str, int]`): + Size of the output image. + resample (`PILImageResampling`, *optional*, defaults to `PIL.Image.BICUBIC`): + Resampling filter to use when resiizing the image. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + input_data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred. 
+ """ + size = get_size_dict(size, default_to_square=True, param_name="size") + if "height" not in size or "width" not in size: + raise ValueError(f"The `size` argument must contain `height` and `width` keys. Got {size.keys()}") + return resize( + image, + size=(size["height"], size["width"]), + resample=resample, + data_format=data_format, + input_data_format=input_data_format, + **kwargs, + ) + + def reduce_label(self, label: ImageInput) -> np.ndarray: + label = to_numpy_array(label) + # Avoid using underflow conversion + label[label == 0] = 255 + label = label - 1 + label[label == 254] = 255 + return label + + def _preprocess( + self, + image: ImageInput, + do_reduce_labels: bool = None, + do_resize: bool = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + do_center_crop: bool = None, + crop_size: Dict[str, int] = None, + do_rescale: bool = None, + rescale_factor: float = None, + do_normalize: bool = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ): + if do_reduce_labels: + image = self.reduce_label(image) + + if do_resize: + image = self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format) + + if do_center_crop: + image = self.center_crop(image=image, size=crop_size, input_data_format=input_data_format) + + if do_rescale: + image = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) + + if do_normalize: + image = self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format) + + return image + + def _preprocess_image( + self, + image: ImageInput, + do_resize: bool = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + do_center_crop: bool = None, + crop_size: Dict[str, int] = None, + do_rescale: bool = None, + rescale_factor: float = None, + do_normalize: bool = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> np.ndarray: + """Preprocesses a single image.""" + # All transformations expect numpy arrays. + image = to_numpy_array(image) + if is_scaled_image(image) and do_rescale: + logger.warning_once( + "It looks like you are trying to rescale already rescaled images. If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." 
+ ) + if input_data_format is None: + input_data_format = infer_channel_dimension_format(image) + image = self._preprocess( + image, + do_reduce_labels=False, + do_resize=do_resize, + size=size, + resample=resample, + do_center_crop=do_center_crop, + crop_size=crop_size, + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + input_data_format=input_data_format, + ) + if data_format is not None: + image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) + return image + + def _preprocess_segmentation_map( + self, + segmentation_map: ImageInput, + do_resize: bool = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + do_center_crop: bool = None, + crop_size: Dict[str, int] = None, + do_reduce_labels: bool = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ): + """Preprocesses a single segmentation map.""" + # All transformations expect numpy arrays. + segmentation_map = to_numpy_array(segmentation_map) + # Add an axis to the segmentation maps for transformations. + if segmentation_map.ndim == 2: + segmentation_map = segmentation_map[None, ...] + added_dimension = True + input_data_format = ChannelDimension.FIRST + else: + added_dimension = False + if input_data_format is None: + input_data_format = infer_channel_dimension_format(segmentation_map, num_channels=1) + segmentation_map = self._preprocess( + image=segmentation_map, + do_reduce_labels=do_reduce_labels, + do_resize=do_resize, + resample=resample, + size=size, + do_center_crop=do_center_crop, + crop_size=crop_size, + do_normalize=False, + do_rescale=False, + input_data_format=ChannelDimension.FIRST, + ) + # Remove extra axis if added + if added_dimension: + segmentation_map = np.squeeze(segmentation_map, axis=0) + segmentation_map = segmentation_map.astype(np.int64) + return segmentation_map + + def __call__(self, images, segmentation_maps=None, **kwargs): + # Overrides the `__call__` method of the `Preprocessor` class such that the images and segmentation maps can both + # be passed in as positional arguments. + return super().__call__(images, segmentation_maps=segmentation_maps, **kwargs) + + def preprocess( + self, + images: ImageInput, + segmentation_maps: Optional[ImageInput] = None, + do_resize: bool = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + do_center_crop: bool = None, + crop_size: Dict[str, int] = None, + do_rescale: bool = None, + rescale_factor: float = None, + do_normalize: bool = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_reduce_labels: Optional[bool] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: ChannelDimension = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> PIL.Image.Image: + """ + Preprocess an image or batch of images. + + Args: + images (`ImageInput`): + Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If + passing in images with pixel values between 0 and 1, set `do_rescale=False`. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`Dict[str, int]`, *optional*, defaults to `self.size`): + Size of the image after resizing. + resample (`int`, *optional*, defaults to `self.resample`): + Resampling filter to use if resizing the image. 
This can be one of the enum `PILImageResampling`, Only + has an effect if `do_resize` is set to `True`. + do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`): + Whether to center crop the image. + crop_size (`Dict[str, int]`, *optional*, defaults to `self.crop_size`): + Size of the image after center crop. If one edge the image is smaller than `crop_size`, it will be + padded with zeros and then cropped + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image values between [0 - 1]. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. + image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): + Image mean. + image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation. + do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`): + Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0 + is used for background, and background itself is not included in all classes of a dataset (e.g. + ADE20k). The background label will be replaced by 255. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: Use the channel dimension format of the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. 
+ """ + do_resize = do_resize if do_resize is not None else self.do_resize + size = size if size is not None else self.size + size = get_size_dict(size, default_to_square=True, param_name="size") + resample = resample if resample is not None else self.resample + do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop + crop_size = crop_size if crop_size is not None else self.crop_size + crop_size = get_size_dict(crop_size, default_to_square=True, param_name="crop_size") + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + do_reduce_labels = do_reduce_labels if do_reduce_labels is not None else self.do_reduce_labels + + images = make_list_of_images(images) + if segmentation_maps is not None: + segmentation_maps = make_list_of_images(segmentation_maps, expected_ndims=2) + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + + if segmentation_maps is not None and not valid_images(segmentation_maps): + raise ValueError( + "Invalid segmentation map type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + + if do_resize and size is None or resample is None: + raise ValueError("Size and resample must be specified if do_resize is True.") + + if do_center_crop and crop_size is None: + raise ValueError("Crop size must be specified if do_center_crop is True.") + + if do_rescale and rescale_factor is None: + raise ValueError("Rescale factor must be specified if do_rescale is True.") + + if do_normalize and (image_mean is None or image_std is None): + raise ValueError("Image mean and std must be specified if do_normalize is True.") + + images = [ + self._preprocess_image( + image=img, + do_resize=do_resize, + do_center_crop=do_center_crop, + do_rescale=do_rescale, + do_normalize=do_normalize, + resample=resample, + size=size, + rescale_factor=rescale_factor, + crop_size=crop_size, + image_mean=image_mean, + image_std=image_std, + data_format=data_format, + input_data_format=input_data_format, + ) + for img in images + ] + + data = {"pixel_values": images} + + if segmentation_maps is not None: + segmentation_maps = [ + self._preprocess_segmentation_map( + segmentation_map=segmentation_map, + do_reduce_labels=do_reduce_labels, + do_resize=do_resize, + resample=resample, + size=size, + do_center_crop=do_center_crop, + crop_size=crop_size, + ) + for segmentation_map in segmentation_maps + ] + data["labels"] = segmentation_maps + + return BatchFeature(data=data, tensor_type=return_tensors) + + def post_process_semantic_segmentation(self, outputs, target_sizes: List[Tuple] = None): + """ + Converts the output of [`BeitForSemanticSegmentation`] into semantic segmentation maps. Only supports PyTorch. + + Args: + outputs ([`BeitForSemanticSegmentation`]): + Raw outputs of the model. + target_sizes (`List[Tuple]` of length `batch_size`, *optional*): + List of tuples corresponding to the requested final size (height, width) of each prediction. If unset, + predictions will not be resized. 
+ + Returns: + semantic_segmentation: `List[torch.Tensor]` of length `batch_size`, where each item is a semantic + segmentation map of shape (height, width) corresponding to the target_sizes entry (if `target_sizes` is + specified). Each entry of each `torch.Tensor` correspond to a semantic class id. + """ + # TODO: add support for other frameworks + logits = outputs.logits + + # Resize logits and compute semantic segmentation maps + if target_sizes is not None: + if len(logits) != len(target_sizes): + raise ValueError( + "Make sure that you pass in as many target sizes as the batch dimension of the logits" + ) + + if is_torch_tensor(target_sizes): + target_sizes = target_sizes.numpy() + + semantic_segmentation = [] + + for idx in range(len(logits)): + resized_logits = torch.nn.functional.interpolate( + logits[idx].unsqueeze(dim=0), size=target_sizes[idx], mode="bilinear", align_corners=False + ) + semantic_map = resized_logits[0].argmax(dim=0) + semantic_segmentation.append(semantic_map) + else: + semantic_segmentation = logits.argmax(dim=1) + semantic_segmentation = [semantic_segmentation[i] for i in range(semantic_segmentation.shape[0])] + + return semantic_segmentation + + def _max_pooling(self, x, scale=1): + if scale == 1: + x = nn.MaxPool2d(kernel_size=self.pooling_size, stride=1, + padding=(self.pooling_size - 1) // 2)(x) + elif scale == 2: + x = nn.MaxPool2d(kernel_size=self.pooling_size // 2 + 1, stride=1, + padding=(self.pooling_size // 2) // 2)(x) + return x + + def get_results(self, output, target_sizes): + scale = 2 + img_size = (self.size['height'], self.size['width']) + out = output['hidden_states'] + batch_size = out.size(0) + final_results = dict() + + texts = F.interpolate(out[:, 0:1, :, :], size=(img_size[0] // scale, img_size[1] // scale), + mode='nearest') # B*1*320*320 + texts = self._max_pooling(texts, scale=scale) # B*1*320*320 + score_maps = torch.sigmoid_(texts) # B*1*320*320 + score_maps = F.interpolate(score_maps, size=(img_size[0], img_size[1]), mode='nearest') # B*1*640*640 + score_maps = score_maps.squeeze(1) # B*640*640 + + kernels = (out[:, 0, :, :] > 0).to(torch.uint8) # B*160*160 + labels_ = [] + for kernel in kernels.numpy(): + ret, label_ = cv2.connectedComponents(kernel) + labels_.append(label_) + labels_ = np.array(labels_) + labels_ = torch.from_numpy(labels_) + labels = labels_.unsqueeze(1).to(torch.float32) # B*1*160*160 + labels = F.interpolate(labels, size=(img_size[0] // scale, img_size[1] // scale), mode='nearest') # B*1*320*320 + labels = self._max_pooling(labels, scale=scale) + labels = F.interpolate(labels, size=(img_size[0], img_size[1]), mode='nearest') # B*1*640*640 + labels = labels.squeeze(1).to(torch.int32) # B*640*640 + + keys = [torch.unique(labels_[i], sorted=True) for i in range(batch_size)] + + final_results.update(dict(kernels=kernels.data.cpu())) + + results = [] + for i in range(batch_size): + org_img_size = target_sizes[i] + scales = (float(org_img_size[1]) / float(img_size[1]), + float(org_img_size[0]) / float(img_size[0])) + + bboxes, scores = self.generate_bbox(keys[i], labels[i], score_maps[i], scales) + results.append(dict( + bboxes=bboxes, + scores=scores + )) + final_results.update(dict(results=results)) + + return results + + def generate_bbox(self, keys, label, score, scales): + label_num = len(keys) + bboxes = [] + scores = [] + for index in range(1, label_num): + i = keys[index] + ind = (label == i) + ind_np = ind.data.cpu().numpy() + points = np.array(np.where(ind_np)).transpose((1, 0)) + if points.shape[0] < 
self.min_area: + label[ind] = 0 + continue + score_i = score[ind].mean().item() + if score_i < self.min_score: + label[ind] = 0 + continue + + if self.bbox_type == 'rect': + rect = cv2.minAreaRect(points[:, ::-1]) + alpha = math.sqrt(math.sqrt(points.shape[0] / (rect[1][0] * rect[1][1]))) + rect = (rect[0], (rect[1][0] * alpha, rect[1][1] * alpha), rect[2]) + bbox = cv2.boxPoints(rect) * scales + + elif self.bbox_type == 'poly': + binary = np.zeros(label.shape, dtype='uint8') + binary[ind_np] = 1 + contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + bbox = contours[0] * scales + bbox = bbox.astype('int32') + bboxes.append(bbox.reshape(-1).tolist()) + scores.append(score_i) + return bboxes, scores diff --git a/src/transformers/models/fast/modeling_fast.py b/src/transformers/models/fast/modeling_fast.py index 798ecba93aa2..dc892f0e58ce 100644 --- a/src/transformers/models/fast/modeling_fast.py +++ b/src/transformers/models/fast/modeling_fast.py @@ -103,7 +103,9 @@ def weight_op(self): """ Methods defined in MyModule""" def forward(self, x): - for module in self._modules.values(): + for key, module in self._modules.items(): + if key == 'bn' and not self.training: + continue x = module(x) return x @@ -134,7 +136,7 @@ def is_zero_layer(): return False -class ConvLayer(My2DLayer): +class ConvLayer(nn.Module): def __init__( self, in_channels, @@ -148,18 +150,19 @@ def __init__( use_bn=True, act_func="relu", dropout_rate=0, - ops_order="weight_bn_act", + use_act=True ): + + super().__init__() + self.kernel_size = kernel_size self.stride = stride self.dilation = dilation self.groups = groups self.bias = bias self.has_shuffle = has_shuffle + self.act_func = act_func - super(ConvLayer, self).__init__(in_channels, out_channels, use_bn, act_func, dropout_rate, ops_order) - - def weight_op(self): padding = get_same_padding(self.kernel_size) if isinstance(padding, int): padding *= self.dilation @@ -167,23 +170,61 @@ def weight_op(self): padding[0] *= self.dilation padding[1] *= self.dilation - weight_dict = OrderedDict() - weight_dict["conv"] = nn.Conv2d( - self.in_channels, - self.out_channels, - kernel_size=self.kernel_size, - stride=self.stride, + self.conv = nn.Conv2d( + in_channels, + out_channels, + kernel_size=kernel_size, + stride=stride, padding=padding, - dilation=self.dilation, - groups=self.groups, - bias=self.bias, + dilation=dilation, + groups=groups, + bias=bias, ) + self.bn = nn.Identity() + if use_bn: + self.bn = nn.BatchNorm2d(out_channels) - return weight_dict + self.act = nn.Identity() + if use_act: + act = build_activation(self.act_func, True) + if act is not None: + self.act = act + + def forward(self, x): + if self.training: + if hasattr(self, 'fused_conv'): + delattr(self, 'fused_conv') + x = self.conv(x) + x = self.bn(x) + return self.act(x) + else: + if not hasattr(self, 'fused_conv'): + setattr(self, 'fused_conv', self.fuse_conv_bn(self.conv, self.bn)) + x = self.fused_conv(x) + if self.act is not None: + x = self.act(x) + return x + + def fuse_conv_bn(self, conv, bn): + """During inference, the functionary of batch norm layers is turned off but + only the mean and var alone channels are used, which exposes the chance to + fuse it with the preceding conv layers to save computations and simplify + network structures.""" + if isinstance(bn, nn.Identity): + return conv + conv_w = conv.weight + conv_b = conv.bias if conv.bias is not None else torch.zeros_like( + bn.running_mean) + + factor = bn.weight / torch.sqrt(bn.running_var + bn.eps) + 
conv.weight = nn.Parameter(conv_w * + factor.reshape([conv.out_channels, 1, 1, 1])) + conv.bias = nn.Parameter((conv_b - bn.running_mean) * factor + bn.bias) + return conv class RepConvLayer(nn.Module): - def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, groups=1, deploy=False): + def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, groups=1): super(RepConvLayer, self).__init__() self.in_channels = in_channels @@ -192,78 +233,66 @@ def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, self.stride = stride self.dilation = dilation self.groups = groups - self.deploy = deploy assert len(kernel_size) == 2 padding = (int(((kernel_size[0] - 1) * dilation) / 2), int(((kernel_size[1] - 1) * dilation) / 2)) self.nonlinearity = nn.ReLU(inplace=True) - if deploy: - self.fused_conv = nn.Conv2d( + self.main_conv = nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + bias=False, + ) + self.main_bn = nn.BatchNorm2d(num_features=out_channels) + + ver_pad = (int(((kernel_size[0] - 1) * dilation) / 2), 0) + hor_pad = (0, int(((kernel_size[1] - 1) * dilation) / 2)) + + if kernel_size[1] != 1: + self.ver_conv = nn.Conv2d( in_channels=in_channels, out_channels=out_channels, - kernel_size=kernel_size, + kernel_size=(kernel_size[0], 1), stride=stride, - padding=padding, + padding=ver_pad, dilation=dilation, groups=groups, - bias=True, + bias=False, ) + self.ver_bn = nn.BatchNorm2d(num_features=out_channels) else: - self.main_conv = nn.Conv2d( + self.ver_conv, self.ver_bn = None, None + + if kernel_size[0] != 1: # 卷积核的高大于1 -> 有水平卷积 + self.hor_conv = nn.Conv2d( in_channels=in_channels, out_channels=out_channels, - kernel_size=kernel_size, + kernel_size=(1, kernel_size[1]), stride=stride, - padding=padding, + padding=hor_pad, dilation=dilation, groups=groups, bias=False, ) - self.main_bn = nn.BatchNorm2d(num_features=out_channels) - - ver_pad = (int(((kernel_size[0] - 1) * dilation) / 2), 0) - hor_pad = (0, int(((kernel_size[1] - 1) * dilation) / 2)) - - if kernel_size[1] != 1: # 卷积核的宽大于1 -> 有垂直卷积 - self.ver_conv = nn.Conv2d( - in_channels=in_channels, - out_channels=out_channels, - kernel_size=(kernel_size[0], 1), - stride=stride, - padding=ver_pad, - dilation=dilation, - groups=groups, - bias=False, - ) - self.ver_bn = nn.BatchNorm2d(num_features=out_channels) - else: - self.ver_conv, self.ver_bn = None, None - - if kernel_size[0] != 1: # 卷积核的高大于1 -> 有水平卷积 - self.hor_conv = nn.Conv2d( - in_channels=in_channels, - out_channels=out_channels, - kernel_size=(1, kernel_size[1]), - stride=stride, - padding=hor_pad, - dilation=dilation, - groups=groups, - bias=False, - ) - self.hor_bn = nn.BatchNorm2d(num_features=out_channels) - else: - self.hor_conv, self.hor_bn = None, None + self.hor_bn = nn.BatchNorm2d(num_features=out_channels) + else: + self.hor_conv, self.hor_bn = None, None - self.rbr_identity = ( - nn.BatchNorm2d(num_features=in_channels) if out_channels == in_channels and stride == 1 else None - ) + self.rbr_identity = ( + nn.BatchNorm2d(num_features=in_channels) if out_channels == in_channels and stride == 1 else None + ) def forward(self, input): - if hasattr(self, "fused_conv"): - return self.nonlinearity(self.fused_conv(input)) - else: + if self.training: + if hasattr(self, 'fused_conv'): + self.__delattr__('fused_conv') + main_outputs = self.main_conv(input) main_outputs = self.main_bn(main_outputs) if 
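
To make the batch-norm folding above concrete, here is a self-contained sketch (not part of the patch) that re-derives the same algebra on a toy conv + BN pair; unlike the in-place variant in ConvLayer, it returns a fresh layer and then checks that the fused convolution reproduces the eval-mode output.

import torch
from torch import nn


def fuse_conv_bn(conv: nn.Conv2d, bn: nn.BatchNorm2d) -> nn.Conv2d:
    """Fold eval-mode BatchNorm statistics into the preceding convolution."""
    fused = nn.Conv2d(
        conv.in_channels,
        conv.out_channels,
        kernel_size=conv.kernel_size,
        stride=conv.stride,
        padding=conv.padding,
        dilation=conv.dilation,
        groups=conv.groups,
        bias=True,
    )
    # Per-channel scale applied by BN at inference time.
    factor = bn.weight / torch.sqrt(bn.running_var + bn.eps)
    fused.weight.data = conv.weight * factor.reshape(-1, 1, 1, 1)
    conv_bias = conv.bias if conv.bias is not None else torch.zeros_like(bn.running_mean)
    fused.bias.data = (conv_bias - bn.running_mean) * factor + bn.bias
    return fused


conv = nn.Conv2d(3, 8, kernel_size=3, padding=1, bias=False)
bn = nn.BatchNorm2d(8)
bn.eval()  # use running statistics, as at inference time

x = torch.randn(1, 3, 16, 16)
assert torch.allclose(bn(conv(x)), fuse_conv_bn(conv, bn)(x), atol=1e-5)
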
self.ver_conv is not None: @@ -284,6 +313,10 @@ def forward(self, input): id_out = self.rbr_identity(input) return self.nonlinearity(main_outputs + vertical_outputs + horizontal_outputs + id_out) + else: + if not hasattr(self, 'fused_conv'): + self.prepare_for_eval() + return self.nonlinearity(self.fused_conv(input)) def _identity_to_conv(self, identity): if identity is None: @@ -340,66 +373,17 @@ def _pad_to_mxn_tensor(self, kernel): pad_top_down = (kernel_height - height) // 2 return torch.nn.functional.pad(kernel, [pad_left_right, pad_left_right, pad_top_down, pad_top_down]) - # def switch_to_deploy(self): - # if hasattr(self, 'fused_conv'): - # return - # kernel, bias = self.get_equivalent_kernel_bias() - # self.fused_conv = nn.Conv2d(in_channels=self.main_conv.in_channels, - # out_channels=self.main_conv.out_channels, - # kernel_size=self.main_conv.kernel_size, stride=self.main_conv.stride, - # padding=self.main_conv.padding, dilation=self.main_conv.dilation, - # groups=self.main_conv.groups, bias=True) - # self.fused_conv.weight.data = kernel - # self.fused_conv.bias.data = bias - # self.deploy = True - # for para in self.parameters(): - # para.detach_() - # for attr in ['main_conv', 'main_bn', 'ver_conv', 'ver_bn', 'hor_conv', 'hor_bn']: - # if hasattr(self, attr): - # self.__delattr__(attr) - # - # if hasattr(self, 'rbr_identity'): - # self.__delattr__('rbr_identity') - - # def switch_to_test(self): - # kernel, bias = self.get_equivalent_kernel_bias() - # self.fused_conv = nn.Conv2d(in_channels=self.main_conv.in_channels, - # out_channels=self.main_conv.out_channels, - # kernel_size=self.main_conv.kernel_size, stride=self.main_conv.stride, - # padding=self.main_conv.padding, dilation=self.main_conv.dilation, - # groups=self.main_conv.groups, bias=True) - # self.fused_conv.weight.data = kernel - # self.fused_conv.bias.data = bias - # for para in self.fused_conv.parameters(): - # para.detach_() - # self.deploy = True - - # def switch_to_train(self): - # if hasattr(self, 'fused_conv'): - # self.__delattr__('fused_conv') - # self.deploy = False - - # @staticmethod - # def is_zero_layer(): - # return False - - # @property - # def module_str(self): - # return 'Rep_%dx%d' % (self.kernel_size[0], self.kernel_size[1]) - - # @property - # def config(self): - # return {'name': RepConvLayer.__name__, - # 'in_channels': self.in_channels, - # 'out_channels': self.out_channels, - # 'kernel_size': self.kernel_size, - # 'stride': self.stride, - # 'dilation': self.dilation, - # 'groups': self.groups} - - # @staticmethod - # def build_from_config(config): - # return RepConvLayer(**config) + def prepare_for_eval(self): + kernel, bias = self.get_equivalent_kernel_bias() + self.fused_conv = nn.Conv2d(in_channels=self.main_conv.in_channels, + out_channels=self.main_conv.out_channels, + kernel_size=self.main_conv.kernel_size, stride=self.main_conv.stride, + padding=self.main_conv.padding, dilation=self.main_conv.dilation, + groups=self.main_conv.groups, bias=True) + self.fused_conv.weight.data = kernel + self.fused_conv.bias.data = bias + for para in self.fused_conv.parameters(): + para.detach_() class FastPreTrainedModel(PreTrainedModel): diff --git a/tests/models/fast/test_modeling_fast.py b/tests/models/fast/test_modeling_fast.py index f3790cfb8300..ee84d0e857a8 100644 --- a/tests/models/fast/test_modeling_fast.py +++ b/tests/models/fast/test_modeling_fast.py @@ -16,6 +16,8 @@ import inspect import unittest +import requests +from PIL import Image from parameterized import parameterized from 
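
The kernel merging behind RepConvLayer.prepare_for_eval rests on the fact that parallel convolution branches with compatible padding can be collapsed into a single kernel once the asymmetric kernels are zero-padded to the full size. Below is a simplified sketch of that identity, with BatchNorm and the identity branch left out and shapes chosen arbitrarily.

import torch
import torch.nn.functional as F
from torch import nn


def pad_to_kxk(kernel: torch.Tensor, k: int) -> torch.Tensor:
    """Zero-pad a (out, in, kh, kw) kernel to (out, in, k, k), keeping it centred."""
    kh, kw = kernel.shape[-2:]
    pad_lr = (k - kw) // 2
    pad_tb = (k - kh) // 2
    return F.pad(kernel, [pad_lr, pad_lr, pad_tb, pad_tb])


# Main 3x3 branch plus vertical (3x1) and horizontal (1x3) branches.
main = nn.Conv2d(8, 8, kernel_size=(3, 3), padding=(1, 1), bias=False)
ver = nn.Conv2d(8, 8, kernel_size=(3, 1), padding=(1, 0), bias=False)
hor = nn.Conv2d(8, 8, kernel_size=(1, 3), padding=(0, 1), bias=False)

# Sum the (padded) kernels into one 3x3 convolution.
fused = nn.Conv2d(8, 8, kernel_size=(3, 3), padding=(1, 1), bias=False)
fused.weight.data = main.weight + pad_to_kxk(ver.weight, 3) + pad_to_kxk(hor.weight, 3)

x = torch.randn(1, 8, 16, 16)
assert torch.allclose(main(x) + ver(x) + hor(x), fused(x), atol=1e-5)
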
transformers import ( @@ -23,7 +25,9 @@ is_torch_available, set_seed, ) -from transformers.testing_utils import CaptureLogger, require_bitsandbytes, require_torch, slow, tooslow, torch_device +from transformers.models.fast.image_processing_fast import FastImageProcessor +from transformers.testing_utils import CaptureLogger, require_bitsandbytes, require_torch, slow, tooslow, torch_device, \ + require_vision from transformers.utils import logging as transformers_logging from ...generation.test_utils import GenerationTesterMixin @@ -63,13 +67,13 @@ def __init__( backbone_stage1_groups=[1], backbone_stage2_in_channels=[64], backbone_stage2_out_channels=[128], - backbone_stage2_kernel_size=[ [3, 1]], + backbone_stage2_kernel_size=[[3, 1]], backbone_stage2_stride=[2], backbone_stage2_dilation=[1], backbone_stage2_groups=[1], backbone_stage3_in_channels=[128], backbone_stage3_out_channels=[256], - backbone_stage3_kernel_size=[ [1, 3]], + backbone_stage3_kernel_size=[[1, 3]], backbone_stage3_stride=[2], backbone_stage3_dilation=[1], backbone_stage3_groups=[1], @@ -377,5 +381,59 @@ def test_model_is_small(self): model = model_class(config) num_params = model.num_parameters() assert ( - num_params < 3000000 + num_params < 3000000 ), f"{model_class} is too big for the common tests ({num_params})! It should have 1M max." + + # def prepare_image(): + # image_url = "https://huggingface.co/datasets/Raghavan/fast_model_samples/resolve/main/img_329.jpg" + # raw_image = Image.open(requests.get(image_url, stream=True).raw).convert("RGB") + # return raw_image + + +@require_torch +@require_vision +class FastModelIntegrationTest(unittest.TestCase): + # @slow + def test_inference_fast_tiny_ic17mlt_model(self): + model = FASTForImageCaptioning.from_pretrained("Raghavan/ic17mlt_Fast_T") + + image_processor = FastImageProcessor.from_pretrained("Raghavan/ic17mlt_Fast_T") + + def prepare_image(): + image_url = "https://huggingface.co/datasets/Raghavan/fast_model_samples/resolve/main/img_329.jpg" + raw_image = Image.open(requests.get(image_url, stream=True).raw).convert("RGB") + return raw_image + + image = prepare_image() + input = image_processor(image, return_tensor="np") + + output = model(pixel_values=torch.tensor(input['pixel_values'])) + target_sizes = [(image.shape[1], image.shape[2]) for image in input['pixel_values']] + final_out = image_processor.get_results(output, target_sizes) + + assert ( + final_out[0]['bboxes'][0] == [224, 120, 246, 120, 246, 134, 224, 134] + ) + assert round(float(final_out[0]['scores'][0]), 5) == 0.95541 + + def test_inference_fast_base_800_total_text_ic17mlt_model(self): + model = FASTForImageCaptioning.from_pretrained("Raghavan/fast_base_tt_800_finetune_ic17mlt") + + image_processor = FastImageProcessor.from_pretrained("Raghavan/fast_base_tt_800_finetune_ic17mlt") + + def prepare_image(): + image_url = "https://huggingface.co/datasets/Raghavan/fast_model_samples/resolve/main/img657.jpg" + raw_image = Image.open(requests.get(image_url, stream=True).raw).convert("RGB") + return raw_image + + image = prepare_image() + input = image_processor(image, return_tensor="np") + + output = model(pixel_values=torch.tensor(input['pixel_values'])) + target_sizes = [(image.shape[1], image.shape[2]) for image in input['pixel_values']] + final_out = image_processor.get_results(output, target_sizes) + + assert ( + final_out[0]['bboxes'][0][:10] == [484, 175, 484, 178, 483, 179, 452, 179, 452, 182] + ) + assert round(float(final_out[0]['scores'][0]), 5) == 0.92356 From 
0457e7465ad6b666ec44a57331a6f8533851441d Mon Sep 17 00:00:00 2001 From: raghavanone Date: Tue, 31 Oct 2023 19:19:53 +0530 Subject: [PATCH 009/152] Fix style and copies --- src/transformers/models/fast/__init__.py | 9 +- .../models/fast/configuration_fast.py | 140 +++++------ .../fast/convert_fast_original_to_pytorch.py | 163 ++++++------- .../models/fast/image_processing_fast.py | 225 +++++++++--------- src/transformers/models/fast/modeling_fast.py | 167 ++++++------- tests/models/fast/test_modeling_fast.py | 200 ++++++++-------- 6 files changed, 445 insertions(+), 459 deletions(-) diff --git a/src/transformers/models/fast/__init__.py b/src/transformers/models/fast/__init__.py index 6fad75850bba..133d20bc0c52 100644 --- a/src/transformers/models/fast/__init__.py +++ b/src/transformers/models/fast/__init__.py @@ -20,6 +20,7 @@ is_torch_available, ) + _import_structure = { "configuration_fast": ["FastConfig"], } @@ -30,9 +31,7 @@ except OptionalDependencyNotAvailable: pass else: - _import_structure["modeling_fast"] = [ - "FASTForImageCaptioning" - ] + _import_structure["modeling_fast"] = ["FASTForImageCaptioning"] if TYPE_CHECKING: from .configuration_fast import FastConfig @@ -43,9 +42,7 @@ except OptionalDependencyNotAvailable: pass else: - from .modeling_fast import ( - FASTForImageCaptioning - ) + from .modeling_fast import FASTForImageCaptioning else: diff --git a/src/transformers/models/fast/configuration_fast.py b/src/transformers/models/fast/configuration_fast.py index ad72054b5cee..ee8c27b03a32 100644 --- a/src/transformers/models/fast/configuration_fast.py +++ b/src/transformers/models/fast/configuration_fast.py @@ -3,75 +3,75 @@ class FastConfig(PretrainedConfig): def __init__( - self, - backbone_kernel_size=3, - backbone_stride=2, - backbone_dilation=1, - backbone_groups=1, - backbone_bias=False, - backbone_has_shuffle=False, - backbone_in_channels=3, - backbone_out_channels=64, - backbone_use_bn=True, - backbone_act_func="relu", - backbone_dropout_rate=0, - backbone_ops_order="weight_bn_act", - backbone_stage1_in_channels=[64, 64, 64], - backbone_stage1_out_channels=[64, 64, 64], - backbone_stage1_kernel_size=[[3, 3], [3, 3], [3, 3]], - backbone_stage1_stride=[1, 2, 1], - backbone_stage1_dilation=[1, 1, 1], - backbone_stage1_groups=[1, 1, 1], - backbone_stage2_in_channels=[64, 128, 128, 128], - backbone_stage2_out_channels=[128, 128, 128, 128], - backbone_stage2_kernel_size=[[3, 3], [1, 3], [3, 3], [3, 1]], - backbone_stage2_stride=[2, 1, 1, 1], - backbone_stage2_dilation=[1, 1, 1, 1], - backbone_stage2_groups=[1, 1, 1, 1], - backbone_stage3_in_channels=[128, 256, 256, 256], - backbone_stage3_out_channels=[256, 256, 256, 256], - backbone_stage3_kernel_size=[[3, 3], [3, 3], [3, 1], [1, 3]], - backbone_stage3_stride=[2, 1, 1, 1], - backbone_stage3_dilation=[1, 1, 1, 1], - backbone_stage3_groups=[1, 1, 1, 1], - backbone_stage4_in_channels=[256, 512, 512, 512], - backbone_stage4_out_channels=[512, 512, 512, 512], - backbone_stage4_kernel_size=[[3, 3], [3, 1], [1, 3], [3, 3]], - backbone_stage4_stride=[2, 1, 1, 1], - backbone_stage4_dilation=[1, 1, 1, 1], - backbone_stage4_groups=[1, 1, 1, 1], - neck_in_channels=[64, 128, 256, 512], - neck_out_channels=[128, 128, 128, 128], - neck_kernel_size=[[3, 3], [3, 3], [3, 3], [3, 3]], - neck_stride=[1, 1, 1, 1], - neck_dilation=[1, 1, 1, 1], - neck_groups=[1, 1, 1, 1], - head_pooling_size=9, - head_dropout_ratio=0, - head_conv_in_channels=512, - head_conv_out_channels=128, - head_conv_kernel_size=[3, 3], - head_conv_stride=1, - 
head_conv_dilation=1, - head_conv_groups=1, - head_final_kernel_size=1, - head_final_stride=1, - head_final_dilation=1, - head_final_groups=1, - head_final_bias=False, - head_final_has_shuffle=False, - head_final_in_channels=128, - head_final_out_channels=5, - head_final_use_bn=False, - head_final_act_func=None, - head_final_dropout_rate=0, - head_final_ops_order="weight", - min_area=250, - min_score=0.88, - bbox_type='rect', - loss_bg=False, - initializer_range=0.02, - **kwargs, + self, + backbone_kernel_size=3, + backbone_stride=2, + backbone_dilation=1, + backbone_groups=1, + backbone_bias=False, + backbone_has_shuffle=False, + backbone_in_channels=3, + backbone_out_channels=64, + backbone_use_bn=True, + backbone_act_func="relu", + backbone_dropout_rate=0, + backbone_ops_order="weight_bn_act", + backbone_stage1_in_channels=[64, 64, 64], + backbone_stage1_out_channels=[64, 64, 64], + backbone_stage1_kernel_size=[[3, 3], [3, 3], [3, 3]], + backbone_stage1_stride=[1, 2, 1], + backbone_stage1_dilation=[1, 1, 1], + backbone_stage1_groups=[1, 1, 1], + backbone_stage2_in_channels=[64, 128, 128, 128], + backbone_stage2_out_channels=[128, 128, 128, 128], + backbone_stage2_kernel_size=[[3, 3], [1, 3], [3, 3], [3, 1]], + backbone_stage2_stride=[2, 1, 1, 1], + backbone_stage2_dilation=[1, 1, 1, 1], + backbone_stage2_groups=[1, 1, 1, 1], + backbone_stage3_in_channels=[128, 256, 256, 256], + backbone_stage3_out_channels=[256, 256, 256, 256], + backbone_stage3_kernel_size=[[3, 3], [3, 3], [3, 1], [1, 3]], + backbone_stage3_stride=[2, 1, 1, 1], + backbone_stage3_dilation=[1, 1, 1, 1], + backbone_stage3_groups=[1, 1, 1, 1], + backbone_stage4_in_channels=[256, 512, 512, 512], + backbone_stage4_out_channels=[512, 512, 512, 512], + backbone_stage4_kernel_size=[[3, 3], [3, 1], [1, 3], [3, 3]], + backbone_stage4_stride=[2, 1, 1, 1], + backbone_stage4_dilation=[1, 1, 1, 1], + backbone_stage4_groups=[1, 1, 1, 1], + neck_in_channels=[64, 128, 256, 512], + neck_out_channels=[128, 128, 128, 128], + neck_kernel_size=[[3, 3], [3, 3], [3, 3], [3, 3]], + neck_stride=[1, 1, 1, 1], + neck_dilation=[1, 1, 1, 1], + neck_groups=[1, 1, 1, 1], + head_pooling_size=9, + head_dropout_ratio=0, + head_conv_in_channels=512, + head_conv_out_channels=128, + head_conv_kernel_size=[3, 3], + head_conv_stride=1, + head_conv_dilation=1, + head_conv_groups=1, + head_final_kernel_size=1, + head_final_stride=1, + head_final_dilation=1, + head_final_groups=1, + head_final_bias=False, + head_final_has_shuffle=False, + head_final_in_channels=128, + head_final_out_channels=5, + head_final_use_bn=False, + head_final_act_func=None, + head_final_dropout_rate=0, + head_final_ops_order="weight", + min_area=250, + min_score=0.88, + bbox_type="rect", + loss_bg=False, + initializer_range=0.02, + **kwargs, ): super().__init__(**kwargs) @@ -150,4 +150,4 @@ def __init__( self.min_score = min_score self.bbox_type = bbox_type self.loss_bg = loss_bg - self.initializer_range = initializer_range \ No newline at end of file + self.initializer_range = initializer_range diff --git a/src/transformers/models/fast/convert_fast_original_to_pytorch.py b/src/transformers/models/fast/convert_fast_original_to_pytorch.py index 7ef78a312080..e549294081b8 100644 --- a/src/transformers/models/fast/convert_fast_original_to_pytorch.py +++ b/src/transformers/models/fast/convert_fast_original_to_pytorch.py @@ -18,19 +18,13 @@ import json import logging -import numpy as np -import pandas as pd import requests import torch -from huggingface_hub import hf_hub_download from PIL 
import Image -from transformers import ( - FastConfig, - FASTForImageCaptioning -) +from transformers import FastConfig, FASTForImageCaptioning from transformers.models.fast.image_processing_fast import FastImageProcessor -from transformers.utils.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD + tiny_config_url = "https://raw.githubusercontent.com/czczup/FAST/main/config/fast/nas-configs/fast_tiny.config" small_config_url = "https://raw.githubusercontent.com/czczup/FAST/main/config/fast/nas-configs/fast_small.config" @@ -63,7 +57,7 @@ def prepare_config(size_config_url, pooling_size, min_area, min_score, bbox_type backbone_config = {} for stage_ix in range(1, 5): - stage_config = config_dict[f'stage{stage_ix}'] + stage_config = config_dict[f"stage{stage_ix}"] merged_dict = {} @@ -77,7 +71,7 @@ def prepare_config(size_config_url, pooling_size, min_area, min_score, bbox_type else: # If the key is not in merged_dict, create a new list with the value merged_dict[key] = [value] - backbone_config[f'stage{stage_ix}'] = merged_dict + backbone_config[f"stage{stage_ix}"] = merged_dict neck_in_channels = [] neck_out_channels = [] @@ -88,7 +82,7 @@ def prepare_config(size_config_url, pooling_size, min_area, min_score, bbox_type for i in range(1, 5): layer_key = f"reduce_layer{i}" - layer_dict = config_dict['neck'].get(layer_key) + layer_dict = config_dict["neck"].get(layer_key) if layer_dict: # Append values to the corresponding lists @@ -112,64 +106,56 @@ def prepare_config(size_config_url, pooling_size, min_area, min_score, bbox_type backbone_act_func=config_dict["first_conv"]["act_func"], backbone_dropout_rate=config_dict["first_conv"]["dropout_rate"], backbone_ops_order=config_dict["first_conv"]["ops_order"], - - backbone_stage1_in_channels=backbone_config['stage1']['in_channels'], - backbone_stage1_out_channels=backbone_config['stage1']['out_channels'], - backbone_stage1_kernel_size=backbone_config['stage1']['kernel_size'], - backbone_stage1_stride=backbone_config['stage1']['stride'], - backbone_stage1_dilation=backbone_config['stage1']['dilation'], - backbone_stage1_groups=backbone_config['stage1']['groups'], - - backbone_stage2_in_channels=backbone_config['stage2']['in_channels'], - backbone_stage2_out_channels=backbone_config['stage2']['out_channels'], - backbone_stage2_kernel_size=backbone_config['stage2']['kernel_size'], - backbone_stage2_stride=backbone_config['stage2']['stride'], - backbone_stage2_dilation=backbone_config['stage2']['dilation'], - backbone_stage2_groups=backbone_config['stage2']['groups'], - - backbone_stage3_in_channels=backbone_config['stage3']['in_channels'], - backbone_stage3_out_channels=backbone_config['stage3']['out_channels'], - backbone_stage3_kernel_size=backbone_config['stage3']['kernel_size'], - backbone_stage3_stride=backbone_config['stage3']['stride'], - backbone_stage3_dilation=backbone_config['stage3']['dilation'], - backbone_stage3_groups=backbone_config['stage3']['groups'], - - backbone_stage4_in_channels=backbone_config['stage4']['in_channels'], - backbone_stage4_out_channels=backbone_config['stage4']['out_channels'], - backbone_stage4_kernel_size=backbone_config['stage4']['kernel_size'], - backbone_stage4_stride=backbone_config['stage4']['stride'], - backbone_stage4_dilation=backbone_config['stage4']['dilation'], - backbone_stage4_groups=backbone_config['stage4']['groups'], - + backbone_stage1_in_channels=backbone_config["stage1"]["in_channels"], + backbone_stage1_out_channels=backbone_config["stage1"]["out_channels"], + 
backbone_stage1_kernel_size=backbone_config["stage1"]["kernel_size"], + backbone_stage1_stride=backbone_config["stage1"]["stride"], + backbone_stage1_dilation=backbone_config["stage1"]["dilation"], + backbone_stage1_groups=backbone_config["stage1"]["groups"], + backbone_stage2_in_channels=backbone_config["stage2"]["in_channels"], + backbone_stage2_out_channels=backbone_config["stage2"]["out_channels"], + backbone_stage2_kernel_size=backbone_config["stage2"]["kernel_size"], + backbone_stage2_stride=backbone_config["stage2"]["stride"], + backbone_stage2_dilation=backbone_config["stage2"]["dilation"], + backbone_stage2_groups=backbone_config["stage2"]["groups"], + backbone_stage3_in_channels=backbone_config["stage3"]["in_channels"], + backbone_stage3_out_channels=backbone_config["stage3"]["out_channels"], + backbone_stage3_kernel_size=backbone_config["stage3"]["kernel_size"], + backbone_stage3_stride=backbone_config["stage3"]["stride"], + backbone_stage3_dilation=backbone_config["stage3"]["dilation"], + backbone_stage3_groups=backbone_config["stage3"]["groups"], + backbone_stage4_in_channels=backbone_config["stage4"]["in_channels"], + backbone_stage4_out_channels=backbone_config["stage4"]["out_channels"], + backbone_stage4_kernel_size=backbone_config["stage4"]["kernel_size"], + backbone_stage4_stride=backbone_config["stage4"]["stride"], + backbone_stage4_dilation=backbone_config["stage4"]["dilation"], + backbone_stage4_groups=backbone_config["stage4"]["groups"], neck_in_channels=neck_in_channels, neck_out_channels=neck_out_channels, neck_kernel_size=neck_kernel_size, neck_stride=neck_stride, neck_dilation=neck_dilation, neck_groups=neck_groups, - head_pooling_size=pooling_size, head_dropout_ratio=0.1, - head_conv_in_channels=config_dict['head']['conv']['in_channels'], - head_conv_out_channels=config_dict['head']['conv']['out_channels'], - head_conv_kernel_size=config_dict['head']['conv']['kernel_size'], - head_conv_stride=config_dict['head']['conv']['stride'], - head_conv_dilation=config_dict['head']['conv']['dilation'], - head_conv_groups=config_dict['head']['conv']['groups'], - - head_final_kernel_size=config_dict['head']['final']['kernel_size'], - head_final_stride=config_dict['head']['final']['stride'], - head_final_dilation=config_dict['head']['final']['dilation'], - head_final_groups=config_dict['head']['final']['groups'], - head_final_bias=config_dict['head']['final']['bias'], - head_final_has_shuffle=config_dict['head']['final']['has_shuffle'], - head_final_in_channels=config_dict['head']['final']['in_channels'], - head_final_out_channels=config_dict['head']['final']['out_channels'], - head_final_use_bn=config_dict['head']['final']['use_bn'], - head_final_act_func=config_dict['head']['final']['act_func'], - head_final_dropout_rate=config_dict['head']['final']['dropout_rate'], - head_final_ops_order=config_dict['head']['final']['ops_order'], - + head_conv_in_channels=config_dict["head"]["conv"]["in_channels"], + head_conv_out_channels=config_dict["head"]["conv"]["out_channels"], + head_conv_kernel_size=config_dict["head"]["conv"]["kernel_size"], + head_conv_stride=config_dict["head"]["conv"]["stride"], + head_conv_dilation=config_dict["head"]["conv"]["dilation"], + head_conv_groups=config_dict["head"]["conv"]["groups"], + head_final_kernel_size=config_dict["head"]["final"]["kernel_size"], + head_final_stride=config_dict["head"]["final"]["stride"], + head_final_dilation=config_dict["head"]["final"]["dilation"], + head_final_groups=config_dict["head"]["final"]["groups"], + 
head_final_bias=config_dict["head"]["final"]["bias"], + head_final_has_shuffle=config_dict["head"]["final"]["has_shuffle"], + head_final_in_channels=config_dict["head"]["final"]["in_channels"], + head_final_out_channels=config_dict["head"]["final"]["out_channels"], + head_final_use_bn=config_dict["head"]["final"]["use_bn"], + head_final_act_func=config_dict["head"]["final"]["act_func"], + head_final_dropout_rate=config_dict["head"]["final"]["dropout_rate"], + head_final_ops_order=config_dict["head"]["final"]["ops_order"], min_area=min_area, min_score=min_score, bbox_type=bbox_type, @@ -193,43 +179,50 @@ def convert_fast_checkpoint(checkpoint_url, checkpoint_config_url, pytorch_dump_ exec(content, namespace) - model_config = namespace.get('model') - test_config = namespace.get('test_cfg', None) - data_config = namespace.get('data') + model_config = namespace.get("model") + test_config = namespace.get("test_cfg", None) + data_config = namespace.get("data") min_score = 0.88 min_area = 250 - bbox_type = 'rect' + bbox_type = "rect" loss_bg = False if test_config is not None: - min_area = test_config.get('min_area', min_area) - min_score = test_config.get('min_score', min_score) - bbox_type = test_config.get('bbox_type', bbox_type) - loss_bg = test_config.get('loss_emb', None) == "EmbLoss_v2" - - if 'tiny' in model_config['backbone']['config']: - config = prepare_config(tiny_config_url, model_config['detection_head']['pooling_size'], - min_area, min_score, bbox_type, loss_bg) - elif 'small' in model_config['backbone']['config']: - config = prepare_config(small_config_url, model_config['detection_head']['pooling_size'], - min_area, min_score, bbox_type, loss_bg) + min_area = test_config.get("min_area", min_area) + min_score = test_config.get("min_score", min_score) + bbox_type = test_config.get("bbox_type", bbox_type) + loss_bg = test_config.get("loss_emb", None) == "EmbLoss_v2" + + if "tiny" in model_config["backbone"]["config"]: + config = prepare_config( + tiny_config_url, model_config["detection_head"]["pooling_size"], min_area, min_score, bbox_type, loss_bg + ) + elif "small" in model_config["backbone"]["config"]: + config = prepare_config( + small_config_url, model_config["detection_head"]["pooling_size"], min_area, min_score, bbox_type, loss_bg + ) else: - config = prepare_config(base_config_url, model_config['detection_head']['pooling_size'], - min_area, min_score, bbox_type, loss_bg) + config = prepare_config( + base_config_url, model_config["detection_head"]["pooling_size"], min_area, min_score, bbox_type, loss_bg + ) size = 640 if "train" in data_config: - if "short_size" in data_config['train']: - size = data_config['train']['short_size'] + if "short_size" in data_config["train"]: + size = data_config["train"]["short_size"] model = FASTForImageCaptioning(config) - fast_image_processor = FastImageProcessor(size={'height': size, 'width': size}, min_score=config.min_score, - min_area=config.min_area, - bbox_type=config.bbox_type, pooling_size=config.head_pooling_size) - state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu", check_hash=True)['ema'] + fast_image_processor = FastImageProcessor( + size={"height": size, "width": size}, + min_score=config.min_score, + min_area=config.min_area, + bbox_type=config.bbox_type, + pooling_size=config.head_pooling_size, + ) + state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu", check_hash=True)["ema"] state_dict_changed = copy.deepcopy(state_dict) for key in state_dict: val = 
state_dict_changed.pop(key) - state_dict_changed[key.replace('module.', '')] = val + state_dict_changed[key.replace("module.", "")] = val model.load_state_dict(state_dict_changed) model.save_pretrained(pytorch_dump_folder_path) diff --git a/src/transformers/models/fast/image_processing_fast.py b/src/transformers/models/fast/image_processing_fast.py index 637aea38e086..812c617f073c 100644 --- a/src/transformers/models/fast/image_processing_fast.py +++ b/src/transformers/models/fast/image_processing_fast.py @@ -16,17 +16,15 @@ import math import warnings from typing import Any, Dict, List, Optional, Tuple, Union -import torch.nn.functional as F -import torch.nn as nn import cv2 import numpy as np +import torch.nn as nn +import torch.nn.functional as F from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict from ...image_transforms import resize, to_channel_dimension_format from ...image_utils import ( - IMAGENET_STANDARD_MEAN, - IMAGENET_STANDARD_STD, ChannelDimension, ImageInput, PILImageResampling, @@ -36,8 +34,16 @@ to_numpy_array, valid_images, ) -from ...utils import TensorType, is_torch_available, is_torch_tensor, is_vision_available, logging, \ - IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD +from ...utils import ( + IMAGENET_DEFAULT_MEAN, + IMAGENET_DEFAULT_STD, + TensorType, + is_torch_available, + is_torch_tensor, + is_vision_available, + logging, +) + if is_vision_available(): import PIL @@ -94,23 +100,23 @@ class FastImageProcessor(BaseImageProcessor): model_input_names = ["pixel_values"] def __init__( - self, - do_resize: bool = True, - size: Dict[str, int] = None, - resample: PILImageResampling = PILImageResampling.BICUBIC, - do_center_crop: bool = False, - crop_size: Dict[str, int] = None, - rescale_factor: Union[int, float] = 1 / 255, - do_rescale: bool = True, - do_normalize: bool = True, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - do_reduce_labels: bool = False, - min_area: int = 10, - min_score: float = 0.88, - bbox_type: str = "rect", - pooling_size: int = 9, - **kwargs, + self, + do_resize: bool = True, + size: Dict[str, int] = None, + resample: PILImageResampling = PILImageResampling.BICUBIC, + do_center_crop: bool = False, + crop_size: Dict[str, int] = None, + rescale_factor: Union[int, float] = 1 / 255, + do_rescale: bool = True, + do_normalize: bool = True, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_reduce_labels: bool = False, + min_area: int = 10, + min_score: float = 0.88, + bbox_type: str = "rect", + pooling_size: int = 9, + **kwargs, ) -> None: if "reduce_labels" in kwargs: warnings.warn( @@ -152,13 +158,13 @@ def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs): return super().from_dict(image_processor_dict, **kwargs) def resize( - self, - image: np.ndarray, - size: Dict[str, int], - resample: PILImageResampling = PILImageResampling.BICUBIC, - data_format: Optional[Union[str, ChannelDimension]] = None, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs, + self, + image: np.ndarray, + size: Dict[str, int], + resample: PILImageResampling = PILImageResampling.BICUBIC, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, ) -> np.ndarray: """ Resize an image to (size["height"], size["width"]). 
@@ -196,20 +202,20 @@ def reduce_label(self, label: ImageInput) -> np.ndarray: return label def _preprocess( - self, - image: ImageInput, - do_reduce_labels: bool = None, - do_resize: bool = None, - size: Dict[str, int] = None, - resample: PILImageResampling = None, - do_center_crop: bool = None, - crop_size: Dict[str, int] = None, - do_rescale: bool = None, - rescale_factor: float = None, - do_normalize: bool = None, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - input_data_format: Optional[Union[str, ChannelDimension]] = None, + self, + image: ImageInput, + do_reduce_labels: bool = None, + do_resize: bool = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + do_center_crop: bool = None, + crop_size: Dict[str, int] = None, + do_rescale: bool = None, + rescale_factor: float = None, + do_normalize: bool = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, ): if do_reduce_labels: image = self.reduce_label(image) @@ -229,20 +235,20 @@ def _preprocess( return image def _preprocess_image( - self, - image: ImageInput, - do_resize: bool = None, - size: Dict[str, int] = None, - resample: PILImageResampling = None, - do_center_crop: bool = None, - crop_size: Dict[str, int] = None, - do_rescale: bool = None, - rescale_factor: float = None, - do_normalize: bool = None, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - data_format: Optional[Union[str, ChannelDimension]] = None, - input_data_format: Optional[Union[str, ChannelDimension]] = None, + self, + image: ImageInput, + do_resize: bool = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + do_center_crop: bool = None, + crop_size: Dict[str, int] = None, + do_rescale: bool = None, + rescale_factor: float = None, + do_normalize: bool = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, ) -> np.ndarray: """Preprocesses a single image.""" # All transformations expect numpy arrays. @@ -274,15 +280,15 @@ def _preprocess_image( return image def _preprocess_segmentation_map( - self, - segmentation_map: ImageInput, - do_resize: bool = None, - size: Dict[str, int] = None, - resample: PILImageResampling = None, - do_center_crop: bool = None, - crop_size: Dict[str, int] = None, - do_reduce_labels: bool = None, - input_data_format: Optional[Union[str, ChannelDimension]] = None, + self, + segmentation_map: ImageInput, + do_resize: bool = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + do_center_crop: bool = None, + crop_size: Dict[str, int] = None, + do_reduce_labels: bool = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, ): """Preprocesses a single segmentation map.""" # All transformations expect numpy arrays. 
@@ -320,24 +326,24 @@ def __call__(self, images, segmentation_maps=None, **kwargs): return super().__call__(images, segmentation_maps=segmentation_maps, **kwargs) def preprocess( - self, - images: ImageInput, - segmentation_maps: Optional[ImageInput] = None, - do_resize: bool = None, - size: Dict[str, int] = None, - resample: PILImageResampling = None, - do_center_crop: bool = None, - crop_size: Dict[str, int] = None, - do_rescale: bool = None, - rescale_factor: float = None, - do_normalize: bool = None, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - do_reduce_labels: Optional[bool] = None, - return_tensors: Optional[Union[str, TensorType]] = None, - data_format: ChannelDimension = ChannelDimension.FIRST, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs, + self, + images: ImageInput, + segmentation_maps: Optional[ImageInput] = None, + do_resize: bool = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + do_center_crop: bool = None, + crop_size: Dict[str, int] = None, + do_rescale: bool = None, + rescale_factor: float = None, + do_normalize: bool = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_reduce_labels: Optional[bool] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: ChannelDimension = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, ) -> PIL.Image.Image: """ Preprocess an image or batch of images. @@ -516,25 +522,26 @@ def post_process_semantic_segmentation(self, outputs, target_sizes: List[Tuple] def _max_pooling(self, x, scale=1): if scale == 1: - x = nn.MaxPool2d(kernel_size=self.pooling_size, stride=1, - padding=(self.pooling_size - 1) // 2)(x) + x = nn.MaxPool2d(kernel_size=self.pooling_size, stride=1, padding=(self.pooling_size - 1) // 2)(x) elif scale == 2: - x = nn.MaxPool2d(kernel_size=self.pooling_size // 2 + 1, stride=1, - padding=(self.pooling_size // 2) // 2)(x) + x = nn.MaxPool2d(kernel_size=self.pooling_size // 2 + 1, stride=1, padding=(self.pooling_size // 2) // 2)( + x + ) return x def get_results(self, output, target_sizes): scale = 2 - img_size = (self.size['height'], self.size['width']) - out = output['hidden_states'] + img_size = (self.size["height"], self.size["width"]) + out = output["hidden_states"] batch_size = out.size(0) - final_results = dict() + final_results = {} - texts = F.interpolate(out[:, 0:1, :, :], size=(img_size[0] // scale, img_size[1] // scale), - mode='nearest') # B*1*320*320 + texts = F.interpolate( + out[:, 0:1, :, :], size=(img_size[0] // scale, img_size[1] // scale), mode="nearest" + ) # B*1*320*320 texts = self._max_pooling(texts, scale=scale) # B*1*320*320 score_maps = torch.sigmoid_(texts) # B*1*320*320 - score_maps = F.interpolate(score_maps, size=(img_size[0], img_size[1]), mode='nearest') # B*1*640*640 + score_maps = F.interpolate(score_maps, size=(img_size[0], img_size[1]), mode="nearest") # B*1*640*640 score_maps = score_maps.squeeze(1) # B*640*640 kernels = (out[:, 0, :, :] > 0).to(torch.uint8) # B*160*160 @@ -545,27 +552,25 @@ def get_results(self, output, target_sizes): labels_ = np.array(labels_) labels_ = torch.from_numpy(labels_) labels = labels_.unsqueeze(1).to(torch.float32) # B*1*160*160 - labels = F.interpolate(labels, size=(img_size[0] // scale, img_size[1] // scale), mode='nearest') # B*1*320*320 + labels = F.interpolate( + labels, 
size=(img_size[0] // scale, img_size[1] // scale), mode="nearest" + ) # B*1*320*320 labels = self._max_pooling(labels, scale=scale) - labels = F.interpolate(labels, size=(img_size[0], img_size[1]), mode='nearest') # B*1*640*640 + labels = F.interpolate(labels, size=(img_size[0], img_size[1]), mode="nearest") # B*1*640*640 labels = labels.squeeze(1).to(torch.int32) # B*640*640 keys = [torch.unique(labels_[i], sorted=True) for i in range(batch_size)] - final_results.update(dict(kernels=kernels.data.cpu())) + final_results.update({"kernels": kernels.data.cpu()}) results = [] for i in range(batch_size): org_img_size = target_sizes[i] - scales = (float(org_img_size[1]) / float(img_size[1]), - float(org_img_size[0]) / float(img_size[0])) + scales = (float(org_img_size[1]) / float(img_size[1]), float(org_img_size[0]) / float(img_size[0])) bboxes, scores = self.generate_bbox(keys[i], labels[i], score_maps[i], scales) - results.append(dict( - bboxes=bboxes, - scores=scores - )) - final_results.update(dict(results=results)) + results.append({"bboxes": bboxes, "scores": scores}) + final_results.update({"results": results}) return results @@ -575,7 +580,7 @@ def generate_bbox(self, keys, label, score, scales): scores = [] for index in range(1, label_num): i = keys[index] - ind = (label == i) + ind = label == i ind_np = ind.data.cpu().numpy() points = np.array(np.where(ind_np)).transpose((1, 0)) if points.shape[0] < self.min_area: @@ -586,18 +591,18 @@ def generate_bbox(self, keys, label, score, scales): label[ind] = 0 continue - if self.bbox_type == 'rect': + if self.bbox_type == "rect": rect = cv2.minAreaRect(points[:, ::-1]) alpha = math.sqrt(math.sqrt(points.shape[0] / (rect[1][0] * rect[1][1]))) rect = (rect[0], (rect[1][0] * alpha, rect[1][1] * alpha), rect[2]) bbox = cv2.boxPoints(rect) * scales - elif self.bbox_type == 'poly': - binary = np.zeros(label.shape, dtype='uint8') + elif self.bbox_type == "poly": + binary = np.zeros(label.shape, dtype="uint8") binary[ind_np] = 1 contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) bbox = contours[0] * scales - bbox = bbox.astype('int32') + bbox = bbox.astype("int32") bboxes.append(bbox.reshape(-1).tolist()) scores.append(score_i) return bboxes, scores diff --git a/src/transformers/models/fast/modeling_fast.py b/src/transformers/models/fast/modeling_fast.py index dc892f0e58ce..4557cf4754c4 100644 --- a/src/transformers/models/fast/modeling_fast.py +++ b/src/transformers/models/fast/modeling_fast.py @@ -1,15 +1,12 @@ -import math -from collections import OrderedDict from dataclasses import dataclass -from typing import Optional, Dict +from typing import Dict, Optional -import cv2 import numpy as np import torch import torch.nn as nn import torch.nn.functional as F -from transformers import PreTrainedModel, FastConfig +from transformers import FastConfig, PreTrainedModel from transformers.utils import ModelOutput @@ -41,7 +38,7 @@ def build_activation(act_func, inplace=True): class My2DLayer(nn.Module): def __init__( - self, in_channels, out_channels, use_bn=True, act_func="relu", dropout_rate=0, ops_order="weight_bn_act" + self, in_channels, out_channels, use_bn=True, act_func="relu", dropout_rate=0, ops_order="weight_bn_act" ): super(My2DLayer, self).__init__() self.in_channels = in_channels @@ -104,7 +101,7 @@ def weight_op(self): def forward(self, x): for key, module in self._modules.items(): - if key == 'bn' and not self.training: + if key == "bn" and not self.training: continue x = module(x) return x @@ -138,21 +135,20 
@@ def is_zero_layer(): class ConvLayer(nn.Module): def __init__( - self, - in_channels, - out_channels, - kernel_size=3, - stride=1, - dilation=1, - groups=1, - bias=False, - has_shuffle=False, - use_bn=True, - act_func="relu", - dropout_rate=0, - use_act=True + self, + in_channels, + out_channels, + kernel_size=3, + stride=1, + dilation=1, + groups=1, + bias=False, + has_shuffle=False, + use_bn=True, + act_func="relu", + dropout_rate=0, + use_act=True, ): - super().__init__() self.kernel_size = kernel_size @@ -192,14 +188,14 @@ def __init__( def forward(self, x): if self.training: - if hasattr(self, 'fused_conv'): - delattr(self, 'fused_conv') + if hasattr(self, "fused_conv"): + delattr(self, "fused_conv") x = self.conv(x) x = self.bn(x) return self.act(x) else: - if not hasattr(self, 'fused_conv'): - setattr(self, 'fused_conv', self.fuse_conv_bn(self.conv, self.bn)) + if not hasattr(self, "fused_conv"): + setattr(self, "fused_conv", self.fuse_conv_bn(self.conv, self.bn)) x = self.fused_conv(x) if self.act is not None: x = self.act(x) @@ -207,18 +203,15 @@ def forward(self, x): def fuse_conv_bn(self, conv, bn): """During inference, the functionary of batch norm layers is turned off but - only the mean and var alone channels are used, which exposes the chance to - fuse it with the preceding conv layers to save computations and simplify - network structures.""" + only the mean and var alone channels are used, which exposes the chance to fuse it with the preceding conv + layers to save computations and simplify network structures.""" if isinstance(bn, nn.Identity): return conv conv_w = conv.weight - conv_b = conv.bias if conv.bias is not None else torch.zeros_like( - bn.running_mean) + conv_b = conv.bias if conv.bias is not None else torch.zeros_like(bn.running_mean) factor = bn.weight / torch.sqrt(bn.running_var + bn.eps) - conv.weight = nn.Parameter(conv_w * - factor.reshape([conv.out_channels, 1, 1, 1])) + conv.weight = nn.Parameter(conv_w * factor.reshape([conv.out_channels, 1, 1, 1])) conv.bias = nn.Parameter((conv_b - bn.running_mean) * factor + bn.bias) return conv @@ -290,8 +283,8 @@ def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, def forward(self, input): if self.training: - if hasattr(self, 'fused_conv'): - self.__delattr__('fused_conv') + if hasattr(self, "fused_conv"): + self.__delattr__("fused_conv") main_outputs = self.main_conv(input) main_outputs = self.main_bn(main_outputs) @@ -314,7 +307,7 @@ def forward(self, input): return self.nonlinearity(main_outputs + vertical_outputs + horizontal_outputs + id_out) else: - if not hasattr(self, 'fused_conv'): + if not hasattr(self, "fused_conv"): self.prepare_for_eval() return self.nonlinearity(self.fused_conv(input)) @@ -375,11 +368,16 @@ def _pad_to_mxn_tensor(self, kernel): def prepare_for_eval(self): kernel, bias = self.get_equivalent_kernel_bias() - self.fused_conv = nn.Conv2d(in_channels=self.main_conv.in_channels, - out_channels=self.main_conv.out_channels, - kernel_size=self.main_conv.kernel_size, stride=self.main_conv.stride, - padding=self.main_conv.padding, dilation=self.main_conv.dilation, - groups=self.main_conv.groups, bias=True) + self.fused_conv = nn.Conv2d( + in_channels=self.main_conv.in_channels, + out_channels=self.main_conv.out_channels, + kernel_size=self.main_conv.kernel_size, + stride=self.main_conv.stride, + padding=self.main_conv.padding, + dilation=self.main_conv.dilation, + groups=self.main_conv.groups, + bias=True, + ) self.fused_conv.weight.data = kernel 
self.fused_conv.bias.data = bias for para in self.fused_conv.parameters(): @@ -423,48 +421,48 @@ def __init__(self, config): self.first_conv.apply(self._init_weights) stage1 = [] for stage_config in zip( - config.backbone_stage1_in_channels, - config.backbone_stage1_out_channels, - config.backbone_stage1_kernel_size, - config.backbone_stage1_stride, - config.backbone_stage1_dilation, - config.backbone_stage1_groups, + config.backbone_stage1_in_channels, + config.backbone_stage1_out_channels, + config.backbone_stage1_kernel_size, + config.backbone_stage1_stride, + config.backbone_stage1_dilation, + config.backbone_stage1_groups, ): stage1.append(RepConvLayer(*stage_config)) self.stage1 = nn.ModuleList(stage1) stage2 = [] for stage_config in zip( - config.backbone_stage2_in_channels, - config.backbone_stage2_out_channels, - config.backbone_stage2_kernel_size, - config.backbone_stage2_stride, - config.backbone_stage2_dilation, - config.backbone_stage2_groups, + config.backbone_stage2_in_channels, + config.backbone_stage2_out_channels, + config.backbone_stage2_kernel_size, + config.backbone_stage2_stride, + config.backbone_stage2_dilation, + config.backbone_stage2_groups, ): stage2.append(RepConvLayer(*stage_config)) self.stage2 = nn.ModuleList(stage2) stage3 = [] for stage_config in zip( - config.backbone_stage3_in_channels, - config.backbone_stage3_out_channels, - config.backbone_stage3_kernel_size, - config.backbone_stage3_stride, - config.backbone_stage3_dilation, - config.backbone_stage3_groups, + config.backbone_stage3_in_channels, + config.backbone_stage3_out_channels, + config.backbone_stage3_kernel_size, + config.backbone_stage3_stride, + config.backbone_stage3_dilation, + config.backbone_stage3_groups, ): stage3.append(RepConvLayer(*stage_config)) self.stage3 = nn.ModuleList(stage3) stage4 = [] for stage_config in zip( - config.backbone_stage4_in_channels, - config.backbone_stage4_out_channels, - config.backbone_stage4_kernel_size, - config.backbone_stage4_stride, - config.backbone_stage4_dilation, - config.backbone_stage4_groups, + config.backbone_stage4_in_channels, + config.backbone_stage4_out_channels, + config.backbone_stage4_kernel_size, + config.backbone_stage4_stride, + config.backbone_stage4_dilation, + config.backbone_stage4_groups, ): stage4.append(RepConvLayer(*stage_config)) self.stage4 = nn.ModuleList(stage4) @@ -695,8 +693,9 @@ def _max_pooling(self, x, scale=1): # return bboxes, scores -def emb_loss(emb, instance, kernel, training_mask, feature_dim=4, delta_v=0.5, delta_d=1.5, weights=(1.0, 1.0), - bg_sample=False): +def emb_loss( + emb, instance, kernel, training_mask, feature_dim=4, delta_v=0.5, delta_d=1.5, weights=(1.0, 1.0), bg_sample=False +): training_mask = (training_mask > 0.5).long() kernel = (kernel > 0.5).long() instance = instance * training_mask @@ -722,7 +721,7 @@ def emb_loss(emb, instance, kernel, training_mask, feature_dim=4, delta_v=0.5, d continue ind = instance == lb emb_ = emb[:, ind] - dist = (emb_ - emb_mean[:, i:i + 1]).norm(p=2, dim=0) + dist = (emb_ - emb_mean[:, i : i + 1]).norm(p=2, dim=0) dist = F.relu(dist - delta_v) ** 2 l_agg[i] = torch.mean(torch.log(dist + 1.0)) l_agg = torch.mean(l_agg[1:]) @@ -754,7 +753,7 @@ def emb_loss(emb, instance, kernel, training_mask, feature_dim=4, delta_v=0.5, d for i, lb in enumerate(unique_labels): if lb == 0: continue - dist = (emb_bg - emb_mean[:, i:i + 1]).norm(p=2, dim=0) + dist = (emb_bg - emb_mean[:, i : i + 1]).norm(p=2, dim=0) dist = F.relu(2 * delta_d - dist) ** 2 l_dis_bg = 
torch.mean(torch.log(dist + 1.0), 0, keepdim=True) l_dis.append(l_dis_bg) @@ -894,7 +893,6 @@ class FASTForImageCaptioningOutput(ModelOutput): class FASTForImageCaptioning(FastPreTrainedModel): - def __init__(self, config): super().__init__(config) self.backbone = TextNet(config=config) @@ -902,10 +900,12 @@ def __init__(self, config): self.det_head = FASTHead(config=config) self.loss_bg = config.loss_bg - self.pooling_1s = nn.MaxPool2d(kernel_size=config.head_pooling_size, stride=1, - padding=(config.head_pooling_size - 1) // 2) - self.pooling_2s = nn.MaxPool2d(kernel_size=config.head_pooling_size // 2 + 1, stride=1, - padding=(config.head_pooling_size // 2) // 2) + self.pooling_1s = nn.MaxPool2d( + kernel_size=config.head_pooling_size, stride=1, padding=(config.head_pooling_size - 1) // 2 + ) + self.pooling_2s = nn.MaxPool2d( + kernel_size=config.head_pooling_size // 2 + 1, stride=1, padding=(config.head_pooling_size // 2) // 2 + ) self.post_init() def _upsample(self, x, size, scale=1): @@ -920,10 +920,10 @@ def _max_pooling(self, x, scale=1): return x def loss(self, hidden, labels): - gt_texts = labels['gt_texts'] - gt_kernels = labels['gt_kernels'] - training_masks = labels['training_masks'] - gt_instances = labels['gt_instances'] + gt_texts = labels["gt_texts"] + gt_kernels = labels["gt_kernels"] + training_masks = labels["training_masks"] + gt_instances = labels["gt_instances"] kernels = hidden[:, 0, :, :] # 4*640*640 texts = self._max_pooling(kernels, scale=1) # 4*640*640 @@ -940,12 +940,13 @@ def loss(self, hidden, labels): return torch.mean(loss_text) + torch.mean(loss_kernel) + torch.mean(loss_emb) - def forward(self, - pixel_values: torch.FloatTensor, - output_hidden_states: Optional[bool] = True, - return_dict: Optional[bool] = None, - labels: Dict = None - ): + def forward( + self, + pixel_values: torch.FloatTensor, + output_hidden_states: Optional[bool] = True, + return_dict: Optional[bool] = None, + labels: Dict = None, + ): # outputs = {} return_dict = return_dict if return_dict is not None else self.config.use_return_dict f = self.backbone(pixel_values) diff --git a/tests/models/fast/test_modeling_fast.py b/tests/models/fast/test_modeling_fast.py index ee84d0e857a8..d1a2075a199b 100644 --- a/tests/models/fast/test_modeling_fast.py +++ b/tests/models/fast/test_modeling_fast.py @@ -18,23 +18,24 @@ import requests from PIL import Image -from parameterized import parameterized from transformers import ( FastConfig, is_torch_available, - set_seed, ) from transformers.models.fast.image_processing_fast import FastImageProcessor -from transformers.testing_utils import CaptureLogger, require_bitsandbytes, require_torch, slow, tooslow, torch_device, \ - require_vision -from transformers.utils import logging as transformers_logging +from transformers.testing_utils import ( + require_torch, + require_vision, + torch_device, +) from ...generation.test_utils import GenerationTesterMixin from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask, floats_tensor +from ...test_modeling_common import ModelTesterMixin, floats_tensor from ...test_pipeline_mixin import PipelineTesterMixin + if is_torch_available(): import torch @@ -45,74 +46,74 @@ class FastModelTester: def __init__( - self, - parent, - backbone_kernel_size=3, - backbone_stride=2, - backbone_dilation=1, - backbone_groups=1, - backbone_bias=False, - backbone_has_shuffle=False, - backbone_in_channels=3, - backbone_out_channels=64, - 
backbone_use_bn=True, - backbone_act_func="relu", - backbone_dropout_rate=0, - backbone_ops_order="weight_bn_act", - backbone_stage1_in_channels=[64], - backbone_stage1_out_channels=[64], - backbone_stage1_kernel_size=[[3, 3]], - backbone_stage1_stride=[1], - backbone_stage1_dilation=[1], - backbone_stage1_groups=[1], - backbone_stage2_in_channels=[64], - backbone_stage2_out_channels=[128], - backbone_stage2_kernel_size=[[3, 1]], - backbone_stage2_stride=[2], - backbone_stage2_dilation=[1], - backbone_stage2_groups=[1], - backbone_stage3_in_channels=[128], - backbone_stage3_out_channels=[256], - backbone_stage3_kernel_size=[[1, 3]], - backbone_stage3_stride=[2], - backbone_stage3_dilation=[1], - backbone_stage3_groups=[1], - backbone_stage4_in_channels=[256], - backbone_stage4_out_channels=[512], - backbone_stage4_kernel_size=[[3, 3]], - backbone_stage4_stride=[2], - backbone_stage4_dilation=[1], - backbone_stage4_groups=[1], - neck_in_channels=[64], - neck_out_channels=[128], - neck_kernel_size=[[3, 3]], - neck_stride=[1], - neck_dilation=[1], - neck_groups=[1], - head_pooling_size=9, - head_dropout_ratio=0.1, - head_conv_in_channels=128, - head_conv_out_channels=4, - head_conv_kernel_size=[3, 3], - head_conv_stride=1, - head_conv_dilation=1, - head_conv_groups=1, - head_final_kernel_size=1, - head_final_stride=1, - head_final_dilation=1, - head_final_groups=1, - head_final_bias=False, - head_final_has_shuffle=False, - head_final_in_channels=4, - head_final_out_channels=5, - head_final_use_bn=False, - head_final_act_func=None, - head_final_dropout_rate=0, - head_final_ops_order="weight", - batch_size=3, - num_channels=3, - image_size=500, - is_training=True, + self, + parent, + backbone_kernel_size=3, + backbone_stride=2, + backbone_dilation=1, + backbone_groups=1, + backbone_bias=False, + backbone_has_shuffle=False, + backbone_in_channels=3, + backbone_out_channels=64, + backbone_use_bn=True, + backbone_act_func="relu", + backbone_dropout_rate=0, + backbone_ops_order="weight_bn_act", + backbone_stage1_in_channels=[64], + backbone_stage1_out_channels=[64], + backbone_stage1_kernel_size=[[3, 3]], + backbone_stage1_stride=[1], + backbone_stage1_dilation=[1], + backbone_stage1_groups=[1], + backbone_stage2_in_channels=[64], + backbone_stage2_out_channels=[128], + backbone_stage2_kernel_size=[[3, 1]], + backbone_stage2_stride=[2], + backbone_stage2_dilation=[1], + backbone_stage2_groups=[1], + backbone_stage3_in_channels=[128], + backbone_stage3_out_channels=[256], + backbone_stage3_kernel_size=[[1, 3]], + backbone_stage3_stride=[2], + backbone_stage3_dilation=[1], + backbone_stage3_groups=[1], + backbone_stage4_in_channels=[256], + backbone_stage4_out_channels=[512], + backbone_stage4_kernel_size=[[3, 3]], + backbone_stage4_stride=[2], + backbone_stage4_dilation=[1], + backbone_stage4_groups=[1], + neck_in_channels=[64], + neck_out_channels=[128], + neck_kernel_size=[[3, 3]], + neck_stride=[1], + neck_dilation=[1], + neck_groups=[1], + head_pooling_size=9, + head_dropout_ratio=0.1, + head_conv_in_channels=128, + head_conv_out_channels=4, + head_conv_kernel_size=[3, 3], + head_conv_stride=1, + head_conv_dilation=1, + head_conv_groups=1, + head_final_kernel_size=1, + head_final_stride=1, + head_final_dilation=1, + head_final_groups=1, + head_final_bias=False, + head_final_has_shuffle=False, + head_final_in_channels=4, + head_final_out_channels=5, + head_final_use_bn=False, + head_final_act_func=None, + head_final_dropout_rate=0, + head_final_ops_order="weight", + batch_size=3, + 
num_channels=3, + image_size=500, + is_training=True, ): self.parent = parent self.backbone_kernel_size = backbone_kernel_size @@ -193,10 +194,6 @@ def __init__( def prepare_config_and_inputs(self): pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) - pixel_values_meta = { - "org_img_size": (500, 500), - "img_size": (500, 500) - } # labels = None # if self.use_labels: # labels = ids_tensor([self.batch_size], self.num_labels) @@ -275,7 +272,7 @@ def create_and_check_model(self, config, input): model = FASTForImageCaptioning(config=config) model.to(torch_device) model.eval() - result = model(pixel_values=input['pixel_values']) + result = model(pixel_values=input["pixel_values"]) self.parent.assertEqual(result.hidden_states.shape, (self.batch_size, 5, 125, 125)) def prepare_config_and_inputs_for_common(self): @@ -286,13 +283,7 @@ def prepare_config_and_inputs_for_common(self): @require_torch class FastModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): - all_model_classes = ( - ( - FASTForImageCaptioning, - ) - if is_torch_available() - else () - ) + all_model_classes = (FASTForImageCaptioning,) if is_torch_available() else () pipeline_model_mapping = {} test_headmasking = False @@ -356,13 +347,16 @@ def test_forward_signature(self): def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): to_return = inputs_dict.copy() - gt_instances = torch.zeros(self.model_tester.batch_size, self.model_tester.image_size, - self.model_tester.image_size) - gt_kernels = torch.zeros(self.model_tester.batch_size, self.model_tester.image_size, - self.model_tester.image_size) + gt_instances = torch.zeros( + self.model_tester.batch_size, self.model_tester.image_size, self.model_tester.image_size + ) + gt_kernels = torch.zeros( + self.model_tester.batch_size, self.model_tester.image_size, self.model_tester.image_size + ) gt_text = torch.zeros(self.model_tester.batch_size, self.model_tester.image_size, self.model_tester.image_size) - training_masks = torch.ones(self.model_tester.batch_size, self.model_tester.image_size, - self.model_tester.image_size) + training_masks = torch.ones( + self.model_tester.batch_size, self.model_tester.image_size, self.model_tester.image_size + ) labels = {} labels["gt_instances"] = gt_instances labels["gt_kernels"] = gt_kernels @@ -381,7 +375,7 @@ def test_model_is_small(self): model = model_class(config) num_params = model.num_parameters() assert ( - num_params < 3000000 + num_params < 3000000 ), f"{model_class} is too big for the common tests ({num_params})! It should have 1M max." 
# def prepare_image(): @@ -407,14 +401,12 @@ def prepare_image(): image = prepare_image() input = image_processor(image, return_tensor="np") - output = model(pixel_values=torch.tensor(input['pixel_values'])) - target_sizes = [(image.shape[1], image.shape[2]) for image in input['pixel_values']] + output = model(pixel_values=torch.tensor(input["pixel_values"])) + target_sizes = [(image.shape[1], image.shape[2]) for image in input["pixel_values"]] final_out = image_processor.get_results(output, target_sizes) - assert ( - final_out[0]['bboxes'][0] == [224, 120, 246, 120, 246, 134, 224, 134] - ) - assert round(float(final_out[0]['scores'][0]), 5) == 0.95541 + assert final_out[0]["bboxes"][0] == [224, 120, 246, 120, 246, 134, 224, 134] + assert round(float(final_out[0]["scores"][0]), 5) == 0.95541 def test_inference_fast_base_800_total_text_ic17mlt_model(self): model = FASTForImageCaptioning.from_pretrained("Raghavan/fast_base_tt_800_finetune_ic17mlt") @@ -429,11 +421,9 @@ def prepare_image(): image = prepare_image() input = image_processor(image, return_tensor="np") - output = model(pixel_values=torch.tensor(input['pixel_values'])) - target_sizes = [(image.shape[1], image.shape[2]) for image in input['pixel_values']] + output = model(pixel_values=torch.tensor(input["pixel_values"])) + target_sizes = [(image.shape[1], image.shape[2]) for image in input["pixel_values"]] final_out = image_processor.get_results(output, target_sizes) - assert ( - final_out[0]['bboxes'][0][:10] == [484, 175, 484, 178, 483, 179, 452, 179, 452, 182] - ) - assert round(float(final_out[0]['scores'][0]), 5) == 0.92356 + assert final_out[0]["bboxes"][0][:10] == [484, 175, 484, 178, 483, 179, 452, 179, 452, 182] + assert round(float(final_out[0]["scores"][0]), 5) == 0.92356 From 3fef2616e950889ca7e6ab22f4db68fd7d8ade51 Mon Sep 17 00:00:00 2001 From: raghavanone Date: Tue, 31 Oct 2023 19:37:06 +0530 Subject: [PATCH 010/152] Add fast model to init --- src/transformers/__init__.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 280e824efb89..4941d724455d 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -424,7 +424,6 @@ "models.ernie_m": ["ERNIE_M_PRETRAINED_CONFIG_ARCHIVE_MAP", "ErnieMConfig"], "models.esm": ["ESM_PRETRAINED_CONFIG_ARCHIVE_MAP", "EsmConfig", "EsmTokenizer"], "models.falcon": ["FALCON_PRETRAINED_CONFIG_ARCHIVE_MAP", "FalconConfig"], -<<<<<<< HEAD "models.fastspeech2_conformer": [ "FASTSPEECH2_CONFORMER_HIFIGAN_PRETRAINED_CONFIG_ARCHIVE_MAP", "FASTSPEECH2_CONFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", @@ -434,9 +433,6 @@ "FastSpeech2ConformerTokenizer", "FastSpeech2ConformerWithHifiGanConfig", ], -======= - "models.fast": ["FastConfig"], ->>>>>>> 67fec5b40 (Refactor modeling and add tests) "models.flaubert": ["FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "FlaubertConfig", "FlaubertTokenizer"], "models.flava": [ "FLAVA_PRETRAINED_CONFIG_ARCHIVE_MAP", @@ -5117,7 +5113,6 @@ from .models.ernie_m import ERNIE_M_PRETRAINED_CONFIG_ARCHIVE_MAP, ErnieMConfig from .models.esm import ESM_PRETRAINED_CONFIG_ARCHIVE_MAP, EsmConfig, EsmTokenizer from .models.falcon import FALCON_PRETRAINED_CONFIG_ARCHIVE_MAP, FalconConfig -<<<<<<< HEAD from .models.fastspeech2_conformer import ( FASTSPEECH2_CONFORMER_HIFIGAN_PRETRAINED_CONFIG_ARCHIVE_MAP, FASTSPEECH2_CONFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, @@ -5127,9 +5122,6 @@ FastSpeech2ConformerTokenizer, FastSpeech2ConformerWithHifiGanConfig, ) -======= - from .models.fast import 
FastConfig ->>>>>>> 67fec5b40 (Refactor modeling and add tests) from .models.flaubert import FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, FlaubertConfig, FlaubertTokenizer from .models.flava import ( FLAVA_PRETRAINED_CONFIG_ARCHIVE_MAP, @@ -6706,17 +6698,12 @@ FalconModel, FalconPreTrainedModel, ) -<<<<<<< HEAD from .models.fastspeech2_conformer import ( FASTSPEECH2_CONFORMER_PRETRAINED_MODEL_ARCHIVE_LIST, FastSpeech2ConformerHifiGan, FastSpeech2ConformerModel, FastSpeech2ConformerPreTrainedModel, FastSpeech2ConformerWithHifiGan, -======= - from .models.fast import ( - FASTForImageCaptioning, ->>>>>>> 67fec5b40 (Refactor modeling and add tests) ) from .models.flaubert import ( FLAUBERT_PRETRAINED_MODEL_ARCHIVE_LIST, From 597abe1da92632e471968b3ea60f03725d444a73 Mon Sep 17 00:00:00 2001 From: raghavanone Date: Tue, 31 Oct 2023 19:53:17 +0530 Subject: [PATCH 011/152] Add fast model in docs and other places --- docs/source/en/model_doc/fast.md | 39 +++++++++++++++++++ .../models/auto/image_processing_auto.py | 1 + 2 files changed, 40 insertions(+) create mode 100644 docs/source/en/model_doc/fast.md diff --git a/docs/source/en/model_doc/fast.md b/docs/source/en/model_doc/fast.md new file mode 100644 index 000000000000..ddcc5e1148f8 --- /dev/null +++ b/docs/source/en/model_doc/fast.md @@ -0,0 +1,39 @@ + + +# Fast + +## Overview + +Fast model proposes an accurate and efficient scene text detection framework, termed FAST (i.e., faster +arbitrarily-shaped text detector). + +FAST has two new designs. (1) We design a minimalist kernel representation (only has 1-channel output) to model text +with arbitrary shape, as well as a GPU-parallel post-processing to efficiently assemble text lines with a negligible +time overhead. (2) We search the network architecture tailored for text detection, leading to more powerful features +than most networks that are searched for image classification + +## FastConfig + +[[autodoc]] FastConfig + +## Fast + +[[autodoc]] FASTForImageCaptioningOutput + - forward + + + diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index e41889c5ef81..55a128fe5519 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -62,6 +62,7 @@ ("dpt", "DPTImageProcessor"), ("efficientformer", "EfficientFormerImageProcessor"), ("efficientnet", "EfficientNetImageProcessor"), + ("fast", "FastImageProcessor"), ("flava", "FlavaImageProcessor"), ("focalnet", "BitImageProcessor"), ("fuyu", "FuyuImageProcessor"), From c3b43e739c8601ed55a2f7f0903405ebc02c14d0 Mon Sep 17 00:00:00 2001 From: raghavanone Date: Tue, 31 Oct 2023 20:13:48 +0530 Subject: [PATCH 012/152] Fix import of cv2 --- .../models/fast/image_processing_fast.py | 165 +++++++++--------- 1 file changed, 83 insertions(+), 82 deletions(-) diff --git a/src/transformers/models/fast/image_processing_fast.py b/src/transformers/models/fast/image_processing_fast.py index 812c617f073c..2f8ad3cb4e57 100644 --- a/src/transformers/models/fast/image_processing_fast.py +++ b/src/transformers/models/fast/image_processing_fast.py @@ -16,8 +16,10 @@ import math import warnings from typing import Any, Dict, List, Optional, Tuple, Union +from ...utils.import_utils import is_cv2_available -import cv2 +if is_cv2_available(): + import cv2 import numpy as np import torch.nn as nn import torch.nn.functional as F @@ -41,10 +43,9 @@ is_torch_available, is_torch_tensor, is_vision_available, - logging, + logging, 
is_cv2_available, ) - if is_vision_available(): import PIL @@ -100,23 +101,23 @@ class FastImageProcessor(BaseImageProcessor): model_input_names = ["pixel_values"] def __init__( - self, - do_resize: bool = True, - size: Dict[str, int] = None, - resample: PILImageResampling = PILImageResampling.BICUBIC, - do_center_crop: bool = False, - crop_size: Dict[str, int] = None, - rescale_factor: Union[int, float] = 1 / 255, - do_rescale: bool = True, - do_normalize: bool = True, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - do_reduce_labels: bool = False, - min_area: int = 10, - min_score: float = 0.88, - bbox_type: str = "rect", - pooling_size: int = 9, - **kwargs, + self, + do_resize: bool = True, + size: Dict[str, int] = None, + resample: PILImageResampling = PILImageResampling.BICUBIC, + do_center_crop: bool = False, + crop_size: Dict[str, int] = None, + rescale_factor: Union[int, float] = 1 / 255, + do_rescale: bool = True, + do_normalize: bool = True, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_reduce_labels: bool = False, + min_area: int = 10, + min_score: float = 0.88, + bbox_type: str = "rect", + pooling_size: int = 9, + **kwargs, ) -> None: if "reduce_labels" in kwargs: warnings.warn( @@ -158,13 +159,13 @@ def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs): return super().from_dict(image_processor_dict, **kwargs) def resize( - self, - image: np.ndarray, - size: Dict[str, int], - resample: PILImageResampling = PILImageResampling.BICUBIC, - data_format: Optional[Union[str, ChannelDimension]] = None, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs, + self, + image: np.ndarray, + size: Dict[str, int], + resample: PILImageResampling = PILImageResampling.BICUBIC, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, ) -> np.ndarray: """ Resize an image to (size["height"], size["width"]). 
@@ -202,20 +203,20 @@ def reduce_label(self, label: ImageInput) -> np.ndarray: return label def _preprocess( - self, - image: ImageInput, - do_reduce_labels: bool = None, - do_resize: bool = None, - size: Dict[str, int] = None, - resample: PILImageResampling = None, - do_center_crop: bool = None, - crop_size: Dict[str, int] = None, - do_rescale: bool = None, - rescale_factor: float = None, - do_normalize: bool = None, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - input_data_format: Optional[Union[str, ChannelDimension]] = None, + self, + image: ImageInput, + do_reduce_labels: bool = None, + do_resize: bool = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + do_center_crop: bool = None, + crop_size: Dict[str, int] = None, + do_rescale: bool = None, + rescale_factor: float = None, + do_normalize: bool = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, ): if do_reduce_labels: image = self.reduce_label(image) @@ -235,20 +236,20 @@ def _preprocess( return image def _preprocess_image( - self, - image: ImageInput, - do_resize: bool = None, - size: Dict[str, int] = None, - resample: PILImageResampling = None, - do_center_crop: bool = None, - crop_size: Dict[str, int] = None, - do_rescale: bool = None, - rescale_factor: float = None, - do_normalize: bool = None, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - data_format: Optional[Union[str, ChannelDimension]] = None, - input_data_format: Optional[Union[str, ChannelDimension]] = None, + self, + image: ImageInput, + do_resize: bool = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + do_center_crop: bool = None, + crop_size: Dict[str, int] = None, + do_rescale: bool = None, + rescale_factor: float = None, + do_normalize: bool = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, ) -> np.ndarray: """Preprocesses a single image.""" # All transformations expect numpy arrays. @@ -280,15 +281,15 @@ def _preprocess_image( return image def _preprocess_segmentation_map( - self, - segmentation_map: ImageInput, - do_resize: bool = None, - size: Dict[str, int] = None, - resample: PILImageResampling = None, - do_center_crop: bool = None, - crop_size: Dict[str, int] = None, - do_reduce_labels: bool = None, - input_data_format: Optional[Union[str, ChannelDimension]] = None, + self, + segmentation_map: ImageInput, + do_resize: bool = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + do_center_crop: bool = None, + crop_size: Dict[str, int] = None, + do_reduce_labels: bool = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, ): """Preprocesses a single segmentation map.""" # All transformations expect numpy arrays. 
@@ -326,24 +327,24 @@ def __call__(self, images, segmentation_maps=None, **kwargs): return super().__call__(images, segmentation_maps=segmentation_maps, **kwargs) def preprocess( - self, - images: ImageInput, - segmentation_maps: Optional[ImageInput] = None, - do_resize: bool = None, - size: Dict[str, int] = None, - resample: PILImageResampling = None, - do_center_crop: bool = None, - crop_size: Dict[str, int] = None, - do_rescale: bool = None, - rescale_factor: float = None, - do_normalize: bool = None, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - do_reduce_labels: Optional[bool] = None, - return_tensors: Optional[Union[str, TensorType]] = None, - data_format: ChannelDimension = ChannelDimension.FIRST, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs, + self, + images: ImageInput, + segmentation_maps: Optional[ImageInput] = None, + do_resize: bool = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + do_center_crop: bool = None, + crop_size: Dict[str, int] = None, + do_rescale: bool = None, + rescale_factor: float = None, + do_normalize: bool = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_reduce_labels: Optional[bool] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: ChannelDimension = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, ) -> PIL.Image.Image: """ Preprocess an image or batch of images. From 4903a6914174cbddd62984ca8ecdb4dbfde64c8b Mon Sep 17 00:00:00 2001 From: raghavanone Date: Tue, 31 Oct 2023 20:18:15 +0530 Subject: [PATCH 013/152] Rename image processing method --- .../models/fast/image_processing_fast.py | 50 ++----------------- tests/models/fast/test_modeling_fast.py | 9 ++-- 2 files changed, 9 insertions(+), 50 deletions(-) diff --git a/src/transformers/models/fast/image_processing_fast.py b/src/transformers/models/fast/image_processing_fast.py index 2f8ad3cb4e57..04cb89e1d5cd 100644 --- a/src/transformers/models/fast/image_processing_fast.py +++ b/src/transformers/models/fast/image_processing_fast.py @@ -43,7 +43,7 @@ is_torch_available, is_torch_tensor, is_vision_available, - logging, is_cv2_available, + logging, ) if is_vision_available(): @@ -57,7 +57,7 @@ class FastImageProcessor(BaseImageProcessor): r""" - Constructs a BEiT image processor. + Constructs a Fast image processor. Args: do_resize (`bool`, *optional*, defaults to `True`): @@ -151,7 +151,7 @@ def __init__( def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs): """ Overrides the `from_dict` method from the base class to make sure `reduce_labels` is updated if image processor - is created using from_dict and kwargs e.g. `BeitImageProcessor.from_pretrained(checkpoint, reduce_labels=True)` + is created using from_dict and kwargs e.g. `FastImageProcessor.from_pretrained(checkpoint, reduce_labels=True)` """ image_processor_dict = image_processor_dict.copy() if "reduce_labels" in kwargs: @@ -478,48 +478,6 @@ def preprocess( return BatchFeature(data=data, tensor_type=return_tensors) - def post_process_semantic_segmentation(self, outputs, target_sizes: List[Tuple] = None): - """ - Converts the output of [`BeitForSemanticSegmentation`] into semantic segmentation maps. Only supports PyTorch. - - Args: - outputs ([`BeitForSemanticSegmentation`]): - Raw outputs of the model. 
- target_sizes (`List[Tuple]` of length `batch_size`, *optional*): - List of tuples corresponding to the requested final size (height, width) of each prediction. If unset, - predictions will not be resized. - - Returns: - semantic_segmentation: `List[torch.Tensor]` of length `batch_size`, where each item is a semantic - segmentation map of shape (height, width) corresponding to the target_sizes entry (if `target_sizes` is - specified). Each entry of each `torch.Tensor` correspond to a semantic class id. - """ - # TODO: add support for other frameworks - logits = outputs.logits - - # Resize logits and compute semantic segmentation maps - if target_sizes is not None: - if len(logits) != len(target_sizes): - raise ValueError( - "Make sure that you pass in as many target sizes as the batch dimension of the logits" - ) - - if is_torch_tensor(target_sizes): - target_sizes = target_sizes.numpy() - - semantic_segmentation = [] - - for idx in range(len(logits)): - resized_logits = torch.nn.functional.interpolate( - logits[idx].unsqueeze(dim=0), size=target_sizes[idx], mode="bilinear", align_corners=False - ) - semantic_map = resized_logits[0].argmax(dim=0) - semantic_segmentation.append(semantic_map) - else: - semantic_segmentation = logits.argmax(dim=1) - semantic_segmentation = [semantic_segmentation[i] for i in range(semantic_segmentation.shape[0])] - - return semantic_segmentation def _max_pooling(self, x, scale=1): if scale == 1: @@ -530,7 +488,7 @@ def _max_pooling(self, x, scale=1): ) return x - def get_results(self, output, target_sizes): + def post_process_text_detection(self, output, target_sizes): scale = 2 img_size = (self.size["height"], self.size["width"]) out = output["hidden_states"] diff --git a/tests/models/fast/test_modeling_fast.py b/tests/models/fast/test_modeling_fast.py index d1a2075a199b..7acd27b6e9e8 100644 --- a/tests/models/fast/test_modeling_fast.py +++ b/tests/models/fast/test_modeling_fast.py @@ -27,7 +27,7 @@ from transformers.testing_utils import ( require_torch, require_vision, - torch_device, + torch_device, slow, ) from ...generation.test_utils import GenerationTesterMixin @@ -387,7 +387,7 @@ def test_model_is_small(self): @require_torch @require_vision class FastModelIntegrationTest(unittest.TestCase): - # @slow + @slow def test_inference_fast_tiny_ic17mlt_model(self): model = FASTForImageCaptioning.from_pretrained("Raghavan/ic17mlt_Fast_T") @@ -403,11 +403,12 @@ def prepare_image(): output = model(pixel_values=torch.tensor(input["pixel_values"])) target_sizes = [(image.shape[1], image.shape[2]) for image in input["pixel_values"]] - final_out = image_processor.get_results(output, target_sizes) + final_out = image_processor.post_process_text_detection(output, target_sizes) assert final_out[0]["bboxes"][0] == [224, 120, 246, 120, 246, 134, 224, 134] assert round(float(final_out[0]["scores"][0]), 5) == 0.95541 + @slow def test_inference_fast_base_800_total_text_ic17mlt_model(self): model = FASTForImageCaptioning.from_pretrained("Raghavan/fast_base_tt_800_finetune_ic17mlt") @@ -423,7 +424,7 @@ def prepare_image(): output = model(pixel_values=torch.tensor(input["pixel_values"])) target_sizes = [(image.shape[1], image.shape[2]) for image in input["pixel_values"]] - final_out = image_processor.get_results(output, target_sizes) + final_out = image_processor.post_process_text_detection(output, target_sizes) assert final_out[0]["bboxes"][0][:10] == [484, 175, 484, 178, 483, 179, 452, 179, 452, 182] assert round(float(final_out[0]["scores"][0]), 5) == 0.92356 From 
c391cf6d1e15be8715709181c3142454baab1fbf Mon Sep 17 00:00:00 2001 From: raghavanone Date: Tue, 31 Oct 2023 20:25:23 +0530 Subject: [PATCH 014/152] Fix build --- .../models/fast/image_processing_fast.py | 165 +++++++++--------- tests/models/fast/test_modeling_fast.py | 3 +- 2 files changed, 85 insertions(+), 83 deletions(-) diff --git a/src/transformers/models/fast/image_processing_fast.py b/src/transformers/models/fast/image_processing_fast.py index 04cb89e1d5cd..1c652128a85e 100644 --- a/src/transformers/models/fast/image_processing_fast.py +++ b/src/transformers/models/fast/image_processing_fast.py @@ -15,9 +15,11 @@ """Image processor class for Beit.""" import math import warnings -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Union + from ...utils.import_utils import is_cv2_available + if is_cv2_available(): import cv2 import numpy as np @@ -41,11 +43,11 @@ IMAGENET_DEFAULT_STD, TensorType, is_torch_available, - is_torch_tensor, is_vision_available, logging, ) + if is_vision_available(): import PIL @@ -101,23 +103,23 @@ class FastImageProcessor(BaseImageProcessor): model_input_names = ["pixel_values"] def __init__( - self, - do_resize: bool = True, - size: Dict[str, int] = None, - resample: PILImageResampling = PILImageResampling.BICUBIC, - do_center_crop: bool = False, - crop_size: Dict[str, int] = None, - rescale_factor: Union[int, float] = 1 / 255, - do_rescale: bool = True, - do_normalize: bool = True, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - do_reduce_labels: bool = False, - min_area: int = 10, - min_score: float = 0.88, - bbox_type: str = "rect", - pooling_size: int = 9, - **kwargs, + self, + do_resize: bool = True, + size: Dict[str, int] = None, + resample: PILImageResampling = PILImageResampling.BICUBIC, + do_center_crop: bool = False, + crop_size: Dict[str, int] = None, + rescale_factor: Union[int, float] = 1 / 255, + do_rescale: bool = True, + do_normalize: bool = True, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_reduce_labels: bool = False, + min_area: int = 10, + min_score: float = 0.88, + bbox_type: str = "rect", + pooling_size: int = 9, + **kwargs, ) -> None: if "reduce_labels" in kwargs: warnings.warn( @@ -159,13 +161,13 @@ def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs): return super().from_dict(image_processor_dict, **kwargs) def resize( - self, - image: np.ndarray, - size: Dict[str, int], - resample: PILImageResampling = PILImageResampling.BICUBIC, - data_format: Optional[Union[str, ChannelDimension]] = None, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs, + self, + image: np.ndarray, + size: Dict[str, int], + resample: PILImageResampling = PILImageResampling.BICUBIC, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, ) -> np.ndarray: """ Resize an image to (size["height"], size["width"]). 
@@ -203,20 +205,20 @@ def reduce_label(self, label: ImageInput) -> np.ndarray: return label def _preprocess( - self, - image: ImageInput, - do_reduce_labels: bool = None, - do_resize: bool = None, - size: Dict[str, int] = None, - resample: PILImageResampling = None, - do_center_crop: bool = None, - crop_size: Dict[str, int] = None, - do_rescale: bool = None, - rescale_factor: float = None, - do_normalize: bool = None, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - input_data_format: Optional[Union[str, ChannelDimension]] = None, + self, + image: ImageInput, + do_reduce_labels: bool = None, + do_resize: bool = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + do_center_crop: bool = None, + crop_size: Dict[str, int] = None, + do_rescale: bool = None, + rescale_factor: float = None, + do_normalize: bool = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, ): if do_reduce_labels: image = self.reduce_label(image) @@ -236,20 +238,20 @@ def _preprocess( return image def _preprocess_image( - self, - image: ImageInput, - do_resize: bool = None, - size: Dict[str, int] = None, - resample: PILImageResampling = None, - do_center_crop: bool = None, - crop_size: Dict[str, int] = None, - do_rescale: bool = None, - rescale_factor: float = None, - do_normalize: bool = None, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - data_format: Optional[Union[str, ChannelDimension]] = None, - input_data_format: Optional[Union[str, ChannelDimension]] = None, + self, + image: ImageInput, + do_resize: bool = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + do_center_crop: bool = None, + crop_size: Dict[str, int] = None, + do_rescale: bool = None, + rescale_factor: float = None, + do_normalize: bool = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, ) -> np.ndarray: """Preprocesses a single image.""" # All transformations expect numpy arrays. @@ -281,15 +283,15 @@ def _preprocess_image( return image def _preprocess_segmentation_map( - self, - segmentation_map: ImageInput, - do_resize: bool = None, - size: Dict[str, int] = None, - resample: PILImageResampling = None, - do_center_crop: bool = None, - crop_size: Dict[str, int] = None, - do_reduce_labels: bool = None, - input_data_format: Optional[Union[str, ChannelDimension]] = None, + self, + segmentation_map: ImageInput, + do_resize: bool = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + do_center_crop: bool = None, + crop_size: Dict[str, int] = None, + do_reduce_labels: bool = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, ): """Preprocesses a single segmentation map.""" # All transformations expect numpy arrays. 
@@ -327,24 +329,24 @@ def __call__(self, images, segmentation_maps=None, **kwargs): return super().__call__(images, segmentation_maps=segmentation_maps, **kwargs) def preprocess( - self, - images: ImageInput, - segmentation_maps: Optional[ImageInput] = None, - do_resize: bool = None, - size: Dict[str, int] = None, - resample: PILImageResampling = None, - do_center_crop: bool = None, - crop_size: Dict[str, int] = None, - do_rescale: bool = None, - rescale_factor: float = None, - do_normalize: bool = None, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - do_reduce_labels: Optional[bool] = None, - return_tensors: Optional[Union[str, TensorType]] = None, - data_format: ChannelDimension = ChannelDimension.FIRST, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs, + self, + images: ImageInput, + segmentation_maps: Optional[ImageInput] = None, + do_resize: bool = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + do_center_crop: bool = None, + crop_size: Dict[str, int] = None, + do_rescale: bool = None, + rescale_factor: float = None, + do_normalize: bool = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_reduce_labels: Optional[bool] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: ChannelDimension = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, ) -> PIL.Image.Image: """ Preprocess an image or batch of images. @@ -478,7 +480,6 @@ def preprocess( return BatchFeature(data=data, tensor_type=return_tensors) - def _max_pooling(self, x, scale=1): if scale == 1: x = nn.MaxPool2d(kernel_size=self.pooling_size, stride=1, padding=(self.pooling_size - 1) // 2)(x) diff --git a/tests/models/fast/test_modeling_fast.py b/tests/models/fast/test_modeling_fast.py index 7acd27b6e9e8..17f09befd7cd 100644 --- a/tests/models/fast/test_modeling_fast.py +++ b/tests/models/fast/test_modeling_fast.py @@ -27,7 +27,8 @@ from transformers.testing_utils import ( require_torch, require_vision, - torch_device, slow, + slow, + torch_device, ) from ...generation.test_utils import GenerationTesterMixin From d3bf608b8701578929041b5d295ce984faf6e82d Mon Sep 17 00:00:00 2001 From: raghavanone Date: Wed, 1 Nov 2023 08:03:39 +0530 Subject: [PATCH 015/152] Fix Build --- docs/source/en/model_doc/fast.md | 13 ++++- src/transformers/__init__.py | 28 ++++------ src/transformers/models/fast/__init__.py | 8 +-- src/transformers/models/fast/modeling_fast.py | 51 +++++++++++++++---- utils/check_repo.py | 27 +++++----- 5 files changed, 80 insertions(+), 47 deletions(-) diff --git a/docs/source/en/model_doc/fast.md b/docs/source/en/model_doc/fast.md index ddcc5e1148f8..3c81109380ae 100644 --- a/docs/source/en/model_doc/fast.md +++ b/docs/source/en/model_doc/fast.md @@ -30,10 +30,19 @@ than most networks that are searched for image classification [[autodoc]] FastConfig -## Fast +## FastImageProcessor + +[[autodoc]] FastImageProcessor + +## FASTForImageCaptioning + +[[autodoc]] FASTForImageCaptioning +- forward + +## FASTForImageCaptioningOutput [[autodoc]] FASTForImageCaptioningOutput - - forward +- forward diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 4941d724455d..82cee836cf05 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -47,10 +47,8 @@ logging, ) - logger = logging.get_logger(__name__) # 
pylint: disable=invalid-name - # Base objects, independent of any specific backend _import_structure = { "audio_utils": [], @@ -1200,7 +1198,6 @@ _import_structure["models.xlnet"].append("XLNetTokenizerFast") _import_structure["tokenization_utils_fast"] = ["PreTrainedTokenizerFast"] - try: if not (is_sentencepiece_available() and is_tokenizers_available()): raise OptionalDependencyNotAvailable() @@ -1310,7 +1307,6 @@ _import_structure["models.vivit"].append("VivitImageProcessor") _import_structure["models.yolos"].extend(["YolosFeatureExtractor", "YolosImageProcessor"]) - # PyTorch-backed objects try: if not is_torch_available(): @@ -4402,14 +4398,13 @@ ] _import_structure["tf_utils"] = [] - try: if not ( - is_librosa_available() - and is_essentia_available() - and is_scipy_available() - and is_torch_available() - and is_pretty_midi_available() + is_librosa_available() + and is_essentia_available() + and is_scipy_available() + and is_torch_available() + and is_pretty_midi_available() ): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: @@ -4427,7 +4422,6 @@ _import_structure["models.pop2piano"].append("Pop2PianoTokenizer") _import_structure["models.pop2piano"].append("Pop2PianoProcessor") - # FLAX-backed objects try: if not is_flax_available(): @@ -4752,7 +4746,6 @@ ] ) - # Direct imports for type-checking if TYPE_CHECKING: # Configuration @@ -8561,11 +8554,11 @@ try: if not ( - is_librosa_available() - and is_essentia_available() - and is_scipy_available() - and is_torch_available() - and is_pretty_midi_available() + is_librosa_available() + and is_essentia_available() + and is_scipy_available() + and is_torch_available() + and is_pretty_midi_available() ): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: @@ -8868,7 +8861,6 @@ extra_objects={"__version__": __version__}, ) - if not is_tf_available() and not is_torch_available() and not is_flax_available(): logger.warning( "None of PyTorch, TensorFlow >= 2.0, or Flax have been found. 
" diff --git a/src/transformers/models/fast/__init__.py b/src/transformers/models/fast/__init__.py index 133d20bc0c52..e7e044c5d2ce 100644 --- a/src/transformers/models/fast/__init__.py +++ b/src/transformers/models/fast/__init__.py @@ -20,9 +20,9 @@ is_torch_available, ) - _import_structure = { "configuration_fast": ["FastConfig"], + "image_processing_fast": ["FastImageProcessor"] } try: @@ -31,18 +31,18 @@ except OptionalDependencyNotAvailable: pass else: - _import_structure["modeling_fast"] = ["FASTForImageCaptioning"] + _import_structure["modeling_fast"] = ["FASTForImageCaptioning","FastPreTrainedModel"] if TYPE_CHECKING: from .configuration_fast import FastConfig - + from .image_processing_fast import FastImageProcessor try: if not is_torch_available(): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: pass else: - from .modeling_fast import FASTForImageCaptioning + from .modeling_fast import FASTForImageCaptioning,FastPreTrainedModel else: diff --git a/src/transformers/models/fast/modeling_fast.py b/src/transformers/models/fast/modeling_fast.py index 4557cf4754c4..4d04b8e56ed5 100644 --- a/src/transformers/models/fast/modeling_fast.py +++ b/src/transformers/models/fast/modeling_fast.py @@ -6,9 +6,36 @@ import torch.nn as nn import torch.nn.functional as F -from transformers import FastConfig, PreTrainedModel +from transformers import FastConfig, PreTrainedModel, add_start_docstrings from transformers.utils import ModelOutput +FAST_FOR_CAPTIONING_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `({0})`): + Indices of input sequence tokens in the vocabulary. + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + [What are input IDs?](../glossary#input-ids) + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See + [`BeitImageProcessor.__call__`] for details. + language_masked_pos (`torch.LongTensor` of shape `({0})`): + language_masked_pos for denoting tokens for captioning + - 1 indicates the token is **Present**, + - 0 indicates the token is **absent**. + text_len (`torch.LongTensor` of shape `({0})`): + Length of text for captioning + past_key_value (`Dict`): + A Dictionary containing the incremental states layerwise + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. A + classification loss is computed (Cross-Entropy) against these labels. 
+""" def get_same_padding(kernel_size): if isinstance(kernel_size, tuple): @@ -401,9 +428,9 @@ def _init_weights(self, module): module.bias.data.zero_() -class TextNet(FastPreTrainedModel): +class TextNet(nn.Module): def __init__(self, config): - super().__init__(config) + super().__init__() self.first_conv = ConvLayer( config.backbone_in_channels, config.backbone_out_channels, @@ -418,7 +445,7 @@ def __init__(self, config): config.backbone_dropout_rate, config.backbone_ops_order, ) - self.first_conv.apply(self._init_weights) + # self.first_conv.apply(self._init_weights) stage1 = [] for stage_config in zip( config.backbone_stage1_in_channels, @@ -500,9 +527,9 @@ def forward(self, x): return output -class FASTNeck(FastPreTrainedModel): +class FASTNeck(nn.Module): def __init__(self, config): - super().__init__(config) + super().__init__() reduce_layer_configs = list( zip( config.neck_in_channels, @@ -549,9 +576,9 @@ def forward(self, x): return f -class FASTHead(FastPreTrainedModel): +class FASTHead(nn.Module): def __init__(self, config): - super().__init__(config) + super().__init__() self.conv = RepConvLayer( config.head_conv_in_channels, config.head_conv_out_channels, @@ -891,7 +918,13 @@ class FASTForImageCaptioningOutput(ModelOutput): loss: Optional[torch.Tensor] = None hidden_states: Optional[torch.FloatTensor] = None - +@add_start_docstrings( + """BEiT-3 is a general-purpose multimodal foundation model that excels in both vision and vision-language tasks. It + utilizes [Multiway transformers] (https://arxiv.org/abs/2208.10442) for deep fusion and modality-specific + encoding, and unifies masked modeling on images, texts, and image-text pairs, achieving top performance on + multiple benchmarks.""", + FAST_FOR_CAPTIONING_INPUTS_DOCSTRING, +) class FASTForImageCaptioning(FastPreTrainedModel): def __init__(self, config): super().__init__(config) diff --git a/utils/check_repo.py b/utils/check_repo.py index aa448f32e62d..f7f88615b670 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -50,7 +50,6 @@ from transformers.models.auto.tokenization_auto import TOKENIZER_MAPPING_NAMES from transformers.utils import ENV_VARS_TRUE_VALUES, direct_transformers_import - # All paths are set with the intent you should run this script from the root of the repo with the command # python utils/check_repo.py PATH_TO_TRANSFORMERS = "src/transformers" @@ -223,6 +222,7 @@ "TFCLIPVisionModel", "TFGroupViTTextModel", "TFGroupViTVisionModel", + "FASTForImageCaptioning", "FlaxCLIPTextModel", "FlaxCLIPTextModelWithProjection", "FlaxCLIPVisionModel", @@ -337,7 +337,6 @@ ] ) - # This is to make sure the transformers module imported is the one in the repo. transformers = direct_transformers_import(PATH_TO_TRANSFORMERS) @@ -812,9 +811,9 @@ def check_objects_being_equally_in_main_init(): module_name = module_path.split(".")[-1] module_dir = ".".join(module_path.split(".")[:-1]) if ( - module_name.startswith("modeling_") - and not module_name.startswith("modeling_tf_") - and not module_name.startswith("modeling_flax_") + module_name.startswith("modeling_") + and not module_name.startswith("modeling_tf_") + and not module_name.startswith("modeling_flax_") ): parent_module = sys.modules[module_dir] @@ -1007,17 +1006,17 @@ def ignore_undocumented(name: str) -> bool: return True # PreTrainedModels / Encoders / Decoders / Layers / Embeddings / Attention are not documented. 
if ( - name.endswith("PreTrainedModel") - or name.endswith("Decoder") - or name.endswith("Encoder") - or name.endswith("Layer") - or name.endswith("Embeddings") - or name.endswith("Attention") + name.endswith("PreTrainedModel") + or name.endswith("Decoder") + or name.endswith("Encoder") + or name.endswith("Layer") + or name.endswith("Embeddings") + or name.endswith("Attention") ): return True # Submodules are not documented. if os.path.isdir(os.path.join(PATH_TO_TRANSFORMERS, name)) or os.path.isfile( - os.path.join(PATH_TO_TRANSFORMERS, f"{name}.py") + os.path.join(PATH_TO_TRANSFORMERS, f"{name}.py") ): return True # All load functions are not documented. @@ -1075,7 +1074,7 @@ def check_model_type_doc_match(): "Some model doc pages do not match any existing model type:\n" + "\n".join(errors) + "\nYou can add any missing model type to the `MODEL_NAMES_MAPPING` constant in " - "models/auto/configuration_auto.py." + "models/auto/configuration_auto.py." ) @@ -1119,7 +1118,7 @@ def check_docstrings_are_in_md(): "The following files have docstrings written in rst:\n" + "\n".join([f"- {f}" for f in files_with_rst]) + "\nTo fix this run `doc-builder convert path_to_py_file` after installing `doc-builder`\n" - "(`pip install git+https://github.com/huggingface/doc-builder`)" + "(`pip install git+https://github.com/huggingface/doc-builder`)" ) From 13ea2bbc0120792d228089be2d47073afd4a68bd Mon Sep 17 00:00:00 2001 From: raghavanone Date: Wed, 1 Nov 2023 08:10:46 +0530 Subject: [PATCH 016/152] fix style and fix copies --- src/transformers/__init__.py | 21 ++++++++-------- src/transformers/models/fast/__init__.py | 11 ++++---- .../models/fast/image_processing_fast.py | 14 +++++++---- src/transformers/models/fast/modeling_fast.py | 10 +++++--- src/transformers/utils/dummy_pt_objects.py | 7 ++++++ utils/check_repo.py | 25 ++++++++++--------- 6 files changed, 51 insertions(+), 37 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 82cee836cf05..5e12dc8c3354 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -47,6 +47,7 @@ logging, ) + logger = logging.get_logger(__name__) # pylint: disable=invalid-name # Base objects, independent of any specific backend @@ -4400,11 +4401,11 @@ try: if not ( - is_librosa_available() - and is_essentia_available() - and is_scipy_available() - and is_torch_available() - and is_pretty_midi_available() + is_librosa_available() + and is_essentia_available() + and is_scipy_available() + and is_torch_available() + and is_pretty_midi_available() ): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: @@ -8554,11 +8555,11 @@ try: if not ( - is_librosa_available() - and is_essentia_available() - and is_scipy_available() - and is_torch_available() - and is_pretty_midi_available() + is_librosa_available() + and is_essentia_available() + and is_scipy_available() + and is_torch_available() + and is_pretty_midi_available() ): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: diff --git a/src/transformers/models/fast/__init__.py b/src/transformers/models/fast/__init__.py index e7e044c5d2ce..78bd816d9d0d 100644 --- a/src/transformers/models/fast/__init__.py +++ b/src/transformers/models/fast/__init__.py @@ -20,10 +20,8 @@ is_torch_available, ) -_import_structure = { - "configuration_fast": ["FastConfig"], - "image_processing_fast": ["FastImageProcessor"] -} + +_import_structure = {"configuration_fast": ["FastConfig"], "image_processing_fast": 
["FastImageProcessor"]} try: if not is_torch_available(): @@ -31,18 +29,19 @@ except OptionalDependencyNotAvailable: pass else: - _import_structure["modeling_fast"] = ["FASTForImageCaptioning","FastPreTrainedModel"] + _import_structure["modeling_fast"] = ["FASTForImageCaptioning", "FastPreTrainedModel"] if TYPE_CHECKING: from .configuration_fast import FastConfig from .image_processing_fast import FastImageProcessor + try: if not is_torch_available(): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: pass else: - from .modeling_fast import FASTForImageCaptioning,FastPreTrainedModel + from .modeling_fast import FASTForImageCaptioning, FastPreTrainedModel else: diff --git a/src/transformers/models/fast/image_processing_fast.py b/src/transformers/models/fast/image_processing_fast.py index 1c652128a85e..1f160810c315 100644 --- a/src/transformers/models/fast/image_processing_fast.py +++ b/src/transformers/models/fast/image_processing_fast.py @@ -68,22 +68,22 @@ class FastImageProcessor(BaseImageProcessor): size (`Dict[str, int]` *optional*, defaults to `{"height": 256, "width": 256}`): Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess` method. - resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): + resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`): Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the `preprocess` method. - do_center_crop (`bool`, *optional*, defaults to `True`): + do_center_crop (`bool`, *optional*, defaults to `False`): Whether to center crop the image. If the input size is smaller than `crop_size` along any edge, the image is padded with 0's and then center cropped. Can be overridden by the `do_center_crop` parameter in the `preprocess` method. crop_size (`Dict[str, int]`, *optional*, defaults to `{"height": 224, "width": 224}`): Desired output size when applying center-cropping. Only has an effect if `do_center_crop` is set to `True`. Can be overridden by the `crop_size` parameter in the `preprocess` method. - do_rescale (`bool`, *optional*, defaults to `True`): - Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale` - parameter in the `preprocess` method. rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the `preprocess` method. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale` + parameter in the `preprocess` method. do_normalize (`bool`, *optional*, defaults to `True`): Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess` method. @@ -98,6 +98,10 @@ class FastImageProcessor(BaseImageProcessor): used for background, and background itself is not included in all classes of a dataset (e.g. ADE20k). The background label will be replaced by 255. Can be overridden by the `do_reduce_labels` parameter in the `preprocess` method. 
+ min_area (`int`, *optional*, defaults to 10): + min_score (`float`, *optional*, defaults to 0.88): + bbox_type (`str`, *optional*, defaults to `"rect"`): + pooling_size (`int`, *optional*, defaults to 9): """ model_input_names = ["pixel_values"] diff --git a/src/transformers/models/fast/modeling_fast.py b/src/transformers/models/fast/modeling_fast.py index 4d04b8e56ed5..7820fd24cdd3 100644 --- a/src/transformers/models/fast/modeling_fast.py +++ b/src/transformers/models/fast/modeling_fast.py @@ -9,13 +9,13 @@ from transformers import FastConfig, PreTrainedModel, add_start_docstrings from transformers.utils import ModelOutput + FAST_FOR_CAPTIONING_INPUTS_DOCSTRING = r""" Args: input_ids (`torch.LongTensor` of shape `({0})`): - Indices of input sequence tokens in the vocabulary. - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - [What are input IDs?](../glossary#input-ids) + Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See + [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input + IDs?](../glossary#input-ids) pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`BeitImageProcessor.__call__`] for details. @@ -37,6 +37,7 @@ classification loss is computed (Cross-Entropy) against these labels. """ + def get_same_padding(kernel_size): if isinstance(kernel_size, tuple): assert len(kernel_size) == 2, "invalid kernel size: %s" % kernel_size @@ -918,6 +919,7 @@ class FASTForImageCaptioningOutput(ModelOutput): loss: Optional[torch.Tensor] = None hidden_states: Optional[torch.FloatTensor] = None + @add_start_docstrings( """BEiT-3 is a general-purpose multimodal foundation model that excels in both vision and vision-language tasks. 
It utilizes [Multiway transformers] (https://arxiv.org/abs/2208.10442) for deep fusion and modality-specific diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 4d89b2942f79..06bdee17752b 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -3453,6 +3453,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +class FastPreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + FLAUBERT_PRETRAINED_MODEL_ARCHIVE_LIST = None diff --git a/utils/check_repo.py b/utils/check_repo.py index f7f88615b670..e9419bd78b03 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -50,6 +50,7 @@ from transformers.models.auto.tokenization_auto import TOKENIZER_MAPPING_NAMES from transformers.utils import ENV_VARS_TRUE_VALUES, direct_transformers_import + # All paths are set with the intent you should run this script from the root of the repo with the command # python utils/check_repo.py PATH_TO_TRANSFORMERS = "src/transformers" @@ -811,9 +812,9 @@ def check_objects_being_equally_in_main_init(): module_name = module_path.split(".")[-1] module_dir = ".".join(module_path.split(".")[:-1]) if ( - module_name.startswith("modeling_") - and not module_name.startswith("modeling_tf_") - and not module_name.startswith("modeling_flax_") + module_name.startswith("modeling_") + and not module_name.startswith("modeling_tf_") + and not module_name.startswith("modeling_flax_") ): parent_module = sys.modules[module_dir] @@ -1006,17 +1007,17 @@ def ignore_undocumented(name: str) -> bool: return True # PreTrainedModels / Encoders / Decoders / Layers / Embeddings / Attention are not documented. if ( - name.endswith("PreTrainedModel") - or name.endswith("Decoder") - or name.endswith("Encoder") - or name.endswith("Layer") - or name.endswith("Embeddings") - or name.endswith("Attention") + name.endswith("PreTrainedModel") + or name.endswith("Decoder") + or name.endswith("Encoder") + or name.endswith("Layer") + or name.endswith("Embeddings") + or name.endswith("Attention") ): return True # Submodules are not documented. if os.path.isdir(os.path.join(PATH_TO_TRANSFORMERS, name)) or os.path.isfile( - os.path.join(PATH_TO_TRANSFORMERS, f"{name}.py") + os.path.join(PATH_TO_TRANSFORMERS, f"{name}.py") ): return True # All load functions are not documented. @@ -1074,7 +1075,7 @@ def check_model_type_doc_match(): "Some model doc pages do not match any existing model type:\n" + "\n".join(errors) + "\nYou can add any missing model type to the `MODEL_NAMES_MAPPING` constant in " - "models/auto/configuration_auto.py." + "models/auto/configuration_auto.py." 
) @@ -1118,7 +1119,7 @@ def check_docstrings_are_in_md(): "The following files have docstrings written in rst:\n" + "\n".join([f"- {f}" for f in files_with_rst]) + "\nTo fix this run `doc-builder convert path_to_py_file` after installing `doc-builder`\n" - "(`pip install git+https://github.com/huggingface/doc-builder`)" + "(`pip install git+https://github.com/huggingface/doc-builder`)" ) From 1abfbc0c944e997fcf280b0f48fc819818bfeb2d Mon Sep 17 00:00:00 2001 From: raghavanone Date: Wed, 1 Nov 2023 09:48:33 +0530 Subject: [PATCH 017/152] Fix build --- src/transformers/models/fast/__init__.py | 7 +++-- .../models/fast/configuration_fast.py | 29 +++++++++++++++++++ 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/fast/__init__.py b/src/transformers/models/fast/__init__.py index 78bd816d9d0d..c4ecab2f2c0d 100644 --- a/src/transformers/models/fast/__init__.py +++ b/src/transformers/models/fast/__init__.py @@ -21,7 +21,10 @@ ) -_import_structure = {"configuration_fast": ["FastConfig"], "image_processing_fast": ["FastImageProcessor"]} +_import_structure = { + "configuration_fast": ["FAST_PRETRAINED_CONFIG_ARCHIVE_MAP", "FastConfig"], + "image_processing_fast": ["FastImageProcessor"], +} try: if not is_torch_available(): @@ -32,7 +35,7 @@ _import_structure["modeling_fast"] = ["FASTForImageCaptioning", "FastPreTrainedModel"] if TYPE_CHECKING: - from .configuration_fast import FastConfig + from .configuration_fast import FAST_PRETRAINED_CONFIG_ARCHIVE_MAP, FastConfig from .image_processing_fast import FastImageProcessor try: diff --git a/src/transformers/models/fast/configuration_fast.py b/src/transformers/models/fast/configuration_fast.py index ee8c27b03a32..3f813386507f 100644 --- a/src/transformers/models/fast/configuration_fast.py +++ b/src/transformers/models/fast/configuration_fast.py @@ -1,7 +1,36 @@ +# coding=utf-8 +# Copyright The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" Fast model configuration""" from transformers import PretrainedConfig +from transformers.utils import logging + + +logger = logging.get_logger(__name__) + +FAST_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "fast_base_tt_800_finetune_ic17mlt": ( + "https://huggingface.co/Raghavan/fast_base_tt_800_finetune_ic17mlt/raw/main/config.json" + ), +} class FastConfig(PretrainedConfig): + r""" + [Raghavan/fast_base_tt_800_finetune_ic17mlt](https://huggingface.co/Raghavan/fast_base_tt_800_finetune_ic17mlt) + """ + def __init__( self, backbone_kernel_size=3, From f85fbda8f71bb1726a743cee058b6e5d8fb27ba8 Mon Sep 17 00:00:00 2001 From: raghavanone Date: Wed, 1 Nov 2023 10:00:22 +0530 Subject: [PATCH 018/152] Fix build --- .../models/fast/image_processing_fast.py | 166 +++++++++--------- 1 file changed, 82 insertions(+), 84 deletions(-) diff --git a/src/transformers/models/fast/image_processing_fast.py b/src/transformers/models/fast/image_processing_fast.py index 1f160810c315..320d3cb6cb47 100644 --- a/src/transformers/models/fast/image_processing_fast.py +++ b/src/transformers/models/fast/image_processing_fast.py @@ -12,19 +12,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Image processor class for Beit.""" +"""Image processor class for Fast.""" import math import warnings from typing import Any, Dict, List, Optional, Union from ...utils.import_utils import is_cv2_available - if is_cv2_available(): import cv2 import numpy as np -import torch.nn as nn -import torch.nn.functional as F from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict from ...image_transforms import resize, to_channel_dimension_format @@ -47,12 +44,13 @@ logging, ) - if is_vision_available(): import PIL if is_torch_available(): import torch + import torch.nn as nn + import torch.nn.functional as F logger = logging.get_logger(__name__) @@ -107,23 +105,23 @@ class FastImageProcessor(BaseImageProcessor): model_input_names = ["pixel_values"] def __init__( - self, - do_resize: bool = True, - size: Dict[str, int] = None, - resample: PILImageResampling = PILImageResampling.BICUBIC, - do_center_crop: bool = False, - crop_size: Dict[str, int] = None, - rescale_factor: Union[int, float] = 1 / 255, - do_rescale: bool = True, - do_normalize: bool = True, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - do_reduce_labels: bool = False, - min_area: int = 10, - min_score: float = 0.88, - bbox_type: str = "rect", - pooling_size: int = 9, - **kwargs, + self, + do_resize: bool = True, + size: Dict[str, int] = None, + resample: PILImageResampling = PILImageResampling.BICUBIC, + do_center_crop: bool = False, + crop_size: Dict[str, int] = None, + rescale_factor: Union[int, float] = 1 / 255, + do_rescale: bool = True, + do_normalize: bool = True, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_reduce_labels: bool = False, + min_area: int = 10, + min_score: float = 0.88, + bbox_type: str = "rect", + pooling_size: int = 9, + **kwargs, ) -> None: if "reduce_labels" in kwargs: warnings.warn( @@ -165,13 +163,13 @@ def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs): return super().from_dict(image_processor_dict, **kwargs) def resize( - self, - image: np.ndarray, - size: Dict[str, int], - resample: PILImageResampling = PILImageResampling.BICUBIC, 
- data_format: Optional[Union[str, ChannelDimension]] = None, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs, + self, + image: np.ndarray, + size: Dict[str, int], + resample: PILImageResampling = PILImageResampling.BICUBIC, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, ) -> np.ndarray: """ Resize an image to (size["height"], size["width"]). @@ -209,20 +207,20 @@ def reduce_label(self, label: ImageInput) -> np.ndarray: return label def _preprocess( - self, - image: ImageInput, - do_reduce_labels: bool = None, - do_resize: bool = None, - size: Dict[str, int] = None, - resample: PILImageResampling = None, - do_center_crop: bool = None, - crop_size: Dict[str, int] = None, - do_rescale: bool = None, - rescale_factor: float = None, - do_normalize: bool = None, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - input_data_format: Optional[Union[str, ChannelDimension]] = None, + self, + image: ImageInput, + do_reduce_labels: bool = None, + do_resize: bool = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + do_center_crop: bool = None, + crop_size: Dict[str, int] = None, + do_rescale: bool = None, + rescale_factor: float = None, + do_normalize: bool = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, ): if do_reduce_labels: image = self.reduce_label(image) @@ -242,20 +240,20 @@ def _preprocess( return image def _preprocess_image( - self, - image: ImageInput, - do_resize: bool = None, - size: Dict[str, int] = None, - resample: PILImageResampling = None, - do_center_crop: bool = None, - crop_size: Dict[str, int] = None, - do_rescale: bool = None, - rescale_factor: float = None, - do_normalize: bool = None, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - data_format: Optional[Union[str, ChannelDimension]] = None, - input_data_format: Optional[Union[str, ChannelDimension]] = None, + self, + image: ImageInput, + do_resize: bool = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + do_center_crop: bool = None, + crop_size: Dict[str, int] = None, + do_rescale: bool = None, + rescale_factor: float = None, + do_normalize: bool = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, ) -> np.ndarray: """Preprocesses a single image.""" # All transformations expect numpy arrays. 
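The "All transformations expect numpy arrays" comment above is the contract that makes the `input_data_format` argument necessary: once the input has been converted to a NumPy array, the processor has to know (or guess) whether it is channels-first or channels-last before resizing and normalizing. A rough, standalone stand-in for that inference step; the library's own utility covers more cases, so this helper is only illustrative:

import numpy as np

def guess_channel_dimension(image: np.ndarray) -> str:
    # Heuristic: a leading axis of size 1 or 3 is taken to be the channel axis
    # (channels-first); otherwise assume the PIL-style channels-last layout.
    if image.ndim != 3:
        raise ValueError(f"expected a 3D image array, got shape {image.shape}")
    return "channels_first" if image.shape[0] in (1, 3) else "channels_last"

print(guess_channel_dimension(np.zeros((3, 640, 640))))  # channels_first
print(guess_channel_dimension(np.zeros((640, 640, 3))))  # channels_last
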
@@ -287,15 +285,15 @@ def _preprocess_image( return image def _preprocess_segmentation_map( - self, - segmentation_map: ImageInput, - do_resize: bool = None, - size: Dict[str, int] = None, - resample: PILImageResampling = None, - do_center_crop: bool = None, - crop_size: Dict[str, int] = None, - do_reduce_labels: bool = None, - input_data_format: Optional[Union[str, ChannelDimension]] = None, + self, + segmentation_map: ImageInput, + do_resize: bool = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + do_center_crop: bool = None, + crop_size: Dict[str, int] = None, + do_reduce_labels: bool = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, ): """Preprocesses a single segmentation map.""" # All transformations expect numpy arrays. @@ -333,24 +331,24 @@ def __call__(self, images, segmentation_maps=None, **kwargs): return super().__call__(images, segmentation_maps=segmentation_maps, **kwargs) def preprocess( - self, - images: ImageInput, - segmentation_maps: Optional[ImageInput] = None, - do_resize: bool = None, - size: Dict[str, int] = None, - resample: PILImageResampling = None, - do_center_crop: bool = None, - crop_size: Dict[str, int] = None, - do_rescale: bool = None, - rescale_factor: float = None, - do_normalize: bool = None, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - do_reduce_labels: Optional[bool] = None, - return_tensors: Optional[Union[str, TensorType]] = None, - data_format: ChannelDimension = ChannelDimension.FIRST, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs, + self, + images: ImageInput, + segmentation_maps: Optional[ImageInput] = None, + do_resize: bool = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + do_center_crop: bool = None, + crop_size: Dict[str, int] = None, + do_rescale: bool = None, + rescale_factor: float = None, + do_normalize: bool = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_reduce_labels: Optional[bool] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: ChannelDimension = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, ) -> PIL.Image.Image: """ Preprocess an image or batch of images. 
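Taken together, the pieces touched so far (configuration, image processor, model) are meant to be driven like any other vision checkpoint in the library. A usage sketch, assuming the `Raghavan/fast_base_tt_800_finetune_ic17mlt` checkpoint named in the configuration archive map ships processor and model weights compatible with this in-progress code:

import torch
from PIL import Image
from transformers import FASTForImageCaptioning, FastImageProcessor

checkpoint = "Raghavan/fast_base_tt_800_finetune_ic17mlt"  # from FAST_PRETRAINED_CONFIG_ARCHIVE_MAP
processor = FastImageProcessor.from_pretrained(checkpoint)
model = FASTForImageCaptioning.from_pretrained(checkpoint)
model.eval()

image = Image.new("RGB", (640, 640))           # stand-in for a real document photo
inputs = processor(images=image, return_tensors="pt")

with torch.no_grad():
    outputs = model(pixel_values=inputs["pixel_values"])

# FASTForImageCaptioningOutput carries `loss` (None at inference) and `hidden_states`.
print(outputs.hidden_states.shape)
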
From cd0b45f670865208eedeab366d843768447f1dbc Mon Sep 17 00:00:00 2001 From: raghavanone Date: Wed, 1 Nov 2023 10:22:36 +0530 Subject: [PATCH 019/152] Fix Build --- .../models/fast/image_processing_fast.py | 160 +++++++++--------- 1 file changed, 81 insertions(+), 79 deletions(-) diff --git a/src/transformers/models/fast/image_processing_fast.py b/src/transformers/models/fast/image_processing_fast.py index 320d3cb6cb47..a9ae06694fd6 100644 --- a/src/transformers/models/fast/image_processing_fast.py +++ b/src/transformers/models/fast/image_processing_fast.py @@ -19,6 +19,7 @@ from ...utils.import_utils import is_cv2_available + if is_cv2_available(): import cv2 import numpy as np @@ -44,6 +45,7 @@ logging, ) + if is_vision_available(): import PIL @@ -105,23 +107,23 @@ class FastImageProcessor(BaseImageProcessor): model_input_names = ["pixel_values"] def __init__( - self, - do_resize: bool = True, - size: Dict[str, int] = None, - resample: PILImageResampling = PILImageResampling.BICUBIC, - do_center_crop: bool = False, - crop_size: Dict[str, int] = None, - rescale_factor: Union[int, float] = 1 / 255, - do_rescale: bool = True, - do_normalize: bool = True, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - do_reduce_labels: bool = False, - min_area: int = 10, - min_score: float = 0.88, - bbox_type: str = "rect", - pooling_size: int = 9, - **kwargs, + self, + do_resize: bool = True, + size: Dict[str, int] = None, + resample: PILImageResampling = PILImageResampling.BICUBIC, + do_center_crop: bool = False, + crop_size: Dict[str, int] = None, + rescale_factor: Union[int, float] = 1 / 255, + do_rescale: bool = True, + do_normalize: bool = True, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_reduce_labels: bool = False, + min_area: int = 10, + min_score: float = 0.88, + bbox_type: str = "rect", + pooling_size: int = 9, + **kwargs, ) -> None: if "reduce_labels" in kwargs: warnings.warn( @@ -163,13 +165,13 @@ def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs): return super().from_dict(image_processor_dict, **kwargs) def resize( - self, - image: np.ndarray, - size: Dict[str, int], - resample: PILImageResampling = PILImageResampling.BICUBIC, - data_format: Optional[Union[str, ChannelDimension]] = None, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs, + self, + image: np.ndarray, + size: Dict[str, int], + resample: PILImageResampling = PILImageResampling.BICUBIC, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, ) -> np.ndarray: """ Resize an image to (size["height"], size["width"]). 
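For reference, the `size` dictionary documented above is unpacked into a plain `(height, width)` tuple before the shared `resize` helper imported from `...image_transforms` at the top of this file is called; a small sketch of that call with the processor's default 640x640 size, shown only as an approximation of what the method does internally:

import numpy as np
from transformers.image_transforms import resize
from transformers.image_utils import PILImageResampling

size = {"height": 640, "width": 640}  # FastImageProcessor default
image = np.zeros((480, 800, 3), dtype=np.uint8)

resized = resize(image, size=(size["height"], size["width"]), resample=PILImageResampling.BICUBIC)
print(resized.shape)  # (640, 640, 3)
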
@@ -207,20 +209,20 @@ def reduce_label(self, label: ImageInput) -> np.ndarray: return label def _preprocess( - self, - image: ImageInput, - do_reduce_labels: bool = None, - do_resize: bool = None, - size: Dict[str, int] = None, - resample: PILImageResampling = None, - do_center_crop: bool = None, - crop_size: Dict[str, int] = None, - do_rescale: bool = None, - rescale_factor: float = None, - do_normalize: bool = None, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - input_data_format: Optional[Union[str, ChannelDimension]] = None, + self, + image: ImageInput, + do_reduce_labels: bool = None, + do_resize: bool = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + do_center_crop: bool = None, + crop_size: Dict[str, int] = None, + do_rescale: bool = None, + rescale_factor: float = None, + do_normalize: bool = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, ): if do_reduce_labels: image = self.reduce_label(image) @@ -240,20 +242,20 @@ def _preprocess( return image def _preprocess_image( - self, - image: ImageInput, - do_resize: bool = None, - size: Dict[str, int] = None, - resample: PILImageResampling = None, - do_center_crop: bool = None, - crop_size: Dict[str, int] = None, - do_rescale: bool = None, - rescale_factor: float = None, - do_normalize: bool = None, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - data_format: Optional[Union[str, ChannelDimension]] = None, - input_data_format: Optional[Union[str, ChannelDimension]] = None, + self, + image: ImageInput, + do_resize: bool = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + do_center_crop: bool = None, + crop_size: Dict[str, int] = None, + do_rescale: bool = None, + rescale_factor: float = None, + do_normalize: bool = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, ) -> np.ndarray: """Preprocesses a single image.""" # All transformations expect numpy arrays. @@ -285,15 +287,15 @@ def _preprocess_image( return image def _preprocess_segmentation_map( - self, - segmentation_map: ImageInput, - do_resize: bool = None, - size: Dict[str, int] = None, - resample: PILImageResampling = None, - do_center_crop: bool = None, - crop_size: Dict[str, int] = None, - do_reduce_labels: bool = None, - input_data_format: Optional[Union[str, ChannelDimension]] = None, + self, + segmentation_map: ImageInput, + do_resize: bool = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + do_center_crop: bool = None, + crop_size: Dict[str, int] = None, + do_reduce_labels: bool = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, ): """Preprocesses a single segmentation map.""" # All transformations expect numpy arrays. 
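Worth noting while these signatures are being reshuffled: the `pooling_size` argument (default 9) carried by this class feeds a stride-1 max pool in the post-processing path, and the `(pooling_size - 1) // 2` padding is exactly what keeps the score map's spatial size unchanged while growing each detected text kernel. A small sketch of that effect:

import torch
import torch.nn as nn

pooling_size = 9
pool = nn.MaxPool2d(kernel_size=pooling_size, stride=1, padding=(pooling_size - 1) // 2)

score_map = torch.zeros(1, 1, 160, 160)
score_map[0, 0, 80, 80] = 1.0          # a single high-confidence "text" pixel

dilated = pool(score_map)
print(dilated.shape)                   # torch.Size([1, 1, 160, 160]) -- spatial size preserved
print(int(dilated.sum().item()))       # 81 -- the peak now covers a 9x9 neighbourhood
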
@@ -331,24 +333,24 @@ def __call__(self, images, segmentation_maps=None, **kwargs): return super().__call__(images, segmentation_maps=segmentation_maps, **kwargs) def preprocess( - self, - images: ImageInput, - segmentation_maps: Optional[ImageInput] = None, - do_resize: bool = None, - size: Dict[str, int] = None, - resample: PILImageResampling = None, - do_center_crop: bool = None, - crop_size: Dict[str, int] = None, - do_rescale: bool = None, - rescale_factor: float = None, - do_normalize: bool = None, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - do_reduce_labels: Optional[bool] = None, - return_tensors: Optional[Union[str, TensorType]] = None, - data_format: ChannelDimension = ChannelDimension.FIRST, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs, + self, + images: ImageInput, + segmentation_maps: Optional[ImageInput] = None, + do_resize: bool = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + do_center_crop: bool = None, + crop_size: Dict[str, int] = None, + do_rescale: bool = None, + rescale_factor: float = None, + do_normalize: bool = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_reduce_labels: Optional[bool] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: ChannelDimension = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, ) -> PIL.Image.Image: """ Preprocess an image or batch of images. From 6005f2febcfb14c038b2e47914ef37e7588b10bd Mon Sep 17 00:00:00 2001 From: raghavanone Date: Wed, 1 Nov 2023 15:54:52 +0530 Subject: [PATCH 020/152] Clean up docstrings --- .../models/fast/image_processing_fast.py | 15 +-- src/transformers/models/fast/modeling_fast.py | 106 ++++++++---------- 2 files changed, 48 insertions(+), 73 deletions(-) diff --git a/src/transformers/models/fast/image_processing_fast.py b/src/transformers/models/fast/image_processing_fast.py index a9ae06694fd6..f950f4bca2fa 100644 --- a/src/transformers/models/fast/image_processing_fast.py +++ b/src/transformers/models/fast/image_processing_fast.py @@ -93,11 +93,6 @@ class FastImageProcessor(BaseImageProcessor): image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`): The standard deviation to use if normalizing the image. This is a float or list of floats of length of the number of channels of the image. Can be overridden by the `image_std` parameter in the `preprocess` method. - do_reduce_labels (`bool`, *optional*, defaults to `False`): - Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0 is - used for background, and background itself is not included in all classes of a dataset (e.g. ADE20k). The - background label will be replaced by 255. Can be overridden by the `do_reduce_labels` parameter in the - `preprocess` method. 
min_area (`int`, *optional*, defaults to 10): min_score (`float`, *optional*, defaults to 0.88): bbox_type (`str`, *optional*, defaults to `"rect"`): @@ -118,20 +113,13 @@ def __init__( do_normalize: bool = True, image_mean: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None, - do_reduce_labels: bool = False, min_area: int = 10, min_score: float = 0.88, bbox_type: str = "rect", pooling_size: int = 9, **kwargs, ) -> None: - if "reduce_labels" in kwargs: - warnings.warn( - "The `reduce_labels` parameter is deprecated and will be removed in a future version. Please use" - " `do_reduce_labels` instead.", - FutureWarning, - ) - do_reduce_labels = kwargs.pop("reduce_labels") + super().__init__(**kwargs) size = size if size is not None else {"height": 640, "width": 640} size = get_size_dict(size) @@ -147,7 +135,6 @@ def __init__( self.do_normalize = do_normalize self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD - self.do_reduce_labels = do_reduce_labels self.min_area = min_area self.min_score = min_score self.bbox_type = bbox_type diff --git a/src/transformers/models/fast/modeling_fast.py b/src/transformers/models/fast/modeling_fast.py index 7820fd24cdd3..4d916690bce2 100644 --- a/src/transformers/models/fast/modeling_fast.py +++ b/src/transformers/models/fast/modeling_fast.py @@ -9,24 +9,11 @@ from transformers import FastConfig, PreTrainedModel, add_start_docstrings from transformers.utils import ModelOutput - FAST_FOR_CAPTIONING_INPUTS_DOCSTRING = r""" Args: - input_ids (`torch.LongTensor` of shape `({0})`): - Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See - [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input - IDs?](../glossary#input-ids) pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`BeitImageProcessor.__call__`] for details. - language_masked_pos (`torch.LongTensor` of shape `({0})`): - language_masked_pos for denoting tokens for captioning - - 1 indicates the token is **Present**, - - 0 indicates the token is **absent**. - text_len (`torch.LongTensor` of shape `({0})`): - Length of text for captioning - past_key_value (`Dict`): - A Dictionary containing the incremental states layerwise output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail. 
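After this trim the forward contract is small: `pixel_values`, plus optional `output_hidden_states`, `return_dict` and a `labels` dictionary for training. A smoke-test sketch against randomly initialized weights, on the assumption that the default `FastConfig` is already complete enough at this point in the series to build the model:

import torch
from transformers import FASTForImageCaptioning, FastConfig

config = FastConfig()                      # default config, random weights -- no download needed
model = FASTForImageCaptioning(config)
model.eval()

pixel_values = torch.rand(1, 3, 640, 640)
with torch.no_grad():
    outputs = model(pixel_values=pixel_values)

# Without `labels`, the loss field of the returned FASTForImageCaptioningOutput should be None.
print(outputs.loss, outputs.hidden_states.shape)
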
@@ -66,7 +53,7 @@ def build_activation(act_func, inplace=True): class My2DLayer(nn.Module): def __init__( - self, in_channels, out_channels, use_bn=True, act_func="relu", dropout_rate=0, ops_order="weight_bn_act" + self, in_channels, out_channels, use_bn=True, act_func="relu", dropout_rate=0, ops_order="weight_bn_act" ): super(My2DLayer, self).__init__() self.in_channels = in_channels @@ -163,19 +150,19 @@ def is_zero_layer(): class ConvLayer(nn.Module): def __init__( - self, - in_channels, - out_channels, - kernel_size=3, - stride=1, - dilation=1, - groups=1, - bias=False, - has_shuffle=False, - use_bn=True, - act_func="relu", - dropout_rate=0, - use_act=True, + self, + in_channels, + out_channels, + kernel_size=3, + stride=1, + dilation=1, + groups=1, + bias=False, + has_shuffle=False, + use_bn=True, + act_func="relu", + dropout_rate=0, + use_act=True, ): super().__init__() @@ -449,48 +436,48 @@ def __init__(self, config): # self.first_conv.apply(self._init_weights) stage1 = [] for stage_config in zip( - config.backbone_stage1_in_channels, - config.backbone_stage1_out_channels, - config.backbone_stage1_kernel_size, - config.backbone_stage1_stride, - config.backbone_stage1_dilation, - config.backbone_stage1_groups, + config.backbone_stage1_in_channels, + config.backbone_stage1_out_channels, + config.backbone_stage1_kernel_size, + config.backbone_stage1_stride, + config.backbone_stage1_dilation, + config.backbone_stage1_groups, ): stage1.append(RepConvLayer(*stage_config)) self.stage1 = nn.ModuleList(stage1) stage2 = [] for stage_config in zip( - config.backbone_stage2_in_channels, - config.backbone_stage2_out_channels, - config.backbone_stage2_kernel_size, - config.backbone_stage2_stride, - config.backbone_stage2_dilation, - config.backbone_stage2_groups, + config.backbone_stage2_in_channels, + config.backbone_stage2_out_channels, + config.backbone_stage2_kernel_size, + config.backbone_stage2_stride, + config.backbone_stage2_dilation, + config.backbone_stage2_groups, ): stage2.append(RepConvLayer(*stage_config)) self.stage2 = nn.ModuleList(stage2) stage3 = [] for stage_config in zip( - config.backbone_stage3_in_channels, - config.backbone_stage3_out_channels, - config.backbone_stage3_kernel_size, - config.backbone_stage3_stride, - config.backbone_stage3_dilation, - config.backbone_stage3_groups, + config.backbone_stage3_in_channels, + config.backbone_stage3_out_channels, + config.backbone_stage3_kernel_size, + config.backbone_stage3_stride, + config.backbone_stage3_dilation, + config.backbone_stage3_groups, ): stage3.append(RepConvLayer(*stage_config)) self.stage3 = nn.ModuleList(stage3) stage4 = [] for stage_config in zip( - config.backbone_stage4_in_channels, - config.backbone_stage4_out_channels, - config.backbone_stage4_kernel_size, - config.backbone_stage4_stride, - config.backbone_stage4_dilation, - config.backbone_stage4_groups, + config.backbone_stage4_in_channels, + config.backbone_stage4_out_channels, + config.backbone_stage4_kernel_size, + config.backbone_stage4_stride, + config.backbone_stage4_dilation, + config.backbone_stage4_groups, ): stage4.append(RepConvLayer(*stage_config)) self.stage4 = nn.ModuleList(stage4) @@ -722,7 +709,8 @@ def _max_pooling(self, x, scale=1): def emb_loss( - emb, instance, kernel, training_mask, feature_dim=4, delta_v=0.5, delta_d=1.5, weights=(1.0, 1.0), bg_sample=False + emb, instance, kernel, training_mask, feature_dim=4, delta_v=0.5, delta_d=1.5, weights=(1.0, 1.0), + bg_sample=False ): training_mask = (training_mask > 0.5).long() kernel = 
(kernel > 0.5).long() @@ -749,7 +737,7 @@ def emb_loss( continue ind = instance == lb emb_ = emb[:, ind] - dist = (emb_ - emb_mean[:, i : i + 1]).norm(p=2, dim=0) + dist = (emb_ - emb_mean[:, i: i + 1]).norm(p=2, dim=0) dist = F.relu(dist - delta_v) ** 2 l_agg[i] = torch.mean(torch.log(dist + 1.0)) l_agg = torch.mean(l_agg[1:]) @@ -781,7 +769,7 @@ def emb_loss( for i, lb in enumerate(unique_labels): if lb == 0: continue - dist = (emb_bg - emb_mean[:, i : i + 1]).norm(p=2, dim=0) + dist = (emb_bg - emb_mean[:, i: i + 1]).norm(p=2, dim=0) dist = F.relu(2 * delta_d - dist) ** 2 l_dis_bg = torch.mean(torch.log(dist + 1.0), 0, keepdim=True) l_dis.append(l_dis_bg) @@ -976,11 +964,11 @@ def loss(self, hidden, labels): return torch.mean(loss_text) + torch.mean(loss_kernel) + torch.mean(loss_emb) def forward( - self, - pixel_values: torch.FloatTensor, - output_hidden_states: Optional[bool] = True, - return_dict: Optional[bool] = None, - labels: Dict = None, + self, + pixel_values: torch.FloatTensor, + output_hidden_states: Optional[bool] = True, + return_dict: Optional[bool] = None, + labels: Dict = None, ): # outputs = {} return_dict = return_dict if return_dict is not None else self.config.use_return_dict From e56bff7e4696b3037cf35a24d9b5dd97c33b2195 Mon Sep 17 00:00:00 2001 From: raghavanone Date: Wed, 1 Nov 2023 16:49:32 +0530 Subject: [PATCH 021/152] Fix Build --- .../models/fast/image_processing_fast.py | 12 +-- src/transformers/models/fast/modeling_fast.py | 94 +++++++++---------- 2 files changed, 52 insertions(+), 54 deletions(-) diff --git a/src/transformers/models/fast/image_processing_fast.py b/src/transformers/models/fast/image_processing_fast.py index f950f4bca2fa..ff46ca02b012 100644 --- a/src/transformers/models/fast/image_processing_fast.py +++ b/src/transformers/models/fast/image_processing_fast.py @@ -14,7 +14,6 @@ # limitations under the License. """Image processor class for Fast.""" import math -import warnings from typing import Any, Dict, List, Optional, Union from ...utils.import_utils import is_cv2_available @@ -93,10 +92,10 @@ class FastImageProcessor(BaseImageProcessor): image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`): The standard deviation to use if normalizing the image. This is a float or list of floats of length of the number of channels of the image. Can be overridden by the `image_std` parameter in the `preprocess` method. 
- min_area (`int`, *optional*, defaults to 10): - min_score (`float`, *optional*, defaults to 0.88): - bbox_type (`str`, *optional*, defaults to `"rect"`): - pooling_size (`int`, *optional*, defaults to 9): + min_area (`int`, *optional*, defaults to 200): Threshold for min area for results + min_score (`float`, *optional*, defaults to 0.88): Threshold for min score for results + bbox_type (`str`, *optional*, defaults to `"rect"`): Type of bbox, rect or poly + pooling_size (`int`, *optional*, defaults to 9): Pooling size for text detection """ model_input_names = ["pixel_values"] @@ -113,13 +112,12 @@ def __init__( do_normalize: bool = True, image_mean: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None, - min_area: int = 10, + min_area: int = 200, min_score: float = 0.88, bbox_type: str = "rect", pooling_size: int = 9, **kwargs, ) -> None: - super().__init__(**kwargs) size = size if size is not None else {"height": 640, "width": 640} size = get_size_dict(size) diff --git a/src/transformers/models/fast/modeling_fast.py b/src/transformers/models/fast/modeling_fast.py index 4d916690bce2..e7590614eade 100644 --- a/src/transformers/models/fast/modeling_fast.py +++ b/src/transformers/models/fast/modeling_fast.py @@ -9,6 +9,7 @@ from transformers import FastConfig, PreTrainedModel, add_start_docstrings from transformers.utils import ModelOutput + FAST_FOR_CAPTIONING_INPUTS_DOCSTRING = r""" Args: pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): @@ -53,7 +54,7 @@ def build_activation(act_func, inplace=True): class My2DLayer(nn.Module): def __init__( - self, in_channels, out_channels, use_bn=True, act_func="relu", dropout_rate=0, ops_order="weight_bn_act" + self, in_channels, out_channels, use_bn=True, act_func="relu", dropout_rate=0, ops_order="weight_bn_act" ): super(My2DLayer, self).__init__() self.in_channels = in_channels @@ -150,19 +151,19 @@ def is_zero_layer(): class ConvLayer(nn.Module): def __init__( - self, - in_channels, - out_channels, - kernel_size=3, - stride=1, - dilation=1, - groups=1, - bias=False, - has_shuffle=False, - use_bn=True, - act_func="relu", - dropout_rate=0, - use_act=True, + self, + in_channels, + out_channels, + kernel_size=3, + stride=1, + dilation=1, + groups=1, + bias=False, + has_shuffle=False, + use_bn=True, + act_func="relu", + dropout_rate=0, + use_act=True, ): super().__init__() @@ -436,48 +437,48 @@ def __init__(self, config): # self.first_conv.apply(self._init_weights) stage1 = [] for stage_config in zip( - config.backbone_stage1_in_channels, - config.backbone_stage1_out_channels, - config.backbone_stage1_kernel_size, - config.backbone_stage1_stride, - config.backbone_stage1_dilation, - config.backbone_stage1_groups, + config.backbone_stage1_in_channels, + config.backbone_stage1_out_channels, + config.backbone_stage1_kernel_size, + config.backbone_stage1_stride, + config.backbone_stage1_dilation, + config.backbone_stage1_groups, ): stage1.append(RepConvLayer(*stage_config)) self.stage1 = nn.ModuleList(stage1) stage2 = [] for stage_config in zip( - config.backbone_stage2_in_channels, - config.backbone_stage2_out_channels, - config.backbone_stage2_kernel_size, - config.backbone_stage2_stride, - config.backbone_stage2_dilation, - config.backbone_stage2_groups, + config.backbone_stage2_in_channels, + config.backbone_stage2_out_channels, + config.backbone_stage2_kernel_size, + config.backbone_stage2_stride, + config.backbone_stage2_dilation, + config.backbone_stage2_groups, ): 
stage2.append(RepConvLayer(*stage_config)) self.stage2 = nn.ModuleList(stage2) stage3 = [] for stage_config in zip( - config.backbone_stage3_in_channels, - config.backbone_stage3_out_channels, - config.backbone_stage3_kernel_size, - config.backbone_stage3_stride, - config.backbone_stage3_dilation, - config.backbone_stage3_groups, + config.backbone_stage3_in_channels, + config.backbone_stage3_out_channels, + config.backbone_stage3_kernel_size, + config.backbone_stage3_stride, + config.backbone_stage3_dilation, + config.backbone_stage3_groups, ): stage3.append(RepConvLayer(*stage_config)) self.stage3 = nn.ModuleList(stage3) stage4 = [] for stage_config in zip( - config.backbone_stage4_in_channels, - config.backbone_stage4_out_channels, - config.backbone_stage4_kernel_size, - config.backbone_stage4_stride, - config.backbone_stage4_dilation, - config.backbone_stage4_groups, + config.backbone_stage4_in_channels, + config.backbone_stage4_out_channels, + config.backbone_stage4_kernel_size, + config.backbone_stage4_stride, + config.backbone_stage4_dilation, + config.backbone_stage4_groups, ): stage4.append(RepConvLayer(*stage_config)) self.stage4 = nn.ModuleList(stage4) @@ -709,8 +710,7 @@ def _max_pooling(self, x, scale=1): def emb_loss( - emb, instance, kernel, training_mask, feature_dim=4, delta_v=0.5, delta_d=1.5, weights=(1.0, 1.0), - bg_sample=False + emb, instance, kernel, training_mask, feature_dim=4, delta_v=0.5, delta_d=1.5, weights=(1.0, 1.0), bg_sample=False ): training_mask = (training_mask > 0.5).long() kernel = (kernel > 0.5).long() @@ -737,7 +737,7 @@ def emb_loss( continue ind = instance == lb emb_ = emb[:, ind] - dist = (emb_ - emb_mean[:, i: i + 1]).norm(p=2, dim=0) + dist = (emb_ - emb_mean[:, i : i + 1]).norm(p=2, dim=0) dist = F.relu(dist - delta_v) ** 2 l_agg[i] = torch.mean(torch.log(dist + 1.0)) l_agg = torch.mean(l_agg[1:]) @@ -769,7 +769,7 @@ def emb_loss( for i, lb in enumerate(unique_labels): if lb == 0: continue - dist = (emb_bg - emb_mean[:, i: i + 1]).norm(p=2, dim=0) + dist = (emb_bg - emb_mean[:, i : i + 1]).norm(p=2, dim=0) dist = F.relu(2 * delta_d - dist) ** 2 l_dis_bg = torch.mean(torch.log(dist + 1.0), 0, keepdim=True) l_dis.append(l_dis_bg) @@ -964,11 +964,11 @@ def loss(self, hidden, labels): return torch.mean(loss_text) + torch.mean(loss_kernel) + torch.mean(loss_emb) def forward( - self, - pixel_values: torch.FloatTensor, - output_hidden_states: Optional[bool] = True, - return_dict: Optional[bool] = None, - labels: Dict = None, + self, + pixel_values: torch.FloatTensor, + output_hidden_states: Optional[bool] = True, + return_dict: Optional[bool] = None, + labels: Dict = None, ): # outputs = {} return_dict = return_dict if return_dict is not None else self.config.use_return_dict From ac672f309a3333775621ddd4a52d99c4f9f9484c Mon Sep 17 00:00:00 2001 From: raghavanone Date: Wed, 1 Nov 2023 17:03:26 +0530 Subject: [PATCH 022/152] Fix Build --- src/transformers/models/fast/modeling_fast.py | 122 +++++++++--------- 1 file changed, 64 insertions(+), 58 deletions(-) diff --git a/src/transformers/models/fast/modeling_fast.py b/src/transformers/models/fast/modeling_fast.py index e7590614eade..2db6db54e78b 100644 --- a/src/transformers/models/fast/modeling_fast.py +++ b/src/transformers/models/fast/modeling_fast.py @@ -7,7 +7,20 @@ import torch.nn.functional as F from transformers import FastConfig, PreTrainedModel, add_start_docstrings -from transformers.utils import ModelOutput +from transformers.utils import ModelOutput, add_start_docstrings_to_model_forward, 
replace_return_docstrings + +_CONFIG_FOR_DOC = "FastConfig" + +FAST_START_DOCSTRING = r""" + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it + as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`Beit3Config`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" FAST_FOR_CAPTIONING_INPUTS_DOCSTRING = r""" @@ -54,7 +67,7 @@ def build_activation(act_func, inplace=True): class My2DLayer(nn.Module): def __init__( - self, in_channels, out_channels, use_bn=True, act_func="relu", dropout_rate=0, ops_order="weight_bn_act" + self, in_channels, out_channels, use_bn=True, act_func="relu", dropout_rate=0, ops_order="weight_bn_act" ): super(My2DLayer, self).__init__() self.in_channels = in_channels @@ -151,19 +164,19 @@ def is_zero_layer(): class ConvLayer(nn.Module): def __init__( - self, - in_channels, - out_channels, - kernel_size=3, - stride=1, - dilation=1, - groups=1, - bias=False, - has_shuffle=False, - use_bn=True, - act_func="relu", - dropout_rate=0, - use_act=True, + self, + in_channels, + out_channels, + kernel_size=3, + stride=1, + dilation=1, + groups=1, + bias=False, + has_shuffle=False, + use_bn=True, + act_func="relu", + dropout_rate=0, + use_act=True, ): super().__init__() @@ -437,62 +450,52 @@ def __init__(self, config): # self.first_conv.apply(self._init_weights) stage1 = [] for stage_config in zip( - config.backbone_stage1_in_channels, - config.backbone_stage1_out_channels, - config.backbone_stage1_kernel_size, - config.backbone_stage1_stride, - config.backbone_stage1_dilation, - config.backbone_stage1_groups, + config.backbone_stage1_in_channels, + config.backbone_stage1_out_channels, + config.backbone_stage1_kernel_size, + config.backbone_stage1_stride, + config.backbone_stage1_dilation, + config.backbone_stage1_groups, ): stage1.append(RepConvLayer(*stage_config)) self.stage1 = nn.ModuleList(stage1) stage2 = [] for stage_config in zip( - config.backbone_stage2_in_channels, - config.backbone_stage2_out_channels, - config.backbone_stage2_kernel_size, - config.backbone_stage2_stride, - config.backbone_stage2_dilation, - config.backbone_stage2_groups, + config.backbone_stage2_in_channels, + config.backbone_stage2_out_channels, + config.backbone_stage2_kernel_size, + config.backbone_stage2_stride, + config.backbone_stage2_dilation, + config.backbone_stage2_groups, ): stage2.append(RepConvLayer(*stage_config)) self.stage2 = nn.ModuleList(stage2) stage3 = [] for stage_config in zip( - config.backbone_stage3_in_channels, - config.backbone_stage3_out_channels, - config.backbone_stage3_kernel_size, - config.backbone_stage3_stride, - config.backbone_stage3_dilation, - config.backbone_stage3_groups, + config.backbone_stage3_in_channels, + config.backbone_stage3_out_channels, + config.backbone_stage3_kernel_size, + config.backbone_stage3_stride, + config.backbone_stage3_dilation, + config.backbone_stage3_groups, ): stage3.append(RepConvLayer(*stage_config)) self.stage3 = nn.ModuleList(stage3) stage4 = [] for stage_config in zip( - config.backbone_stage4_in_channels, - config.backbone_stage4_out_channels, - config.backbone_stage4_kernel_size, - config.backbone_stage4_stride, - config.backbone_stage4_dilation, - 
config.backbone_stage4_groups, + config.backbone_stage4_in_channels, + config.backbone_stage4_out_channels, + config.backbone_stage4_kernel_size, + config.backbone_stage4_stride, + config.backbone_stage4_dilation, + config.backbone_stage4_groups, ): stage4.append(RepConvLayer(*stage_config)) self.stage4 = nn.ModuleList(stage4) - # self._initialize_weights() - # - # def _initialize_weights(self): - # for m in self.modules(): - # if isinstance(m, nn.Conv2d): - # nn.init.kaiming_normal_(m.weight) - # elif isinstance(m, nn.BatchNorm2d): - # m.weight.data.fill_(1) - # m.bias.data.zero_() - def forward(self, x): x = self.first_conv(x) output = [] @@ -710,7 +713,8 @@ def _max_pooling(self, x, scale=1): def emb_loss( - emb, instance, kernel, training_mask, feature_dim=4, delta_v=0.5, delta_d=1.5, weights=(1.0, 1.0), bg_sample=False + emb, instance, kernel, training_mask, feature_dim=4, delta_v=0.5, delta_d=1.5, weights=(1.0, 1.0), + bg_sample=False ): training_mask = (training_mask > 0.5).long() kernel = (kernel > 0.5).long() @@ -737,7 +741,7 @@ def emb_loss( continue ind = instance == lb emb_ = emb[:, ind] - dist = (emb_ - emb_mean[:, i : i + 1]).norm(p=2, dim=0) + dist = (emb_ - emb_mean[:, i: i + 1]).norm(p=2, dim=0) dist = F.relu(dist - delta_v) ** 2 l_agg[i] = torch.mean(torch.log(dist + 1.0)) l_agg = torch.mean(l_agg[1:]) @@ -769,7 +773,7 @@ def emb_loss( for i, lb in enumerate(unique_labels): if lb == 0: continue - dist = (emb_bg - emb_mean[:, i : i + 1]).norm(p=2, dim=0) + dist = (emb_bg - emb_mean[:, i: i + 1]).norm(p=2, dim=0) dist = F.relu(2 * delta_d - dist) ** 2 l_dis_bg = torch.mean(torch.log(dist + 1.0), 0, keepdim=True) l_dis.append(l_dis_bg) @@ -913,7 +917,7 @@ class FASTForImageCaptioningOutput(ModelOutput): utilizes [Multiway transformers] (https://arxiv.org/abs/2208.10442) for deep fusion and modality-specific encoding, and unifies masked modeling on images, texts, and image-text pairs, achieving top performance on multiple benchmarks.""", - FAST_FOR_CAPTIONING_INPUTS_DOCSTRING, + FAST_START_DOCSTRING, ) class FASTForImageCaptioning(FastPreTrainedModel): def __init__(self, config): @@ -963,12 +967,14 @@ def loss(self, hidden, labels): return torch.mean(loss_text) + torch.mean(loss_kernel) + torch.mean(loss_emb) + @add_start_docstrings_to_model_forward(FAST_FOR_CAPTIONING_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=FASTForImageCaptioningOutput, config_class=_CONFIG_FOR_DOC) def forward( - self, - pixel_values: torch.FloatTensor, - output_hidden_states: Optional[bool] = True, - return_dict: Optional[bool] = None, - labels: Dict = None, + self, + pixel_values: torch.FloatTensor, + output_hidden_states: Optional[bool] = True, + return_dict: Optional[bool] = None, + labels: Dict = None, ): # outputs = {} return_dict = return_dict if return_dict is not None else self.config.use_return_dict From aa1cc417d3b1361bb37239b94792ed70b4f0c924 Mon Sep 17 00:00:00 2001 From: raghavanone Date: Wed, 1 Nov 2023 17:33:44 +0530 Subject: [PATCH 023/152] Fix Build --- src/transformers/models/fast/modeling_fast.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/fast/modeling_fast.py b/src/transformers/models/fast/modeling_fast.py index 2db6db54e78b..86e6210dab82 100644 --- a/src/transformers/models/fast/modeling_fast.py +++ b/src/transformers/models/fast/modeling_fast.py @@ -22,7 +22,6 @@ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
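The embedding term that `loss()` above adds to `loss_text` and `loss_kernel` follows a pull/push formulation: pixel embeddings are pulled to within `delta_v` of their own instance mean, while the means of different text instances are pushed at least `2 * delta_d` apart, both wrapped in `log(1 + d**2)` as in the hunks above. A minimal sketch of those two terms on toy tensors; the shapes and thresholds follow the `emb_loss` signature, everything else here is made up for illustration:

```python
# Toy sketch of the pull/push terms in `emb_loss`; the instance map and embeddings
# are synthetic, only delta_v / delta_d / feature_dim come from the signature above.
import torch
import torch.nn.functional as F

delta_v, delta_d = 0.5, 1.5
emb = torch.randn(4, 8, 8)              # (feature_dim, H, W)
instance = torch.randint(0, 3, (8, 8))  # 0 = background, 1..N = text instances

labels = instance.unique()
means = torch.stack([emb[:, instance == lb].mean(dim=1) for lb in labels], dim=1)

# pull: pixels of an instance should sit within delta_v of that instance's mean
l_agg = []
for i, lb in enumerate(labels):
    if lb == 0:
        continue
    dist = (emb[:, instance == lb] - means[:, i : i + 1]).norm(p=2, dim=0)
    l_agg.append(torch.log(F.relu(dist - delta_v) ** 2 + 1.0).mean())
l_agg = torch.stack(l_agg).mean() if l_agg else emb.new_zeros(())

# push: means of different instances should stay at least 2 * delta_d apart
fg = means[:, labels != 0]
pairwise = (fg.unsqueeze(2) - fg.unsqueeze(1)).norm(p=2, dim=0)
off_diag = ~torch.eye(pairwise.size(0), dtype=torch.bool)
l_dis = (
    torch.log(F.relu(2 * delta_d - pairwise[off_diag]) ** 2 + 1.0).mean()
    if off_diag.any()
    else emb.new_zeros(())
)

print(float(l_agg), float(l_dis))
```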
""" - FAST_FOR_CAPTIONING_INPUTS_DOCSTRING = r""" Args: pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): @@ -976,6 +975,14 @@ def forward( return_dict: Optional[bool] = None, labels: Dict = None, ): + r""" + labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*): + Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy). + + Returns: + + """ # outputs = {} return_dict = return_dict if return_dict is not None else self.config.use_return_dict f = self.backbone(pixel_values) From 90e0cd8a8d0764fc80d57d11bf6318fccb0b2cf5 Mon Sep 17 00:00:00 2001 From: raghavanone Date: Wed, 1 Nov 2023 17:50:10 +0530 Subject: [PATCH 024/152] Fix build --- src/transformers/models/fast/modeling_fast.py | 94 +++++++++---------- 1 file changed, 47 insertions(+), 47 deletions(-) diff --git a/src/transformers/models/fast/modeling_fast.py b/src/transformers/models/fast/modeling_fast.py index 86e6210dab82..8f36da8528e4 100644 --- a/src/transformers/models/fast/modeling_fast.py +++ b/src/transformers/models/fast/modeling_fast.py @@ -9,6 +9,7 @@ from transformers import FastConfig, PreTrainedModel, add_start_docstrings from transformers.utils import ModelOutput, add_start_docstrings_to_model_forward, replace_return_docstrings + _CONFIG_FOR_DOC = "FastConfig" FAST_START_DOCSTRING = r""" @@ -66,7 +67,7 @@ def build_activation(act_func, inplace=True): class My2DLayer(nn.Module): def __init__( - self, in_channels, out_channels, use_bn=True, act_func="relu", dropout_rate=0, ops_order="weight_bn_act" + self, in_channels, out_channels, use_bn=True, act_func="relu", dropout_rate=0, ops_order="weight_bn_act" ): super(My2DLayer, self).__init__() self.in_channels = in_channels @@ -163,19 +164,19 @@ def is_zero_layer(): class ConvLayer(nn.Module): def __init__( - self, - in_channels, - out_channels, - kernel_size=3, - stride=1, - dilation=1, - groups=1, - bias=False, - has_shuffle=False, - use_bn=True, - act_func="relu", - dropout_rate=0, - use_act=True, + self, + in_channels, + out_channels, + kernel_size=3, + stride=1, + dilation=1, + groups=1, + bias=False, + has_shuffle=False, + use_bn=True, + act_func="relu", + dropout_rate=0, + use_act=True, ): super().__init__() @@ -449,48 +450,48 @@ def __init__(self, config): # self.first_conv.apply(self._init_weights) stage1 = [] for stage_config in zip( - config.backbone_stage1_in_channels, - config.backbone_stage1_out_channels, - config.backbone_stage1_kernel_size, - config.backbone_stage1_stride, - config.backbone_stage1_dilation, - config.backbone_stage1_groups, + config.backbone_stage1_in_channels, + config.backbone_stage1_out_channels, + config.backbone_stage1_kernel_size, + config.backbone_stage1_stride, + config.backbone_stage1_dilation, + config.backbone_stage1_groups, ): stage1.append(RepConvLayer(*stage_config)) self.stage1 = nn.ModuleList(stage1) stage2 = [] for stage_config in zip( - config.backbone_stage2_in_channels, - config.backbone_stage2_out_channels, - config.backbone_stage2_kernel_size, - config.backbone_stage2_stride, - config.backbone_stage2_dilation, - config.backbone_stage2_groups, + config.backbone_stage2_in_channels, + config.backbone_stage2_out_channels, + config.backbone_stage2_kernel_size, + config.backbone_stage2_stride, + config.backbone_stage2_dilation, + config.backbone_stage2_groups, ): stage2.append(RepConvLayer(*stage_config)) self.stage2 = 
nn.ModuleList(stage2) stage3 = [] for stage_config in zip( - config.backbone_stage3_in_channels, - config.backbone_stage3_out_channels, - config.backbone_stage3_kernel_size, - config.backbone_stage3_stride, - config.backbone_stage3_dilation, - config.backbone_stage3_groups, + config.backbone_stage3_in_channels, + config.backbone_stage3_out_channels, + config.backbone_stage3_kernel_size, + config.backbone_stage3_stride, + config.backbone_stage3_dilation, + config.backbone_stage3_groups, ): stage3.append(RepConvLayer(*stage_config)) self.stage3 = nn.ModuleList(stage3) stage4 = [] for stage_config in zip( - config.backbone_stage4_in_channels, - config.backbone_stage4_out_channels, - config.backbone_stage4_kernel_size, - config.backbone_stage4_stride, - config.backbone_stage4_dilation, - config.backbone_stage4_groups, + config.backbone_stage4_in_channels, + config.backbone_stage4_out_channels, + config.backbone_stage4_kernel_size, + config.backbone_stage4_stride, + config.backbone_stage4_dilation, + config.backbone_stage4_groups, ): stage4.append(RepConvLayer(*stage_config)) self.stage4 = nn.ModuleList(stage4) @@ -712,8 +713,7 @@ def _max_pooling(self, x, scale=1): def emb_loss( - emb, instance, kernel, training_mask, feature_dim=4, delta_v=0.5, delta_d=1.5, weights=(1.0, 1.0), - bg_sample=False + emb, instance, kernel, training_mask, feature_dim=4, delta_v=0.5, delta_d=1.5, weights=(1.0, 1.0), bg_sample=False ): training_mask = (training_mask > 0.5).long() kernel = (kernel > 0.5).long() @@ -740,7 +740,7 @@ def emb_loss( continue ind = instance == lb emb_ = emb[:, ind] - dist = (emb_ - emb_mean[:, i: i + 1]).norm(p=2, dim=0) + dist = (emb_ - emb_mean[:, i : i + 1]).norm(p=2, dim=0) dist = F.relu(dist - delta_v) ** 2 l_agg[i] = torch.mean(torch.log(dist + 1.0)) l_agg = torch.mean(l_agg[1:]) @@ -772,7 +772,7 @@ def emb_loss( for i, lb in enumerate(unique_labels): if lb == 0: continue - dist = (emb_bg - emb_mean[:, i: i + 1]).norm(p=2, dim=0) + dist = (emb_bg - emb_mean[:, i : i + 1]).norm(p=2, dim=0) dist = F.relu(2 * delta_d - dist) ** 2 l_dis_bg = torch.mean(torch.log(dist + 1.0), 0, keepdim=True) l_dis.append(l_dis_bg) @@ -969,11 +969,11 @@ def loss(self, hidden, labels): @add_start_docstrings_to_model_forward(FAST_FOR_CAPTIONING_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=FASTForImageCaptioningOutput, config_class=_CONFIG_FOR_DOC) def forward( - self, - pixel_values: torch.FloatTensor, - output_hidden_states: Optional[bool] = True, - return_dict: Optional[bool] = None, - labels: Dict = None, + self, + pixel_values: torch.FloatTensor, + output_hidden_states: Optional[bool] = True, + return_dict: Optional[bool] = None, + labels: Dict = None, ): r""" labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*): From c94fc70b99f5e383e4a4c68dc166dafb2d1effdc Mon Sep 17 00:00:00 2001 From: raghavanone Date: Wed, 1 Nov 2023 21:36:01 +0530 Subject: [PATCH 025/152] Add test for image_processing_fast and add documentation tests --- .../models/fast/image_processing_fast.py | 75 -------- src/transformers/models/fast/modeling_fast.py | 13 +- .../models/fast/test_image_processing_fast.py | 160 ++++++++++++++++++ tests/models/fast/test_modeling_fast.py | 2 +- 4 files changed, 173 insertions(+), 77 deletions(-) create mode 100644 tests/models/fast/test_image_processing_fast.py diff --git a/src/transformers/models/fast/image_processing_fast.py b/src/transformers/models/fast/image_processing_fast.py index ff46ca02b012..03625082c8ee 100644 --- 
a/src/transformers/models/fast/image_processing_fast.py +++ b/src/transformers/models/fast/image_processing_fast.py @@ -185,18 +185,9 @@ def resize( **kwargs, ) - def reduce_label(self, label: ImageInput) -> np.ndarray: - label = to_numpy_array(label) - # Avoid using underflow conversion - label[label == 0] = 255 - label = label - 1 - label[label == 254] = 255 - return label - def _preprocess( self, image: ImageInput, - do_reduce_labels: bool = None, do_resize: bool = None, size: Dict[str, int] = None, resample: PILImageResampling = None, @@ -209,9 +200,6 @@ def _preprocess( image_std: Optional[Union[float, List[float]]] = None, input_data_format: Optional[Union[str, ChannelDimension]] = None, ): - if do_reduce_labels: - image = self.reduce_label(image) - if do_resize: image = self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format) @@ -254,7 +242,6 @@ def _preprocess_image( input_data_format = infer_channel_dimension_format(image) image = self._preprocess( image, - do_reduce_labels=False, do_resize=do_resize, size=size, resample=resample, @@ -271,47 +258,6 @@ def _preprocess_image( image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) return image - def _preprocess_segmentation_map( - self, - segmentation_map: ImageInput, - do_resize: bool = None, - size: Dict[str, int] = None, - resample: PILImageResampling = None, - do_center_crop: bool = None, - crop_size: Dict[str, int] = None, - do_reduce_labels: bool = None, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - ): - """Preprocesses a single segmentation map.""" - # All transformations expect numpy arrays. - segmentation_map = to_numpy_array(segmentation_map) - # Add an axis to the segmentation maps for transformations. - if segmentation_map.ndim == 2: - segmentation_map = segmentation_map[None, ...] - added_dimension = True - input_data_format = ChannelDimension.FIRST - else: - added_dimension = False - if input_data_format is None: - input_data_format = infer_channel_dimension_format(segmentation_map, num_channels=1) - segmentation_map = self._preprocess( - image=segmentation_map, - do_reduce_labels=do_reduce_labels, - do_resize=do_resize, - resample=resample, - size=size, - do_center_crop=do_center_crop, - crop_size=crop_size, - do_normalize=False, - do_rescale=False, - input_data_format=ChannelDimension.FIRST, - ) - # Remove extra axis if added - if added_dimension: - segmentation_map = np.squeeze(segmentation_map, axis=0) - segmentation_map = segmentation_map.astype(np.int64) - return segmentation_map - def __call__(self, images, segmentation_maps=None, **kwargs): # Overrides the `__call__` method of the `Preprocessor` class such that the images and segmentation maps can both # be passed in as positional arguments. @@ -331,7 +277,6 @@ def preprocess( do_normalize: bool = None, image_mean: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None, - do_reduce_labels: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, data_format: ChannelDimension = ChannelDimension.FIRST, input_data_format: Optional[Union[str, ChannelDimension]] = None, @@ -366,10 +311,6 @@ def preprocess( Image mean. image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): Image standard deviation. - do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`): - Whether or not to reduce all label values of segmentation maps by 1. 
Usually used for datasets where 0 - is used for background, and background itself is not included in all classes of a dataset (e.g. - ADE20k). The background label will be replaced by 255. return_tensors (`str` or `TensorType`, *optional*): The type of tensors to return. Can be one of: - Unset: Return a list of `np.ndarray`. @@ -401,7 +342,6 @@ def preprocess( do_normalize = do_normalize if do_normalize is not None else self.do_normalize image_mean = image_mean if image_mean is not None else self.image_mean image_std = image_std if image_std is not None else self.image_std - do_reduce_labels = do_reduce_labels if do_reduce_labels is not None else self.do_reduce_labels images = make_list_of_images(images) if segmentation_maps is not None: @@ -452,21 +392,6 @@ def preprocess( data = {"pixel_values": images} - if segmentation_maps is not None: - segmentation_maps = [ - self._preprocess_segmentation_map( - segmentation_map=segmentation_map, - do_reduce_labels=do_reduce_labels, - do_resize=do_resize, - resample=resample, - size=size, - do_center_crop=do_center_crop, - crop_size=crop_size, - ) - for segmentation_map in segmentation_maps - ] - data["labels"] = segmentation_maps - return BatchFeature(data=data, tensor_type=return_tensors) def _max_pooling(self, x, scale=1): diff --git a/src/transformers/models/fast/modeling_fast.py b/src/transformers/models/fast/modeling_fast.py index 8f36da8528e4..6aad3fa97b45 100644 --- a/src/transformers/models/fast/modeling_fast.py +++ b/src/transformers/models/fast/modeling_fast.py @@ -982,7 +982,18 @@ def forward( Returns: - """ + Examples: + + ```python + >>> from transformers import FastImageProcessor, FASTForImageCaptioning >>> from PIL import Image >>> import + requests >>> url = "https://huggingface.co/datasets/Raghavan/fast_model_samples/resolve/main/img657.jpg" >>> + image = Image.open(requests.get(url, stream=True).raw).convert("RGB") >>> processor = + FastImageProcessor.from_pretrained("Raghavan/fast_base_tt_800_finetune_ic17mlt") >>> model = + FASTForImageCaptioning.from_pretrained("Raghavan/fast_base_tt_800_finetune_ic17mlt") >>> inputs = + processor(image, return_tensor="np") >>> # forward pass >>> outputs = + model(pixel_values=torch.tensor(inputs["pixel_values"])) >>> target_sizes = [(image.shape[1], image.shape[2]) + for image in inputs["pixel_values"]] >>> text_locations = processor.post_process_text_detection(outputs, + target_sizes) >>> print(text_locations[0]["bboxes"][0][:10]) [484, 175, 484, 178, 483, 179, 452, 179, 452, 182]""" # outputs = {} return_dict = return_dict if return_dict is not None else self.config.use_return_dict f = self.backbone(pixel_values) diff --git a/tests/models/fast/test_image_processing_fast.py b/tests/models/fast/test_image_processing_fast.py new file mode 100644 index 000000000000..17b11004b2f2 --- /dev/null +++ b/tests/models/fast/test_image_processing_fast.py @@ -0,0 +1,160 @@ +# coding=utf-8 +# Copyright 2021 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
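With the segmentation-map branch dropped above, the per-image `_preprocess` path reduces to the usual resize, rescale, normalize and channel-first chain. A rough standalone equivalent; the target size and normalization stats below are placeholder assumptions, not necessarily the checkpoint's real defaults:

```python
# Standalone approximation of the resize -> rescale -> normalize -> CHW chain kept in
# `_preprocess`; size / mean / std are assumed values, not read from a real config.
import numpy as np
from PIL import Image

def preprocess(image: Image.Image, size=(640, 640),
               mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)) -> np.ndarray:
    image = image.convert("RGB").resize(size, Image.BILINEAR)  # do_resize
    pixels = np.asarray(image, dtype=np.float32) / 255.0       # do_rescale
    pixels = (pixels - np.array(mean)) / np.array(std)         # do_normalize
    return pixels.transpose(2, 0, 1)                           # HWC -> CHW

batch = preprocess(Image.new("RGB", (800, 600)))[None]         # (1, 3, 640, 640)
print(batch.shape)
```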
+ + +import unittest + +import requests + +from transformers.testing_utils import require_torch, require_vision +from transformers.utils import is_torch_available, is_vision_available + +from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs + + +if is_torch_available(): + import torch + +if is_vision_available(): + from PIL import Image + + from transformers import FASTForImageCaptioning, FastImageProcessor + + +class FastImageProcessingTester(unittest.TestCase): + def __init__( + self, + parent, + batch_size=7, + num_channels=3, + image_size=18, + min_resolution=30, + max_resolution=400, + do_resize=True, + size=None, + do_center_crop=True, + crop_size=None, + do_normalize=True, + image_mean=[0.5, 0.5, 0.5], + image_std=[0.5, 0.5, 0.5], + min_area: int = 200, + min_score: float = 0.88, + bbox_type: str = "rect", + pooling_size: int = 9, + ): + size = size if size is not None else {"height": 20, "width": 20} + crop_size = crop_size if crop_size is not None else {"height": 18, "width": 18} + self.parent = parent + self.batch_size = batch_size + self.num_channels = num_channels + self.image_size = image_size + self.min_resolution = min_resolution + self.max_resolution = max_resolution + self.do_resize = do_resize + self.size = size + self.do_center_crop = do_center_crop + self.crop_size = crop_size + self.do_normalize = do_normalize + self.image_mean = image_mean + self.image_std = image_std + self.min_area = min_area + self.min_score = min_score + self.bbox_type = bbox_type + self.pooling_size = pooling_size + + def prepare_image_processor_dict(self): + return { + "do_resize": self.do_resize, + "size": self.size, + "do_center_crop": self.do_center_crop, + "crop_size": self.crop_size, + "do_normalize": self.do_normalize, + "image_mean": self.image_mean, + "image_std": self.image_std, + "min_area": self.min_area, + "min_score": self.min_score, + "bbox_type": self.bbox_type, + "pooling_size": self.pooling_size, + } + + def expected_output_image_shape(self, images): + return self.num_channels, self.crop_size["height"], self.crop_size["width"] + + def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False): + return prepare_image_inputs( + batch_size=self.batch_size, + num_channels=self.num_channels, + min_resolution=self.min_resolution, + max_resolution=self.max_resolution, + equal_resolution=equal_resolution, + numpify=numpify, + torchify=torchify, + ) + + +@require_torch +@require_vision +class FastImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): + image_processing_class = FastImageProcessor if is_vision_available() else None + + def setUp(self): + self.image_processor_tester = FastImageProcessingTester(self) + + @property + def image_processor_dict(self): + return self.image_processor_tester.prepare_image_processor_dict() + + def test_image_processor_properties(self): + image_processing = self.image_processing_class(**self.image_processor_dict) + self.assertTrue(hasattr(image_processing, "do_resize")) + self.assertTrue(hasattr(image_processing, "size")) + self.assertTrue(hasattr(image_processing, "do_center_crop")) + self.assertTrue(hasattr(image_processing, "center_crop")) + self.assertTrue(hasattr(image_processing, "do_normalize")) + self.assertTrue(hasattr(image_processing, "image_mean")) + self.assertTrue(hasattr(image_processing, "image_std")) + + def test_image_processor_from_dict_with_kwargs(self): + image_processor = self.image_processing_class.from_dict(self.image_processor_dict) + 
self.assertEqual(image_processor.size, {"height": 20, "width": 20}) + self.assertEqual(image_processor.crop_size, {"height": 18, "width": 18}) + + image_processor = self.image_processing_class.from_dict( + self.image_processor_dict, size=42, crop_size=84, reduce_labels=True + ) + self.assertEqual(image_processor.size, {"height": 42, "width": 42}) + self.assertEqual(image_processor.crop_size, {"height": 84, "width": 84}) + + def test_post_process_text_detection(self): + model = FASTForImageCaptioning.from_pretrained("Raghavan/fast_base_tt_800_finetune_ic17mlt") + + image_processor = FastImageProcessor.from_pretrained("Raghavan/fast_base_tt_800_finetune_ic17mlt") + + def prepare_image(): + image_url = "https://huggingface.co/datasets/Raghavan/fast_model_samples/resolve/main/img657.jpg" + raw_image = Image.open(requests.get(image_url, stream=True).raw).convert("RGB") + return raw_image + + image = prepare_image() + inputs = image_processor(image, return_tensor="np") + + output = model(pixel_values=torch.tensor(inputs["pixel_values"])) + target_sizes = [(image.shape[1], image.shape[2]) for image in inputs["pixel_values"]] + final_out = image_processor.post_process_text_detection(output, target_sizes) + + assert len(final_out[0]["bboxes"]) == 2 + assert len(final_out[0]["bboxes"][0]) == 716 + assert final_out[0]["bboxes"][0][:10] == [484, 175, 484, 178, 483, 179, 452, 179, 452, 182] + assert round(float(final_out[0]["scores"][0]), 5) == 0.92356 diff --git a/tests/models/fast/test_modeling_fast.py b/tests/models/fast/test_modeling_fast.py index 17f09befd7cd..6fcc0214c4c3 100644 --- a/tests/models/fast/test_modeling_fast.py +++ b/tests/models/fast/test_modeling_fast.py @@ -409,7 +409,7 @@ def prepare_image(): assert final_out[0]["bboxes"][0] == [224, 120, 246, 120, 246, 134, 224, 134] assert round(float(final_out[0]["scores"][0]), 5) == 0.95541 - @slow + # @slow def test_inference_fast_base_800_total_text_ic17mlt_model(self): model = FASTForImageCaptioning.from_pretrained("Raghavan/fast_base_tt_800_finetune_ic17mlt") From 47409eb40f7841bb28499baa184764db2c197627 Mon Sep 17 00:00:00 2001 From: raghavanone Date: Wed, 1 Nov 2023 21:42:29 +0530 Subject: [PATCH 026/152] some refactorings --- src/transformers/models/fast/modeling_fast.py | 156 ++++-------------- 1 file changed, 36 insertions(+), 120 deletions(-) diff --git a/src/transformers/models/fast/modeling_fast.py b/src/transformers/models/fast/modeling_fast.py index 6aad3fa97b45..135c0f79b0cd 100644 --- a/src/transformers/models/fast/modeling_fast.py +++ b/src/transformers/models/fast/modeling_fast.py @@ -430,7 +430,7 @@ def _init_weights(self, module): module.bias.data.zero_() -class TextNet(nn.Module): +class FastTextNet(nn.Module): def __init__(self, config): super().__init__() self.first_conv = ConvLayer( @@ -447,7 +447,6 @@ def __init__(self, config): config.backbone_dropout_rate, config.backbone_ops_order, ) - # self.first_conv.apply(self._init_weights) stage1 = [] for stage_config in zip( config.backbone_stage1_in_channels, @@ -496,25 +495,25 @@ def __init__(self, config): stage4.append(RepConvLayer(*stage_config)) self.stage4 = nn.ModuleList(stage4) - def forward(self, x): - x = self.first_conv(x) + def forward(self, hidden_states): + hidden_states = self.first_conv(hidden_states) output = [] for block in self.stage1: - x = block(x) - output.append(x) + hidden_states = block(hidden_states) + output.append(hidden_states) for block in self.stage2: - x = block(x) - output.append(x) + hidden_states = block(hidden_states) + 
output.append(hidden_states) for block in self.stage3: - x = block(x) - output.append(x) + hidden_states = block(hidden_states) + output.append(hidden_states) for block in self.stage4: - x = block(x) - output.append(x) + hidden_states = block(hidden_states) + output.append(hidden_states) return output @@ -532,13 +531,9 @@ def __init__(self, config): config.neck_groups, ) ) - self.layers_count = len(reduce_layer_configs) + self.num_layers = len(reduce_layer_configs) for layer_ix in range(0, len(reduce_layer_configs)): setattr(self, f"reduce_layer{layer_ix + 1}", RepConvLayer(*reduce_layer_configs[layer_ix])) - # self.reduce_layer1 = RepConvLayer(*reduce_layer_configs[0]) - # self.reduce_layer2 = RepConvLayer(*reduce_layer_configs[1]) - # self.reduce_layer3 = RepConvLayer(*reduce_layer_configs[2]) - # self.reduce_layer4 = RepConvLayer(*reduce_layer_configs[3]) self._initialize_weights() @@ -550,22 +545,21 @@ def _initialize_weights(self): m.weight.data.fill_(1) m.bias.data.zero_() - def _upsample(self, x, y): - _, _, H, W = y.size() - return F.upsample(x, size=(H, W), mode="bilinear") + def _upsample(self, layer_out, height, width): + return F.upsample(layer_out, size=(height, width), mode="bilinear") - def forward(self, x): - f1 = x[0] - f1 = self.reduce_layer1(f1) - output_stages = [f1] + def forward(self, hidden_states): + first_layer_hidden = hidden_states[0] + first_layer_hidden = self.reduce_layer1(first_layer_hidden) + output_stages = [first_layer_hidden] - for layer_ix in range(1, self.layers_count): - layer_out = getattr(self, f"reduce_layer{layer_ix + 1}")(x[layer_ix]) - layer_out = self._upsample(layer_out, f1) + for layer_ix in range(1, self.num_layers): + layer_out = getattr(self, f"reduce_layer{layer_ix + 1}")(hidden_states[layer_ix]) + layer_out = self._upsample(layer_out, first_layer_hidden[2], first_layer_hidden[3]) output_stages.append(layer_out) - f = torch.cat(output_stages, 1) - return f + combined_hidden_states = torch.cat(output_stages, 1) + return combined_hidden_states class FASTHead(nn.Module): @@ -621,55 +615,12 @@ def _initialize_weights(self): m.weight.data.fill_(1) m.bias.data.zero_() - def forward(self, x): - x = self.conv(x) + def forward(self, hidden_states): + hidden_states = self.conv(hidden_states) if self.dropout is not None: - x = self.dropout(x) - x = self.final(x) - return x - - # def get_results(self, out, img_meta, scale=2): - # org_img_size = img_meta["org_img_size"] - # img_size = img_meta["img_size"] # 640*640 - # batch_size = out.size(0) - # outputs = {} - # - # texts = F.interpolate( - # out[:, 0:1, :, :], size=(img_size[0] // scale, img_size[1] // scale), mode="nearest" - # ) # B*1*320*320 - # texts = self._max_pooling(texts, scale=scale) # B*1*320*320 - # score_maps = torch.sigmoid_(texts) # B*1*320*320~ - # score_maps = F.interpolate(score_maps, size=(img_size[0], img_size[1]), mode="nearest") # B*1*640*640 - # score_maps = score_maps.squeeze(1) # B*640*640 - # - # kernels = (out[:, 0, :, :] > 0).to(torch.uint8) # B*160*160 - # labels_ = [] - # for kernel in kernels.numpy(): - # ret, label_ = cv2.connectedComponents(kernel) - # labels_.append(label_) - # labels_ = np.array(labels_) - # labels_ = torch.from_numpy(labels_) - # labels = labels_.unsqueeze(1).to(torch.float32) # B*1*160*160 - # labels = F.interpolate( - # labels, size=(img_size[0] // scale, img_size[1] // scale), mode="nearest" - # ) # B*1*320*320 - # labels = self._max_pooling(labels, scale=scale) - # labels = F.interpolate(labels, size=(img_size[0], img_size[1]), mode="nearest") 
# B*1*640*640 - # labels = labels.squeeze(1).to(torch.int32) # B*640*640 - # - # keys = [torch.unique(labels_[i], sorted=True) for i in range(batch_size)] - # - # outputs.update({"kernels": kernels.data.cpu()}) - # - # scales = (float(org_img_size[1]) / float(img_size[1]), float(org_img_size[0]) / float(img_size[0])) - # - # results = [] - # for i in range(batch_size): - # bboxes, scores = self.generate_bbox(keys[i], labels[i], score_maps[i], scales) - # results.append({"bboxes": bboxes, "scores": scores}) - # outputs.update({"results": results}) - # - # return outputs + hidden_states = self.dropout(hidden_states) + hidden_states = self.final(hidden_states) + return hidden_states def _max_pooling(self, x, scale=1): if scale == 1: @@ -678,39 +629,6 @@ def _max_pooling(self, x, scale=1): x = self.pooling_2s(x) return x - # def generate_bbox(self, keys, label, score, scales): - # label_num = len(keys) - # bboxes = [] - # scores = [] - # for index in range(1, label_num): - # i = keys[index] - # ind = label == i - # ind_np = ind.data.cpu().numpy() - # points = np.array(np.where(ind_np)).transpose((1, 0)) - # if points.shape[0] < self.min_area: - # label[ind] = 0 - # continue - # score_i = score[ind].mean().item() - # if score_i < self.min_score: - # label[ind] = 0 - # continue - # - # if self.bbox_type == "rect": - # rect = cv2.minAreaRect(points[:, ::-1]) - # alpha = math.sqrt(math.sqrt(points.shape[0] / (rect[1][0] * rect[1][1]))) - # rect = (rect[0], (rect[1][0] * alpha, rect[1][1] * alpha), rect[2]) - # bbox = cv2.boxPoints(rect) * scales - # else: - # binary = np.zeros(label.shape, dtype="uint8") - # binary[ind_np] = 1 - # contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) - # bbox = contours[0] * scales - # - # bbox = bbox.astype("int32") - # bboxes.append(bbox.reshape(-1).tolist()) - # scores.append(score_i) - # return bboxes, scores - def emb_loss( emb, instance, kernel, training_mask, feature_dim=4, delta_v=0.5, delta_d=1.5, weights=(1.0, 1.0), bg_sample=False @@ -921,7 +839,7 @@ class FASTForImageCaptioningOutput(ModelOutput): class FASTForImageCaptioning(FastPreTrainedModel): def __init__(self, config): super().__init__(config) - self.backbone = TextNet(config=config) + self.backbone = FastTextNet(config=config) self.neck = FASTNeck(config=config) self.det_head = FASTHead(config=config) self.loss_bg = config.loss_bg @@ -996,21 +914,19 @@ def forward( target_sizes) >>> print(text_locations[0]["bboxes"][0][:10]) [484, 175, 484, 178, 483, 179, 452, 179, 452, 182]""" # outputs = {} return_dict = return_dict if return_dict is not None else self.config.use_return_dict - f = self.backbone(pixel_values) + hidden_states = self.backbone(pixel_values) - f = self.neck(f) + hidden_states = self.neck(hidden_states) - det_out = self.det_head(f) + text_detection_output = self.det_head(hidden_states) loss = None if labels: - out = self._upsample(det_out, pixel_values.size(), scale=1) + out = self._upsample(text_detection_output, pixel_values.size(), scale=1) loss = self.loss(out, labels) - # det_res = self.det_head.get_results(det_out, img_metas, scale=2) - # outputs.update(det_res) - det_out = self._upsample(det_out, pixel_values.size(), scale=4) + text_detection_output = self._upsample(text_detection_output, pixel_values.size(), scale=4) if not return_dict: - return (loss, det_out) if loss is not None else (det_out,) + return (loss, text_detection_output) if loss is not None else (text_detection_output,) - return FASTForImageCaptioningOutput(loss, det_out) + return 
FASTForImageCaptioningOutput(loss, text_detection_output) From 6b787d687b1ece094a2c9f5d50098243423913eb Mon Sep 17 00:00:00 2001 From: raghavanone Date: Sun, 5 Nov 2023 09:09:42 +0530 Subject: [PATCH 027/152] Fix failing tests --- .../models/fast/image_processing_fast.py | 5 ---- src/transformers/models/fast/modeling_fast.py | 29 ++++++++++++------- tests/models/fast/test_modeling_fast.py | 7 ++--- 3 files changed, 22 insertions(+), 19 deletions(-) diff --git a/src/transformers/models/fast/image_processing_fast.py b/src/transformers/models/fast/image_processing_fast.py index 03625082c8ee..2e58d40c8856 100644 --- a/src/transformers/models/fast/image_processing_fast.py +++ b/src/transformers/models/fast/image_processing_fast.py @@ -258,11 +258,6 @@ def _preprocess_image( image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) return image - def __call__(self, images, segmentation_maps=None, **kwargs): - # Overrides the `__call__` method of the `Preprocessor` class such that the images and segmentation maps can both - # be passed in as positional arguments. - return super().__call__(images, segmentation_maps=segmentation_maps, **kwargs) - def preprocess( self, images: ImageInput, diff --git a/src/transformers/models/fast/modeling_fast.py b/src/transformers/models/fast/modeling_fast.py index 135c0f79b0cd..4d8f1155ad5c 100644 --- a/src/transformers/models/fast/modeling_fast.py +++ b/src/transformers/models/fast/modeling_fast.py @@ -555,7 +555,8 @@ def forward(self, hidden_states): for layer_ix in range(1, self.num_layers): layer_out = getattr(self, f"reduce_layer{layer_ix + 1}")(hidden_states[layer_ix]) - layer_out = self._upsample(layer_out, first_layer_hidden[2], first_layer_hidden[3]) + _, _, height, width = first_layer_hidden.size() + layer_out = self._upsample(layer_out, height, width) output_stages.append(layer_out) combined_hidden_states = torch.cat(output_stages, 1) @@ -903,15 +904,23 @@ def forward( Examples: ```python - >>> from transformers import FastImageProcessor, FASTForImageCaptioning >>> from PIL import Image >>> import - requests >>> url = "https://huggingface.co/datasets/Raghavan/fast_model_samples/resolve/main/img657.jpg" >>> - image = Image.open(requests.get(url, stream=True).raw).convert("RGB") >>> processor = - FastImageProcessor.from_pretrained("Raghavan/fast_base_tt_800_finetune_ic17mlt") >>> model = - FASTForImageCaptioning.from_pretrained("Raghavan/fast_base_tt_800_finetune_ic17mlt") >>> inputs = - processor(image, return_tensor="np") >>> # forward pass >>> outputs = - model(pixel_values=torch.tensor(inputs["pixel_values"])) >>> target_sizes = [(image.shape[1], image.shape[2]) - for image in inputs["pixel_values"]] >>> text_locations = processor.post_process_text_detection(outputs, - target_sizes) >>> print(text_locations[0]["bboxes"][0][:10]) [484, 175, 484, 178, 483, 179, 452, 179, 452, 182]""" + >>> from transformers import FastImageProcessor, FASTForImageCaptioning + >>> from PIL import Image + >>> import requests + + >>> url = "https://huggingface.co/datasets/Raghavan/fast_model_samples/resolve/main/img657.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw).convert("RGB") + >>> processor = FastImageProcessor.from_pretrained("Raghavan/fast_base_tt_800_finetune_ic17mlt") + >>> model = FASTForImageCaptioning.from_pretrained("Raghavan/fast_base_tt_800_finetune_ic17mlt") + >>> inputs = processor(image, return_tensors="pt") + >>> # forward pass + >>> outputs = model(pixel_values=inputs["pixel_values"]) + >>> 
target_sizes = [(image.shape[1], image.shape[2]) for image in inputs["pixel_values"]] + >>> text_locations = processor.post_process_text_detection(outputs, target_sizes) + >>> print(text_locations[0]["bboxes"][0][:10]) + [484, 175, 484, 178, 483, 179, 452, 179, 452, 182] + ``` + """ # outputs = {} return_dict = return_dict if return_dict is not None else self.config.use_return_dict hidden_states = self.backbone(pixel_values) diff --git a/tests/models/fast/test_modeling_fast.py b/tests/models/fast/test_modeling_fast.py index 6fcc0214c4c3..951fc71dcdc3 100644 --- a/tests/models/fast/test_modeling_fast.py +++ b/tests/models/fast/test_modeling_fast.py @@ -27,7 +27,6 @@ from transformers.testing_utils import ( require_torch, require_vision, - slow, torch_device, ) @@ -388,7 +387,7 @@ def test_model_is_small(self): @require_torch @require_vision class FastModelIntegrationTest(unittest.TestCase): - @slow + # @slow def test_inference_fast_tiny_ic17mlt_model(self): model = FASTForImageCaptioning.from_pretrained("Raghavan/ic17mlt_Fast_T") @@ -400,7 +399,7 @@ def prepare_image(): return raw_image image = prepare_image() - input = image_processor(image, return_tensor="np") + input = image_processor(image, return_tensors="pt") output = model(pixel_values=torch.tensor(input["pixel_values"])) target_sizes = [(image.shape[1], image.shape[2]) for image in input["pixel_values"]] @@ -421,7 +420,7 @@ def prepare_image(): return raw_image image = prepare_image() - input = image_processor(image, return_tensor="np") + input = image_processor(image, return_tensors="pt") output = model(pixel_values=torch.tensor(input["pixel_values"])) target_sizes = [(image.shape[1], image.shape[2]) for image in input["pixel_values"]] From 134f4cc37d632949fe2e2f528d24188247024e74 Mon Sep 17 00:00:00 2001 From: raghavanone Date: Sun, 5 Nov 2023 09:30:32 +0530 Subject: [PATCH 028/152] Incorporate PR feedbacks --- docs/source/en/_toctree.yml | 2 + docs/source/en/model_doc/fast.md | 6 +- src/transformers/__init__.py | 6 ++ src/transformers/models/fast/__init__.py | 4 +- .../fast/convert_fast_original_to_pytorch.py | 4 +- src/transformers/models/fast/modeling_fast.py | 67 ++++++++++++------- .../models/fast/test_image_processing_fast.py | 4 +- tests/models/fast/test_modeling_fast.py | 12 ++-- utils/check_repo.py | 2 +- 9 files changed, 65 insertions(+), 42 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 86cffb9a7e35..ca9067c596b0 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -537,6 +537,8 @@ title: EfficientFormer - local: model_doc/efficientnet title: EfficientNet + - local: model_doc/fast + title: Fast - local: model_doc/focalnet title: FocalNet - local: model_doc/glpn diff --git a/docs/source/en/model_doc/fast.md b/docs/source/en/model_doc/fast.md index 3c81109380ae..b8304251f2f9 100644 --- a/docs/source/en/model_doc/fast.md +++ b/docs/source/en/model_doc/fast.md @@ -14,7 +14,7 @@ rendered properly in your Markdown viewer. 
--> -# Fast +# FAST ## Overview @@ -34,9 +34,9 @@ than most networks that are searched for image classification [[autodoc]] FastImageProcessor -## FASTForImageCaptioning +## FastForSceneTextRecognition -[[autodoc]] FASTForImageCaptioning +[[autodoc]] FastForSceneTextRecognition - forward ## FASTForImageCaptioningOutput diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 5e12dc8c3354..37333f4ed67f 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -1199,6 +1199,7 @@ _import_structure["models.xlnet"].append("XLNetTokenizerFast") _import_structure["tokenization_utils_fast"] = ["PreTrainedTokenizerFast"] + try: if not (is_sentencepiece_available() and is_tokenizers_available()): raise OptionalDependencyNotAvailable() @@ -1308,6 +1309,7 @@ _import_structure["models.vivit"].append("VivitImageProcessor") _import_structure["models.yolos"].extend(["YolosFeatureExtractor", "YolosImageProcessor"]) + # PyTorch-backed objects try: if not is_torch_available(): @@ -4399,6 +4401,7 @@ ] _import_structure["tf_utils"] = [] + try: if not ( is_librosa_available() @@ -4423,6 +4426,7 @@ _import_structure["models.pop2piano"].append("Pop2PianoTokenizer") _import_structure["models.pop2piano"].append("Pop2PianoProcessor") + # FLAX-backed objects try: if not is_flax_available(): @@ -4747,6 +4751,7 @@ ] ) + # Direct imports for type-checking if TYPE_CHECKING: # Configuration @@ -8862,6 +8867,7 @@ extra_objects={"__version__": __version__}, ) + if not is_tf_available() and not is_torch_available() and not is_flax_available(): logger.warning( "None of PyTorch, TensorFlow >= 2.0, or Flax have been found. " diff --git a/src/transformers/models/fast/__init__.py b/src/transformers/models/fast/__init__.py index c4ecab2f2c0d..dedc491f6c59 100644 --- a/src/transformers/models/fast/__init__.py +++ b/src/transformers/models/fast/__init__.py @@ -32,7 +32,7 @@ except OptionalDependencyNotAvailable: pass else: - _import_structure["modeling_fast"] = ["FASTForImageCaptioning", "FastPreTrainedModel"] + _import_structure["modeling_fast"] = ["FastForSceneTextRecognition", "FastPreTrainedModel"] if TYPE_CHECKING: from .configuration_fast import FAST_PRETRAINED_CONFIG_ARCHIVE_MAP, FastConfig @@ -44,7 +44,7 @@ except OptionalDependencyNotAvailable: pass else: - from .modeling_fast import FASTForImageCaptioning, FastPreTrainedModel + from .modeling_fast import FastForSceneTextRecognition, FastPreTrainedModel else: diff --git a/src/transformers/models/fast/convert_fast_original_to_pytorch.py b/src/transformers/models/fast/convert_fast_original_to_pytorch.py index e549294081b8..45522f429ec2 100644 --- a/src/transformers/models/fast/convert_fast_original_to_pytorch.py +++ b/src/transformers/models/fast/convert_fast_original_to_pytorch.py @@ -22,7 +22,7 @@ import torch from PIL import Image -from transformers import FastConfig, FASTForImageCaptioning +from transformers import FastConfig, FastForSceneTextRecognition from transformers.models.fast.image_processing_fast import FastImageProcessor @@ -210,7 +210,7 @@ def convert_fast_checkpoint(checkpoint_url, checkpoint_config_url, pytorch_dump_ if "short_size" in data_config["train"]: size = data_config["train"]["short_size"] - model = FASTForImageCaptioning(config) + model = FastForSceneTextRecognition(config) fast_image_processor = FastImageProcessor( size={"height": size, "width": size}, min_score=config.min_score, diff --git a/src/transformers/models/fast/modeling_fast.py b/src/transformers/models/fast/modeling_fast.py index 
4d8f1155ad5c..b88dc6043a72 100644 --- a/src/transformers/models/fast/modeling_fast.py +++ b/src/transformers/models/fast/modeling_fast.py @@ -1,3 +1,19 @@ +# coding=utf-8 +# Copyright 2021 Microsoft Research and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch FAST model.""" + from dataclasses import dataclass from typing import Dict, Optional @@ -18,7 +34,7 @@ behavior. Parameters: - config ([`Beit3Config`]): Model configuration class with all the parameters of the model. + config ([`FastConfig`]): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. """ @@ -27,7 +43,7 @@ Args: pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See - [`BeitImageProcessor.__call__`] for details. + [`FastImageProcessor.__call__`] for details. output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail. @@ -162,7 +178,7 @@ def is_zero_layer(): return False -class ConvLayer(nn.Module): +class FASTConvLayer(nn.Module): def __init__( self, in_channels, @@ -245,9 +261,9 @@ def fuse_conv_bn(self, conv, bn): return conv -class RepConvLayer(nn.Module): +class FASTRepConvLayer(nn.Module): def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, groups=1): - super(RepConvLayer, self).__init__() + super().__init__() self.in_channels = in_channels self.out_channels = out_channels @@ -256,7 +272,6 @@ def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, self.dilation = dilation self.groups = groups - assert len(kernel_size) == 2 padding = (int(((kernel_size[0] - 1) * dilation) / 2), int(((kernel_size[1] - 1) * dilation) / 2)) self.nonlinearity = nn.ReLU(inplace=True) @@ -310,21 +325,21 @@ def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, nn.BatchNorm2d(num_features=in_channels) if out_channels == in_channels and stride == 1 else None ) - def forward(self, input): + def forward(self, hidden_states): if self.training: if hasattr(self, "fused_conv"): self.__delattr__("fused_conv") - main_outputs = self.main_conv(input) + main_outputs = self.main_conv(hidden_states) main_outputs = self.main_bn(main_outputs) if self.ver_conv is not None: - vertical_outputs = self.ver_conv(input) + vertical_outputs = self.ver_conv(hidden_states) vertical_outputs = self.ver_bn(vertical_outputs) else: vertical_outputs = 0 if self.hor_conv is not None: - horizontal_outputs = self.hor_conv(input) + horizontal_outputs = self.hor_conv(hidden_states) horizontal_outputs = self.hor_bn(horizontal_outputs) else: horizontal_outputs = 0 @@ -332,13 +347,13 @@ def forward(self, input): if self.rbr_identity is 
None: id_out = 0 else: - id_out = self.rbr_identity(input) + id_out = self.rbr_identity(hidden_states) return self.nonlinearity(main_outputs + vertical_outputs + horizontal_outputs + id_out) else: if not hasattr(self, "fused_conv"): self.prepare_for_eval() - return self.nonlinearity(self.fused_conv(input)) + return self.nonlinearity(self.fused_conv(hidden_states)) def _identity_to_conv(self, identity): if identity is None: @@ -433,7 +448,7 @@ def _init_weights(self, module): class FastTextNet(nn.Module): def __init__(self, config): super().__init__() - self.first_conv = ConvLayer( + self.first_conv = FASTConvLayer( config.backbone_in_channels, config.backbone_out_channels, config.backbone_kernel_size, @@ -456,7 +471,7 @@ def __init__(self, config): config.backbone_stage1_dilation, config.backbone_stage1_groups, ): - stage1.append(RepConvLayer(*stage_config)) + stage1.append(FASTRepConvLayer(*stage_config)) self.stage1 = nn.ModuleList(stage1) stage2 = [] @@ -468,7 +483,7 @@ def __init__(self, config): config.backbone_stage2_dilation, config.backbone_stage2_groups, ): - stage2.append(RepConvLayer(*stage_config)) + stage2.append(FASTRepConvLayer(*stage_config)) self.stage2 = nn.ModuleList(stage2) stage3 = [] @@ -480,7 +495,7 @@ def __init__(self, config): config.backbone_stage3_dilation, config.backbone_stage3_groups, ): - stage3.append(RepConvLayer(*stage_config)) + stage3.append(FASTRepConvLayer(*stage_config)) self.stage3 = nn.ModuleList(stage3) stage4 = [] @@ -492,7 +507,7 @@ def __init__(self, config): config.backbone_stage4_dilation, config.backbone_stage4_groups, ): - stage4.append(RepConvLayer(*stage_config)) + stage4.append(FASTRepConvLayer(*stage_config)) self.stage4 = nn.ModuleList(stage4) def forward(self, hidden_states): @@ -533,7 +548,7 @@ def __init__(self, config): ) self.num_layers = len(reduce_layer_configs) for layer_ix in range(0, len(reduce_layer_configs)): - setattr(self, f"reduce_layer{layer_ix + 1}", RepConvLayer(*reduce_layer_configs[layer_ix])) + setattr(self, f"reduce_layer{layer_ix + 1}", FASTRepConvLayer(*reduce_layer_configs[layer_ix])) self._initialize_weights() @@ -566,7 +581,7 @@ def forward(self, hidden_states): class FASTHead(nn.Module): def __init__(self, config): super().__init__() - self.conv = RepConvLayer( + self.conv = FASTRepConvLayer( config.head_conv_in_channels, config.head_conv_out_channels, config.head_conv_kernel_size, @@ -575,7 +590,7 @@ def __init__(self, config): config.head_conv_groups, ) - self.final = ConvLayer( + self.final = FASTConvLayer( config.head_final_in_channels, config.head_final_out_channels, config.head_final_kernel_size, @@ -813,7 +828,7 @@ def iou(a, b, mask, n_class=2, reduce=True): @dataclass -class FASTForImageCaptioningOutput(ModelOutput): +class FastForSceneTextRecognitionOutput(ModelOutput): """ Adapted from the base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states. 
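The training-time branch sum in `FASTRepConvLayer.forward` above (a main 3x3 convolution plus asymmetric 3x1 and 1x3 branches and an optional identity) is what `prepare_for_eval` later collapses into the single `fused_conv`. A small equivalence check of that re-parameterization idea, with batch-norm folding left out and arbitrary layer sizes:

```python
# Padding an asymmetric branch kernel to 3x3 and adding it to the main kernel gives one
# convolution equivalent to the branch sum; BN folding is omitted to keep the check short.
import torch
import torch.nn as nn
import torch.nn.functional as F

x = torch.randn(1, 8, 16, 16)
main = nn.Conv2d(8, 8, (3, 3), padding=(1, 1), bias=False)
vertical = nn.Conv2d(8, 8, (3, 1), padding=(1, 0), bias=False)

branch_sum = main(x) + vertical(x)

fused_weight = main.weight + F.pad(vertical.weight, [1, 1, 0, 0])  # (3, 1) kernel -> (3, 3)
fused_out = F.conv2d(x, fused_weight, padding=(1, 1))

print(torch.allclose(branch_sum, fused_out, atol=1e-6))  # True
```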
This class also adds the loss term from the text decoder as well as the image-text similarity @@ -837,7 +852,7 @@ class FASTForImageCaptioningOutput(ModelOutput): multiple benchmarks.""", FAST_START_DOCSTRING, ) -class FASTForImageCaptioning(FastPreTrainedModel): +class FastForSceneTextRecognition(FastPreTrainedModel): def __init__(self, config): super().__init__(config) self.backbone = FastTextNet(config=config) @@ -886,7 +901,7 @@ def loss(self, hidden, labels): return torch.mean(loss_text) + torch.mean(loss_kernel) + torch.mean(loss_emb) @add_start_docstrings_to_model_forward(FAST_FOR_CAPTIONING_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=FASTForImageCaptioningOutput, config_class=_CONFIG_FOR_DOC) + @replace_return_docstrings(output_type=FastForSceneTextRecognitionOutput, config_class=_CONFIG_FOR_DOC) def forward( self, pixel_values: torch.FloatTensor, @@ -904,14 +919,14 @@ def forward( Examples: ```python - >>> from transformers import FastImageProcessor, FASTForImageCaptioning + >>> from transformers import FastImageProcessor, FastForSceneTextRecognition >>> from PIL import Image >>> import requests >>> url = "https://huggingface.co/datasets/Raghavan/fast_model_samples/resolve/main/img657.jpg" >>> image = Image.open(requests.get(url, stream=True).raw).convert("RGB") >>> processor = FastImageProcessor.from_pretrained("Raghavan/fast_base_tt_800_finetune_ic17mlt") - >>> model = FASTForImageCaptioning.from_pretrained("Raghavan/fast_base_tt_800_finetune_ic17mlt") + >>> model = FastForSceneTextRecognition.from_pretrained("Raghavan/fast_base_tt_800_finetune_ic17mlt") >>> inputs = processor(image, return_tensors="pt") >>> # forward pass >>> outputs = model(pixel_values=inputs["pixel_values"]) @@ -938,4 +953,4 @@ def forward( if not return_dict: return (loss, text_detection_output) if loss is not None else (text_detection_output,) - return FASTForImageCaptioningOutput(loss, text_detection_output) + return FastForSceneTextRecognitionOutput(loss, text_detection_output) diff --git a/tests/models/fast/test_image_processing_fast.py b/tests/models/fast/test_image_processing_fast.py index 17b11004b2f2..8aa523dc03f3 100644 --- a/tests/models/fast/test_image_processing_fast.py +++ b/tests/models/fast/test_image_processing_fast.py @@ -30,7 +30,7 @@ if is_vision_available(): from PIL import Image - from transformers import FASTForImageCaptioning, FastImageProcessor + from transformers import FastForSceneTextRecognition, FastImageProcessor class FastImageProcessingTester(unittest.TestCase): @@ -138,7 +138,7 @@ def test_image_processor_from_dict_with_kwargs(self): self.assertEqual(image_processor.crop_size, {"height": 84, "width": 84}) def test_post_process_text_detection(self): - model = FASTForImageCaptioning.from_pretrained("Raghavan/fast_base_tt_800_finetune_ic17mlt") + model = FastForSceneTextRecognition.from_pretrained("Raghavan/fast_base_tt_800_finetune_ic17mlt") image_processor = FastImageProcessor.from_pretrained("Raghavan/fast_base_tt_800_finetune_ic17mlt") diff --git a/tests/models/fast/test_modeling_fast.py b/tests/models/fast/test_modeling_fast.py index 951fc71dcdc3..409f579eed0f 100644 --- a/tests/models/fast/test_modeling_fast.py +++ b/tests/models/fast/test_modeling_fast.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch Falcon model. """ +""" Testing suite for the PyTorch FAST model. 
""" import inspect import unittest @@ -40,7 +40,7 @@ import torch from transformers import ( - FASTForImageCaptioning, + FastForSceneTextRecognition, ) @@ -269,7 +269,7 @@ def get_config(self): ) def create_and_check_model(self, config, input): - model = FASTForImageCaptioning(config=config) + model = FastForSceneTextRecognition(config=config) model.to(torch_device) model.eval() result = model(pixel_values=input["pixel_values"]) @@ -283,7 +283,7 @@ def prepare_config_and_inputs_for_common(self): @require_torch class FastModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): - all_model_classes = (FASTForImageCaptioning,) if is_torch_available() else () + all_model_classes = (FastForSceneTextRecognition,) if is_torch_available() else () pipeline_model_mapping = {} test_headmasking = False @@ -389,7 +389,7 @@ def test_model_is_small(self): class FastModelIntegrationTest(unittest.TestCase): # @slow def test_inference_fast_tiny_ic17mlt_model(self): - model = FASTForImageCaptioning.from_pretrained("Raghavan/ic17mlt_Fast_T") + model = FastForSceneTextRecognition.from_pretrained("Raghavan/ic17mlt_Fast_T") image_processor = FastImageProcessor.from_pretrained("Raghavan/ic17mlt_Fast_T") @@ -410,7 +410,7 @@ def prepare_image(): # @slow def test_inference_fast_base_800_total_text_ic17mlt_model(self): - model = FASTForImageCaptioning.from_pretrained("Raghavan/fast_base_tt_800_finetune_ic17mlt") + model = FastForSceneTextRecognition.from_pretrained("Raghavan/fast_base_tt_800_finetune_ic17mlt") image_processor = FastImageProcessor.from_pretrained("Raghavan/fast_base_tt_800_finetune_ic17mlt") diff --git a/utils/check_repo.py b/utils/check_repo.py index e9419bd78b03..3af3a05a8aa6 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -223,7 +223,7 @@ "TFCLIPVisionModel", "TFGroupViTTextModel", "TFGroupViTVisionModel", - "FASTForImageCaptioning", + "FastForSceneTextRecognition", "FlaxCLIPTextModel", "FlaxCLIPTextModelWithProjection", "FlaxCLIPVisionModel", From 5b9608b2b62f26886a9447b6d9be3717124bb5ce Mon Sep 17 00:00:00 2001 From: raghavanone Date: Sun, 5 Nov 2023 09:44:55 +0530 Subject: [PATCH 029/152] Incorporate PR feedbacks --- .../models/fast/image_processing_fast.py | 15 +-- src/transformers/models/fast/modeling_fast.py | 124 ++---------------- 2 files changed, 15 insertions(+), 124 deletions(-) diff --git a/src/transformers/models/fast/image_processing_fast.py b/src/transformers/models/fast/image_processing_fast.py index 2e58d40c8856..d03f8b542ae0 100644 --- a/src/transformers/models/fast/image_processing_fast.py +++ b/src/transformers/models/fast/image_processing_fast.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Image processor class for Fast.""" +"""Image processor class for FAST.""" import math from typing import Any, Dict, List, Optional, Union @@ -58,7 +58,7 @@ class FastImageProcessor(BaseImageProcessor): r""" - Constructs a Fast image processor. + Constructs a FAST image processor. 
Args: do_resize (`bool`, *optional*, defaults to `True`): @@ -261,7 +261,6 @@ def _preprocess_image( def preprocess( self, images: ImageInput, - segmentation_maps: Optional[ImageInput] = None, do_resize: bool = None, size: Dict[str, int] = None, resample: PILImageResampling = None, @@ -339,8 +338,6 @@ def preprocess( image_std = image_std if image_std is not None else self.image_std images = make_list_of_images(images) - if segmentation_maps is not None: - segmentation_maps = make_list_of_images(segmentation_maps, expected_ndims=2) if not valid_images(images): raise ValueError( @@ -348,12 +345,6 @@ def preprocess( "torch.Tensor, tf.Tensor or jax.ndarray." ) - if segmentation_maps is not None and not valid_images(segmentation_maps): - raise ValueError( - "Invalid segmentation map type. Must be of type PIL.Image.Image, numpy.ndarray, " - "torch.Tensor, tf.Tensor or jax.ndarray." - ) - if do_resize and size is None or resample is None: raise ValueError("Size and resample must be specified if do_resize is True.") diff --git a/src/transformers/models/fast/modeling_fast.py b/src/transformers/models/fast/modeling_fast.py index b88dc6043a72..b209edcdceb4 100644 --- a/src/transformers/models/fast/modeling_fast.py +++ b/src/transformers/models/fast/modeling_fast.py @@ -57,12 +57,9 @@ def get_same_padding(kernel_size): if isinstance(kernel_size, tuple): - assert len(kernel_size) == 2, "invalid kernel size: %s" % kernel_size p1 = get_same_padding(kernel_size[0]) p2 = get_same_padding(kernel_size[1]) return p1, p2 - assert isinstance(kernel_size, int), "kernel size should be either `int` or `tuple`" - assert kernel_size % 2 > 0, "kernel size should be odd number" return kernel_size // 2 @@ -81,103 +78,6 @@ def build_activation(act_func, inplace=True): raise ValueError("do not support: %s" % act_func) -class My2DLayer(nn.Module): - def __init__( - self, in_channels, out_channels, use_bn=True, act_func="relu", dropout_rate=0, ops_order="weight_bn_act" - ): - super(My2DLayer, self).__init__() - self.in_channels = in_channels - self.out_channels = out_channels - - self.use_bn = use_bn - self.act_func = act_func - self.dropout_rate = dropout_rate - self.ops_order = ops_order - - """ modules""" - modules = {} - # batch norm - if self.use_bn: - if self.bn_before_weight: - modules["bn"] = nn.BatchNorm2d(in_channels) - else: - modules["bn"] = nn.BatchNorm2d(out_channels) - else: - modules["bn"] = None - # activation - modules["act"] = build_activation(self.act_func, self.ops_list[0] != "act") - # dropout - if self.dropout_rate > 0: - modules["dropout"] = nn.Dropout2d(self.dropout_rate, inplace=True) - else: - modules["dropout"] = None - # weight - modules["weight"] = self.weight_op() - - # add modules - for op in self.ops_list: - if modules[op] is None: - continue - elif op == "weight": - if modules["dropout"] is not None: - self.add_module("dropout", modules["dropout"]) - for key in modules["weight"]: - self.add_module(key, modules["weight"][key]) - else: - self.add_module(op, modules[op]) - - @property - def ops_list(self): - return self.ops_order.split("_") - - @property - def bn_before_weight(self): - for op in self.ops_list: - if op == "bn": - return True - elif op == "weight": - return False - raise ValueError("Invalid ops_order: %s" % self.ops_order) - - def weight_op(self): - raise NotImplementedError - - """ Methods defined in MyModule""" - - def forward(self, x): - for key, module in self._modules.items(): - if key == "bn" and not self.training: - continue - x = module(x) - return x - - @property - 
def module_str(self): - raise NotImplementedError - - @property - def config(self): - return { - "in_channels": self.in_channels, - "out_channels": self.out_channels, - "use_bn": self.use_bn, - "act_func": self.act_func, - "dropout_rate": self.dropout_rate, - "ops_order": self.ops_order, - } - - @staticmethod - def build_from_config(config): - raise NotImplementedError - - def get_flops(self, x): - raise NotImplementedError - - @staticmethod - def is_zero_layer(): - return False - - class FASTConvLayer(nn.Module): def __init__( self, @@ -202,7 +102,7 @@ def __init__( self.groups = groups self.bias = bias self.has_shuffle = has_shuffle - self.act_func = act_func + self.activation_function = act_func padding = get_same_padding(self.kernel_size) if isinstance(padding, int): @@ -225,11 +125,11 @@ def __init__( if use_bn: self.bn = nn.BatchNorm2d(out_channels) - self.act = nn.Identity() + self.activation = nn.Identity() if use_act: - act = build_activation(self.act_func, True) + act = build_activation(self.activation_function, True) if act is not None: - self.act = act + self.activation = act def forward(self, x): if self.training: @@ -237,27 +137,27 @@ def forward(self, x): delattr(self, "fused_conv") x = self.conv(x) x = self.bn(x) - return self.act(x) + return self.activation(x) else: if not hasattr(self, "fused_conv"): setattr(self, "fused_conv", self.fuse_conv_bn(self.conv, self.bn)) x = self.fused_conv(x) - if self.act is not None: - x = self.act(x) + if self.activation is not None: + x = self.activation(x) return x - def fuse_conv_bn(self, conv, bn): + def fuse_conv_bn(self, conv, batch_norm): """During inference, the functionary of batch norm layers is turned off but only the mean and var alone channels are used, which exposes the chance to fuse it with the preceding conv layers to save computations and simplify network structures.""" - if isinstance(bn, nn.Identity): + if isinstance(batch_norm, nn.Identity): return conv conv_w = conv.weight - conv_b = conv.bias if conv.bias is not None else torch.zeros_like(bn.running_mean) + conv_b = conv.bias if conv.bias is not None else torch.zeros_like(batch_norm.running_mean) - factor = bn.weight / torch.sqrt(bn.running_var + bn.eps) + factor = batch_norm.weight / torch.sqrt(batch_norm.running_var + batch_norm.eps) conv.weight = nn.Parameter(conv_w * factor.reshape([conv.out_channels, 1, 1, 1])) - conv.bias = nn.Parameter((conv_b - bn.running_mean) * factor + bn.bias) + conv.bias = nn.Parameter((conv_b - batch_norm.running_mean) * factor + batch_norm.bias) return conv From 344dc6eafa9dc714c12fca53a5853d189cc5ce6c Mon Sep 17 00:00:00 2001 From: raghavanone Date: Sun, 5 Nov 2023 15:43:30 +0530 Subject: [PATCH 030/152] Incorporate PR feedbacks --- docs/source/en/_toctree.yml | 2 +- docs/source/en/model_doc/fast.md | 2 +- src/transformers/__init__.py | 1 + .../fast/convert_fast_original_to_pytorch.py | 21 ++-- src/transformers/models/fast/modeling_fast.py | 95 +++++++++---------- 5 files changed, 57 insertions(+), 64 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index ca9067c596b0..51602dc805d4 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -538,7 +538,7 @@ - local: model_doc/efficientnet title: EfficientNet - local: model_doc/fast - title: Fast + title: FAST - local: model_doc/focalnet title: FocalNet - local: model_doc/glpn diff --git a/docs/source/en/model_doc/fast.md b/docs/source/en/model_doc/fast.md index b8304251f2f9..e5c8c58f1856 100644 --- a/docs/source/en/model_doc/fast.md 
+++ b/docs/source/en/model_doc/fast.md @@ -24,7 +24,7 @@ arbitrarily-shaped text detector). FAST has two new designs. (1) We design a minimalist kernel representation (only has 1-channel output) to model text with arbitrary shape, as well as a GPU-parallel post-processing to efficiently assemble text lines with a negligible time overhead. (2) We search the network architecture tailored for text detection, leading to more powerful features -than most networks that are searched for image classification +than most networks that are searched for image classification. ## FastConfig diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 37333f4ed67f..4941d724455d 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -50,6 +50,7 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name + # Base objects, independent of any specific backend _import_structure = { "audio_utils": [], diff --git a/src/transformers/models/fast/convert_fast_original_to_pytorch.py b/src/transformers/models/fast/convert_fast_original_to_pytorch.py index 45522f429ec2..b64263b6df9f 100644 --- a/src/transformers/models/fast/convert_fast_original_to_pytorch.py +++ b/src/transformers/models/fast/convert_fast_original_to_pytorch.py @@ -31,18 +31,9 @@ base_config_url = "https://raw.githubusercontent.com/czczup/FAST/main/config/fast/nas-configs/fast_base.config" rename_key_mappings = { - "head": "classifier", - "text_embed": "text_embedding", - "vision_embed": "vision_embedding", - "k_proj": "key_proj", - "q_proj": "query_proj", - "v_proj": "value_proj", - "A": "text", - "B": "image", - "layer_norm": "fc_norm", - "self_attn_fc_norm": "self_attn_layer_norm", - "final_fc_norm": "final_layer_norm", - "first": "first", + "bn": "batch_norm", + "hor": "horizontal", + "ver": "vertical", } @@ -222,7 +213,11 @@ def convert_fast_checkpoint(checkpoint_url, checkpoint_config_url, pytorch_dump_ state_dict_changed = copy.deepcopy(state_dict) for key in state_dict: val = state_dict_changed.pop(key) - state_dict_changed[key.replace("module.", "")] = val + new_key = key.replace("module.", "") + for search, replacement in rename_key_mappings.items(): + if search in new_key: + new_key = new_key.replace(search, replacement) + state_dict_changed[new_key] = val model.load_state_dict(state_dict_changed) model.save_pretrained(pytorch_dump_folder_path) diff --git a/src/transformers/models/fast/modeling_fast.py b/src/transformers/models/fast/modeling_fast.py index b209edcdceb4..dd1e9be971c3 100644 --- a/src/transformers/models/fast/modeling_fast.py +++ b/src/transformers/models/fast/modeling_fast.py @@ -49,9 +49,6 @@ more detail. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. A - classification loss is computed (Cross-Entropy) against these labels. 
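The conversion change above strips the `module.` prefix that DataParallel checkpoints carry and then applies the substring renames from `rename_key_mappings` (`bn` → `batch_norm`, `hor` → `horizontal`, `ver` → `vertical`). A minimal sketch of that remapping on a made-up state dict (the key names and tensor shapes below are illustrative only, not taken from a real checkpoint):

```python
import copy

import torch

rename_key_mappings = {"bn": "batch_norm", "hor": "horizontal", "ver": "vertical"}

# toy state dict standing in for the original FAST checkpoint weights
state_dict = {
    "module.backbone.stage1.0.conv.weight": torch.zeros(64, 3, 3, 3),
    "module.backbone.stage1.0.bn.weight": torch.zeros(64),
    "module.neck.reduce_layer1.hor_conv.weight": torch.zeros(128, 128, 1, 3),
}

state_dict_changed = copy.deepcopy(state_dict)
for key in state_dict:
    val = state_dict_changed.pop(key)
    new_key = key.replace("module.", "")  # drop the DataParallel wrapper prefix
    for search, replacement in rename_key_mappings.items():
        if search in new_key:
            new_key = new_key.replace(search, replacement)
    state_dict_changed[new_key] = val

print(sorted(state_dict_changed))
# ['backbone.stage1.0.batch_norm.weight',
#  'backbone.stage1.0.conv.weight',
#  'neck.reduce_layer1.horizontal_conv.weight']
```

Because the renames are plain substring replacements, they rely on the original checkpoint keys not containing `bn`, `hor`, or `ver` in any other context.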
""" @@ -89,7 +86,7 @@ def __init__( groups=1, bias=False, has_shuffle=False, - use_bn=True, + use_batch_norm=True, act_func="relu", dropout_rate=0, use_act=True, @@ -121,9 +118,9 @@ def __init__( groups=groups, bias=bias, ) - self.bn = nn.Identity() - if use_bn: - self.bn = nn.BatchNorm2d(out_channels) + self.batch_norm = nn.Identity() + if use_batch_norm: + self.batch_norm = nn.BatchNorm2d(out_channels) self.activation = nn.Identity() if use_act: @@ -131,22 +128,22 @@ def __init__( if act is not None: self.activation = act - def forward(self, x): + def forward(self, hidden_states): if self.training: if hasattr(self, "fused_conv"): delattr(self, "fused_conv") - x = self.conv(x) - x = self.bn(x) - return self.activation(x) + hidden_states = self.conv(hidden_states) + hidden_states = self.batch_norm(hidden_states) + return self.activation(hidden_states) else: if not hasattr(self, "fused_conv"): - setattr(self, "fused_conv", self.fuse_conv_bn(self.conv, self.bn)) - x = self.fused_conv(x) + setattr(self, "fused_conv", self.fuse_conv_batch_norm(self.conv, self.batch_norm)) + hidden_states = self.fused_conv(hidden_states) if self.activation is not None: - x = self.activation(x) - return x + hidden_states = self.activation(hidden_states) + return hidden_states - def fuse_conv_bn(self, conv, batch_norm): + def fuse_conv_batch_norm(self, conv, batch_norm): """During inference, the functionary of batch norm layers is turned off but only the mean and var alone channels are used, which exposes the chance to fuse it with the preceding conv layers to save computations and simplify network structures.""" @@ -186,13 +183,13 @@ def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, groups=groups, bias=False, ) - self.main_bn = nn.BatchNorm2d(num_features=out_channels) + self.main_batch_norm = nn.BatchNorm2d(num_features=out_channels) ver_pad = (int(((kernel_size[0] - 1) * dilation) / 2), 0) hor_pad = (0, int(((kernel_size[1] - 1) * dilation) / 2)) if kernel_size[1] != 1: - self.ver_conv = nn.Conv2d( + self.vertical_conv = nn.Conv2d( in_channels=in_channels, out_channels=out_channels, kernel_size=(kernel_size[0], 1), @@ -202,12 +199,12 @@ def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, groups=groups, bias=False, ) - self.ver_bn = nn.BatchNorm2d(num_features=out_channels) + self.vertical_batch_norm = nn.BatchNorm2d(num_features=out_channels) else: - self.ver_conv, self.ver_bn = None, None + self.vertical_conv, self.vertical_batch_norm = None, None if kernel_size[0] != 1: # 卷积核的高大于1 -> 有水平卷积 - self.hor_conv = nn.Conv2d( + self.horizontal_conv = nn.Conv2d( in_channels=in_channels, out_channels=out_channels, kernel_size=(1, kernel_size[1]), @@ -217,9 +214,9 @@ def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, groups=groups, bias=False, ) - self.hor_bn = nn.BatchNorm2d(num_features=out_channels) + self.horizontal_batch_norm = nn.BatchNorm2d(num_features=out_channels) else: - self.hor_conv, self.hor_bn = None, None + self.horizontal_conv, self.horizontal_batch_norm = None, None self.rbr_identity = ( nn.BatchNorm2d(num_features=in_channels) if out_channels == in_channels and stride == 1 else None @@ -231,16 +228,16 @@ def forward(self, hidden_states): self.__delattr__("fused_conv") main_outputs = self.main_conv(hidden_states) - main_outputs = self.main_bn(main_outputs) - if self.ver_conv is not None: - vertical_outputs = self.ver_conv(hidden_states) - vertical_outputs = self.ver_bn(vertical_outputs) + main_outputs = 
self.main_batch_norm(main_outputs) + if self.vertical_conv is not None: + vertical_outputs = self.vertical_conv(hidden_states) + vertical_outputs = self.vertical_batch_norm(vertical_outputs) else: vertical_outputs = 0 - if self.hor_conv is not None: - horizontal_outputs = self.hor_conv(hidden_states) - horizontal_outputs = self.hor_bn(horizontal_outputs) + if self.horizontal_conv is not None: + horizontal_outputs = self.horizontal_conv(hidden_states) + horizontal_outputs = self.horizontal_batch_norm(horizontal_outputs) else: horizontal_outputs = 0 @@ -258,7 +255,6 @@ def forward(self, hidden_states): def _identity_to_conv(self, identity): if identity is None: return 0, 0 - assert isinstance(identity, nn.BatchNorm2d) if not hasattr(self, "id_tensor"): input_dim = self.in_channels // self.groups kernel_value = np.zeros((self.in_channels, input_dim, 1, 1), dtype=np.float32) @@ -276,26 +272,26 @@ def _identity_to_conv(self, identity): t = (gamma / std).reshape(-1, 1, 1, 1) return kernel * t, beta - running_mean * gamma / std - def _fuse_bn_tensor(self, conv, bn): + def _fuse_batch_norm_tensor(self, conv, batch_norm): kernel = conv.weight kernel = self._pad_to_mxn_tensor(kernel) - running_mean = bn.running_mean - running_var = bn.running_var - gamma = bn.weight - beta = bn.bias - eps = bn.eps + running_mean = batch_norm.running_mean + running_var = batch_norm.running_var + gamma = batch_norm.weight + beta = batch_norm.bias + eps = batch_norm.eps std = (running_var + eps).sqrt() t = (gamma / std).reshape(-1, 1, 1, 1) return kernel * t, beta - running_mean * gamma / std def get_equivalent_kernel_bias(self): - kernel_mxn, bias_mxn = self._fuse_bn_tensor(self.main_conv, self.main_bn) - if self.ver_conv is not None: - kernel_mx1, bias_mx1 = self._fuse_bn_tensor(self.ver_conv, self.ver_bn) + kernel_mxn, bias_mxn = self._fuse_batch_norm_tensor(self.main_conv, self.main_batch_norm) + if self.vertical_conv is not None: + kernel_mx1, bias_mx1 = self._fuse_batch_norm_tensor(self.vertical_conv, self.vertical_batch_norm) else: kernel_mx1, bias_mx1 = 0, 0 - if self.hor_conv is not None: - kernel_1xn, bias_1xn = self._fuse_bn_tensor(self.hor_conv, self.hor_bn) + if self.horizontal_conv is not None: + kernel_1xn, bias_1xn = self._fuse_batch_norm_tensor(self.horizontal_conv, self.horizontal_batch_norm) else: kernel_1xn, bias_1xn = 0, 0 kernel_id, bias_id = self._identity_to_conv(self.rbr_identity) @@ -746,10 +742,12 @@ class FastForSceneTextRecognitionOutput(ModelOutput): @add_start_docstrings( - """BEiT-3 is a general-purpose multimodal foundation model that excels in both vision and vision-language tasks. It - utilizes [Multiway transformers] (https://arxiv.org/abs/2208.10442) for deep fusion and modality-specific - encoding, and unifies masked modeling on images, texts, and image-text pairs, achieving top performance on - multiple benchmarks.""", + """FAST (faster arbitararily-shaped text detector) proposes an accurate and efficient scene text detection + framework, termed FAST (i.e., faster arbitrarily-shaped text detector).FAST has two new designs. (1) They design a + minimalist kernel representation (only has 1-channel output) to model text with arbitrary shape, as well as a + GPU-parallel post-processing to efficiently assemble text lines with a negligible time overhead. 
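The reparameterization above relies on folding an inference-mode batch norm into the preceding convolution: the kernel is scaled by `gamma / sqrt(running_var + eps)` and the mean/beta terms are folded into a bias. A standalone check of that algebra with plain `nn.Conv2d` / `nn.BatchNorm2d` layers (toy shapes, unrelated to the model's actual configuration):

```python
import torch
from torch import nn

conv = nn.Conv2d(8, 16, kernel_size=3, padding=1, bias=False)
batch_norm = nn.BatchNorm2d(16).eval()

with torch.no_grad():
    # give the batch norm non-trivial statistics so the check is meaningful
    batch_norm.running_mean.uniform_(-1.0, 1.0)
    batch_norm.running_var.uniform_(0.5, 2.0)
    batch_norm.weight.uniform_(0.5, 1.5)
    batch_norm.bias.uniform_(-1.0, 1.0)

    # same algebra as the fusion above: scale the kernel, fold mean/beta into a bias
    factor = batch_norm.weight / torch.sqrt(batch_norm.running_var + batch_norm.eps)
    fused = nn.Conv2d(8, 16, kernel_size=3, padding=1, bias=True)
    fused.weight.copy_(conv.weight * factor.reshape(16, 1, 1, 1))
    # torch.zeros(16) stands in for the missing conv bias, as in the patch
    fused.bias.copy_((torch.zeros(16) - batch_norm.running_mean) * factor + batch_norm.bias)

    pixel_values = torch.randn(2, 8, 32, 32)
    assert torch.allclose(fused(pixel_values), batch_norm(conv(pixel_values)), atol=1e-5)
```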
(2) We search the + network architecture tailored for text detection, leading to more powerful features than most networks that are + searched for image classification.""", FAST_START_DOCSTRING, ) class FastForSceneTextRecognition(FastPreTrainedModel): @@ -810,9 +808,8 @@ def forward( labels: Dict = None, ): r""" - labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*): - Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy). + labels (`Dict[str, torch.Tensor]`, *optional*): + Should contain 3 keys: gt_texts,gt_kernels,gt_instances Returns: From 932d59233544f7e37276cd6e8fee73aa0b0e7343 Mon Sep 17 00:00:00 2001 From: raghavanone Date: Sun, 5 Nov 2023 16:39:20 +0530 Subject: [PATCH 031/152] Incorporate PR feedbacks --- .../models/fast/configuration_fast.py | 2 -- .../fast/convert_fast_original_to_pytorch.py | 12 ++++------- .../models/fast/image_processing_fast.py | 21 ++++++++++--------- src/transformers/models/fast/modeling_fast.py | 7 ++----- .../models/fast/test_image_processing_fast.py | 3 ++- tests/models/fast/test_modeling_fast.py | 6 ++++-- 6 files changed, 23 insertions(+), 28 deletions(-) diff --git a/src/transformers/models/fast/configuration_fast.py b/src/transformers/models/fast/configuration_fast.py index 3f813386507f..186b398a4745 100644 --- a/src/transformers/models/fast/configuration_fast.py +++ b/src/transformers/models/fast/configuration_fast.py @@ -96,7 +96,6 @@ def __init__( head_final_dropout_rate=0, head_final_ops_order="weight", min_area=250, - min_score=0.88, bbox_type="rect", loss_bg=False, initializer_range=0.02, @@ -176,7 +175,6 @@ def __init__( self.head_final_ops_order = head_final_ops_order self.min_area = min_area - self.min_score = min_score self.bbox_type = bbox_type self.loss_bg = loss_bg self.initializer_range = initializer_range diff --git a/src/transformers/models/fast/convert_fast_original_to_pytorch.py b/src/transformers/models/fast/convert_fast_original_to_pytorch.py index b64263b6df9f..0207f123b257 100644 --- a/src/transformers/models/fast/convert_fast_original_to_pytorch.py +++ b/src/transformers/models/fast/convert_fast_original_to_pytorch.py @@ -43,7 +43,7 @@ def prepare_img(): return im -def prepare_config(size_config_url, pooling_size, min_area, min_score, bbox_type, loss_bg): +def prepare_config(size_config_url, pooling_size, min_area, bbox_type, loss_bg): config_dict = json.loads(requests.get(size_config_url).text) backbone_config = {} @@ -148,7 +148,6 @@ def prepare_config(size_config_url, pooling_size, min_area, min_score, bbox_type head_final_dropout_rate=config_dict["head"]["final"]["dropout_rate"], head_final_ops_order=config_dict["head"]["final"]["ops_order"], min_area=min_area, - min_score=min_score, bbox_type=bbox_type, loss_bg=loss_bg, ) @@ -174,27 +173,25 @@ def convert_fast_checkpoint(checkpoint_url, checkpoint_config_url, pytorch_dump_ test_config = namespace.get("test_cfg", None) data_config = namespace.get("data") - min_score = 0.88 min_area = 250 bbox_type = "rect" loss_bg = False if test_config is not None: min_area = test_config.get("min_area", min_area) - min_score = test_config.get("min_score", min_score) bbox_type = test_config.get("bbox_type", bbox_type) loss_bg = test_config.get("loss_emb", None) == "EmbLoss_v2" if "tiny" in model_config["backbone"]["config"]: config = prepare_config( - tiny_config_url, model_config["detection_head"]["pooling_size"], 
min_area, min_score, bbox_type, loss_bg + tiny_config_url, model_config["detection_head"]["pooling_size"], min_area, bbox_type, loss_bg ) elif "small" in model_config["backbone"]["config"]: config = prepare_config( - small_config_url, model_config["detection_head"]["pooling_size"], min_area, min_score, bbox_type, loss_bg + small_config_url, model_config["detection_head"]["pooling_size"], min_area, bbox_type, loss_bg ) else: config = prepare_config( - base_config_url, model_config["detection_head"]["pooling_size"], min_area, min_score, bbox_type, loss_bg + base_config_url, model_config["detection_head"]["pooling_size"], min_area, bbox_type, loss_bg ) size = 640 if "train" in data_config: @@ -204,7 +201,6 @@ def convert_fast_checkpoint(checkpoint_url, checkpoint_config_url, pytorch_dump_ model = FastForSceneTextRecognition(config) fast_image_processor = FastImageProcessor( size={"height": size, "width": size}, - min_score=config.min_score, min_area=config.min_area, bbox_type=config.bbox_type, pooling_size=config.head_pooling_size, diff --git a/src/transformers/models/fast/image_processing_fast.py b/src/transformers/models/fast/image_processing_fast.py index d03f8b542ae0..8aeb1e6f0334 100644 --- a/src/transformers/models/fast/image_processing_fast.py +++ b/src/transformers/models/fast/image_processing_fast.py @@ -92,10 +92,12 @@ class FastImageProcessor(BaseImageProcessor): image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`): The standard deviation to use if normalizing the image. This is a float or list of floats of length of the number of channels of the image. Can be overridden by the `image_std` parameter in the `preprocess` method. - min_area (`int`, *optional*, defaults to 200): Threshold for min area for results - min_score (`float`, *optional*, defaults to 0.88): Threshold for min score for results - bbox_type (`str`, *optional*, defaults to `"rect"`): Type of bbox, rect or poly - pooling_size (`int`, *optional*, defaults to 9): Pooling size for text detection + min_area (`int`, *optional*, defaults to 200): + Threshold for min area for results + bbox_type (`str`, *optional*, defaults to `"rect"`): + Type of bbox, rect or poly + pooling_size (`int`, *optional*, defaults to 9): + Pooling size for text detection """ model_input_names = ["pixel_values"] @@ -113,7 +115,6 @@ def __init__( image_mean: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None, min_area: int = 200, - min_score: float = 0.88, bbox_type: str = "rect", pooling_size: int = 9, **kwargs, @@ -134,7 +135,7 @@ def __init__( self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD self.min_area = min_area - self.min_score = min_score + # self.threshold = threshold self.bbox_type = bbox_type self.pooling_size = pooling_size @@ -389,7 +390,7 @@ def _max_pooling(self, x, scale=1): ) return x - def post_process_text_detection(self, output, target_sizes): + def post_process_text_detection(self, output, target_sizes, threshold): scale = 2 img_size = (self.size["height"], self.size["width"]) out = output["hidden_states"] @@ -428,13 +429,13 @@ def post_process_text_detection(self, output, target_sizes): org_img_size = target_sizes[i] scales = (float(org_img_size[1]) / float(img_size[1]), float(org_img_size[0]) / float(img_size[0])) - bboxes, scores = self.generate_bbox(keys[i], labels[i], score_maps[i], scales) + bboxes, scores = 
self.generate_bbox(keys[i], labels[i], score_maps[i], scales, threshold) results.append({"bboxes": bboxes, "scores": scores}) final_results.update({"results": results}) return results - def generate_bbox(self, keys, label, score, scales): + def generate_bbox(self, keys, label, score, scales, threshold): label_num = len(keys) bboxes = [] scores = [] @@ -447,7 +448,7 @@ def generate_bbox(self, keys, label, score, scales): label[ind] = 0 continue score_i = score[ind].mean().item() - if score_i < self.min_score: + if score_i < threshold: label[ind] = 0 continue diff --git a/src/transformers/models/fast/modeling_fast.py b/src/transformers/models/fast/modeling_fast.py index dd1e9be971c3..761f7a9066f7 100644 --- a/src/transformers/models/fast/modeling_fast.py +++ b/src/transformers/models/fast/modeling_fast.py @@ -501,10 +501,6 @@ def __init__(self, config): config.head_final_ops_order, ) - self.min_area = config.min_area - self.min_score = config.min_score - self.bbox_type = config.bbox_type - self.pooling_size = config.head_pooling_size self.pooling_1s = nn.MaxPool2d(kernel_size=self.pooling_size, stride=1, padding=(self.pooling_size - 1) // 2) @@ -828,7 +824,8 @@ def forward( >>> # forward pass >>> outputs = model(pixel_values=inputs["pixel_values"]) >>> target_sizes = [(image.shape[1], image.shape[2]) for image in inputs["pixel_values"]] - >>> text_locations = processor.post_process_text_detection(outputs, target_sizes) + >>> threshold = 0.88 + >>> text_locations = processor.post_process_text_detection(outputs, target_sizes, threshold) >>> print(text_locations[0]["bboxes"][0][:10]) [484, 175, 484, 178, 483, 179, 452, 179, 452, 182] ``` diff --git a/tests/models/fast/test_image_processing_fast.py b/tests/models/fast/test_image_processing_fast.py index 8aa523dc03f3..f8192856849b 100644 --- a/tests/models/fast/test_image_processing_fast.py +++ b/tests/models/fast/test_image_processing_fast.py @@ -152,7 +152,8 @@ def prepare_image(): output = model(pixel_values=torch.tensor(inputs["pixel_values"])) target_sizes = [(image.shape[1], image.shape[2]) for image in inputs["pixel_values"]] - final_out = image_processor.post_process_text_detection(output, target_sizes) + threshold = 0.88 + final_out = image_processor.post_process_text_detection(output, target_sizes, threshold) assert len(final_out[0]["bboxes"]) == 2 assert len(final_out[0]["bboxes"][0]) == 716 diff --git a/tests/models/fast/test_modeling_fast.py b/tests/models/fast/test_modeling_fast.py index 409f579eed0f..f97481436676 100644 --- a/tests/models/fast/test_modeling_fast.py +++ b/tests/models/fast/test_modeling_fast.py @@ -403,7 +403,8 @@ def prepare_image(): output = model(pixel_values=torch.tensor(input["pixel_values"])) target_sizes = [(image.shape[1], image.shape[2]) for image in input["pixel_values"]] - final_out = image_processor.post_process_text_detection(output, target_sizes) + threshold = 0.88 + final_out = image_processor.post_process_text_detection(output, target_sizes, threshold) assert final_out[0]["bboxes"][0] == [224, 120, 246, 120, 246, 134, 224, 134] assert round(float(final_out[0]["scores"][0]), 5) == 0.95541 @@ -424,7 +425,8 @@ def prepare_image(): output = model(pixel_values=torch.tensor(input["pixel_values"])) target_sizes = [(image.shape[1], image.shape[2]) for image in input["pixel_values"]] - final_out = image_processor.post_process_text_detection(output, target_sizes) + threshold = 0.88 + final_out = image_processor.post_process_text_detection(output, target_sizes, threshold) assert final_out[0]["bboxes"][0][:10] 
== [484, 175, 484, 178, 483, 179, 452, 179, 452, 182] assert round(float(final_out[0]["scores"][0]), 5) == 0.92356 From 5f1af193c5b63f07c9997de5059d20a297fc2069 Mon Sep 17 00:00:00 2001 From: raghavanone Date: Sun, 5 Nov 2023 18:58:31 +0530 Subject: [PATCH 032/152] Incorporate PR feedbacks --- src/transformers/models/fast/modeling_fast.py | 20 ------------------- 1 file changed, 20 deletions(-) diff --git a/src/transformers/models/fast/modeling_fast.py b/src/transformers/models/fast/modeling_fast.py index 761f7a9066f7..cfd3506de0fc 100644 --- a/src/transformers/models/fast/modeling_fast.py +++ b/src/transformers/models/fast/modeling_fast.py @@ -446,16 +446,6 @@ def __init__(self, config): for layer_ix in range(0, len(reduce_layer_configs)): setattr(self, f"reduce_layer{layer_ix + 1}", FASTRepConvLayer(*reduce_layer_configs[layer_ix])) - self._initialize_weights() - - def _initialize_weights(self): - for m in self.modules(): - if isinstance(m, nn.Conv2d): - nn.init.kaiming_normal_(m.weight) - elif isinstance(m, nn.BatchNorm2d): - m.weight.data.fill_(1) - m.bias.data.zero_() - def _upsample(self, layer_out, height, width): return F.upsample(layer_out, size=(height, width), mode="bilinear") @@ -513,16 +503,6 @@ def __init__(self, config): else: self.dropout = None - self._initialize_weights() - - def _initialize_weights(self): - for m in self.modules(): - if isinstance(m, nn.Conv2d): - nn.init.kaiming_normal_(m.weight) - elif isinstance(m, nn.BatchNorm2d): - m.weight.data.fill_(1) - m.bias.data.zero_() - def forward(self, hidden_states): hidden_states = self.conv(hidden_states) if self.dropout is not None: From c9a354320ae308486bd0d560fb9f340736bf33bf Mon Sep 17 00:00:00 2001 From: raghavanone Date: Wed, 8 Nov 2023 09:09:57 +0530 Subject: [PATCH 033/152] Introduce TextNet --- README.md | 1 + README_es.md | 1 + README_hd.md | 1 + README_ja.md | 1 + README_ko.md | 1 + README_zh-hans.md | 1 + README_zh-hant.md | 1 + docs/source/en/index.md | 1 + docs/source/en/model_doc/textnet.md | 42 ++ docs/source/en/tasks/image_classification.md | 2 +- src/transformers/__init__.py | 23 + .../models/auto/configuration_auto.py | 2 + src/transformers/models/auto/modeling_auto.py | 3 + src/transformers/models/textnet/__init__.py | 53 ++ .../models/textnet/configuration_textnet.py | 134 ++++ .../textnet/image_processing_textnet.py | 323 +++++++++ .../models/textnet/modeling_textnet.py | 614 ++++++++++++++++++ src/transformers/utils/dummy_pt_objects.py | 24 + .../utils/dummy_vision_objects.py | 7 + tests/models/textnet/__init__.py | 0 tests/models/textnet/test_modeling_textnet.py | 407 ++++++++++++ 21 files changed, 1641 insertions(+), 1 deletion(-) create mode 100644 docs/source/en/model_doc/textnet.md create mode 100644 src/transformers/models/textnet/__init__.py create mode 100644 src/transformers/models/textnet/configuration_textnet.py create mode 100644 src/transformers/models/textnet/image_processing_textnet.py create mode 100644 src/transformers/models/textnet/modeling_textnet.py create mode 100644 tests/models/textnet/__init__.py create mode 100644 tests/models/textnet/test_modeling_textnet.py diff --git a/README.md b/README.md index daab3d1f9d6b..cb1beeec315c 100644 --- a/README.md +++ b/README.md @@ -491,6 +491,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. 
**[Table Transformer](https://huggingface.co/docs/transformers/model_doc/table-transformer)** (from Microsoft Research) released with the paper [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) by Brandon Smock, Rohith Pesala, Robin Abraham. 1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos. 1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou. +1. **[TextNet](https://huggingface.co/docs/transformers/main/model_doc/textnet)** (from ) released with the paper []() by . 1. **[Time Series Transformer](https://huggingface.co/docs/transformers/model_doc/time_series_transformer)** (from HuggingFace). 1. **[TimeSformer](https://huggingface.co/docs/transformers/model_doc/timesformer)** (from Facebook) released with the paper [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) by Gedas Bertasius, Heng Wang, Lorenzo Torresani. 1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine diff --git a/README_es.md b/README_es.md index 9e1ac93b4a99..2d8279f5b0fe 100644 --- a/README_es.md +++ b/README_es.md @@ -466,6 +466,7 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[Table Transformer](https://huggingface.co/docs/transformers/model_doc/table-transformer)** (from Microsoft Research) released with the paper [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) by Brandon Smock, Rohith Pesala, Robin Abraham. 1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos. 1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou. +1. **[TextNet](https://huggingface.co/docs/transformers/main/model_doc/textnet)** (from ) released with the paper []() by . 1. **[Time Series Transformer](https://huggingface.co/docs/transformers/model_doc/time_series_transformer)** (from HuggingFace). 1. **[TimeSformer](https://huggingface.co/docs/transformers/model_doc/timesformer)** (from Facebook) released with the paper [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) by Gedas Bertasius, Heng Wang, Lorenzo Torresani. 1. 
**[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine diff --git a/README_hd.md b/README_hd.md index 92935efb589c..ef97795ebbff 100644 --- a/README_hd.md +++ b/README_hd.md @@ -440,6 +440,7 @@ conda install conda-forge::transformers 1. **[Table Transformer](https://huggingface.co/docs/transformers/model_doc/table-transformer)** (माइक्रोसॉफ्ट रिसर्च से) साथ में पेपर [पबटेबल्स-1एम: टूवर्ड्स कॉम्प्रिहेंसिव टेबल एक्सट्रैक्शन फ्रॉम अनस्ट्रक्चर्ड डॉक्यूमेंट्स ](https://arxiv.org/abs/2110.00061) ब्रैंडन स्मॉक, रोहित पेसाला, रॉबिन अब्राहम द्वारा पोस्ट किया गया। 1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (Google AI से) साथ में कागज [TAPAS: पूर्व-प्रशिक्षण के माध्यम से कमजोर पर्यवेक्षण तालिका पार्सिंग](https:// arxiv.org/abs/2004.02349) जोनाथन हर्ज़िग, पावेल क्रिज़िस्तोफ़ नोवाक, थॉमस मुलर, फ्रांसेस्को पिकिन्नो और जूलियन मार्टिन ईसेन्च्लोस द्वारा। 1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (माइक्रोसॉफ्ट रिसर्च से) साथ में पेपर [TAPEX: टेबल प्री-ट्रेनिंग थ्रू लर्निंग अ न्यूरल SQL एक्ज़ीक्यूटर](https: //arxiv.org/abs/2107.07653) कियान लियू, बेई चेन, जियाकी गुओ, मोर्टेज़ा ज़ियादी, ज़ेकी लिन, वीज़ू चेन, जियान-गुआंग लू द्वारा पोस्ट किया गया। +1. **[TextNet](https://huggingface.co/docs/transformers/main/model_doc/textnet)** (from ) released with the paper []() by . 1. **[Time Series Transformer](https://huggingface.co/docs/transformers/model_doc/time_series_transformer)** (from HuggingFace). 1. **[TimeSformer](https://huggingface.co/docs/transformers/model_doc/timesformer)** (from Facebook) released with the paper [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) by Gedas Bertasius, Heng Wang, Lorenzo Torresani. 1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine diff --git a/README_ja.md b/README_ja.md index f43dda021c6f..cf9b70b1ba7f 100644 --- a/README_ja.md +++ b/README_ja.md @@ -500,6 +500,7 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ 1. **[Table Transformer](https://huggingface.co/docs/transformers/model_doc/table-transformer)** (Microsoft Research から) Brandon Smock, Rohith Pesala, Robin Abraham から公開された研究論文: [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) 1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (Google AI から) Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos から公開された研究論文: [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) 1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (Microsoft Research から) Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou から公開された研究論文: [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) +1. **[TextNet](https://huggingface.co/docs/transformers/main/model_doc/textnet)** (from ) released with the paper []() by . 1. 
**[Time Series Transformer](https://huggingface.co/docs/transformers/model_doc/time_series_transformer)** (HuggingFace から). 1. **[TimeSformer](https://huggingface.co/docs/transformers/model_doc/timesformer)** (Facebook から) Gedas Bertasius, Heng Wang, Lorenzo Torresani から公開された研究論文: [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) 1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (the University of California at Berkeley から) Michael Janner, Qiyang Li, Sergey Levine から公開された研究論文: [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) diff --git a/README_ko.md b/README_ko.md index c2e53a1b81ce..6b256cba5aa0 100644 --- a/README_ko.md +++ b/README_ko.md @@ -415,6 +415,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[Table Transformer](https://huggingface.co/docs/transformers/model_doc/table-transformer)** (Microsoft Research 에서) Brandon Smock, Rohith Pesala, Robin Abraham 의 [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) 논문과 함께 발표했습니다. 1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (Google AI 에서) Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos 의 [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) 논문과 함께 발표했습니다. 1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (Microsoft Research 에서) Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou 의 [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) 논문과 함께 발표했습니다. +1. **[TextNet](https://huggingface.co/docs/transformers/main/model_doc/textnet)** (from ) released with the paper []() by . 1. **[Time Series Transformer](https://huggingface.co/docs/transformers/model_doc/time_series_transformer)** (from HuggingFace). 1. **[TimeSformer](https://huggingface.co/docs/transformers/model_doc/timesformer)** (Facebook 에서) Gedas Bertasius, Heng Wang, Lorenzo Torresani 의 [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) 논문과 함께 발표했습니다. 1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (the University of California at Berkeley 에서) Michael Janner, Qiyang Li, Sergey Levin 의 [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) 논문과 함께 발표했습니다. diff --git a/README_zh-hans.md b/README_zh-hans.md index 972f3a386f42..4b2950743ce7 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -439,6 +439,7 @@ conda install conda-forge::transformers 1. **[Table Transformer](https://huggingface.co/docs/transformers/model_doc/table-transformer)** (来自 Microsoft Research) 伴随论文 [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) 由 Brandon Smock, Rohith Pesala, Robin Abraham 发布。 1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (来自 Google AI) 伴随论文 [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) 由 Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos 发布。 1. 
**[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (来自 Microsoft Research) 伴随论文 [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) 由 Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou 发布。 +1. **[TextNet](https://huggingface.co/docs/transformers/main/model_doc/textnet)** (from ) released with the paper []() by . 1. **[Time Series Transformer](https://huggingface.co/docs/transformers/model_doc/time_series_transformer)** (from HuggingFace). 1. **[TimeSformer](https://huggingface.co/docs/transformers/model_doc/timesformer)** (from Facebook) released with the paper [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) by Gedas Bertasius, Heng Wang, Lorenzo Torresani. 1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine diff --git a/README_zh-hant.md b/README_zh-hant.md index b17c8946bc3e..511630eb58fa 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -451,6 +451,7 @@ conda install conda-forge::transformers 1. **[Table Transformer](https://huggingface.co/docs/transformers/model_doc/table-transformer)** (from Microsoft Research) released with the paper [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) by Brandon Smock, Rohith Pesala, Robin Abraham. 1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos. 1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou. +1. **[TextNet](https://huggingface.co/docs/transformers/main/model_doc/textnet)** (from ) released with the paper []() by . 1. **[Time Series Transformer](https://huggingface.co/docs/transformers/model_doc/time_series_transformer)** (from HuggingFace). 1. **[TimeSformer](https://huggingface.co/docs/transformers/model_doc/timesformer)** (from Facebook) released with the paper [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) by Gedas Bertasius, Heng Wang, Lorenzo Torresani. 1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine diff --git a/docs/source/en/index.md b/docs/source/en/index.md index 52b5df6e59ba..76cdfee42f6d 100644 --- a/docs/source/en/index.md +++ b/docs/source/en/index.md @@ -266,6 +266,7 @@ Flax), PyTorch, and/or TensorFlow. 
| [Table Transformer](model_doc/table-transformer) | ✅ | ❌ | ❌ | | [TAPAS](model_doc/tapas) | ✅ | ✅ | ❌ | | [TAPEX](model_doc/tapex) | ✅ | ✅ | ✅ | +| [TextNet](model_doc/textnet) | ✅ | ❌ | ❌ | | [Time Series Transformer](model_doc/time_series_transformer) | ✅ | ❌ | ❌ | | [TimeSformer](model_doc/timesformer) | ✅ | ❌ | ❌ | | [Trajectory Transformer](model_doc/trajectory_transformer) | ✅ | ❌ | ❌ | diff --git a/docs/source/en/model_doc/textnet.md b/docs/source/en/model_doc/textnet.md new file mode 100644 index 000000000000..088adb572bdb --- /dev/null +++ b/docs/source/en/model_doc/textnet.md @@ -0,0 +1,42 @@ + + +# TextNet + +## Overview + +The TextNet model was proposed in [FAST: Faster Arbitrarily-Shaped Text Detector with Minimalist Kernel Representation](https://arxiv.org/abs/2111.02394) by Zhe Chen, Jiahao Wang, Wenhai Wang, Guo Chen, Enze Xie, Ping Luo, Tong Lu. +TextNet was results of NAS for efficient text detection task. + +## TextNetConfig + +[[autodoc]] TextNetConfig + +## TextNetImageProcessor + +[[autodoc]] TextNetImageProcessor + - preprocess + +## TextNetModel + +[[autodoc]] TextNetModel + - forward + +## TextNetForImageClassification + +[[autodoc]] TextNetForImageClassification + - forward + diff --git a/docs/source/en/tasks/image_classification.md b/docs/source/en/tasks/image_classification.md index 489ec59ddf6a..55949c68ee14 100644 --- a/docs/source/en/tasks/image_classification.md +++ b/docs/source/en/tasks/image_classification.md @@ -34,7 +34,7 @@ The task illustrated in this tutorial is supported by the following model archit -[BEiT](../model_doc/beit), [BiT](../model_doc/bit), [ConvNeXT](../model_doc/convnext), [ConvNeXTV2](../model_doc/convnextv2), [CvT](../model_doc/cvt), [Data2VecVision](../model_doc/data2vec-vision), [DeiT](../model_doc/deit), [DiNAT](../model_doc/dinat), [DINOv2](../model_doc/dinov2), [EfficientFormer](../model_doc/efficientformer), [EfficientNet](../model_doc/efficientnet), [FocalNet](../model_doc/focalnet), [ImageGPT](../model_doc/imagegpt), [LeViT](../model_doc/levit), [MobileNetV1](../model_doc/mobilenet_v1), [MobileNetV2](../model_doc/mobilenet_v2), [MobileViT](../model_doc/mobilevit), [MobileViTV2](../model_doc/mobilevitv2), [NAT](../model_doc/nat), [Perceiver](../model_doc/perceiver), [PoolFormer](../model_doc/poolformer), [PVT](../model_doc/pvt), [RegNet](../model_doc/regnet), [ResNet](../model_doc/resnet), [SegFormer](../model_doc/segformer), [SwiftFormer](../model_doc/swiftformer), [Swin Transformer](../model_doc/swin), [Swin Transformer V2](../model_doc/swinv2), [VAN](../model_doc/van), [ViT](../model_doc/vit), [ViT Hybrid](../model_doc/vit_hybrid), [ViTMSN](../model_doc/vit_msn) +[BEiT](../model_doc/beit), [BiT](../model_doc/bit), [ConvNeXT](../model_doc/convnext), [ConvNeXTV2](../model_doc/convnextv2), [CvT](../model_doc/cvt), [Data2VecVision](../model_doc/data2vec-vision), [DeiT](../model_doc/deit), [DiNAT](../model_doc/dinat), [DINOv2](../model_doc/dinov2), [EfficientFormer](../model_doc/efficientformer), [EfficientNet](../model_doc/efficientnet), [FocalNet](../model_doc/focalnet), [ImageGPT](../model_doc/imagegpt), [LeViT](../model_doc/levit), [MobileNetV1](../model_doc/mobilenet_v1), [MobileNetV2](../model_doc/mobilenet_v2), [MobileViT](../model_doc/mobilevit), [MobileViTV2](../model_doc/mobilevitv2), [NAT](../model_doc/nat), [Perceiver](../model_doc/perceiver), [PoolFormer](../model_doc/poolformer), [PVT](../model_doc/pvt), [RegNet](../model_doc/regnet), [ResNet](../model_doc/resnet), [SegFormer](../model_doc/segformer), 
[SwiftFormer](../model_doc/swiftformer), [Swin Transformer](../model_doc/swin), [Swin Transformer V2](../model_doc/swinv2), [TextNet](../model_doc/textnet), [VAN](../model_doc/van), [ViT](../model_doc/vit), [ViT Hybrid](../model_doc/vit_hybrid), [ViTMSN](../model_doc/vit_msn) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 4941d724455d..90270a9e406f 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -822,6 +822,10 @@ "TapasConfig", "TapasTokenizer", ], + "models.textnet": [ + "TEXTNET_PRETRAINED_CONFIG_ARCHIVE_MAP", + "TextNetConfig" + ], "models.time_series_transformer": [ "TIME_SERIES_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "TimeSeriesTransformerConfig", @@ -1300,6 +1304,7 @@ _import_structure["models.segformer"].extend(["SegformerFeatureExtractor", "SegformerImageProcessor"]) _import_structure["models.siglip"].append("SiglipImageProcessor") _import_structure["models.swin2sr"].append("Swin2SRImageProcessor") + _import_structure["models.textnet"].append("TextNetImageProcessor") _import_structure["models.tvlt"].append("TvltImageProcessor") _import_structure["models.tvp"].append("TvpImageProcessor") _import_structure["models.videomae"].extend(["VideoMAEFeatureExtractor", "VideoMAEImageProcessor"]) @@ -3295,6 +3300,13 @@ "load_tf_weights_in_tapas", ] ) + _import_structure["models.textnet"].extend( + [ + "TEXTNET_PRETRAINED_CONFIG_ARCHIVE_MAP", + "TextNetBackbone", + "TextNetModel", + ] + ) _import_structure["models.time_series_transformer"].extend( [ "TIME_SERIES_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -5518,6 +5530,10 @@ TapasConfig, TapasTokenizer, ) + from .models.textnet import ( + TEXTNET_PRETRAINED_CONFIG_ARCHIVE_MAP, + TextNetConfig, + ) from .models.time_series_transformer import ( TIME_SERIES_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, TimeSeriesTransformerConfig, @@ -5993,6 +6009,7 @@ from .models.segformer import SegformerFeatureExtractor, SegformerImageProcessor from .models.siglip import SiglipImageProcessor from .models.swin2sr import Swin2SRImageProcessor + from .models.textnet import TextNetImageProcessor from .models.tvlt import TvltImageProcessor from .models.tvp import TvpImageProcessor from .models.videomae import VideoMAEFeatureExtractor, VideoMAEImageProcessor @@ -7642,6 +7659,12 @@ TapasPreTrainedModel, load_tf_weights_in_tapas, ) + from .models.textnet import ( + TEXTNET_PRETRAINED_CONFIG_ARCHIVE_MAP, + TextNetBackbone, + TextNetModel, + TextNetPreTrainedModel, + ) from .models.time_series_transformer import ( TIME_SERIES_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST, TimeSeriesTransformerForPrediction, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 9eb3f1985c85..8ac6a1912c3a 100755 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -216,6 +216,7 @@ ("t5", "T5Config"), ("table-transformer", "TableTransformerConfig"), ("tapas", "TapasConfig"), + ("textnet", "TextNetConfig"), ("time_series_transformer", "TimeSeriesTransformerConfig"), ("timesformer", "TimesformerConfig"), ("timm_backbone", "TimmBackboneConfig"), @@ -685,6 +686,7 @@ ("table-transformer", "Table Transformer"), ("tapas", "TAPAS"), ("tapex", "TAPEX"), + ("textnet", "TextNet"), ("time_series_transformer", "Time Series Transformer"), ("timesformer", "TimeSformer"), ("timm_backbone", "TimmBackbone"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py 
index 7bf50a4518fa..c1ecdee1578e 100755 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -207,6 +207,7 @@ ("t5", "T5Model"), ("table-transformer", "TableTransformerModel"), ("tapas", "TapasModel"), + ("textnet", "TextNetModel"), ("time_series_transformer", "TimeSeriesTransformerModel"), ("timesformer", "TimesformerModel"), ("timm_backbone", "TimmBackbone"), @@ -538,6 +539,7 @@ ("swiftformer", "SwiftFormerForImageClassification"), ("swin", "SwinForImageClassification"), ("swinv2", "Swinv2ForImageClassification"), + ("textnet", "TextNetForImageClassification"), ("van", "VanForImageClassification"), ("vit", "ViTForImageClassification"), ("vit_hybrid", "ViTHybridForImageClassification"), @@ -1123,6 +1125,7 @@ ("resnet", "ResNetBackbone"), ("swin", "SwinBackbone"), ("swinv2", "Swinv2Backbone"), + ("textnet", "TextNetBackbone"), ("timm_backbone", "TimmBackbone"), ("vitdet", "VitDetBackbone"), ] diff --git a/src/transformers/models/textnet/__init__.py b/src/transformers/models/textnet/__init__.py new file mode 100644 index 000000000000..21e26f387817 --- /dev/null +++ b/src/transformers/models/textnet/__init__.py @@ -0,0 +1,53 @@ +# coding=utf-8 +# Copyright 2023 the Fast authors and HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import ( + OptionalDependencyNotAvailable, + _LazyModule, + is_torch_available, +) + + +_import_structure = { + "configuration_textnet": ["TEXTNET_PRETRAINED_CONFIG_ARCHIVE_MAP", "TextNetConfig"], + "image_processing_textnet": ["TextNetImageProcessor"], +} + +try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_textnet"] = ["TextNetBackbone", "TextNetModel", "TextNetPreTrainedModel"] + +if TYPE_CHECKING: + from .configuration_textnet import TEXTNET_PRETRAINED_CONFIG_ARCHIVE_MAP, TextNetConfig + from .image_processing_textnet import TextNetImageProcessor + + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_textnet import TextNetBackbone, TextNetModel, TextNetPreTrainedModel + + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/src/transformers/models/textnet/configuration_textnet.py b/src/transformers/models/textnet/configuration_textnet.py new file mode 100644 index 000000000000..9c7fe907aa13 --- /dev/null +++ b/src/transformers/models/textnet/configuration_textnet.py @@ -0,0 +1,134 @@ +# coding=utf-8 +# Copyright The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" TextNet model configuration""" +from transformers import PretrainedConfig +from transformers.utils import logging +from transformers.utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices + + +logger = logging.get_logger(__name__) + +TEXTNET_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "fast_base_tt_800_finetune_ic17mlt": ( + "https://huggingface.co/Raghavan/fast_base_tt_800_finetune_ic17mlt/raw/main/config.json" + ), +} + + +class TextNetConfig(BackboneConfigMixin, PretrainedConfig): + r""" + [Raghavan/fast_base_tt_800_finetune_ic17mlt](https://huggingface.co/Raghavan/fast_base_tt_800_finetune_ic17mlt) + """ + + def __init__( + self, + backbone_kernel_size=3, + backbone_stride=2, + backbone_dilation=1, + backbone_groups=1, + backbone_bias=False, + backbone_has_shuffle=False, + backbone_in_channels=3, + backbone_out_channels=64, + backbone_use_bn=True, + backbone_act_func="relu", + backbone_dropout_rate=0, + backbone_ops_order="weight_bn_act", + backbone_stage1_in_channels=[64, 64, 64], + backbone_stage1_out_channels=[64, 64, 64], + backbone_stage1_kernel_size=[[3, 3], [3, 3], [3, 3]], + backbone_stage1_stride=[1, 2, 1], + backbone_stage1_dilation=[1, 1, 1], + backbone_stage1_groups=[1, 1, 1], + backbone_stage2_in_channels=[64, 128, 128, 128], + backbone_stage2_out_channels=[128, 128, 128, 128], + backbone_stage2_kernel_size=[[3, 3], [1, 3], [3, 3], [3, 1]], + backbone_stage2_stride=[2, 1, 1, 1], + backbone_stage2_dilation=[1, 1, 1, 1], + backbone_stage2_groups=[1, 1, 1, 1], + backbone_stage3_in_channels=[128, 256, 256, 256], + backbone_stage3_out_channels=[256, 256, 256, 256], + backbone_stage3_kernel_size=[[3, 3], [3, 3], [3, 1], [1, 3]], + backbone_stage3_stride=[2, 1, 1, 1], + backbone_stage3_dilation=[1, 1, 1, 1], + backbone_stage3_groups=[1, 1, 1, 1], + backbone_stage4_in_channels=[256, 512, 512, 512], + backbone_stage4_out_channels=[512, 512, 512, 512], + backbone_stage4_kernel_size=[[3, 3], [3, 1], [1, 3], [3, 3]], + backbone_stage4_stride=[2, 1, 1, 1], + backbone_stage4_dilation=[1, 1, 1, 1], + backbone_stage4_groups=[1, 1, 1, 1], + hidden_sizes=[64, 64, 128, 256, 512], + initializer_range=0.02, + out_features=None, + out_indices=None, + **kwargs, + ): + super().__init__(**kwargs) + + self.backbone_kernel_size = backbone_kernel_size + self.backbone_stride = backbone_stride + self.backbone_dilation = backbone_dilation + self.backbone_groups = backbone_groups + self.backbone_bias = backbone_bias + self.backbone_has_shuffle = backbone_has_shuffle + self.backbone_in_channels = backbone_in_channels + self.backbone_out_channels = backbone_out_channels + self.backbone_use_bn = backbone_use_bn + self.backbone_act_func = backbone_act_func + self.backbone_dropout_rate = backbone_dropout_rate + self.backbone_ops_order = backbone_ops_order + + self.backbone_stage1_in_channels = backbone_stage1_in_channels + self.backbone_stage1_out_channels = backbone_stage1_out_channels + self.backbone_stage1_kernel_size = backbone_stage1_kernel_size + self.backbone_stage1_stride = backbone_stage1_stride + self.backbone_stage1_dilation = backbone_stage1_dilation + 
self.backbone_stage1_groups = backbone_stage1_groups + + self.backbone_stage2_in_channels = backbone_stage2_in_channels + self.backbone_stage2_out_channels = backbone_stage2_out_channels + self.backbone_stage2_kernel_size = backbone_stage2_kernel_size + self.backbone_stage2_stride = backbone_stage2_stride + self.backbone_stage2_dilation = backbone_stage2_dilation + self.backbone_stage2_groups = backbone_stage2_groups + + self.backbone_stage3_in_channels = backbone_stage3_in_channels + self.backbone_stage3_out_channels = backbone_stage3_out_channels + self.backbone_stage3_kernel_size = backbone_stage3_kernel_size + self.backbone_stage3_stride = backbone_stage3_stride + self.backbone_stage3_dilation = backbone_stage3_dilation + self.backbone_stage3_groups = backbone_stage3_groups + + self.backbone_stage4_in_channels = backbone_stage4_in_channels + self.backbone_stage4_out_channels = backbone_stage4_out_channels + self.backbone_stage4_kernel_size = backbone_stage4_kernel_size + self.backbone_stage4_stride = backbone_stage4_stride + self.backbone_stage4_dilation = backbone_stage4_dilation + self.backbone_stage4_groups = backbone_stage4_groups + + self.initializer_range = initializer_range + self.hidden_sizes = hidden_sizes + + self.depths = [ + len(self.backbone_stage1_out_channels), + len(self.backbone_stage2_out_channels), + len(self.backbone_stage3_out_channels), + len(self.backbone_stage4_out_channels), + ] + self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, 5)] + self._out_features, self._out_indices = get_aligned_output_features_output_indices( + out_features=out_features, out_indices=out_indices, stage_names=self.stage_names + ) diff --git a/src/transformers/models/textnet/image_processing_textnet.py b/src/transformers/models/textnet/image_processing_textnet.py new file mode 100644 index 000000000000..32975e13c7a8 --- /dev/null +++ b/src/transformers/models/textnet/image_processing_textnet.py @@ -0,0 +1,323 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Image processor class for TextNet.""" + +from typing import Dict, List, Optional, Union + +import numpy as np + +from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +from ...image_transforms import ( + convert_to_rgb, + get_resize_output_image_size, + resize, + to_channel_dimension_format, +) +from ...image_utils import ( + OPENAI_CLIP_MEAN, + OPENAI_CLIP_STD, + ChannelDimension, + ImageInput, + PILImageResampling, + infer_channel_dimension_format, + is_scaled_image, + make_list_of_images, + to_numpy_array, + valid_images, +) +from ...utils import TensorType, is_vision_available, logging + + +logger = logging.get_logger(__name__) + + +if is_vision_available(): + import PIL + + +class TextNetImageProcessor(BaseImageProcessor): + r""" + Constructs a TextNet image processor. 
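Assuming the config lands as defined in this patch, a short sketch of how its backbone-related fields line up; the commented values are what the defaults above imply, not output captured from a run:

```python
from transformers import TextNetConfig

config = TextNetConfig(out_features=["stage1", "stage2", "stage3", "stage4"])

print(config.stage_names)   # ['stem', 'stage1', 'stage2', 'stage3', 'stage4']
print(config.depths)        # [3, 4, 4, 4] -- one entry per conv block in each stage
print(config.out_indices)   # indices aligned with out_features, i.e. the four stages
print(config.hidden_sizes)  # [64, 64, 128, 256, 512]
```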
+
+    Args:
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by
+            `do_resize` in the `preprocess` method.
+        size (`Dict[str, int]`, *optional*, defaults to `{"shortest_edge": 224}`):
+            Size of the image after resizing. The shortest edge of the image is resized to size["shortest_edge"], with
+            the longest edge resized to keep the input aspect ratio. Can be overridden by `size` in the `preprocess`
+            method.
+        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
+            Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method.
+        do_center_crop (`bool`, *optional*, defaults to `True`):
+            Whether to center crop the image to the specified `crop_size`. Can be overridden by `do_center_crop` in the
+            `preprocess` method.
+        crop_size (`Dict[str, int]`, *optional*, defaults to `{"height": 224, "width": 224}`):
+            Size of the output image after applying `center_crop`. Can be overridden by `crop_size` in the `preprocess`
+            method.
+        do_rescale (`bool`, *optional*, defaults to `True`):
+            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in
+            the `preprocess` method.
+        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
+            Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess`
+            method.
+            Whether to normalize the image. Can be overridden by `do_normalize` in the `preprocess` method.
+        do_normalize (`bool`, *optional*, defaults to `True`):
+        image_mean (`float` or `List[float]`, *optional*, defaults to `OPENAI_CLIP_MEAN`):
+            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
+            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
+        image_std (`float` or `List[float]`, *optional*, defaults to `OPENAI_CLIP_STD`):
+            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
+            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
+        do_convert_rgb (`bool`, *optional*, defaults to `True`):
+            Whether to convert the image to RGB.
+        use_square_size (`bool`, *optional*, defaults to `False`):
+            The value to be passed to `get_size_dict` as `default_to_square` when computing the image size. If the
+            `size` argument in `get_size_dict` is an `int`, it determines whether to default to a square image or not.
+            Note that this attribute is not used in computing `crop_size` via calling `get_size_dict`.
+ """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + do_resize: bool = True, + size: Dict[str, int] = None, + resample: PILImageResampling = PILImageResampling.BICUBIC, + do_center_crop: bool = True, + crop_size: Dict[str, int] = None, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_convert_rgb: bool = True, + use_square_size: bool = False, + **kwargs, + ) -> None: + super().__init__(**kwargs) + size = size if size is not None else {"shortest_edge": 224} + size = get_size_dict(size, default_to_square=use_square_size) + crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224} + crop_size = get_size_dict(crop_size, default_to_square=True, param_name="crop_size") + + self.do_resize = do_resize + self.size = size + self.resample = resample + self.do_center_crop = do_center_crop + self.crop_size = crop_size + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN + self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD + self.do_convert_rgb = do_convert_rgb + self.use_square_size = use_square_size + + # Copied from transformers.models.clip.image_processing_clip.CLIPImageProcessor.resize + def resize( + self, + image: np.ndarray, + size: Dict[str, int], + resample: PILImageResampling = PILImageResampling.BICUBIC, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: + """ + Resize an image. The shortest edge of the image is resized to size["shortest_edge"], with the longest edge + resized to keep the input aspect ratio. + + Args: + image (`np.ndarray`): + Image to resize. + size (`Dict[str, int]`): + Size of the output image. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): + Resampling filter to use when resiizing the image. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred. + """ + size = get_size_dict(size, default_to_square=self.use_square_size) + if "shortest_edge" not in size: + raise ValueError(f"The `size` parameter must contain the key `shortest_edge`. 
Got {size.keys()}") + output_size = get_resize_output_image_size( + image, + size=size["shortest_edge"], + default_to_square=self.use_square_size, + input_data_format=input_data_format, + ) + return resize( + image, + size=output_size, + resample=resample, + data_format=data_format, + input_data_format=input_data_format, + **kwargs, + ) + + def preprocess( + self, + images: ImageInput, + do_resize: bool = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + do_center_crop: bool = None, + crop_size: int = None, + do_rescale: bool = None, + rescale_factor: float = None, + do_normalize: bool = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_convert_rgb: bool = None, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> PIL.Image.Image: + """ + Preprocess an image or batch of images. + + Args: + images (`ImageInput`): + Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If + passing in images with pixel values between 0 and 1, set `do_rescale=False`. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`Dict[str, int]`, *optional*, defaults to `self.size`): + Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with + the longest edge resized to keep the input aspect ratio. + resample (`int`, *optional*, defaults to `self.resample`): + Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only + has an effect if `do_resize` is set to `True`. + do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`): + Whether to center crop the image. + crop_size (`Dict[str, int]`, *optional*, defaults to `self.crop_size`): + Size of the center crop. Only has an effect if `do_center_crop` is set to `True`. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. + image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): + Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`. + image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to + `True`. + do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`): + Whether to convert the image to RGB. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. 
Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: Use the channel dimension format of the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + """ + do_resize = do_resize if do_resize is not None else self.do_resize + size = size if size is not None else self.size + size = get_size_dict(size, param_name="size", default_to_square=self.use_square_size) + resample = resample if resample is not None else self.resample + do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop + crop_size = crop_size if crop_size is not None else self.crop_size + crop_size = get_size_dict(crop_size, param_name="crop_size", default_to_square=True) + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb + + images = make_list_of_images(images) + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + + if do_resize and size is None: + raise ValueError("Size must be specified if do_resize is True.") + + if do_center_crop and crop_size is None: + raise ValueError("Crop size must be specified if do_center_crop is True.") + + if do_rescale and rescale_factor is None: + raise ValueError("Rescale factor must be specified if do_rescale is True.") + + if do_normalize and (image_mean is None or image_std is None): + raise ValueError("Image mean and std must be specified if do_normalize is True.") + + # PIL RGBA images are converted to RGB + if do_convert_rgb: + images = [convert_to_rgb(image) for image in images] + + # All transformations expect numpy arrays. + images = [to_numpy_array(image) for image in images] + + if is_scaled_image(images[0]) and do_rescale: + logger.warning_once( + "It looks like you are trying to rescale already rescaled images. If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." + ) + + if input_data_format is None: + # We assume that all images have the same channel dimension format. 
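+            # `infer_channel_dimension_format` looks at the shape of the first image to decide between
+            # channels-first (num_channels, height, width) and channels-last (height, width, num_channels);
+            # pass `input_data_format` explicitly when the shape is ambiguous (e.g. a 3x3 RGB image).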
+ input_data_format = infer_channel_dimension_format(images[0]) + + if do_resize: + images = [ + self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format) + for image in images + ] + + if do_center_crop: + images = [ + self.center_crop(image=image, size=crop_size, input_data_format=input_data_format) for image in images + ] + + if do_rescale: + images = [ + self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) + for image in images + ] + + if do_normalize: + images = [ + self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format) + for image in images + ] + + images = [ + to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images + ] + + data = {"pixel_values": images} + return BatchFeature(data=data, tensor_type=return_tensors) diff --git a/src/transformers/models/textnet/modeling_textnet.py b/src/transformers/models/textnet/modeling_textnet.py new file mode 100644 index 000000000000..119ee2c7418d --- /dev/null +++ b/src/transformers/models/textnet/modeling_textnet.py @@ -0,0 +1,614 @@ +# coding=utf-8 +# Copyright 2021 Microsoft Research and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch TextNet model.""" +from typing import Any, List, Optional, Tuple, Union + +import numpy as np +import torch +import torch.nn as nn +from torch import Tensor +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from transformers import PreTrainedModel, add_start_docstrings +from transformers.modeling_outputs import ( + BackboneOutput, + BaseModelOutputWithPoolingAndNoAttention, + ImageClassifierOutputWithNoAttention, +) +from transformers.models.textnet.configuration_textnet import TextNetConfig +from transformers.utils import add_start_docstrings_to_model_forward, logging, replace_return_docstrings +from transformers.utils.backbone_utils import BackboneMixin + + +logger = logging.get_logger(__name__) + +# General docstring +_CONFIG_FOR_DOC = "BitConfig" + +TEXTNET_START_DOCSTRING = r""" + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it + as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`BitConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +BIT_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`BitImageProcessor.__call__`] + for details. + + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. 
See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+BIT_PRETRAINED_MODEL_ARCHIVE_LIST = [
+    # "google/bit-50",
+    # See all BiT models at https://huggingface.co/models?filter=bit
+]
+
+
+def get_same_padding(kernel_size):
+    # "Same" padding for odd kernels: pad by kernel_size // 2, computed per dimension for tuple kernels.
+    if isinstance(kernel_size, tuple):
+        p1 = get_same_padding(kernel_size[0])
+        p2 = get_same_padding(kernel_size[1])
+        return p1, p2
+    return kernel_size // 2
+
+
+def build_activation(act_func, inplace=True):
+    if act_func == "relu":
+        return nn.ReLU(inplace=inplace)
+    elif act_func == "relu6":
+        return nn.ReLU6(inplace=inplace)
+    elif act_func == "tanh":
+        return nn.Tanh()
+    elif act_func == "sigmoid":
+        return nn.Sigmoid()
+    elif act_func is None:
+        return None
+    else:
+        raise ValueError(f"Unsupported activation function: {act_func}")
+
+
+class TextNetConvLayer(nn.Module):
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size=3,
+        stride=1,
+        dilation=1,
+        groups=1,
+        bias=False,
+        has_shuffle=False,
+        use_batch_norm=True,
+        act_func="relu",
+        dropout_rate=0,
+        use_act=True,
+    ):
+        super().__init__()
+
+        self.kernel_size = kernel_size
+        self.stride = stride
+        self.dilation = dilation
+        self.groups = groups
+        self.bias = bias
+        self.has_shuffle = has_shuffle
+        self.activation_function = act_func
+
+        padding = get_same_padding(self.kernel_size)
+        if isinstance(padding, int):
+            padding *= self.dilation
+        else:
+            # `get_same_padding` returns a tuple for tuple kernels; tuples are immutable, so build a new one.
+            padding = (padding[0] * self.dilation, padding[1] * self.dilation)
+
+        self.conv = nn.Conv2d(
+            in_channels,
+            out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            groups=groups,
+            bias=bias,
+        )
+        self.batch_norm = nn.Identity()
+        if use_batch_norm:
+            self.batch_norm = nn.BatchNorm2d(out_channels)
+
+        self.activation = nn.Identity()
+        if use_act:
+            act = build_activation(self.activation_function, True)
+            if act is not None:
+                self.activation = act
+
+    def forward(self, hidden_states):
+        if self.training:
+            if hasattr(self, "fused_conv"):
+                delattr(self, "fused_conv")
+            hidden_states = self.conv(hidden_states)
+            hidden_states = self.batch_norm(hidden_states)
+            return self.activation(hidden_states)
+        else:
+            if not hasattr(self, "fused_conv"):
+                setattr(self, "fused_conv", self.fuse_conv_batch_norm(self.conv, self.batch_norm))
+            hidden_states = self.fused_conv(hidden_states)
+            if self.activation is not None:
+                hidden_states = self.activation(hidden_states)
+            return hidden_states
+
+    def fuse_conv_batch_norm(self, conv, batch_norm):
+        """During inference, a batch norm layer reduces to a fixed per-channel affine transform based on its running
+        mean and variance, which makes it possible to fuse it into the preceding conv layer to save computation and
+        simplify the network structure."""
+        if isinstance(batch_norm, nn.Identity):
+            return conv
+        conv_w = conv.weight
+        conv_b = conv.bias if conv.bias is not None else torch.zeros_like(batch_norm.running_mean)
+
+        # w' = w * gamma / sqrt(var + eps) and b' = (b - mean) * gamma / sqrt(var + eps) + beta
+        factor = batch_norm.weight / torch.sqrt(batch_norm.running_var + batch_norm.eps)
+        conv.weight = nn.Parameter(conv_w * factor.reshape([conv.out_channels, 1, 1, 1]))
+        conv.bias = nn.Parameter((conv_b - batch_norm.running_mean) * factor + batch_norm.bias)
+        return conv
+
+
+class TestNetRepConvLayer(nn.Module):
+    def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, groups=1):
+        super().__init__()
+
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.kernel_size = kernel_size
+        self.stride = stride
+        self.dilation = dilation
+        self.groups = groups
+
+        padding =
(int(((kernel_size[0] - 1) * dilation) / 2), int(((kernel_size[1] - 1) * dilation) / 2))
+
+        self.nonlinearity = nn.ReLU(inplace=True)
+
+        self.main_conv = nn.Conv2d(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            groups=groups,
+            bias=False,
+        )
+        self.main_batch_norm = nn.BatchNorm2d(num_features=out_channels)
+
+        ver_pad = (int(((kernel_size[0] - 1) * dilation) / 2), 0)
+        hor_pad = (0, int(((kernel_size[1] - 1) * dilation) / 2))
+
+        if kernel_size[1] != 1:  # kernel width > 1 -> add the vertical (n x 1) conv branch
+            self.vertical_conv = nn.Conv2d(
+                in_channels=in_channels,
+                out_channels=out_channels,
+                kernel_size=(kernel_size[0], 1),
+                stride=stride,
+                padding=ver_pad,
+                dilation=dilation,
+                groups=groups,
+                bias=False,
+            )
+            self.vertical_batch_norm = nn.BatchNorm2d(num_features=out_channels)
+        else:
+            self.vertical_conv, self.vertical_batch_norm = None, None
+
+        if kernel_size[0] != 1:  # kernel height > 1 -> add the horizontal (1 x n) conv branch
+            self.horizontal_conv = nn.Conv2d(
+                in_channels=in_channels,
+                out_channels=out_channels,
+                kernel_size=(1, kernel_size[1]),
+                stride=stride,
+                padding=hor_pad,
+                dilation=dilation,
+                groups=groups,
+                bias=False,
+            )
+            self.horizontal_batch_norm = nn.BatchNorm2d(num_features=out_channels)
+        else:
+            self.horizontal_conv, self.horizontal_batch_norm = None, None
+
+        # Identity (batch-norm only) branch, only possible when the block keeps the input shape.
+        self.rbr_identity = (
+            nn.BatchNorm2d(num_features=in_channels) if out_channels == in_channels and stride == 1 else None
+        )
+
+    def forward(self, hidden_states):
+        if self.training:
+            if hasattr(self, "fused_conv"):
+                self.__delattr__("fused_conv")
+
+            main_outputs = self.main_conv(hidden_states)
+            main_outputs = self.main_batch_norm(main_outputs)
+            if self.vertical_conv is not None:
+                vertical_outputs = self.vertical_conv(hidden_states)
+                vertical_outputs = self.vertical_batch_norm(vertical_outputs)
+            else:
+                vertical_outputs = 0
+
+            if self.horizontal_conv is not None:
+                horizontal_outputs = self.horizontal_conv(hidden_states)
+                horizontal_outputs = self.horizontal_batch_norm(horizontal_outputs)
+            else:
+                horizontal_outputs = 0
+
+            if self.rbr_identity is None:
+                id_out = 0
+            else:
+                id_out = self.rbr_identity(hidden_states)
+
+            return self.nonlinearity(main_outputs + vertical_outputs + horizontal_outputs + id_out)
+        else:
+            if not hasattr(self, "fused_conv"):
+                self.prepare_for_eval()
+            return self.nonlinearity(self.fused_conv(hidden_states))
+
+    def _identity_to_conv(self, identity):
+        if identity is None:
+            return 0, 0
+        if not hasattr(self, "id_tensor"):
+            input_dim = self.in_channels // self.groups
+            kernel_value = np.zeros((self.in_channels, input_dim, 1, 1), dtype=np.float32)
+            for i in range(self.in_channels):
+                kernel_value[i, i % input_dim, 0, 0] = 1
+            id_tensor = torch.from_numpy(kernel_value).to(identity.weight.device)
+            self.id_tensor = self._pad_to_mxn_tensor(id_tensor)
+        kernel = self.id_tensor
+        running_mean = identity.running_mean
+        running_var = identity.running_var
+        gamma = identity.weight
+        beta = identity.bias
+        eps = identity.eps
+        std = (running_var + eps).sqrt()
+        t = (gamma / std).reshape(-1, 1, 1, 1)
+        return kernel * t, beta - running_mean * gamma / std
+
+    def _fuse_batch_norm_tensor(self, conv, batch_norm):
+        kernel = conv.weight
+        kernel = self._pad_to_mxn_tensor(kernel)
+        running_mean = batch_norm.running_mean
+        running_var = batch_norm.running_var
+        gamma = batch_norm.weight
+        beta = batch_norm.bias
+        eps = batch_norm.eps
+        std = (running_var + eps).sqrt()
+        t = (gamma / std).reshape(-1, 1, 1, 1)
+        return kernel * t, beta - running_mean * gamma /
std + + def get_equivalent_kernel_bias(self): + kernel_mxn, bias_mxn = self._fuse_batch_norm_tensor(self.main_conv, self.main_batch_norm) + if self.vertical_conv is not None: + kernel_mx1, bias_mx1 = self._fuse_batch_norm_tensor(self.vertical_conv, self.vertical_batch_norm) + else: + kernel_mx1, bias_mx1 = 0, 0 + if self.horizontal_conv is not None: + kernel_1xn, bias_1xn = self._fuse_batch_norm_tensor(self.horizontal_conv, self.horizontal_batch_norm) + else: + kernel_1xn, bias_1xn = 0, 0 + kernel_id, bias_id = self._identity_to_conv(self.rbr_identity) + kernel_mxn = kernel_mxn + kernel_mx1 + kernel_1xn + kernel_id + bias_mxn = bias_mxn + bias_mx1 + bias_1xn + bias_id + return kernel_mxn, bias_mxn + + def _pad_to_mxn_tensor(self, kernel): + kernel_height, kernel_width = self.kernel_size + height, width = kernel.shape[2:] + pad_left_right = (kernel_width - width) // 2 + pad_top_down = (kernel_height - height) // 2 + return torch.nn.functional.pad(kernel, [pad_left_right, pad_left_right, pad_top_down, pad_top_down]) + + def prepare_for_eval(self): + kernel, bias = self.get_equivalent_kernel_bias() + self.fused_conv = nn.Conv2d( + in_channels=self.main_conv.in_channels, + out_channels=self.main_conv.out_channels, + kernel_size=self.main_conv.kernel_size, + stride=self.main_conv.stride, + padding=self.main_conv.padding, + dilation=self.main_conv.dilation, + groups=self.main_conv.groups, + bias=True, + ) + self.fused_conv.weight.data = kernel + self.fused_conv.bias.data = bias + for para in self.fused_conv.parameters(): + para.detach_() + + +class TextNetPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = TextNetConfig + base_model_prefix = "textnet" + main_input_name = "pixel_values" + + def _init_weights(self, module): + if isinstance(module, (nn.Linear, nn.Conv2d)): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + + +@add_start_docstrings( + "The bare Textnet model outputting raw features without any specific head on top.", + TEXTNET_START_DOCSTRING, +) +class TextNetModel(TextNetPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.first_conv = TextNetConvLayer( + config.backbone_in_channels, + config.backbone_out_channels, + config.backbone_kernel_size, + config.backbone_stride, + config.backbone_dilation, + config.backbone_groups, + config.backbone_bias, + config.backbone_has_shuffle, + config.backbone_use_bn, + config.backbone_act_func, + config.backbone_dropout_rate, + config.backbone_ops_order, + ) + stage1 = [] + for stage_config in zip( + config.backbone_stage1_in_channels, + config.backbone_stage1_out_channels, + config.backbone_stage1_kernel_size, + config.backbone_stage1_stride, + config.backbone_stage1_dilation, + config.backbone_stage1_groups, + ): + stage1.append(TestNetRepConvLayer(*stage_config)) + self.stage1 = nn.ModuleList(stage1) + + stage2 = [] + for stage_config in zip( + config.backbone_stage2_in_channels, + config.backbone_stage2_out_channels, + config.backbone_stage2_kernel_size, + config.backbone_stage2_stride, + config.backbone_stage2_dilation, + config.backbone_stage2_groups, + ): + stage2.append(TestNetRepConvLayer(*stage_config)) + self.stage2 = nn.ModuleList(stage2) + + stage3 = [] + for stage_config in zip( + config.backbone_stage3_in_channels, + config.backbone_stage3_out_channels, + 
config.backbone_stage3_kernel_size, + config.backbone_stage3_stride, + config.backbone_stage3_dilation, + config.backbone_stage3_groups, + ): + stage3.append(TestNetRepConvLayer(*stage_config)) + self.stage3 = nn.ModuleList(stage3) + + stage4 = [] + for stage_config in zip( + config.backbone_stage4_in_channels, + config.backbone_stage4_out_channels, + config.backbone_stage4_kernel_size, + config.backbone_stage4_stride, + config.backbone_stage4_dilation, + config.backbone_stage4_groups, + ): + stage4.append(TestNetRepConvLayer(*stage_config)) + self.stage4 = nn.ModuleList(stage4) + + self.pooler = nn.AdaptiveAvgPool2d((2, 2)) + + self.init_weights() + + def forward( + self, pixel_values: Tensor, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None + ) -> Union[Tuple[Any, List[Any]], Tuple[Any], BaseModelOutputWithPoolingAndNoAttention]: + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + + hidden_state = self.first_conv(pixel_values) + hidden_states = [hidden_state] + + for block in self.stage1: + hidden_state = block(hidden_state) + hidden_states.append(hidden_state) + + for block in self.stage2: + hidden_state = block(hidden_state) + hidden_states.append(hidden_state) + + for block in self.stage3: + hidden_state = block(hidden_state) + hidden_states.append(hidden_state) + + for block in self.stage4: + hidden_state = block(hidden_state) + hidden_states.append(hidden_state) + + pooled_output = self.pooler(hidden_state) + + if not return_dict: + output = (pooled_output, hidden_state) + return output + (hidden_states,) if output_hidden_states else output + + return BaseModelOutputWithPoolingAndNoAttention( + pooler_output=pooled_output, + last_hidden_state=hidden_state, + hidden_states=tuple(hidden_states) if output_hidden_states else None, + ) + + +@add_start_docstrings( + """ + TextNet backbone, to be used with frameworks like DETR and MaskFormer. 
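+
+    The backbone returns the feature maps of the stages selected through the `out_features` / `out_indices`
+    arguments of [`TextNetConfig`]; see the usage example in the `forward` docstring.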
+ """, + TEXTNET_START_DOCSTRING, +) +class TextNetBackbone(TextNetPreTrainedModel, BackboneMixin): + def __init__(self, config): + super().__init__(config) + super()._init_backbone(config) + + self.textnet = TextNetModel(config) + self.num_features = [ + config.backbone_out_channels, + config.backbone_stage1_out_channels[-1], + config.backbone_stage2_out_channels[-1], + config.backbone_stage3_out_channels[-1], + config.backbone_stage4_out_channels[-1], + ] + + # initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward("BIT_INPUTS_DOCSTRING") + @replace_return_docstrings(output_type=BackboneOutput, config_class="") + def forward( + self, pixel_values: Tensor, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None + ) -> BackboneOutput: + """ + Returns: + + Examples: + + ```python + >>> from transformers import AutoImageProcessor, AutoBackbone + >>> import torch + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> processor = AutoImageProcessor.from_pretrained("google/resnetnv2-50") + >>> model = AutoBackbone.from_pretrained("google/resnetnv2-50") + + >>> inputs = processor(image, return_tensors="pt") + >>> outputs = model(**inputs) + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + + outputs = self.textnet(pixel_values, output_hidden_states=True, return_dict=True) + + hidden_states = outputs.hidden_states + + feature_maps = () + for idx, stage in enumerate(self.stage_names): + if stage in self.out_features: + feature_maps += (hidden_states[idx],) + + if not return_dict: + output = (feature_maps,) + if output_hidden_states: + output += (outputs.hidden_states,) + return output + + return BackboneOutput( + feature_maps=feature_maps, + hidden_states=outputs.hidden_states if output_hidden_states else None, + attentions=None, + ) + + +@add_start_docstrings( + """ + TextNet Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for + ImageNet. + """, + TEXTNET_START_DOCSTRING, +) +class TextNetForImageClassification(TextNetPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.textnet = TextNetModel(config) + # classification head + self.classifier = nn.Sequential( + nn.Flatten(), + nn.Linear(config.hidden_sizes[-1] * 2 * 2, config.num_labels) if config.num_labels > 0 else nn.Identity(), + ) + # initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward("BIT_INPUTS_DOCSTRING") + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> ImageClassifierOutputWithNoAttention: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the image classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.textnet(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict) + + last_hidden_state = outputs.last_hidden_state if return_dict else outputs[0] + + logits = self.classifier(last_hidden_state) + + loss = None + + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + if not return_dict: + output = (logits,) + outputs[2:] + return (loss,) + output if loss is not None else output + + return ImageClassifierOutputWithNoAttention(loss=loss, logits=logits, hidden_states=outputs.hidden_states) diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 06bdee17752b..095c9a6f4189 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -8023,6 +8023,30 @@ def load_tf_weights_in_tapas(*args, **kwargs): requires_backends(load_tf_weights_in_tapas, ["torch"]) +TEXTNET_PRETRAINED_CONFIG_ARCHIVE_MAP = None + + +class TextNetBackbone(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class TextNetModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class TextNetPreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + TIME_SERIES_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = None diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py index 89366aba5081..18c6a27bd7dc 100644 --- a/src/transformers/utils/dummy_vision_objects.py +++ b/src/transformers/utils/dummy_vision_objects.py @@ -485,6 +485,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["vision"]) +class TextNetImageProcessor(metaclass=DummyObject): + _backends = ["vision"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["vision"]) + + class TvltImageProcessor(metaclass=DummyObject): _backends = ["vision"] diff --git a/tests/models/textnet/__init__.py b/tests/models/textnet/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/models/textnet/test_modeling_textnet.py b/tests/models/textnet/test_modeling_textnet.py new file mode 100644 index 000000000000..d7ebe31f6021 --- /dev/null +++ b/tests/models/textnet/test_modeling_textnet.py @@ -0,0 +1,407 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch TextNet model. """ +import inspect +import unittest + +import torch.nn as nn + +from transformers import ( + TextNetBackbone, + TextNetConfig, + is_torch_available, +) +from transformers.models.textnet.modeling_textnet import TextNetForImageClassification +from transformers.testing_utils import ( + require_torch, + torch_device, +) + +from ...test_backbone_common import BackboneTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + from transformers import ( + TextNetModel, + ) + + +class TextNetModelTester: + def __init__( + self, + parent, + backbone_kernel_size=3, + backbone_stride=2, + backbone_dilation=1, + backbone_groups=1, + backbone_bias=False, + backbone_has_shuffle=False, + backbone_in_channels=3, + backbone_out_channels=64, + backbone_use_bn=True, + backbone_act_func="relu", + backbone_dropout_rate=0, + backbone_ops_order="weight_bn_act", + backbone_stage1_in_channels=[64], + backbone_stage1_out_channels=[64], + backbone_stage1_kernel_size=[[3, 3]], + backbone_stage1_stride=[1], + backbone_stage1_dilation=[1], + backbone_stage1_groups=[1], + backbone_stage2_in_channels=[64], + backbone_stage2_out_channels=[128], + backbone_stage2_kernel_size=[[3, 1]], + backbone_stage2_stride=[2], + backbone_stage2_dilation=[1], + backbone_stage2_groups=[1], + backbone_stage3_in_channels=[128], + backbone_stage3_out_channels=[256], + backbone_stage3_kernel_size=[[1, 3]], + backbone_stage3_stride=[2], + backbone_stage3_dilation=[1], + backbone_stage3_groups=[1], + backbone_stage4_in_channels=[256], + backbone_stage4_out_channels=[512], + backbone_stage4_kernel_size=[[3, 3]], + backbone_stage4_stride=[2], + backbone_stage4_dilation=[1], + backbone_stage4_groups=[1], + out_features=["stage1", "stage2", "stage3", "stage4"], + out_indices=[1, 2, 3, 4], + batch_size=3, + num_channels=3, + image_size=32, + is_training=True, + use_labels=True, + hidden_act="relu", + num_labels=3, + hidden_sizes=[64, 64, 128, 256, 512], + ): + self.parent = parent + self.backbone_kernel_size = backbone_kernel_size + self.backbone_stride = backbone_stride + self.backbone_dilation = backbone_dilation + self.backbone_groups = backbone_groups + self.backbone_bias = backbone_bias + self.backbone_has_shuffle = backbone_has_shuffle + self.backbone_in_channels = backbone_in_channels + self.backbone_out_channels = backbone_out_channels + self.backbone_use_bn = backbone_use_bn + self.backbone_act_func = backbone_act_func + self.backbone_dropout_rate = backbone_dropout_rate + self.backbone_ops_order = backbone_ops_order + + self.backbone_stage1_in_channels = backbone_stage1_in_channels + self.backbone_stage1_out_channels = backbone_stage1_out_channels + self.backbone_stage1_kernel_size = backbone_stage1_kernel_size + self.backbone_stage1_stride = backbone_stage1_stride + self.backbone_stage1_dilation = backbone_stage1_dilation + self.backbone_stage1_groups = backbone_stage1_groups + + 
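+        # Each stage list holds a single entry here, so the tester builds a one-block-per-stage TextNet;
+        # this keeps the model below the 3M-parameter bound asserted in `test_model_is_small`.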
self.backbone_stage2_in_channels = backbone_stage2_in_channels + self.backbone_stage2_out_channels = backbone_stage2_out_channels + self.backbone_stage2_kernel_size = backbone_stage2_kernel_size + self.backbone_stage2_stride = backbone_stage2_stride + self.backbone_stage2_dilation = backbone_stage2_dilation + self.backbone_stage2_groups = backbone_stage2_groups + + self.backbone_stage3_in_channels = backbone_stage3_in_channels + self.backbone_stage3_out_channels = backbone_stage3_out_channels + self.backbone_stage3_kernel_size = backbone_stage3_kernel_size + self.backbone_stage3_stride = backbone_stage3_stride + self.backbone_stage3_dilation = backbone_stage3_dilation + self.backbone_stage3_groups = backbone_stage3_groups + + self.backbone_stage4_in_channels = backbone_stage4_in_channels + self.backbone_stage4_out_channels = backbone_stage4_out_channels + self.backbone_stage4_kernel_size = backbone_stage4_kernel_size + self.backbone_stage4_stride = backbone_stage4_stride + self.backbone_stage4_dilation = backbone_stage4_dilation + self.backbone_stage4_groups = backbone_stage4_groups + + self.out_features = out_features + self.out_indices = out_indices + + self.batch_size = batch_size + self.num_channels = num_channels + self.image_size = image_size + self.is_training = is_training + self.use_labels = use_labels + self.num_labels = num_labels + self.hidden_sizes = hidden_sizes + + self.num_stages = 5 + + def get_config(self): + return TextNetConfig( + backbone_kernel_size=self.backbone_kernel_size, + backbone_stride=self.backbone_stride, + backbone_dilation=self.backbone_dilation, + backbone_groups=self.backbone_groups, + backbone_bias=self.backbone_bias, + backbone_has_shuffle=self.backbone_has_shuffle, + backbone_in_channels=self.backbone_in_channels, + backbone_out_channels=self.backbone_out_channels, + backbone_use_bn=self.backbone_use_bn, + backbone_act_func=self.backbone_act_func, + backbone_dropout_rate=self.backbone_dropout_rate, + backbone_ops_order=self.backbone_ops_order, + backbone_stage1_in_channels=self.backbone_stage1_in_channels, + backbone_stage1_out_channels=self.backbone_stage1_out_channels, + backbone_stage1_kernel_size=self.backbone_stage1_kernel_size, + backbone_stage1_stride=self.backbone_stage1_stride, + backbone_stage1_dilation=self.backbone_stage1_dilation, + backbone_stage1_groups=self.backbone_stage1_groups, + backbone_stage2_in_channels=self.backbone_stage2_in_channels, + backbone_stage2_out_channels=self.backbone_stage2_out_channels, + backbone_stage2_kernel_size=self.backbone_stage2_kernel_size, + backbone_stage2_stride=self.backbone_stage2_stride, + backbone_stage2_dilation=self.backbone_stage2_dilation, + backbone_stage2_groups=self.backbone_stage2_groups, + backbone_stage3_in_channels=self.backbone_stage3_in_channels, + backbone_stage3_out_channels=self.backbone_stage3_out_channels, + backbone_stage3_kernel_size=self.backbone_stage3_kernel_size, + backbone_stage3_stride=self.backbone_stage3_stride, + backbone_stage3_dilation=self.backbone_stage3_dilation, + backbone_stage3_groups=self.backbone_stage3_groups, + backbone_stage4_in_channels=self.backbone_stage4_in_channels, + backbone_stage4_out_channels=self.backbone_stage4_out_channels, + backbone_stage4_kernel_size=self.backbone_stage4_kernel_size, + backbone_stage4_stride=self.backbone_stage4_stride, + backbone_stage4_dilation=self.backbone_stage4_dilation, + backbone_stage4_groups=self.backbone_stage4_groups, + out_features=self.out_features, + out_indices=self.out_indices, + 
hidden_sizes=self.hidden_sizes, + ) + + def create_and_check_model(self, config, pixel_values, labels): + model = TextNetModel(config=config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + self.parent.assertEqual( + result.last_hidden_state.shape, + (self.batch_size, self.hidden_sizes[-1], 2, 2), + ) + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + + labels = None + if self.use_labels: + labels = ids_tensor([self.batch_size], self.num_labels) + + config = self.get_config() + + return config, pixel_values, labels + + def create_and_check_backbone(self, config, pixel_values, labels): + model = TextNetBackbone(config=config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + + # verify feature maps + self.parent.assertEqual(len(result.feature_maps), len(config.out_features)) + self.parent.assertListEqual( + list(result.feature_maps[0].shape), [self.batch_size, self.backbone_stage1_out_channels[-1], 16, 16] + ) + + # verify channels + self.parent.assertEqual(len(model.channels), len(config.out_features)) + self.parent.assertListEqual(model.channels, config.hidden_sizes[1:]) + + # verify backbone works with out_features=None + config.out_features = None + model = TextNetBackbone(config=config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + + # verify feature maps + self.parent.assertEqual(len(result.feature_maps), 1) + self.parent.assertListEqual(list(result.feature_maps[0].shape), [self.batch_size, 512, 2, 2]) + + # verify channels + self.parent.assertEqual(len(model.channels), 1) + self.parent.assertListEqual(model.channels, [config.hidden_sizes[-1]]) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values, labels = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_torch +class TextNetModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = (TextNetModel, TextNetForImageClassification, TextNetBackbone) if is_torch_available() else () + + pipeline_model_mapping = ( + {"feature-extraction": TextNetModel, "image-classification": TextNetForImageClassification} + if is_torch_available() + else {} + ) + # fx_compatible = False + # test_pruning = False + # test_resize_embeddings = False + # test_head_masking = False + # has_attentions = False + + fx_compatible = False + test_pruning = False + test_resize_embeddings = False + test_head_masking = False + has_attentions = False + + def setUp(self): + self.model_tester = TextNetModelTester(self) + self.config_tester = ConfigTester(self, config_class=TextNetConfig, hidden_size=37) + + def test_config(self): + self.create_and_test_config_common_properties() + self.config_tester.create_and_test_config_to_json_string() + self.config_tester.create_and_test_config_to_json_file() + self.config_tester.create_and_test_config_from_and_save_pretrained() + self.config_tester.create_and_test_config_with_num_labels() + self.config_tester.check_config_can_be_init_without_params() + self.config_tester.check_config_arguments_init() + + def create_and_test_config_common_properties(self): + return + + @unittest.skip(reason="Bit does not output attentions") + def test_attention_outputs(self): + pass + + @unittest.skip(reason="Bit does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + @unittest.skip(reason="Bit does not 
support input and output embeddings") + def test_model_common_attributes(self): + pass + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["pixel_values"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_backbone(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_backbone(*config_and_inputs) + + def test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config=config) + for name, module in model.named_modules(): + if isinstance(module, (nn.BatchNorm2d, nn.GroupNorm)): + self.assertTrue( + torch.all(module.weight == 1), + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + self.assertTrue( + torch.all(module.bias == 0), + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + def test_hidden_states_output(self): + def check_hidden_states_output(inputs_dict, config, model_class): + model = model_class(config) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states + + expected_num_stages = self.model_tester.num_stages - 1 + self.assertEqual(len(hidden_states), expected_num_stages + 1) + + # Bit's feature maps are of shape (batch_size, num_channels, height, width) + self.assertListEqual( + list(hidden_states[0].shape[-2:]), + [self.model_tester.image_size // 2, self.model_tester.image_size // 2], + ) + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + layers_type = ["preactivation", "bottleneck"] + for model_class in self.all_model_classes: + for layer_type in layers_type: + config.layer_type = layer_type + inputs_dict["output_hidden_states"] = True + check_hidden_states_output(inputs_dict, config, model_class) + + # check that output_hidden_states also work using config + del inputs_dict["output_hidden_states"] + config.output_hidden_states = True + + check_hidden_states_output(inputs_dict, config, model_class) + + def test_model_is_small(self): + # Just a consistency check to make sure we are not running tests on 80M parameter models. + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + num_params = model.num_parameters() + assert ( + num_params < 3000000 + ), f"{model_class} is too big for the common tests ({num_params})! It should have 1M max." 
+ + @unittest.skip(reason="Bit does not use feedforward chunking") + def test_feed_forward_chunking(self): + pass + + # def test_for_image_classification(self): + # config_and_inputs = self.model_tester.prepare_config_and_inputs() + # self.model_tester.create_and_check_for_image_classification(*config_and_inputs) + + # @slow + # def test_model_from_pretrained(self): + # for model_name in BIT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + # model = BitModel.from_pretrained(model_name) + # self.assertIsNotNone(model) + + +@require_torch +class BitBackboneTest(BackboneTesterMixin, unittest.TestCase): + all_model_classes = (TextNetBackbone,) if is_torch_available() else () + config_class = TextNetConfig + + has_attentions = False + + def setUp(self): + self.model_tester = TextNetModelTester(self) From 12941e6aaad3acd281017e27649e58142b2c527b Mon Sep 17 00:00:00 2001 From: raghavanone Date: Wed, 8 Nov 2023 15:37:46 +0530 Subject: [PATCH 034/152] Fix failures --- src/transformers/models/textnet/image_processing_textnet.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/transformers/models/textnet/image_processing_textnet.py b/src/transformers/models/textnet/image_processing_textnet.py index 32975e13c7a8..0455e8199adf 100644 --- a/src/transformers/models/textnet/image_processing_textnet.py +++ b/src/transformers/models/textnet/image_processing_textnet.py @@ -72,8 +72,7 @@ class TextNetImageProcessor(BaseImageProcessor): the `preprocess` method. rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess` - method. - Whether to normalize the image. Can be overridden by `do_normalize` in the `preprocess` method. + method. Whether to normalize the image. Can be overridden by `do_normalize` in the `preprocess` method. do_normalize (`bool`, *optional*, defaults to `True`): image_mean (`float` or `List[float]`, *optional*, defaults to `OPENAI_CLIP_MEAN`): Mean to use if normalizing the image. 
This is a float or list of floats the length of the number of From 30568ef68e0745ea8db5cb83fc3e3b57b4fb52cf Mon Sep 17 00:00:00 2001 From: raghavanone Date: Wed, 8 Nov 2023 15:50:22 +0530 Subject: [PATCH 035/152] Refactor textnet model --- .../models/textnet/configuration_textnet.py | 152 ++++++------ .../models/textnet/modeling_textnet.py | 82 +++---- tests/models/textnet/test_modeling_textnet.py | 226 +++++++++--------- 3 files changed, 230 insertions(+), 230 deletions(-) diff --git a/src/transformers/models/textnet/configuration_textnet.py b/src/transformers/models/textnet/configuration_textnet.py index 9c7fe907aa13..6bcb961a0f97 100644 --- a/src/transformers/models/textnet/configuration_textnet.py +++ b/src/transformers/models/textnet/configuration_textnet.py @@ -34,42 +34,42 @@ class TextNetConfig(BackboneConfigMixin, PretrainedConfig): def __init__( self, - backbone_kernel_size=3, - backbone_stride=2, - backbone_dilation=1, - backbone_groups=1, - backbone_bias=False, - backbone_has_shuffle=False, - backbone_in_channels=3, - backbone_out_channels=64, - backbone_use_bn=True, - backbone_act_func="relu", - backbone_dropout_rate=0, - backbone_ops_order="weight_bn_act", - backbone_stage1_in_channels=[64, 64, 64], - backbone_stage1_out_channels=[64, 64, 64], - backbone_stage1_kernel_size=[[3, 3], [3, 3], [3, 3]], - backbone_stage1_stride=[1, 2, 1], - backbone_stage1_dilation=[1, 1, 1], - backbone_stage1_groups=[1, 1, 1], - backbone_stage2_in_channels=[64, 128, 128, 128], - backbone_stage2_out_channels=[128, 128, 128, 128], - backbone_stage2_kernel_size=[[3, 3], [1, 3], [3, 3], [3, 1]], - backbone_stage2_stride=[2, 1, 1, 1], - backbone_stage2_dilation=[1, 1, 1, 1], - backbone_stage2_groups=[1, 1, 1, 1], - backbone_stage3_in_channels=[128, 256, 256, 256], - backbone_stage3_out_channels=[256, 256, 256, 256], - backbone_stage3_kernel_size=[[3, 3], [3, 3], [3, 1], [1, 3]], - backbone_stage3_stride=[2, 1, 1, 1], - backbone_stage3_dilation=[1, 1, 1, 1], - backbone_stage3_groups=[1, 1, 1, 1], - backbone_stage4_in_channels=[256, 512, 512, 512], - backbone_stage4_out_channels=[512, 512, 512, 512], - backbone_stage4_kernel_size=[[3, 3], [3, 1], [1, 3], [3, 3]], - backbone_stage4_stride=[2, 1, 1, 1], - backbone_stage4_dilation=[1, 1, 1, 1], - backbone_stage4_groups=[1, 1, 1, 1], + kernel_size=3, + stride=2, + dilation=1, + groups=1, + bias=False, + has_shuffle=False, + in_channels=3, + out_channels=64, + use_bn=True, + act_func="relu", + dropout_rate=0, + ops_order="weight_bn_act", + stage1_in_channels=[64, 64, 64], + stage1_out_channels=[64, 64, 64], + stage1_kernel_size=[[3, 3], [3, 3], [3, 3]], + stage1_stride=[1, 2, 1], + stage1_dilation=[1, 1, 1], + stage1_groups=[1, 1, 1], + stage2_in_channels=[64, 128, 128, 128], + stage2_out_channels=[128, 128, 128, 128], + stage2_kernel_size=[[3, 3], [1, 3], [3, 3], [3, 1]], + stage2_stride=[2, 1, 1, 1], + stage2_dilation=[1, 1, 1, 1], + stage2_groups=[1, 1, 1, 1], + stage3_in_channels=[128, 256, 256, 256], + stage3_out_channels=[256, 256, 256, 256], + stage3_kernel_size=[[3, 3], [3, 3], [3, 1], [1, 3]], + stage3_stride=[2, 1, 1, 1], + stage3_dilation=[1, 1, 1, 1], + stage3_groups=[1, 1, 1, 1], + stage4_in_channels=[256, 512, 512, 512], + stage4_out_channels=[512, 512, 512, 512], + stage4_kernel_size=[[3, 3], [3, 1], [1, 3], [3, 3]], + stage4_stride=[2, 1, 1, 1], + stage4_dilation=[1, 1, 1, 1], + stage4_groups=[1, 1, 1, 1], hidden_sizes=[64, 64, 128, 256, 512], initializer_range=0.02, out_features=None, @@ -78,55 +78,55 @@ def __init__( ): 
super().__init__(**kwargs) - self.backbone_kernel_size = backbone_kernel_size - self.backbone_stride = backbone_stride - self.backbone_dilation = backbone_dilation - self.backbone_groups = backbone_groups - self.backbone_bias = backbone_bias - self.backbone_has_shuffle = backbone_has_shuffle - self.backbone_in_channels = backbone_in_channels - self.backbone_out_channels = backbone_out_channels - self.backbone_use_bn = backbone_use_bn - self.backbone_act_func = backbone_act_func - self.backbone_dropout_rate = backbone_dropout_rate - self.backbone_ops_order = backbone_ops_order + self.kernel_size = kernel_size + self.stride = stride + self.dilation = dilation + self.groups = groups + self.bias = bias + self.has_shuffle = has_shuffle + self.in_channels = in_channels + self.out_channels = out_channels + self.use_bn = use_bn + self.act_func = act_func + self.dropout_rate = dropout_rate + self.ops_order = ops_order - self.backbone_stage1_in_channels = backbone_stage1_in_channels - self.backbone_stage1_out_channels = backbone_stage1_out_channels - self.backbone_stage1_kernel_size = backbone_stage1_kernel_size - self.backbone_stage1_stride = backbone_stage1_stride - self.backbone_stage1_dilation = backbone_stage1_dilation - self.backbone_stage1_groups = backbone_stage1_groups + self.stage1_in_channels = stage1_in_channels + self.stage1_out_channels = stage1_out_channels + self.stage1_kernel_size = stage1_kernel_size + self.stage1_stride = stage1_stride + self.stage1_dilation = stage1_dilation + self.stage1_groups = stage1_groups - self.backbone_stage2_in_channels = backbone_stage2_in_channels - self.backbone_stage2_out_channels = backbone_stage2_out_channels - self.backbone_stage2_kernel_size = backbone_stage2_kernel_size - self.backbone_stage2_stride = backbone_stage2_stride - self.backbone_stage2_dilation = backbone_stage2_dilation - self.backbone_stage2_groups = backbone_stage2_groups + self.stage2_in_channels = stage2_in_channels + self.stage2_out_channels = stage2_out_channels + self.stage2_kernel_size = stage2_kernel_size + self.stage2_stride = stage2_stride + self.stage2_dilation = stage2_dilation + self.stage2_groups = stage2_groups - self.backbone_stage3_in_channels = backbone_stage3_in_channels - self.backbone_stage3_out_channels = backbone_stage3_out_channels - self.backbone_stage3_kernel_size = backbone_stage3_kernel_size - self.backbone_stage3_stride = backbone_stage3_stride - self.backbone_stage3_dilation = backbone_stage3_dilation - self.backbone_stage3_groups = backbone_stage3_groups + self.stage3_in_channels = stage3_in_channels + self.stage3_out_channels = stage3_out_channels + self.stage3_kernel_size = stage3_kernel_size + self.stage3_stride = stage3_stride + self.stage3_dilation = stage3_dilation + self.stage3_groups = stage3_groups - self.backbone_stage4_in_channels = backbone_stage4_in_channels - self.backbone_stage4_out_channels = backbone_stage4_out_channels - self.backbone_stage4_kernel_size = backbone_stage4_kernel_size - self.backbone_stage4_stride = backbone_stage4_stride - self.backbone_stage4_dilation = backbone_stage4_dilation - self.backbone_stage4_groups = backbone_stage4_groups + self.stage4_in_channels = stage4_in_channels + self.stage4_out_channels = stage4_out_channels + self.stage4_kernel_size = stage4_kernel_size + self.stage4_stride = stage4_stride + self.stage4_dilation = stage4_dilation + self.stage4_groups = stage4_groups self.initializer_range = initializer_range self.hidden_sizes = hidden_sizes self.depths = [ - len(self.backbone_stage1_out_channels), - 
len(self.backbone_stage2_out_channels), - len(self.backbone_stage3_out_channels), - len(self.backbone_stage4_out_channels), + len(self.stage1_out_channels), + len(self.stage2_out_channels), + len(self.stage3_out_channels), + len(self.stage4_out_channels), ] self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, 5)] self._out_features, self._out_indices = get_aligned_output_features_output_indices( diff --git a/src/transformers/models/textnet/modeling_textnet.py b/src/transformers/models/textnet/modeling_textnet.py index 119ee2c7418d..cc0b834f4262 100644 --- a/src/transformers/models/textnet/modeling_textnet.py +++ b/src/transformers/models/textnet/modeling_textnet.py @@ -363,63 +363,63 @@ class TextNetModel(TextNetPreTrainedModel): def __init__(self, config): super().__init__(config) self.first_conv = TextNetConvLayer( - config.backbone_in_channels, - config.backbone_out_channels, - config.backbone_kernel_size, - config.backbone_stride, - config.backbone_dilation, - config.backbone_groups, - config.backbone_bias, - config.backbone_has_shuffle, - config.backbone_use_bn, - config.backbone_act_func, - config.backbone_dropout_rate, - config.backbone_ops_order, + config.in_channels, + config.out_channels, + config.kernel_size, + config.stride, + config.dilation, + config.groups, + config.bias, + config.has_shuffle, + config.use_bn, + config.act_func, + config.dropout_rate, + config.ops_order, ) stage1 = [] for stage_config in zip( - config.backbone_stage1_in_channels, - config.backbone_stage1_out_channels, - config.backbone_stage1_kernel_size, - config.backbone_stage1_stride, - config.backbone_stage1_dilation, - config.backbone_stage1_groups, + config.stage1_in_channels, + config.stage1_out_channels, + config.stage1_kernel_size, + config.stage1_stride, + config.stage1_dilation, + config.stage1_groups, ): stage1.append(TestNetRepConvLayer(*stage_config)) self.stage1 = nn.ModuleList(stage1) stage2 = [] for stage_config in zip( - config.backbone_stage2_in_channels, - config.backbone_stage2_out_channels, - config.backbone_stage2_kernel_size, - config.backbone_stage2_stride, - config.backbone_stage2_dilation, - config.backbone_stage2_groups, + config.stage2_in_channels, + config.stage2_out_channels, + config.stage2_kernel_size, + config.stage2_stride, + config.stage2_dilation, + config.stage2_groups, ): stage2.append(TestNetRepConvLayer(*stage_config)) self.stage2 = nn.ModuleList(stage2) stage3 = [] for stage_config in zip( - config.backbone_stage3_in_channels, - config.backbone_stage3_out_channels, - config.backbone_stage3_kernel_size, - config.backbone_stage3_stride, - config.backbone_stage3_dilation, - config.backbone_stage3_groups, + config.stage3_in_channels, + config.stage3_out_channels, + config.stage3_kernel_size, + config.stage3_stride, + config.stage3_dilation, + config.stage3_groups, ): stage3.append(TestNetRepConvLayer(*stage_config)) self.stage3 = nn.ModuleList(stage3) stage4 = [] for stage_config in zip( - config.backbone_stage4_in_channels, - config.backbone_stage4_out_channels, - config.backbone_stage4_kernel_size, - config.backbone_stage4_stride, - config.backbone_stage4_dilation, - config.backbone_stage4_groups, + config.stage4_in_channels, + config.stage4_out_channels, + config.stage4_kernel_size, + config.stage4_stride, + config.stage4_dilation, + config.stage4_groups, ): stage4.append(TestNetRepConvLayer(*stage_config)) self.stage4 = nn.ModuleList(stage4) @@ -481,11 +481,11 @@ def __init__(self, config): self.textnet = TextNetModel(config) self.num_features = [ - 
config.backbone_out_channels, - config.backbone_stage1_out_channels[-1], - config.backbone_stage2_out_channels[-1], - config.backbone_stage3_out_channels[-1], - config.backbone_stage4_out_channels[-1], + config.out_channels, + config.stage1_out_channels[-1], + config.stage2_out_channels[-1], + config.stage3_out_channels[-1], + config.stage4_out_channels[-1], ] # initialize weights and apply final processing diff --git a/tests/models/textnet/test_modeling_textnet.py b/tests/models/textnet/test_modeling_textnet.py index d7ebe31f6021..bd18111f582c 100644 --- a/tests/models/textnet/test_modeling_textnet.py +++ b/tests/models/textnet/test_modeling_textnet.py @@ -47,42 +47,42 @@ class TextNetModelTester: def __init__( self, parent, - backbone_kernel_size=3, - backbone_stride=2, - backbone_dilation=1, - backbone_groups=1, - backbone_bias=False, - backbone_has_shuffle=False, - backbone_in_channels=3, - backbone_out_channels=64, - backbone_use_bn=True, - backbone_act_func="relu", - backbone_dropout_rate=0, - backbone_ops_order="weight_bn_act", - backbone_stage1_in_channels=[64], - backbone_stage1_out_channels=[64], - backbone_stage1_kernel_size=[[3, 3]], - backbone_stage1_stride=[1], - backbone_stage1_dilation=[1], - backbone_stage1_groups=[1], - backbone_stage2_in_channels=[64], - backbone_stage2_out_channels=[128], - backbone_stage2_kernel_size=[[3, 1]], - backbone_stage2_stride=[2], - backbone_stage2_dilation=[1], - backbone_stage2_groups=[1], - backbone_stage3_in_channels=[128], - backbone_stage3_out_channels=[256], - backbone_stage3_kernel_size=[[1, 3]], - backbone_stage3_stride=[2], - backbone_stage3_dilation=[1], - backbone_stage3_groups=[1], - backbone_stage4_in_channels=[256], - backbone_stage4_out_channels=[512], - backbone_stage4_kernel_size=[[3, 3]], - backbone_stage4_stride=[2], - backbone_stage4_dilation=[1], - backbone_stage4_groups=[1], + kernel_size=3, + stride=2, + dilation=1, + groups=1, + bias=False, + has_shuffle=False, + in_channels=3, + out_channels=64, + use_bn=True, + act_func="relu", + dropout_rate=0, + ops_order="weight_bn_act", + stage1_in_channels=[64], + stage1_out_channels=[64], + stage1_kernel_size=[[3, 3]], + stage1_stride=[1], + stage1_dilation=[1], + stage1_groups=[1], + stage2_in_channels=[64], + stage2_out_channels=[128], + stage2_kernel_size=[[3, 1]], + stage2_stride=[2], + stage2_dilation=[1], + stage2_groups=[1], + stage3_in_channels=[128], + stage3_out_channels=[256], + stage3_kernel_size=[[1, 3]], + stage3_stride=[2], + stage3_dilation=[1], + stage3_groups=[1], + stage4_in_channels=[256], + stage4_out_channels=[512], + stage4_kernel_size=[[3, 3]], + stage4_stride=[2], + stage4_dilation=[1], + stage4_groups=[1], out_features=["stage1", "stage2", "stage3", "stage4"], out_indices=[1, 2, 3, 4], batch_size=3, @@ -95,46 +95,46 @@ def __init__( hidden_sizes=[64, 64, 128, 256, 512], ): self.parent = parent - self.backbone_kernel_size = backbone_kernel_size - self.backbone_stride = backbone_stride - self.backbone_dilation = backbone_dilation - self.backbone_groups = backbone_groups - self.backbone_bias = backbone_bias - self.backbone_has_shuffle = backbone_has_shuffle - self.backbone_in_channels = backbone_in_channels - self.backbone_out_channels = backbone_out_channels - self.backbone_use_bn = backbone_use_bn - self.backbone_act_func = backbone_act_func - self.backbone_dropout_rate = backbone_dropout_rate - self.backbone_ops_order = backbone_ops_order - - self.backbone_stage1_in_channels = backbone_stage1_in_channels - self.backbone_stage1_out_channels = 
backbone_stage1_out_channels - self.backbone_stage1_kernel_size = backbone_stage1_kernel_size - self.backbone_stage1_stride = backbone_stage1_stride - self.backbone_stage1_dilation = backbone_stage1_dilation - self.backbone_stage1_groups = backbone_stage1_groups - - self.backbone_stage2_in_channels = backbone_stage2_in_channels - self.backbone_stage2_out_channels = backbone_stage2_out_channels - self.backbone_stage2_kernel_size = backbone_stage2_kernel_size - self.backbone_stage2_stride = backbone_stage2_stride - self.backbone_stage2_dilation = backbone_stage2_dilation - self.backbone_stage2_groups = backbone_stage2_groups - - self.backbone_stage3_in_channels = backbone_stage3_in_channels - self.backbone_stage3_out_channels = backbone_stage3_out_channels - self.backbone_stage3_kernel_size = backbone_stage3_kernel_size - self.backbone_stage3_stride = backbone_stage3_stride - self.backbone_stage3_dilation = backbone_stage3_dilation - self.backbone_stage3_groups = backbone_stage3_groups - - self.backbone_stage4_in_channels = backbone_stage4_in_channels - self.backbone_stage4_out_channels = backbone_stage4_out_channels - self.backbone_stage4_kernel_size = backbone_stage4_kernel_size - self.backbone_stage4_stride = backbone_stage4_stride - self.backbone_stage4_dilation = backbone_stage4_dilation - self.backbone_stage4_groups = backbone_stage4_groups + self.kernel_size = kernel_size + self.stride = stride + self.dilation = dilation + self.groups = groups + self.bias = bias + self.has_shuffle = has_shuffle + self.in_channels = in_channels + self.out_channels = out_channels + self.use_bn = use_bn + self.act_func = act_func + self.dropout_rate = dropout_rate + self.ops_order = ops_order + + self.stage1_in_channels = stage1_in_channels + self.stage1_out_channels = stage1_out_channels + self.stage1_kernel_size = stage1_kernel_size + self.stage1_stride = stage1_stride + self.stage1_dilation = stage1_dilation + self.stage1_groups = stage1_groups + + self.stage2_in_channels = stage2_in_channels + self.stage2_out_channels = stage2_out_channels + self.stage2_kernel_size = stage2_kernel_size + self.stage2_stride = stage2_stride + self.stage2_dilation = stage2_dilation + self.stage2_groups = stage2_groups + + self.stage3_in_channels = stage3_in_channels + self.stage3_out_channels = stage3_out_channels + self.stage3_kernel_size = stage3_kernel_size + self.stage3_stride = stage3_stride + self.stage3_dilation = stage3_dilation + self.stage3_groups = stage3_groups + + self.stage4_in_channels = stage4_in_channels + self.stage4_out_channels = stage4_out_channels + self.stage4_kernel_size = stage4_kernel_size + self.stage4_stride = stage4_stride + self.stage4_dilation = stage4_dilation + self.stage4_groups = stage4_groups self.out_features = out_features self.out_indices = out_indices @@ -151,42 +151,42 @@ def __init__( def get_config(self): return TextNetConfig( - backbone_kernel_size=self.backbone_kernel_size, - backbone_stride=self.backbone_stride, - backbone_dilation=self.backbone_dilation, - backbone_groups=self.backbone_groups, - backbone_bias=self.backbone_bias, - backbone_has_shuffle=self.backbone_has_shuffle, - backbone_in_channels=self.backbone_in_channels, - backbone_out_channels=self.backbone_out_channels, - backbone_use_bn=self.backbone_use_bn, - backbone_act_func=self.backbone_act_func, - backbone_dropout_rate=self.backbone_dropout_rate, - backbone_ops_order=self.backbone_ops_order, - backbone_stage1_in_channels=self.backbone_stage1_in_channels, - 
backbone_stage1_out_channels=self.backbone_stage1_out_channels, - backbone_stage1_kernel_size=self.backbone_stage1_kernel_size, - backbone_stage1_stride=self.backbone_stage1_stride, - backbone_stage1_dilation=self.backbone_stage1_dilation, - backbone_stage1_groups=self.backbone_stage1_groups, - backbone_stage2_in_channels=self.backbone_stage2_in_channels, - backbone_stage2_out_channels=self.backbone_stage2_out_channels, - backbone_stage2_kernel_size=self.backbone_stage2_kernel_size, - backbone_stage2_stride=self.backbone_stage2_stride, - backbone_stage2_dilation=self.backbone_stage2_dilation, - backbone_stage2_groups=self.backbone_stage2_groups, - backbone_stage3_in_channels=self.backbone_stage3_in_channels, - backbone_stage3_out_channels=self.backbone_stage3_out_channels, - backbone_stage3_kernel_size=self.backbone_stage3_kernel_size, - backbone_stage3_stride=self.backbone_stage3_stride, - backbone_stage3_dilation=self.backbone_stage3_dilation, - backbone_stage3_groups=self.backbone_stage3_groups, - backbone_stage4_in_channels=self.backbone_stage4_in_channels, - backbone_stage4_out_channels=self.backbone_stage4_out_channels, - backbone_stage4_kernel_size=self.backbone_stage4_kernel_size, - backbone_stage4_stride=self.backbone_stage4_stride, - backbone_stage4_dilation=self.backbone_stage4_dilation, - backbone_stage4_groups=self.backbone_stage4_groups, + kernel_size=self.kernel_size, + stride=self.stride, + dilation=self.dilation, + groups=self.groups, + bias=self.bias, + has_shuffle=self.has_shuffle, + in_channels=self.in_channels, + out_channels=self.out_channels, + use_bn=self.use_bn, + act_func=self.act_func, + dropout_rate=self.dropout_rate, + ops_order=self.ops_order, + stage1_in_channels=self.stage1_in_channels, + stage1_out_channels=self.stage1_out_channels, + stage1_kernel_size=self.stage1_kernel_size, + stage1_stride=self.stage1_stride, + stage1_dilation=self.stage1_dilation, + stage1_groups=self.stage1_groups, + stage2_in_channels=self.stage2_in_channels, + stage2_out_channels=self.stage2_out_channels, + stage2_kernel_size=self.stage2_kernel_size, + stage2_stride=self.stage2_stride, + stage2_dilation=self.stage2_dilation, + stage2_groups=self.stage2_groups, + stage3_in_channels=self.stage3_in_channels, + stage3_out_channels=self.stage3_out_channels, + stage3_kernel_size=self.stage3_kernel_size, + stage3_stride=self.stage3_stride, + stage3_dilation=self.stage3_dilation, + stage3_groups=self.stage3_groups, + stage4_in_channels=self.stage4_in_channels, + stage4_out_channels=self.stage4_out_channels, + stage4_kernel_size=self.stage4_kernel_size, + stage4_stride=self.stage4_stride, + stage4_dilation=self.stage4_dilation, + stage4_groups=self.stage4_groups, out_features=self.out_features, out_indices=self.out_indices, hidden_sizes=self.hidden_sizes, @@ -222,7 +222,7 @@ def create_and_check_backbone(self, config, pixel_values, labels): # verify feature maps self.parent.assertEqual(len(result.feature_maps), len(config.out_features)) self.parent.assertListEqual( - list(result.feature_maps[0].shape), [self.batch_size, self.backbone_stage1_out_channels[-1], 16, 16] + list(result.feature_maps[0].shape), [self.batch_size, self.stage1_out_channels[-1], 16, 16] ) # verify channels From 1f99e8485fe177ad0523b69d9e51ecd869cc10cf Mon Sep 17 00:00:00 2001 From: raghavanone Date: Wed, 8 Nov 2023 16:08:53 +0530 Subject: [PATCH 036/152] Fix failures --- src/transformers/__init__.py | 3 +++ src/transformers/models/__init__.py | 1 + src/transformers/models/textnet/__init__.py | 4 ++-- 
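With the `backbone_` prefix dropped from the TextNet arguments above, a configuration is now built with the bare names. A minimal sketch mirroring the tester defaults shown in this patch (all values illustrative, construction only):

```python
from transformers import TextNetConfig, TextNetModel

# Same fields as before, minus the backbone_ prefix; values mirror the tester defaults.
config = TextNetConfig(
    kernel_size=3,
    stride=2,
    in_channels=3,
    out_channels=64,
    act_func="relu",
    ops_order="weight_bn_act",
    out_features=["stage1", "stage2", "stage3", "stage4"],
    out_indices=[1, 2, 3, 4],
    hidden_sizes=[64, 64, 128, 256, 512],
)
model = TextNetModel(config)  # stage-level lists are left at their defaults here
```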
src/transformers/utils/dummy_pt_objects.py | 7 +++++++ utils/check_repo.py | 1 + 5 files changed, 14 insertions(+), 2 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 90270a9e406f..e3110a1d72ba 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -3305,6 +3305,8 @@ "TEXTNET_PRETRAINED_CONFIG_ARCHIVE_MAP", "TextNetBackbone", "TextNetModel", + "TextNetForImageClassification", + "TextNetPreTrainedModel" ] ) _import_structure["models.time_series_transformer"].extend( @@ -7664,6 +7666,7 @@ TextNetBackbone, TextNetModel, TextNetPreTrainedModel, + TextNetForImageClassification, ) from .models.time_series_transformer import ( TIME_SERIES_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST, diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 2c20873c2ed7..8ca5c9ae27ba 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -206,6 +206,7 @@ swinv2, switch_transformers, t5, + textnet, table_transformer, tapas, time_series_transformer, diff --git a/src/transformers/models/textnet/__init__.py b/src/transformers/models/textnet/__init__.py index 21e26f387817..9a4832c7caa9 100644 --- a/src/transformers/models/textnet/__init__.py +++ b/src/transformers/models/textnet/__init__.py @@ -20,7 +20,6 @@ is_torch_available, ) - _import_structure = { "configuration_textnet": ["TEXTNET_PRETRAINED_CONFIG_ARCHIVE_MAP", "TextNetConfig"], "image_processing_textnet": ["TextNetImageProcessor"], @@ -32,7 +31,8 @@ except OptionalDependencyNotAvailable: pass else: - _import_structure["modeling_textnet"] = ["TextNetBackbone", "TextNetModel", "TextNetPreTrainedModel"] + _import_structure["modeling_textnet"] = ["TextNetBackbone", "TextNetModel", "TextNetPreTrainedModel", + "TextNetForImageClassification"] if TYPE_CHECKING: from .configuration_textnet import TEXTNET_PRETRAINED_CONFIG_ARCHIVE_MAP, TextNetConfig diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 095c9a6f4189..70047baa2931 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -8047,6 +8047,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +class TextNetForImageClassification(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + TIME_SERIES_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = None diff --git a/utils/check_repo.py b/utils/check_repo.py index 3af3a05a8aa6..66f9d7f2b757 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -993,6 +993,7 @@ def find_all_documented_objects() -> List[str]: "ResNetBackbone", "SwinBackbone", "Swinv2Backbone", + "TextNetBackbone", "TimmBackbone", "TimmBackboneConfig", "VitDetBackbone", From 02e85ed1f2527370d3dec313d3e7765a6355e493 Mon Sep 17 00:00:00 2001 From: raghavanone Date: Wed, 8 Nov 2023 17:38:14 +0530 Subject: [PATCH 037/152] Add cv2 to setup --- setup.py | 1 + src/transformers/__init__.py | 6 +++--- src/transformers/models/__init__.py | 2 +- src/transformers/models/textnet/__init__.py | 9 +++++++-- src/transformers/utils/dummy_pt_objects.py | 6 +++--- 5 files changed, 15 insertions(+), 9 deletions(-) diff --git a/setup.py b/setup.py index 65b84fe938f7..cde79041adb3 100644 --- a/setup.py +++ b/setup.py @@ -296,6 +296,7 @@ def run(self): extras["natten"] = deps_list("natten") extras["codecarbon"] = deps_list("codecarbon") extras["video"] = deps_list("decord", "av") 
+extras["opencv-python"] = deps_list("opencv-python") extras["sentencepiece"] = deps_list("sentencepiece", "protobuf") extras["testing"] = ( diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index e3110a1d72ba..7cfffec8463b 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -3304,9 +3304,9 @@ [ "TEXTNET_PRETRAINED_CONFIG_ARCHIVE_MAP", "TextNetBackbone", - "TextNetModel", "TextNetForImageClassification", - "TextNetPreTrainedModel" + "TextNetModel", + "TextNetPreTrainedModel", ] ) _import_structure["models.time_series_transformer"].extend( @@ -7664,9 +7664,9 @@ from .models.textnet import ( TEXTNET_PRETRAINED_CONFIG_ARCHIVE_MAP, TextNetBackbone, + TextNetForImageClassification, TextNetModel, TextNetPreTrainedModel, - TextNetForImageClassification, ) from .models.time_series_transformer import ( TIME_SERIES_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST, diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 8ca5c9ae27ba..b63e845c7060 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -206,9 +206,9 @@ swinv2, switch_transformers, t5, - textnet, table_transformer, tapas, + textnet, time_series_transformer, timesformer, timm_backbone, diff --git a/src/transformers/models/textnet/__init__.py b/src/transformers/models/textnet/__init__.py index 9a4832c7caa9..cd8c1fa3276b 100644 --- a/src/transformers/models/textnet/__init__.py +++ b/src/transformers/models/textnet/__init__.py @@ -20,6 +20,7 @@ is_torch_available, ) + _import_structure = { "configuration_textnet": ["TEXTNET_PRETRAINED_CONFIG_ARCHIVE_MAP", "TextNetConfig"], "image_processing_textnet": ["TextNetImageProcessor"], @@ -31,8 +32,12 @@ except OptionalDependencyNotAvailable: pass else: - _import_structure["modeling_textnet"] = ["TextNetBackbone", "TextNetModel", "TextNetPreTrainedModel", - "TextNetForImageClassification"] + _import_structure["modeling_textnet"] = [ + "TextNetBackbone", + "TextNetModel", + "TextNetPreTrainedModel", + "TextNetForImageClassification", + ] if TYPE_CHECKING: from .configuration_textnet import TEXTNET_PRETRAINED_CONFIG_ARCHIVE_MAP, TextNetConfig diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 70047baa2931..f45bbda68f5f 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -8033,21 +8033,21 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class TextNetModel(metaclass=DummyObject): +class TextNetForImageClassification(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class TextNetPreTrainedModel(metaclass=DummyObject): +class TextNetModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class TextNetForImageClassification(metaclass=DummyObject): +class TextNetPreTrainedModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): From 632ef069bb0f99d0a122f0e075a6dad3d2fc9ac7 Mon Sep 17 00:00:00 2001 From: raghavanone Date: Wed, 8 Nov 2023 17:56:16 +0530 Subject: [PATCH 038/152] Fix failures --- src/transformers/models/textnet/__init__.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/textnet/__init__.py b/src/transformers/models/textnet/__init__.py index cd8c1fa3276b..6ac78b0bce02 100644 --- a/src/transformers/models/textnet/__init__.py +++ 
b/src/transformers/models/textnet/__init__.py @@ -49,7 +49,12 @@ except OptionalDependencyNotAvailable: pass else: - from .modeling_textnet import TextNetBackbone, TextNetModel, TextNetPreTrainedModel + from .modeling_textnet import ( + TextNetBackbone, + TextNetForImageClassification, + TextNetModel, + TextNetPreTrainedModel, + ) else: From 8c25e477cd96e8d48251377e76a234f7c7a9c699 Mon Sep 17 00:00:00 2001 From: raghavanone Date: Wed, 8 Nov 2023 18:05:44 +0530 Subject: [PATCH 039/152] Fix failures --- .../models/textnet/modeling_textnet.py | 26 +++++++++---------- tests/models/textnet/test_modeling_textnet.py | 3 +-- 2 files changed, 14 insertions(+), 15 deletions(-) diff --git a/src/transformers/models/textnet/modeling_textnet.py b/src/transformers/models/textnet/modeling_textnet.py index cc0b834f4262..72950f0776ec 100644 --- a/src/transformers/models/textnet/modeling_textnet.py +++ b/src/transformers/models/textnet/modeling_textnet.py @@ -502,19 +502,19 @@ def forward( Examples: ```python - >>> from transformers import AutoImageProcessor, AutoBackbone - >>> import torch - >>> from PIL import Image - >>> import requests - - >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" - >>> image = Image.open(requests.get(url, stream=True).raw) - - >>> processor = AutoImageProcessor.from_pretrained("google/resnetnv2-50") - >>> model = AutoBackbone.from_pretrained("google/resnetnv2-50") - - >>> inputs = processor(image, return_tensors="pt") - >>> outputs = model(**inputs) + # >>> from transformers import AutoImageProcessor, AutoBackbone + # >>> import torch + # >>> from PIL import Image + # >>> import requests + # + # >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + # >>> image = Image.open(requests.get(url, stream=True).raw) + # + # >>> processor = AutoImageProcessor.from_pretrained("google/resnetnv2-50") + # >>> model = AutoBackbone.from_pretrained("google/resnetnv2-50") + # + # >>> inputs = processor(image, return_tensors="pt") + # >>> outputs = model(**inputs) ```""" return_dict = return_dict if return_dict is not None else self.config.use_return_dict output_hidden_states = ( diff --git a/tests/models/textnet/test_modeling_textnet.py b/tests/models/textnet/test_modeling_textnet.py index bd18111f582c..a73833f20a16 100644 --- a/tests/models/textnet/test_modeling_textnet.py +++ b/tests/models/textnet/test_modeling_textnet.py @@ -16,8 +16,6 @@ import inspect import unittest -import torch.nn as nn - from transformers import ( TextNetBackbone, TextNetConfig, @@ -37,6 +35,7 @@ if is_torch_available(): import torch + import torch.nn as nn from transformers import ( TextNetModel, From 1537643c48265ebe0c4336661e6ab36beec9cf82 Mon Sep 17 00:00:00 2001 From: raghavanone Date: Wed, 8 Nov 2023 18:14:28 +0530 Subject: [PATCH 040/152] Add CV2 dependency --- setup.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/setup.py b/setup.py index cde79041adb3..cc354db415ef 100644 --- a/setup.py +++ b/setup.py @@ -290,13 +290,12 @@ def run(self): extras["torch-speech"] = deps_list("torchaudio") + extras["audio"] extras["tf-speech"] = extras["audio"] extras["flax-speech"] = extras["audio"] -extras["vision"] = deps_list("Pillow") +extras["vision"] = deps_list("Pillow", "opencv-python") extras["timm"] = deps_list("timm") extras["torch-vision"] = deps_list("torchvision") + extras["vision"] extras["natten"] = deps_list("natten") extras["codecarbon"] = deps_list("codecarbon") extras["video"] = deps_list("decord", "av") -extras["opencv-python"] = deps_list("opencv-python") 
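Since the doctest in `modeling_textnet.py` is commented out above while its checkpoint names are placeholders, a checkpoint-free sketch of the backbone path can stand in. The one-block-per-stage layout is borrowed from the model tester, and the 224x224 input size is an arbitrary choice:

```python
import torch

from transformers import TextNetBackbone, TextNetConfig

# Compact one-block-per-stage layout taken from the model tester; input size is arbitrary.
config = TextNetConfig(
    stage1_in_channels=[64], stage1_out_channels=[64], stage1_kernel_size=[[3, 3]], stage1_stride=[1],
    stage2_in_channels=[64], stage2_out_channels=[128], stage2_kernel_size=[[3, 1]], stage2_stride=[2],
    stage3_in_channels=[128], stage3_out_channels=[256], stage3_kernel_size=[[1, 3]], stage3_stride=[2],
    stage4_in_channels=[256], stage4_out_channels=[512], stage4_kernel_size=[[3, 3]], stage4_stride=[2],
    out_features=["stage1", "stage2", "stage3", "stage4"],
    out_indices=[1, 2, 3, 4],
)
backbone = TextNetBackbone(config)

with torch.no_grad():
    outputs = backbone(torch.randn(1, 3, 224, 224))
print([tuple(fm.shape) for fm in outputs.feature_maps])  # one feature map per requested stage
```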
extras["sentencepiece"] = deps_list("sentencepiece", "protobuf") extras["testing"] = ( From 9718ca1cdcfb4f1fbec9ff19beebccc757ed7d35 Mon Sep 17 00:00:00 2001 From: raghavanone Date: Wed, 8 Nov 2023 18:32:42 +0530 Subject: [PATCH 041/152] Fix bugs --- src/transformers/models/fast/configuration_fast.py | 4 ---- .../models/fast/image_processing_fast.py | 12 +++++++----- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/src/transformers/models/fast/configuration_fast.py b/src/transformers/models/fast/configuration_fast.py index 186b398a4745..3d499d756c5e 100644 --- a/src/transformers/models/fast/configuration_fast.py +++ b/src/transformers/models/fast/configuration_fast.py @@ -95,8 +95,6 @@ def __init__( head_final_act_func=None, head_final_dropout_rate=0, head_final_ops_order="weight", - min_area=250, - bbox_type="rect", loss_bg=False, initializer_range=0.02, **kwargs, @@ -174,7 +172,5 @@ def __init__( self.head_final_dropout_rate = head_final_dropout_rate self.head_final_ops_order = head_final_ops_order - self.min_area = min_area - self.bbox_type = bbox_type self.loss_bg = loss_bg self.initializer_range = initializer_range diff --git a/src/transformers/models/fast/image_processing_fast.py b/src/transformers/models/fast/image_processing_fast.py index 8aeb1e6f0334..ae4505d4fffa 100644 --- a/src/transformers/models/fast/image_processing_fast.py +++ b/src/transformers/models/fast/image_processing_fast.py @@ -390,7 +390,7 @@ def _max_pooling(self, x, scale=1): ) return x - def post_process_text_detection(self, output, target_sizes, threshold): + def post_process_text_detection(self, output, target_sizes, threshold, bbox_type="rect"): scale = 2 img_size = (self.size["height"], self.size["width"]) out = output["hidden_states"] @@ -429,13 +429,15 @@ def post_process_text_detection(self, output, target_sizes, threshold): org_img_size = target_sizes[i] scales = (float(org_img_size[1]) / float(img_size[1]), float(org_img_size[0]) / float(img_size[0])) - bboxes, scores = self.generate_bbox(keys[i], labels[i], score_maps[i], scales, threshold) + bboxes, scores = self.generate_bbox( + keys[i], labels[i], score_maps[i], scales, threshold, bbox_type=bbox_type + ) results.append({"bboxes": bboxes, "scores": scores}) final_results.update({"results": results}) return results - def generate_bbox(self, keys, label, score, scales, threshold): + def generate_bbox(self, keys, label, score, scales, threshold, bbox_type): label_num = len(keys) bboxes = [] scores = [] @@ -452,13 +454,13 @@ def generate_bbox(self, keys, label, score, scales, threshold): label[ind] = 0 continue - if self.bbox_type == "rect": + if bbox_type == "rect": rect = cv2.minAreaRect(points[:, ::-1]) alpha = math.sqrt(math.sqrt(points.shape[0] / (rect[1][0] * rect[1][1]))) rect = (rect[0], (rect[1][0] * alpha, rect[1][1] * alpha), rect[2]) bbox = cv2.boxPoints(rect) * scales - elif self.bbox_type == "poly": + elif bbox_type == "poly": binary = np.zeros(label.shape, dtype="uint8") binary[ind_np] = 1 contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) From 26c7542a844ca3d405c4dc870d18f79fd9c69d4d Mon Sep 17 00:00:00 2001 From: raghavanone Date: Wed, 8 Nov 2023 19:07:00 +0530 Subject: [PATCH 042/152] Fix build issue --- tests/models/textnet/test_modeling_textnet.py | 11 +++++------ utils/check_docstrings.py | 1 + 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/models/textnet/test_modeling_textnet.py b/tests/models/textnet/test_modeling_textnet.py index a73833f20a16..6fb31172adc8 100644 
--- a/tests/models/textnet/test_modeling_textnet.py +++ b/tests/models/textnet/test_modeling_textnet.py @@ -16,16 +16,11 @@ import inspect import unittest -from transformers import ( - TextNetBackbone, - TextNetConfig, - is_torch_available, -) -from transformers.models.textnet.modeling_textnet import TextNetForImageClassification from transformers.testing_utils import ( require_torch, torch_device, ) +from transformers.utils import is_torch_available from ...test_backbone_common import BackboneTesterMixin from ...test_configuration_common import ConfigTester @@ -38,7 +33,11 @@ import torch.nn as nn from transformers import ( + TextNetBackbone, + TextNetConfig, + TextNetForImageClassification, TextNetModel, + is_torch_available, ) diff --git a/utils/check_docstrings.py b/utils/check_docstrings.py index 3c4663103979..a867e46ce64e 100644 --- a/utils/check_docstrings.py +++ b/utils/check_docstrings.py @@ -487,6 +487,7 @@ "TapasConfig", "TapasModel", "TapasTokenizer", + "TextNetImageProcessor", "Text2TextGenerationPipeline", "TextClassificationPipeline", "TextGenerationPipeline", From ed85312ccede591ee03eecc9f507463489928df6 Mon Sep 17 00:00:00 2001 From: raghavanone Date: Wed, 8 Nov 2023 19:37:44 +0530 Subject: [PATCH 043/152] Fix failures --- src/transformers/models/fast/modeling_fast.py | 4 ++-- tests/models/fast/test_image_processing_fast.py | 7 ++++--- tests/models/fast/test_modeling_fast.py | 4 ++-- tests/models/textnet/test_modeling_textnet.py | 2 +- 4 files changed, 9 insertions(+), 8 deletions(-) diff --git a/src/transformers/models/fast/modeling_fast.py b/src/transformers/models/fast/modeling_fast.py index cfd3506de0fc..b7f5f45f1f00 100644 --- a/src/transformers/models/fast/modeling_fast.py +++ b/src/transformers/models/fast/modeling_fast.py @@ -804,8 +804,8 @@ def forward( >>> # forward pass >>> outputs = model(pixel_values=inputs["pixel_values"]) >>> target_sizes = [(image.shape[1], image.shape[2]) for image in inputs["pixel_values"]] - >>> threshold = 0.88 - >>> text_locations = processor.post_process_text_detection(outputs, target_sizes, threshold) + >>> threshold = 0.85 + >>> text_locations = processor.post_process_text_detection(outputs, target_sizes, threshold, bbox_type="poly") >>> print(text_locations[0]["bboxes"][0][:10]) [484, 175, 484, 178, 483, 179, 452, 179, 452, 182] ``` diff --git a/tests/models/fast/test_image_processing_fast.py b/tests/models/fast/test_image_processing_fast.py index f8192856849b..667ce191d43a 100644 --- a/tests/models/fast/test_image_processing_fast.py +++ b/tests/models/fast/test_image_processing_fast.py @@ -18,7 +18,7 @@ import requests -from transformers.testing_utils import require_torch, require_vision +from transformers.testing_utils import require_torch, require_vision, slow from transformers.utils import is_torch_available, is_vision_available from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs @@ -137,6 +137,7 @@ def test_image_processor_from_dict_with_kwargs(self): self.assertEqual(image_processor.size, {"height": 42, "width": 42}) self.assertEqual(image_processor.crop_size, {"height": 84, "width": 84}) + @slow def test_post_process_text_detection(self): model = FastForSceneTextRecognition.from_pretrained("Raghavan/fast_base_tt_800_finetune_ic17mlt") @@ -152,8 +153,8 @@ def prepare_image(): output = model(pixel_values=torch.tensor(inputs["pixel_values"])) target_sizes = [(image.shape[1], image.shape[2]) for image in inputs["pixel_values"]] - threshold = 0.88 - final_out = 
image_processor.post_process_text_detection(output, target_sizes, threshold) + threshold = 0.85 + final_out = image_processor.post_process_text_detection(output, target_sizes, threshold, bbox_type="poly") assert len(final_out[0]["bboxes"]) == 2 assert len(final_out[0]["bboxes"][0]) == 716 diff --git a/tests/models/fast/test_modeling_fast.py b/tests/models/fast/test_modeling_fast.py index f97481436676..44168b853961 100644 --- a/tests/models/fast/test_modeling_fast.py +++ b/tests/models/fast/test_modeling_fast.py @@ -425,8 +425,8 @@ def prepare_image(): output = model(pixel_values=torch.tensor(input["pixel_values"])) target_sizes = [(image.shape[1], image.shape[2]) for image in input["pixel_values"]] - threshold = 0.88 - final_out = image_processor.post_process_text_detection(output, target_sizes, threshold) + threshold = 0.85 + final_out = image_processor.post_process_text_detection(output, target_sizes, threshold, bbox_type="poly") assert final_out[0]["bboxes"][0][:10] == [484, 175, 484, 178, 483, 179, 452, 179, 452, 182] assert round(float(final_out[0]["scores"][0]), 5) == 0.92356 diff --git a/tests/models/textnet/test_modeling_textnet.py b/tests/models/textnet/test_modeling_textnet.py index 6fb31172adc8..01337be50b7c 100644 --- a/tests/models/textnet/test_modeling_textnet.py +++ b/tests/models/textnet/test_modeling_textnet.py @@ -395,7 +395,7 @@ def test_feed_forward_chunking(self): @require_torch -class BitBackboneTest(BackboneTesterMixin, unittest.TestCase): +class TextNetBackboneTest(BackboneTesterMixin, unittest.TestCase): all_model_classes = (TextNetBackbone,) if is_torch_available() else () config_class = TextNetConfig From 3f8be4dd7d9bc8cb8be74d01a0b299bdd6f9fce2 Mon Sep 17 00:00:00 2001 From: raghavanone Date: Thu, 9 Nov 2023 06:20:03 +0530 Subject: [PATCH 044/152] Remove textnet from modeling fast --- .../models/fast/configuration_fast.py | 119 +++++++----------- .../fast/convert_fast_original_to_pytorch.py | 87 +++++++------ src/transformers/models/fast/modeling_fast.py | 118 ++++------------- .../models/textnet/configuration_textnet.py | 1 + tests/models/fast/test_modeling_fast.py | 85 +++++++------ tests/models/textnet/test_modeling_textnet.py | 2 +- 6 files changed, 164 insertions(+), 248 deletions(-) diff --git a/src/transformers/models/fast/configuration_fast.py b/src/transformers/models/fast/configuration_fast.py index 3d499d756c5e..0ed87373049e 100644 --- a/src/transformers/models/fast/configuration_fast.py +++ b/src/transformers/models/fast/configuration_fast.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
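With the post-processing signature settled above (threshold 0.85, `bbox_type="poly"`), end-to-end inference looks roughly like the sketch below; the image URL is just a convenient test picture, and it is assumed the converted checkpoint also ships its image-processor configuration:

```python
import requests
import torch
from PIL import Image

from transformers import FastForSceneTextRecognition
from transformers.models.fast.image_processing_fast import FastImageProcessor

checkpoint = "Raghavan/fast_base_tt_800_finetune_ic17mlt"
image_processor = FastImageProcessor.from_pretrained(checkpoint)
model = FastForSceneTextRecognition.from_pretrained(checkpoint)

url = "http://images.cocodataset.org/val2017/000000039769.jpg"  # any test image works
image = Image.open(requests.get(url, stream=True).raw)

inputs = image_processor(image, return_tensors="pt")
with torch.no_grad():
    outputs = model(pixel_values=inputs["pixel_values"])

# target sizes are (height, width) of the preprocessed images, as in the updated docstring
target_sizes = [(img.shape[1], img.shape[2]) for img in inputs["pixel_values"]]
results = image_processor.post_process_text_detection(outputs, target_sizes, threshold=0.85, bbox_type="poly")
print(results[0]["bboxes"][0][:10], results[0]["scores"][0])
```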
""" Fast model configuration""" -from transformers import PretrainedConfig +from transformers import CONFIG_MAPPING, PretrainedConfig from transformers.utils import logging @@ -33,42 +33,9 @@ class FastConfig(PretrainedConfig): def __init__( self, - backbone_kernel_size=3, - backbone_stride=2, - backbone_dilation=1, - backbone_groups=1, - backbone_bias=False, - backbone_has_shuffle=False, - backbone_in_channels=3, - backbone_out_channels=64, - backbone_use_bn=True, - backbone_act_func="relu", - backbone_dropout_rate=0, - backbone_ops_order="weight_bn_act", - backbone_stage1_in_channels=[64, 64, 64], - backbone_stage1_out_channels=[64, 64, 64], - backbone_stage1_kernel_size=[[3, 3], [3, 3], [3, 3]], - backbone_stage1_stride=[1, 2, 1], - backbone_stage1_dilation=[1, 1, 1], - backbone_stage1_groups=[1, 1, 1], - backbone_stage2_in_channels=[64, 128, 128, 128], - backbone_stage2_out_channels=[128, 128, 128, 128], - backbone_stage2_kernel_size=[[3, 3], [1, 3], [3, 3], [3, 1]], - backbone_stage2_stride=[2, 1, 1, 1], - backbone_stage2_dilation=[1, 1, 1, 1], - backbone_stage2_groups=[1, 1, 1, 1], - backbone_stage3_in_channels=[128, 256, 256, 256], - backbone_stage3_out_channels=[256, 256, 256, 256], - backbone_stage3_kernel_size=[[3, 3], [3, 3], [3, 1], [1, 3]], - backbone_stage3_stride=[2, 1, 1, 1], - backbone_stage3_dilation=[1, 1, 1, 1], - backbone_stage3_groups=[1, 1, 1, 1], - backbone_stage4_in_channels=[256, 512, 512, 512], - backbone_stage4_out_channels=[512, 512, 512, 512], - backbone_stage4_kernel_size=[[3, 3], [3, 1], [1, 3], [3, 3]], - backbone_stage4_stride=[2, 1, 1, 1], - backbone_stage4_dilation=[1, 1, 1, 1], - backbone_stage4_groups=[1, 1, 1, 1], + use_timm_backbone=True, + backbone_config=None, + num_channels=3, neck_in_channels=[64, 128, 256, 512], neck_out_channels=[128, 128, 128, 128], neck_kernel_size=[[3, 3], [3, 3], [3, 3], [3, 3]], @@ -96,51 +63,33 @@ def __init__( head_final_dropout_rate=0, head_final_ops_order="weight", loss_bg=False, + backbone="resnet50", + use_pretrained_backbone=True, + dilation=False, initializer_range=0.02, **kwargs, ): super().__init__(**kwargs) - self.backbone_kernel_size = backbone_kernel_size - self.backbone_stride = backbone_stride - self.backbone_dilation = backbone_dilation - self.backbone_groups = backbone_groups - self.backbone_bias = backbone_bias - self.backbone_has_shuffle = backbone_has_shuffle - self.backbone_in_channels = backbone_in_channels - self.backbone_out_channels = backbone_out_channels - self.backbone_use_bn = backbone_use_bn - self.backbone_act_func = backbone_act_func - self.backbone_dropout_rate = backbone_dropout_rate - self.backbone_ops_order = backbone_ops_order - - self.backbone_stage1_in_channels = backbone_stage1_in_channels - self.backbone_stage1_out_channels = backbone_stage1_out_channels - self.backbone_stage1_kernel_size = backbone_stage1_kernel_size - self.backbone_stage1_stride = backbone_stage1_stride - self.backbone_stage1_dilation = backbone_stage1_dilation - self.backbone_stage1_groups = backbone_stage1_groups - - self.backbone_stage2_in_channels = backbone_stage2_in_channels - self.backbone_stage2_out_channels = backbone_stage2_out_channels - self.backbone_stage2_kernel_size = backbone_stage2_kernel_size - self.backbone_stage2_stride = backbone_stage2_stride - self.backbone_stage2_dilation = backbone_stage2_dilation - self.backbone_stage2_groups = backbone_stage2_groups - - self.backbone_stage3_in_channels = backbone_stage3_in_channels - self.backbone_stage3_out_channels = backbone_stage3_out_channels - 
self.backbone_stage3_kernel_size = backbone_stage3_kernel_size - self.backbone_stage3_stride = backbone_stage3_stride - self.backbone_stage3_dilation = backbone_stage3_dilation - self.backbone_stage3_groups = backbone_stage3_groups - - self.backbone_stage4_in_channels = backbone_stage4_in_channels - self.backbone_stage4_out_channels = backbone_stage4_out_channels - self.backbone_stage4_kernel_size = backbone_stage4_kernel_size - self.backbone_stage4_stride = backbone_stage4_stride - self.backbone_stage4_dilation = backbone_stage4_dilation - self.backbone_stage4_groups = backbone_stage4_groups + if backbone_config is not None and use_timm_backbone: + raise ValueError("You can't specify both `backbone_config` and `use_timm_backbone`.") + + if not use_timm_backbone: + if backbone_config is None: + logger.info( + "`backbone_config` is `None`. Initializing the config with the default `TextNet` backbone." + ) + backbone_config = CONFIG_MAPPING["textnet"](out_features=["stage1", "stage2", "stage3", "stage4"]) + elif isinstance(backbone_config, dict): + backbone_model_type = backbone_config.get("model_type") + config_class = CONFIG_MAPPING[backbone_model_type] + backbone_config = config_class.from_dict(backbone_config) + # set timm attributes to None + dilation, backbone, use_pretrained_backbone = None, None, None + + self.use_timm_backbone = use_timm_backbone + self.backbone_config = backbone_config + self.num_channels = num_channels self.neck_in_channels = neck_in_channels self.neck_out_channels = neck_out_channels @@ -173,4 +122,20 @@ def __init__( self.head_final_ops_order = head_final_ops_order self.loss_bg = loss_bg + self.backbone = backbone + self.use_pretrained_backbone = use_pretrained_backbone + self.dilation = dilation + self.initializer_range = initializer_range + + @classmethod + def from_backbone_config(cls, backbone_config: PretrainedConfig, **kwargs): + """Instantiate a [`FastConfig`] (or a derived class) from a pre-trained backbone model configuration. + + Args: + backbone_config ([`PretrainedConfig`]): + The backbone configuration. 
+ Returns: + [`DetrConfig`]: An instance of a configuration object + """ + return cls(backbone_config=backbone_config, **kwargs) diff --git a/src/transformers/models/fast/convert_fast_original_to_pytorch.py b/src/transformers/models/fast/convert_fast_original_to_pytorch.py index 0207f123b257..c624440bc0cb 100644 --- a/src/transformers/models/fast/convert_fast_original_to_pytorch.py +++ b/src/transformers/models/fast/convert_fast_original_to_pytorch.py @@ -22,7 +22,7 @@ import torch from PIL import Image -from transformers import FastConfig, FastForSceneTextRecognition +from transformers import FastConfig, FastForSceneTextRecognition, TextNetConfig from transformers.models.fast.image_processing_fast import FastImageProcessor @@ -84,43 +84,50 @@ def prepare_config(size_config_url, pooling_size, min_area, bbox_type, loss_bg): neck_dilation.append(layer_dict["dilation"]) neck_groups.append(layer_dict["groups"]) + textnet_config = TextNetConfig( + kernel_size=config_dict["first_conv"]["kernel_size"], + stride=config_dict["first_conv"]["stride"], + dilation=config_dict["first_conv"]["dilation"], + groups=config_dict["first_conv"]["groups"], + bias=config_dict["first_conv"]["bias"], + has_shuffle=config_dict["first_conv"]["has_shuffle"], + in_channels=config_dict["first_conv"]["in_channels"], + out_channels=config_dict["first_conv"]["out_channels"], + use_bn=config_dict["first_conv"]["use_bn"], + act_func=config_dict["first_conv"]["act_func"], + dropout_rate=config_dict["first_conv"]["dropout_rate"], + ops_order=config_dict["first_conv"]["ops_order"], + stage1_in_channels=backbone_config["stage1"]["in_channels"], + stage1_out_channels=backbone_config["stage1"]["out_channels"], + stage1_kernel_size=backbone_config["stage1"]["kernel_size"], + stage1_stride=backbone_config["stage1"]["stride"], + stage1_dilation=backbone_config["stage1"]["dilation"], + stage1_groups=backbone_config["stage1"]["groups"], + stage2_in_channels=backbone_config["stage2"]["in_channels"], + stage2_out_channels=backbone_config["stage2"]["out_channels"], + stage2_kernel_size=backbone_config["stage2"]["kernel_size"], + stage2_stride=backbone_config["stage2"]["stride"], + stage2_dilation=backbone_config["stage2"]["dilation"], + stage2_groups=backbone_config["stage2"]["groups"], + stage3_in_channels=backbone_config["stage3"]["in_channels"], + stage3_out_channels=backbone_config["stage3"]["out_channels"], + stage3_kernel_size=backbone_config["stage3"]["kernel_size"], + stage3_stride=backbone_config["stage3"]["stride"], + stage3_dilation=backbone_config["stage3"]["dilation"], + stage3_groups=backbone_config["stage3"]["groups"], + stage4_in_channels=backbone_config["stage4"]["in_channels"], + stage4_out_channels=backbone_config["stage4"]["out_channels"], + stage4_kernel_size=backbone_config["stage4"]["kernel_size"], + stage4_stride=backbone_config["stage4"]["stride"], + stage4_dilation=backbone_config["stage4"]["dilation"], + stage4_groups=backbone_config["stage4"]["groups"], + out_features=["stage1", "stage2", "stage3", "stage4"], + out_indices=[1, 2, 3, 4], + ) + return FastConfig( - backbone_kernel_size=config_dict["first_conv"]["kernel_size"], - backbone_stride=config_dict["first_conv"]["stride"], - backbone_dilation=config_dict["first_conv"]["dilation"], - backbone_groups=config_dict["first_conv"]["groups"], - backbone_bias=config_dict["first_conv"]["bias"], - backbone_has_shuffle=config_dict["first_conv"]["has_shuffle"], - backbone_in_channels=config_dict["first_conv"]["in_channels"], - 
backbone_out_channels=config_dict["first_conv"]["out_channels"], - backbone_use_bn=config_dict["first_conv"]["use_bn"], - backbone_act_func=config_dict["first_conv"]["act_func"], - backbone_dropout_rate=config_dict["first_conv"]["dropout_rate"], - backbone_ops_order=config_dict["first_conv"]["ops_order"], - backbone_stage1_in_channels=backbone_config["stage1"]["in_channels"], - backbone_stage1_out_channels=backbone_config["stage1"]["out_channels"], - backbone_stage1_kernel_size=backbone_config["stage1"]["kernel_size"], - backbone_stage1_stride=backbone_config["stage1"]["stride"], - backbone_stage1_dilation=backbone_config["stage1"]["dilation"], - backbone_stage1_groups=backbone_config["stage1"]["groups"], - backbone_stage2_in_channels=backbone_config["stage2"]["in_channels"], - backbone_stage2_out_channels=backbone_config["stage2"]["out_channels"], - backbone_stage2_kernel_size=backbone_config["stage2"]["kernel_size"], - backbone_stage2_stride=backbone_config["stage2"]["stride"], - backbone_stage2_dilation=backbone_config["stage2"]["dilation"], - backbone_stage2_groups=backbone_config["stage2"]["groups"], - backbone_stage3_in_channels=backbone_config["stage3"]["in_channels"], - backbone_stage3_out_channels=backbone_config["stage3"]["out_channels"], - backbone_stage3_kernel_size=backbone_config["stage3"]["kernel_size"], - backbone_stage3_stride=backbone_config["stage3"]["stride"], - backbone_stage3_dilation=backbone_config["stage3"]["dilation"], - backbone_stage3_groups=backbone_config["stage3"]["groups"], - backbone_stage4_in_channels=backbone_config["stage4"]["in_channels"], - backbone_stage4_out_channels=backbone_config["stage4"]["out_channels"], - backbone_stage4_kernel_size=backbone_config["stage4"]["kernel_size"], - backbone_stage4_stride=backbone_config["stage4"]["stride"], - backbone_stage4_dilation=backbone_config["stage4"]["dilation"], - backbone_stage4_groups=backbone_config["stage4"]["groups"], + use_timm_backbone=False, + backbone_config=textnet_config, neck_in_channels=neck_in_channels, neck_out_channels=neck_out_channels, neck_kernel_size=neck_kernel_size, @@ -164,7 +171,7 @@ def get_base_model_config(): def convert_fast_checkpoint(checkpoint_url, checkpoint_config_url, pytorch_dump_folder_path, validate_logits): response = requests.get(checkpoint_config_url) content = response.text - + print("Got respose") namespace = {} exec(content, namespace) @@ -197,7 +204,7 @@ def convert_fast_checkpoint(checkpoint_url, checkpoint_config_url, pytorch_dump_ if "train" in data_config: if "short_size" in data_config["train"]: size = data_config["train"]["short_size"] - + print("we got config") model = FastForSceneTextRecognition(config) fast_image_processor = FastImageProcessor( size={"height": size, "width": size}, @@ -209,7 +216,7 @@ def convert_fast_checkpoint(checkpoint_url, checkpoint_config_url, pytorch_dump_ state_dict_changed = copy.deepcopy(state_dict) for key in state_dict: val = state_dict_changed.pop(key) - new_key = key.replace("module.", "") + new_key = key.replace("module.", "").replace("backbone.", "backbone.textnet.") for search, replacement in rename_key_mappings.items(): if search in new_key: new_key = new_key.replace(search, replacement) diff --git a/src/transformers/models/fast/modeling_fast.py b/src/transformers/models/fast/modeling_fast.py index b7f5f45f1f00..ce5f2aab384f 100644 --- a/src/transformers/models/fast/modeling_fast.py +++ b/src/transformers/models/fast/modeling_fast.py @@ -21,8 +21,9 @@ import torch import torch.nn as nn import torch.nn.functional as F 
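The conversion script above now wraps the backbone hyper-parameters in a `TextNetConfig` and hands it to `FastConfig`. Condensed, the composition looks like this sketch (channel and kernel lists left at their defaults for brevity):

```python
from transformers import FastConfig, TextNetConfig

textnet_config = TextNetConfig(out_features=["stage1", "stage2", "stage3", "stage4"], out_indices=[1, 2, 3, 4])

# Explicit composition, as the conversion script does it.
config = FastConfig(use_timm_backbone=False, backbone_config=textnet_config)

# The convenience constructor forwards extra kwargs to __init__; use_timm_backbone has to be
# switched off here too, because its default of True rejects an explicit backbone_config.
config = FastConfig.from_backbone_config(textnet_config, use_timm_backbone=False)
```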
+from timm import create_model -from transformers import FastConfig, PreTrainedModel, add_start_docstrings +from transformers import AutoBackbone, FastConfig, PreTrainedModel, add_start_docstrings, requires_backends from transformers.utils import ModelOutput, add_start_docstrings_to_model_forward, replace_return_docstrings @@ -341,94 +342,6 @@ def _init_weights(self, module): module.bias.data.zero_() -class FastTextNet(nn.Module): - def __init__(self, config): - super().__init__() - self.first_conv = FASTConvLayer( - config.backbone_in_channels, - config.backbone_out_channels, - config.backbone_kernel_size, - config.backbone_stride, - config.backbone_dilation, - config.backbone_groups, - config.backbone_bias, - config.backbone_has_shuffle, - config.backbone_use_bn, - config.backbone_act_func, - config.backbone_dropout_rate, - config.backbone_ops_order, - ) - stage1 = [] - for stage_config in zip( - config.backbone_stage1_in_channels, - config.backbone_stage1_out_channels, - config.backbone_stage1_kernel_size, - config.backbone_stage1_stride, - config.backbone_stage1_dilation, - config.backbone_stage1_groups, - ): - stage1.append(FASTRepConvLayer(*stage_config)) - self.stage1 = nn.ModuleList(stage1) - - stage2 = [] - for stage_config in zip( - config.backbone_stage2_in_channels, - config.backbone_stage2_out_channels, - config.backbone_stage2_kernel_size, - config.backbone_stage2_stride, - config.backbone_stage2_dilation, - config.backbone_stage2_groups, - ): - stage2.append(FASTRepConvLayer(*stage_config)) - self.stage2 = nn.ModuleList(stage2) - - stage3 = [] - for stage_config in zip( - config.backbone_stage3_in_channels, - config.backbone_stage3_out_channels, - config.backbone_stage3_kernel_size, - config.backbone_stage3_stride, - config.backbone_stage3_dilation, - config.backbone_stage3_groups, - ): - stage3.append(FASTRepConvLayer(*stage_config)) - self.stage3 = nn.ModuleList(stage3) - - stage4 = [] - for stage_config in zip( - config.backbone_stage4_in_channels, - config.backbone_stage4_out_channels, - config.backbone_stage4_kernel_size, - config.backbone_stage4_stride, - config.backbone_stage4_dilation, - config.backbone_stage4_groups, - ): - stage4.append(FASTRepConvLayer(*stage_config)) - self.stage4 = nn.ModuleList(stage4) - - def forward(self, hidden_states): - hidden_states = self.first_conv(hidden_states) - output = [] - - for block in self.stage1: - hidden_states = block(hidden_states) - output.append(hidden_states) - - for block in self.stage2: - hidden_states = block(hidden_states) - output.append(hidden_states) - - for block in self.stage3: - hidden_states = block(hidden_states) - output.append(hidden_states) - - for block in self.stage4: - hidden_states = block(hidden_states) - output.append(hidden_states) - - return output - - class FASTNeck(nn.Module): def __init__(self, config): super().__init__() @@ -729,7 +642,26 @@ class FastForSceneTextRecognitionOutput(ModelOutput): class FastForSceneTextRecognition(FastPreTrainedModel): def __init__(self, config): super().__init__(config) - self.backbone = FastTextNet(config=config) + # self.backbone = FastTextNet(config=config) + self.config = config + + if config.use_timm_backbone: + requires_backends(self, ["timm"]) + kwargs = {} + if config.dilation: + kwargs["output_stride"] = 16 + backbone = create_model( + config.backbone, + pretrained=config.use_pretrained_backbone, + features_only=True, + out_indices=(1, 2, 3, 4), + in_chans=config.num_channels, + **kwargs, + ) + else: + backbone = 
AutoBackbone.from_config(config.backbone_config) + + self.backbone = backbone self.neck = FASTNeck(config=config) self.det_head = FASTHead(config=config) self.loss_bg = config.loss_bg @@ -812,9 +744,11 @@ def forward( """ # outputs = {} return_dict = return_dict if return_dict is not None else self.config.use_return_dict - hidden_states = self.backbone(pixel_values) + features = ( + self.backbone(pixel_values) if self.config.use_timm_backbone else self.backbone(pixel_values).feature_maps + ) - hidden_states = self.neck(hidden_states) + hidden_states = self.neck(features) text_detection_output = self.det_head(hidden_states) diff --git a/src/transformers/models/textnet/configuration_textnet.py b/src/transformers/models/textnet/configuration_textnet.py index 6bcb961a0f97..1f059550f50a 100644 --- a/src/transformers/models/textnet/configuration_textnet.py +++ b/src/transformers/models/textnet/configuration_textnet.py @@ -31,6 +31,7 @@ class TextNetConfig(BackboneConfigMixin, PretrainedConfig): r""" [Raghavan/fast_base_tt_800_finetune_ic17mlt](https://huggingface.co/Raghavan/fast_base_tt_800_finetune_ic17mlt) """ + model_type = "textnet" def __init__( self, diff --git a/tests/models/fast/test_modeling_fast.py b/tests/models/fast/test_modeling_fast.py index 44168b853961..b39a7b8e7e88 100644 --- a/tests/models/fast/test_modeling_fast.py +++ b/tests/models/fast/test_modeling_fast.py @@ -21,12 +21,14 @@ from transformers import ( FastConfig, + TextNetConfig, is_torch_available, ) from transformers.models.fast.image_processing_fast import FastImageProcessor from transformers.testing_utils import ( require_torch, require_vision, + slow, torch_device, ) @@ -203,43 +205,50 @@ def prepare_config_and_inputs(self): return config, {"pixel_values": pixel_values} def get_config(self): + textnet_config = TextNetConfig( + kernel_size=self.backbone_kernel_size, + stride=self.backbone_stride, + dilation=self.backbone_dilation, + groups=self.backbone_groups, + bias=self.backbone_bias, + has_shuffle=self.backbone_has_shuffle, + in_channels=self.backbone_in_channels, + out_channels=self.backbone_out_channels, + use_bn=self.backbone_use_bn, + act_func=self.backbone_act_func, + dropout_rate=self.backbone_dropout_rate, + ops_order=self.backbone_ops_order, + stage1_in_channels=self.backbone_stage1_in_channels, + stage1_out_channels=self.backbone_stage1_out_channels, + stage1_kernel_size=self.backbone_stage1_kernel_size, + stage1_stride=self.backbone_stage1_stride, + stage1_dilation=self.backbone_stage1_dilation, + stage1_groups=self.backbone_stage1_groups, + stage2_in_channels=self.backbone_stage2_in_channels, + stage2_out_channels=self.backbone_stage2_out_channels, + stage2_kernel_size=self.backbone_stage2_kernel_size, + stage2_stride=self.backbone_stage2_stride, + stage2_dilation=self.backbone_stage2_dilation, + stage2_groups=self.backbone_stage2_groups, + stage3_in_channels=self.backbone_stage3_in_channels, + stage3_out_channels=self.backbone_stage3_out_channels, + stage3_kernel_size=self.backbone_stage3_kernel_size, + stage3_stride=self.backbone_stage3_stride, + stage3_dilation=self.backbone_stage3_dilation, + stage3_groups=self.backbone_stage3_groups, + stage4_in_channels=self.backbone_stage4_in_channels, + stage4_out_channels=self.backbone_stage4_out_channels, + stage4_kernel_size=self.backbone_stage4_kernel_size, + stage4_stride=self.backbone_stage4_stride, + stage4_dilation=self.backbone_stage4_dilation, + stage4_groups=self.backbone_stage4_groups, + out_features=["stage1", "stage2", "stage3", "stage4"], + 
out_indices=[1, 2, 3, 4], + ) + return FastConfig( - backbone_kernel_size=self.backbone_kernel_size, - backbone_stride=self.backbone_stride, - backbone_dilation=self.backbone_dilation, - backbone_groups=self.backbone_groups, - backbone_bias=self.backbone_bias, - backbone_has_shuffle=self.backbone_has_shuffle, - backbone_in_channels=self.backbone_in_channels, - backbone_out_channels=self.backbone_out_channels, - backbone_use_bn=self.backbone_use_bn, - backbone_act_func=self.backbone_act_func, - backbone_dropout_rate=self.backbone_dropout_rate, - backbone_ops_order=self.backbone_ops_order, - backbone_stage1_in_channels=self.backbone_stage1_in_channels, - backbone_stage1_out_channels=self.backbone_stage1_out_channels, - backbone_stage1_kernel_size=self.backbone_stage1_kernel_size, - backbone_stage1_stride=self.backbone_stage1_stride, - backbone_stage1_dilation=self.backbone_stage1_dilation, - backbone_stage1_groups=self.backbone_stage1_groups, - backbone_stage2_in_channels=self.backbone_stage2_in_channels, - backbone_stage2_out_channels=self.backbone_stage2_out_channels, - backbone_stage2_kernel_size=self.backbone_stage2_kernel_size, - backbone_stage2_stride=self.backbone_stage2_stride, - backbone_stage2_dilation=self.backbone_stage2_dilation, - backbone_stage2_groups=self.backbone_stage2_groups, - backbone_stage3_in_channels=self.backbone_stage3_in_channels, - backbone_stage3_out_channels=self.backbone_stage3_out_channels, - backbone_stage3_kernel_size=self.backbone_stage3_kernel_size, - backbone_stage3_stride=self.backbone_stage3_stride, - backbone_stage3_dilation=self.backbone_stage3_dilation, - backbone_stage3_groups=self.backbone_stage3_groups, - backbone_stage4_in_channels=self.backbone_stage4_in_channels, - backbone_stage4_out_channels=self.backbone_stage4_out_channels, - backbone_stage4_kernel_size=self.backbone_stage4_kernel_size, - backbone_stage4_stride=self.backbone_stage4_stride, - backbone_stage4_dilation=self.backbone_stage4_dilation, - backbone_stage4_groups=self.backbone_stage4_groups, + use_timm_backbone=False, + backbone_config=textnet_config, neck_in_channels=self.neck_in_channels, neck_out_channels=self.neck_out_channels, neck_kernel_size=self.neck_kernel_size, @@ -387,7 +396,7 @@ def test_model_is_small(self): @require_torch @require_vision class FastModelIntegrationTest(unittest.TestCase): - # @slow + @slow def test_inference_fast_tiny_ic17mlt_model(self): model = FastForSceneTextRecognition.from_pretrained("Raghavan/ic17mlt_Fast_T") @@ -409,7 +418,7 @@ def prepare_image(): assert final_out[0]["bboxes"][0] == [224, 120, 246, 120, 246, 134, 224, 134] assert round(float(final_out[0]["scores"][0]), 5) == 0.95541 - # @slow + @slow def test_inference_fast_base_800_total_text_ic17mlt_model(self): model = FastForSceneTextRecognition.from_pretrained("Raghavan/fast_base_tt_800_finetune_ic17mlt") diff --git a/tests/models/textnet/test_modeling_textnet.py b/tests/models/textnet/test_modeling_textnet.py index 01337be50b7c..a95c072fba95 100644 --- a/tests/models/textnet/test_modeling_textnet.py +++ b/tests/models/textnet/test_modeling_textnet.py @@ -16,6 +16,7 @@ import inspect import unittest +from transformers import TextNetConfig from transformers.testing_utils import ( require_torch, torch_device, @@ -34,7 +35,6 @@ from transformers import ( TextNetBackbone, - TextNetConfig, TextNetForImageClassification, TextNetModel, is_torch_available, From 45ebd1eb8769df5842bbeb07ab5b46dacdf1e305 Mon Sep 17 00:00:00 2001 From: raghavanone Date: Thu, 9 Nov 2023 08:14:37 +0530 Subject: [PATCH 
045/152] Fix build and other things --- .../models/auto/image_processing_auto.py | 1 + .../models/fast/configuration_fast.py | 76 ++++++++++++++++++- .../fast/convert_fast_original_to_pytorch.py | 2 - .../models/fast/image_processing_fast.py | 4 - src/transformers/models/fast/modeling_fast.py | 17 ++++- .../utils/dummy_vision_objects.py | 7 ++ 6 files changed, 97 insertions(+), 10 deletions(-) diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index 55a128fe5519..2fac0833c940 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -104,6 +104,7 @@ ("swin2sr", "Swin2SRImageProcessor"), ("swinv2", "ViTImageProcessor"), ("table-transformer", "DetrImageProcessor"), + ("textnet", "TextNetImageProcessor"), ("timesformer", "VideoMAEImageProcessor"), ("tvlt", "TvltImageProcessor"), ("tvp", "TvpImageProcessor"), diff --git a/src/transformers/models/fast/configuration_fast.py b/src/transformers/models/fast/configuration_fast.py index 0ed87373049e..ce5e05b319f8 100644 --- a/src/transformers/models/fast/configuration_fast.py +++ b/src/transformers/models/fast/configuration_fast.py @@ -27,6 +27,78 @@ class FastConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`FastForSceneTextRecognition`]. It is used to + instantiate a FastForSceneTextRecognition model according to the specified arguments, defining the model + architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the + FastForSceneTextRecognition. + [Raghavan/fast_base_tt_800_finetune_ic17mlt](https://huggingface.co/Raghavan/fast_base_tt_800_finetune_ic17mlt) + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + use_timm_backbone (`bool`, *optional*, defaults to `True`): + Whether or not to use the `timm` library for the backbone. If set to `False`, will use the [`AutoBackbone`] + API. + backbone_config (`PretrainedConfig` or `dict`, *optional*): + The configuration of the backbone model. Only used in case `use_timm_backbone` is set to `False` in which + case it will default to `ResNetConfig()`. + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. 
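The `("textnet", "TextNetImageProcessor")` entry added to `image_processing_auto.py` above is what lets the auto class resolve TextNet checkpoints by `model_type`. A minimal, hypothetical usage sketch (the repo id below is illustrative, not a real checkpoint):

```python
# Illustrative only: once this mapping is in place, any checkpoint whose config.json
# carries model_type="textnet" resolves to TextNetImageProcessor automatically.
from transformers import AutoImageProcessor

image_processor = AutoImageProcessor.from_pretrained("username/textnet-checkpoint")  # hypothetical repo id
```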
+ neck_in_channels (`List[int]`, *optional*, defaults to `[64, 128, 256, 512]`): + neck_out_channels (`List[int]`, *optional*, defaults to `[128, 128, 128, 128]`): + neck_kernel_size (`List[int]`, *optional*, defaults to `[[3, 3], [3, 3], [3, 3], [3, 3]]`): + neck_stride (`List[int]`, *optional*, defaults to `[1, 1, 1, 1]`): + neck_dilation (`List[int]`, *optional*, defaults to `[1, 1, 1, 1]`): + neck_groups (`List[int]`, *optional*, defaults to `[1, 1, 1, 1]`): + head_pooling_size (`int`, *optional*, defaults to 9): + head_dropout_ratio (`int`, *optional*, defaults to 0): + head_conv_in_channels (`int`, *optional*, defaults to 512): + head_conv_out_channels (`int`, *optional*, defaults to 128): + head_conv_kernel_size (`List[int]`, *optional*, defaults to `[3, 3]`): + head_conv_stride (`int`, *optional*, defaults to 1): + head_conv_dilation (`int`, *optional*, defaults to 1): + head_conv_groups (`int`, *optional*, defaults to 1): + head_final_kernel_size (`int`, *optional*, defaults to 1): + head_final_stride (`int`, *optional*, defaults to 1): + head_final_dilation (`int`, *optional*, defaults to 1): + head_final_groups (`int`, *optional*, defaults to 1): + head_final_bias (`bool`, *optional*, defaults to `False`): + head_final_has_shuffle (`bool`, *optional*, defaults to `False`): + head_final_in_channels (`int`, *optional*, defaults to 128): + head_final_out_channels (`int`, *optional*, defaults to 5): + head_final_use_bn (`bool`, *optional*, defaults to `False`): + head_final_act_func (`str`, *optional*): + head_final_dropout_rate (`int`, *optional*, defaults to 0): + head_final_ops_order (`str`, *optional*, defaults to `"weight"`): + loss_bg (`bool`, *optional*, defaults to `False`): + backbone (`str`, *optional*, defaults to `"textnet"`): + Name of convolutional backbone to use in case `use_timm_backbone` = `True`. Supports any convolutional + backbone from the timm package. For a list of all available models, see [this + page](https://rwightman.github.io/pytorch-image-models/#load-a-pretrained-model). + use_pretrained_backbone (`bool`, *optional*, defaults to `True`): + Whether to use pretrained weights for the backbone. Only supported when `use_timm_backbone` = `True`. + dilation (`bool`, *optional*, defaults to `False`): + Whether to replace stride with dilation in the last convolutional block (DC5). Only supported when + `use_timm_backbone` = `True`. + initializer_range (`float`, *optional*, defaults to 0.02): + Relative weight of the generalized IoU loss of the bounding box in the Hungarian matching cost. + + Examples: + + ```python + >>> from transformers import FastConfig, FastForSceneTextRecognition + + >>> # Initializing a Fast Config + >>> configuration = FastConfig() + + >>> # Initializing a model (with random weights) + >>> model = FastForSceneTextRecognition(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" r""" [Raghavan/fast_base_tt_800_finetune_ic17mlt](https://huggingface.co/Raghavan/fast_base_tt_800_finetune_ic17mlt) """ @@ -63,7 +135,7 @@ def __init__( head_final_dropout_rate=0, head_final_ops_order="weight", loss_bg=False, - backbone="resnet50", + backbone="textnet", use_pretrained_backbone=True, dilation=False, initializer_range=0.02, @@ -136,6 +208,6 @@ def from_backbone_config(cls, backbone_config: PretrainedConfig, **kwargs): backbone_config ([`PretrainedConfig`]): The backbone configuration. 
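A hypothetical call to `from_backbone_config`, mirroring how the tests in this series build the nested config; the `out_features` values follow the test helpers, and any extra kwargs are simply forwarded to `FastConfig.__init__`:

```python
from transformers import FastConfig, TextNetConfig

# Hypothetical usage of the classmethod documented above.
backbone_config = TextNetConfig(out_features=["stage1", "stage2", "stage3", "stage4"])
config = FastConfig.from_backbone_config(backbone_config, use_timm_backbone=False)
```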
Returns: - [`DetrConfig`]: An instance of a configuration object + [`FastConfig`]: An instance of a configuration object """ return cls(backbone_config=backbone_config, **kwargs) diff --git a/src/transformers/models/fast/convert_fast_original_to_pytorch.py b/src/transformers/models/fast/convert_fast_original_to_pytorch.py index c624440bc0cb..c7a8e622aae7 100644 --- a/src/transformers/models/fast/convert_fast_original_to_pytorch.py +++ b/src/transformers/models/fast/convert_fast_original_to_pytorch.py @@ -171,7 +171,6 @@ def get_base_model_config(): def convert_fast_checkpoint(checkpoint_url, checkpoint_config_url, pytorch_dump_folder_path, validate_logits): response = requests.get(checkpoint_config_url) content = response.text - print("Got respose") namespace = {} exec(content, namespace) @@ -204,7 +203,6 @@ def convert_fast_checkpoint(checkpoint_url, checkpoint_config_url, pytorch_dump_ if "train" in data_config: if "short_size" in data_config["train"]: size = data_config["train"]["short_size"] - print("we got config") model = FastForSceneTextRecognition(config) fast_image_processor = FastImageProcessor( size={"height": size, "width": size}, diff --git a/src/transformers/models/fast/image_processing_fast.py b/src/transformers/models/fast/image_processing_fast.py index ae4505d4fffa..eb5020195f2d 100644 --- a/src/transformers/models/fast/image_processing_fast.py +++ b/src/transformers/models/fast/image_processing_fast.py @@ -94,8 +94,6 @@ class FastImageProcessor(BaseImageProcessor): number of channels of the image. Can be overridden by the `image_std` parameter in the `preprocess` method. min_area (`int`, *optional*, defaults to 200): Threshold for min area for results - bbox_type (`str`, *optional*, defaults to `"rect"`): - Type of bbox, rect or poly pooling_size (`int`, *optional*, defaults to 9): Pooling size for text detection """ @@ -115,7 +113,6 @@ def __init__( image_mean: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None, min_area: int = 200, - bbox_type: str = "rect", pooling_size: int = 9, **kwargs, ) -> None: @@ -136,7 +133,6 @@ def __init__( self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD self.min_area = min_area # self.threshold = threshold - self.bbox_type = bbox_type self.pooling_size = pooling_size @classmethod diff --git a/src/transformers/models/fast/modeling_fast.py b/src/transformers/models/fast/modeling_fast.py index ce5f2aab384f..ba9460e2d029 100644 --- a/src/transformers/models/fast/modeling_fast.py +++ b/src/transformers/models/fast/modeling_fast.py @@ -21,9 +21,22 @@ import torch import torch.nn as nn import torch.nn.functional as F -from timm import create_model -from transformers import AutoBackbone, FastConfig, PreTrainedModel, add_start_docstrings, requires_backends +from ...utils import is_timm_available + + +if is_timm_available(): + from timm import create_model + + +from transformers import ( + AutoBackbone, + FastConfig, + PreTrainedModel, + add_start_docstrings, + is_timm_available, + requires_backends, +) from transformers.utils import ModelOutput, add_start_docstrings_to_model_forward, replace_return_docstrings diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py index 18c6a27bd7dc..4ee5d2c9c296 100644 --- a/src/transformers/utils/dummy_vision_objects.py +++ b/src/transformers/utils/dummy_vision_objects.py @@ -198,6 +198,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["vision"]) +class 
FastImageProcessor(metaclass=DummyObject): + _backends = ["vision"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["vision"]) + + class FlavaFeatureExtractor(metaclass=DummyObject): _backends = ["vision"] From 5c6dbaf32f24d63d115a8feee7e20b9ba1ebd198 Mon Sep 17 00:00:00 2001 From: raghavanone Date: Thu, 9 Nov 2023 08:56:40 +0530 Subject: [PATCH 046/152] Fix build --- src/transformers/__init__.py | 1 + src/transformers/models/fast/configuration_fast.py | 4 ++-- src/transformers/models/fast/modeling_fast.py | 1 - 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 7cfffec8463b..5c0d2bed5b5f 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -1279,6 +1279,7 @@ _import_structure["models.dpt"].extend(["DPTFeatureExtractor", "DPTImageProcessor"]) _import_structure["models.efficientformer"].append("EfficientFormerImageProcessor") _import_structure["models.efficientnet"].append("EfficientNetImageProcessor") + _import_structure["models.fast"].extend(["FastImageProcessor"]) _import_structure["models.flava"].extend(["FlavaFeatureExtractor", "FlavaImageProcessor", "FlavaProcessor"]) _import_structure["models.fuyu"].extend(["FuyuImageProcessor", "FuyuProcessor"]) _import_structure["models.glpn"].extend(["GLPNFeatureExtractor", "GLPNImageProcessor"]) diff --git a/src/transformers/models/fast/configuration_fast.py b/src/transformers/models/fast/configuration_fast.py index ce5e05b319f8..e0f88dabe16f 100644 --- a/src/transformers/models/fast/configuration_fast.py +++ b/src/transformers/models/fast/configuration_fast.py @@ -73,7 +73,7 @@ class FastConfig(PretrainedConfig): head_final_dropout_rate (`int`, *optional*, defaults to 0): head_final_ops_order (`str`, *optional*, defaults to `"weight"`): loss_bg (`bool`, *optional*, defaults to `False`): - backbone (`str`, *optional*, defaults to `"textnet"`): + backbone (`str`, *optional*, defaults to `"resnet50"`): Name of convolutional backbone to use in case `use_timm_backbone` = `True`. Supports any convolutional backbone from the timm package. For a list of all available models, see [this page](https://rwightman.github.io/pytorch-image-models/#load-a-pretrained-model). 
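The dummy `FastImageProcessor` object and the guarded `timm` import from the previous hunks both follow the library's soft-dependency pattern: optional packages are only imported when available, and user-facing classes fall back to placeholders that raise an informative error. A rough, self-contained sketch of the import side (helper names as in the diff; the error path in the real model goes through `requires_backends`):

```python
from transformers.utils import is_timm_available

if is_timm_available():
    from timm import create_model  # only imported when the optional dependency is present


def load_timm_backbone(name: str, pretrained: bool = True):
    # The explicit check here is just to keep the sketch self-contained.
    if not is_timm_available():
        raise ImportError("`use_timm_backbone=True` requires the optional `timm` package.")
    return create_model(name, pretrained=pretrained, features_only=True)
```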
@@ -135,7 +135,7 @@ def __init__( head_final_dropout_rate=0, head_final_ops_order="weight", loss_bg=False, - backbone="textnet", + backbone="resnet50", use_pretrained_backbone=True, dilation=False, initializer_range=0.02, diff --git a/src/transformers/models/fast/modeling_fast.py b/src/transformers/models/fast/modeling_fast.py index ba9460e2d029..644bf67ca4f4 100644 --- a/src/transformers/models/fast/modeling_fast.py +++ b/src/transformers/models/fast/modeling_fast.py @@ -655,7 +655,6 @@ class FastForSceneTextRecognitionOutput(ModelOutput): class FastForSceneTextRecognition(FastPreTrainedModel): def __init__(self, config): super().__init__(config) - # self.backbone = FastTextNet(config=config) self.config = config if config.use_timm_backbone: From 643ccacda1021d769a02b3d0029a741cbb7450dd Mon Sep 17 00:00:00 2001 From: raghavanone Date: Thu, 9 Nov 2023 10:16:15 +0530 Subject: [PATCH 047/152] some cleanups --- .../models/fast/configuration_fast.py | 37 +++++++++++--- src/transformers/models/fast/modeling_fast.py | 51 +++---------------- .../models/textnet/configuration_textnet.py | 6 --- .../models/textnet/modeling_textnet.py | 32 ++---------- tests/models/fast/test_modeling_fast.py | 20 +++----- 5 files changed, 48 insertions(+), 98 deletions(-) diff --git a/src/transformers/models/fast/configuration_fast.py b/src/transformers/models/fast/configuration_fast.py index e0f88dabe16f..5bfc9ee6fb2a 100644 --- a/src/transformers/models/fast/configuration_fast.py +++ b/src/transformers/models/fast/configuration_fast.py @@ -47,32 +47,59 @@ class FastConfig(PretrainedConfig): num_channels (`int`, *optional*, defaults to 3): The number of input channels. neck_in_channels (`List[int]`, *optional*, defaults to `[64, 128, 256, 512]`): + Denotes the in channels of FASTRepConvLayer in neck module. neck_out_channels (`List[int]`, *optional*, defaults to `[128, 128, 128, 128]`): + Denotes the out channels of FASTRepConvLayer in neck module. Should be of same length of `neck_in_channels` neck_kernel_size (`List[int]`, *optional*, defaults to `[[3, 3], [3, 3], [3, 3], [3, 3]]`): + Denotes the kernel_size of FASTRepConvLayer in neck module. Should be of same length of `neck_in_channels` neck_stride (`List[int]`, *optional*, defaults to `[1, 1, 1, 1]`): + Denotes the neck_stride of FASTRepConvLayer in neck module. Should be of same length of `neck_in_channels` neck_dilation (`List[int]`, *optional*, defaults to `[1, 1, 1, 1]`): + Denotes the neck_dilation of FASTRepConvLayer in neck module. Should be of same length of + `neck_in_channels` neck_groups (`List[int]`, *optional*, defaults to `[1, 1, 1, 1]`): + Denotes the groups of FASTRepConvLayer in neck module. Should be of same length of `neck_in_channels` head_pooling_size (`int`, *optional*, defaults to 9): + Denotes the pooling size of head layer head_dropout_ratio (`int`, *optional*, defaults to 0): + Denotes the dropout ratio used in dropout layer of head layer.. head_conv_in_channels (`int`, *optional*, defaults to 512): + Denotes the in channels of first conv layer in head layer. head_conv_out_channels (`int`, *optional*, defaults to 128): + Denotes the out channels of first conv layer in head layer. head_conv_kernel_size (`List[int]`, *optional*, defaults to `[3, 3]`): + Denotes the conv kernel size of first conv layer in head layer. head_conv_stride (`int`, *optional*, defaults to 1): + Denotes the conv stride of first conv layer in head layer. 
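Because each `neck_*` entry is consumed positionally, all of these lists must share one length. A simplified sketch of how the neck constructor zips them (a plain `Conv2d` stands in for `FASTRepConvLayer` here; the zip itself mirrors `FASTNeck` later in the series):

```python
import torch.nn as nn

# Simplified sketch of how the per-layer neck lists are consumed.
def build_neck(config):
    layers = []
    for in_channels, out_channels, kernel_size, stride in zip(
        config.neck_in_channels, config.neck_out_channels, config.neck_kernel_size, config.neck_stride
    ):
        layers.append(
            nn.Conv2d(
                in_channels,
                out_channels,
                kernel_size=tuple(kernel_size),
                stride=stride,
                padding=(kernel_size[0] // 2, kernel_size[1] // 2),
                bias=False,
            )
        )
    return nn.ModuleList(layers)
```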
head_conv_dilation (`int`, *optional*, defaults to 1): + Denotes the conv dilation of first conv layer in head layer. head_conv_groups (`int`, *optional*, defaults to 1): + Denotes the conv groups of first conv layer in head layer. head_final_kernel_size (`int`, *optional*, defaults to 1): + Denotes the conv kernel size of final conv layer in head layer. head_final_stride (`int`, *optional*, defaults to 1): + Denotes the conv stride of final conv layer in head layer. head_final_dilation (`int`, *optional*, defaults to 1): + Denotes the conv dilation of final conv layer in head layer. head_final_groups (`int`, *optional*, defaults to 1): + Denotes the conv groups of final conv layer in head layer. head_final_bias (`bool`, *optional*, defaults to `False`): + Denotes the conv bais of final conv layer in head layer. head_final_has_shuffle (`bool`, *optional*, defaults to `False`): + Denotes the conv shuffle of final conv layer in head layer. head_final_in_channels (`int`, *optional*, defaults to 128): + Denotes the in channels of final conv layer in head layer. head_final_out_channels (`int`, *optional*, defaults to 5): - head_final_use_bn (`bool`, *optional*, defaults to `False`): + Denotes the out channels of final conv layer in head layer. + head_final_use_batch_norm (`bool`, *optional*, defaults to `False`): + Denotes to use or not to use batch norm of final conv layer in head layer. head_final_act_func (`str`, *optional*): + Denotes to activation function of final conv layer in head layer. head_final_dropout_rate (`int`, *optional*, defaults to 0): + Denotes to dropout_rate of dropout layer of final conv layer in head layer. head_final_ops_order (`str`, *optional*, defaults to `"weight"`): - loss_bg (`bool`, *optional*, defaults to `False`): + Denotes to dropout_rate of dropout layer of final conv layer in head layer. + loss_bg (``, *optional*, defaults to `False`): backbone (`str`, *optional*, defaults to `"resnet50"`): Name of convolutional backbone to use in case `use_timm_backbone` = `True`. Supports any convolutional backbone from the timm package. 
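Taken together, the `head_conv_*` and `head_final_*` values documented above describe a small two-stage detection head. A toy stand-in using the documented defaults (the real `FASTHead` uses a re-parameterizable conv plus pooling and upsampling, so this is only an approximation of the shapes involved):

```python
import torch
import torch.nn as nn

# Toy stand-in for the detection head with the documented defaults: a 3x3 conv
# (512 -> 128), dropout, then a 1x1 projection to head_final_out_channels=5 maps.
toy_head = nn.Sequential(
    nn.Conv2d(512, 128, kernel_size=3, padding=1, bias=False),  # head_conv_*
    nn.Dropout2d(p=0.0),                                        # head_dropout_ratio
    nn.Conv2d(128, 5, kernel_size=1, bias=False),               # head_final_*
)

features = torch.randn(1, 512, 160, 160)  # e.g. concatenated neck feature maps
print(toy_head(features).shape)  # torch.Size([1, 5, 160, 160])
```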
For a list of all available models, see [this @@ -130,7 +157,7 @@ def __init__( head_final_has_shuffle=False, head_final_in_channels=128, head_final_out_channels=5, - head_final_use_bn=False, + head_final_use_batch_norm=False, head_final_act_func=None, head_final_dropout_rate=0, head_final_ops_order="weight", @@ -188,10 +215,6 @@ def __init__( self.head_final_has_shuffle = head_final_has_shuffle self.head_final_in_channels = head_final_in_channels self.head_final_out_channels = head_final_out_channels - self.head_final_use_bn = head_final_use_bn - self.head_final_act_func = head_final_act_func - self.head_final_dropout_rate = head_final_dropout_rate - self.head_final_ops_order = head_final_ops_order self.loss_bg = loss_bg self.backbone = backbone diff --git a/src/transformers/models/fast/modeling_fast.py b/src/transformers/models/fast/modeling_fast.py index 644bf67ca4f4..207f1115bc8b 100644 --- a/src/transformers/models/fast/modeling_fast.py +++ b/src/transformers/models/fast/modeling_fast.py @@ -68,27 +68,12 @@ def get_same_padding(kernel_size): if isinstance(kernel_size, tuple): - p1 = get_same_padding(kernel_size[0]) - p2 = get_same_padding(kernel_size[1]) - return p1, p2 + padding1 = get_same_padding(kernel_size[0]) + padding2 = get_same_padding(kernel_size[1]) + return padding1, padding2 return kernel_size // 2 -def build_activation(act_func, inplace=True): - if act_func == "relu": - return nn.ReLU(inplace=inplace) - elif act_func == "relu6": - return nn.ReLU6(inplace=inplace) - elif act_func == "tanh": - return nn.Tanh() - elif act_func == "sigmoid": - return nn.Sigmoid() - elif act_func is None: - return None - else: - raise ValueError("do not support: %s" % act_func) - - class FASTConvLayer(nn.Module): def __init__( self, @@ -100,10 +85,6 @@ def __init__( groups=1, bias=False, has_shuffle=False, - use_batch_norm=True, - act_func="relu", - dropout_rate=0, - use_act=True, ): super().__init__() @@ -113,7 +94,6 @@ def __init__( self.groups = groups self.bias = bias self.has_shuffle = has_shuffle - self.activation_function = act_func padding = get_same_padding(self.kernel_size) if isinstance(padding, int): @@ -132,29 +112,17 @@ def __init__( groups=groups, bias=bias, ) - self.batch_norm = nn.Identity() - if use_batch_norm: - self.batch_norm = nn.BatchNorm2d(out_channels) - - self.activation = nn.Identity() - if use_act: - act = build_activation(self.activation_function, True) - if act is not None: - self.activation = act def forward(self, hidden_states): if self.training: if hasattr(self, "fused_conv"): delattr(self, "fused_conv") hidden_states = self.conv(hidden_states) - hidden_states = self.batch_norm(hidden_states) - return self.activation(hidden_states) + return hidden_states else: if not hasattr(self, "fused_conv"): - setattr(self, "fused_conv", self.fuse_conv_batch_norm(self.conv, self.batch_norm)) + setattr(self, "fused_conv", self.fuse_conv_batch_norm(self.conv, nn.Identity())) hidden_states = self.fused_conv(hidden_states) - if self.activation is not None: - hidden_states = self.activation(hidden_states) return hidden_states def fuse_conv_batch_norm(self, conv, batch_norm): @@ -411,10 +379,6 @@ def __init__(self, config): config.head_final_groups, config.head_final_bias, config.head_final_has_shuffle, - config.head_final_use_bn, - config.head_final_act_func, - config.head_final_dropout_rate, - config.head_final_ops_order, ) self.pooling_size = config.head_pooling_size @@ -519,7 +483,7 @@ def emb_loss( return loss -def emb_loss_batch(emb, instance, kernel, training_mask, 
reduce=True, loss_weight=0.25, bg_sample=False): +def emb_loss_batch(emb, instance, kernel, training_mask, reduce=True, loss_weight=0.25): loss_batch = emb.new_zeros((emb.size(0)), dtype=torch.float32) for i in range(loss_batch.size(0)): @@ -676,7 +640,6 @@ def __init__(self, config): self.backbone = backbone self.neck = FASTNeck(config=config) self.det_head = FASTHead(config=config) - self.loss_bg = config.loss_bg self.pooling_1s = nn.MaxPool2d( kernel_size=config.head_pooling_size, stride=1, padding=(config.head_pooling_size - 1) // 2 @@ -714,7 +677,7 @@ def loss(self, hidden, labels): loss_kernel = dice_loss_with_masks(kernels, gt_kernels, selected_masks, reduce=False) loss_kernel = torch.mean(loss_kernel, dim=0) - loss_emb = emb_loss_batch(embs, gt_instances, gt_kernels, training_masks, reduce=False, bg_sample=self.loss_bg) + loss_emb = emb_loss_batch(embs, gt_instances, gt_kernels, training_masks, reduce=False) return torch.mean(loss_text) + torch.mean(loss_kernel) + torch.mean(loss_emb) diff --git a/src/transformers/models/textnet/configuration_textnet.py b/src/transformers/models/textnet/configuration_textnet.py index 1f059550f50a..4ad83123f4ff 100644 --- a/src/transformers/models/textnet/configuration_textnet.py +++ b/src/transformers/models/textnet/configuration_textnet.py @@ -43,10 +43,7 @@ def __init__( has_shuffle=False, in_channels=3, out_channels=64, - use_bn=True, act_func="relu", - dropout_rate=0, - ops_order="weight_bn_act", stage1_in_channels=[64, 64, 64], stage1_out_channels=[64, 64, 64], stage1_kernel_size=[[3, 3], [3, 3], [3, 3]], @@ -87,10 +84,7 @@ def __init__( self.has_shuffle = has_shuffle self.in_channels = in_channels self.out_channels = out_channels - self.use_bn = use_bn self.act_func = act_func - self.dropout_rate = dropout_rate - self.ops_order = ops_order self.stage1_in_channels = stage1_in_channels self.stage1_out_channels = stage1_out_channels diff --git a/src/transformers/models/textnet/modeling_textnet.py b/src/transformers/models/textnet/modeling_textnet.py index 72950f0776ec..8e9cd5335569 100644 --- a/src/transformers/models/textnet/modeling_textnet.py +++ b/src/transformers/models/textnet/modeling_textnet.py @@ -22,6 +22,7 @@ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from transformers import PreTrainedModel, add_start_docstrings +from transformers.activations import ACT2CLS from transformers.modeling_outputs import ( BackboneOutput, BaseModelOutputWithPoolingAndNoAttention, @@ -74,21 +75,6 @@ def get_same_padding(kernel_size): return kernel_size // 2 -def build_activation(act_func, inplace=True): - if act_func == "relu": - return nn.ReLU(inplace=inplace) - elif act_func == "relu6": - return nn.ReLU6(inplace=inplace) - elif act_func == "tanh": - return nn.Tanh() - elif act_func == "sigmoid": - return nn.Sigmoid() - elif act_func is None: - return None - else: - raise ValueError("do not support: %s" % act_func) - - class TextNetConvLayer(nn.Module): def __init__( self, @@ -100,10 +86,7 @@ def __init__( groups=1, bias=False, has_shuffle=False, - use_batch_norm=True, act_func="relu", - dropout_rate=0, - use_act=True, ): super().__init__() @@ -133,14 +116,12 @@ def __init__( bias=bias, ) self.batch_norm = nn.Identity() - if use_batch_norm: - self.batch_norm = nn.BatchNorm2d(out_channels) + + self.batch_norm = nn.BatchNorm2d(out_channels) self.activation = nn.Identity() - if use_act: - act = build_activation(self.activation_function, True) - if act is not None: - self.activation = act + if self.activation_function is not None: + 
self.activation = ACT2CLS[self.activation_function](inplace=True) def forward(self, hidden_states): if self.training: @@ -371,10 +352,7 @@ def __init__(self, config): config.groups, config.bias, config.has_shuffle, - config.use_bn, config.act_func, - config.dropout_rate, - config.ops_order, ) stage1 = [] for stage_config in zip( diff --git a/tests/models/fast/test_modeling_fast.py b/tests/models/fast/test_modeling_fast.py index b39a7b8e7e88..31656261a426 100644 --- a/tests/models/fast/test_modeling_fast.py +++ b/tests/models/fast/test_modeling_fast.py @@ -28,7 +28,6 @@ from transformers.testing_utils import ( require_torch, require_vision, - slow, torch_device, ) @@ -59,7 +58,7 @@ def __init__( backbone_in_channels=3, backbone_out_channels=64, backbone_use_bn=True, - backbone_act_func="relu", + backbone_activation_func="relu", backbone_dropout_rate=0, backbone_ops_order="weight_bn_act", backbone_stage1_in_channels=[64], @@ -108,7 +107,7 @@ def __init__( head_final_has_shuffle=False, head_final_in_channels=4, head_final_out_channels=5, - head_final_use_bn=False, + head_final_use_batch_norm=False, head_final_act_func=None, head_final_dropout_rate=0, head_final_ops_order="weight", @@ -127,7 +126,7 @@ def __init__( self.backbone_in_channels = backbone_in_channels self.backbone_out_channels = backbone_out_channels self.backbone_use_bn = backbone_use_bn - self.backbone_act_func = backbone_act_func + self.backbone_act_func = backbone_activation_func self.backbone_dropout_rate = backbone_dropout_rate self.backbone_ops_order = backbone_ops_order @@ -184,7 +183,7 @@ def __init__( self.head_final_has_shuffle = head_final_has_shuffle self.head_final_in_channels = head_final_in_channels self.head_final_out_channels = head_final_out_channels - self.head_final_use_bn = head_final_use_bn + self.head_final_use_bn = head_final_use_batch_norm self.head_final_act_func = head_final_act_func self.head_final_dropout_rate = head_final_dropout_rate self.head_final_ops_order = head_final_ops_order @@ -214,10 +213,7 @@ def get_config(self): has_shuffle=self.backbone_has_shuffle, in_channels=self.backbone_in_channels, out_channels=self.backbone_out_channels, - use_bn=self.backbone_use_bn, act_func=self.backbone_act_func, - dropout_rate=self.backbone_dropout_rate, - ops_order=self.backbone_ops_order, stage1_in_channels=self.backbone_stage1_in_channels, stage1_out_channels=self.backbone_stage1_out_channels, stage1_kernel_size=self.backbone_stage1_kernel_size, @@ -271,10 +267,6 @@ def get_config(self): head_final_has_shuffle=self.head_final_has_shuffle, head_final_in_channels=self.head_final_in_channels, head_final_out_channels=self.head_final_out_channels, - head_final_use_bn=self.head_final_use_bn, - head_final_act_func=self.head_final_act_func, - head_final_dropout_rate=self.head_final_dropout_rate, - head_final_ops_order=self.head_final_ops_order, ) def create_and_check_model(self, config, input): @@ -396,7 +388,7 @@ def test_model_is_small(self): @require_torch @require_vision class FastModelIntegrationTest(unittest.TestCase): - @slow + # @slow def test_inference_fast_tiny_ic17mlt_model(self): model = FastForSceneTextRecognition.from_pretrained("Raghavan/ic17mlt_Fast_T") @@ -418,7 +410,7 @@ def prepare_image(): assert final_out[0]["bboxes"][0] == [224, 120, 246, 120, 246, 134, 224, 134] assert round(float(final_out[0]["scores"][0]), 5) == 0.95541 - @slow + # @slow def test_inference_fast_base_800_total_text_ic17mlt_model(self): model = 
FastForSceneTextRecognition.from_pretrained("Raghavan/fast_base_tt_800_finetune_ic17mlt") From d50df437fcd0f1730bb4b7b0051bfdef1f9ec2f9 Mon Sep 17 00:00:00 2001 From: raghavanone Date: Thu, 9 Nov 2023 12:10:04 +0530 Subject: [PATCH 048/152] some cleanups --- .../models/fast/configuration_fast.py | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/src/transformers/models/fast/configuration_fast.py b/src/transformers/models/fast/configuration_fast.py index 5bfc9ee6fb2a..734b0eeede0d 100644 --- a/src/transformers/models/fast/configuration_fast.py +++ b/src/transformers/models/fast/configuration_fast.py @@ -91,15 +91,6 @@ class FastConfig(PretrainedConfig): Denotes the in channels of final conv layer in head layer. head_final_out_channels (`int`, *optional*, defaults to 5): Denotes the out channels of final conv layer in head layer. - head_final_use_batch_norm (`bool`, *optional*, defaults to `False`): - Denotes to use or not to use batch norm of final conv layer in head layer. - head_final_act_func (`str`, *optional*): - Denotes to activation function of final conv layer in head layer. - head_final_dropout_rate (`int`, *optional*, defaults to 0): - Denotes to dropout_rate of dropout layer of final conv layer in head layer. - head_final_ops_order (`str`, *optional*, defaults to `"weight"`): - Denotes to dropout_rate of dropout layer of final conv layer in head layer. - loss_bg (``, *optional*, defaults to `False`): backbone (`str`, *optional*, defaults to `"resnet50"`): Name of convolutional backbone to use in case `use_timm_backbone` = `True`. Supports any convolutional backbone from the timm package. For a list of all available models, see [this @@ -157,11 +148,6 @@ def __init__( head_final_has_shuffle=False, head_final_in_channels=128, head_final_out_channels=5, - head_final_use_batch_norm=False, - head_final_act_func=None, - head_final_dropout_rate=0, - head_final_ops_order="weight", - loss_bg=False, backbone="resnet50", use_pretrained_backbone=True, dilation=False, @@ -216,7 +202,6 @@ def __init__( self.head_final_in_channels = head_final_in_channels self.head_final_out_channels = head_final_out_channels - self.loss_bg = loss_bg self.backbone = backbone self.use_pretrained_backbone = use_pretrained_backbone self.dilation = dilation From 6acd3bafc9a52befe2ab42611824c461d69d1cc4 Mon Sep 17 00:00:00 2001 From: raghavanone Date: Thu, 9 Nov 2023 13:02:24 +0530 Subject: [PATCH 049/152] Some more cleanups --- src/transformers/models/fast/modeling_fast.py | 56 +++++-------------- .../models/textnet/modeling_textnet.py | 56 ++++--------------- tests/models/fast/test_modeling_fast.py | 5 +- 3 files changed, 30 insertions(+), 87 deletions(-) diff --git a/src/transformers/models/fast/modeling_fast.py b/src/transformers/models/fast/modeling_fast.py index 207f1115bc8b..d244d1889aa5 100644 --- a/src/transformers/models/fast/modeling_fast.py +++ b/src/transformers/models/fast/modeling_fast.py @@ -81,26 +81,19 @@ def __init__( out_channels, kernel_size=3, stride=1, - dilation=1, - groups=1, bias=False, - has_shuffle=False, ): super().__init__() self.kernel_size = kernel_size self.stride = stride - self.dilation = dilation - self.groups = groups - self.bias = bias - self.has_shuffle = has_shuffle padding = get_same_padding(self.kernel_size) - if isinstance(padding, int): - padding *= self.dilation - else: - padding[0] *= self.dilation - padding[1] *= self.dilation + # if isinstance(padding, int): + # padding *= self.dilation + # else: + # padding[0] *= self.dilation + # padding[1] *= 
self.dilation self.conv = nn.Conv2d( in_channels, @@ -108,9 +101,7 @@ def __init__( kernel_size=kernel_size, stride=stride, padding=padding, - dilation=dilation, - groups=groups, - bias=bias, + bias=False, ) def forward(self, hidden_states): @@ -121,7 +112,7 @@ def forward(self, hidden_states): return hidden_states else: if not hasattr(self, "fused_conv"): - setattr(self, "fused_conv", self.fuse_conv_batch_norm(self.conv, nn.Identity())) + setattr(self, "fused_conv", self.conv) hidden_states = self.fused_conv(hidden_states) return hidden_states @@ -141,19 +132,17 @@ def fuse_conv_batch_norm(self, conv, batch_norm): class FASTRepConvLayer(nn.Module): - def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, groups=1): + def __init__(self, in_channels, out_channels, kernel_size, stride=1): super().__init__() self.in_channels = in_channels self.out_channels = out_channels self.kernel_size = kernel_size self.stride = stride - self.dilation = dilation - self.groups = groups - padding = (int(((kernel_size[0] - 1) * dilation) / 2), int(((kernel_size[1] - 1) * dilation) / 2)) + padding = (int((kernel_size[0] - 1) / 2), int((kernel_size[1] - 1) / 2)) - self.nonlinearity = nn.ReLU(inplace=True) + self.activation = nn.ReLU(inplace=True) self.main_conv = nn.Conv2d( in_channels=in_channels, @@ -161,14 +150,12 @@ def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, kernel_size=kernel_size, stride=stride, padding=padding, - dilation=dilation, - groups=groups, bias=False, ) self.main_batch_norm = nn.BatchNorm2d(num_features=out_channels) - ver_pad = (int(((kernel_size[0] - 1) * dilation) / 2), 0) - hor_pad = (0, int(((kernel_size[1] - 1) * dilation) / 2)) + ver_pad = (int((kernel_size[0] - 1) / 2), 0) + hor_pad = (0, int((kernel_size[1] - 1) / 2)) if kernel_size[1] != 1: self.vertical_conv = nn.Conv2d( @@ -177,8 +164,6 @@ def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, kernel_size=(kernel_size[0], 1), stride=stride, padding=ver_pad, - dilation=dilation, - groups=groups, bias=False, ) self.vertical_batch_norm = nn.BatchNorm2d(num_features=out_channels) @@ -192,8 +177,6 @@ def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, kernel_size=(1, kernel_size[1]), stride=stride, padding=hor_pad, - dilation=dilation, - groups=groups, bias=False, ) self.horizontal_batch_norm = nn.BatchNorm2d(num_features=out_channels) @@ -228,17 +211,17 @@ def forward(self, hidden_states): else: id_out = self.rbr_identity(hidden_states) - return self.nonlinearity(main_outputs + vertical_outputs + horizontal_outputs + id_out) + return self.activation(main_outputs + vertical_outputs + horizontal_outputs + id_out) else: if not hasattr(self, "fused_conv"): self.prepare_for_eval() - return self.nonlinearity(self.fused_conv(hidden_states)) + return self.activation(self.fused_conv(hidden_states)) def _identity_to_conv(self, identity): if identity is None: return 0, 0 if not hasattr(self, "id_tensor"): - input_dim = self.in_channels // self.groups + input_dim = self.in_channels kernel_value = np.zeros((self.in_channels, input_dim, 1, 1), dtype=np.float32) for i in range(self.in_channels): kernel_value[i, i % input_dim, 0, 0] = 1 @@ -296,8 +279,6 @@ def prepare_for_eval(self): kernel_size=self.main_conv.kernel_size, stride=self.main_conv.stride, padding=self.main_conv.padding, - dilation=self.main_conv.dilation, - groups=self.main_conv.groups, bias=True, ) self.fused_conv.weight.data = kernel @@ -332,8 +313,6 @@ def 
__init__(self, config): config.neck_out_channels, config.neck_kernel_size, config.neck_stride, - config.neck_dilation, - config.neck_groups, ) ) self.num_layers = len(reduce_layer_configs) @@ -366,8 +345,6 @@ def __init__(self, config): config.head_conv_out_channels, config.head_conv_kernel_size, config.head_conv_stride, - config.head_conv_dilation, - config.head_conv_groups, ) self.final = FASTConvLayer( @@ -375,10 +352,7 @@ def __init__(self, config): config.head_final_out_channels, config.head_final_kernel_size, config.head_final_stride, - config.head_final_dilation, - config.head_final_groups, config.head_final_bias, - config.head_final_has_shuffle, ) self.pooling_size = config.head_pooling_size diff --git a/src/transformers/models/textnet/modeling_textnet.py b/src/transformers/models/textnet/modeling_textnet.py index 8e9cd5335569..daf76ba2667e 100644 --- a/src/transformers/models/textnet/modeling_textnet.py +++ b/src/transformers/models/textnet/modeling_textnet.py @@ -82,28 +82,20 @@ def __init__( out_channels, kernel_size=3, stride=1, - dilation=1, - groups=1, - bias=False, - has_shuffle=False, act_func="relu", ): super().__init__() self.kernel_size = kernel_size self.stride = stride - self.dilation = dilation - self.groups = groups - self.bias = bias - self.has_shuffle = has_shuffle self.activation_function = act_func padding = get_same_padding(self.kernel_size) - if isinstance(padding, int): - padding *= self.dilation - else: - padding[0] *= self.dilation - padding[1] *= self.dilation + # if isinstance(padding, int): + # padding *= self.dilation + # else: + # padding[0] *= self.dilation + # padding[1] *= self.dilation self.conv = nn.Conv2d( in_channels, @@ -111,9 +103,7 @@ def __init__( kernel_size=kernel_size, stride=stride, padding=padding, - dilation=dilation, - groups=groups, - bias=bias, + bias=False, ) self.batch_norm = nn.Identity() @@ -154,17 +144,15 @@ def fuse_conv_batch_norm(self, conv, batch_norm): class TestNetRepConvLayer(nn.Module): - def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, groups=1): + def __init__(self, in_channels, out_channels, kernel_size, stride=1): super().__init__() self.in_channels = in_channels self.out_channels = out_channels self.kernel_size = kernel_size self.stride = stride - self.dilation = dilation - self.groups = groups - padding = (int(((kernel_size[0] - 1) * dilation) / 2), int(((kernel_size[1] - 1) * dilation) / 2)) + padding = (int((kernel_size[0] - 1) / 2), int((kernel_size[1] - 1) / 2)) self.nonlinearity = nn.ReLU(inplace=True) @@ -174,14 +162,12 @@ def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, kernel_size=kernel_size, stride=stride, padding=padding, - dilation=dilation, - groups=groups, bias=False, ) self.main_batch_norm = nn.BatchNorm2d(num_features=out_channels) - ver_pad = (int(((kernel_size[0] - 1) * dilation) / 2), 0) - hor_pad = (0, int(((kernel_size[1] - 1) * dilation) / 2)) + ver_pad = (int((kernel_size[0] - 1) / 2), 0) + hor_pad = (0, int((kernel_size[1] - 1) / 2)) if kernel_size[1] != 1: self.vertical_conv = nn.Conv2d( @@ -190,23 +176,19 @@ def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, kernel_size=(kernel_size[0], 1), stride=stride, padding=ver_pad, - dilation=dilation, - groups=groups, bias=False, ) self.vertical_batch_norm = nn.BatchNorm2d(num_features=out_channels) else: self.vertical_conv, self.vertical_batch_norm = None, None - if kernel_size[0] != 1: # 卷积核的高大于1 -> 有水平卷积 + if kernel_size[0] != 1: self.horizontal_conv = 
nn.Conv2d( in_channels=in_channels, out_channels=out_channels, kernel_size=(1, kernel_size[1]), stride=stride, padding=hor_pad, - dilation=dilation, - groups=groups, bias=False, ) self.horizontal_batch_norm = nn.BatchNorm2d(num_features=out_channels) @@ -251,7 +233,7 @@ def _identity_to_conv(self, identity): if identity is None: return 0, 0 if not hasattr(self, "id_tensor"): - input_dim = self.in_channels // self.groups + input_dim = self.in_channels kernel_value = np.zeros((self.in_channels, input_dim, 1, 1), dtype=np.float32) for i in range(self.in_channels): kernel_value[i, i % input_dim, 0, 0] = 1 @@ -309,8 +291,6 @@ def prepare_for_eval(self): kernel_size=self.main_conv.kernel_size, stride=self.main_conv.stride, padding=self.main_conv.padding, - dilation=self.main_conv.dilation, - groups=self.main_conv.groups, bias=True, ) self.fused_conv.weight.data = kernel @@ -348,10 +328,6 @@ def __init__(self, config): config.out_channels, config.kernel_size, config.stride, - config.dilation, - config.groups, - config.bias, - config.has_shuffle, config.act_func, ) stage1 = [] @@ -360,8 +336,6 @@ def __init__(self, config): config.stage1_out_channels, config.stage1_kernel_size, config.stage1_stride, - config.stage1_dilation, - config.stage1_groups, ): stage1.append(TestNetRepConvLayer(*stage_config)) self.stage1 = nn.ModuleList(stage1) @@ -372,8 +346,6 @@ def __init__(self, config): config.stage2_out_channels, config.stage2_kernel_size, config.stage2_stride, - config.stage2_dilation, - config.stage2_groups, ): stage2.append(TestNetRepConvLayer(*stage_config)) self.stage2 = nn.ModuleList(stage2) @@ -384,8 +356,6 @@ def __init__(self, config): config.stage3_out_channels, config.stage3_kernel_size, config.stage3_stride, - config.stage3_dilation, - config.stage3_groups, ): stage3.append(TestNetRepConvLayer(*stage_config)) self.stage3 = nn.ModuleList(stage3) @@ -396,8 +366,6 @@ def __init__(self, config): config.stage4_out_channels, config.stage4_kernel_size, config.stage4_stride, - config.stage4_dilation, - config.stage4_groups, ): stage4.append(TestNetRepConvLayer(*stage_config)) self.stage4 = nn.ModuleList(stage4) diff --git a/tests/models/fast/test_modeling_fast.py b/tests/models/fast/test_modeling_fast.py index 31656261a426..2a4fe0ab7fd3 100644 --- a/tests/models/fast/test_modeling_fast.py +++ b/tests/models/fast/test_modeling_fast.py @@ -28,6 +28,7 @@ from transformers.testing_utils import ( require_torch, require_vision, + slow, torch_device, ) @@ -388,7 +389,7 @@ def test_model_is_small(self): @require_torch @require_vision class FastModelIntegrationTest(unittest.TestCase): - # @slow + @slow def test_inference_fast_tiny_ic17mlt_model(self): model = FastForSceneTextRecognition.from_pretrained("Raghavan/ic17mlt_Fast_T") @@ -410,7 +411,7 @@ def prepare_image(): assert final_out[0]["bboxes"][0] == [224, 120, 246, 120, 246, 134, 224, 134] assert round(float(final_out[0]["scores"][0]), 5) == 0.95541 - # @slow + @slow def test_inference_fast_base_800_total_text_ic17mlt_model(self): model = FastForSceneTextRecognition.from_pretrained("Raghavan/fast_base_tt_800_finetune_ic17mlt") From 85c128a97402a57db57db4b9278c91d1a575d989 Mon Sep 17 00:00:00 2001 From: raghavanone Date: Thu, 9 Nov 2023 13:22:25 +0530 Subject: [PATCH 050/152] Fix build --- .../models/fast/configuration_fast.py | 29 ------------ .../models/textnet/configuration_textnet.py | 24 ---------- tests/models/fast/test_modeling_fast.py | 44 ------------------- tests/models/textnet/test_modeling_textnet.py | 36 --------------- 4 files changed, 
133 deletions(-) diff --git a/src/transformers/models/fast/configuration_fast.py b/src/transformers/models/fast/configuration_fast.py index 734b0eeede0d..6a5f3a425fa6 100644 --- a/src/transformers/models/fast/configuration_fast.py +++ b/src/transformers/models/fast/configuration_fast.py @@ -54,11 +54,6 @@ class FastConfig(PretrainedConfig): Denotes the kernel_size of FASTRepConvLayer in neck module. Should be of same length of `neck_in_channels` neck_stride (`List[int]`, *optional*, defaults to `[1, 1, 1, 1]`): Denotes the neck_stride of FASTRepConvLayer in neck module. Should be of same length of `neck_in_channels` - neck_dilation (`List[int]`, *optional*, defaults to `[1, 1, 1, 1]`): - Denotes the neck_dilation of FASTRepConvLayer in neck module. Should be of same length of - `neck_in_channels` - neck_groups (`List[int]`, *optional*, defaults to `[1, 1, 1, 1]`): - Denotes the groups of FASTRepConvLayer in neck module. Should be of same length of `neck_in_channels` head_pooling_size (`int`, *optional*, defaults to 9): Denotes the pooling size of head layer head_dropout_ratio (`int`, *optional*, defaults to 0): @@ -71,22 +66,12 @@ class FastConfig(PretrainedConfig): Denotes the conv kernel size of first conv layer in head layer. head_conv_stride (`int`, *optional*, defaults to 1): Denotes the conv stride of first conv layer in head layer. - head_conv_dilation (`int`, *optional*, defaults to 1): - Denotes the conv dilation of first conv layer in head layer. - head_conv_groups (`int`, *optional*, defaults to 1): - Denotes the conv groups of first conv layer in head layer. head_final_kernel_size (`int`, *optional*, defaults to 1): Denotes the conv kernel size of final conv layer in head layer. head_final_stride (`int`, *optional*, defaults to 1): Denotes the conv stride of final conv layer in head layer. - head_final_dilation (`int`, *optional*, defaults to 1): - Denotes the conv dilation of final conv layer in head layer. - head_final_groups (`int`, *optional*, defaults to 1): - Denotes the conv groups of final conv layer in head layer. head_final_bias (`bool`, *optional*, defaults to `False`): Denotes the conv bais of final conv layer in head layer. - head_final_has_shuffle (`bool`, *optional*, defaults to `False`): - Denotes the conv shuffle of final conv layer in head layer. head_final_in_channels (`int`, *optional*, defaults to 128): Denotes the in channels of final conv layer in head layer. 
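Several of the parameters dropped in this commit only fed the re-parameterizable conv layers touched in the previous one. The inference-time trick those layers rely on (`fuse_conv_batch_norm` / `prepare_for_eval`) can be summarized with a toy helper; this is a sketch of the general conv + batch-norm folding identity, not the exact code in `modeling_fast.py`:

```python
import torch
import torch.nn as nn

# Toy conv + batch-norm fusion: y = gamma * (W*x + b - mean) / sqrt(var + eps) + beta
# folds into a single convolution with adjusted weight and bias.
def fuse_conv_bn(conv: nn.Conv2d, bn: nn.BatchNorm2d) -> nn.Conv2d:
    fused = nn.Conv2d(
        conv.in_channels,
        conv.out_channels,
        conv.kernel_size,
        stride=conv.stride,
        padding=conv.padding,
        bias=True,
    )
    with torch.no_grad():
        std = (bn.running_var + bn.eps).sqrt()
        scale = bn.weight / std
        fused.weight.copy_(conv.weight * scale.reshape(-1, 1, 1, 1))
        conv_bias = torch.zeros(conv.out_channels) if conv.bias is None else conv.bias
        fused.bias.copy_(bn.bias + (conv_bias - bn.running_mean) * scale)
    return fused
```

After fusion, only the single cached convolution has to run in eval mode, which is why the RepConv layers stash a `fused_conv` attribute instead of recomputing the three branches.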
head_final_out_channels (`int`, *optional*, defaults to 5): @@ -130,22 +115,15 @@ def __init__( neck_out_channels=[128, 128, 128, 128], neck_kernel_size=[[3, 3], [3, 3], [3, 3], [3, 3]], neck_stride=[1, 1, 1, 1], - neck_dilation=[1, 1, 1, 1], - neck_groups=[1, 1, 1, 1], head_pooling_size=9, head_dropout_ratio=0, head_conv_in_channels=512, head_conv_out_channels=128, head_conv_kernel_size=[3, 3], head_conv_stride=1, - head_conv_dilation=1, - head_conv_groups=1, head_final_kernel_size=1, head_final_stride=1, - head_final_dilation=1, - head_final_groups=1, head_final_bias=False, - head_final_has_shuffle=False, head_final_in_channels=128, head_final_out_channels=5, backbone="resnet50", @@ -180,8 +158,6 @@ def __init__( self.neck_out_channels = neck_out_channels self.neck_kernel_size = neck_kernel_size self.neck_stride = neck_stride - self.neck_dilation = neck_dilation - self.neck_groups = neck_groups self.head_pooling_size = head_pooling_size self.head_dropout_ratio = head_dropout_ratio @@ -190,15 +166,10 @@ def __init__( self.head_conv_out_channels = head_conv_out_channels self.head_conv_kernel_size = head_conv_kernel_size self.head_conv_stride = head_conv_stride - self.head_conv_dilation = head_conv_dilation - self.head_conv_groups = head_conv_groups self.head_final_kernel_size = head_final_kernel_size self.head_final_stride = head_final_stride - self.head_final_dilation = head_final_dilation - self.head_final_groups = head_final_groups self.head_final_bias = head_final_bias - self.head_final_has_shuffle = head_final_has_shuffle self.head_final_in_channels = head_final_in_channels self.head_final_out_channels = head_final_out_channels diff --git a/src/transformers/models/textnet/configuration_textnet.py b/src/transformers/models/textnet/configuration_textnet.py index 4ad83123f4ff..e67d02a21bac 100644 --- a/src/transformers/models/textnet/configuration_textnet.py +++ b/src/transformers/models/textnet/configuration_textnet.py @@ -37,10 +37,6 @@ def __init__( self, kernel_size=3, stride=2, - dilation=1, - groups=1, - bias=False, - has_shuffle=False, in_channels=3, out_channels=64, act_func="relu", @@ -48,26 +44,18 @@ def __init__( stage1_out_channels=[64, 64, 64], stage1_kernel_size=[[3, 3], [3, 3], [3, 3]], stage1_stride=[1, 2, 1], - stage1_dilation=[1, 1, 1], - stage1_groups=[1, 1, 1], stage2_in_channels=[64, 128, 128, 128], stage2_out_channels=[128, 128, 128, 128], stage2_kernel_size=[[3, 3], [1, 3], [3, 3], [3, 1]], stage2_stride=[2, 1, 1, 1], - stage2_dilation=[1, 1, 1, 1], - stage2_groups=[1, 1, 1, 1], stage3_in_channels=[128, 256, 256, 256], stage3_out_channels=[256, 256, 256, 256], stage3_kernel_size=[[3, 3], [3, 3], [3, 1], [1, 3]], stage3_stride=[2, 1, 1, 1], - stage3_dilation=[1, 1, 1, 1], - stage3_groups=[1, 1, 1, 1], stage4_in_channels=[256, 512, 512, 512], stage4_out_channels=[512, 512, 512, 512], stage4_kernel_size=[[3, 3], [3, 1], [1, 3], [3, 3]], stage4_stride=[2, 1, 1, 1], - stage4_dilation=[1, 1, 1, 1], - stage4_groups=[1, 1, 1, 1], hidden_sizes=[64, 64, 128, 256, 512], initializer_range=0.02, out_features=None, @@ -78,10 +66,6 @@ def __init__( self.kernel_size = kernel_size self.stride = stride - self.dilation = dilation - self.groups = groups - self.bias = bias - self.has_shuffle = has_shuffle self.in_channels = in_channels self.out_channels = out_channels self.act_func = act_func @@ -90,29 +74,21 @@ def __init__( self.stage1_out_channels = stage1_out_channels self.stage1_kernel_size = stage1_kernel_size self.stage1_stride = stage1_stride - self.stage1_dilation = 
stage1_dilation - self.stage1_groups = stage1_groups self.stage2_in_channels = stage2_in_channels self.stage2_out_channels = stage2_out_channels self.stage2_kernel_size = stage2_kernel_size self.stage2_stride = stage2_stride - self.stage2_dilation = stage2_dilation - self.stage2_groups = stage2_groups self.stage3_in_channels = stage3_in_channels self.stage3_out_channels = stage3_out_channels self.stage3_kernel_size = stage3_kernel_size self.stage3_stride = stage3_stride - self.stage3_dilation = stage3_dilation - self.stage3_groups = stage3_groups self.stage4_in_channels = stage4_in_channels self.stage4_out_channels = stage4_out_channels self.stage4_kernel_size = stage4_kernel_size self.stage4_stride = stage4_stride - self.stage4_dilation = stage4_dilation - self.stage4_groups = stage4_groups self.initializer_range = initializer_range self.hidden_sizes = hidden_sizes diff --git a/tests/models/fast/test_modeling_fast.py b/tests/models/fast/test_modeling_fast.py index 2a4fe0ab7fd3..50a2effa2eb9 100644 --- a/tests/models/fast/test_modeling_fast.py +++ b/tests/models/fast/test_modeling_fast.py @@ -54,7 +54,6 @@ def __init__( backbone_stride=2, backbone_dilation=1, backbone_groups=1, - backbone_bias=False, backbone_has_shuffle=False, backbone_in_channels=3, backbone_out_channels=64, @@ -90,22 +89,15 @@ def __init__( neck_out_channels=[128], neck_kernel_size=[[3, 3]], neck_stride=[1], - neck_dilation=[1], - neck_groups=[1], head_pooling_size=9, head_dropout_ratio=0.1, head_conv_in_channels=128, head_conv_out_channels=4, head_conv_kernel_size=[3, 3], head_conv_stride=1, - head_conv_dilation=1, - head_conv_groups=1, head_final_kernel_size=1, head_final_stride=1, - head_final_dilation=1, - head_final_groups=1, head_final_bias=False, - head_final_has_shuffle=False, head_final_in_channels=4, head_final_out_channels=5, head_final_use_batch_norm=False, @@ -120,9 +112,6 @@ def __init__( self.parent = parent self.backbone_kernel_size = backbone_kernel_size self.backbone_stride = backbone_stride - self.backbone_dilation = backbone_dilation - self.backbone_groups = backbone_groups - self.backbone_bias = backbone_bias self.backbone_has_shuffle = backbone_has_shuffle self.backbone_in_channels = backbone_in_channels self.backbone_out_channels = backbone_out_channels @@ -135,36 +124,26 @@ def __init__( self.backbone_stage1_out_channels = backbone_stage1_out_channels self.backbone_stage1_kernel_size = backbone_stage1_kernel_size self.backbone_stage1_stride = backbone_stage1_stride - self.backbone_stage1_dilation = backbone_stage1_dilation - self.backbone_stage1_groups = backbone_stage1_groups self.backbone_stage2_in_channels = backbone_stage2_in_channels self.backbone_stage2_out_channels = backbone_stage2_out_channels self.backbone_stage2_kernel_size = backbone_stage2_kernel_size self.backbone_stage2_stride = backbone_stage2_stride - self.backbone_stage2_dilation = backbone_stage2_dilation - self.backbone_stage2_groups = backbone_stage2_groups self.backbone_stage3_in_channels = backbone_stage3_in_channels self.backbone_stage3_out_channels = backbone_stage3_out_channels self.backbone_stage3_kernel_size = backbone_stage3_kernel_size self.backbone_stage3_stride = backbone_stage3_stride - self.backbone_stage3_dilation = backbone_stage3_dilation - self.backbone_stage3_groups = backbone_stage3_groups self.backbone_stage4_in_channels = backbone_stage4_in_channels self.backbone_stage4_out_channels = backbone_stage4_out_channels self.backbone_stage4_kernel_size = backbone_stage4_kernel_size self.backbone_stage4_stride = 
backbone_stage4_stride - self.backbone_stage4_dilation = backbone_stage4_dilation - self.backbone_stage4_groups = backbone_stage4_groups self.neck_in_channels = neck_in_channels self.neck_out_channels = neck_out_channels self.neck_kernel_size = neck_kernel_size self.neck_stride = neck_stride - self.neck_dilation = neck_dilation - self.neck_groups = neck_groups self.head_pooling_size = head_pooling_size self.head_dropout_ratio = head_dropout_ratio @@ -173,15 +152,10 @@ def __init__( self.head_conv_out_channels = head_conv_out_channels self.head_conv_kernel_size = head_conv_kernel_size self.head_conv_stride = head_conv_stride - self.head_conv_dilation = head_conv_dilation - self.head_conv_groups = head_conv_groups self.head_final_kernel_size = head_final_kernel_size self.head_final_stride = head_final_stride - self.head_final_dilation = head_final_dilation - self.head_final_groups = head_final_groups self.head_final_bias = head_final_bias - self.head_final_has_shuffle = head_final_has_shuffle self.head_final_in_channels = head_final_in_channels self.head_final_out_channels = head_final_out_channels self.head_final_use_bn = head_final_use_batch_norm @@ -208,9 +182,6 @@ def get_config(self): textnet_config = TextNetConfig( kernel_size=self.backbone_kernel_size, stride=self.backbone_stride, - dilation=self.backbone_dilation, - groups=self.backbone_groups, - bias=self.backbone_bias, has_shuffle=self.backbone_has_shuffle, in_channels=self.backbone_in_channels, out_channels=self.backbone_out_channels, @@ -219,26 +190,18 @@ def get_config(self): stage1_out_channels=self.backbone_stage1_out_channels, stage1_kernel_size=self.backbone_stage1_kernel_size, stage1_stride=self.backbone_stage1_stride, - stage1_dilation=self.backbone_stage1_dilation, - stage1_groups=self.backbone_stage1_groups, stage2_in_channels=self.backbone_stage2_in_channels, stage2_out_channels=self.backbone_stage2_out_channels, stage2_kernel_size=self.backbone_stage2_kernel_size, stage2_stride=self.backbone_stage2_stride, - stage2_dilation=self.backbone_stage2_dilation, - stage2_groups=self.backbone_stage2_groups, stage3_in_channels=self.backbone_stage3_in_channels, stage3_out_channels=self.backbone_stage3_out_channels, stage3_kernel_size=self.backbone_stage3_kernel_size, stage3_stride=self.backbone_stage3_stride, - stage3_dilation=self.backbone_stage3_dilation, - stage3_groups=self.backbone_stage3_groups, stage4_in_channels=self.backbone_stage4_in_channels, stage4_out_channels=self.backbone_stage4_out_channels, stage4_kernel_size=self.backbone_stage4_kernel_size, stage4_stride=self.backbone_stage4_stride, - stage4_dilation=self.backbone_stage4_dilation, - stage4_groups=self.backbone_stage4_groups, out_features=["stage1", "stage2", "stage3", "stage4"], out_indices=[1, 2, 3, 4], ) @@ -250,22 +213,15 @@ def get_config(self): neck_out_channels=self.neck_out_channels, neck_kernel_size=self.neck_kernel_size, neck_stride=self.neck_stride, - neck_dilation=self.neck_dilation, - neck_groups=self.neck_groups, head_pooling_size=self.head_pooling_size, head_dropout_ratio=self.head_dropout_ratio, head_conv_in_channels=self.head_conv_in_channels, head_conv_out_channels=self.head_conv_out_channels, head_conv_kernel_size=self.head_conv_kernel_size, head_conv_stride=self.head_conv_stride, - head_conv_dilation=self.head_conv_dilation, - head_conv_groups=self.head_conv_groups, head_final_kernel_size=self.head_final_kernel_size, head_final_stride=self.head_final_stride, - head_final_dilation=self.head_final_dilation, - 
head_final_groups=self.head_final_groups, head_final_bias=self.head_final_bias, - head_final_has_shuffle=self.head_final_has_shuffle, head_final_in_channels=self.head_final_in_channels, head_final_out_channels=self.head_final_out_channels, ) diff --git a/tests/models/textnet/test_modeling_textnet.py b/tests/models/textnet/test_modeling_textnet.py index a95c072fba95..c19e5c8c2536 100644 --- a/tests/models/textnet/test_modeling_textnet.py +++ b/tests/models/textnet/test_modeling_textnet.py @@ -47,10 +47,6 @@ def __init__( parent, kernel_size=3, stride=2, - dilation=1, - groups=1, - bias=False, - has_shuffle=False, in_channels=3, out_channels=64, use_bn=True, @@ -61,26 +57,18 @@ def __init__( stage1_out_channels=[64], stage1_kernel_size=[[3, 3]], stage1_stride=[1], - stage1_dilation=[1], - stage1_groups=[1], stage2_in_channels=[64], stage2_out_channels=[128], stage2_kernel_size=[[3, 1]], stage2_stride=[2], - stage2_dilation=[1], - stage2_groups=[1], stage3_in_channels=[128], stage3_out_channels=[256], stage3_kernel_size=[[1, 3]], stage3_stride=[2], - stage3_dilation=[1], - stage3_groups=[1], stage4_in_channels=[256], stage4_out_channels=[512], stage4_kernel_size=[[3, 3]], stage4_stride=[2], - stage4_dilation=[1], - stage4_groups=[1], out_features=["stage1", "stage2", "stage3", "stage4"], out_indices=[1, 2, 3, 4], batch_size=3, @@ -95,10 +83,6 @@ def __init__( self.parent = parent self.kernel_size = kernel_size self.stride = stride - self.dilation = dilation - self.groups = groups - self.bias = bias - self.has_shuffle = has_shuffle self.in_channels = in_channels self.out_channels = out_channels self.use_bn = use_bn @@ -110,29 +94,21 @@ def __init__( self.stage1_out_channels = stage1_out_channels self.stage1_kernel_size = stage1_kernel_size self.stage1_stride = stage1_stride - self.stage1_dilation = stage1_dilation - self.stage1_groups = stage1_groups self.stage2_in_channels = stage2_in_channels self.stage2_out_channels = stage2_out_channels self.stage2_kernel_size = stage2_kernel_size self.stage2_stride = stage2_stride - self.stage2_dilation = stage2_dilation - self.stage2_groups = stage2_groups self.stage3_in_channels = stage3_in_channels self.stage3_out_channels = stage3_out_channels self.stage3_kernel_size = stage3_kernel_size self.stage3_stride = stage3_stride - self.stage3_dilation = stage3_dilation - self.stage3_groups = stage3_groups self.stage4_in_channels = stage4_in_channels self.stage4_out_channels = stage4_out_channels self.stage4_kernel_size = stage4_kernel_size self.stage4_stride = stage4_stride - self.stage4_dilation = stage4_dilation - self.stage4_groups = stage4_groups self.out_features = out_features self.out_indices = out_indices @@ -151,10 +127,6 @@ def get_config(self): return TextNetConfig( kernel_size=self.kernel_size, stride=self.stride, - dilation=self.dilation, - groups=self.groups, - bias=self.bias, - has_shuffle=self.has_shuffle, in_channels=self.in_channels, out_channels=self.out_channels, use_bn=self.use_bn, @@ -165,26 +137,18 @@ def get_config(self): stage1_out_channels=self.stage1_out_channels, stage1_kernel_size=self.stage1_kernel_size, stage1_stride=self.stage1_stride, - stage1_dilation=self.stage1_dilation, - stage1_groups=self.stage1_groups, stage2_in_channels=self.stage2_in_channels, stage2_out_channels=self.stage2_out_channels, stage2_kernel_size=self.stage2_kernel_size, stage2_stride=self.stage2_stride, - stage2_dilation=self.stage2_dilation, - stage2_groups=self.stage2_groups, stage3_in_channels=self.stage3_in_channels, 
stage3_out_channels=self.stage3_out_channels, stage3_kernel_size=self.stage3_kernel_size, stage3_stride=self.stage3_stride, - stage3_dilation=self.stage3_dilation, - stage3_groups=self.stage3_groups, stage4_in_channels=self.stage4_in_channels, stage4_out_channels=self.stage4_out_channels, stage4_kernel_size=self.stage4_kernel_size, stage4_stride=self.stage4_stride, - stage4_dilation=self.stage4_dilation, - stage4_groups=self.stage4_groups, out_features=self.out_features, out_indices=self.out_indices, hidden_sizes=self.hidden_sizes, From c22ba88c123af31c26edd07535f56e1a36d74f98 Mon Sep 17 00:00:00 2001 From: raghavanone Date: Thu, 9 Nov 2023 13:54:09 +0530 Subject: [PATCH 051/152] Incorporate PR feedbacks --- .../fast/convert_fast_original_to_pytorch.py | 6 +----- .../models/fast/image_processing_fast.py | 2 +- src/transformers/models/fast/modeling_fast.py | 20 +++++++++++++------ tests/models/fast/test_modeling_fast.py | 7 +++---- 4 files changed, 19 insertions(+), 16 deletions(-) diff --git a/src/transformers/models/fast/convert_fast_original_to_pytorch.py b/src/transformers/models/fast/convert_fast_original_to_pytorch.py index c7a8e622aae7..c98243c16457 100644 --- a/src/transformers/models/fast/convert_fast_original_to_pytorch.py +++ b/src/transformers/models/fast/convert_fast_original_to_pytorch.py @@ -30,11 +30,7 @@ small_config_url = "https://raw.githubusercontent.com/czczup/FAST/main/config/fast/nas-configs/fast_small.config" base_config_url = "https://raw.githubusercontent.com/czczup/FAST/main/config/fast/nas-configs/fast_base.config" -rename_key_mappings = { - "bn": "batch_norm", - "hor": "horizontal", - "ver": "vertical", -} +rename_key_mappings = {"bn": "batch_norm", "hor": "horizontal", "ver": "vertical", "det_head": "text_detection_head"} def prepare_img(): diff --git a/src/transformers/models/fast/image_processing_fast.py b/src/transformers/models/fast/image_processing_fast.py index eb5020195f2d..5e70a83ac58a 100644 --- a/src/transformers/models/fast/image_processing_fast.py +++ b/src/transformers/models/fast/image_processing_fast.py @@ -389,7 +389,7 @@ def _max_pooling(self, x, scale=1): def post_process_text_detection(self, output, target_sizes, threshold, bbox_type="rect"): scale = 2 img_size = (self.size["height"], self.size["width"]) - out = output["hidden_states"] + out = output["last_hidden_state"] batch_size = out.size(0) final_results = {} diff --git a/src/transformers/models/fast/modeling_fast.py b/src/transformers/models/fast/modeling_fast.py index d244d1889aa5..179aa9eb6402 100644 --- a/src/transformers/models/fast/modeling_fast.py +++ b/src/transformers/models/fast/modeling_fast.py @@ -15,7 +15,7 @@ """ PyTorch FAST model.""" from dataclasses import dataclass -from typing import Dict, Optional +from typing import Dict, Optional, Tuple import numpy as np import torch @@ -578,7 +578,8 @@ class FastForSceneTextRecognitionOutput(ModelOutput): """ loss: Optional[torch.Tensor] = None - hidden_states: Optional[torch.FloatTensor] = None + last_hidden_state: Optional[torch.FloatTensor] = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None @add_start_docstrings( @@ -613,7 +614,7 @@ def __init__(self, config): self.backbone = backbone self.neck = FASTNeck(config=config) - self.det_head = FASTHead(config=config) + self.text_detection_head = FASTHead(config=config) self.pooling_1s = nn.MaxPool2d( kernel_size=config.head_pooling_size, stride=1, padding=(config.head_pooling_size - 1) // 2 @@ -699,7 +700,9 @@ def forward( hidden_states = self.neck(features) - 
text_detection_output = self.det_head(hidden_states) + text_detection_output = self.text_detection_head(hidden_states) + + all_hidden_states = (features, hidden_states) loss = None if labels: @@ -708,6 +711,11 @@ def forward( text_detection_output = self._upsample(text_detection_output, pixel_values.size(), scale=4) if not return_dict: - return (loss, text_detection_output) if loss is not None else (text_detection_output,) + output = (loss, text_detection_output) if loss is not None else (text_detection_output,) + return output + (all_hidden_states,) if output_hidden_states else output - return FastForSceneTextRecognitionOutput(loss, text_detection_output) + return FastForSceneTextRecognitionOutput( + loss=loss, + last_hidden_state=text_detection_output, + hidden_states=all_hidden_states if output_hidden_states else None, + ) diff --git a/tests/models/fast/test_modeling_fast.py b/tests/models/fast/test_modeling_fast.py index 50a2effa2eb9..4fb17cf824a3 100644 --- a/tests/models/fast/test_modeling_fast.py +++ b/tests/models/fast/test_modeling_fast.py @@ -28,7 +28,6 @@ from transformers.testing_utils import ( require_torch, require_vision, - slow, torch_device, ) @@ -231,7 +230,7 @@ def create_and_check_model(self, config, input): model.to(torch_device) model.eval() result = model(pixel_values=input["pixel_values"]) - self.parent.assertEqual(result.hidden_states.shape, (self.batch_size, 5, 125, 125)) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, 5, 125, 125)) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() @@ -345,7 +344,7 @@ def test_model_is_small(self): @require_torch @require_vision class FastModelIntegrationTest(unittest.TestCase): - @slow + # @slow def test_inference_fast_tiny_ic17mlt_model(self): model = FastForSceneTextRecognition.from_pretrained("Raghavan/ic17mlt_Fast_T") @@ -367,7 +366,7 @@ def prepare_image(): assert final_out[0]["bboxes"][0] == [224, 120, 246, 120, 246, 134, 224, 134] assert round(float(final_out[0]["scores"][0]), 5) == 0.95541 - @slow + # @slow def test_inference_fast_base_800_total_text_ic17mlt_model(self): model = FastForSceneTextRecognition.from_pretrained("Raghavan/fast_base_tt_800_finetune_ic17mlt") From 2ee0440fd35d3fca8e8671293043fb4ffa0c3ddc Mon Sep 17 00:00:00 2001 From: raghavanone Date: Thu, 9 Nov 2023 19:08:15 +0530 Subject: [PATCH 052/152] More cleanup --- .../fast/convert_fast_original_to_pytorch.py | 18 +++- .../models/textnet/configuration_textnet.py | 89 ++++++++++++++++++- .../models/textnet/modeling_textnet.py | 18 ++-- tests/models/fast/test_modeling_fast.py | 5 +- 4 files changed, 109 insertions(+), 21 deletions(-) diff --git a/src/transformers/models/fast/convert_fast_original_to_pytorch.py b/src/transformers/models/fast/convert_fast_original_to_pytorch.py index c98243c16457..6c36af421153 100644 --- a/src/transformers/models/fast/convert_fast_original_to_pytorch.py +++ b/src/transformers/models/fast/convert_fast_original_to_pytorch.py @@ -164,7 +164,9 @@ def get_base_model_config(): pass -def convert_fast_checkpoint(checkpoint_url, checkpoint_config_url, pytorch_dump_folder_path, validate_logits): +def convert_fast_checkpoint( + checkpoint_url, checkpoint_config_url, pytorch_dump_folder_path, validate_logits, save_backbone_separately +): response = requests.get(checkpoint_config_url) content = response.text namespace = {} @@ -218,6 +220,8 @@ def convert_fast_checkpoint(checkpoint_url, checkpoint_config_url, pytorch_dump_ 
model.load_state_dict(state_dict_changed) model.save_pretrained(pytorch_dump_folder_path) + if save_backbone_separately: + model.backbone.save_pretrained(pytorch_dump_folder_path + "/textnet/") fast_image_processor.save_pretrained(pytorch_dump_folder_path) logging.info("The converted weights are save here : " + pytorch_dump_folder_path) @@ -246,8 +250,18 @@ def convert_fast_checkpoint(checkpoint_url, checkpoint_config_url, pytorch_dump_ type=bool, help="whether to assert logits outputs", ) + parser.add_argument( + "--save_backbone_separately", + default=False, + type=bool, + help="whether to assert logits outputs", + ) args = parser.parse_args() convert_fast_checkpoint( - args.checkpoint_url, args.checkpoint_config_url, args.pytorch_dump_folder_path, args.validate_logits + args.checkpoint_url, + args.checkpoint_config_url, + args.pytorch_dump_folder_path, + args.validate_logits, + args.save_backbone_separately, ) diff --git a/src/transformers/models/textnet/configuration_textnet.py b/src/transformers/models/textnet/configuration_textnet.py index e67d02a21bac..33e2b4c3b25c 100644 --- a/src/transformers/models/textnet/configuration_textnet.py +++ b/src/transformers/models/textnet/configuration_textnet.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright The HuggingFace Inc. team. All rights reserved. +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -21,13 +21,94 @@ logger = logging.get_logger(__name__) TEXTNET_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "fast_base_tt_800_finetune_ic17mlt": ( - "https://huggingface.co/Raghavan/fast_base_tt_800_finetune_ic17mlt/raw/main/config.json" - ), + "textnet-base": ("https://huggingface.co/Raghavan/textnet-base/blob/main/config.json"), } class TextNetConfig(BackboneConfigMixin, PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`FastForSceneTextRecognition`]. It is used to + instantiate a FastForSceneTextRecognition model according to the specified arguments, defining the model + architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the + FastForSceneTextRecognition. + [Raghavan/fast_base_tt_800_finetune_ic17mlt](https://huggingface.co/Raghavan/fast_base_tt_800_finetune_ic17mlt) + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + kernel_size (`int`, *optional*, defaults to 3): + The kernel size for the initial convolution layer. + stride (`int`, *optional*, defaults to 2): + The stride for the initial convolution layer. + in_channels (`int`, *optional*, defaults to 3): + The num of channels in input for the initial convolution layer. + out_channels (`int`, *optional*, defaults to 64): + The num of channels in out for the initial convolution layer. + act_func (`str`, *optional*, defaults to `"relu"`): + The activation function for the initial convolution layer. + stage1_in_channels (`List[int]`, *optional*, defaults to `[64, 64, 64]`): + The num of channels in input for list of conv in stage 1. 
+ stage1_out_channels (`List[int]`, *optional*, defaults to `[64, 64, 64]`): + The num of channels in output for list of conv in stage 1.Should be of same length os `stage1_in_channels` + stage1_kernel_size (`List[int]`, *optional*, defaults to `[[3, 3], [3, 3], [3, 3]]`): + The kernel sizes for list of conv in stage 1.Should be of same length os `stage1_in_channels` + stage1_stride (`List[int]`, *optional*, defaults to `[1, 2, 1]`): + The strides for list of conv in stage 1.Should be of same length os `stage1_in_channels` + stage2_in_channels (`List[int]`, *optional*, defaults to `[64, 128, 128, 128]`): + The num of channels in input for list of conv in stage 2. + stage2_out_channels (`List[int]`, *optional*, defaults to `[128, 128, 128, 128]`): + The num of channels in output for list of conv in stage 2.Should be of same length os `stage2_in_channels` + stage2_kernel_size (`List[List[int]]`, *optional*, defaults to `[[3, 3], [1, 3], [3, 3], [3, 1]]`): + The kernel sizes for list of conv in stage 2.Should be of same length os + `stage2_in_channels` + stage2_stride (`List[int]`, *optional*, defaults to `[2, 1, 1, 1]`): + The strides for list of conv in stage 2.Should be of same length os `stage2_in_channels` + stage3_in_channels (`List[int]`, *optional*, defaults to `[128, 256, 256, 256]`): + The num of channels in input for list of conv in stage 3. + stage3_out_channels (`List[int]`, *optional*, defaults to `[256, 256, 256, 256]`): + The num of channels in output for list of conv in stage 3.Should be of same length os `stage3_in_channels` + stage3_kernel_size (`List[List[int]]`, *optional*, defaults to `[[3, 3], [3, 3], [3, 1], [1, 3]]`): + The kernel sizes for list of conv in stage 3.Should be of same length os + `stage3_in_channels` + stage3_stride (`List[int]`, *optional*, defaults to `[2, 1, 1, 1]`): + The strides for list of conv in stage 3.Should be of same length os `stage3_in_channels` + stage4_in_channels (`List[int]`, *optional*, defaults to `[256, 512, 512, 512]`): + The strides for list of conv in stage 4.Should be of same length os `stage4_in_channels` + stage4_out_channels (`List[int]`, *optional*, defaults to `[512, 512, 512, 512]`): + The num of channels in output for list of conv in stage 4.Should be of same length os `stage4_in_channels` + stage4_kernel_size (`List[List[int]]`, *optional*, defaults to `[[3, 3], [3, 1], [1, 3], [3, 3]]`): + The kernel sizes for list of conv in stage 4.Should be of same length os + `stage4_in_channels` + stage4_stride (`List[int]`, *optional*, defaults to `[2, 1, 1, 1]`): + The strides for list of conv in stage 4.Should be of same length os `stage4_in_channels` + hidden_sizes (`List[int]`, *optional*, defaults to `[64, 64, 128, 256, 512]`): + Dimensionality (hidden size) at each stage. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + out_features (`List[str]`, *optional*): + If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc. + (depending on how many stages the model has). If unset and `out_indices` is set, will default to the + corresponding stages. If unset and `out_indices` is unset, will default to the last stage. + out_indices (`List[int]`, *optional*): + If used as backbone, list of indices of features to output. Can be any of 0, 1, 2, etc. (depending on how + many stages the model has). If unset and `out_features` is set, will default to the corresponding stages. 
+ If unset and `out_features` is unset, will default to the last stage. + + Examples: + + ```python + >>> from transformers import FastConfig, FastForSceneTextRecognition + + >>> # Initializing a Fast Config + >>> configuration = FastConfig() + + >>> # Initializing a model (with random weights) + >>> model = FastForSceneTextRecognition(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" r""" [Raghavan/fast_base_tt_800_finetune_ic17mlt](https://huggingface.co/Raghavan/fast_base_tt_800_finetune_ic17mlt) """ diff --git a/src/transformers/models/textnet/modeling_textnet.py b/src/transformers/models/textnet/modeling_textnet.py index daf76ba2667e..13091ad9d389 100644 --- a/src/transformers/models/textnet/modeling_textnet.py +++ b/src/transformers/models/textnet/modeling_textnet.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2021 Microsoft Research and The HuggingFace Inc. team. All rights reserved. +# Copyright 2023 Microsoft Research and The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -61,17 +61,14 @@ return_dict (`bool`, *optional*): """ -BIT_PRETRAINED_MODEL_ARCHIVE_LIST = [ - # "google/bit-50", - # See all BiT models at https://huggingface.co/models?filter=bit -] +BIT_PRETRAINED_MODEL_ARCHIVE_LIST = ["Raghavan/textnet-base"] def get_same_padding(kernel_size): if isinstance(kernel_size, tuple): - p1 = get_same_padding(kernel_size[0]) - p2 = get_same_padding(kernel_size[1]) - return p1, p2 + padding1 = get_same_padding(kernel_size[0]) + padding2 = get_same_padding(kernel_size[1]) + return padding1, padding2 return kernel_size // 2 @@ -91,11 +88,6 @@ def __init__( self.activation_function = act_func padding = get_same_padding(self.kernel_size) - # if isinstance(padding, int): - # padding *= self.dilation - # else: - # padding[0] *= self.dilation - # padding[1] *= self.dilation self.conv = nn.Conv2d( in_channels, diff --git a/tests/models/fast/test_modeling_fast.py b/tests/models/fast/test_modeling_fast.py index 4fb17cf824a3..07c3f9b24b20 100644 --- a/tests/models/fast/test_modeling_fast.py +++ b/tests/models/fast/test_modeling_fast.py @@ -28,6 +28,7 @@ from transformers.testing_utils import ( require_torch, require_vision, + slow, torch_device, ) @@ -344,7 +345,7 @@ def test_model_is_small(self): @require_torch @require_vision class FastModelIntegrationTest(unittest.TestCase): - # @slow + @slow def test_inference_fast_tiny_ic17mlt_model(self): model = FastForSceneTextRecognition.from_pretrained("Raghavan/ic17mlt_Fast_T") @@ -366,7 +367,7 @@ def prepare_image(): assert final_out[0]["bboxes"][0] == [224, 120, 246, 120, 246, 134, 224, 134] assert round(float(final_out[0]["scores"][0]), 5) == 0.95541 - # @slow + @slow def test_inference_fast_base_800_total_text_ic17mlt_model(self): model = FastForSceneTextRecognition.from_pretrained("Raghavan/fast_base_tt_800_finetune_ic17mlt") From 04d761d7c6d547e5a455ebdb0a14e41b8732c3b6 Mon Sep 17 00:00:00 2001 From: raghavanone Date: Thu, 9 Nov 2023 19:17:21 +0530 Subject: [PATCH 053/152] More cleanup --- .../models/textnet/modeling_textnet.py | 6 ++-- tests/models/textnet/test_modeling_textnet.py | 31 +++++++++++-------- 2 files changed, 21 insertions(+), 16 deletions(-) diff --git a/src/transformers/models/textnet/modeling_textnet.py b/src/transformers/models/textnet/modeling_textnet.py index 13091ad9d389..1943f2343c67 100644 --- 
a/src/transformers/models/textnet/modeling_textnet.py +++ b/src/transformers/models/textnet/modeling_textnet.py @@ -49,7 +49,7 @@ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. """ -BIT_INPUTS_DOCSTRING = r""" +TEXTNET_INPUTS_DOCSTRING = r""" Args: pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`BitImageProcessor.__call__`] @@ -61,7 +61,7 @@ return_dict (`bool`, *optional*): """ -BIT_PRETRAINED_MODEL_ARCHIVE_LIST = ["Raghavan/textnet-base"] +TEXTNET_PRETRAINED_MODEL_ARCHIVE_LIST = ["Raghavan/textnet-base"] def get_same_padding(kernel_size): @@ -429,7 +429,7 @@ def __init__(self, config): # initialize weights and apply final processing self.post_init() - @add_start_docstrings_to_model_forward("BIT_INPUTS_DOCSTRING") + @add_start_docstrings_to_model_forward(TEXTNET_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=BackboneOutput, config_class="") def forward( self, pixel_values: Tensor, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None diff --git a/tests/models/textnet/test_modeling_textnet.py b/tests/models/textnet/test_modeling_textnet.py index c19e5c8c2536..999ddc23c7c1 100644 --- a/tests/models/textnet/test_modeling_textnet.py +++ b/tests/models/textnet/test_modeling_textnet.py @@ -17,8 +17,10 @@ import unittest from transformers import TextNetConfig +from transformers.models.textnet.modeling_textnet import TEXTNET_PRETRAINED_MODEL_ARCHIVE_LIST from transformers.testing_utils import ( require_torch, + slow, torch_device, ) from transformers.utils import is_torch_available @@ -164,6 +166,14 @@ def create_and_check_model(self, config, pixel_values, labels): (self.batch_size, self.hidden_sizes[-1], 2, 2), ) + def create_and_check_for_image_classification(self, config, pixel_values, labels): + config.num_labels = self.num_labels + model = TextNetForImageClassification(config) + model.to(torch_device) + model.eval() + result = model(pixel_values, labels=labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + def prepare_config_and_inputs(self): pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) @@ -222,11 +232,6 @@ class TextNetModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase) if is_torch_available() else {} ) - # fx_compatible = False - # test_pruning = False - # test_resize_embeddings = False - # test_head_masking = False - # has_attentions = False fx_compatible = False test_pruning = False @@ -347,15 +352,15 @@ def test_model_is_small(self): def test_feed_forward_chunking(self): pass - # def test_for_image_classification(self): - # config_and_inputs = self.model_tester.prepare_config_and_inputs() - # self.model_tester.create_and_check_for_image_classification(*config_and_inputs) + def test_for_image_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_image_classification(*config_and_inputs) - # @slow - # def test_model_from_pretrained(self): - # for model_name in BIT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: - # model = BitModel.from_pretrained(model_name) - # self.assertIsNotNone(model) + @slow + def test_model_from_pretrained(self): + for model_name in TEXTNET_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = TextNetModel.from_pretrained(model_name) + self.assertIsNotNone(model) @require_torch From 
25724619e5e6a435022b862cc377ec7dcd23d9fd Mon Sep 17 00:00:00 2001 From: raghavanone Date: Thu, 9 Nov 2023 19:37:29 +0530 Subject: [PATCH 054/152] More cleanup --- .../models/textnet/configuration_textnet.py | 47 +++++++++---------- 1 file changed, 22 insertions(+), 25 deletions(-) diff --git a/src/transformers/models/textnet/configuration_textnet.py b/src/transformers/models/textnet/configuration_textnet.py index 33e2b4c3b25c..650c1bc4858f 100644 --- a/src/transformers/models/textnet/configuration_textnet.py +++ b/src/transformers/models/textnet/configuration_textnet.py @@ -37,52 +37,49 @@ class TextNetConfig(BackboneConfigMixin, PretrainedConfig): documentation from [`PretrainedConfig`] for more information. Args: - kernel_size (`int`, *optional*, defaults to 3): + kernel_size (`int`, *optional*, defaults to 3): The kernel size for the initial convolution layer. - stride (`int`, *optional*, defaults to 2): + stride (`int`, *optional*, defaults to 2): The stride for the initial convolution layer. - in_channels (`int`, *optional*, defaults to 3): + in_channels (`int`, *optional*, defaults to 3): The num of channels in input for the initial convolution layer. - out_channels (`int`, *optional*, defaults to 64): + out_channels (`int`, *optional*, defaults to 64): The num of channels in out for the initial convolution layer. - act_func (`str`, *optional*, defaults to `"relu"`): + act_func (`str`, *optional*, defaults to `"relu"`): The activation function for the initial convolution layer. - stage1_in_channels (`List[int]`, *optional*, defaults to `[64, 64, 64]`): + stage1_in_channels (`List[int]`, *optional*, defaults to `[64, 64, 64]`): The num of channels in input for list of conv in stage 1. - stage1_out_channels (`List[int]`, *optional*, defaults to `[64, 64, 64]`): + stage1_out_channels (`List[int]`, *optional*, defaults to `[64, 64, 64]`): The num of channels in output for list of conv in stage 1.Should be of same length os `stage1_in_channels` - stage1_kernel_size (`List[int]`, *optional*, defaults to `[[3, 3], [3, 3], [3, 3]]`): + stage1_kernel_size (`List[int]`, *optional*, defaults to `[[3, 3], [3, 3], [3, 3]]`): The kernel sizes for list of conv in stage 1.Should be of same length os `stage1_in_channels` - stage1_stride (`List[int]`, *optional*, defaults to `[1, 2, 1]`): + stage1_stride (`List[int]`, *optional*, defaults to `[1, 2, 1]`): The strides for list of conv in stage 1.Should be of same length os `stage1_in_channels` - stage2_in_channels (`List[int]`, *optional*, defaults to `[64, 128, 128, 128]`): + stage2_in_channels (`List[int]`, *optional*, defaults to `[64, 128, 128, 128]`): The num of channels in input for list of conv in stage 2. 
- stage2_out_channels (`List[int]`, *optional*, defaults to `[128, 128, 128, 128]`): + stage2_out_channels (`List[int]`, *optional*, defaults to `[128, 128, 128, 128]`): The num of channels in output for list of conv in stage 2.Should be of same length os `stage2_in_channels` stage2_kernel_size (`List[List[int]]`, *optional*, defaults to `[[3, 3], [1, 3], [3, 3], [3, 1]]`): - The kernel sizes for list of conv in stage 2.Should be of same length os - `stage2_in_channels` - stage2_stride (`List[int]`, *optional*, defaults to `[2, 1, 1, 1]`): + The kernel sizes for list of conv in stage 2.Should be of same length os `stage2_in_channels` + stage2_stride (`List[int]`, *optional*, defaults to `[2, 1, 1, 1]`): The strides for list of conv in stage 2.Should be of same length os `stage2_in_channels` - stage3_in_channels (`List[int]`, *optional*, defaults to `[128, 256, 256, 256]`): + stage3_in_channels (`List[int]`, *optional*, defaults to `[128, 256, 256, 256]`): The num of channels in input for list of conv in stage 3. - stage3_out_channels (`List[int]`, *optional*, defaults to `[256, 256, 256, 256]`): + stage3_out_channels (`List[int]`, *optional*, defaults to `[256, 256, 256, 256]`): The num of channels in output for list of conv in stage 3.Should be of same length os `stage3_in_channels` stage3_kernel_size (`List[List[int]]`, *optional*, defaults to `[[3, 3], [3, 3], [3, 1], [1, 3]]`): - The kernel sizes for list of conv in stage 3.Should be of same length os - `stage3_in_channels` - stage3_stride (`List[int]`, *optional*, defaults to `[2, 1, 1, 1]`): + The kernel sizes for list of conv in stage 3.Should be of same length os `stage3_in_channels` + stage3_stride (`List[int]`, *optional*, defaults to `[2, 1, 1, 1]`): The strides for list of conv in stage 3.Should be of same length os `stage3_in_channels` - stage4_in_channels (`List[int]`, *optional*, defaults to `[256, 512, 512, 512]`): + stage4_in_channels (`List[int]`, *optional*, defaults to `[256, 512, 512, 512]`): The strides for list of conv in stage 4.Should be of same length os `stage4_in_channels` - stage4_out_channels (`List[int]`, *optional*, defaults to `[512, 512, 512, 512]`): + stage4_out_channels (`List[int]`, *optional*, defaults to `[512, 512, 512, 512]`): The num of channels in output for list of conv in stage 4.Should be of same length os `stage4_in_channels` stage4_kernel_size (`List[List[int]]`, *optional*, defaults to `[[3, 3], [3, 1], [1, 3], [3, 3]]`): - The kernel sizes for list of conv in stage 4.Should be of same length os - `stage4_in_channels` - stage4_stride (`List[int]`, *optional*, defaults to `[2, 1, 1, 1]`): + The kernel sizes for list of conv in stage 4.Should be of same length os `stage4_in_channels` + stage4_stride (`List[int]`, *optional*, defaults to `[2, 1, 1, 1]`): The strides for list of conv in stage 4.Should be of same length os `stage4_in_channels` - hidden_sizes (`List[int]`, *optional*, defaults to `[64, 64, 128, 256, 512]`): + hidden_sizes (`List[int]`, *optional*, defaults to `[64, 64, 128, 256, 512]`): Dimensionality (hidden size) at each stage. initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 
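
A minimal usage sketch, not part of the patch series: it assumes the `TextNetConfig`, `FastConfig` and `FastForSceneTextRecognition` classes introduced by the commits above are importable from a local `transformers` build of this branch (note that PATCH 056 below removes the `fast` module again, so the sketch only applies to the tree as of the earlier commits). The `out_features`/`out_indices` values mirror the conversion script in this series; every other argument falls back to the documented defaults.

```python
from transformers import FastConfig, FastForSceneTextRecognition, TextNetConfig

# Backbone config using the documented defaults; expose all four stages,
# as the conversion script in this patch series does.
textnet_config = TextNetConfig(
    out_features=["stage1", "stage2", "stage3", "stage4"],
    out_indices=[1, 2, 3, 4],
)

# Wrap the backbone config in a FastConfig (backbone_config is only accepted
# together with use_timm_backbone=False) and build a randomly initialised model.
config = FastConfig(use_timm_backbone=False, backbone_config=textnet_config)
model = FastForSceneTextRecognition(config)
```
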
From c576f4a51c0a49b862d20e313036950bcd8d2e8f Mon Sep 17 00:00:00 2001 From: raghavanone Date: Thu, 9 Nov 2023 20:44:27 +0530 Subject: [PATCH 055/152] Fix build --- tests/models/textnet/test_modeling_textnet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/textnet/test_modeling_textnet.py b/tests/models/textnet/test_modeling_textnet.py index 999ddc23c7c1..957661e61144 100644 --- a/tests/models/textnet/test_modeling_textnet.py +++ b/tests/models/textnet/test_modeling_textnet.py @@ -17,7 +17,6 @@ import unittest from transformers import TextNetConfig -from transformers.models.textnet.modeling_textnet import TEXTNET_PRETRAINED_MODEL_ARCHIVE_LIST from transformers.testing_utils import ( require_torch, slow, @@ -41,6 +40,7 @@ TextNetModel, is_torch_available, ) + from transformers.models.textnet.modeling_textnet import TEXTNET_PRETRAINED_MODEL_ARCHIVE_LIST class TextNetModelTester: From 5d58c6767d8313fed9fd4fcf648a53bf7fb21b58 Mon Sep 17 00:00:00 2001 From: raghavanone Date: Fri, 10 Nov 2023 14:06:01 +0530 Subject: [PATCH 056/152] Remove all the references of fast model --- src/transformers/__init__.py | 1 - .../models/auto/image_processing_auto.py | 1 - src/transformers/models/auto/modeling_auto.py | 3 + src/transformers/models/fast/__init__.py | 53 -- .../models/fast/configuration_fast.py | 192 ----- .../fast/convert_fast_original_to_pytorch.py | 267 ------- .../models/fast/image_processing_fast.py | 467 ------------ src/transformers/models/fast/modeling_fast.py | 721 ------------------ .../utils/dummy_vision_objects.py | 7 - tests/models/fast/__init__.py | 0 .../models/fast/test_image_processing_fast.py | 162 ---- tests/models/fast/test_modeling_fast.py | 390 ---------- utils/check_repo.py | 1 - 13 files changed, 3 insertions(+), 2262 deletions(-) delete mode 100644 src/transformers/models/fast/__init__.py delete mode 100644 src/transformers/models/fast/configuration_fast.py delete mode 100644 src/transformers/models/fast/convert_fast_original_to_pytorch.py delete mode 100644 src/transformers/models/fast/image_processing_fast.py delete mode 100644 src/transformers/models/fast/modeling_fast.py delete mode 100644 tests/models/fast/__init__.py delete mode 100644 tests/models/fast/test_image_processing_fast.py delete mode 100644 tests/models/fast/test_modeling_fast.py diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 5c0d2bed5b5f..7cfffec8463b 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -1279,7 +1279,6 @@ _import_structure["models.dpt"].extend(["DPTFeatureExtractor", "DPTImageProcessor"]) _import_structure["models.efficientformer"].append("EfficientFormerImageProcessor") _import_structure["models.efficientnet"].append("EfficientNetImageProcessor") - _import_structure["models.fast"].extend(["FastImageProcessor"]) _import_structure["models.flava"].extend(["FlavaFeatureExtractor", "FlavaImageProcessor", "FlavaProcessor"]) _import_structure["models.fuyu"].extend(["FuyuImageProcessor", "FuyuProcessor"]) _import_structure["models.glpn"].extend(["GLPNFeatureExtractor", "GLPNImageProcessor"]) diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index 2fac0833c940..6244276b1d0b 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -62,7 +62,6 @@ ("dpt", "DPTImageProcessor"), ("efficientformer", "EfficientFormerImageProcessor"), ("efficientnet", 
"EfficientNetImageProcessor"), - ("fast", "FastImageProcessor"), ("flava", "FlavaImageProcessor"), ("focalnet", "BitImageProcessor"), ("fuyu", "FuyuImageProcessor"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index c1ecdee1578e..d396ccb21c4a 100755 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -95,7 +95,10 @@ ("ernie_m", "ErnieMModel"), ("esm", "EsmModel"), ("falcon", "FalconModel"), +<<<<<<< HEAD ("fastspeech2_conformer", "FastSpeech2ConformerModel"), +======= +>>>>>>> ae576e088 (Remove all the references of fast model) ("flaubert", "FlaubertModel"), ("flava", "FlavaModel"), ("fnet", "FNetModel"), diff --git a/src/transformers/models/fast/__init__.py b/src/transformers/models/fast/__init__.py deleted file mode 100644 index dedc491f6c59..000000000000 --- a/src/transformers/models/fast/__init__.py +++ /dev/null @@ -1,53 +0,0 @@ -# coding=utf-8 -# Copyright 2023 the Fast authors and HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from typing import TYPE_CHECKING - -from ...utils import ( - OptionalDependencyNotAvailable, - _LazyModule, - is_torch_available, -) - - -_import_structure = { - "configuration_fast": ["FAST_PRETRAINED_CONFIG_ARCHIVE_MAP", "FastConfig"], - "image_processing_fast": ["FastImageProcessor"], -} - -try: - if not is_torch_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["modeling_fast"] = ["FastForSceneTextRecognition", "FastPreTrainedModel"] - -if TYPE_CHECKING: - from .configuration_fast import FAST_PRETRAINED_CONFIG_ARCHIVE_MAP, FastConfig - from .image_processing_fast import FastImageProcessor - - try: - if not is_torch_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .modeling_fast import FastForSceneTextRecognition, FastPreTrainedModel - - -else: - import sys - - sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/src/transformers/models/fast/configuration_fast.py b/src/transformers/models/fast/configuration_fast.py deleted file mode 100644 index 6a5f3a425fa6..000000000000 --- a/src/transformers/models/fast/configuration_fast.py +++ /dev/null @@ -1,192 +0,0 @@ -# coding=utf-8 -# Copyright The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-""" Fast model configuration""" -from transformers import CONFIG_MAPPING, PretrainedConfig -from transformers.utils import logging - - -logger = logging.get_logger(__name__) - -FAST_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "fast_base_tt_800_finetune_ic17mlt": ( - "https://huggingface.co/Raghavan/fast_base_tt_800_finetune_ic17mlt/raw/main/config.json" - ), -} - - -class FastConfig(PretrainedConfig): - r""" - This is the configuration class to store the configuration of a [`FastForSceneTextRecognition`]. It is used to - instantiate a FastForSceneTextRecognition model according to the specified arguments, defining the model - architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the - FastForSceneTextRecognition. - [Raghavan/fast_base_tt_800_finetune_ic17mlt](https://huggingface.co/Raghavan/fast_base_tt_800_finetune_ic17mlt) - - Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PretrainedConfig`] for more information. - - Args: - use_timm_backbone (`bool`, *optional*, defaults to `True`): - Whether or not to use the `timm` library for the backbone. If set to `False`, will use the [`AutoBackbone`] - API. - backbone_config (`PretrainedConfig` or `dict`, *optional*): - The configuration of the backbone model. Only used in case `use_timm_backbone` is set to `False` in which - case it will default to `ResNetConfig()`. - num_channels (`int`, *optional*, defaults to 3): - The number of input channels. - neck_in_channels (`List[int]`, *optional*, defaults to `[64, 128, 256, 512]`): - Denotes the in channels of FASTRepConvLayer in neck module. - neck_out_channels (`List[int]`, *optional*, defaults to `[128, 128, 128, 128]`): - Denotes the out channels of FASTRepConvLayer in neck module. Should be of same length of `neck_in_channels` - neck_kernel_size (`List[int]`, *optional*, defaults to `[[3, 3], [3, 3], [3, 3], [3, 3]]`): - Denotes the kernel_size of FASTRepConvLayer in neck module. Should be of same length of `neck_in_channels` - neck_stride (`List[int]`, *optional*, defaults to `[1, 1, 1, 1]`): - Denotes the neck_stride of FASTRepConvLayer in neck module. Should be of same length of `neck_in_channels` - head_pooling_size (`int`, *optional*, defaults to 9): - Denotes the pooling size of head layer - head_dropout_ratio (`int`, *optional*, defaults to 0): - Denotes the dropout ratio used in dropout layer of head layer.. - head_conv_in_channels (`int`, *optional*, defaults to 512): - Denotes the in channels of first conv layer in head layer. - head_conv_out_channels (`int`, *optional*, defaults to 128): - Denotes the out channels of first conv layer in head layer. - head_conv_kernel_size (`List[int]`, *optional*, defaults to `[3, 3]`): - Denotes the conv kernel size of first conv layer in head layer. - head_conv_stride (`int`, *optional*, defaults to 1): - Denotes the conv stride of first conv layer in head layer. - head_final_kernel_size (`int`, *optional*, defaults to 1): - Denotes the conv kernel size of final conv layer in head layer. - head_final_stride (`int`, *optional*, defaults to 1): - Denotes the conv stride of final conv layer in head layer. - head_final_bias (`bool`, *optional*, defaults to `False`): - Denotes the conv bais of final conv layer in head layer. - head_final_in_channels (`int`, *optional*, defaults to 128): - Denotes the in channels of final conv layer in head layer. 
- head_final_out_channels (`int`, *optional*, defaults to 5): - Denotes the out channels of final conv layer in head layer. - backbone (`str`, *optional*, defaults to `"resnet50"`): - Name of convolutional backbone to use in case `use_timm_backbone` = `True`. Supports any convolutional - backbone from the timm package. For a list of all available models, see [this - page](https://rwightman.github.io/pytorch-image-models/#load-a-pretrained-model). - use_pretrained_backbone (`bool`, *optional*, defaults to `True`): - Whether to use pretrained weights for the backbone. Only supported when `use_timm_backbone` = `True`. - dilation (`bool`, *optional*, defaults to `False`): - Whether to replace stride with dilation in the last convolutional block (DC5). Only supported when - `use_timm_backbone` = `True`. - initializer_range (`float`, *optional*, defaults to 0.02): - Relative weight of the generalized IoU loss of the bounding box in the Hungarian matching cost. - - Examples: - - ```python - >>> from transformers import FastConfig, FastForSceneTextRecognition - - >>> # Initializing a Fast Config - >>> configuration = FastConfig() - - >>> # Initializing a model (with random weights) - >>> model = FastForSceneTextRecognition(configuration) - - >>> # Accessing the model configuration - >>> configuration = model.config - ```""" - r""" - [Raghavan/fast_base_tt_800_finetune_ic17mlt](https://huggingface.co/Raghavan/fast_base_tt_800_finetune_ic17mlt) - """ - - def __init__( - self, - use_timm_backbone=True, - backbone_config=None, - num_channels=3, - neck_in_channels=[64, 128, 256, 512], - neck_out_channels=[128, 128, 128, 128], - neck_kernel_size=[[3, 3], [3, 3], [3, 3], [3, 3]], - neck_stride=[1, 1, 1, 1], - head_pooling_size=9, - head_dropout_ratio=0, - head_conv_in_channels=512, - head_conv_out_channels=128, - head_conv_kernel_size=[3, 3], - head_conv_stride=1, - head_final_kernel_size=1, - head_final_stride=1, - head_final_bias=False, - head_final_in_channels=128, - head_final_out_channels=5, - backbone="resnet50", - use_pretrained_backbone=True, - dilation=False, - initializer_range=0.02, - **kwargs, - ): - super().__init__(**kwargs) - - if backbone_config is not None and use_timm_backbone: - raise ValueError("You can't specify both `backbone_config` and `use_timm_backbone`.") - - if not use_timm_backbone: - if backbone_config is None: - logger.info( - "`backbone_config` is `None`. Initializing the config with the default `TextNet` backbone." 
- ) - backbone_config = CONFIG_MAPPING["textnet"](out_features=["stage1", "stage2", "stage3", "stage4"]) - elif isinstance(backbone_config, dict): - backbone_model_type = backbone_config.get("model_type") - config_class = CONFIG_MAPPING[backbone_model_type] - backbone_config = config_class.from_dict(backbone_config) - # set timm attributes to None - dilation, backbone, use_pretrained_backbone = None, None, None - - self.use_timm_backbone = use_timm_backbone - self.backbone_config = backbone_config - self.num_channels = num_channels - - self.neck_in_channels = neck_in_channels - self.neck_out_channels = neck_out_channels - self.neck_kernel_size = neck_kernel_size - self.neck_stride = neck_stride - - self.head_pooling_size = head_pooling_size - self.head_dropout_ratio = head_dropout_ratio - - self.head_conv_in_channels = head_conv_in_channels - self.head_conv_out_channels = head_conv_out_channels - self.head_conv_kernel_size = head_conv_kernel_size - self.head_conv_stride = head_conv_stride - - self.head_final_kernel_size = head_final_kernel_size - self.head_final_stride = head_final_stride - self.head_final_bias = head_final_bias - self.head_final_in_channels = head_final_in_channels - self.head_final_out_channels = head_final_out_channels - - self.backbone = backbone - self.use_pretrained_backbone = use_pretrained_backbone - self.dilation = dilation - - self.initializer_range = initializer_range - - @classmethod - def from_backbone_config(cls, backbone_config: PretrainedConfig, **kwargs): - """Instantiate a [`FastConfig`] (or a derived class) from a pre-trained backbone model configuration. - - Args: - backbone_config ([`PretrainedConfig`]): - The backbone configuration. - Returns: - [`FastConfig`]: An instance of a configuration object - """ - return cls(backbone_config=backbone_config, **kwargs) diff --git a/src/transformers/models/fast/convert_fast_original_to_pytorch.py b/src/transformers/models/fast/convert_fast_original_to_pytorch.py deleted file mode 100644 index 6c36af421153..000000000000 --- a/src/transformers/models/fast/convert_fast_original_to_pytorch.py +++ /dev/null @@ -1,267 +0,0 @@ -# coding=utf-8 -# Copyright 2023 Microsoft Research and The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import argparse -import copy -import json -import logging - -import requests -import torch -from PIL import Image - -from transformers import FastConfig, FastForSceneTextRecognition, TextNetConfig -from transformers.models.fast.image_processing_fast import FastImageProcessor - - -tiny_config_url = "https://raw.githubusercontent.com/czczup/FAST/main/config/fast/nas-configs/fast_tiny.config" -small_config_url = "https://raw.githubusercontent.com/czczup/FAST/main/config/fast/nas-configs/fast_small.config" -base_config_url = "https://raw.githubusercontent.com/czczup/FAST/main/config/fast/nas-configs/fast_base.config" - -rename_key_mappings = {"bn": "batch_norm", "hor": "horizontal", "ver": "vertical", "det_head": "text_detection_head"} - - -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -def prepare_config(size_config_url, pooling_size, min_area, bbox_type, loss_bg): - config_dict = json.loads(requests.get(size_config_url).text) - - backbone_config = {} - for stage_ix in range(1, 5): - stage_config = config_dict[f"stage{stage_ix}"] - - merged_dict = {} - - # Iterate through the list of dictionaries - for layer in stage_config: - for key, value in layer.items(): - if key != "name": - # Check if the key is already in the merged_dict - if key in merged_dict: - merged_dict[key].append(value) - else: - # If the key is not in merged_dict, create a new list with the value - merged_dict[key] = [value] - backbone_config[f"stage{stage_ix}"] = merged_dict - - neck_in_channels = [] - neck_out_channels = [] - neck_kernel_size = [] - neck_stride = [] - neck_dilation = [] - neck_groups = [] - - for i in range(1, 5): - layer_key = f"reduce_layer{i}" - layer_dict = config_dict["neck"].get(layer_key) - - if layer_dict: - # Append values to the corresponding lists - neck_in_channels.append(layer_dict["in_channels"]) - neck_out_channels.append(layer_dict["out_channels"]) - neck_kernel_size.append(layer_dict["kernel_size"]) - neck_stride.append(layer_dict["stride"]) - neck_dilation.append(layer_dict["dilation"]) - neck_groups.append(layer_dict["groups"]) - - textnet_config = TextNetConfig( - kernel_size=config_dict["first_conv"]["kernel_size"], - stride=config_dict["first_conv"]["stride"], - dilation=config_dict["first_conv"]["dilation"], - groups=config_dict["first_conv"]["groups"], - bias=config_dict["first_conv"]["bias"], - has_shuffle=config_dict["first_conv"]["has_shuffle"], - in_channels=config_dict["first_conv"]["in_channels"], - out_channels=config_dict["first_conv"]["out_channels"], - use_bn=config_dict["first_conv"]["use_bn"], - act_func=config_dict["first_conv"]["act_func"], - dropout_rate=config_dict["first_conv"]["dropout_rate"], - ops_order=config_dict["first_conv"]["ops_order"], - stage1_in_channels=backbone_config["stage1"]["in_channels"], - stage1_out_channels=backbone_config["stage1"]["out_channels"], - stage1_kernel_size=backbone_config["stage1"]["kernel_size"], - stage1_stride=backbone_config["stage1"]["stride"], - stage1_dilation=backbone_config["stage1"]["dilation"], - stage1_groups=backbone_config["stage1"]["groups"], - stage2_in_channels=backbone_config["stage2"]["in_channels"], - stage2_out_channels=backbone_config["stage2"]["out_channels"], - stage2_kernel_size=backbone_config["stage2"]["kernel_size"], - stage2_stride=backbone_config["stage2"]["stride"], - stage2_dilation=backbone_config["stage2"]["dilation"], - stage2_groups=backbone_config["stage2"]["groups"], - 
stage3_in_channels=backbone_config["stage3"]["in_channels"], - stage3_out_channels=backbone_config["stage3"]["out_channels"], - stage3_kernel_size=backbone_config["stage3"]["kernel_size"], - stage3_stride=backbone_config["stage3"]["stride"], - stage3_dilation=backbone_config["stage3"]["dilation"], - stage3_groups=backbone_config["stage3"]["groups"], - stage4_in_channels=backbone_config["stage4"]["in_channels"], - stage4_out_channels=backbone_config["stage4"]["out_channels"], - stage4_kernel_size=backbone_config["stage4"]["kernel_size"], - stage4_stride=backbone_config["stage4"]["stride"], - stage4_dilation=backbone_config["stage4"]["dilation"], - stage4_groups=backbone_config["stage4"]["groups"], - out_features=["stage1", "stage2", "stage3", "stage4"], - out_indices=[1, 2, 3, 4], - ) - - return FastConfig( - use_timm_backbone=False, - backbone_config=textnet_config, - neck_in_channels=neck_in_channels, - neck_out_channels=neck_out_channels, - neck_kernel_size=neck_kernel_size, - neck_stride=neck_stride, - neck_dilation=neck_dilation, - neck_groups=neck_groups, - head_pooling_size=pooling_size, - head_dropout_ratio=0.1, - head_conv_in_channels=config_dict["head"]["conv"]["in_channels"], - head_conv_out_channels=config_dict["head"]["conv"]["out_channels"], - head_conv_kernel_size=config_dict["head"]["conv"]["kernel_size"], - head_conv_stride=config_dict["head"]["conv"]["stride"], - head_conv_dilation=config_dict["head"]["conv"]["dilation"], - head_conv_groups=config_dict["head"]["conv"]["groups"], - head_final_kernel_size=config_dict["head"]["final"]["kernel_size"], - head_final_stride=config_dict["head"]["final"]["stride"], - head_final_dilation=config_dict["head"]["final"]["dilation"], - head_final_groups=config_dict["head"]["final"]["groups"], - head_final_bias=config_dict["head"]["final"]["bias"], - head_final_has_shuffle=config_dict["head"]["final"]["has_shuffle"], - head_final_in_channels=config_dict["head"]["final"]["in_channels"], - head_final_out_channels=config_dict["head"]["final"]["out_channels"], - head_final_use_bn=config_dict["head"]["final"]["use_bn"], - head_final_act_func=config_dict["head"]["final"]["act_func"], - head_final_dropout_rate=config_dict["head"]["final"]["dropout_rate"], - head_final_ops_order=config_dict["head"]["final"]["ops_order"], - min_area=min_area, - bbox_type=bbox_type, - loss_bg=loss_bg, - ) - - -def get_small_model_config(): - pass - - -def get_base_model_config(): - pass - - -def convert_fast_checkpoint( - checkpoint_url, checkpoint_config_url, pytorch_dump_folder_path, validate_logits, save_backbone_separately -): - response = requests.get(checkpoint_config_url) - content = response.text - namespace = {} - - exec(content, namespace) - - model_config = namespace.get("model") - test_config = namespace.get("test_cfg", None) - data_config = namespace.get("data") - - min_area = 250 - bbox_type = "rect" - loss_bg = False - if test_config is not None: - min_area = test_config.get("min_area", min_area) - bbox_type = test_config.get("bbox_type", bbox_type) - loss_bg = test_config.get("loss_emb", None) == "EmbLoss_v2" - - if "tiny" in model_config["backbone"]["config"]: - config = prepare_config( - tiny_config_url, model_config["detection_head"]["pooling_size"], min_area, bbox_type, loss_bg - ) - elif "small" in model_config["backbone"]["config"]: - config = prepare_config( - small_config_url, model_config["detection_head"]["pooling_size"], min_area, bbox_type, loss_bg - ) - else: - config = prepare_config( - base_config_url, 
model_config["detection_head"]["pooling_size"], min_area, bbox_type, loss_bg - ) - size = 640 - if "train" in data_config: - if "short_size" in data_config["train"]: - size = data_config["train"]["short_size"] - model = FastForSceneTextRecognition(config) - fast_image_processor = FastImageProcessor( - size={"height": size, "width": size}, - min_area=config.min_area, - bbox_type=config.bbox_type, - pooling_size=config.head_pooling_size, - ) - state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu", check_hash=True)["ema"] - state_dict_changed = copy.deepcopy(state_dict) - for key in state_dict: - val = state_dict_changed.pop(key) - new_key = key.replace("module.", "").replace("backbone.", "backbone.textnet.") - for search, replacement in rename_key_mappings.items(): - if search in new_key: - new_key = new_key.replace(search, replacement) - state_dict_changed[new_key] = val - model.load_state_dict(state_dict_changed) - - model.save_pretrained(pytorch_dump_folder_path) - if save_backbone_separately: - model.backbone.save_pretrained(pytorch_dump_folder_path + "/textnet/") - fast_image_processor.save_pretrained(pytorch_dump_folder_path) - logging.info("The converted weights are save here : " + pytorch_dump_folder_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - parser.add_argument( - "--checkpoint_url", - default="https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_base_patch16_224_pt22k_ft22kto1k.pth", - type=str, - help="URL to the original PyTorch checkpoint (.pth file).", - ) - parser.add_argument( - "--checkpoint_config_url", - default="https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_base_patch16_224_pt22k_ft22kto1k.pth", - type=str, - help="URL to the original PyTorch checkpoint (.pth file).", - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the folder to output PyTorch model." - ) - parser.add_argument( - "--validate_logits", - default=False, - type=bool, - help="whether to assert logits outputs", - ) - parser.add_argument( - "--save_backbone_separately", - default=False, - type=bool, - help="whether to assert logits outputs", - ) - args = parser.parse_args() - - convert_fast_checkpoint( - args.checkpoint_url, - args.checkpoint_config_url, - args.pytorch_dump_folder_path, - args.validate_logits, - args.save_backbone_separately, - ) diff --git a/src/transformers/models/fast/image_processing_fast.py b/src/transformers/models/fast/image_processing_fast.py deleted file mode 100644 index 5e70a83ac58a..000000000000 --- a/src/transformers/models/fast/image_processing_fast.py +++ /dev/null @@ -1,467 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Image processor class for FAST.""" -import math -from typing import Any, Dict, List, Optional, Union - -from ...utils.import_utils import is_cv2_available - - -if is_cv2_available(): - import cv2 -import numpy as np - -from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict -from ...image_transforms import resize, to_channel_dimension_format -from ...image_utils import ( - ChannelDimension, - ImageInput, - PILImageResampling, - infer_channel_dimension_format, - is_scaled_image, - make_list_of_images, - to_numpy_array, - valid_images, -) -from ...utils import ( - IMAGENET_DEFAULT_MEAN, - IMAGENET_DEFAULT_STD, - TensorType, - is_torch_available, - is_vision_available, - logging, -) - - -if is_vision_available(): - import PIL - -if is_torch_available(): - import torch - import torch.nn as nn - import torch.nn.functional as F - -logger = logging.get_logger(__name__) - - -class FastImageProcessor(BaseImageProcessor): - r""" - Constructs a FAST image processor. - - Args: - do_resize (`bool`, *optional*, defaults to `True`): - Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the - `do_resize` parameter in the `preprocess` method. - size (`Dict[str, int]` *optional*, defaults to `{"height": 256, "width": 256}`): - Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess` - method. - resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`): - Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the - `preprocess` method. - do_center_crop (`bool`, *optional*, defaults to `False`): - Whether to center crop the image. If the input size is smaller than `crop_size` along any edge, the image - is padded with 0's and then center cropped. Can be overridden by the `do_center_crop` parameter in the - `preprocess` method. - crop_size (`Dict[str, int]`, *optional*, defaults to `{"height": 224, "width": 224}`): - Desired output size when applying center-cropping. Only has an effect if `do_center_crop` is set to `True`. - Can be overridden by the `crop_size` parameter in the `preprocess` method. - rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): - Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the - `preprocess` method. - do_rescale (`bool`, *optional*, defaults to `True`): - Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale` - parameter in the `preprocess` method. - do_normalize (`bool`, *optional*, defaults to `True`): - Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess` - method. - image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`): - The mean to use if normalizing the image. This is a float or list of floats of length of the number of - channels of the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. - image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`): - The standard deviation to use if normalizing the image. This is a float or list of floats of length of the - number of channels of the image. Can be overridden by the `image_std` parameter in the `preprocess` method. 
- min_area (`int`, *optional*, defaults to 200): - Threshold for min area for results - pooling_size (`int`, *optional*, defaults to 9): - Pooling size for text detection - """ - - model_input_names = ["pixel_values"] - - def __init__( - self, - do_resize: bool = True, - size: Dict[str, int] = None, - resample: PILImageResampling = PILImageResampling.BICUBIC, - do_center_crop: bool = False, - crop_size: Dict[str, int] = None, - rescale_factor: Union[int, float] = 1 / 255, - do_rescale: bool = True, - do_normalize: bool = True, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - min_area: int = 200, - pooling_size: int = 9, - **kwargs, - ) -> None: - super().__init__(**kwargs) - size = size if size is not None else {"height": 640, "width": 640} - size = get_size_dict(size) - crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224} - crop_size = get_size_dict(crop_size, param_name="crop_size") - self.do_resize = do_resize - self.size = size - self.resample = resample - self.do_center_crop = do_center_crop - self.crop_size = crop_size - self.do_rescale = do_rescale - self.rescale_factor = rescale_factor - self.do_normalize = do_normalize - self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN - self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD - self.min_area = min_area - # self.threshold = threshold - self.pooling_size = pooling_size - - @classmethod - def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs): - """ - Overrides the `from_dict` method from the base class to make sure `reduce_labels` is updated if image processor - is created using from_dict and kwargs e.g. `FastImageProcessor.from_pretrained(checkpoint, reduce_labels=True)` - """ - image_processor_dict = image_processor_dict.copy() - if "reduce_labels" in kwargs: - image_processor_dict["reduce_labels"] = kwargs.pop("reduce_labels") - return super().from_dict(image_processor_dict, **kwargs) - - def resize( - self, - image: np.ndarray, - size: Dict[str, int], - resample: PILImageResampling = PILImageResampling.BICUBIC, - data_format: Optional[Union[str, ChannelDimension]] = None, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs, - ) -> np.ndarray: - """ - Resize an image to (size["height"], size["width"]). - - Args: - image (`np.ndarray`): - Image to resize. - size (`Dict[str, int]`): - Size of the output image. - resample (`PILImageResampling`, *optional*, defaults to `PIL.Image.BICUBIC`): - Resampling filter to use when resiizing the image. - data_format (`str` or `ChannelDimension`, *optional*): - The channel dimension format of the image. If not provided, it will be the same as the input image. - input_data_format (`str` or `ChannelDimension`, *optional*): - The channel dimension format of the input image. If not provided, it will be inferred. - """ - size = get_size_dict(size, default_to_square=True, param_name="size") - if "height" not in size or "width" not in size: - raise ValueError(f"The `size` argument must contain `height` and `width` keys. 
Got {size.keys()}") - return resize( - image, - size=(size["height"], size["width"]), - resample=resample, - data_format=data_format, - input_data_format=input_data_format, - **kwargs, - ) - - def _preprocess( - self, - image: ImageInput, - do_resize: bool = None, - size: Dict[str, int] = None, - resample: PILImageResampling = None, - do_center_crop: bool = None, - crop_size: Dict[str, int] = None, - do_rescale: bool = None, - rescale_factor: float = None, - do_normalize: bool = None, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - ): - if do_resize: - image = self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format) - - if do_center_crop: - image = self.center_crop(image=image, size=crop_size, input_data_format=input_data_format) - - if do_rescale: - image = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) - - if do_normalize: - image = self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format) - - return image - - def _preprocess_image( - self, - image: ImageInput, - do_resize: bool = None, - size: Dict[str, int] = None, - resample: PILImageResampling = None, - do_center_crop: bool = None, - crop_size: Dict[str, int] = None, - do_rescale: bool = None, - rescale_factor: float = None, - do_normalize: bool = None, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - data_format: Optional[Union[str, ChannelDimension]] = None, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - ) -> np.ndarray: - """Preprocesses a single image.""" - # All transformations expect numpy arrays. - image = to_numpy_array(image) - if is_scaled_image(image) and do_rescale: - logger.warning_once( - "It looks like you are trying to rescale already rescaled images. If the input" - " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." - ) - if input_data_format is None: - input_data_format = infer_channel_dimension_format(image) - image = self._preprocess( - image, - do_resize=do_resize, - size=size, - resample=resample, - do_center_crop=do_center_crop, - crop_size=crop_size, - do_rescale=do_rescale, - rescale_factor=rescale_factor, - do_normalize=do_normalize, - image_mean=image_mean, - image_std=image_std, - input_data_format=input_data_format, - ) - if data_format is not None: - image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) - return image - - def preprocess( - self, - images: ImageInput, - do_resize: bool = None, - size: Dict[str, int] = None, - resample: PILImageResampling = None, - do_center_crop: bool = None, - crop_size: Dict[str, int] = None, - do_rescale: bool = None, - rescale_factor: float = None, - do_normalize: bool = None, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - return_tensors: Optional[Union[str, TensorType]] = None, - data_format: ChannelDimension = ChannelDimension.FIRST, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs, - ) -> PIL.Image.Image: - """ - Preprocess an image or batch of images. - - Args: - images (`ImageInput`): - Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. 
If - passing in images with pixel values between 0 and 1, set `do_rescale=False`. - do_resize (`bool`, *optional*, defaults to `self.do_resize`): - Whether to resize the image. - size (`Dict[str, int]`, *optional*, defaults to `self.size`): - Size of the image after resizing. - resample (`int`, *optional*, defaults to `self.resample`): - Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`, Only - has an effect if `do_resize` is set to `True`. - do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`): - Whether to center crop the image. - crop_size (`Dict[str, int]`, *optional*, defaults to `self.crop_size`): - Size of the image after center crop. If one edge the image is smaller than `crop_size`, it will be - padded with zeros and then cropped - do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): - Whether to rescale the image values between [0 - 1]. - rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): - Rescale factor to rescale the image by if `do_rescale` is set to `True`. - do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): - Whether to normalize the image. - image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): - Image mean. - image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): - Image standard deviation. - return_tensors (`str` or `TensorType`, *optional*): - The type of tensors to return. Can be one of: - - Unset: Return a list of `np.ndarray`. - - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. - - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. - - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. - - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. - data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): - The channel dimension format for the output image. Can be one of: - - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - - Unset: Use the channel dimension format of the input image. - input_data_format (`ChannelDimension` or `str`, *optional*): - The channel dimension format for the input image. If unset, the channel dimension format is inferred - from the input image. Can be one of: - - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. 
- """ - do_resize = do_resize if do_resize is not None else self.do_resize - size = size if size is not None else self.size - size = get_size_dict(size, default_to_square=True, param_name="size") - resample = resample if resample is not None else self.resample - do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop - crop_size = crop_size if crop_size is not None else self.crop_size - crop_size = get_size_dict(crop_size, default_to_square=True, param_name="crop_size") - do_rescale = do_rescale if do_rescale is not None else self.do_rescale - rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor - do_normalize = do_normalize if do_normalize is not None else self.do_normalize - image_mean = image_mean if image_mean is not None else self.image_mean - image_std = image_std if image_std is not None else self.image_std - - images = make_list_of_images(images) - - if not valid_images(images): - raise ValueError( - "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " - "torch.Tensor, tf.Tensor or jax.ndarray." - ) - - if do_resize and size is None or resample is None: - raise ValueError("Size and resample must be specified if do_resize is True.") - - if do_center_crop and crop_size is None: - raise ValueError("Crop size must be specified if do_center_crop is True.") - - if do_rescale and rescale_factor is None: - raise ValueError("Rescale factor must be specified if do_rescale is True.") - - if do_normalize and (image_mean is None or image_std is None): - raise ValueError("Image mean and std must be specified if do_normalize is True.") - - images = [ - self._preprocess_image( - image=img, - do_resize=do_resize, - do_center_crop=do_center_crop, - do_rescale=do_rescale, - do_normalize=do_normalize, - resample=resample, - size=size, - rescale_factor=rescale_factor, - crop_size=crop_size, - image_mean=image_mean, - image_std=image_std, - data_format=data_format, - input_data_format=input_data_format, - ) - for img in images - ] - - data = {"pixel_values": images} - - return BatchFeature(data=data, tensor_type=return_tensors) - - def _max_pooling(self, x, scale=1): - if scale == 1: - x = nn.MaxPool2d(kernel_size=self.pooling_size, stride=1, padding=(self.pooling_size - 1) // 2)(x) - elif scale == 2: - x = nn.MaxPool2d(kernel_size=self.pooling_size // 2 + 1, stride=1, padding=(self.pooling_size // 2) // 2)( - x - ) - return x - - def post_process_text_detection(self, output, target_sizes, threshold, bbox_type="rect"): - scale = 2 - img_size = (self.size["height"], self.size["width"]) - out = output["last_hidden_state"] - batch_size = out.size(0) - final_results = {} - - texts = F.interpolate( - out[:, 0:1, :, :], size=(img_size[0] // scale, img_size[1] // scale), mode="nearest" - ) # B*1*320*320 - texts = self._max_pooling(texts, scale=scale) # B*1*320*320 - score_maps = torch.sigmoid_(texts) # B*1*320*320 - score_maps = F.interpolate(score_maps, size=(img_size[0], img_size[1]), mode="nearest") # B*1*640*640 - score_maps = score_maps.squeeze(1) # B*640*640 - - kernels = (out[:, 0, :, :] > 0).to(torch.uint8) # B*160*160 - labels_ = [] - for kernel in kernels.numpy(): - ret, label_ = cv2.connectedComponents(kernel) - labels_.append(label_) - labels_ = np.array(labels_) - labels_ = torch.from_numpy(labels_) - labels = labels_.unsqueeze(1).to(torch.float32) # B*1*160*160 - labels = F.interpolate( - labels, size=(img_size[0] // scale, img_size[1] // scale), mode="nearest" - ) # B*1*320*320 - labels = self._max_pooling(labels, 
scale=scale) - labels = F.interpolate(labels, size=(img_size[0], img_size[1]), mode="nearest") # B*1*640*640 - labels = labels.squeeze(1).to(torch.int32) # B*640*640 - - keys = [torch.unique(labels_[i], sorted=True) for i in range(batch_size)] - - final_results.update({"kernels": kernels.data.cpu()}) - - results = [] - for i in range(batch_size): - org_img_size = target_sizes[i] - scales = (float(org_img_size[1]) / float(img_size[1]), float(org_img_size[0]) / float(img_size[0])) - - bboxes, scores = self.generate_bbox( - keys[i], labels[i], score_maps[i], scales, threshold, bbox_type=bbox_type - ) - results.append({"bboxes": bboxes, "scores": scores}) - final_results.update({"results": results}) - - return results - - def generate_bbox(self, keys, label, score, scales, threshold, bbox_type): - label_num = len(keys) - bboxes = [] - scores = [] - for index in range(1, label_num): - i = keys[index] - ind = label == i - ind_np = ind.data.cpu().numpy() - points = np.array(np.where(ind_np)).transpose((1, 0)) - if points.shape[0] < self.min_area: - label[ind] = 0 - continue - score_i = score[ind].mean().item() - if score_i < threshold: - label[ind] = 0 - continue - - if bbox_type == "rect": - rect = cv2.minAreaRect(points[:, ::-1]) - alpha = math.sqrt(math.sqrt(points.shape[0] / (rect[1][0] * rect[1][1]))) - rect = (rect[0], (rect[1][0] * alpha, rect[1][1] * alpha), rect[2]) - bbox = cv2.boxPoints(rect) * scales - - elif bbox_type == "poly": - binary = np.zeros(label.shape, dtype="uint8") - binary[ind_np] = 1 - contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) - bbox = contours[0] * scales - bbox = bbox.astype("int32") - bboxes.append(bbox.reshape(-1).tolist()) - scores.append(score_i) - return bboxes, scores diff --git a/src/transformers/models/fast/modeling_fast.py b/src/transformers/models/fast/modeling_fast.py deleted file mode 100644 index 179aa9eb6402..000000000000 --- a/src/transformers/models/fast/modeling_fast.py +++ /dev/null @@ -1,721 +0,0 @@ -# coding=utf-8 -# Copyright 2021 Microsoft Research and The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" PyTorch FAST model.""" - -from dataclasses import dataclass -from typing import Dict, Optional, Tuple - -import numpy as np -import torch -import torch.nn as nn -import torch.nn.functional as F - -from ...utils import is_timm_available - - -if is_timm_available(): - from timm import create_model - - -from transformers import ( - AutoBackbone, - FastConfig, - PreTrainedModel, - add_start_docstrings, - is_timm_available, - requires_backends, -) -from transformers.utils import ModelOutput, add_start_docstrings_to_model_forward, replace_return_docstrings - - -_CONFIG_FOR_DOC = "FastConfig" - -FAST_START_DOCSTRING = r""" - This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it - as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and - behavior. 
- - Parameters: - config ([`FastConfig`]): Model configuration class with all the parameters of the model. - Initializing with a config file does not load the weights associated with the model, only the - configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. -""" - -FAST_FOR_CAPTIONING_INPUTS_DOCSTRING = r""" - Args: - pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): - Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See - [`FastImageProcessor.__call__`] for details. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. -""" - - -def get_same_padding(kernel_size): - if isinstance(kernel_size, tuple): - padding1 = get_same_padding(kernel_size[0]) - padding2 = get_same_padding(kernel_size[1]) - return padding1, padding2 - return kernel_size // 2 - - -class FASTConvLayer(nn.Module): - def __init__( - self, - in_channels, - out_channels, - kernel_size=3, - stride=1, - bias=False, - ): - super().__init__() - - self.kernel_size = kernel_size - self.stride = stride - - padding = get_same_padding(self.kernel_size) - # if isinstance(padding, int): - # padding *= self.dilation - # else: - # padding[0] *= self.dilation - # padding[1] *= self.dilation - - self.conv = nn.Conv2d( - in_channels, - out_channels, - kernel_size=kernel_size, - stride=stride, - padding=padding, - bias=False, - ) - - def forward(self, hidden_states): - if self.training: - if hasattr(self, "fused_conv"): - delattr(self, "fused_conv") - hidden_states = self.conv(hidden_states) - return hidden_states - else: - if not hasattr(self, "fused_conv"): - setattr(self, "fused_conv", self.conv) - hidden_states = self.fused_conv(hidden_states) - return hidden_states - - def fuse_conv_batch_norm(self, conv, batch_norm): - """During inference, the functionary of batch norm layers is turned off but - only the mean and var alone channels are used, which exposes the chance to fuse it with the preceding conv - layers to save computations and simplify network structures.""" - if isinstance(batch_norm, nn.Identity): - return conv - conv_w = conv.weight - conv_b = conv.bias if conv.bias is not None else torch.zeros_like(batch_norm.running_mean) - - factor = batch_norm.weight / torch.sqrt(batch_norm.running_var + batch_norm.eps) - conv.weight = nn.Parameter(conv_w * factor.reshape([conv.out_channels, 1, 1, 1])) - conv.bias = nn.Parameter((conv_b - batch_norm.running_mean) * factor + batch_norm.bias) - return conv - - -class FASTRepConvLayer(nn.Module): - def __init__(self, in_channels, out_channels, kernel_size, stride=1): - super().__init__() - - self.in_channels = in_channels - self.out_channels = out_channels - self.kernel_size = kernel_size - self.stride = stride - - padding = (int((kernel_size[0] - 1) / 2), int((kernel_size[1] - 1) / 2)) - - self.activation = nn.ReLU(inplace=True) - - self.main_conv = nn.Conv2d( - in_channels=in_channels, - out_channels=out_channels, - kernel_size=kernel_size, - stride=stride, - padding=padding, - bias=False, - ) - self.main_batch_norm = nn.BatchNorm2d(num_features=out_channels) - - ver_pad = (int((kernel_size[0] - 1) / 2), 0) - hor_pad = (0, int((kernel_size[1] - 1) / 2)) - - if kernel_size[1] != 1: - self.vertical_conv = nn.Conv2d( - in_channels=in_channels, - 
out_channels=out_channels, - kernel_size=(kernel_size[0], 1), - stride=stride, - padding=ver_pad, - bias=False, - ) - self.vertical_batch_norm = nn.BatchNorm2d(num_features=out_channels) - else: - self.vertical_conv, self.vertical_batch_norm = None, None - - if kernel_size[0] != 1: # kernel height > 1 -> add a horizontal convolution branch - self.horizontal_conv = nn.Conv2d( - in_channels=in_channels, - out_channels=out_channels, - kernel_size=(1, kernel_size[1]), - stride=stride, - padding=hor_pad, - bias=False, - ) - self.horizontal_batch_norm = nn.BatchNorm2d(num_features=out_channels) - else: - self.horizontal_conv, self.horizontal_batch_norm = None, None - - self.rbr_identity = ( - nn.BatchNorm2d(num_features=in_channels) if out_channels == in_channels and stride == 1 else None - ) - - def forward(self, hidden_states): - if self.training: - if hasattr(self, "fused_conv"): - self.__delattr__("fused_conv") - - main_outputs = self.main_conv(hidden_states) - main_outputs = self.main_batch_norm(main_outputs) - if self.vertical_conv is not None: - vertical_outputs = self.vertical_conv(hidden_states) - vertical_outputs = self.vertical_batch_norm(vertical_outputs) - else: - vertical_outputs = 0 - - if self.horizontal_conv is not None: - horizontal_outputs = self.horizontal_conv(hidden_states) - horizontal_outputs = self.horizontal_batch_norm(horizontal_outputs) - else: - horizontal_outputs = 0 - - if self.rbr_identity is None: - id_out = 0 - else: - id_out = self.rbr_identity(hidden_states) - - return self.activation(main_outputs + vertical_outputs + horizontal_outputs + id_out) - else: - if not hasattr(self, "fused_conv"): - self.prepare_for_eval() - return self.activation(self.fused_conv(hidden_states)) - - def _identity_to_conv(self, identity): - if identity is None: - return 0, 0 - if not hasattr(self, "id_tensor"): - input_dim = self.in_channels - kernel_value = np.zeros((self.in_channels, input_dim, 1, 1), dtype=np.float32) - for i in range(self.in_channels): - kernel_value[i, i % input_dim, 0, 0] = 1 - id_tensor = torch.from_numpy(kernel_value).to(identity.weight.device) - self.id_tensor = self._pad_to_mxn_tensor(id_tensor) - kernel = self.id_tensor - running_mean = identity.running_mean - running_var = identity.running_var - gamma = identity.weight - beta = identity.bias - eps = identity.eps - std = (running_var + eps).sqrt() - t = (gamma / std).reshape(-1, 1, 1, 1) - return kernel * t, beta - running_mean * gamma / std - - def _fuse_batch_norm_tensor(self, conv, batch_norm): - kernel = conv.weight - kernel = self._pad_to_mxn_tensor(kernel) - running_mean = batch_norm.running_mean - running_var = batch_norm.running_var - gamma = batch_norm.weight - beta = batch_norm.bias - eps = batch_norm.eps - std = (running_var + eps).sqrt() - t = (gamma / std).reshape(-1, 1, 1, 1) - return kernel * t, beta - running_mean * gamma / std - - def get_equivalent_kernel_bias(self): - kernel_mxn, bias_mxn = self._fuse_batch_norm_tensor(self.main_conv, self.main_batch_norm) - if self.vertical_conv is not None: - kernel_mx1, bias_mx1 = self._fuse_batch_norm_tensor(self.vertical_conv, self.vertical_batch_norm) - else: - kernel_mx1, bias_mx1 = 0, 0 - if self.horizontal_conv is not None: - kernel_1xn, bias_1xn = self._fuse_batch_norm_tensor(self.horizontal_conv, self.horizontal_batch_norm) - else: - kernel_1xn, bias_1xn = 0, 0 - kernel_id, bias_id = self._identity_to_conv(self.rbr_identity) - kernel_mxn = kernel_mxn + kernel_mx1 + kernel_1xn + kernel_id - bias_mxn = bias_mxn + bias_mx1 + bias_1xn + bias_id - return kernel_mxn, bias_mxn - - def 
_pad_to_mxn_tensor(self, kernel): - kernel_height, kernel_width = self.kernel_size - height, width = kernel.shape[2:] - pad_left_right = (kernel_width - width) // 2 - pad_top_down = (kernel_height - height) // 2 - return torch.nn.functional.pad(kernel, [pad_left_right, pad_left_right, pad_top_down, pad_top_down]) - - def prepare_for_eval(self): - kernel, bias = self.get_equivalent_kernel_bias() - self.fused_conv = nn.Conv2d( - in_channels=self.main_conv.in_channels, - out_channels=self.main_conv.out_channels, - kernel_size=self.main_conv.kernel_size, - stride=self.main_conv.stride, - padding=self.main_conv.padding, - bias=True, - ) - self.fused_conv.weight.data = kernel - self.fused_conv.bias.data = bias - for para in self.fused_conv.parameters(): - para.detach_() - - -class FastPreTrainedModel(PreTrainedModel): - """ - An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained - models. - """ - - config_class = FastConfig - base_model_prefix = "fast" - main_input_name = "pixel_values" - - def _init_weights(self, module): - if isinstance(module, (nn.Linear, nn.Conv2d)): - module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) - if module.bias is not None: - module.bias.data.zero_() - - -class FASTNeck(nn.Module): - def __init__(self, config): - super().__init__() - reduce_layer_configs = list( - zip( - config.neck_in_channels, - config.neck_out_channels, - config.neck_kernel_size, - config.neck_stride, - ) - ) - self.num_layers = len(reduce_layer_configs) - for layer_ix in range(0, len(reduce_layer_configs)): - setattr(self, f"reduce_layer{layer_ix + 1}", FASTRepConvLayer(*reduce_layer_configs[layer_ix])) - - def _upsample(self, layer_out, height, width): - return F.upsample(layer_out, size=(height, width), mode="bilinear") - - def forward(self, hidden_states): - first_layer_hidden = hidden_states[0] - first_layer_hidden = self.reduce_layer1(first_layer_hidden) - output_stages = [first_layer_hidden] - - for layer_ix in range(1, self.num_layers): - layer_out = getattr(self, f"reduce_layer{layer_ix + 1}")(hidden_states[layer_ix]) - _, _, height, width = first_layer_hidden.size() - layer_out = self._upsample(layer_out, height, width) - output_stages.append(layer_out) - - combined_hidden_states = torch.cat(output_stages, 1) - return combined_hidden_states - - -class FASTHead(nn.Module): - def __init__(self, config): - super().__init__() - self.conv = FASTRepConvLayer( - config.head_conv_in_channels, - config.head_conv_out_channels, - config.head_conv_kernel_size, - config.head_conv_stride, - ) - - self.final = FASTConvLayer( - config.head_final_in_channels, - config.head_final_out_channels, - config.head_final_kernel_size, - config.head_final_stride, - config.head_final_bias, - ) - - self.pooling_size = config.head_pooling_size - - self.pooling_1s = nn.MaxPool2d(kernel_size=self.pooling_size, stride=1, padding=(self.pooling_size - 1) // 2) - self.pooling_2s = nn.MaxPool2d( - kernel_size=self.pooling_size // 2 + 1, stride=1, padding=(self.pooling_size // 2) // 2 - ) - - if config.head_dropout_ratio > 0: - self.dropout = nn.Dropout2d(config.head_dropout_ratio) - else: - self.dropout = None - - def forward(self, hidden_states): - hidden_states = self.conv(hidden_states) - if self.dropout is not None: - hidden_states = self.dropout(hidden_states) - hidden_states = self.final(hidden_states) - return hidden_states - - def _max_pooling(self, x, scale=1): - if scale == 1: - x = self.pooling_1s(x) - elif scale == 2: - x = 
self.pooling_2s(x) - return x - - -def emb_loss( - emb, instance, kernel, training_mask, feature_dim=4, delta_v=0.5, delta_d=1.5, weights=(1.0, 1.0), bg_sample=False -): - training_mask = (training_mask > 0.5).long() - kernel = (kernel > 0.5).long() - instance = instance * training_mask - instance_kernel = (instance * kernel).view(-1) - instance = instance.view(-1) - emb = emb.view(feature_dim, -1) - - unique_labels, unique_ids = torch.unique(instance_kernel, sorted=True, return_inverse=True) - num_instance = unique_labels.size(0) - if num_instance <= 1: - return 0 - - emb_mean = emb.new_zeros((feature_dim, num_instance), dtype=torch.float32) - for i, lb in enumerate(unique_labels): - if lb == 0: - continue - ind_k = instance_kernel == lb - emb_mean[:, i] = torch.mean(emb[:, ind_k], dim=1) - - l_agg = emb.new_zeros(num_instance, dtype=torch.float32) # bug - for i, lb in enumerate(unique_labels): - if lb == 0: - continue - ind = instance == lb - emb_ = emb[:, ind] - dist = (emb_ - emb_mean[:, i : i + 1]).norm(p=2, dim=0) - dist = F.relu(dist - delta_v) ** 2 - l_agg[i] = torch.mean(torch.log(dist + 1.0)) - l_agg = torch.mean(l_agg[1:]) - - if num_instance > 2: - emb_interleave = emb_mean.permute(1, 0).repeat(num_instance, 1) - emb_band = emb_mean.permute(1, 0).repeat(1, num_instance).view(-1, feature_dim) - # print(seg_band) - - mask = (1 - torch.eye(num_instance, dtype=torch.int8)).view(-1, 1).repeat(1, feature_dim) - mask = mask.view(num_instance, num_instance, -1) - mask[0, :, :] = 0 - mask[:, 0, :] = 0 - mask = mask.view(num_instance * num_instance, -1) - # print(mask) - - dist = emb_interleave - emb_band - dist = dist[mask > 0].view(-1, feature_dim).norm(p=2, dim=1) - dist = F.relu(2 * delta_d - dist) ** 2 - l_dis = torch.mean(torch.log(dist + 1.0)) - - if bg_sample: - l_dis = [torch.log(dist + 1.0)] - emb_bg = emb[:, instance == 0].view(feature_dim, -1) - if emb_bg.size(1) > 100: - rand_ind = np.random.permutation(emb_bg.size(1))[:100] - emb_bg = emb_bg[:, rand_ind] - if emb_bg.size(1) > 0: - for i, lb in enumerate(unique_labels): - if lb == 0: - continue - dist = (emb_bg - emb_mean[:, i : i + 1]).norm(p=2, dim=0) - dist = F.relu(2 * delta_d - dist) ** 2 - l_dis_bg = torch.mean(torch.log(dist + 1.0), 0, keepdim=True) - l_dis.append(l_dis_bg) - l_dis = torch.mean(torch.cat(l_dis)) - else: - l_dis = 0 - - l_agg = weights[0] * l_agg - l_dis = weights[1] * l_dis - l_reg = torch.mean(torch.log(torch.norm(emb_mean, 2, 0) + 1.0)) * 0.001 - loss = l_agg + l_dis + l_reg - return loss - - -def emb_loss_batch(emb, instance, kernel, training_mask, reduce=True, loss_weight=0.25): - loss_batch = emb.new_zeros((emb.size(0)), dtype=torch.float32) - - for i in range(loss_batch.size(0)): - loss_batch[i] = emb_loss(emb[i], instance[i], kernel[i], training_mask[i]) - - loss_batch = loss_weight * loss_batch - - if reduce: - loss_batch = torch.mean(loss_batch) - - return loss_batch - - -def dice_loss_with_masks(input, target, mask, reduce=True): - loss_weight = 0.5 - batch_size = input.size(0) - input = torch.sigmoid(input) - - input = input.contiguous().view(batch_size, -1) - target = target.contiguous().view(batch_size, -1).float() - mask = mask.contiguous().view(batch_size, -1).float() - - input = input * mask - target = target * mask - - a = torch.sum(input * target, dim=1) - b = torch.sum(input * input, dim=1) + 0.001 - c = torch.sum(target * target, dim=1) + 0.001 - d = (2 * a) / (b + c) - loss = 1 - d - - loss = loss_weight * loss - - if reduce: - loss = torch.mean(loss) - - return loss - - -def 
ohem_single(score, gt_text, training_mask): - pos_num = int(torch.sum(gt_text > 0.5)) - int(torch.sum((gt_text > 0.5) & (training_mask <= 0.5))) - - if pos_num == 0: - # selected_mask = gt_text.copy() * 0 # may be not good - selected_mask = training_mask - selected_mask = selected_mask.view(1, selected_mask.shape[0], selected_mask.shape[1]).float() - return selected_mask - - neg_num = int(torch.sum(gt_text <= 0.5)) - neg_num = int(min(pos_num * 3, neg_num)) - - if neg_num == 0: - selected_mask = training_mask - selected_mask = selected_mask.view(1, selected_mask.shape[0], selected_mask.shape[1]).float() - return selected_mask - - neg_score = score[gt_text <= 0.5] - neg_score_sorted, _ = torch.sort(-neg_score) - threshold = -neg_score_sorted[neg_num - 1] - - selected_mask = ((score >= threshold) | (gt_text > 0.5)) & (training_mask > 0.5) - selected_mask = selected_mask.reshape(1, selected_mask.shape[0], selected_mask.shape[1]).float() - return selected_mask - - -def ohem_batch(scores, gt_texts, training_masks): - selected_masks = [] - for i in range(scores.shape[0]): - selected_masks.append(ohem_single(scores[i, :, :], gt_texts[i, :, :], training_masks[i, :, :])) - - selected_masks = torch.cat(selected_masks, 0).float() - return selected_masks - - -def iou_single(a, b, mask, n_class): - EPS = 1e-6 - valid = mask == 1 - a = a[valid] - b = b[valid] - miou = [] - for i in range(n_class): - inter = ((a == i) & (b == i)).float() - union = ((a == i) | (b == i)).float() - - miou.append(torch.sum(inter) / (torch.sum(union) + EPS)) - miou = sum(miou) / len(miou) - return miou - - -def iou(a, b, mask, n_class=2, reduce=True): - batch_size = a.size(0) - - a = a.view(batch_size, -1) - b = b.view(batch_size, -1) - mask = mask.view(batch_size, -1) - - iou = a.new_zeros((batch_size,), dtype=torch.float32) - for i in range(batch_size): - iou[i] = iou_single(a[i], b[i], mask[i], n_class) - - if reduce: - iou = torch.mean(iou) - return iou - - -@dataclass -class FastForSceneTextRecognitionOutput(ModelOutput): - """ - Output type of [`FastForSceneTextRecognition`]. - - Args: - loss (`torch.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): - Text detection loss. - last_hidden_state (`torch.FloatTensor`): - Output of the text detection head. - hidden_states (`tuple(torch.FloatTensor)`, *optional*): - Backbone feature maps and the neck output. - """ - - loss: Optional[torch.Tensor] = None - last_hidden_state: Optional[torch.FloatTensor] = None - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - - -@add_start_docstrings( - """FAST (faster arbitrarily-shaped text detector) is an accurate and efficient scene text detection - framework. FAST has two new designs. (1) We design a - minimalist kernel representation (only has 1-channel output) to model text with arbitrary shape, as well as a - GPU-parallel post-processing to efficiently assemble text lines with a negligible time overhead. 
(2) We search the - network architecture tailored for text detection, leading to more powerful features than most networks that are - searched for image classification.""", - FAST_START_DOCSTRING, -) -class FastForSceneTextRecognition(FastPreTrainedModel): - def __init__(self, config): - super().__init__(config) - self.config = config - - if config.use_timm_backbone: - requires_backends(self, ["timm"]) - kwargs = {} - if config.dilation: - kwargs["output_stride"] = 16 - backbone = create_model( - config.backbone, - pretrained=config.use_pretrained_backbone, - features_only=True, - out_indices=(1, 2, 3, 4), - in_chans=config.num_channels, - **kwargs, - ) - else: - backbone = AutoBackbone.from_config(config.backbone_config) - - self.backbone = backbone - self.neck = FASTNeck(config=config) - self.text_detection_head = FASTHead(config=config) - - self.pooling_1s = nn.MaxPool2d( - kernel_size=config.head_pooling_size, stride=1, padding=(config.head_pooling_size - 1) // 2 - ) - self.pooling_2s = nn.MaxPool2d( - kernel_size=config.head_pooling_size // 2 + 1, stride=1, padding=(config.head_pooling_size // 2) // 2 - ) - self.post_init() - - def _upsample(self, x, size, scale=1): - _, _, H, W = size - return F.interpolate(x, size=(H // scale, W // scale), mode="bilinear") - - def _max_pooling(self, x, scale=1): - if scale == 1: - x = self.pooling_1s(x) - elif scale == 2: - x = self.pooling_2s(x) - return x - - def loss(self, hidden, labels): - gt_texts = labels["gt_texts"] - gt_kernels = labels["gt_kernels"] - training_masks = labels["training_masks"] - gt_instances = labels["gt_instances"] - - kernels = hidden[:, 0, :, :] # 4*640*640 - texts = self._max_pooling(kernels, scale=1) # 4*640*640 - embs = hidden[:, 1:, :, :] # 4*4*640*640 - - selected_masks = ohem_batch(texts, gt_texts, training_masks) - loss_text = dice_loss_with_masks(texts, gt_texts, selected_masks, reduce=False) - - selected_masks = gt_texts * training_masks - loss_kernel = dice_loss_with_masks(kernels, gt_kernels, selected_masks, reduce=False) - loss_kernel = torch.mean(loss_kernel, dim=0) - - loss_emb = emb_loss_batch(embs, gt_instances, gt_kernels, training_masks, reduce=False) - - return torch.mean(loss_text) + torch.mean(loss_kernel) + torch.mean(loss_emb) - - @add_start_docstrings_to_model_forward(FAST_FOR_CAPTIONING_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=FastForSceneTextRecognitionOutput, config_class=_CONFIG_FOR_DOC) - def forward( - self, - pixel_values: torch.FloatTensor, - output_hidden_states: Optional[bool] = True, - return_dict: Optional[bool] = None, - labels: Dict = None, - ): - r""" - labels (`Dict[str, torch.Tensor]`, *optional*): - Should contain 3 keys: gt_texts,gt_kernels,gt_instances - - Returns: - - Examples: - - ```python - >>> from transformers import FastImageProcessor, FastForSceneTextRecognition - >>> from PIL import Image - >>> import requests - - >>> url = "https://huggingface.co/datasets/Raghavan/fast_model_samples/resolve/main/img657.jpg" - >>> image = Image.open(requests.get(url, stream=True).raw).convert("RGB") - >>> processor = FastImageProcessor.from_pretrained("Raghavan/fast_base_tt_800_finetune_ic17mlt") - >>> model = FastForSceneTextRecognition.from_pretrained("Raghavan/fast_base_tt_800_finetune_ic17mlt") - >>> inputs = processor(image, return_tensors="pt") - >>> # forward pass - >>> outputs = model(pixel_values=inputs["pixel_values"]) - >>> target_sizes = [(image.shape[1], image.shape[2]) for image in inputs["pixel_values"]] - >>> threshold = 0.85 - >>> text_locations = 
processor.post_process_text_detection(outputs, target_sizes, threshold, bbox_type="poly") - >>> print(text_locations[0]["bboxes"][0][:10]) - [484, 175, 484, 178, 483, 179, 452, 179, 452, 182] - ``` - """ - # outputs = {} - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - features = ( - self.backbone(pixel_values) if self.config.use_timm_backbone else self.backbone(pixel_values).feature_maps - ) - - hidden_states = self.neck(features) - - text_detection_output = self.text_detection_head(hidden_states) - - all_hidden_states = (features, hidden_states) - - loss = None - if labels: - out = self._upsample(text_detection_output, pixel_values.size(), scale=1) - loss = self.loss(out, labels) - text_detection_output = self._upsample(text_detection_output, pixel_values.size(), scale=4) - - if not return_dict: - output = (loss, text_detection_output) if loss is not None else (text_detection_output,) - return output + (all_hidden_states,) if output_hidden_states else output - - return FastForSceneTextRecognitionOutput( - loss=loss, - last_hidden_state=text_detection_output, - hidden_states=all_hidden_states if output_hidden_states else None, - ) diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py index 4ee5d2c9c296..18c6a27bd7dc 100644 --- a/src/transformers/utils/dummy_vision_objects.py +++ b/src/transformers/utils/dummy_vision_objects.py @@ -198,13 +198,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["vision"]) -class FastImageProcessor(metaclass=DummyObject): - _backends = ["vision"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["vision"]) - - class FlavaFeatureExtractor(metaclass=DummyObject): _backends = ["vision"] diff --git a/tests/models/fast/__init__.py b/tests/models/fast/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/tests/models/fast/test_image_processing_fast.py b/tests/models/fast/test_image_processing_fast.py deleted file mode 100644 index 667ce191d43a..000000000000 --- a/tests/models/fast/test_image_processing_fast.py +++ /dev/null @@ -1,162 +0,0 @@ -# coding=utf-8 -# Copyright 2021 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import unittest - -import requests - -from transformers.testing_utils import require_torch, require_vision, slow -from transformers.utils import is_torch_available, is_vision_available - -from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs - - -if is_torch_available(): - import torch - -if is_vision_available(): - from PIL import Image - - from transformers import FastForSceneTextRecognition, FastImageProcessor - - -class FastImageProcessingTester(unittest.TestCase): - def __init__( - self, - parent, - batch_size=7, - num_channels=3, - image_size=18, - min_resolution=30, - max_resolution=400, - do_resize=True, - size=None, - do_center_crop=True, - crop_size=None, - do_normalize=True, - image_mean=[0.5, 0.5, 0.5], - image_std=[0.5, 0.5, 0.5], - min_area: int = 200, - min_score: float = 0.88, - bbox_type: str = "rect", - pooling_size: int = 9, - ): - size = size if size is not None else {"height": 20, "width": 20} - crop_size = crop_size if crop_size is not None else {"height": 18, "width": 18} - self.parent = parent - self.batch_size = batch_size - self.num_channels = num_channels - self.image_size = image_size - self.min_resolution = min_resolution - self.max_resolution = max_resolution - self.do_resize = do_resize - self.size = size - self.do_center_crop = do_center_crop - self.crop_size = crop_size - self.do_normalize = do_normalize - self.image_mean = image_mean - self.image_std = image_std - self.min_area = min_area - self.min_score = min_score - self.bbox_type = bbox_type - self.pooling_size = pooling_size - - def prepare_image_processor_dict(self): - return { - "do_resize": self.do_resize, - "size": self.size, - "do_center_crop": self.do_center_crop, - "crop_size": self.crop_size, - "do_normalize": self.do_normalize, - "image_mean": self.image_mean, - "image_std": self.image_std, - "min_area": self.min_area, - "min_score": self.min_score, - "bbox_type": self.bbox_type, - "pooling_size": self.pooling_size, - } - - def expected_output_image_shape(self, images): - return self.num_channels, self.crop_size["height"], self.crop_size["width"] - - def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False): - return prepare_image_inputs( - batch_size=self.batch_size, - num_channels=self.num_channels, - min_resolution=self.min_resolution, - max_resolution=self.max_resolution, - equal_resolution=equal_resolution, - numpify=numpify, - torchify=torchify, - ) - - -@require_torch -@require_vision -class FastImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): - image_processing_class = FastImageProcessor if is_vision_available() else None - - def setUp(self): - self.image_processor_tester = FastImageProcessingTester(self) - - @property - def image_processor_dict(self): - return self.image_processor_tester.prepare_image_processor_dict() - - def test_image_processor_properties(self): - image_processing = self.image_processing_class(**self.image_processor_dict) - self.assertTrue(hasattr(image_processing, "do_resize")) - self.assertTrue(hasattr(image_processing, "size")) - self.assertTrue(hasattr(image_processing, "do_center_crop")) - self.assertTrue(hasattr(image_processing, "center_crop")) - self.assertTrue(hasattr(image_processing, "do_normalize")) - self.assertTrue(hasattr(image_processing, "image_mean")) - self.assertTrue(hasattr(image_processing, "image_std")) - - def test_image_processor_from_dict_with_kwargs(self): - image_processor = self.image_processing_class.from_dict(self.image_processor_dict) - 
self.assertEqual(image_processor.size, {"height": 20, "width": 20}) - self.assertEqual(image_processor.crop_size, {"height": 18, "width": 18}) - - image_processor = self.image_processing_class.from_dict( - self.image_processor_dict, size=42, crop_size=84, reduce_labels=True - ) - self.assertEqual(image_processor.size, {"height": 42, "width": 42}) - self.assertEqual(image_processor.crop_size, {"height": 84, "width": 84}) - - @slow - def test_post_process_text_detection(self): - model = FastForSceneTextRecognition.from_pretrained("Raghavan/fast_base_tt_800_finetune_ic17mlt") - - image_processor = FastImageProcessor.from_pretrained("Raghavan/fast_base_tt_800_finetune_ic17mlt") - - def prepare_image(): - image_url = "https://huggingface.co/datasets/Raghavan/fast_model_samples/resolve/main/img657.jpg" - raw_image = Image.open(requests.get(image_url, stream=True).raw).convert("RGB") - return raw_image - - image = prepare_image() - inputs = image_processor(image, return_tensor="np") - - output = model(pixel_values=torch.tensor(inputs["pixel_values"])) - target_sizes = [(image.shape[1], image.shape[2]) for image in inputs["pixel_values"]] - threshold = 0.85 - final_out = image_processor.post_process_text_detection(output, target_sizes, threshold, bbox_type="poly") - - assert len(final_out[0]["bboxes"]) == 2 - assert len(final_out[0]["bboxes"][0]) == 716 - assert final_out[0]["bboxes"][0][:10] == [484, 175, 484, 178, 483, 179, 452, 179, 452, 182] - assert round(float(final_out[0]["scores"][0]), 5) == 0.92356 diff --git a/tests/models/fast/test_modeling_fast.py b/tests/models/fast/test_modeling_fast.py deleted file mode 100644 index 07c3f9b24b20..000000000000 --- a/tests/models/fast/test_modeling_fast.py +++ /dev/null @@ -1,390 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Testing suite for the PyTorch FAST model. 
""" -import inspect -import unittest - -import requests -from PIL import Image - -from transformers import ( - FastConfig, - TextNetConfig, - is_torch_available, -) -from transformers.models.fast.image_processing_fast import FastImageProcessor -from transformers.testing_utils import ( - require_torch, - require_vision, - slow, - torch_device, -) - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor -from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_torch_available(): - import torch - - from transformers import ( - FastForSceneTextRecognition, - ) - - -class FastModelTester: - def __init__( - self, - parent, - backbone_kernel_size=3, - backbone_stride=2, - backbone_dilation=1, - backbone_groups=1, - backbone_has_shuffle=False, - backbone_in_channels=3, - backbone_out_channels=64, - backbone_use_bn=True, - backbone_activation_func="relu", - backbone_dropout_rate=0, - backbone_ops_order="weight_bn_act", - backbone_stage1_in_channels=[64], - backbone_stage1_out_channels=[64], - backbone_stage1_kernel_size=[[3, 3]], - backbone_stage1_stride=[1], - backbone_stage1_dilation=[1], - backbone_stage1_groups=[1], - backbone_stage2_in_channels=[64], - backbone_stage2_out_channels=[128], - backbone_stage2_kernel_size=[[3, 1]], - backbone_stage2_stride=[2], - backbone_stage2_dilation=[1], - backbone_stage2_groups=[1], - backbone_stage3_in_channels=[128], - backbone_stage3_out_channels=[256], - backbone_stage3_kernel_size=[[1, 3]], - backbone_stage3_stride=[2], - backbone_stage3_dilation=[1], - backbone_stage3_groups=[1], - backbone_stage4_in_channels=[256], - backbone_stage4_out_channels=[512], - backbone_stage4_kernel_size=[[3, 3]], - backbone_stage4_stride=[2], - backbone_stage4_dilation=[1], - backbone_stage4_groups=[1], - neck_in_channels=[64], - neck_out_channels=[128], - neck_kernel_size=[[3, 3]], - neck_stride=[1], - head_pooling_size=9, - head_dropout_ratio=0.1, - head_conv_in_channels=128, - head_conv_out_channels=4, - head_conv_kernel_size=[3, 3], - head_conv_stride=1, - head_final_kernel_size=1, - head_final_stride=1, - head_final_bias=False, - head_final_in_channels=4, - head_final_out_channels=5, - head_final_use_batch_norm=False, - head_final_act_func=None, - head_final_dropout_rate=0, - head_final_ops_order="weight", - batch_size=3, - num_channels=3, - image_size=500, - is_training=True, - ): - self.parent = parent - self.backbone_kernel_size = backbone_kernel_size - self.backbone_stride = backbone_stride - self.backbone_has_shuffle = backbone_has_shuffle - self.backbone_in_channels = backbone_in_channels - self.backbone_out_channels = backbone_out_channels - self.backbone_use_bn = backbone_use_bn - self.backbone_act_func = backbone_activation_func - self.backbone_dropout_rate = backbone_dropout_rate - self.backbone_ops_order = backbone_ops_order - - self.backbone_stage1_in_channels = backbone_stage1_in_channels - self.backbone_stage1_out_channels = backbone_stage1_out_channels - self.backbone_stage1_kernel_size = backbone_stage1_kernel_size - self.backbone_stage1_stride = backbone_stage1_stride - - self.backbone_stage2_in_channels = backbone_stage2_in_channels - self.backbone_stage2_out_channels = backbone_stage2_out_channels - self.backbone_stage2_kernel_size = backbone_stage2_kernel_size - self.backbone_stage2_stride = backbone_stage2_stride - - self.backbone_stage3_in_channels = backbone_stage3_in_channels - 
self.backbone_stage3_out_channels = backbone_stage3_out_channels - self.backbone_stage3_kernel_size = backbone_stage3_kernel_size - self.backbone_stage3_stride = backbone_stage3_stride - - self.backbone_stage4_in_channels = backbone_stage4_in_channels - self.backbone_stage4_out_channels = backbone_stage4_out_channels - self.backbone_stage4_kernel_size = backbone_stage4_kernel_size - self.backbone_stage4_stride = backbone_stage4_stride - - self.neck_in_channels = neck_in_channels - self.neck_out_channels = neck_out_channels - self.neck_kernel_size = neck_kernel_size - self.neck_stride = neck_stride - - self.head_pooling_size = head_pooling_size - self.head_dropout_ratio = head_dropout_ratio - - self.head_conv_in_channels = head_conv_in_channels - self.head_conv_out_channels = head_conv_out_channels - self.head_conv_kernel_size = head_conv_kernel_size - self.head_conv_stride = head_conv_stride - - self.head_final_kernel_size = head_final_kernel_size - self.head_final_stride = head_final_stride - self.head_final_bias = head_final_bias - self.head_final_in_channels = head_final_in_channels - self.head_final_out_channels = head_final_out_channels - self.head_final_use_bn = head_final_use_batch_norm - self.head_final_act_func = head_final_act_func - self.head_final_dropout_rate = head_final_dropout_rate - self.head_final_ops_order = head_final_ops_order - - self.batch_size = batch_size - self.num_channels = num_channels - self.image_size = image_size - self.is_training = is_training - - def prepare_config_and_inputs(self): - pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) - # labels = None - # if self.use_labels: - # labels = ids_tensor([self.batch_size], self.num_labels) - # - config = self.get_config() - - return config, {"pixel_values": pixel_values} - - def get_config(self): - textnet_config = TextNetConfig( - kernel_size=self.backbone_kernel_size, - stride=self.backbone_stride, - has_shuffle=self.backbone_has_shuffle, - in_channels=self.backbone_in_channels, - out_channels=self.backbone_out_channels, - act_func=self.backbone_act_func, - stage1_in_channels=self.backbone_stage1_in_channels, - stage1_out_channels=self.backbone_stage1_out_channels, - stage1_kernel_size=self.backbone_stage1_kernel_size, - stage1_stride=self.backbone_stage1_stride, - stage2_in_channels=self.backbone_stage2_in_channels, - stage2_out_channels=self.backbone_stage2_out_channels, - stage2_kernel_size=self.backbone_stage2_kernel_size, - stage2_stride=self.backbone_stage2_stride, - stage3_in_channels=self.backbone_stage3_in_channels, - stage3_out_channels=self.backbone_stage3_out_channels, - stage3_kernel_size=self.backbone_stage3_kernel_size, - stage3_stride=self.backbone_stage3_stride, - stage4_in_channels=self.backbone_stage4_in_channels, - stage4_out_channels=self.backbone_stage4_out_channels, - stage4_kernel_size=self.backbone_stage4_kernel_size, - stage4_stride=self.backbone_stage4_stride, - out_features=["stage1", "stage2", "stage3", "stage4"], - out_indices=[1, 2, 3, 4], - ) - - return FastConfig( - use_timm_backbone=False, - backbone_config=textnet_config, - neck_in_channels=self.neck_in_channels, - neck_out_channels=self.neck_out_channels, - neck_kernel_size=self.neck_kernel_size, - neck_stride=self.neck_stride, - head_pooling_size=self.head_pooling_size, - head_dropout_ratio=self.head_dropout_ratio, - head_conv_in_channels=self.head_conv_in_channels, - head_conv_out_channels=self.head_conv_out_channels, - head_conv_kernel_size=self.head_conv_kernel_size, - 
head_conv_stride=self.head_conv_stride, - head_final_kernel_size=self.head_final_kernel_size, - head_final_stride=self.head_final_stride, - head_final_bias=self.head_final_bias, - head_final_in_channels=self.head_final_in_channels, - head_final_out_channels=self.head_final_out_channels, - ) - - def create_and_check_model(self, config, input): - model = FastForSceneTextRecognition(config=config) - model.to(torch_device) - model.eval() - result = model(pixel_values=input["pixel_values"]) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, 5, 125, 125)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, inputs_dict = config_and_inputs - return config, inputs_dict - - -@require_torch -class FastModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): - all_model_classes = (FastForSceneTextRecognition,) if is_torch_available() else () - - pipeline_model_mapping = {} - test_headmasking = False - test_pruning = False - test_attention_outputs = False - test_resize_embeddings = False - test_head_masking = False - has_attentions = False - - def setUp(self): - self.model_tester = FastModelTester(self) - self.config_tester = ConfigTester(self, config_class=FastConfig, hidden_size=37) - - def test_config(self): - self.create_and_test_config_common_properties() - self.config_tester.create_and_test_config_to_json_string() - self.config_tester.create_and_test_config_to_json_file() - self.config_tester.create_and_test_config_from_and_save_pretrained() - self.config_tester.create_and_test_config_with_num_labels() - self.config_tester.check_config_can_be_init_without_params() - self.config_tester.check_config_arguments_init() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def create_and_test_config_common_properties(self): - return - - @unittest.skip(reason="Fast does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="Fast does not support input and output embeddings") - def test_model_common_attributes(self): - pass - - @unittest.skip(reason="Fast is not a generative model") - def test_generate_without_input_ids(self): - pass - - @unittest.skip(reason="Fast is does not have any hidden_states") - def test_hidden_states_output(self): - pass - - @unittest.skip(reason="Fast is does not have any attention") - def test_retain_grad_hidden_states_attentions(self): - pass - - def test_forward_signature(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - signature = inspect.signature(model.forward) - # signature.parameters is an OrderedDict => so arg_names order is deterministic - arg_names = [*signature.parameters.keys()] - - expected_arg_names = ["pixel_values"] - self.assertListEqual(arg_names[:1], expected_arg_names) - - def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): - to_return = inputs_dict.copy() - gt_instances = torch.zeros( - self.model_tester.batch_size, self.model_tester.image_size, self.model_tester.image_size - ) - gt_kernels = torch.zeros( - self.model_tester.batch_size, self.model_tester.image_size, self.model_tester.image_size - ) - gt_text = torch.zeros(self.model_tester.batch_size, self.model_tester.image_size, self.model_tester.image_size) - training_masks = torch.ones( - 
self.model_tester.batch_size, self.model_tester.image_size, self.model_tester.image_size - ) - labels = {} - labels["gt_instances"] = gt_instances - labels["gt_kernels"] = gt_kernels - labels["gt_texts"] = gt_text - labels["training_masks"] = training_masks - - to_return["labels"] = labels - - return to_return - - def test_model_is_small(self): - # Just a consistency check to make sure we are not running tests on 80M parameter models. - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - num_params = model.num_parameters() - assert ( - num_params < 3000000 - ), f"{model_class} is too big for the common tests ({num_params})! It should have 1M max." - - # def prepare_image(): - # image_url = "https://huggingface.co/datasets/Raghavan/fast_model_samples/resolve/main/img_329.jpg" - # raw_image = Image.open(requests.get(image_url, stream=True).raw).convert("RGB") - # return raw_image - - -@require_torch -@require_vision -class FastModelIntegrationTest(unittest.TestCase): - @slow - def test_inference_fast_tiny_ic17mlt_model(self): - model = FastForSceneTextRecognition.from_pretrained("Raghavan/ic17mlt_Fast_T") - - image_processor = FastImageProcessor.from_pretrained("Raghavan/ic17mlt_Fast_T") - - def prepare_image(): - image_url = "https://huggingface.co/datasets/Raghavan/fast_model_samples/resolve/main/img_329.jpg" - raw_image = Image.open(requests.get(image_url, stream=True).raw).convert("RGB") - return raw_image - - image = prepare_image() - input = image_processor(image, return_tensors="pt") - - output = model(pixel_values=torch.tensor(input["pixel_values"])) - target_sizes = [(image.shape[1], image.shape[2]) for image in input["pixel_values"]] - threshold = 0.88 - final_out = image_processor.post_process_text_detection(output, target_sizes, threshold) - - assert final_out[0]["bboxes"][0] == [224, 120, 246, 120, 246, 134, 224, 134] - assert round(float(final_out[0]["scores"][0]), 5) == 0.95541 - - @slow - def test_inference_fast_base_800_total_text_ic17mlt_model(self): - model = FastForSceneTextRecognition.from_pretrained("Raghavan/fast_base_tt_800_finetune_ic17mlt") - - image_processor = FastImageProcessor.from_pretrained("Raghavan/fast_base_tt_800_finetune_ic17mlt") - - def prepare_image(): - image_url = "https://huggingface.co/datasets/Raghavan/fast_model_samples/resolve/main/img657.jpg" - raw_image = Image.open(requests.get(image_url, stream=True).raw).convert("RGB") - return raw_image - - image = prepare_image() - input = image_processor(image, return_tensors="pt") - - output = model(pixel_values=torch.tensor(input["pixel_values"])) - target_sizes = [(image.shape[1], image.shape[2]) for image in input["pixel_values"]] - threshold = 0.85 - final_out = image_processor.post_process_text_detection(output, target_sizes, threshold, bbox_type="poly") - - assert final_out[0]["bboxes"][0][:10] == [484, 175, 484, 178, 483, 179, 452, 179, 452, 182] - assert round(float(final_out[0]["scores"][0]), 5) == 0.92356 diff --git a/utils/check_repo.py b/utils/check_repo.py index 66f9d7f2b757..10a9fd83e0c9 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -223,7 +223,6 @@ "TFCLIPVisionModel", "TFGroupViTTextModel", "TFGroupViTVisionModel", - "FastForSceneTextRecognition", "FlaxCLIPTextModel", "FlaxCLIPTextModelWithProjection", "FlaxCLIPVisionModel", From cbf6c81b85fd2c8465423374ead5b1b9d484b815 Mon Sep 17 00:00:00 2001 From: raghavanone Date: Fri, 10 Nov 2023 14:12:42 +0530 Subject: [PATCH 057/152] 
 More cleanup

---
 docs/source/en/_toctree.yml                   |  2 -
 docs/source/en/model_doc/fast.md              | 48 ------------------
 docs/source/en/model_doc/textnet.md           |  2 +-
 tests/models/textnet/test_modeling_textnet.py |  9 ++--
 utils/check_repo.py                           |  1 +
 5 files changed, 6 insertions(+), 56 deletions(-)
 delete mode 100644 docs/source/en/model_doc/fast.md

diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
index 51602dc805d4..86cffb9a7e35 100644
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -537,8 +537,6 @@
         title: EfficientFormer
       - local: model_doc/efficientnet
         title: EfficientNet
-      - local: model_doc/fast
-        title: FAST
       - local: model_doc/focalnet
         title: FocalNet
       - local: model_doc/glpn
diff --git a/docs/source/en/model_doc/fast.md b/docs/source/en/model_doc/fast.md
deleted file mode 100644
index e5c8c58f1856..000000000000
--- a/docs/source/en/model_doc/fast.md
+++ /dev/null
@@ -1,48 +0,0 @@
-
-
-# FAST
-
-## Overview
-
-Fast model proposes an accurate and efficient scene text detection framework, termed FAST (i.e., faster
-arbitrarily-shaped text detector).
-
-FAST has two new designs. (1) We design a minimalist kernel representation (only has 1-channel output) to model text
-with arbitrary shape, as well as a GPU-parallel post-processing to efficiently assemble text lines with a negligible
-time overhead. (2) We search the network architecture tailored for text detection, leading to more powerful features
-than most networks that are searched for image classification.
-
-## FastConfig
-
-[[autodoc]] FastConfig
-
-## FastImageProcessor
-
-[[autodoc]] FastImageProcessor
-
-## FastForSceneTextRecognition
-
-[[autodoc]] FastForSceneTextRecognition
-- forward
-
-## FASTForImageCaptioningOutput
-
-[[autodoc]] FASTForImageCaptioningOutput
-- forward
-
-
-
diff --git a/docs/source/en/model_doc/textnet.md b/docs/source/en/model_doc/textnet.md
index 088adb572bdb..1c50cfa71016 100644
--- a/docs/source/en/model_doc/textnet.md
+++ b/docs/source/en/model_doc/textnet.md
@@ -19,7 +19,7 @@ rendered properly in your Markdown viewer.
 ## Overview
 
 The TextNet model was proposed in [FAST: Faster Arbitrarily-Shaped Text Detector with Minimalist Kernel Representation](https://arxiv.org/abs/2111.02394) by Zhe Chen, Jiahao Wang, Wenhai Wang, Guo Chen, Enze Xie, Ping Luo, Tong Lu.
-TextNet was results of NAS for efficient text detection task.
+The TextNet model is the result of neural architecture search (NAS) for efficient text detection.
 It is used in fast model as backbone.
 
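A minimal usage sketch of the backbone described above; it assumes the `TextNetConfig` and `TextNetBackbone` classes referenced later in this PR, and that the backbone follows the usual Transformers backbone convention of returning `feature_maps` (shapes below are only indicative):

```python
import torch

from transformers import TextNetBackbone, TextNetConfig

# Randomly initialized backbone from the default configuration (no pretrained weights).
config = TextNetConfig()
model = TextNetBackbone(config)
model.eval()

# Dummy image batch: (batch_size, num_channels, height, width).
pixel_values = torch.randn(1, 3, 224, 224)

with torch.no_grad():
    outputs = model(pixel_values)

# Multi-scale feature maps; assumed `feature_maps` attribute as on other backbone classes.
for feature_map in outputs.feature_maps:
    print(feature_map.shape)
```

These multi-scale feature maps are what a text detector such as FAST would consume from the backbone.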
 ## TextNetConfig
diff --git a/tests/models/textnet/test_modeling_textnet.py b/tests/models/textnet/test_modeling_textnet.py
index 957661e61144..5cf9bdeaa423 100644
--- a/tests/models/textnet/test_modeling_textnet.py
+++ b/tests/models/textnet/test_modeling_textnet.py
@@ -255,15 +255,15 @@ def test_config(self):
     def create_and_test_config_common_properties(self):
         return
 
-    @unittest.skip(reason="Bit does not output attentions")
+    @unittest.skip(reason="TextNet does not output attentions")
     def test_attention_outputs(self):
         pass
 
-    @unittest.skip(reason="Bit does not use inputs_embeds")
+    @unittest.skip(reason="TextNet does not use inputs_embeds")
     def test_inputs_embeds(self):
         pass
 
-    @unittest.skip(reason="Bit does not support input and output embeddings")
+    @unittest.skip(reason="TextNet does not support input and output embeddings")
     def test_model_common_attributes(self):
         pass
 
@@ -317,7 +317,6 @@ def check_hidden_states_output(inputs_dict, config, model_class):
             expected_num_stages = self.model_tester.num_stages - 1
             self.assertEqual(len(hidden_states), expected_num_stages + 1)
 
-            # Bit's feature maps are of shape (batch_size, num_channels, height, width)
             self.assertListEqual(
                 list(hidden_states[0].shape[-2:]),
                 [self.model_tester.image_size // 2, self.model_tester.image_size // 2],
@@ -348,7 +347,7 @@ def test_model_is_small(self):
                 num_params < 3000000
             ), f"{model_class} is too big for the common tests ({num_params})! It should have 1M max."
 
-    @unittest.skip(reason="Bit does not use feedforward chunking")
+    @unittest.skip(reason="TextNet does not use feedforward chunking")
     def test_feed_forward_chunking(self):
         pass
 
diff --git a/utils/check_repo.py b/utils/check_repo.py
index 10a9fd83e0c9..b0d9cffe8c46 100644
--- a/utils/check_repo.py
+++ b/utils/check_repo.py
@@ -337,6 +337,7 @@
     ]
 )
 
+
 # This is to make sure the transformers module imported is the one in the repo.
transformers = direct_transformers_import(PATH_TO_TRANSFORMERS) From 1db7bd97799c1391bd008577a39456c420a3dd45 Mon Sep 17 00:00:00 2001 From: raghavanone Date: Fri, 10 Nov 2023 14:22:29 +0530 Subject: [PATCH 058/152] Fix build --- src/transformers/models/textnet/configuration_textnet.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/textnet/configuration_textnet.py b/src/transformers/models/textnet/configuration_textnet.py index 650c1bc4858f..9ca9748050cc 100644 --- a/src/transformers/models/textnet/configuration_textnet.py +++ b/src/transformers/models/textnet/configuration_textnet.py @@ -95,19 +95,19 @@ class TextNetConfig(BackboneConfigMixin, PretrainedConfig): Examples: ```python - >>> from transformers import FastConfig, FastForSceneTextRecognition + >>> from transformers import TextNetConfig, TextNetBackbone >>> # Initializing a Fast Config - >>> configuration = FastConfig() + >>> configuration = TextNetConfig() >>> # Initializing a model (with random weights) - >>> model = FastForSceneTextRecognition(configuration) + >>> model = TextNetBackbone(configuration) >>> # Accessing the model configuration >>> configuration = model.config ```""" r""" - [Raghavan/fast_base_tt_800_finetune_ic17mlt](https://huggingface.co/Raghavan/fast_base_tt_800_finetune_ic17mlt) + [Raghavan/textnet-base](https://huggingface.co/Raghavan/textnet-base) """ model_type = "textnet" From bb4ac611391286cb935f56651c9d554d9cd90e91 Mon Sep 17 00:00:00 2001 From: raghavanone Date: Tue, 14 Nov 2023 11:32:21 +0530 Subject: [PATCH 059/152] Incorporate PR feedbacks --- docs/source/en/model_doc/textnet.md | 5 +- setup.py | 2 +- src/transformers/models/textnet/__init__.py | 18 +- .../models/textnet/configuration_textnet.py | 24 ++- .../textnet/image_processing_textnet.py | 14 +- .../models/textnet/modeling_textnet.py | 180 ++++++++++-------- 6 files changed, 135 insertions(+), 108 deletions(-) diff --git a/docs/source/en/model_doc/textnet.md b/docs/source/en/model_doc/textnet.md index 1c50cfa71016..a9947c777f2e 100644 --- a/docs/source/en/model_doc/textnet.md +++ b/docs/source/en/model_doc/textnet.md @@ -1,4 +1,4 @@ -