From c63915fa50f265970e26fb1aa944f87beda5adf4 Mon Sep 17 00:00:00 2001 From: raghavanone Date: Thu, 5 Oct 2023 05:42:14 +0530 Subject: [PATCH 001/152] WIP --- .../models/fast/configuration_fast.py | 78 ++ src/transformers/models/fast/modeling_fast.py | 919 ++++++++++++++++++ 2 files changed, 997 insertions(+) create mode 100644 src/transformers/models/fast/configuration_fast.py create mode 100644 src/transformers/models/fast/modeling_fast.py diff --git a/src/transformers/models/fast/configuration_fast.py b/src/transformers/models/fast/configuration_fast.py new file mode 100644 index 000000000000..53a3791a48ae --- /dev/null +++ b/src/transformers/models/fast/configuration_fast.py @@ -0,0 +1,78 @@ +from transformers import PretrainedConfig + + +class FastConfig(PretrainedConfig): + + def __init__(self, + backbone_config=None, + backbone_stage1_in_channels=[64, 64, 64, 64, 64, 64, 64, 64, 64, 64], + backbone_stage1_out_channels=[64, 64, 64, 64, 64, 64, 64, 64, 64, 64], + backbone_stage1_kernel_size=[(3, 3), (3, 3), (3, 1), (3, 3), (3, 1), (3, 3), (3, 3), (1, 3), (3, 3), + (3, 3)], + backbone_stage1_stride=[1, 2, 1, 1, 1, 1, 1, 1, 1, 1], + backbone_stage1_dilation=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + backbone_stage1_groups=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + + backbone_stage2_in_channels=[64, 128, 128, 128, 128, 128, 128, 128, 128, 128], + backbone_stage2_out_channels=[128, 128, 128, 128, 128, 128, 128, 128, 128, 128], + backbone_stage2_kernel_size=[(3, 3), (3, 3), (3, 1), (3, 3), (3, 1), (3, 3), (3, 3), (1, 3), (3, 3), + (3, 3)], + backbone_stage2_stride=[1, 2, 1, 1, 1, 1, 1, 1, 1, 1], + backbone_stage2_dilation=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + backbone_stage2_groups=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + + backbone_stage3_in_channels=[64, 64, 64, 64, 64, 64, 64, 64, 64, 64], + backbone_stage3_out_channels=[64, 64, 64, 64, 64, 64, 64, 64, 64, 64], + backbone_stage3_kernel_size=[(3, 3), (3, 3), (3, 1), (3, 3), (3, 1), (3, 3), (3, 3), (1, 3), (3, 3), + (3, 3)], + backbone_stage3_stride=[1, 2, 1, 1, 1, 1, 1, 1, 1, 1], + backbone_stage3_dilation=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + backbone_stage3_groups=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + + backbone_stage4_in_channels=[64, 64, 64, 64, 64, 64, 64, 64, 64, 64], + backbone_stage4_out_channels=[64, 64, 64, 64, 64, 64, 64, 64, 64, 64], + backbone_stage4_kernel_size=[(3, 3), (3, 3), (3, 1), (3, 3), (3, 1), (3, 3), (3, 3), (1, 3), (3, 3), + (3, 3)], + backbone_stage4_stride=[1, 2, 1, 1, 1, 1, 1, 1, 1, 1], + backbone_stage4_dilation=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + backbone_stage4_groups=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + + neck_in_channels=[64, 128, 256, 512], + neck_out_channels=[128, 128, 128, 128], + neck_kernel_size=[(3, 3), (3, 3), (3, 3), (3, 3)], + neck_stride=[1, 1, 1, 1], + neck_dilation=[1, 1, 1, 1], + neck_groups=[1, 1, 1, 1], + **kwargs + ): + self.backbone_config = { + "kernel_size": 3, + "stride": 2, + "dilation": 1, + "groups": 1, + "bias": False, + "has_shuffle": False, + "in_channels": 3, + "out_channels": 64, + "use_bn": True, + "act_func": "relu", + "dropout_rate": 0, + "ops_order": "weight_bn_act" + } + super.__init__(**kwargs) + if backbone_config is not None: + self.backbone_config.update(backbone_config) + + self.backbone_stage1_in_channels = backbone_stage1_in_channels + self.backbone_stage1_out_channels = backbone_stage1_out_channels + self.backbone_stage1_kernel_size = backbone_stage1_kernel_size, + self.backbone_stage1_stride = backbone_stage1_stride, + self.backbone_stage1_dilation = backbone_stage1_dilation, + self.backbone_stage1_groups 
= backbone_stage1_groups, + + self.neck_in_channels = neck_in_channels, + self.neck_out_channels = neck_out_channels, + self.neck_kernel_size_channels = neck_kernel_size, + self.neck_stride_channels = neck_stride, + self.neck_dilation_channels = neck_dilation, + self.neck_groups_channels = neck_groups, diff --git a/src/transformers/models/fast/modeling_fast.py b/src/transformers/models/fast/modeling_fast.py new file mode 100644 index 000000000000..4684378ce8eb --- /dev/null +++ b/src/transformers/models/fast/modeling_fast.py @@ -0,0 +1,919 @@ +import math +from collections import OrderedDict + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import time +import cv2 + +class DiceLoss(nn.Module): + def __init__(self, loss_weight=1.0): + super(DiceLoss, self).__init__() + self.loss_weight = loss_weight + + def forward(self, input, target, mask, reduce=True): + batch_size = input.size(0) + input = torch.sigmoid(input) + + input = input.contiguous().view(batch_size, -1) + target = target.contiguous().view(batch_size, -1).float() + mask = mask.contiguous().view(batch_size, -1).float() + + input = input * mask + target = target * mask + + a = torch.sum(input * target, dim=1) + b = torch.sum(input * input, dim=1) + 0.001 + c = torch.sum(target * target, dim=1) + 0.001 + d = (2 * a) / (b + c) + loss = 1 - d + + loss = self.loss_weight * loss + + if reduce: + loss = torch.mean(loss) + + return loss + + +class EmbLoss_v1(nn.Module): + def __init__(self, feature_dim=4, loss_weight=1.0): + super(EmbLoss_v1, self).__init__() + self.feature_dim = feature_dim + self.loss_weight = loss_weight + self.delta_v = 0.5 + self.delta_d = 1.5 + self.weights = (1.0, 1.0) + + def forward_single(self, emb, instance, kernel, training_mask): + training_mask = (training_mask > 0.5).long() + kernel = (kernel > 0.5).long() + instance = instance * training_mask + instance_kernel = (instance * kernel).view(-1) + instance = instance.view(-1) + emb = emb.view(self.feature_dim, -1) + + unique_labels, unique_ids = torch.unique(instance_kernel, sorted=True, return_inverse=True) + num_instance = unique_labels.size(0) + if num_instance <= 1: + return 0 + + emb_mean = emb.new_zeros((self.feature_dim, num_instance), dtype=torch.float32) + for i, lb in enumerate(unique_labels): + if lb == 0: + continue + ind_k = instance_kernel == lb + emb_mean[:, i] = torch.mean(emb[:, ind_k], dim=1) + + l_agg = emb.new_zeros(num_instance, dtype=torch.float32) # bug + for i, lb in enumerate(unique_labels): + if lb == 0: + continue + ind = instance == lb + emb_ = emb[:, ind] + dist = (emb_ - emb_mean[:, i:i + 1]).norm(p=2, dim=0) + dist = F.relu(dist - self.delta_v) ** 2 + l_agg[i] = torch.mean(torch.log(dist + 1.0)) + l_agg = torch.mean(l_agg[1:]) + + if num_instance > 2: + emb_interleave = emb_mean.permute(1, 0).repeat(num_instance, 1) + emb_band = emb_mean.permute(1, 0).repeat(1, num_instance).view(-1, self.feature_dim) + # print(seg_band) + + mask = (1 - torch.eye(num_instance, dtype=torch.int8)).view(-1, 1).repeat(1, self.feature_dim) + mask = mask.view(num_instance, num_instance, -1) + mask[0, :, :] = 0 + mask[:, 0, :] = 0 + mask = mask.view(num_instance * num_instance, -1) + # print(mask) + + dist = emb_interleave - emb_band + dist = dist[mask > 0].view(-1, self.feature_dim).norm(p=2, dim=1) + dist = F.relu(2 * self.delta_d - dist) ** 2 + l_dis = torch.mean(torch.log(dist + 1.0)) + else: + l_dis = 0 + + l_agg = self.weights[0] * l_agg + l_dis = self.weights[1] * l_dis + l_reg = 
torch.mean(torch.log(torch.norm(emb_mean, 2, 0) + 1.0)) * 0.001 + loss = l_agg + l_dis + l_reg + return loss + + def forward(self, emb, instance, kernel, training_mask, reduce=True): + loss_batch = emb.new_zeros((emb.size(0)), dtype=torch.float32) + + for i in range(loss_batch.size(0)): + loss_batch[i] = self.forward_single(emb[i], instance[i], kernel[i], training_mask[i]) + + loss_batch = self.loss_weight * loss_batch + + if reduce: + loss_batch = torch.mean(loss_batch) + + return loss_batch + + +class EmbLoss_v2(nn.Module): + def __init__(self, feature_dim=4, loss_weight=1.0): + super(EmbLoss_v2, self).__init__() + self.feature_dim = feature_dim + self.loss_weight = loss_weight + self.delta_v = 0.5 + self.delta_d = 1.5 + self.weights = (1.0, 1.0) + + def forward_single(self, emb, instance, kernel, training_mask): + training_mask = (training_mask > 0.5).long() + kernel = (kernel > 0.5).long() + instance = instance * training_mask + instance_kernel = (instance * kernel).view(-1) + instance = instance.view(-1) + emb = emb.view(self.feature_dim, -1) + + unique_labels, unique_ids = torch.unique(instance_kernel, sorted=True, return_inverse=True) + num_instance = unique_labels.size(0) + if num_instance <= 1: + return 0 + + emb_mean = emb.new_zeros((self.feature_dim, num_instance), dtype=torch.float32) + for i, lb in enumerate(unique_labels): + if lb == 0: + continue + ind_k = instance_kernel == lb + emb_mean[:, i] = torch.mean(emb[:, ind_k], dim=1) + + l_agg = emb.new_zeros(num_instance, dtype=torch.float32) # bug + for i, lb in enumerate(unique_labels): + if lb == 0: + continue + ind = instance == lb + emb_ = emb[:, ind] + dist = (emb_ - emb_mean[:, i:i + 1]).norm(p=2, dim=0) + dist = F.relu(dist - self.delta_v) ** 2 + l_agg[i] = torch.mean(torch.log(dist + 1.0)) + l_agg = torch.mean(l_agg[1:]) + + if num_instance > 2: + emb_interleave = emb_mean.permute(1, 0).repeat(num_instance, 1) + emb_band = emb_mean.permute(1, 0).repeat(1, num_instance).view(-1, self.feature_dim) + # print(seg_band) + + mask = (1 - torch.eye(num_instance, dtype=torch.int8)).view(-1, 1).repeat(1, self.feature_dim) + mask = mask.view(num_instance, num_instance, -1) + mask[0, :, :] = 0 + mask[:, 0, :] = 0 + mask = mask.view(num_instance * num_instance, -1) + # print(mask) + + dist = emb_interleave - emb_band + dist = dist[mask > 0].view(-1, self.feature_dim).norm(p=2, dim=1) + dist = F.relu(2 * self.delta_d - dist) ** 2 + # l_dis = torch.mean(torch.log(dist + 1.0)) + + l_dis = [torch.log(dist + 1.0)] + emb_bg = emb[:, instance == 0].view(self.feature_dim, -1) + if emb_bg.size(1) > 100: + rand_ind = np.random.permutation(emb_bg.size(1))[:100] + emb_bg = emb_bg[:, rand_ind] + if emb_bg.size(1) > 0: + for i, lb in enumerate(unique_labels): + if lb == 0: + continue + dist = (emb_bg - emb_mean[:, i:i + 1]).norm(p=2, dim=0) + dist = F.relu(2 * self.delta_d - dist) ** 2 + l_dis_bg = torch.mean(torch.log(dist + 1.0), 0, keepdim=True) + l_dis.append(l_dis_bg) + l_dis = torch.mean(torch.cat(l_dis)) + else: + l_dis = 0 + + l_agg = self.weights[0] * l_agg + l_dis = self.weights[1] * l_dis + l_reg = torch.mean(torch.log(torch.norm(emb_mean, 2, 0) + 1.0)) * 0.001 + loss = l_agg + l_dis + l_reg + return loss + + def forward(self, emb, instance, kernel, training_mask, reduce=True): + loss_batch = emb.new_zeros((emb.size(0)), dtype=torch.float32) + + for i in range(loss_batch.size(0)): + loss_batch[i] = self.forward_single(emb[i], instance[i], kernel[i], training_mask[i]) + + loss_batch = self.loss_weight * loss_batch + + if reduce: + 
loss_batch = torch.mean(loss_batch) + + return loss_batch + + +def set_layer_from_config(layer_config): + if layer_config is None: + return None + + name2layer = { + ConvLayer.__name__: ConvLayer, + RepConvLayer.__name__: RepConvLayer + } + + layer_name = layer_config.pop('name') + layer = name2layer[layer_name] + return layer.build_from_config(layer_config) + + +def get_same_padding(kernel_size): + if isinstance(kernel_size, tuple): + assert len(kernel_size) == 2, 'invalid kernel size: %s' % kernel_size + p1 = get_same_padding(kernel_size[0]) + p2 = get_same_padding(kernel_size[1]) + return p1, p2 + assert isinstance(kernel_size, int), 'kernel size should be either `int` or `tuple`' + assert kernel_size % 2 > 0, 'kernel size should be odd number' + return kernel_size // 2 + + +class My2DLayer(nn.Module): + + def __init__(self, in_channels, out_channels, + use_bn=True, act_func='relu', dropout_rate=0, ops_order='weight_bn_act'): + super(My2DLayer, self).__init__() + self.in_channels = in_channels + self.out_channels = out_channels + + self.use_bn = use_bn + self.act_func = act_func + self.dropout_rate = dropout_rate + self.ops_order = ops_order + + """ modules """ + modules = {} + # batch norm + if self.use_bn: + if self.bn_before_weight: + modules['bn'] = nn.BatchNorm2d(in_channels) + else: + modules['bn'] = nn.BatchNorm2d(out_channels) + else: + modules['bn'] = None + # activation + modules['act'] = build_activation(self.act_func, self.ops_list[0] != 'act') + # dropout + if self.dropout_rate > 0: + modules['dropout'] = nn.Dropout2d(self.dropout_rate, inplace=True) + else: + modules['dropout'] = None + # weight + modules['weight'] = self.weight_op() + + # add modules + for op in self.ops_list: + if modules[op] is None: + continue + elif op == 'weight': + if modules['dropout'] is not None: + self.add_module('dropout', modules['dropout']) + for key in modules['weight']: + self.add_module(key, modules['weight'][key]) + else: + self.add_module(op, modules[op]) + + @property + def ops_list(self): + return self.ops_order.split('_') + + @property + def bn_before_weight(self): + for op in self.ops_list: + if op == 'bn': + return True + elif op == 'weight': + return False + raise ValueError('Invalid ops_order: %s' % self.ops_order) + + def weight_op(self): + raise NotImplementedError + + """ Methods defined in MyModule """ + + def forward(self, x): + for module in self._modules.values(): + x = module(x) + return x + + @property + def module_str(self): + raise NotImplementedError + + @property + def config(self): + return { + 'in_channels': self.in_channels, + 'out_channels': self.out_channels, + 'use_bn': self.use_bn, + 'act_func': self.act_func, + 'dropout_rate': self.dropout_rate, + 'ops_order': self.ops_order, + } + + @staticmethod + def build_from_config(config): + raise NotImplementedError + + def get_flops(self, x): + raise NotImplementedError + + @staticmethod + def is_zero_layer(): + return False + + +def generate_bbox(keys, label, score, scales, cfg): + label_num = len(keys) + bboxes = [] + scores = [] + for index in range(1, label_num): + i = keys[index] + ind = (label == i) + ind_np = ind.data.cpu().numpy() + points = np.array(np.where(ind_np)).transpose((1, 0)) + if points.shape[0] < cfg.test_cfg.min_area: + label[ind] = 0 + continue + score_i = score[ind].mean().item() + if score_i < cfg.test_cfg.min_score: + label[ind] = 0 + continue + + if cfg.test_cfg.bbox_type == 'rect': + rect = cv2.minAreaRect(points[:, ::-1]) + alpha = math.sqrt(math.sqrt(points.shape[0] / (rect[1][0] * 
rect[1][1]))) + rect = (rect[0], (rect[1][0] * alpha, rect[1][1] * alpha), rect[2]) + bbox = cv2.boxPoints(rect) * scales + + elif cfg.test_cfg.bbox_type == 'poly': + binary = np.zeros(label.shape, dtype='uint8') + binary[ind_np] = 1 + contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + bbox = contours[0] * scales + bbox = bbox.astype('int32') + bboxes.append(bbox.reshape(-1).tolist()) + scores.append(score_i) + return bboxes, scores + + +class ConvLayer(My2DLayer): + + def __init__(self, in_channels, out_channels, + kernel_size=3, stride=1, dilation=1, groups=1, bias=False, has_shuffle=False, + use_bn=True, act_func='relu', dropout_rate=0, ops_order='weight_bn_act'): + self.kernel_size = kernel_size + self.stride = stride + self.dilation = dilation + self.groups = groups + self.bias = bias + self.has_shuffle = has_shuffle + + super(ConvLayer, self).__init__(in_channels, out_channels, use_bn, act_func, dropout_rate, ops_order) + + def weight_op(self): + padding = get_same_padding(self.kernel_size) + if isinstance(padding, int): + padding *= self.dilation + else: + padding[0] *= self.dilation + padding[1] *= self.dilation + + weight_dict = OrderedDict() + weight_dict['conv'] = nn.Conv2d( + self.in_channels, self.out_channels, kernel_size=self.kernel_size, stride=self.stride, padding=padding, + dilation=self.dilation, groups=self.groups, bias=self.bias + ) + + return weight_dict + + @staticmethod + def build_from_config(config): + return ConvLayer(**config) + + +class RepConvLayer(nn.Module): + + def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, groups=1, deploy=False): + super(RepConvLayer, self).__init__() + + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.stride = stride + self.dilation = dilation + self.groups = groups + self.deploy = deploy + + assert len(kernel_size) == 2 + padding = (int(((kernel_size[0] - 1) * dilation) / 2), + int(((kernel_size[1] - 1) * dilation) / 2)) + + self.nonlinearity = nn.ReLU(inplace=True) + + if deploy: + self.fused_conv = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, + kernel_size=kernel_size, stride=stride, padding=padding, + dilation=dilation, groups=groups, bias=True) + else: + self.main_conv = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, + kernel_size=kernel_size, stride=stride, padding=padding, + dilation=dilation, groups=groups, bias=False) + self.main_bn = nn.BatchNorm2d(num_features=out_channels) + + ver_pad = (int(((kernel_size[0] - 1) * dilation) / 2), 0) + hor_pad = (0, int(((kernel_size[1] - 1) * dilation) / 2)) + + if kernel_size[1] != 1: # 卷积核的宽大于1 -> 有垂直卷积 + self.ver_conv = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, + kernel_size=(kernel_size[0], 1), + stride=stride, padding=ver_pad, + dilation=dilation, groups=groups, bias=False) + self.ver_bn = nn.BatchNorm2d(num_features=out_channels) + else: + self.ver_conv, self.ver_bn = None, None + + if kernel_size[0] != 1: # 卷积核的高大于1 -> 有水平卷积 + self.hor_conv = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, + kernel_size=(1, kernel_size[1]), + stride=stride, padding=hor_pad, + dilation=dilation, groups=groups, bias=False) + self.hor_bn = nn.BatchNorm2d(num_features=out_channels) + else: + self.hor_conv, self.hor_bn = None, None + + self.rbr_identity = nn.BatchNorm2d( + num_features=in_channels) if out_channels == in_channels and stride == 1 else None + + def forward(self, input): + if hasattr(self, 'fused_conv'): 
+ return self.nonlinearity(self.fused_conv(input)) + else: + main_outputs = self.main_conv(input) + main_outputs = self.main_bn(main_outputs) + if self.ver_conv is not None: + vertical_outputs = self.ver_conv(input) + vertical_outputs = self.ver_bn(vertical_outputs) + else: + vertical_outputs = 0 + + if self.hor_conv is not None: + horizontal_outputs = self.hor_conv(input) + horizontal_outputs = self.hor_bn(horizontal_outputs) + else: + horizontal_outputs = 0 + + if self.rbr_identity is None: + id_out = 0 + else: + id_out = self.rbr_identity(input) + + return self.nonlinearity(main_outputs + vertical_outputs + horizontal_outputs + id_out) + + def _identity_to_conv(self, identity): + if identity is None: + return 0, 0 + assert isinstance(identity, nn.BatchNorm2d) + if not hasattr(self, 'id_tensor'): + input_dim = self.in_channels // self.groups + kernel_value = np.zeros((self.in_channels, input_dim, 1, 1), dtype=np.float32) + for i in range(self.in_channels): + kernel_value[i, i % input_dim, 0, 0] = 1 + id_tensor = torch.from_numpy(kernel_value).to(identity.weight.device) + self.id_tensor = self._pad_to_mxn_tensor(id_tensor) + kernel = self.id_tensor + running_mean = identity.running_mean + running_var = identity.running_var + gamma = identity.weight + beta = identity.bias + eps = identity.eps + std = (running_var + eps).sqrt() + t = (gamma / std).reshape(-1, 1, 1, 1) + return kernel * t, beta - running_mean * gamma / std + + def _fuse_bn_tensor(self, conv, bn): + kernel = conv.weight + kernel = self._pad_to_mxn_tensor(kernel) + running_mean = bn.running_mean + running_var = bn.running_var + gamma = bn.weight + beta = bn.bias + eps = bn.eps + std = (running_var + eps).sqrt() + t = (gamma / std).reshape(-1, 1, 1, 1) + return kernel * t, beta - running_mean * gamma / std + + def get_equivalent_kernel_bias(self): + kernel_mxn, bias_mxn = self._fuse_bn_tensor(self.main_conv, self.main_bn) + if self.ver_conv is not None: + kernel_mx1, bias_mx1 = self._fuse_bn_tensor(self.ver_conv, self.ver_bn) + else: + kernel_mx1, bias_mx1 = 0, 0 + if self.hor_conv is not None: + kernel_1xn, bias_1xn = self._fuse_bn_tensor(self.hor_conv, self.hor_bn) + else: + kernel_1xn, bias_1xn = 0, 0 + kernel_id, bias_id = self._identity_to_conv(self.rbr_identity) + kernel_mxn = kernel_mxn + kernel_mx1 + kernel_1xn + kernel_id + bias_mxn = bias_mxn + bias_mx1 + bias_1xn + bias_id + return kernel_mxn, bias_mxn + + def _pad_to_mxn_tensor(self, kernel): + kernel_height, kernel_width = self.kernel_size + height, width = kernel.shape[2:] + pad_left_right = (kernel_width - width) // 2 + pad_top_down = (kernel_height - height) // 2 + return torch.nn.functional.pad(kernel, [pad_left_right, pad_left_right, + pad_top_down, pad_top_down]) + + def switch_to_deploy(self): + if hasattr(self, 'fused_conv'): + return + kernel, bias = self.get_equivalent_kernel_bias() + self.fused_conv = nn.Conv2d(in_channels=self.main_conv.in_channels, + out_channels=self.main_conv.out_channels, + kernel_size=self.main_conv.kernel_size, stride=self.main_conv.stride, + padding=self.main_conv.padding, dilation=self.main_conv.dilation, + groups=self.main_conv.groups, bias=True) + self.fused_conv.weight.data = kernel + self.fused_conv.bias.data = bias + self.deploy = True + for para in self.parameters(): + para.detach_() + for attr in ['main_conv', 'main_bn', 'ver_conv', 'ver_bn', 'hor_conv', 'hor_bn']: + if hasattr(self, attr): + self.__delattr__(attr) + + if hasattr(self, 'rbr_identity'): + self.__delattr__('rbr_identity') + + def switch_to_test(self): + 
kernel, bias = self.get_equivalent_kernel_bias() + self.fused_conv = nn.Conv2d(in_channels=self.main_conv.in_channels, + out_channels=self.main_conv.out_channels, + kernel_size=self.main_conv.kernel_size, stride=self.main_conv.stride, + padding=self.main_conv.padding, dilation=self.main_conv.dilation, + groups=self.main_conv.groups, bias=True) + self.fused_conv.weight.data = kernel + self.fused_conv.bias.data = bias + for para in self.fused_conv.parameters(): + para.detach_() + self.deploy = True + + def switch_to_train(self): + if hasattr(self, 'fused_conv'): + self.__delattr__('fused_conv') + self.deploy = False + + @staticmethod + def is_zero_layer(): + return False + + @property + def module_str(self): + return 'Rep_%dx%d' % (self.kernel_size[0], self.kernel_size[1]) + + @property + def config(self): + return {'name': RepConvLayer.__name__, + 'in_channels': self.in_channels, + 'out_channels': self.out_channels, + 'kernel_size': self.kernel_size, + 'stride': self.stride, + 'dilation': self.dilation, + 'groups': self.groups} + + @staticmethod + def build_from_config(config): + return RepConvLayer(**config) + + +class TextNet(nn.Module): + + def __init__(self, first_conv, stage1, stage2, stage3, stage4): + super(TextNet, self).__init__() + + self.first_conv = first_conv + self.stage1 = nn.ModuleList(stage1) + self.stage2 = nn.ModuleList(stage2) + self.stage3 = nn.ModuleList(stage3) + self.stage4 = nn.ModuleList(stage4) + + self._initialize_weights() + + def _initialize_weights(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight) + elif isinstance(m, nn.BatchNorm2d): + m.weight.data.fill_(1) + m.bias.data.zero_() + + def forward(self, x): + x = self.first_conv(x) + output = list() + + for block in self.stage1: + x = block(x) + output.append(x) + + for block in self.stage2: + x = block(x) + output.append(x) + + for block in self.stage3: + x = block(x) + output.append(x) + + for block in self.stage4: + x = block(x) + output.append(x) + + return output + + @staticmethod + def build_from_config(config): + first_conv = set_layer_from_config(config['first_conv']) + stage1, stage2, stage3, stage4 = [], [], [], [] + for block_config in config['stage1']: + stage1.append(set_layer_from_config(block_config)) + for block_config in config['stage2']: + stage2.append(set_layer_from_config(block_config)) + for block_config in config['stage3']: + stage3.append(set_layer_from_config(block_config)) + for block_config in config['stage4']: + stage4.append(set_layer_from_config(block_config)) + + net = TextNet(first_conv, stage1, stage2, stage3, stage4) + + return net + + +class FASTNeck(nn.Module): + def __init__(self, reduce_layer1, reduce_layer2, reduce_layer3, reduce_layer4): + super(FASTNeck, self).__init__() + self.reduce_layer1 = reduce_layer1 + self.reduce_layer2 = reduce_layer2 + self.reduce_layer3 = reduce_layer3 + self.reduce_layer4 = reduce_layer4 + + self._initialize_weights() + + def _initialize_weights(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight) + elif isinstance(m, nn.BatchNorm2d): + m.weight.data.fill_(1) + m.bias.data.zero_() + + def _upsample(self, x, y): + _, _, H, W = y.size() + return F.upsample(x, size=(H, W), mode='bilinear') + + def forward(self, x): + f1, f2, f3, f4 = x + f1 = self.reduce_layer1(f1) + f2 = self.reduce_layer2(f2) + f3 = self.reduce_layer3(f3) + f4 = self.reduce_layer4(f4) + + f2 = self._upsample(f2, f1) + f3 = self._upsample(f3, f1) + f4 = self._upsample(f4, f1) + f 
= torch.cat((f1, f2, f3, f4), 1) + return f + + @staticmethod + def build_from_config(config): + reduce_layer1 = set_layer_from_config(config['reduce_layer1']) + reduce_layer2 = set_layer_from_config(config['reduce_layer2']) + reduce_layer3 = set_layer_from_config(config['reduce_layer3']) + reduce_layer4 = set_layer_from_config(config['reduce_layer4']) + return FASTNeck(reduce_layer1, reduce_layer2, reduce_layer3, reduce_layer4) + + +class FASTHead(nn.Module): + def __init__(self, conv, blocks, final, pooling_size, + loss_text, loss_kernel, loss_emb, dropout_ratio=0): + super(FASTHead, self).__init__() + self.conv = conv + if blocks is not None: + self.blocks = nn.ModuleList(blocks) + else: + self.blocks = None + self.final = final + + # self.text_loss = build_loss(loss_text) + # self.kernel_loss = build_loss(loss_kernel) + # self.emb_loss = build_loss(loss_emb) + + self.pooling_size = pooling_size + + self.pooling_1s = nn.MaxPool2d(kernel_size=self.pooling_size, stride=1, + padding=(self.pooling_size - 1) // 2) + self.pooling_2s = nn.MaxPool2d(kernel_size=self.pooling_size // 2 + 1, stride=1, + padding=(self.pooling_size // 2) // 2) + + if dropout_ratio > 0: + self.dropout = nn.Dropout2d(dropout_ratio) + else: + self.dropout = None + + self._initialize_weights() + + def _initialize_weights(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight) + elif isinstance(m, nn.BatchNorm2d): + m.weight.data.fill_(1) + m.bias.data.zero_() + + def forward(self, x): + x = self.conv(x) + if self.blocks is not None: + for block in self.blocks: + x = block(x) + if self.dropout is not None: + x = self.dropout(x) + x = self.final(x) + return x + + def get_results(self, out, img_meta, cfg, scale=2): + + if not self.training: + torch.cuda.synchronize() + start = time.time() + + org_img_size = img_meta['org_img_size'][0] + img_size = img_meta['img_size'][0] # 640*640 + batch_size = out.size(0) + outputs = dict() + + texts = F.interpolate(out[:, 0:1, :, :], size=(img_size[0] // scale, img_size[1] // scale), + mode='nearest') # B*1*320*320 + texts = self._max_pooling(texts, scale=scale) # B*1*320*320 + score_maps = torch.sigmoid_(texts) # B*1*320*320 + score_maps = F.interpolate(score_maps, size=(img_size[0], img_size[1]), mode='nearest') # B*1*640*640 + score_maps = score_maps.squeeze(1) # B*640*640 + + kernels = (out[:, 0, :, :] > 0).to(torch.uint8) # B*160*160 + if kernels.is_cuda: + labels_ = ccl_cuda.ccl_batch(kernels) # B*160*160 + else: + labels_ = [] + for kernel in kernels.numpy(): + ret, label_ = cv2.connectedComponents(kernel) + labels_.append(label_) + labels_ = np.array(labels_) + labels_ = torch.from_numpy(labels_) + labels = labels_.unsqueeze(1).to(torch.float32) # B*1*160*160 + labels = F.interpolate(labels, size=(img_size[0] // scale, img_size[1] // scale), mode='nearest') # B*1*320*320 + labels = self._max_pooling(labels, scale=scale) + labels = F.interpolate(labels, size=(img_size[0], img_size[1]), mode='nearest') # B*1*640*640 + labels = labels.squeeze(1).to(torch.int32) # B*640*640 + + keys = [torch.unique(labels_[i], sorted=True) for i in range(batch_size)] + + if not self.training: + torch.cuda.synchronize() + outputs.update(dict( + post_time=time.time() - start + )) + + outputs.update(dict(kernels=kernels.data.cpu())) + + scales = (float(org_img_size[1]) / float(img_size[1]), + float(org_img_size[0]) / float(img_size[0])) + + results = [] + for i in range(batch_size): + bboxes, scores = generate_bbox(keys[i], labels[i], score_maps[i], scales, 
cfg) + results.append(dict( + bboxes=bboxes, + scores=scores + )) + outputs.update(dict(results=results)) + + return outputs + + def _max_pooling(self, x, scale=1): + if scale == 1: + x = self.pooling_1s(x) + elif scale == 2: + x = self.pooling_2s(x) + return x + + # def loss(self, out, gt_texts, gt_kernels, training_masks, gt_instances): + # # output + # kernels = out[:, 0, :, :] # 4*640*640 + # texts = self._max_pooling(kernels, scale=1) # 4*640*640 + # embs = out[:, 1:, :, :] # 4*4*640*640 + # + # # text loss + # selected_masks = ohem_batch(texts, gt_texts, training_masks) + # loss_text = self.text_loss(texts, gt_texts, selected_masks, reduce=False) + # iou_text = iou((texts > 0).long(), gt_texts, training_masks, reduce=False) + # losses = dict( + # loss_text=loss_text, + # iou_text=iou_text + # ) + # + # # kernel loss + # selected_masks = gt_texts * training_masks + # loss_kernel = self.kernel_loss(kernels, gt_kernels, selected_masks, reduce=False) + # loss_kernel = torch.mean(loss_kernel, dim=0) + # iou_kernel = iou((kernels > 0).long(), gt_kernels, selected_masks, reduce=False) + # losses.update(dict( + # loss_kernels=loss_kernel, + # iou_kernel=iou_kernel + # )) + # + # # auxiliary loss + # loss_emb = self.emb_loss(embs, gt_instances, gt_kernels, training_masks, reduce=False) + # losses.update(dict( + # loss_emb=loss_emb + # )) + # + # return losses + + @staticmethod + def build_from_config(config, **kwargs): + conv = set_layer_from_config(config['conv']) + final = set_layer_from_config(config['final']) + try: + blocks = [] + for block_config in config['blocks']: + blocks.append(set_layer_from_config(block_config)) + return FASTHead(conv, blocks, final, **kwargs) + except: + return FASTHead(conv, None, final, **kwargs) + + +class FAST(nn.Module): + def __init__(self, backbone, neck, detection_head): + super(FAST, self).__init__() + self.backbone = TextNet.build_from_config(backbone) + self.neck = FASTNeck.build_from_config(neck) + self.det_head = FASTHead.build_from_config(detection_head) + + def _upsample(self, x, size, scale=1): + _, _, H, W = size + return F.interpolate(x, size=(H // scale, W // scale), mode='bilinear') + + def forward(self, imgs, gt_texts=None, gt_kernels=None, training_masks=None, + gt_instances=None, img_metas=None, cfg=None): + outputs = dict() + + if not self.training: + torch.cuda.synchronize() + start = time.time() + + # backbone + f = self.backbone(imgs) + + if not self.training: + torch.cuda.synchronize() + outputs.update(dict( + backbone_time=time.time() - start + )) + start = time.time() + + # reduce channel + f = self.neck(f) + + if not self.training: + torch.cuda.synchronize() + outputs.update(dict( + neck_time=time.time() - start + )) + start = time.time() + + # detection + det_out = self.det_head(f) + + if not self.training: + torch.cuda.synchronize() + outputs.update(dict( + det_head_time=time.time() - start + )) + + if self.training: + det_out = self._upsample(det_out, imgs.size(), scale=1) + det_loss = self.det_head.loss(det_out, gt_texts, gt_kernels, training_masks, gt_instances) + outputs.update(det_loss) + else: + det_out = self._upsample(det_out, imgs.size(), scale=4) + det_res = self.det_head.get_results(det_out, img_metas, cfg, scale=2) + outputs.update(det_res) + + return outputs From d8e1bc6eea3398efc1cb0f39402b0545222e826b Mon Sep 17 00:00:00 2001 From: raghavanone Date: Sat, 7 Oct 2023 19:44:37 +0530 Subject: [PATCH 002/152] Add config and modeling for Fast model --- src/transformers/models/fast/__init__.py | 0 
.../models/fast/configuration_fast.py | 199 ++++-- src/transformers/models/fast/modeling_fast.py | 603 +++++------------- 3 files changed, 293 insertions(+), 509 deletions(-) create mode 100644 src/transformers/models/fast/__init__.py diff --git a/src/transformers/models/fast/__init__.py b/src/transformers/models/fast/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/src/transformers/models/fast/configuration_fast.py b/src/transformers/models/fast/configuration_fast.py index 53a3791a48ae..aab305edb5de 100644 --- a/src/transformers/models/fast/configuration_fast.py +++ b/src/transformers/models/fast/configuration_fast.py @@ -3,65 +3,94 @@ class FastConfig(PretrainedConfig): - def __init__(self, - backbone_config=None, - backbone_stage1_in_channels=[64, 64, 64, 64, 64, 64, 64, 64, 64, 64], - backbone_stage1_out_channels=[64, 64, 64, 64, 64, 64, 64, 64, 64, 64], - backbone_stage1_kernel_size=[(3, 3), (3, 3), (3, 1), (3, 3), (3, 1), (3, 3), (3, 3), (1, 3), (3, 3), - (3, 3)], - backbone_stage1_stride=[1, 2, 1, 1, 1, 1, 1, 1, 1, 1], - backbone_stage1_dilation=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - backbone_stage1_groups=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - - backbone_stage2_in_channels=[64, 128, 128, 128, 128, 128, 128, 128, 128, 128], - backbone_stage2_out_channels=[128, 128, 128, 128, 128, 128, 128, 128, 128, 128], - backbone_stage2_kernel_size=[(3, 3), (3, 3), (3, 1), (3, 3), (3, 1), (3, 3), (3, 3), (1, 3), (3, 3), - (3, 3)], - backbone_stage2_stride=[1, 2, 1, 1, 1, 1, 1, 1, 1, 1], - backbone_stage2_dilation=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - backbone_stage2_groups=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - - backbone_stage3_in_channels=[64, 64, 64, 64, 64, 64, 64, 64, 64, 64], - backbone_stage3_out_channels=[64, 64, 64, 64, 64, 64, 64, 64, 64, 64], - backbone_stage3_kernel_size=[(3, 3), (3, 3), (3, 1), (3, 3), (3, 1), (3, 3), (3, 3), (1, 3), (3, 3), - (3, 3)], - backbone_stage3_stride=[1, 2, 1, 1, 1, 1, 1, 1, 1, 1], - backbone_stage3_dilation=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - backbone_stage3_groups=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - - backbone_stage4_in_channels=[64, 64, 64, 64, 64, 64, 64, 64, 64, 64], - backbone_stage4_out_channels=[64, 64, 64, 64, 64, 64, 64, 64, 64, 64], - backbone_stage4_kernel_size=[(3, 3), (3, 3), (3, 1), (3, 3), (3, 1), (3, 3), (3, 3), (1, 3), (3, 3), - (3, 3)], - backbone_stage4_stride=[1, 2, 1, 1, 1, 1, 1, 1, 1, 1], - backbone_stage4_dilation=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - backbone_stage4_groups=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - - neck_in_channels=[64, 128, 256, 512], - neck_out_channels=[128, 128, 128, 128], - neck_kernel_size=[(3, 3), (3, 3), (3, 3), (3, 3)], - neck_stride=[1, 1, 1, 1], - neck_dilation=[1, 1, 1, 1], - neck_groups=[1, 1, 1, 1], - **kwargs - ): - self.backbone_config = { - "kernel_size": 3, - "stride": 2, - "dilation": 1, - "groups": 1, - "bias": False, - "has_shuffle": False, - "in_channels": 3, - "out_channels": 64, - "use_bn": True, - "act_func": "relu", - "dropout_rate": 0, - "ops_order": "weight_bn_act" - } - super.__init__(**kwargs) - if backbone_config is not None: - self.backbone_config.update(backbone_config) + def __init__( + self, + backbone_kernel_size=3, + backbone_stride=2, + backbone_dilation=1, + backbone_groups=1, + backbone_bias=False, + backbone_has_shuffle=False, + backbone_in_channels=3, + backbone_out_channels=64, + backbone_use_bn=True, + backbone_act_func="relu", + backbone_dropout_rate=0, + backbone_ops_order="weight_bn_act", + + backbone_stage1_in_channels=(64, 64, 64), + backbone_stage1_out_channels=(64, 64, 
64), + backbone_stage1_kernel_size=((3, 3), (3, 3), (3, 3)), + backbone_stage1_stride=(1, 2, 1), + backbone_stage1_dilation=(1, 1, 1), + backbone_stage1_groups=(1, 1, 1), + + backbone_stage2_in_channels=(64, 128, 128, 128), + backbone_stage2_out_channels=(128, 128, 128, 128), + backbone_stage2_kernel_size=((3, 3), (1, 3), (3, 3), (3, 1)), + backbone_stage2_stride=(2, 1, 1, 1), + backbone_stage2_dilation=(1, 1, 1, 1), + backbone_stage2_groups=(1, 1, 1, 1), + + backbone_stage3_in_channels=(128, 256, 256, 256), + backbone_stage3_out_channels=(256, 256, 256, 256), + backbone_stage3_kernel_size=((3, 3), (3, 3), (3, 1), (1, 3)), + backbone_stage3_stride=(2, 1, 1, 1), + backbone_stage3_dilation=(1, 1, 1, 1), + backbone_stage3_groups=(1, 1, 1, 1), + + backbone_stage4_in_channels=(256, 512, 512, 512), + backbone_stage4_out_channels=(512, 512, 512, 512), + backbone_stage4_kernel_size=((3, 3), (3, 1), (1, 3), (3, 3)), + backbone_stage4_stride=(2, 1, 1, 1), + backbone_stage4_dilation=(1, 1, 1, 1), + backbone_stage4_groups=(1, 1, 1, 1), + + neck_in_channels=(64, 128, 256, 512), + neck_out_channels=(128, 128, 128, 128), + neck_kernel_size=((3, 3), (3, 3), (3, 3), (3, 3)), + neck_stride=(1, 1, 1, 1), + neck_dilation=(1, 1, 1, 1), + neck_groups=(1, 1, 1, 1), + + head_pooling_size=9, + head_dropout_ratio=0.1, + + head_conv_in_channels=512, + head_conv_out_channels=128, + head_conv_kernel_size=(3, 3), + head_conv_stride=1, + head_conv_dilation=1, + head_conv_groups=1, + + head_final_kernel_size=1, + head_final_stride=1, + head_final_dilation=1, + head_final_groups=1, + head_final_bias=False, + head_final_has_shuffle=False, + head_final_in_channels=128, + head_final_out_channels=5, + head_final_use_bn=False, + head_final_act_func=None, + head_final_dropout_rate=0, + head_final_ops_order="weight", + **kwargs + ): + super().__init__(**kwargs) + + self.backbone_kernel_size = backbone_kernel_size + self.backbone_stride = backbone_stride + self.backbone_dilation = backbone_dilation + self.backbone_groups = backbone_groups + self.backbone_bias = backbone_bias + self.backbone_has_shuffle = backbone_has_shuffle + self.backbone_in_channels = backbone_in_channels + self.backbone_out_channels = backbone_out_channels + self.backbone_use_bn = backbone_use_bn + self.backbone_act_func = backbone_act_func + self.backbone_dropout_rate = backbone_dropout_rate + self.backbone_ops_order = backbone_ops_order self.backbone_stage1_in_channels = backbone_stage1_in_channels self.backbone_stage1_out_channels = backbone_stage1_out_channels @@ -70,9 +99,53 @@ def __init__(self, self.backbone_stage1_dilation = backbone_stage1_dilation, self.backbone_stage1_groups = backbone_stage1_groups, + self.backbone_stage2_in_channels = backbone_stage2_in_channels + self.backbone_stage2_out_channels = backbone_stage2_out_channels + self.backbone_stage2_kernel_size = backbone_stage2_kernel_size, + self.backbone_stage2_stride = backbone_stage2_stride, + self.backbone_stage2_dilation = backbone_stage2_dilation, + self.backbone_stage2_groups = backbone_stage2_groups, + + self.backbone_stage3_in_channels = backbone_stage3_in_channels + self.backbone_stage3_out_channels = backbone_stage3_out_channels + self.backbone_stage3_kernel_size = backbone_stage3_kernel_size, + self.backbone_stage3_stride = backbone_stage3_stride, + self.backbone_stage3_dilation = backbone_stage3_dilation, + self.backbone_stage3_groups = backbone_stage3_groups, + + self.backbone_stage4_in_channels = backbone_stage4_in_channels + self.backbone_stage4_out_channels = 
backbone_stage4_out_channels + self.backbone_stage4_kernel_size = backbone_stage4_kernel_size, + self.backbone_stage4_stride = backbone_stage4_stride, + self.backbone_stage4_dilation = backbone_stage4_dilation, + self.backbone_stage4_groups = backbone_stage4_groups, + self.neck_in_channels = neck_in_channels, self.neck_out_channels = neck_out_channels, - self.neck_kernel_size_channels = neck_kernel_size, - self.neck_stride_channels = neck_stride, - self.neck_dilation_channels = neck_dilation, - self.neck_groups_channels = neck_groups, + self.neck_kernel_size = neck_kernel_size, + self.neck_stride = neck_stride, + self.neck_dilation = neck_dilation, + self.neck_groups = neck_groups, + + self.head_pooling_size = head_pooling_size, + self.head_dropout_ratio = head_dropout_ratio, + + self.head_conv_in_channels = head_conv_in_channels + self.head_conv_out_channels = head_conv_out_channels + self.head_conv_kernel_size = head_conv_kernel_size + self.head_conv_stride = head_conv_stride + self.head_conv_dilation = head_conv_dilation + self.head_conv_groups = head_conv_groups + + self.head_final_kernel_size = head_final_kernel_size, + self.head_final_stride = head_final_stride, + self.head_final_dilation = head_final_dilation, + self.head_final_groups = head_final_groups, + self.head_final_bias = head_final_bias, + self.head_final_has_shuffle = head_final_has_shuffle, + self.head_final_in_channels = head_final_in_channels, + self.head_final_out_channels = head_final_out_channels, + self.head_final_use_bn = head_final_use_bn, + self.head_final_act_func = head_final_act_func, + self.head_final_dropout_rate = head_final_dropout_rate, + self.head_final_ops_order = head_final_ops_order diff --git a/src/transformers/models/fast/modeling_fast.py b/src/transformers/models/fast/modeling_fast.py index 4684378ce8eb..dc415b76a0b5 100644 --- a/src/transformers/models/fast/modeling_fast.py +++ b/src/transformers/models/fast/modeling_fast.py @@ -1,228 +1,13 @@ import math from collections import OrderedDict +import cv2 import numpy as np import torch import torch.nn as nn import torch.nn.functional as F -import time -import cv2 - -class DiceLoss(nn.Module): - def __init__(self, loss_weight=1.0): - super(DiceLoss, self).__init__() - self.loss_weight = loss_weight - - def forward(self, input, target, mask, reduce=True): - batch_size = input.size(0) - input = torch.sigmoid(input) - - input = input.contiguous().view(batch_size, -1) - target = target.contiguous().view(batch_size, -1).float() - mask = mask.contiguous().view(batch_size, -1).float() - - input = input * mask - target = target * mask - - a = torch.sum(input * target, dim=1) - b = torch.sum(input * input, dim=1) + 0.001 - c = torch.sum(target * target, dim=1) + 0.001 - d = (2 * a) / (b + c) - loss = 1 - d - - loss = self.loss_weight * loss - - if reduce: - loss = torch.mean(loss) - - return loss - - -class EmbLoss_v1(nn.Module): - def __init__(self, feature_dim=4, loss_weight=1.0): - super(EmbLoss_v1, self).__init__() - self.feature_dim = feature_dim - self.loss_weight = loss_weight - self.delta_v = 0.5 - self.delta_d = 1.5 - self.weights = (1.0, 1.0) - - def forward_single(self, emb, instance, kernel, training_mask): - training_mask = (training_mask > 0.5).long() - kernel = (kernel > 0.5).long() - instance = instance * training_mask - instance_kernel = (instance * kernel).view(-1) - instance = instance.view(-1) - emb = emb.view(self.feature_dim, -1) - - unique_labels, unique_ids = torch.unique(instance_kernel, sorted=True, return_inverse=True) - 
num_instance = unique_labels.size(0) - if num_instance <= 1: - return 0 - - emb_mean = emb.new_zeros((self.feature_dim, num_instance), dtype=torch.float32) - for i, lb in enumerate(unique_labels): - if lb == 0: - continue - ind_k = instance_kernel == lb - emb_mean[:, i] = torch.mean(emb[:, ind_k], dim=1) - - l_agg = emb.new_zeros(num_instance, dtype=torch.float32) # bug - for i, lb in enumerate(unique_labels): - if lb == 0: - continue - ind = instance == lb - emb_ = emb[:, ind] - dist = (emb_ - emb_mean[:, i:i + 1]).norm(p=2, dim=0) - dist = F.relu(dist - self.delta_v) ** 2 - l_agg[i] = torch.mean(torch.log(dist + 1.0)) - l_agg = torch.mean(l_agg[1:]) - - if num_instance > 2: - emb_interleave = emb_mean.permute(1, 0).repeat(num_instance, 1) - emb_band = emb_mean.permute(1, 0).repeat(1, num_instance).view(-1, self.feature_dim) - # print(seg_band) - - mask = (1 - torch.eye(num_instance, dtype=torch.int8)).view(-1, 1).repeat(1, self.feature_dim) - mask = mask.view(num_instance, num_instance, -1) - mask[0, :, :] = 0 - mask[:, 0, :] = 0 - mask = mask.view(num_instance * num_instance, -1) - # print(mask) - - dist = emb_interleave - emb_band - dist = dist[mask > 0].view(-1, self.feature_dim).norm(p=2, dim=1) - dist = F.relu(2 * self.delta_d - dist) ** 2 - l_dis = torch.mean(torch.log(dist + 1.0)) - else: - l_dis = 0 - - l_agg = self.weights[0] * l_agg - l_dis = self.weights[1] * l_dis - l_reg = torch.mean(torch.log(torch.norm(emb_mean, 2, 0) + 1.0)) * 0.001 - loss = l_agg + l_dis + l_reg - return loss - - def forward(self, emb, instance, kernel, training_mask, reduce=True): - loss_batch = emb.new_zeros((emb.size(0)), dtype=torch.float32) - - for i in range(loss_batch.size(0)): - loss_batch[i] = self.forward_single(emb[i], instance[i], kernel[i], training_mask[i]) - - loss_batch = self.loss_weight * loss_batch - - if reduce: - loss_batch = torch.mean(loss_batch) - - return loss_batch - - -class EmbLoss_v2(nn.Module): - def __init__(self, feature_dim=4, loss_weight=1.0): - super(EmbLoss_v2, self).__init__() - self.feature_dim = feature_dim - self.loss_weight = loss_weight - self.delta_v = 0.5 - self.delta_d = 1.5 - self.weights = (1.0, 1.0) - - def forward_single(self, emb, instance, kernel, training_mask): - training_mask = (training_mask > 0.5).long() - kernel = (kernel > 0.5).long() - instance = instance * training_mask - instance_kernel = (instance * kernel).view(-1) - instance = instance.view(-1) - emb = emb.view(self.feature_dim, -1) - - unique_labels, unique_ids = torch.unique(instance_kernel, sorted=True, return_inverse=True) - num_instance = unique_labels.size(0) - if num_instance <= 1: - return 0 - - emb_mean = emb.new_zeros((self.feature_dim, num_instance), dtype=torch.float32) - for i, lb in enumerate(unique_labels): - if lb == 0: - continue - ind_k = instance_kernel == lb - emb_mean[:, i] = torch.mean(emb[:, ind_k], dim=1) - - l_agg = emb.new_zeros(num_instance, dtype=torch.float32) # bug - for i, lb in enumerate(unique_labels): - if lb == 0: - continue - ind = instance == lb - emb_ = emb[:, ind] - dist = (emb_ - emb_mean[:, i:i + 1]).norm(p=2, dim=0) - dist = F.relu(dist - self.delta_v) ** 2 - l_agg[i] = torch.mean(torch.log(dist + 1.0)) - l_agg = torch.mean(l_agg[1:]) - - if num_instance > 2: - emb_interleave = emb_mean.permute(1, 0).repeat(num_instance, 1) - emb_band = emb_mean.permute(1, 0).repeat(1, num_instance).view(-1, self.feature_dim) - # print(seg_band) - - mask = (1 - torch.eye(num_instance, dtype=torch.int8)).view(-1, 1).repeat(1, self.feature_dim) - mask = 
mask.view(num_instance, num_instance, -1) - mask[0, :, :] = 0 - mask[:, 0, :] = 0 - mask = mask.view(num_instance * num_instance, -1) - # print(mask) - - dist = emb_interleave - emb_band - dist = dist[mask > 0].view(-1, self.feature_dim).norm(p=2, dim=1) - dist = F.relu(2 * self.delta_d - dist) ** 2 - # l_dis = torch.mean(torch.log(dist + 1.0)) - - l_dis = [torch.log(dist + 1.0)] - emb_bg = emb[:, instance == 0].view(self.feature_dim, -1) - if emb_bg.size(1) > 100: - rand_ind = np.random.permutation(emb_bg.size(1))[:100] - emb_bg = emb_bg[:, rand_ind] - if emb_bg.size(1) > 0: - for i, lb in enumerate(unique_labels): - if lb == 0: - continue - dist = (emb_bg - emb_mean[:, i:i + 1]).norm(p=2, dim=0) - dist = F.relu(2 * self.delta_d - dist) ** 2 - l_dis_bg = torch.mean(torch.log(dist + 1.0), 0, keepdim=True) - l_dis.append(l_dis_bg) - l_dis = torch.mean(torch.cat(l_dis)) - else: - l_dis = 0 - - l_agg = self.weights[0] * l_agg - l_dis = self.weights[1] * l_dis - l_reg = torch.mean(torch.log(torch.norm(emb_mean, 2, 0) + 1.0)) * 0.001 - loss = l_agg + l_dis + l_reg - return loss - def forward(self, emb, instance, kernel, training_mask, reduce=True): - loss_batch = emb.new_zeros((emb.size(0)), dtype=torch.float32) - - for i in range(loss_batch.size(0)): - loss_batch[i] = self.forward_single(emb[i], instance[i], kernel[i], training_mask[i]) - - loss_batch = self.loss_weight * loss_batch - - if reduce: - loss_batch = torch.mean(loss_batch) - - return loss_batch - - -def set_layer_from_config(layer_config): - if layer_config is None: - return None - - name2layer = { - ConvLayer.__name__: ConvLayer, - RepConvLayer.__name__: RepConvLayer - } - - layer_name = layer_config.pop('name') - layer = name2layer[layer_name] - return layer.build_from_config(layer_config) +from transformers import PreTrainedModel def get_same_padding(kernel_size): @@ -236,6 +21,21 @@ def get_same_padding(kernel_size): return kernel_size // 2 +def build_activation(act_func, inplace=True): + if act_func == 'relu': + return nn.ReLU(inplace=inplace) + elif act_func == 'relu6': + return nn.ReLU6(inplace=inplace) + elif act_func == 'tanh': + return nn.Tanh() + elif act_func == 'sigmoid': + return nn.Sigmoid() + elif act_func is None: + return None + else: + raise ValueError('do not support: %s' % act_func) + + class My2DLayer(nn.Module): def __init__(self, in_channels, out_channels, @@ -365,6 +165,10 @@ def generate_bbox(keys, label, score, scales, cfg): return bboxes, scores +class FalsePreTrainedModel(PreTrainedModel): + pass + + class ConvLayer(My2DLayer): def __init__(self, in_channels, out_channels, @@ -395,10 +199,6 @@ def weight_op(self): return weight_dict - @staticmethod - def build_from_config(config): - return ConvLayer(**config) - class RepConvLayer(nn.Module): @@ -534,77 +334,104 @@ def _pad_to_mxn_tensor(self, kernel): return torch.nn.functional.pad(kernel, [pad_left_right, pad_left_right, pad_top_down, pad_top_down]) - def switch_to_deploy(self): - if hasattr(self, 'fused_conv'): - return - kernel, bias = self.get_equivalent_kernel_bias() - self.fused_conv = nn.Conv2d(in_channels=self.main_conv.in_channels, - out_channels=self.main_conv.out_channels, - kernel_size=self.main_conv.kernel_size, stride=self.main_conv.stride, - padding=self.main_conv.padding, dilation=self.main_conv.dilation, - groups=self.main_conv.groups, bias=True) - self.fused_conv.weight.data = kernel - self.fused_conv.bias.data = bias - self.deploy = True - for para in self.parameters(): - para.detach_() - for attr in ['main_conv', 'main_bn', 
'ver_conv', 'ver_bn', 'hor_conv', 'hor_bn']: - if hasattr(self, attr): - self.__delattr__(attr) - - if hasattr(self, 'rbr_identity'): - self.__delattr__('rbr_identity') - - def switch_to_test(self): - kernel, bias = self.get_equivalent_kernel_bias() - self.fused_conv = nn.Conv2d(in_channels=self.main_conv.in_channels, - out_channels=self.main_conv.out_channels, - kernel_size=self.main_conv.kernel_size, stride=self.main_conv.stride, - padding=self.main_conv.padding, dilation=self.main_conv.dilation, - groups=self.main_conv.groups, bias=True) - self.fused_conv.weight.data = kernel - self.fused_conv.bias.data = bias - for para in self.fused_conv.parameters(): - para.detach_() - self.deploy = True - - def switch_to_train(self): - if hasattr(self, 'fused_conv'): - self.__delattr__('fused_conv') - self.deploy = False - - @staticmethod - def is_zero_layer(): - return False - - @property - def module_str(self): - return 'Rep_%dx%d' % (self.kernel_size[0], self.kernel_size[1]) - - @property - def config(self): - return {'name': RepConvLayer.__name__, - 'in_channels': self.in_channels, - 'out_channels': self.out_channels, - 'kernel_size': self.kernel_size, - 'stride': self.stride, - 'dilation': self.dilation, - 'groups': self.groups} - - @staticmethod - def build_from_config(config): - return RepConvLayer(**config) - - -class TextNet(nn.Module): - - def __init__(self, first_conv, stage1, stage2, stage3, stage4): - super(TextNet, self).__init__() - - self.first_conv = first_conv + # def switch_to_deploy(self): + # if hasattr(self, 'fused_conv'): + # return + # kernel, bias = self.get_equivalent_kernel_bias() + # self.fused_conv = nn.Conv2d(in_channels=self.main_conv.in_channels, + # out_channels=self.main_conv.out_channels, + # kernel_size=self.main_conv.kernel_size, stride=self.main_conv.stride, + # padding=self.main_conv.padding, dilation=self.main_conv.dilation, + # groups=self.main_conv.groups, bias=True) + # self.fused_conv.weight.data = kernel + # self.fused_conv.bias.data = bias + # self.deploy = True + # for para in self.parameters(): + # para.detach_() + # for attr in ['main_conv', 'main_bn', 'ver_conv', 'ver_bn', 'hor_conv', 'hor_bn']: + # if hasattr(self, attr): + # self.__delattr__(attr) + # + # if hasattr(self, 'rbr_identity'): + # self.__delattr__('rbr_identity') + + # def switch_to_test(self): + # kernel, bias = self.get_equivalent_kernel_bias() + # self.fused_conv = nn.Conv2d(in_channels=self.main_conv.in_channels, + # out_channels=self.main_conv.out_channels, + # kernel_size=self.main_conv.kernel_size, stride=self.main_conv.stride, + # padding=self.main_conv.padding, dilation=self.main_conv.dilation, + # groups=self.main_conv.groups, bias=True) + # self.fused_conv.weight.data = kernel + # self.fused_conv.bias.data = bias + # for para in self.fused_conv.parameters(): + # para.detach_() + # self.deploy = True + + # def switch_to_train(self): + # if hasattr(self, 'fused_conv'): + # self.__delattr__('fused_conv') + # self.deploy = False + + # @staticmethod + # def is_zero_layer(): + # return False + + # @property + # def module_str(self): + # return 'Rep_%dx%d' % (self.kernel_size[0], self.kernel_size[1]) + + # @property + # def config(self): + # return {'name': RepConvLayer.__name__, + # 'in_channels': self.in_channels, + # 'out_channels': self.out_channels, + # 'kernel_size': self.kernel_size, + # 'stride': self.stride, + # 'dilation': self.dilation, + # 'groups': self.groups} + + # @staticmethod + # def build_from_config(config): + # return RepConvLayer(**config) + + +class 
TextNet(PreTrainedModel): + + def __init__(self, config): + super().__init__(config) + self.first_conv = ConvLayer(config.backbone_in_channels, config.backbone_out_channels, + config.backbone_kernel_size, config.backbone_stride, config.backbone_dilation, + config.backbone_groups, config.backbone_bias, config.backbone_has_shuffle, + config.backbone_use_bn, config.backbone_act_func, config.backbone_dropout_rate, + config.backbone_ops_order) + + stage1 = [] + for stage_config in zip(config.backbone_stage1_in_channels, config.backbone_stage1_out_channels, + config.backbone_stage1_kernel_size[0], config.backbone_stage1_stride[0], + config.backbone_stage1_dilation[0], config.backbone_stage1_groups[0]): + stage1.append(RepConvLayer(*stage_config)) self.stage1 = nn.ModuleList(stage1) + + stage2 = [] + for stage_config in zip(config.backbone_stage2_in_channels, config.backbone_stage2_out_channels, + config.backbone_stage2_kernel_size[0], config.backbone_stage2_stride[0], + config.backbone_stage2_dilation[0], config.backbone_stage2_groups[0]): + stage2.append(RepConvLayer(*stage_config)) self.stage2 = nn.ModuleList(stage2) + + stage3 = [] + for stage_config in zip(config.backbone_stage3_in_channels, config.backbone_stage3_out_channels, + config.backbone_stage3_kernel_size[0], config.backbone_stage3_stride[0], + config.backbone_stage3_dilation[0], config.backbone_stage3_groups[0]): + stage3.append(RepConvLayer(*stage_config)) self.stage3 = nn.ModuleList(stage3) + + stage4 = [] + for stage_config in zip(config.backbone_stage4_in_channels, config.backbone_stage4_out_channels, + config.backbone_stage4_kernel_size[0], config.backbone_stage4_stride[0], + config.backbone_stage4_dilation[0], config.backbone_stage4_groups[0]): + stage4.append(RepConvLayer(*stage_config)) self.stage4 = nn.ModuleList(stage4) self._initialize_weights() @@ -639,31 +466,18 @@ def forward(self, x): return output - @staticmethod - def build_from_config(config): - first_conv = set_layer_from_config(config['first_conv']) - stage1, stage2, stage3, stage4 = [], [], [], [] - for block_config in config['stage1']: - stage1.append(set_layer_from_config(block_config)) - for block_config in config['stage2']: - stage2.append(set_layer_from_config(block_config)) - for block_config in config['stage3']: - stage3.append(set_layer_from_config(block_config)) - for block_config in config['stage4']: - stage4.append(set_layer_from_config(block_config)) - - net = TextNet(first_conv, stage1, stage2, stage3, stage4) - - return net - - -class FASTNeck(nn.Module): - def __init__(self, reduce_layer1, reduce_layer2, reduce_layer3, reduce_layer4): - super(FASTNeck, self).__init__() - self.reduce_layer1 = reduce_layer1 - self.reduce_layer2 = reduce_layer2 - self.reduce_layer3 = reduce_layer3 - self.reduce_layer4 = reduce_layer4 + +class FASTNeck(PreTrainedModel): + + def __init__(self, config): + super().__init__(config) + reduce_layer_configs = list(zip(config.neck_in_channels[0], config.neck_out_channels[0], config.neck_kernel_size[0], + config.neck_stride[0], config.neck_dilation[0], config.neck_groups[0])) + + self.reduce_layer1 = RepConvLayer(*reduce_layer_configs[0]) + self.reduce_layer2 = RepConvLayer(*reduce_layer_configs[1]) + self.reduce_layer3 = RepConvLayer(*reduce_layer_configs[2]) + self.reduce_layer4 = RepConvLayer(*reduce_layer_configs[3]) self._initialize_weights() @@ -692,39 +506,30 @@ def forward(self, x): f = torch.cat((f1, f2, f3, f4), 1) return f - @staticmethod - def build_from_config(config): - reduce_layer1 = 
set_layer_from_config(config['reduce_layer1']) - reduce_layer2 = set_layer_from_config(config['reduce_layer2']) - reduce_layer3 = set_layer_from_config(config['reduce_layer3']) - reduce_layer4 = set_layer_from_config(config['reduce_layer4']) - return FASTNeck(reduce_layer1, reduce_layer2, reduce_layer3, reduce_layer4) - class FASTHead(nn.Module): - def __init__(self, conv, blocks, final, pooling_size, - loss_text, loss_kernel, loss_emb, dropout_ratio=0): + + def __init__(self, config): super(FASTHead, self).__init__() - self.conv = conv - if blocks is not None: - self.blocks = nn.ModuleList(blocks) - else: - self.blocks = None - self.final = final + self.conv = RepConvLayer(config.head_conv_in_channels, config.head_conv_out_channels, + config.head_conv_kernel_size, config.head_conv_stride, config.head_conv_dilation, + config.head_conv_groups) - # self.text_loss = build_loss(loss_text) - # self.kernel_loss = build_loss(loss_kernel) - # self.emb_loss = build_loss(loss_emb) + self.final = ConvLayer(config.head_final_in_channels[0], config.head_final_out_channels[0], + config.head_final_kernel_size[0], config.head_final_stride[0], config.head_final_dilation[0], + config.head_final_groups[0], config.head_final_bias[0], config.head_final_has_shuffle[0], + config.head_final_use_bn[0], config.head_final_act_func[0], config.head_final_dropout_rate[0], + config.head_final_ops_order) - self.pooling_size = pooling_size + self.pooling_size = config.head_pooling_size[0] self.pooling_1s = nn.MaxPool2d(kernel_size=self.pooling_size, stride=1, padding=(self.pooling_size - 1) // 2) self.pooling_2s = nn.MaxPool2d(kernel_size=self.pooling_size // 2 + 1, stride=1, padding=(self.pooling_size // 2) // 2) - if dropout_ratio > 0: - self.dropout = nn.Dropout2d(dropout_ratio) + if config.head_dropout_ratio[0] > 0: + self.dropout = nn.Dropout2d(config.head_dropout_ratio[0]) else: self.dropout = None @@ -740,9 +545,6 @@ def _initialize_weights(self): def forward(self, x): x = self.conv(x) - if self.blocks is not None: - for block in self.blocks: - x = block(x) if self.dropout is not None: x = self.dropout(x) x = self.final(x) @@ -750,10 +552,6 @@ def forward(self, x): def get_results(self, out, img_meta, cfg, scale=2): - if not self.training: - torch.cuda.synchronize() - start = time.time() - org_img_size = img_meta['org_img_size'][0] img_size = img_meta['img_size'][0] # 640*640 batch_size = out.size(0) @@ -767,15 +565,12 @@ def get_results(self, out, img_meta, cfg, scale=2): score_maps = score_maps.squeeze(1) # B*640*640 kernels = (out[:, 0, :, :] > 0).to(torch.uint8) # B*160*160 - if kernels.is_cuda: - labels_ = ccl_cuda.ccl_batch(kernels) # B*160*160 - else: - labels_ = [] - for kernel in kernels.numpy(): - ret, label_ = cv2.connectedComponents(kernel) - labels_.append(label_) - labels_ = np.array(labels_) - labels_ = torch.from_numpy(labels_) + labels_ = [] + for kernel in kernels.numpy(): + ret, label_ = cv2.connectedComponents(kernel) + labels_.append(label_) + labels_ = np.array(labels_) + labels_ = torch.from_numpy(labels_) labels = labels_.unsqueeze(1).to(torch.float32) # B*1*160*160 labels = F.interpolate(labels, size=(img_size[0] // scale, img_size[1] // scale), mode='nearest') # B*1*320*320 labels = self._max_pooling(labels, scale=scale) @@ -784,12 +579,6 @@ def get_results(self, out, img_meta, cfg, scale=2): keys = [torch.unique(labels_[i], sorted=True) for i in range(batch_size)] - if not self.training: - torch.cuda.synchronize() - outputs.update(dict( - post_time=time.time() - start - )) - 
outputs.update(dict(kernels=kernels.data.cpu())) scales = (float(org_img_size[1]) / float(img_size[1]), @@ -813,107 +602,29 @@ def _max_pooling(self, x, scale=1): x = self.pooling_2s(x) return x - # def loss(self, out, gt_texts, gt_kernels, training_masks, gt_instances): - # # output - # kernels = out[:, 0, :, :] # 4*640*640 - # texts = self._max_pooling(kernels, scale=1) # 4*640*640 - # embs = out[:, 1:, :, :] # 4*4*640*640 - # - # # text loss - # selected_masks = ohem_batch(texts, gt_texts, training_masks) - # loss_text = self.text_loss(texts, gt_texts, selected_masks, reduce=False) - # iou_text = iou((texts > 0).long(), gt_texts, training_masks, reduce=False) - # losses = dict( - # loss_text=loss_text, - # iou_text=iou_text - # ) - # - # # kernel loss - # selected_masks = gt_texts * training_masks - # loss_kernel = self.kernel_loss(kernels, gt_kernels, selected_masks, reduce=False) - # loss_kernel = torch.mean(loss_kernel, dim=0) - # iou_kernel = iou((kernels > 0).long(), gt_kernels, selected_masks, reduce=False) - # losses.update(dict( - # loss_kernels=loss_kernel, - # iou_kernel=iou_kernel - # )) - # - # # auxiliary loss - # loss_emb = self.emb_loss(embs, gt_instances, gt_kernels, training_masks, reduce=False) - # losses.update(dict( - # loss_emb=loss_emb - # )) - # - # return losses - @staticmethod - def build_from_config(config, **kwargs): - conv = set_layer_from_config(config['conv']) - final = set_layer_from_config(config['final']) - try: - blocks = [] - for block_config in config['blocks']: - blocks.append(set_layer_from_config(block_config)) - return FASTHead(conv, blocks, final, **kwargs) - except: - return FASTHead(conv, None, final, **kwargs) - - -class FAST(nn.Module): - def __init__(self, backbone, neck, detection_head): - super(FAST, self).__init__() - self.backbone = TextNet.build_from_config(backbone) - self.neck = FASTNeck.build_from_config(neck) - self.det_head = FASTHead.build_from_config(detection_head) +class FASTForImageCaptioning(nn.Module): + def __init__(self, config): + super().__init__() + self.backbone = TextNet(config=config) + self.neck = FASTNeck(config=config) + self.det_head = FASTHead(config=config) def _upsample(self, x, size, scale=1): _, _, H, W = size return F.interpolate(x, size=(H // scale, W // scale), mode='bilinear') - def forward(self, imgs, gt_texts=None, gt_kernels=None, training_masks=None, - gt_instances=None, img_metas=None, cfg=None): + def forward(self, imgs, img_metas=None, cfg=None): outputs = dict() - if not self.training: - torch.cuda.synchronize() - start = time.time() - - # backbone f = self.backbone(imgs) - if not self.training: - torch.cuda.synchronize() - outputs.update(dict( - backbone_time=time.time() - start - )) - start = time.time() - - # reduce channel f = self.neck(f) - if not self.training: - torch.cuda.synchronize() - outputs.update(dict( - neck_time=time.time() - start - )) - start = time.time() - - # detection det_out = self.det_head(f) - if not self.training: - torch.cuda.synchronize() - outputs.update(dict( - det_head_time=time.time() - start - )) - - if self.training: - det_out = self._upsample(det_out, imgs.size(), scale=1) - det_loss = self.det_head.loss(det_out, gt_texts, gt_kernels, training_masks, gt_instances) - outputs.update(det_loss) - else: - det_out = self._upsample(det_out, imgs.size(), scale=4) - det_res = self.det_head.get_results(det_out, img_metas, cfg, scale=2) - outputs.update(det_res) + det_out = self._upsample(det_out, imgs.size(), scale=4) + det_res = self.det_head.get_results(det_out, 
img_metas, cfg, scale=2) + outputs.update(det_res) return outputs From 185603e351fae06142700b62399f3a31e198f294 Mon Sep 17 00:00:00 2001 From: raghavanone Date: Sun, 8 Oct 2023 09:24:36 +0530 Subject: [PATCH 003/152] Refactor modeling and add tests --- src/transformers/__init__.py | 13 + src/transformers/models/fast/__init__.py | 54 +++ .../models/fast/configuration_fast.py | 207 +++++------ src/transformers/models/fast/modeling_fast.py | 342 +++++++++++------- tests/models/fast/__init__.py | 0 tests/models/fast/test_modeling_fast.py | 256 +++++++++++++ 6 files changed, 637 insertions(+), 235 deletions(-) create mode 100644 tests/models/fast/__init__.py create mode 100644 tests/models/fast/test_modeling_fast.py diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 4941d724455d..280e824efb89 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -424,6 +424,7 @@ "models.ernie_m": ["ERNIE_M_PRETRAINED_CONFIG_ARCHIVE_MAP", "ErnieMConfig"], "models.esm": ["ESM_PRETRAINED_CONFIG_ARCHIVE_MAP", "EsmConfig", "EsmTokenizer"], "models.falcon": ["FALCON_PRETRAINED_CONFIG_ARCHIVE_MAP", "FalconConfig"], +<<<<<<< HEAD "models.fastspeech2_conformer": [ "FASTSPEECH2_CONFORMER_HIFIGAN_PRETRAINED_CONFIG_ARCHIVE_MAP", "FASTSPEECH2_CONFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", @@ -433,6 +434,9 @@ "FastSpeech2ConformerTokenizer", "FastSpeech2ConformerWithHifiGanConfig", ], +======= + "models.fast": ["FastConfig"], +>>>>>>> 67fec5b40 (Refactor modeling and add tests) "models.flaubert": ["FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "FlaubertConfig", "FlaubertTokenizer"], "models.flava": [ "FLAVA_PRETRAINED_CONFIG_ARCHIVE_MAP", @@ -5113,6 +5117,7 @@ from .models.ernie_m import ERNIE_M_PRETRAINED_CONFIG_ARCHIVE_MAP, ErnieMConfig from .models.esm import ESM_PRETRAINED_CONFIG_ARCHIVE_MAP, EsmConfig, EsmTokenizer from .models.falcon import FALCON_PRETRAINED_CONFIG_ARCHIVE_MAP, FalconConfig +<<<<<<< HEAD from .models.fastspeech2_conformer import ( FASTSPEECH2_CONFORMER_HIFIGAN_PRETRAINED_CONFIG_ARCHIVE_MAP, FASTSPEECH2_CONFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, @@ -5122,6 +5127,9 @@ FastSpeech2ConformerTokenizer, FastSpeech2ConformerWithHifiGanConfig, ) +======= + from .models.fast import FastConfig +>>>>>>> 67fec5b40 (Refactor modeling and add tests) from .models.flaubert import FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, FlaubertConfig, FlaubertTokenizer from .models.flava import ( FLAVA_PRETRAINED_CONFIG_ARCHIVE_MAP, @@ -6698,12 +6706,17 @@ FalconModel, FalconPreTrainedModel, ) +<<<<<<< HEAD from .models.fastspeech2_conformer import ( FASTSPEECH2_CONFORMER_PRETRAINED_MODEL_ARCHIVE_LIST, FastSpeech2ConformerHifiGan, FastSpeech2ConformerModel, FastSpeech2ConformerPreTrainedModel, FastSpeech2ConformerWithHifiGan, +======= + from .models.fast import ( + FASTForImageCaptioning, +>>>>>>> 67fec5b40 (Refactor modeling and add tests) ) from .models.flaubert import ( FLAUBERT_PRETRAINED_MODEL_ARCHIVE_LIST, diff --git a/src/transformers/models/fast/__init__.py b/src/transformers/models/fast/__init__.py index e69de29bb2d1..6fad75850bba 100644 --- a/src/transformers/models/fast/__init__.py +++ b/src/transformers/models/fast/__init__.py @@ -0,0 +1,54 @@ +# coding=utf-8 +# Copyright 2023 the Fast authors and HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
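For context, once these exports are registered the new classes are meant to be used like any other Transformers model; a minimal construction sketch using only names introduced by this patch series (no pretrained checkpoint is assumed):

    from transformers import FastConfig, FASTForImageCaptioning

    config = FastConfig()                   # defaults mirror the TextNet backbone, neck and head defined above
    model = FASTForImageCaptioning(config)  # randomly initialised text detector (backbone + neck + head)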
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import ( + OptionalDependencyNotAvailable, + _LazyModule, + is_torch_available, +) + +_import_structure = { + "configuration_fast": ["FastConfig"], +} + +try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_fast"] = [ + "FASTForImageCaptioning" + ] + +if TYPE_CHECKING: + from .configuration_fast import FastConfig + + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_fast import ( + FASTForImageCaptioning + ) + + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/src/transformers/models/fast/configuration_fast.py b/src/transformers/models/fast/configuration_fast.py index aab305edb5de..914bcda0567f 100644 --- a/src/transformers/models/fast/configuration_fast.py +++ b/src/transformers/models/fast/configuration_fast.py @@ -2,80 +2,71 @@ class FastConfig(PretrainedConfig): - def __init__( - self, - backbone_kernel_size=3, - backbone_stride=2, - backbone_dilation=1, - backbone_groups=1, - backbone_bias=False, - backbone_has_shuffle=False, - backbone_in_channels=3, - backbone_out_channels=64, - backbone_use_bn=True, - backbone_act_func="relu", - backbone_dropout_rate=0, - backbone_ops_order="weight_bn_act", - - backbone_stage1_in_channels=(64, 64, 64), - backbone_stage1_out_channels=(64, 64, 64), - backbone_stage1_kernel_size=((3, 3), (3, 3), (3, 3)), - backbone_stage1_stride=(1, 2, 1), - backbone_stage1_dilation=(1, 1, 1), - backbone_stage1_groups=(1, 1, 1), - - backbone_stage2_in_channels=(64, 128, 128, 128), - backbone_stage2_out_channels=(128, 128, 128, 128), - backbone_stage2_kernel_size=((3, 3), (1, 3), (3, 3), (3, 1)), - backbone_stage2_stride=(2, 1, 1, 1), - backbone_stage2_dilation=(1, 1, 1, 1), - backbone_stage2_groups=(1, 1, 1, 1), - - backbone_stage3_in_channels=(128, 256, 256, 256), - backbone_stage3_out_channels=(256, 256, 256, 256), - backbone_stage3_kernel_size=((3, 3), (3, 3), (3, 1), (1, 3)), - backbone_stage3_stride=(2, 1, 1, 1), - backbone_stage3_dilation=(1, 1, 1, 1), - backbone_stage3_groups=(1, 1, 1, 1), - - backbone_stage4_in_channels=(256, 512, 512, 512), - backbone_stage4_out_channels=(512, 512, 512, 512), - backbone_stage4_kernel_size=((3, 3), (3, 1), (1, 3), (3, 3)), - backbone_stage4_stride=(2, 1, 1, 1), - backbone_stage4_dilation=(1, 1, 1, 1), - backbone_stage4_groups=(1, 1, 1, 1), - - neck_in_channels=(64, 128, 256, 512), - neck_out_channels=(128, 128, 128, 128), - neck_kernel_size=((3, 3), (3, 3), (3, 3), (3, 3)), - neck_stride=(1, 1, 1, 1), - neck_dilation=(1, 1, 1, 1), - neck_groups=(1, 1, 1, 1), - - head_pooling_size=9, - head_dropout_ratio=0.1, - - head_conv_in_channels=512, - head_conv_out_channels=128, - head_conv_kernel_size=(3, 3), - head_conv_stride=1, - head_conv_dilation=1, - head_conv_groups=1, - - head_final_kernel_size=1, - head_final_stride=1, - head_final_dilation=1, - head_final_groups=1, - 
head_final_bias=False, - head_final_has_shuffle=False, - head_final_in_channels=128, - head_final_out_channels=5, - head_final_use_bn=False, - head_final_act_func=None, - head_final_dropout_rate=0, - head_final_ops_order="weight", - **kwargs + self, + backbone_kernel_size=3, + backbone_stride=2, + backbone_dilation=1, + backbone_groups=1, + backbone_bias=False, + backbone_has_shuffle=False, + backbone_in_channels=3, + backbone_out_channels=64, + backbone_use_bn=True, + backbone_act_func="relu", + backbone_dropout_rate=0, + backbone_ops_order="weight_bn_act", + backbone_stage1_in_channels=(64, 64, 64), + backbone_stage1_out_channels=(64, 64, 64), + backbone_stage1_kernel_size=((3, 3), (3, 3), (3, 3)), + backbone_stage1_stride=(1, 2, 1), + backbone_stage1_dilation=(1, 1, 1), + backbone_stage1_groups=(1, 1, 1), + backbone_stage2_in_channels=(64, 128, 128, 128), + backbone_stage2_out_channels=(128, 128, 128, 128), + backbone_stage2_kernel_size=((3, 3), (1, 3), (3, 3), (3, 1)), + backbone_stage2_stride=(2, 1, 1, 1), + backbone_stage2_dilation=(1, 1, 1, 1), + backbone_stage2_groups=(1, 1, 1, 1), + backbone_stage3_in_channels=(128, 256, 256, 256), + backbone_stage3_out_channels=(256, 256, 256, 256), + backbone_stage3_kernel_size=((3, 3), (3, 3), (3, 1), (1, 3)), + backbone_stage3_stride=(2, 1, 1, 1), + backbone_stage3_dilation=(1, 1, 1, 1), + backbone_stage3_groups=(1, 1, 1, 1), + backbone_stage4_in_channels=(256, 512, 512, 512), + backbone_stage4_out_channels=(512, 512, 512, 512), + backbone_stage4_kernel_size=((3, 3), (3, 1), (1, 3), (3, 3)), + backbone_stage4_stride=(2, 1, 1, 1), + backbone_stage4_dilation=(1, 1, 1, 1), + backbone_stage4_groups=(1, 1, 1, 1), + neck_in_channels=(64, 128, 256, 512), + neck_out_channels=(128, 128, 128, 128), + neck_kernel_size=((3, 3), (3, 3), (3, 3), (3, 3)), + neck_stride=(1, 1, 1, 1), + neck_dilation=(1, 1, 1, 1), + neck_groups=(1, 1, 1, 1), + head_pooling_size=9, + head_dropout_ratio=0.1, + head_conv_in_channels=512, + head_conv_out_channels=128, + head_conv_kernel_size=(3, 3), + head_conv_stride=1, + head_conv_dilation=1, + head_conv_groups=1, + head_final_kernel_size=1, + head_final_stride=1, + head_final_dilation=1, + head_final_groups=1, + head_final_bias=False, + head_final_has_shuffle=False, + head_final_in_channels=128, + head_final_out_channels=5, + head_final_use_bn=False, + head_final_act_func=None, + head_final_dropout_rate=0, + head_final_ops_order="weight", + **kwargs, ): super().__init__(**kwargs) @@ -94,41 +85,41 @@ def __init__( self.backbone_stage1_in_channels = backbone_stage1_in_channels self.backbone_stage1_out_channels = backbone_stage1_out_channels - self.backbone_stage1_kernel_size = backbone_stage1_kernel_size, - self.backbone_stage1_stride = backbone_stage1_stride, - self.backbone_stage1_dilation = backbone_stage1_dilation, - self.backbone_stage1_groups = backbone_stage1_groups, + self.backbone_stage1_kernel_size = (backbone_stage1_kernel_size,) + self.backbone_stage1_stride = (backbone_stage1_stride,) + self.backbone_stage1_dilation = (backbone_stage1_dilation,) + self.backbone_stage1_groups = (backbone_stage1_groups,) self.backbone_stage2_in_channels = backbone_stage2_in_channels self.backbone_stage2_out_channels = backbone_stage2_out_channels - self.backbone_stage2_kernel_size = backbone_stage2_kernel_size, - self.backbone_stage2_stride = backbone_stage2_stride, - self.backbone_stage2_dilation = backbone_stage2_dilation, - self.backbone_stage2_groups = backbone_stage2_groups, + self.backbone_stage2_kernel_size = 
(backbone_stage2_kernel_size,) + self.backbone_stage2_stride = (backbone_stage2_stride,) + self.backbone_stage2_dilation = (backbone_stage2_dilation,) + self.backbone_stage2_groups = (backbone_stage2_groups,) self.backbone_stage3_in_channels = backbone_stage3_in_channels self.backbone_stage3_out_channels = backbone_stage3_out_channels - self.backbone_stage3_kernel_size = backbone_stage3_kernel_size, - self.backbone_stage3_stride = backbone_stage3_stride, - self.backbone_stage3_dilation = backbone_stage3_dilation, - self.backbone_stage3_groups = backbone_stage3_groups, + self.backbone_stage3_kernel_size = (backbone_stage3_kernel_size,) + self.backbone_stage3_stride = (backbone_stage3_stride,) + self.backbone_stage3_dilation = (backbone_stage3_dilation,) + self.backbone_stage3_groups = (backbone_stage3_groups,) self.backbone_stage4_in_channels = backbone_stage4_in_channels self.backbone_stage4_out_channels = backbone_stage4_out_channels - self.backbone_stage4_kernel_size = backbone_stage4_kernel_size, - self.backbone_stage4_stride = backbone_stage4_stride, - self.backbone_stage4_dilation = backbone_stage4_dilation, - self.backbone_stage4_groups = backbone_stage4_groups, + self.backbone_stage4_kernel_size = (backbone_stage4_kernel_size,) + self.backbone_stage4_stride = (backbone_stage4_stride,) + self.backbone_stage4_dilation = (backbone_stage4_dilation,) + self.backbone_stage4_groups = (backbone_stage4_groups,) - self.neck_in_channels = neck_in_channels, - self.neck_out_channels = neck_out_channels, - self.neck_kernel_size = neck_kernel_size, - self.neck_stride = neck_stride, - self.neck_dilation = neck_dilation, - self.neck_groups = neck_groups, + self.neck_in_channels = (neck_in_channels,) + self.neck_out_channels = (neck_out_channels,) + self.neck_kernel_size = (neck_kernel_size,) + self.neck_stride = (neck_stride,) + self.neck_dilation = (neck_dilation,) + self.neck_groups = (neck_groups,) - self.head_pooling_size = head_pooling_size, - self.head_dropout_ratio = head_dropout_ratio, + self.head_pooling_size = (head_pooling_size,) + self.head_dropout_ratio = (head_dropout_ratio,) self.head_conv_in_channels = head_conv_in_channels self.head_conv_out_channels = head_conv_out_channels @@ -137,15 +128,15 @@ def __init__( self.head_conv_dilation = head_conv_dilation self.head_conv_groups = head_conv_groups - self.head_final_kernel_size = head_final_kernel_size, - self.head_final_stride = head_final_stride, - self.head_final_dilation = head_final_dilation, - self.head_final_groups = head_final_groups, - self.head_final_bias = head_final_bias, - self.head_final_has_shuffle = head_final_has_shuffle, - self.head_final_in_channels = head_final_in_channels, - self.head_final_out_channels = head_final_out_channels, - self.head_final_use_bn = head_final_use_bn, - self.head_final_act_func = head_final_act_func, - self.head_final_dropout_rate = head_final_dropout_rate, + self.head_final_kernel_size = (head_final_kernel_size,) + self.head_final_stride = (head_final_stride,) + self.head_final_dilation = (head_final_dilation,) + self.head_final_groups = (head_final_groups,) + self.head_final_bias = (head_final_bias,) + self.head_final_has_shuffle = (head_final_has_shuffle,) + self.head_final_in_channels = (head_final_in_channels,) + self.head_final_out_channels = (head_final_out_channels,) + self.head_final_use_bn = (head_final_use_bn,) + self.head_final_act_func = (head_final_act_func,) + self.head_final_dropout_rate = (head_final_dropout_rate,) self.head_final_ops_order = head_final_ops_order diff --git 
a/src/transformers/models/fast/modeling_fast.py b/src/transformers/models/fast/modeling_fast.py index dc415b76a0b5..a700902b1fb1 100644 --- a/src/transformers/models/fast/modeling_fast.py +++ b/src/transformers/models/fast/modeling_fast.py @@ -1,4 +1,5 @@ import math +import unittest from collections import OrderedDict import cv2 @@ -12,34 +13,34 @@ def get_same_padding(kernel_size): if isinstance(kernel_size, tuple): - assert len(kernel_size) == 2, 'invalid kernel size: %s' % kernel_size + assert len(kernel_size) == 2, "invalid kernel size: %s" % kernel_size p1 = get_same_padding(kernel_size[0]) p2 = get_same_padding(kernel_size[1]) return p1, p2 - assert isinstance(kernel_size, int), 'kernel size should be either `int` or `tuple`' - assert kernel_size % 2 > 0, 'kernel size should be odd number' + assert isinstance(kernel_size, int), "kernel size should be either `int` or `tuple`" + assert kernel_size % 2 > 0, "kernel size should be odd number" return kernel_size // 2 def build_activation(act_func, inplace=True): - if act_func == 'relu': + if act_func == "relu": return nn.ReLU(inplace=inplace) - elif act_func == 'relu6': + elif act_func == "relu6": return nn.ReLU6(inplace=inplace) - elif act_func == 'tanh': + elif act_func == "tanh": return nn.Tanh() - elif act_func == 'sigmoid': + elif act_func == "sigmoid": return nn.Sigmoid() elif act_func is None: return None else: - raise ValueError('do not support: %s' % act_func) + raise ValueError("do not support: %s" % act_func) class My2DLayer(nn.Module): - - def __init__(self, in_channels, out_channels, - use_bn=True, act_func='relu', dropout_rate=0, ops_order='weight_bn_act'): + def __init__( + self, in_channels, out_channels, use_bn=True, act_func="relu", dropout_rate=0, ops_order="weight_bn_act" + ): super(My2DLayer, self).__init__() self.in_channels = in_channels self.out_channels = out_channels @@ -49,55 +50,55 @@ def __init__(self, in_channels, out_channels, self.dropout_rate = dropout_rate self.ops_order = ops_order - """ modules """ + """ modules""" modules = {} # batch norm if self.use_bn: if self.bn_before_weight: - modules['bn'] = nn.BatchNorm2d(in_channels) + modules["bn"] = nn.BatchNorm2d(in_channels) else: - modules['bn'] = nn.BatchNorm2d(out_channels) + modules["bn"] = nn.BatchNorm2d(out_channels) else: - modules['bn'] = None + modules["bn"] = None # activation - modules['act'] = build_activation(self.act_func, self.ops_list[0] != 'act') + modules["act"] = build_activation(self.act_func, self.ops_list[0] != "act") # dropout if self.dropout_rate > 0: - modules['dropout'] = nn.Dropout2d(self.dropout_rate, inplace=True) + modules["dropout"] = nn.Dropout2d(self.dropout_rate, inplace=True) else: - modules['dropout'] = None + modules["dropout"] = None # weight - modules['weight'] = self.weight_op() + modules["weight"] = self.weight_op() # add modules for op in self.ops_list: if modules[op] is None: continue - elif op == 'weight': - if modules['dropout'] is not None: - self.add_module('dropout', modules['dropout']) - for key in modules['weight']: - self.add_module(key, modules['weight'][key]) + elif op == "weight": + if modules["dropout"] is not None: + self.add_module("dropout", modules["dropout"]) + for key in modules["weight"]: + self.add_module(key, modules["weight"][key]) else: self.add_module(op, modules[op]) @property def ops_list(self): - return self.ops_order.split('_') + return self.ops_order.split("_") @property def bn_before_weight(self): for op in self.ops_list: - if op == 'bn': + if op == "bn": return True - elif op == 
'weight': + elif op == "weight": return False - raise ValueError('Invalid ops_order: %s' % self.ops_order) + raise ValueError("Invalid ops_order: %s" % self.ops_order) def weight_op(self): raise NotImplementedError - """ Methods defined in MyModule """ + """ Methods defined in MyModule""" def forward(self, x): for module in self._modules.values(): @@ -111,12 +112,12 @@ def module_str(self): @property def config(self): return { - 'in_channels': self.in_channels, - 'out_channels': self.out_channels, - 'use_bn': self.use_bn, - 'act_func': self.act_func, - 'dropout_rate': self.dropout_rate, - 'ops_order': self.ops_order, + "in_channels": self.in_channels, + "out_channels": self.out_channels, + "use_bn": self.use_bn, + "act_func": self.act_func, + "dropout_rate": self.dropout_rate, + "ops_order": self.ops_order, } @staticmethod @@ -137,7 +138,7 @@ def generate_bbox(keys, label, score, scales, cfg): scores = [] for index in range(1, label_num): i = keys[index] - ind = (label == i) + ind = label == i ind_np = ind.data.cpu().numpy() points = np.array(np.where(ind_np)).transpose((1, 0)) if points.shape[0] < cfg.test_cfg.min_area: @@ -148,18 +149,18 @@ def generate_bbox(keys, label, score, scales, cfg): label[ind] = 0 continue - if cfg.test_cfg.bbox_type == 'rect': + if cfg.test_cfg.bbox_type == "rect": rect = cv2.minAreaRect(points[:, ::-1]) alpha = math.sqrt(math.sqrt(points.shape[0] / (rect[1][0] * rect[1][1]))) rect = (rect[0], (rect[1][0] * alpha, rect[1][1] * alpha), rect[2]) bbox = cv2.boxPoints(rect) * scales - elif cfg.test_cfg.bbox_type == 'poly': - binary = np.zeros(label.shape, dtype='uint8') + elif cfg.test_cfg.bbox_type == "poly": + binary = np.zeros(label.shape, dtype="uint8") binary[ind_np] = 1 contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) bbox = contours[0] * scales - bbox = bbox.astype('int32') + bbox = bbox.astype("int32") bboxes.append(bbox.reshape(-1).tolist()) scores.append(score_i) return bboxes, scores @@ -170,10 +171,21 @@ class FalsePreTrainedModel(PreTrainedModel): class ConvLayer(My2DLayer): - - def __init__(self, in_channels, out_channels, - kernel_size=3, stride=1, dilation=1, groups=1, bias=False, has_shuffle=False, - use_bn=True, act_func='relu', dropout_rate=0, ops_order='weight_bn_act'): + def __init__( + self, + in_channels, + out_channels, + kernel_size=3, + stride=1, + dilation=1, + groups=1, + bias=False, + has_shuffle=False, + use_bn=True, + act_func="relu", + dropout_rate=0, + ops_order="weight_bn_act", + ): self.kernel_size = kernel_size self.stride = stride self.dilation = dilation @@ -192,16 +204,21 @@ def weight_op(self): padding[1] *= self.dilation weight_dict = OrderedDict() - weight_dict['conv'] = nn.Conv2d( - self.in_channels, self.out_channels, kernel_size=self.kernel_size, stride=self.stride, padding=padding, - dilation=self.dilation, groups=self.groups, bias=self.bias + weight_dict["conv"] = nn.Conv2d( + self.in_channels, + self.out_channels, + kernel_size=self.kernel_size, + stride=self.stride, + padding=padding, + dilation=self.dilation, + groups=self.groups, + bias=self.bias, ) return weight_dict class RepConvLayer(nn.Module): - def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, groups=1, deploy=False): super(RepConvLayer, self).__init__() @@ -214,47 +231,73 @@ def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, self.deploy = deploy assert len(kernel_size) == 2 - padding = (int(((kernel_size[0] - 1) * dilation) / 2), - int(((kernel_size[1] - 1) * 
dilation) / 2)) + padding = (int(((kernel_size[0] - 1) * dilation) / 2), int(((kernel_size[1] - 1) * dilation) / 2)) self.nonlinearity = nn.ReLU(inplace=True) if deploy: - self.fused_conv = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, - kernel_size=kernel_size, stride=stride, padding=padding, - dilation=dilation, groups=groups, bias=True) + self.fused_conv = nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + bias=True, + ) else: - self.main_conv = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, - kernel_size=kernel_size, stride=stride, padding=padding, - dilation=dilation, groups=groups, bias=False) + self.main_conv = nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + bias=False, + ) self.main_bn = nn.BatchNorm2d(num_features=out_channels) ver_pad = (int(((kernel_size[0] - 1) * dilation) / 2), 0) hor_pad = (0, int(((kernel_size[1] - 1) * dilation) / 2)) if kernel_size[1] != 1: # 卷积核的宽大于1 -> 有垂直卷积 - self.ver_conv = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, - kernel_size=(kernel_size[0], 1), - stride=stride, padding=ver_pad, - dilation=dilation, groups=groups, bias=False) + self.ver_conv = nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=(kernel_size[0], 1), + stride=stride, + padding=ver_pad, + dilation=dilation, + groups=groups, + bias=False, + ) self.ver_bn = nn.BatchNorm2d(num_features=out_channels) else: self.ver_conv, self.ver_bn = None, None if kernel_size[0] != 1: # 卷积核的高大于1 -> 有水平卷积 - self.hor_conv = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, - kernel_size=(1, kernel_size[1]), - stride=stride, padding=hor_pad, - dilation=dilation, groups=groups, bias=False) + self.hor_conv = nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=(1, kernel_size[1]), + stride=stride, + padding=hor_pad, + dilation=dilation, + groups=groups, + bias=False, + ) self.hor_bn = nn.BatchNorm2d(num_features=out_channels) else: self.hor_conv, self.hor_bn = None, None - self.rbr_identity = nn.BatchNorm2d( - num_features=in_channels) if out_channels == in_channels and stride == 1 else None + self.rbr_identity = ( + nn.BatchNorm2d(num_features=in_channels) if out_channels == in_channels and stride == 1 else None + ) def forward(self, input): - if hasattr(self, 'fused_conv'): + if hasattr(self, "fused_conv"): return self.nonlinearity(self.fused_conv(input)) else: main_outputs = self.main_conv(input) @@ -282,7 +325,7 @@ def _identity_to_conv(self, identity): if identity is None: return 0, 0 assert isinstance(identity, nn.BatchNorm2d) - if not hasattr(self, 'id_tensor'): + if not hasattr(self, "id_tensor"): input_dim = self.in_channels // self.groups kernel_value = np.zeros((self.in_channels, input_dim, 1, 1), dtype=np.float32) for i in range(self.in_channels): @@ -331,8 +374,7 @@ def _pad_to_mxn_tensor(self, kernel): height, width = kernel.shape[2:] pad_left_right = (kernel_width - width) // 2 pad_top_down = (kernel_height - height) // 2 - return torch.nn.functional.pad(kernel, [pad_left_right, pad_left_right, - pad_top_down, pad_top_down]) + return torch.nn.functional.pad(kernel, [pad_left_right, pad_left_right, pad_top_down, pad_top_down]) # def switch_to_deploy(self): # if hasattr(self, 'fused_conv'): @@ -397,40 +439,68 @@ def _pad_to_mxn_tensor(self, 
kernel): class TextNet(PreTrainedModel): - def __init__(self, config): super().__init__(config) - self.first_conv = ConvLayer(config.backbone_in_channels, config.backbone_out_channels, - config.backbone_kernel_size, config.backbone_stride, config.backbone_dilation, - config.backbone_groups, config.backbone_bias, config.backbone_has_shuffle, - config.backbone_use_bn, config.backbone_act_func, config.backbone_dropout_rate, - config.backbone_ops_order) + self.first_conv = ConvLayer( + config.backbone_in_channels, + config.backbone_out_channels, + config.backbone_kernel_size, + config.backbone_stride, + config.backbone_dilation, + config.backbone_groups, + config.backbone_bias, + config.backbone_has_shuffle, + config.backbone_use_bn, + config.backbone_act_func, + config.backbone_dropout_rate, + config.backbone_ops_order, + ) stage1 = [] - for stage_config in zip(config.backbone_stage1_in_channels, config.backbone_stage1_out_channels, - config.backbone_stage1_kernel_size[0], config.backbone_stage1_stride[0], - config.backbone_stage1_dilation[0], config.backbone_stage1_groups[0]): + for stage_config in zip( + config.backbone_stage1_in_channels, + config.backbone_stage1_out_channels, + config.backbone_stage1_kernel_size[0], + config.backbone_stage1_stride[0], + config.backbone_stage1_dilation[0], + config.backbone_stage1_groups[0], + ): stage1.append(RepConvLayer(*stage_config)) self.stage1 = nn.ModuleList(stage1) stage2 = [] - for stage_config in zip(config.backbone_stage2_in_channels, config.backbone_stage2_out_channels, - config.backbone_stage2_kernel_size[0], config.backbone_stage2_stride[0], - config.backbone_stage2_dilation[0], config.backbone_stage2_groups[0]): + for stage_config in zip( + config.backbone_stage2_in_channels, + config.backbone_stage2_out_channels, + config.backbone_stage2_kernel_size[0], + config.backbone_stage2_stride[0], + config.backbone_stage2_dilation[0], + config.backbone_stage2_groups[0], + ): stage2.append(RepConvLayer(*stage_config)) self.stage2 = nn.ModuleList(stage2) stage3 = [] - for stage_config in zip(config.backbone_stage3_in_channels, config.backbone_stage3_out_channels, - config.backbone_stage3_kernel_size[0], config.backbone_stage3_stride[0], - config.backbone_stage3_dilation[0], config.backbone_stage3_groups[0]): + for stage_config in zip( + config.backbone_stage3_in_channels, + config.backbone_stage3_out_channels, + config.backbone_stage3_kernel_size[0], + config.backbone_stage3_stride[0], + config.backbone_stage3_dilation[0], + config.backbone_stage3_groups[0], + ): stage3.append(RepConvLayer(*stage_config)) self.stage3 = nn.ModuleList(stage3) stage4 = [] - for stage_config in zip(config.backbone_stage4_in_channels, config.backbone_stage4_out_channels, - config.backbone_stage4_kernel_size[0], config.backbone_stage4_stride[0], - config.backbone_stage4_dilation[0], config.backbone_stage4_groups[0]): + for stage_config in zip( + config.backbone_stage4_in_channels, + config.backbone_stage4_out_channels, + config.backbone_stage4_kernel_size[0], + config.backbone_stage4_stride[0], + config.backbone_stage4_dilation[0], + config.backbone_stage4_groups[0], + ): stage4.append(RepConvLayer(*stage_config)) self.stage4 = nn.ModuleList(stage4) @@ -446,7 +516,7 @@ def _initialize_weights(self): def forward(self, x): x = self.first_conv(x) - output = list() + output = [] for block in self.stage1: x = block(x) @@ -468,11 +538,18 @@ def forward(self, x): class FASTNeck(PreTrainedModel): - def __init__(self, config): super().__init__(config) - reduce_layer_configs = 
list(zip(config.neck_in_channels[0], config.neck_out_channels[0], config.neck_kernel_size[0], - config.neck_stride[0], config.neck_dilation[0], config.neck_groups[0])) + reduce_layer_configs = list( + zip( + config.neck_in_channels[0], + config.neck_out_channels[0], + config.neck_kernel_size[0], + config.neck_stride[0], + config.neck_dilation[0], + config.neck_groups[0], + ) + ) self.reduce_layer1 = RepConvLayer(*reduce_layer_configs[0]) self.reduce_layer2 = RepConvLayer(*reduce_layer_configs[1]) @@ -491,7 +568,7 @@ def _initialize_weights(self): def _upsample(self, x, y): _, _, H, W = y.size() - return F.upsample(x, size=(H, W), mode='bilinear') + return F.upsample(x, size=(H, W), mode="bilinear") def forward(self, x): f1, f2, f3, f4 = x @@ -508,25 +585,38 @@ def forward(self, x): class FASTHead(nn.Module): - def __init__(self, config): super(FASTHead, self).__init__() - self.conv = RepConvLayer(config.head_conv_in_channels, config.head_conv_out_channels, - config.head_conv_kernel_size, config.head_conv_stride, config.head_conv_dilation, - config.head_conv_groups) + self.conv = RepConvLayer( + config.head_conv_in_channels, + config.head_conv_out_channels, + config.head_conv_kernel_size, + config.head_conv_stride, + config.head_conv_dilation, + config.head_conv_groups, + ) - self.final = ConvLayer(config.head_final_in_channels[0], config.head_final_out_channels[0], - config.head_final_kernel_size[0], config.head_final_stride[0], config.head_final_dilation[0], - config.head_final_groups[0], config.head_final_bias[0], config.head_final_has_shuffle[0], - config.head_final_use_bn[0], config.head_final_act_func[0], config.head_final_dropout_rate[0], - config.head_final_ops_order) + self.final = ConvLayer( + config.head_final_in_channels[0], + config.head_final_out_channels[0], + config.head_final_kernel_size[0], + config.head_final_stride[0], + config.head_final_dilation[0], + config.head_final_groups[0], + config.head_final_bias[0], + config.head_final_has_shuffle[0], + config.head_final_use_bn[0], + config.head_final_act_func[0], + config.head_final_dropout_rate[0], + config.head_final_ops_order, + ) self.pooling_size = config.head_pooling_size[0] - self.pooling_1s = nn.MaxPool2d(kernel_size=self.pooling_size, stride=1, - padding=(self.pooling_size - 1) // 2) - self.pooling_2s = nn.MaxPool2d(kernel_size=self.pooling_size // 2 + 1, stride=1, - padding=(self.pooling_size // 2) // 2) + self.pooling_1s = nn.MaxPool2d(kernel_size=self.pooling_size, stride=1, padding=(self.pooling_size - 1) // 2) + self.pooling_2s = nn.MaxPool2d( + kernel_size=self.pooling_size // 2 + 1, stride=1, padding=(self.pooling_size // 2) // 2 + ) if config.head_dropout_ratio[0] > 0: self.dropout = nn.Dropout2d(config.head_dropout_ratio[0]) @@ -551,17 +641,17 @@ def forward(self, x): return x def get_results(self, out, img_meta, cfg, scale=2): - - org_img_size = img_meta['org_img_size'][0] - img_size = img_meta['img_size'][0] # 640*640 + org_img_size = img_meta["org_img_size"][0] + img_size = img_meta["img_size"][0] # 640*640 batch_size = out.size(0) - outputs = dict() + outputs = {} - texts = F.interpolate(out[:, 0:1, :, :], size=(img_size[0] // scale, img_size[1] // scale), - mode='nearest') # B*1*320*320 + texts = F.interpolate( + out[:, 0:1, :, :], size=(img_size[0] // scale, img_size[1] // scale), mode="nearest" + ) # B*1*320*320 texts = self._max_pooling(texts, scale=scale) # B*1*320*320 score_maps = torch.sigmoid_(texts) # B*1*320*320 - score_maps = F.interpolate(score_maps, size=(img_size[0], img_size[1]), 
mode='nearest') # B*1*640*640 + score_maps = F.interpolate(score_maps, size=(img_size[0], img_size[1]), mode="nearest") # B*1*640*640 score_maps = score_maps.squeeze(1) # B*640*640 kernels = (out[:, 0, :, :] > 0).to(torch.uint8) # B*160*160 @@ -572,26 +662,24 @@ def get_results(self, out, img_meta, cfg, scale=2): labels_ = np.array(labels_) labels_ = torch.from_numpy(labels_) labels = labels_.unsqueeze(1).to(torch.float32) # B*1*160*160 - labels = F.interpolate(labels, size=(img_size[0] // scale, img_size[1] // scale), mode='nearest') # B*1*320*320 + labels = F.interpolate( + labels, size=(img_size[0] // scale, img_size[1] // scale), mode="nearest" + ) # B*1*320*320 labels = self._max_pooling(labels, scale=scale) - labels = F.interpolate(labels, size=(img_size[0], img_size[1]), mode='nearest') # B*1*640*640 + labels = F.interpolate(labels, size=(img_size[0], img_size[1]), mode="nearest") # B*1*640*640 labels = labels.squeeze(1).to(torch.int32) # B*640*640 keys = [torch.unique(labels_[i], sorted=True) for i in range(batch_size)] - outputs.update(dict(kernels=kernels.data.cpu())) + outputs.update({"kernels": kernels.data.cpu()}) - scales = (float(org_img_size[1]) / float(img_size[1]), - float(org_img_size[0]) / float(img_size[0])) + scales = (float(org_img_size[1]) / float(img_size[1]), float(org_img_size[0]) / float(img_size[0])) results = [] for i in range(batch_size): bboxes, scores = generate_bbox(keys[i], labels[i], score_maps[i], scales, cfg) - results.append(dict( - bboxes=bboxes, - scores=scores - )) - outputs.update(dict(results=results)) + results.append({"bboxes": bboxes, "scores": scores}) + outputs.update({"results": results}) return outputs @@ -603,19 +691,19 @@ def _max_pooling(self, x, scale=1): return x -class FASTForImageCaptioning(nn.Module): +class FASTForImageCaptioning(PreTrainedModel): def __init__(self, config): - super().__init__() + super().__init__(config) self.backbone = TextNet(config=config) self.neck = FASTNeck(config=config) self.det_head = FASTHead(config=config) def _upsample(self, x, size, scale=1): _, _, H, W = size - return F.interpolate(x, size=(H // scale, W // scale), mode='bilinear') + return F.interpolate(x, size=(H // scale, W // scale), mode="bilinear") def forward(self, imgs, img_metas=None, cfg=None): - outputs = dict() + outputs = {} f = self.backbone(imgs) diff --git a/tests/models/fast/__init__.py b/tests/models/fast/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/models/fast/test_modeling_fast.py b/tests/models/fast/test_modeling_fast.py new file mode 100644 index 000000000000..25fcaffb82a0 --- /dev/null +++ b/tests/models/fast/test_modeling_fast.py @@ -0,0 +1,256 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch Falcon model. 
""" + +import unittest + +from parameterized import parameterized + +from transformers import ( + FastConfig, + is_torch_available, + set_seed, +) +from transformers.testing_utils import CaptureLogger, require_bitsandbytes, require_torch, slow, tooslow, torch_device +from transformers.utils import logging as transformers_logging + +from ...generation.test_utils import GenerationTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask, floats_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + +if is_torch_available(): + import torch + + from transformers import ( + FASTForImageCaptioning, + ) + + +class FastModelTester: + def __init__( + self, + backbone_kernel_size=3, + backbone_stride=2, + backbone_dilation=1, + backbone_groups=1, + backbone_bias=False, + backbone_has_shuffle=False, + backbone_in_channels=3, + backbone_out_channels=64, + backbone_use_bn=True, + backbone_act_func="relu", + backbone_dropout_rate=0, + backbone_ops_order="weight_bn_act", + backbone_stage1_in_channels=(64, 64, 64), + backbone_stage1_out_channels=(64, 64, 64), + backbone_stage1_kernel_size=((3, 3), (3, 3), (3, 3)), + backbone_stage1_stride=(1, 2, 1), + backbone_stage1_dilation=(1, 1, 1), + backbone_stage1_groups=(1, 1, 1), + backbone_stage2_in_channels=(64, 128, 128, 128), + backbone_stage2_out_channels=(128, 128, 128, 128), + backbone_stage2_kernel_size=((3, 3), (1, 3), (3, 3), (3, 1)), + backbone_stage2_stride=(2, 1, 1, 1), + backbone_stage2_dilation=(1, 1, 1, 1), + backbone_stage2_groups=(1, 1, 1, 1), + backbone_stage3_in_channels=(128, 256, 256, 256), + backbone_stage3_out_channels=(256, 256, 256, 256), + backbone_stage3_kernel_size=((3, 3), (3, 3), (3, 1), (1, 3)), + backbone_stage3_stride=(2, 1, 1, 1), + backbone_stage3_dilation=(1, 1, 1, 1), + backbone_stage3_groups=(1, 1, 1, 1), + backbone_stage4_in_channels=(256, 512, 512, 512), + backbone_stage4_out_channels=(512, 512, 512, 512), + backbone_stage4_kernel_size=((3, 3), (3, 1), (1, 3), (3, 3)), + backbone_stage4_stride=(2, 1, 1, 1), + backbone_stage4_dilation=(1, 1, 1, 1), + backbone_stage4_groups=(1, 1, 1, 1), + neck_in_channels=(64, 128, 256, 512), + neck_out_channels=(128, 128, 128, 128), + neck_kernel_size=((3, 3), (3, 3), (3, 3), (3, 3)), + neck_stride=(1, 1, 1, 1), + neck_dilation=(1, 1, 1, 1), + neck_groups=(1, 1, 1, 1), + head_pooling_size=9, + head_dropout_ratio=0.1, + head_conv_in_channels=512, + head_conv_out_channels=128, + head_conv_kernel_size=(3, 3), + head_conv_stride=1, + head_conv_dilation=1, + head_conv_groups=1, + head_final_kernel_size=1, + head_final_stride=1, + head_final_dilation=1, + head_final_groups=1, + head_final_bias=False, + head_final_has_shuffle=False, + head_final_in_channels=128, + head_final_out_channels=5, + head_final_use_bn=False, + head_final_act_func=None, + head_final_dropout_rate=0, + head_final_ops_order="weight", + batch_size=3, + num_channels=3, + image_size=500, + + ): + self.backbone_kernel_size = backbone_kernel_size + self.backbone_stride = backbone_stride + self.backbone_dilation = backbone_dilation + self.backbone_groups = backbone_groups + self.backbone_bias = backbone_bias + self.backbone_has_shuffle = backbone_has_shuffle + self.backbone_in_channels = backbone_in_channels + self.backbone_out_channels = backbone_out_channels + self.backbone_use_bn = backbone_use_bn + self.backbone_act_func = backbone_act_func + self.backbone_dropout_rate = backbone_dropout_rate + self.backbone_ops_order = 
backbone_ops_order + + self.backbone_stage1_in_channels = backbone_stage1_in_channels + self.backbone_stage1_out_channels = backbone_stage1_out_channels + self.backbone_stage1_kernel_size = (backbone_stage1_kernel_size,) + self.backbone_stage1_stride = (backbone_stage1_stride,) + self.backbone_stage1_dilation = (backbone_stage1_dilation,) + self.backbone_stage1_groups = (backbone_stage1_groups,) + + self.backbone_stage2_in_channels = backbone_stage2_in_channels + self.backbone_stage2_out_channels = backbone_stage2_out_channels + self.backbone_stage2_kernel_size = (backbone_stage2_kernel_size,) + self.backbone_stage2_stride = (backbone_stage2_stride,) + self.backbone_stage2_dilation = (backbone_stage2_dilation,) + self.backbone_stage2_groups = (backbone_stage2_groups,) + + self.backbone_stage3_in_channels = backbone_stage3_in_channels + self.backbone_stage3_out_channels = backbone_stage3_out_channels + self.backbone_stage3_kernel_size = (backbone_stage3_kernel_size,) + self.backbone_stage3_stride = (backbone_stage3_stride,) + self.backbone_stage3_dilation = (backbone_stage3_dilation,) + self.backbone_stage3_groups = (backbone_stage3_groups,) + + self.backbone_stage4_in_channels = backbone_stage4_in_channels + self.backbone_stage4_out_channels = backbone_stage4_out_channels + self.backbone_stage4_kernel_size = (backbone_stage4_kernel_size,) + self.backbone_stage4_stride = (backbone_stage4_stride,) + self.backbone_stage4_dilation = (backbone_stage4_dilation,) + self.backbone_stage4_groups = (backbone_stage4_groups,) + + self.neck_in_channels = (neck_in_channels,) + self.neck_out_channels = (neck_out_channels,) + self.neck_kernel_size = (neck_kernel_size,) + self.neck_stride = (neck_stride,) + self.neck_dilation = (neck_dilation,) + self.neck_groups = (neck_groups,) + + self.head_pooling_size = (head_pooling_size,) + self.head_dropout_ratio = (head_dropout_ratio,) + + self.head_conv_in_channels = head_conv_in_channels + self.head_conv_out_channels = head_conv_out_channels + self.head_conv_kernel_size = head_conv_kernel_size + self.head_conv_stride = head_conv_stride + self.head_conv_dilation = head_conv_dilation + self.head_conv_groups = head_conv_groups + + self.head_final_kernel_size = (head_final_kernel_size,) + self.head_final_stride = (head_final_stride,) + self.head_final_dilation = (head_final_dilation,) + self.head_final_groups = (head_final_groups,) + self.head_final_bias = (head_final_bias,) + self.head_final_has_shuffle = (head_final_has_shuffle,) + self.head_final_in_channels = (head_final_in_channels,) + self.head_final_out_channels = (head_final_out_channels,) + self.head_final_use_bn = (head_final_use_bn,) + self.head_final_act_func = (head_final_act_func,) + self.head_final_dropout_rate = (head_final_dropout_rate,) + self.head_final_ops_order = head_final_ops_order + + self.batch_size = batch_size + self.num_channels = num_channels + self.image_size = image_size + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + pixel_values_meta = { + "org_img_size": (500, 500), + "img_size": (500, 500) + } + # labels = None + # if self.use_labels: + # labels = ids_tensor([self.batch_size], self.num_labels) + # + config = self.get_config() + + return config, {"imgs": pixel_values, "img_meta": pixel_values_meta} + + def get_config(self): + return FastConfig() + + def create_and_check_model(self, config, pixel_values): + model = FASTForImageCaptioning(config=config) + model.to(torch_device) + 
model.eval() + result = model(pixel_values) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values = config_and_inputs + inputs_dict = {"imgs": pixel_values} + return config, inputs_dict + + +@require_torch +class FastModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = ( + ( + FASTForImageCaptioning, + ) + if is_torch_available() + else () + ) + + pipeline_model_mapping = {} + test_headmasking = False + test_pruning = False + test_attention_outputs = False + test_resize_embeddings = False + test_head_masking = False + has_attentions = False + + def setUp(self): + self.model_tester = FastModelTester(self) + self.config_tester = ConfigTester(self, config_class=FastConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def create_and_test_config_common_properties(self): + return + + @unittest.skip(reason="Fast does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + @unittest.skip(reason="Fast does not support input and output embeddings") + def test_model_common_attributes(self): + pass From 5d21171ebf039f21fde59288db4ac928c2972be7 Mon Sep 17 00:00:00 2001 From: raghavanone Date: Wed, 11 Oct 2023 08:16:10 +0530 Subject: [PATCH 004/152] More changes --- .../models/fast/configuration_fast.py | 205 ++++++++-------- src/transformers/models/fast/modeling_fast.py | 158 ++++++------- tests/models/fast/test_modeling_fast.py | 219 ++++++++++++------ 3 files changed, 331 insertions(+), 251 deletions(-) diff --git a/src/transformers/models/fast/configuration_fast.py b/src/transformers/models/fast/configuration_fast.py index 914bcda0567f..773dbcb151c7 100644 --- a/src/transformers/models/fast/configuration_fast.py +++ b/src/transformers/models/fast/configuration_fast.py @@ -3,70 +3,73 @@ class FastConfig(PretrainedConfig): def __init__( - self, - backbone_kernel_size=3, - backbone_stride=2, - backbone_dilation=1, - backbone_groups=1, - backbone_bias=False, - backbone_has_shuffle=False, - backbone_in_channels=3, - backbone_out_channels=64, - backbone_use_bn=True, - backbone_act_func="relu", - backbone_dropout_rate=0, - backbone_ops_order="weight_bn_act", - backbone_stage1_in_channels=(64, 64, 64), - backbone_stage1_out_channels=(64, 64, 64), - backbone_stage1_kernel_size=((3, 3), (3, 3), (3, 3)), - backbone_stage1_stride=(1, 2, 1), - backbone_stage1_dilation=(1, 1, 1), - backbone_stage1_groups=(1, 1, 1), - backbone_stage2_in_channels=(64, 128, 128, 128), - backbone_stage2_out_channels=(128, 128, 128, 128), - backbone_stage2_kernel_size=((3, 3), (1, 3), (3, 3), (3, 1)), - backbone_stage2_stride=(2, 1, 1, 1), - backbone_stage2_dilation=(1, 1, 1, 1), - backbone_stage2_groups=(1, 1, 1, 1), - backbone_stage3_in_channels=(128, 256, 256, 256), - backbone_stage3_out_channels=(256, 256, 256, 256), - backbone_stage3_kernel_size=((3, 3), (3, 3), (3, 1), (1, 3)), - backbone_stage3_stride=(2, 1, 1, 1), - backbone_stage3_dilation=(1, 1, 1, 1), - backbone_stage3_groups=(1, 1, 1, 1), - backbone_stage4_in_channels=(256, 512, 512, 512), - backbone_stage4_out_channels=(512, 512, 512, 512), - backbone_stage4_kernel_size=((3, 3), (3, 1), (1, 3), (3, 3)), - 
backbone_stage4_stride=(2, 1, 1, 1), - backbone_stage4_dilation=(1, 1, 1, 1), - backbone_stage4_groups=(1, 1, 1, 1), - neck_in_channels=(64, 128, 256, 512), - neck_out_channels=(128, 128, 128, 128), - neck_kernel_size=((3, 3), (3, 3), (3, 3), (3, 3)), - neck_stride=(1, 1, 1, 1), - neck_dilation=(1, 1, 1, 1), - neck_groups=(1, 1, 1, 1), - head_pooling_size=9, - head_dropout_ratio=0.1, - head_conv_in_channels=512, - head_conv_out_channels=128, - head_conv_kernel_size=(3, 3), - head_conv_stride=1, - head_conv_dilation=1, - head_conv_groups=1, - head_final_kernel_size=1, - head_final_stride=1, - head_final_dilation=1, - head_final_groups=1, - head_final_bias=False, - head_final_has_shuffle=False, - head_final_in_channels=128, - head_final_out_channels=5, - head_final_use_bn=False, - head_final_act_func=None, - head_final_dropout_rate=0, - head_final_ops_order="weight", - **kwargs, + self, + backbone_kernel_size=3, + backbone_stride=2, + backbone_dilation=1, + backbone_groups=1, + backbone_bias=False, + backbone_has_shuffle=False, + backbone_in_channels=3, + backbone_out_channels=64, + backbone_use_bn=True, + backbone_act_func="relu", + backbone_dropout_rate=0, + backbone_ops_order="weight_bn_act", + backbone_stage1_in_channels=[64, 64, 64], + backbone_stage1_out_channels=[64, 64, 64], + backbone_stage1_kernel_size=[[3, 3], [3, 3], [3, 3]], + backbone_stage1_stride=[1, 2, 1], + backbone_stage1_dilation=[1, 1, 1], + backbone_stage1_groups=[1, 1, 1], + backbone_stage2_in_channels=[64, 128, 128, 128], + backbone_stage2_out_channels=[128, 128, 128, 128], + backbone_stage2_kernel_size=[[3, 3], [1, 3], [3, 3], [3, 1]], + backbone_stage2_stride=[2, 1, 1, 1], + backbone_stage2_dilation=[1, 1, 1, 1], + backbone_stage2_groups=[1, 1, 1, 1], + backbone_stage3_in_channels=[128, 256, 256, 256], + backbone_stage3_out_channels=[256, 256, 256, 256], + backbone_stage3_kernel_size=[[3, 3], [3, 3], [3, 1], [1, 3]], + backbone_stage3_stride=[2, 1, 1, 1], + backbone_stage3_dilation=[1, 1, 1, 1], + backbone_stage3_groups=[1, 1, 1, 1], + backbone_stage4_in_channels=[256, 512, 512, 512], + backbone_stage4_out_channels=[512, 512, 512, 512], + backbone_stage4_kernel_size=[[3, 3], [3, 1], [1, 3], [3, 3]], + backbone_stage4_stride=[2, 1, 1, 1], + backbone_stage4_dilation=[1, 1, 1, 1], + backbone_stage4_groups=[1, 1, 1, 1], + neck_in_channels=[64, 128, 256, 512], + neck_out_channels=[128, 128, 128, 128], + neck_kernel_size=[[3, 3], [3, 3], [3, 3], [3, 3]], + neck_stride=[1, 1, 1, 1], + neck_dilation=[1, 1, 1, 1], + neck_groups=[1, 1, 1, 1], + head_pooling_size=9, + head_dropout_ratio=0.1, + head_conv_in_channels=512, + head_conv_out_channels=128, + head_conv_kernel_size=[3, 3], + head_conv_stride=1, + head_conv_dilation=1, + head_conv_groups=1, + head_final_kernel_size=1, + head_final_stride=1, + head_final_dilation=1, + head_final_groups=1, + head_final_bias=False, + head_final_has_shuffle=False, + head_final_in_channels=128, + head_final_out_channels=5, + head_final_use_bn=False, + head_final_act_func=None, + head_final_dropout_rate=0, + head_final_ops_order="weight", + min_area=250, + min_score=0.88, + bbox_type='rect', + **kwargs, ): super().__init__(**kwargs) @@ -85,41 +88,41 @@ def __init__( self.backbone_stage1_in_channels = backbone_stage1_in_channels self.backbone_stage1_out_channels = backbone_stage1_out_channels - self.backbone_stage1_kernel_size = (backbone_stage1_kernel_size,) - self.backbone_stage1_stride = (backbone_stage1_stride,) - self.backbone_stage1_dilation = (backbone_stage1_dilation,) - 
self.backbone_stage1_groups = (backbone_stage1_groups,) + self.backbone_stage1_kernel_size = backbone_stage1_kernel_size + self.backbone_stage1_stride = backbone_stage1_stride + self.backbone_stage1_dilation = backbone_stage1_dilation + self.backbone_stage1_groups = backbone_stage1_groups self.backbone_stage2_in_channels = backbone_stage2_in_channels self.backbone_stage2_out_channels = backbone_stage2_out_channels - self.backbone_stage2_kernel_size = (backbone_stage2_kernel_size,) - self.backbone_stage2_stride = (backbone_stage2_stride,) - self.backbone_stage2_dilation = (backbone_stage2_dilation,) - self.backbone_stage2_groups = (backbone_stage2_groups,) + self.backbone_stage2_kernel_size = backbone_stage2_kernel_size + self.backbone_stage2_stride = backbone_stage2_stride + self.backbone_stage2_dilation = backbone_stage2_dilation + self.backbone_stage2_groups = backbone_stage2_groups self.backbone_stage3_in_channels = backbone_stage3_in_channels self.backbone_stage3_out_channels = backbone_stage3_out_channels - self.backbone_stage3_kernel_size = (backbone_stage3_kernel_size,) - self.backbone_stage3_stride = (backbone_stage3_stride,) - self.backbone_stage3_dilation = (backbone_stage3_dilation,) - self.backbone_stage3_groups = (backbone_stage3_groups,) + self.backbone_stage3_kernel_size = backbone_stage3_kernel_size + self.backbone_stage3_stride = backbone_stage3_stride + self.backbone_stage3_dilation = backbone_stage3_dilation + self.backbone_stage3_groups = backbone_stage3_groups self.backbone_stage4_in_channels = backbone_stage4_in_channels self.backbone_stage4_out_channels = backbone_stage4_out_channels - self.backbone_stage4_kernel_size = (backbone_stage4_kernel_size,) - self.backbone_stage4_stride = (backbone_stage4_stride,) - self.backbone_stage4_dilation = (backbone_stage4_dilation,) - self.backbone_stage4_groups = (backbone_stage4_groups,) + self.backbone_stage4_kernel_size = backbone_stage4_kernel_size + self.backbone_stage4_stride = backbone_stage4_stride + self.backbone_stage4_dilation = backbone_stage4_dilation + self.backbone_stage4_groups = backbone_stage4_groups - self.neck_in_channels = (neck_in_channels,) - self.neck_out_channels = (neck_out_channels,) - self.neck_kernel_size = (neck_kernel_size,) - self.neck_stride = (neck_stride,) - self.neck_dilation = (neck_dilation,) - self.neck_groups = (neck_groups,) + self.neck_in_channels = neck_in_channels + self.neck_out_channels = neck_out_channels + self.neck_kernel_size = neck_kernel_size + self.neck_stride = neck_stride + self.neck_dilation = neck_dilation + self.neck_groups = neck_groups - self.head_pooling_size = (head_pooling_size,) - self.head_dropout_ratio = (head_dropout_ratio,) + self.head_pooling_size = head_pooling_size + self.head_dropout_ratio = head_dropout_ratio self.head_conv_in_channels = head_conv_in_channels self.head_conv_out_channels = head_conv_out_channels @@ -128,15 +131,19 @@ def __init__( self.head_conv_dilation = head_conv_dilation self.head_conv_groups = head_conv_groups - self.head_final_kernel_size = (head_final_kernel_size,) - self.head_final_stride = (head_final_stride,) - self.head_final_dilation = (head_final_dilation,) - self.head_final_groups = (head_final_groups,) - self.head_final_bias = (head_final_bias,) - self.head_final_has_shuffle = (head_final_has_shuffle,) - self.head_final_in_channels = (head_final_in_channels,) - self.head_final_out_channels = (head_final_out_channels,) - self.head_final_use_bn = (head_final_use_bn,) - self.head_final_act_func = (head_final_act_func,) - 
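The assignments rewritten in this hunk stored every value as a 1-element tuple (originally an accidental trailing comma, later made explicit by the formatter), which is why the modeling code had to index each such config attribute with [0]; a quick illustration of the underlying Python behaviour (values are examples only):

    neck_in_channels = [64, 128, 256, 512]
    wrapped = neck_in_channels,    # trailing comma -> ([64, 128, 256, 512],)
    plain = neck_in_channels       # -> [64, 128, 256, 512]
    assert wrapped[0] is plain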
self.head_final_dropout_rate = (head_final_dropout_rate,) + self.head_final_kernel_size = head_final_kernel_size + self.head_final_stride = head_final_stride + self.head_final_dilation = head_final_dilation + self.head_final_groups = head_final_groups + self.head_final_bias = head_final_bias + self.head_final_has_shuffle = head_final_has_shuffle + self.head_final_in_channels = head_final_in_channels + self.head_final_out_channels = head_final_out_channels + self.head_final_use_bn = head_final_use_bn + self.head_final_act_func = head_final_act_func + self.head_final_dropout_rate = head_final_dropout_rate self.head_final_ops_order = head_final_ops_order + + self.min_area = min_area + self.min_score = min_score + self.bbox_type = bbox_type diff --git a/src/transformers/models/fast/modeling_fast.py b/src/transformers/models/fast/modeling_fast.py index a700902b1fb1..255eb2635fcf 100644 --- a/src/transformers/models/fast/modeling_fast.py +++ b/src/transformers/models/fast/modeling_fast.py @@ -1,5 +1,4 @@ import math -import unittest from collections import OrderedDict import cv2 @@ -132,40 +131,6 @@ def is_zero_layer(): return False -def generate_bbox(keys, label, score, scales, cfg): - label_num = len(keys) - bboxes = [] - scores = [] - for index in range(1, label_num): - i = keys[index] - ind = label == i - ind_np = ind.data.cpu().numpy() - points = np.array(np.where(ind_np)).transpose((1, 0)) - if points.shape[0] < cfg.test_cfg.min_area: - label[ind] = 0 - continue - score_i = score[ind].mean().item() - if score_i < cfg.test_cfg.min_score: - label[ind] = 0 - continue - - if cfg.test_cfg.bbox_type == "rect": - rect = cv2.minAreaRect(points[:, ::-1]) - alpha = math.sqrt(math.sqrt(points.shape[0] / (rect[1][0] * rect[1][1]))) - rect = (rect[0], (rect[1][0] * alpha, rect[1][1] * alpha), rect[2]) - bbox = cv2.boxPoints(rect) * scales - - elif cfg.test_cfg.bbox_type == "poly": - binary = np.zeros(label.shape, dtype="uint8") - binary[ind_np] = 1 - contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) - bbox = contours[0] * scales - bbox = bbox.astype("int32") - bboxes.append(bbox.reshape(-1).tolist()) - scores.append(score_i) - return bboxes, scores - - class FalsePreTrainedModel(PreTrainedModel): pass @@ -460,10 +425,10 @@ def __init__(self, config): for stage_config in zip( config.backbone_stage1_in_channels, config.backbone_stage1_out_channels, - config.backbone_stage1_kernel_size[0], - config.backbone_stage1_stride[0], - config.backbone_stage1_dilation[0], - config.backbone_stage1_groups[0], + config.backbone_stage1_kernel_size, + config.backbone_stage1_stride, + config.backbone_stage1_dilation, + config.backbone_stage1_groups, ): stage1.append(RepConvLayer(*stage_config)) self.stage1 = nn.ModuleList(stage1) @@ -472,10 +437,10 @@ def __init__(self, config): for stage_config in zip( config.backbone_stage2_in_channels, config.backbone_stage2_out_channels, - config.backbone_stage2_kernel_size[0], - config.backbone_stage2_stride[0], - config.backbone_stage2_dilation[0], - config.backbone_stage2_groups[0], + config.backbone_stage2_kernel_size, + config.backbone_stage2_stride, + config.backbone_stage2_dilation, + config.backbone_stage2_groups, ): stage2.append(RepConvLayer(*stage_config)) self.stage2 = nn.ModuleList(stage2) @@ -484,10 +449,10 @@ def __init__(self, config): for stage_config in zip( config.backbone_stage3_in_channels, config.backbone_stage3_out_channels, - config.backbone_stage3_kernel_size[0], - config.backbone_stage3_stride[0], - 
config.backbone_stage3_dilation[0], - config.backbone_stage3_groups[0], + config.backbone_stage3_kernel_size, + config.backbone_stage3_stride, + config.backbone_stage3_dilation, + config.backbone_stage3_groups, ): stage3.append(RepConvLayer(*stage_config)) self.stage3 = nn.ModuleList(stage3) @@ -496,10 +461,10 @@ def __init__(self, config): for stage_config in zip( config.backbone_stage4_in_channels, config.backbone_stage4_out_channels, - config.backbone_stage4_kernel_size[0], - config.backbone_stage4_stride[0], - config.backbone_stage4_dilation[0], - config.backbone_stage4_groups[0], + config.backbone_stage4_kernel_size, + config.backbone_stage4_stride, + config.backbone_stage4_dilation, + config.backbone_stage4_groups, ): stage4.append(RepConvLayer(*stage_config)) self.stage4 = nn.ModuleList(stage4) @@ -542,12 +507,12 @@ def __init__(self, config): super().__init__(config) reduce_layer_configs = list( zip( - config.neck_in_channels[0], - config.neck_out_channels[0], - config.neck_kernel_size[0], - config.neck_stride[0], - config.neck_dilation[0], - config.neck_groups[0], + config.neck_in_channels, + config.neck_out_channels, + config.neck_kernel_size, + config.neck_stride, + config.neck_dilation, + config.neck_groups, ) ) @@ -597,29 +562,33 @@ def __init__(self, config): ) self.final = ConvLayer( - config.head_final_in_channels[0], - config.head_final_out_channels[0], - config.head_final_kernel_size[0], - config.head_final_stride[0], - config.head_final_dilation[0], - config.head_final_groups[0], - config.head_final_bias[0], - config.head_final_has_shuffle[0], - config.head_final_use_bn[0], - config.head_final_act_func[0], - config.head_final_dropout_rate[0], + config.head_final_in_channels, + config.head_final_out_channels, + config.head_final_kernel_size, + config.head_final_stride, + config.head_final_dilation, + config.head_final_groups, + config.head_final_bias, + config.head_final_has_shuffle, + config.head_final_use_bn, + config.head_final_act_func, + config.head_final_dropout_rate, config.head_final_ops_order, ) - self.pooling_size = config.head_pooling_size[0] + self.min_area = config.min_area + self.min_score = config.min_score + self.bbox_type = config.bbox_type + + self.pooling_size = config.head_pooling_size self.pooling_1s = nn.MaxPool2d(kernel_size=self.pooling_size, stride=1, padding=(self.pooling_size - 1) // 2) self.pooling_2s = nn.MaxPool2d( kernel_size=self.pooling_size // 2 + 1, stride=1, padding=(self.pooling_size // 2) // 2 ) - if config.head_dropout_ratio[0] > 0: - self.dropout = nn.Dropout2d(config.head_dropout_ratio[0]) + if config.head_dropout_ratio > 0: + self.dropout = nn.Dropout2d(config.head_dropout_ratio) else: self.dropout = None @@ -640,9 +609,9 @@ def forward(self, x): x = self.final(x) return x - def get_results(self, out, img_meta, cfg, scale=2): - org_img_size = img_meta["org_img_size"][0] - img_size = img_meta["img_size"][0] # 640*640 + def get_results(self, out, img_meta, scale=2): + org_img_size = img_meta["org_img_size"] + img_size = img_meta["img_size"] # 640*640 batch_size = out.size(0) outputs = {} @@ -650,7 +619,7 @@ def get_results(self, out, img_meta, cfg, scale=2): out[:, 0:1, :, :], size=(img_size[0] // scale, img_size[1] // scale), mode="nearest" ) # B*1*320*320 texts = self._max_pooling(texts, scale=scale) # B*1*320*320 - score_maps = torch.sigmoid_(texts) # B*1*320*320 + score_maps = torch.sigmoid_(texts) # B*1*320*320~ score_maps = F.interpolate(score_maps, size=(img_size[0], img_size[1]), mode="nearest") # B*1*640*640 score_maps = 
score_maps.squeeze(1) # B*640*640 @@ -677,7 +646,7 @@ def get_results(self, out, img_meta, cfg, scale=2): results = [] for i in range(batch_size): - bboxes, scores = generate_bbox(keys[i], labels[i], score_maps[i], scales, cfg) + bboxes, scores = self.generate_bbox(keys[i], labels[i], score_maps[i], scales) results.append({"bboxes": bboxes, "scores": scores}) outputs.update({"results": results}) @@ -690,6 +659,39 @@ def _max_pooling(self, x, scale=1): x = self.pooling_2s(x) return x + def generate_bbox(self, keys, label, score, scales): + label_num = len(keys) + bboxes = [] + scores = [] + for index in range(1, label_num): + i = keys[index] + ind = label == i + ind_np = ind.data.cpu().numpy() + points = np.array(np.where(ind_np)).transpose((1, 0)) + if points.shape[0] < self.min_area: + label[ind] = 0 + continue + score_i = score[ind].mean().item() + if score_i < self.min_score: + label[ind] = 0 + continue + + if self.bbox_type == "rect": + rect = cv2.minAreaRect(points[:, ::-1]) + alpha = math.sqrt(math.sqrt(points.shape[0] / (rect[1][0] * rect[1][1]))) + rect = (rect[0], (rect[1][0] * alpha, rect[1][1] * alpha), rect[2]) + bbox = cv2.boxPoints(rect) * scales + else: + binary = np.zeros(label.shape, dtype="uint8") + binary[ind_np] = 1 + contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + bbox = contours[0] * scales + + bbox = bbox.astype("int32") + bboxes.append(bbox.reshape(-1).tolist()) + scores.append(score_i) + return bboxes, scores + class FASTForImageCaptioning(PreTrainedModel): def __init__(self, config): @@ -702,7 +704,7 @@ def _upsample(self, x, size, scale=1): _, _, H, W = size return F.interpolate(x, size=(H // scale, W // scale), mode="bilinear") - def forward(self, imgs, img_metas=None, cfg=None): + def forward(self, imgs, img_metas=None): outputs = {} f = self.backbone(imgs) @@ -712,7 +714,7 @@ def forward(self, imgs, img_metas=None, cfg=None): det_out = self.det_head(f) det_out = self._upsample(det_out, imgs.size(), scale=4) - det_res = self.det_head.get_results(det_out, img_metas, cfg, scale=2) + det_res = self.det_head.get_results(det_out, img_metas, scale=2) outputs.update(det_res) return outputs diff --git a/tests/models/fast/test_modeling_fast.py b/tests/models/fast/test_modeling_fast.py index 25fcaffb82a0..26d2fd8e347e 100644 --- a/tests/models/fast/test_modeling_fast.py +++ b/tests/models/fast/test_modeling_fast.py @@ -42,6 +42,7 @@ class FastModelTester: def __init__( self, + parent, backbone_kernel_size=3, backbone_stride=2, backbone_dilation=1, @@ -54,41 +55,41 @@ def __init__( backbone_act_func="relu", backbone_dropout_rate=0, backbone_ops_order="weight_bn_act", - backbone_stage1_in_channels=(64, 64, 64), - backbone_stage1_out_channels=(64, 64, 64), - backbone_stage1_kernel_size=((3, 3), (3, 3), (3, 3)), - backbone_stage1_stride=(1, 2, 1), - backbone_stage1_dilation=(1, 1, 1), - backbone_stage1_groups=(1, 1, 1), - backbone_stage2_in_channels=(64, 128, 128, 128), - backbone_stage2_out_channels=(128, 128, 128, 128), - backbone_stage2_kernel_size=((3, 3), (1, 3), (3, 3), (3, 1)), - backbone_stage2_stride=(2, 1, 1, 1), - backbone_stage2_dilation=(1, 1, 1, 1), - backbone_stage2_groups=(1, 1, 1, 1), - backbone_stage3_in_channels=(128, 256, 256, 256), - backbone_stage3_out_channels=(256, 256, 256, 256), - backbone_stage3_kernel_size=((3, 3), (3, 3), (3, 1), (1, 3)), - backbone_stage3_stride=(2, 1, 1, 1), - backbone_stage3_dilation=(1, 1, 1, 1), - backbone_stage3_groups=(1, 1, 1, 1), - backbone_stage4_in_channels=(256, 512, 512, 512), - 
backbone_stage4_out_channels=(512, 512, 512, 512), - backbone_stage4_kernel_size=((3, 3), (3, 1), (1, 3), (3, 3)), - backbone_stage4_stride=(2, 1, 1, 1), - backbone_stage4_dilation=(1, 1, 1, 1), - backbone_stage4_groups=(1, 1, 1, 1), - neck_in_channels=(64, 128, 256, 512), - neck_out_channels=(128, 128, 128, 128), - neck_kernel_size=((3, 3), (3, 3), (3, 3), (3, 3)), - neck_stride=(1, 1, 1, 1), - neck_dilation=(1, 1, 1, 1), - neck_groups=(1, 1, 1, 1), + backbone_stage1_in_channels=[64, 64, 64], + backbone_stage1_out_channels=[64, 64, 64], + backbone_stage1_kernel_size=[[3, 3], [3, 3], [3, 3]], + backbone_stage1_stride=[1, 2, 1], + backbone_stage1_dilation=[1, 1, 1], + backbone_stage1_groups=[1, 1, 1], + backbone_stage2_in_channels=[64, 128, 128, 128], + backbone_stage2_out_channels=[128, 128, 128, 128], + backbone_stage2_kernel_size=[[3, 3], [1, 3], [3, 3], [3, 1]], + backbone_stage2_stride=[2, 1, 1, 1], + backbone_stage2_dilation=[1, 1, 1, 1], + backbone_stage2_groups=[1, 1, 1, 1], + backbone_stage3_in_channels=[128, 256, 256, 256], + backbone_stage3_out_channels=[256, 256, 256, 256], + backbone_stage3_kernel_size=[[3, 3], [3, 3], [3, 1], [1, 3]], + backbone_stage3_stride=[2, 1, 1, 1], + backbone_stage3_dilation=[1, 1, 1, 1], + backbone_stage3_groups=[1, 1, 1, 1], + backbone_stage4_in_channels=[256, 512, 512, 512], + backbone_stage4_out_channels=[512, 512, 512, 512], + backbone_stage4_kernel_size=[[3, 3], [3, 1], [1, 3], [3, 3]], + backbone_stage4_stride=[2, 1, 1, 1], + backbone_stage4_dilation=[1, 1, 1, 1], + backbone_stage4_groups=[1, 1, 1, 1], + neck_in_channels=[64, 128, 256, 512], + neck_out_channels=[128, 128, 128, 128], + neck_kernel_size=[[3, 3], [3, 3], [3, 3], [3, 3]], + neck_stride=[1, 1, 1, 1], + neck_dilation=[1, 1, 1, 1], + neck_groups=[1, 1, 1, 1], head_pooling_size=9, head_dropout_ratio=0.1, head_conv_in_channels=512, head_conv_out_channels=128, - head_conv_kernel_size=(3, 3), + head_conv_kernel_size=[3, 3], head_conv_stride=1, head_conv_dilation=1, head_conv_groups=1, @@ -107,8 +108,9 @@ def __init__( batch_size=3, num_channels=3, image_size=500, - + is_training=True, ): + self.parent = parent self.backbone_kernel_size = backbone_kernel_size self.backbone_stride = backbone_stride self.backbone_dilation = backbone_dilation @@ -124,41 +126,41 @@ def __init__( self.backbone_stage1_in_channels = backbone_stage1_in_channels self.backbone_stage1_out_channels = backbone_stage1_out_channels - self.backbone_stage1_kernel_size = (backbone_stage1_kernel_size,) - self.backbone_stage1_stride = (backbone_stage1_stride,) - self.backbone_stage1_dilation = (backbone_stage1_dilation,) - self.backbone_stage1_groups = (backbone_stage1_groups,) + self.backbone_stage1_kernel_size = backbone_stage1_kernel_size + self.backbone_stage1_stride = backbone_stage1_stride + self.backbone_stage1_dilation = backbone_stage1_dilation + self.backbone_stage1_groups = backbone_stage1_groups self.backbone_stage2_in_channels = backbone_stage2_in_channels self.backbone_stage2_out_channels = backbone_stage2_out_channels - self.backbone_stage2_kernel_size = (backbone_stage2_kernel_size,) - self.backbone_stage2_stride = (backbone_stage2_stride,) - self.backbone_stage2_dilation = (backbone_stage2_dilation,) - self.backbone_stage2_groups = (backbone_stage2_groups,) + self.backbone_stage2_kernel_size = backbone_stage2_kernel_size + self.backbone_stage2_stride = backbone_stage2_stride + self.backbone_stage2_dilation = backbone_stage2_dilation + self.backbone_stage2_groups = backbone_stage2_groups 
self.backbone_stage3_in_channels = backbone_stage3_in_channels self.backbone_stage3_out_channels = backbone_stage3_out_channels - self.backbone_stage3_kernel_size = (backbone_stage3_kernel_size,) - self.backbone_stage3_stride = (backbone_stage3_stride,) - self.backbone_stage3_dilation = (backbone_stage3_dilation,) - self.backbone_stage3_groups = (backbone_stage3_groups,) + self.backbone_stage3_kernel_size = backbone_stage3_kernel_size + self.backbone_stage3_stride = backbone_stage3_stride + self.backbone_stage3_dilation = backbone_stage3_dilation + self.backbone_stage3_groups = backbone_stage3_groups self.backbone_stage4_in_channels = backbone_stage4_in_channels self.backbone_stage4_out_channels = backbone_stage4_out_channels - self.backbone_stage4_kernel_size = (backbone_stage4_kernel_size,) - self.backbone_stage4_stride = (backbone_stage4_stride,) - self.backbone_stage4_dilation = (backbone_stage4_dilation,) - self.backbone_stage4_groups = (backbone_stage4_groups,) + self.backbone_stage4_kernel_size = backbone_stage4_kernel_size + self.backbone_stage4_stride = backbone_stage4_stride + self.backbone_stage4_dilation = backbone_stage4_dilation + self.backbone_stage4_groups = backbone_stage4_groups - self.neck_in_channels = (neck_in_channels,) - self.neck_out_channels = (neck_out_channels,) - self.neck_kernel_size = (neck_kernel_size,) - self.neck_stride = (neck_stride,) - self.neck_dilation = (neck_dilation,) - self.neck_groups = (neck_groups,) + self.neck_in_channels = neck_in_channels + self.neck_out_channels = neck_out_channels + self.neck_kernel_size = neck_kernel_size + self.neck_stride = neck_stride + self.neck_dilation = neck_dilation + self.neck_groups = neck_groups - self.head_pooling_size = (head_pooling_size,) - self.head_dropout_ratio = (head_dropout_ratio,) + self.head_pooling_size = head_pooling_size + self.head_dropout_ratio = head_dropout_ratio self.head_conv_in_channels = head_conv_in_channels self.head_conv_out_channels = head_conv_out_channels @@ -167,22 +169,23 @@ def __init__( self.head_conv_dilation = head_conv_dilation self.head_conv_groups = head_conv_groups - self.head_final_kernel_size = (head_final_kernel_size,) - self.head_final_stride = (head_final_stride,) - self.head_final_dilation = (head_final_dilation,) - self.head_final_groups = (head_final_groups,) - self.head_final_bias = (head_final_bias,) - self.head_final_has_shuffle = (head_final_has_shuffle,) - self.head_final_in_channels = (head_final_in_channels,) - self.head_final_out_channels = (head_final_out_channels,) - self.head_final_use_bn = (head_final_use_bn,) - self.head_final_act_func = (head_final_act_func,) - self.head_final_dropout_rate = (head_final_dropout_rate,) + self.head_final_kernel_size = head_final_kernel_size + self.head_final_stride = head_final_stride + self.head_final_dilation = head_final_dilation + self.head_final_groups = head_final_groups + self.head_final_bias = head_final_bias + self.head_final_has_shuffle = head_final_has_shuffle + self.head_final_in_channels = head_final_in_channels + self.head_final_out_channels = head_final_out_channels + self.head_final_use_bn = head_final_use_bn + self.head_final_act_func = head_final_act_func + self.head_final_dropout_rate = head_final_dropout_rate self.head_final_ops_order = head_final_ops_order self.batch_size = batch_size self.num_channels = num_channels self.image_size = image_size + self.is_training = is_training def prepare_config_and_inputs(self): pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, 
self.image_size]) @@ -196,22 +199,84 @@ def prepare_config_and_inputs(self): # config = self.get_config() - return config, {"imgs": pixel_values, "img_meta": pixel_values_meta} + return config, {"imgs": pixel_values, "img_metas": pixel_values_meta} def get_config(self): - return FastConfig() + return FastConfig( + backbone_kernel_size=self.backbone_kernel_size, + backbone_stride=self.backbone_stride, + backbone_dilation=self.backbone_dilation, + backbone_groups=self.backbone_groups, + backbone_bias=self.backbone_bias, + backbone_has_shuffle=self.backbone_has_shuffle, + backbone_in_channels=self.backbone_in_channels, + backbone_out_channels=self.backbone_out_channels, + backbone_use_bn=self.backbone_use_bn, + backbone_act_func=self.backbone_act_func, + backbone_dropout_rate=self.backbone_dropout_rate, + backbone_ops_order=self.backbone_ops_order, + backbone_stage1_in_channels=self.backbone_stage1_in_channels, + backbone_stage1_out_channels=self.backbone_stage1_out_channels, + backbone_stage1_kernel_size=self.backbone_stage1_kernel_size, + backbone_stage1_stride=self.backbone_stage1_stride, + backbone_stage1_dilation=self.backbone_stage1_dilation, + backbone_stage1_groups=self.backbone_stage1_groups, + backbone_stage2_in_channels=self.backbone_stage2_in_channels, + backbone_stage2_out_channels=self.backbone_stage2_out_channels, + backbone_stage2_kernel_size=self.backbone_stage2_kernel_size, + backbone_stage2_stride=self.backbone_stage2_stride, + backbone_stage2_dilation=self.backbone_stage2_dilation, + backbone_stage2_groups=self.backbone_stage2_groups, + backbone_stage3_in_channels=self.backbone_stage3_in_channels, + backbone_stage3_out_channels=self.backbone_stage3_out_channels, + backbone_stage3_kernel_size=self.backbone_stage3_kernel_size, + backbone_stage3_stride=self.backbone_stage3_stride, + backbone_stage3_dilation=self.backbone_stage3_dilation, + backbone_stage3_groups=self.backbone_stage3_groups, + backbone_stage4_in_channels=self.backbone_stage4_in_channels, + backbone_stage4_out_channels=self.backbone_stage4_out_channels, + backbone_stage4_kernel_size=self.backbone_stage4_kernel_size, + backbone_stage4_stride=self.backbone_stage4_stride, + backbone_stage4_dilation=self.backbone_stage4_dilation, + backbone_stage4_groups=self.backbone_stage4_groups, + neck_in_channels=self.neck_in_channels, + neck_out_channels=self.neck_out_channels, + neck_kernel_size=self.neck_kernel_size, + neck_stride=self.neck_stride, + neck_dilation=self.neck_dilation, + neck_groups=self.neck_groups, + head_pooling_size=self.head_pooling_size, + head_dropout_ratio=self.head_dropout_ratio, + head_conv_in_channels=self.head_conv_in_channels, + head_conv_out_channels=self.head_conv_out_channels, + head_conv_kernel_size=self.head_conv_kernel_size, + head_conv_stride=self.head_conv_stride, + head_conv_dilation=self.head_conv_dilation, + head_conv_groups=self.head_conv_groups, + head_final_kernel_size=self.head_final_kernel_size, + head_final_stride=self.head_final_stride, + head_final_dilation=self.head_final_dilation, + head_final_groups=self.head_final_groups, + head_final_bias=self.head_final_bias, + head_final_has_shuffle=self.head_final_has_shuffle, + head_final_in_channels=self.head_final_in_channels, + head_final_out_channels=self.head_final_out_channels, + head_final_use_bn=self.head_final_use_bn, + head_final_act_func=self.head_final_act_func, + head_final_dropout_rate=self.head_final_dropout_rate, + head_final_ops_order=self.head_final_ops_order, + ) - def create_and_check_model(self, config, 
pixel_values): + def create_and_check_model(self, config, input): model = FASTForImageCaptioning(config=config) model.to(torch_device) model.eval() - result = model(pixel_values) + result = model(imgs=input['imgs'], imgs_mets=input['img_metas']) self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() - config, pixel_values = config_and_inputs - inputs_dict = {"imgs": pixel_values} + config, inputs_dict = config_and_inputs return config, inputs_dict @@ -238,7 +303,13 @@ def setUp(self): self.config_tester = ConfigTester(self, config_class=FastConfig, hidden_size=37) def test_config(self): - self.config_tester.run_common_tests() + self.create_and_test_config_common_properties() + self.config_tester.create_and_test_config_to_json_string() + self.config_tester.create_and_test_config_to_json_file() + self.config_tester.create_and_test_config_from_and_save_pretrained() + self.config_tester.create_and_test_config_with_num_labels() + self.config_tester.check_config_can_be_init_without_params() + self.config_tester.check_config_arguments_init() def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() From a6e1cfdee13129c06ba4817115484c6acfe5a415 Mon Sep 17 00:00:00 2001 From: raghavanone Date: Fri, 13 Oct 2023 18:50:00 +0530 Subject: [PATCH 005/152] WIP --- src/transformers/models/fast/modeling_fast.py | 154 +++++++++--------- 1 file changed, 77 insertions(+), 77 deletions(-) diff --git a/src/transformers/models/fast/modeling_fast.py b/src/transformers/models/fast/modeling_fast.py index 255eb2635fcf..4f3188819ac3 100644 --- a/src/transformers/models/fast/modeling_fast.py +++ b/src/transformers/models/fast/modeling_fast.py @@ -609,48 +609,48 @@ def forward(self, x): x = self.final(x) return x - def get_results(self, out, img_meta, scale=2): - org_img_size = img_meta["org_img_size"] - img_size = img_meta["img_size"] # 640*640 - batch_size = out.size(0) - outputs = {} - - texts = F.interpolate( - out[:, 0:1, :, :], size=(img_size[0] // scale, img_size[1] // scale), mode="nearest" - ) # B*1*320*320 - texts = self._max_pooling(texts, scale=scale) # B*1*320*320 - score_maps = torch.sigmoid_(texts) # B*1*320*320~ - score_maps = F.interpolate(score_maps, size=(img_size[0], img_size[1]), mode="nearest") # B*1*640*640 - score_maps = score_maps.squeeze(1) # B*640*640 - - kernels = (out[:, 0, :, :] > 0).to(torch.uint8) # B*160*160 - labels_ = [] - for kernel in kernels.numpy(): - ret, label_ = cv2.connectedComponents(kernel) - labels_.append(label_) - labels_ = np.array(labels_) - labels_ = torch.from_numpy(labels_) - labels = labels_.unsqueeze(1).to(torch.float32) # B*1*160*160 - labels = F.interpolate( - labels, size=(img_size[0] // scale, img_size[1] // scale), mode="nearest" - ) # B*1*320*320 - labels = self._max_pooling(labels, scale=scale) - labels = F.interpolate(labels, size=(img_size[0], img_size[1]), mode="nearest") # B*1*640*640 - labels = labels.squeeze(1).to(torch.int32) # B*640*640 - - keys = [torch.unique(labels_[i], sorted=True) for i in range(batch_size)] - - outputs.update({"kernels": kernels.data.cpu()}) - - scales = (float(org_img_size[1]) / float(img_size[1]), float(org_img_size[0]) / float(img_size[0])) - - results = [] - for i in range(batch_size): - bboxes, scores = self.generate_bbox(keys[i], labels[i], score_maps[i], scales) - results.append({"bboxes": bboxes, "scores": scores}) - 
outputs.update({"results": results}) - - return outputs + # def get_results(self, out, img_meta, scale=2): + # org_img_size = img_meta["org_img_size"] + # img_size = img_meta["img_size"] # 640*640 + # batch_size = out.size(0) + # outputs = {} + # + # texts = F.interpolate( + # out[:, 0:1, :, :], size=(img_size[0] // scale, img_size[1] // scale), mode="nearest" + # ) # B*1*320*320 + # texts = self._max_pooling(texts, scale=scale) # B*1*320*320 + # score_maps = torch.sigmoid_(texts) # B*1*320*320~ + # score_maps = F.interpolate(score_maps, size=(img_size[0], img_size[1]), mode="nearest") # B*1*640*640 + # score_maps = score_maps.squeeze(1) # B*640*640 + # + # kernels = (out[:, 0, :, :] > 0).to(torch.uint8) # B*160*160 + # labels_ = [] + # for kernel in kernels.numpy(): + # ret, label_ = cv2.connectedComponents(kernel) + # labels_.append(label_) + # labels_ = np.array(labels_) + # labels_ = torch.from_numpy(labels_) + # labels = labels_.unsqueeze(1).to(torch.float32) # B*1*160*160 + # labels = F.interpolate( + # labels, size=(img_size[0] // scale, img_size[1] // scale), mode="nearest" + # ) # B*1*320*320 + # labels = self._max_pooling(labels, scale=scale) + # labels = F.interpolate(labels, size=(img_size[0], img_size[1]), mode="nearest") # B*1*640*640 + # labels = labels.squeeze(1).to(torch.int32) # B*640*640 + # + # keys = [torch.unique(labels_[i], sorted=True) for i in range(batch_size)] + # + # outputs.update({"kernels": kernels.data.cpu()}) + # + # scales = (float(org_img_size[1]) / float(img_size[1]), float(org_img_size[0]) / float(img_size[0])) + # + # results = [] + # for i in range(batch_size): + # bboxes, scores = self.generate_bbox(keys[i], labels[i], score_maps[i], scales) + # results.append({"bboxes": bboxes, "scores": scores}) + # outputs.update({"results": results}) + # + # return outputs def _max_pooling(self, x, scale=1): if scale == 1: @@ -659,38 +659,38 @@ def _max_pooling(self, x, scale=1): x = self.pooling_2s(x) return x - def generate_bbox(self, keys, label, score, scales): - label_num = len(keys) - bboxes = [] - scores = [] - for index in range(1, label_num): - i = keys[index] - ind = label == i - ind_np = ind.data.cpu().numpy() - points = np.array(np.where(ind_np)).transpose((1, 0)) - if points.shape[0] < self.min_area: - label[ind] = 0 - continue - score_i = score[ind].mean().item() - if score_i < self.min_score: - label[ind] = 0 - continue - - if self.bbox_type == "rect": - rect = cv2.minAreaRect(points[:, ::-1]) - alpha = math.sqrt(math.sqrt(points.shape[0] / (rect[1][0] * rect[1][1]))) - rect = (rect[0], (rect[1][0] * alpha, rect[1][1] * alpha), rect[2]) - bbox = cv2.boxPoints(rect) * scales - else: - binary = np.zeros(label.shape, dtype="uint8") - binary[ind_np] = 1 - contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) - bbox = contours[0] * scales - - bbox = bbox.astype("int32") - bboxes.append(bbox.reshape(-1).tolist()) - scores.append(score_i) - return bboxes, scores + # def generate_bbox(self, keys, label, score, scales): + # label_num = len(keys) + # bboxes = [] + # scores = [] + # for index in range(1, label_num): + # i = keys[index] + # ind = label == i + # ind_np = ind.data.cpu().numpy() + # points = np.array(np.where(ind_np)).transpose((1, 0)) + # if points.shape[0] < self.min_area: + # label[ind] = 0 + # continue + # score_i = score[ind].mean().item() + # if score_i < self.min_score: + # label[ind] = 0 + # continue + # + # if self.bbox_type == "rect": + # rect = cv2.minAreaRect(points[:, ::-1]) + # alpha = 
math.sqrt(math.sqrt(points.shape[0] / (rect[1][0] * rect[1][1]))) + # rect = (rect[0], (rect[1][0] * alpha, rect[1][1] * alpha), rect[2]) + # bbox = cv2.boxPoints(rect) * scales + # else: + # binary = np.zeros(label.shape, dtype="uint8") + # binary[ind_np] = 1 + # contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + # bbox = contours[0] * scales + # + # bbox = bbox.astype("int32") + # bboxes.append(bbox.reshape(-1).tolist()) + # scores.append(score_i) + # return bboxes, scores class FASTForImageCaptioning(PreTrainedModel): @@ -714,7 +714,7 @@ def forward(self, imgs, img_metas=None): det_out = self.det_head(f) det_out = self._upsample(det_out, imgs.size(), scale=4) - det_res = self.det_head.get_results(det_out, img_metas, scale=2) - outputs.update(det_res) + # det_res = self.det_head.get_results(det_out, img_metas, scale=2) + # outputs.update(det_res) - return outputs + return det_out From a8e4320b0666762b7e52a1d4135c293f33ed12fb Mon Sep 17 00:00:00 2001 From: raghavanone Date: Sat, 14 Oct 2023 20:17:16 +0530 Subject: [PATCH 006/152] Add tests --- .../models/fast/configuration_fast.py | 4 + src/transformers/models/fast/modeling_fast.py | 342 ++++++++++++++++-- tests/models/fast/test_modeling_fast.py | 128 +++++-- 3 files changed, 398 insertions(+), 76 deletions(-) diff --git a/src/transformers/models/fast/configuration_fast.py b/src/transformers/models/fast/configuration_fast.py index 773dbcb151c7..5b57ac482a0e 100644 --- a/src/transformers/models/fast/configuration_fast.py +++ b/src/transformers/models/fast/configuration_fast.py @@ -69,6 +69,8 @@ def __init__( min_area=250, min_score=0.88, bbox_type='rect', + loss_bg=False, + initializer_range=0.02, **kwargs, ): super().__init__(**kwargs) @@ -147,3 +149,5 @@ def __init__( self.min_area = min_area self.min_score = min_score self.bbox_type = bbox_type + self.loss_bg = loss_bg + self.initializer_range = initializer_range \ No newline at end of file diff --git a/src/transformers/models/fast/modeling_fast.py b/src/transformers/models/fast/modeling_fast.py index 4f3188819ac3..798ecba93aa2 100644 --- a/src/transformers/models/fast/modeling_fast.py +++ b/src/transformers/models/fast/modeling_fast.py @@ -1,5 +1,7 @@ import math from collections import OrderedDict +from dataclasses import dataclass +from typing import Optional, Dict import cv2 import numpy as np @@ -7,7 +9,8 @@ import torch.nn as nn import torch.nn.functional as F -from transformers import PreTrainedModel +from transformers import PreTrainedModel, FastConfig +from transformers.utils import ModelOutput def get_same_padding(kernel_size): @@ -131,10 +134,6 @@ def is_zero_layer(): return False -class FalsePreTrainedModel(PreTrainedModel): - pass - - class ConvLayer(My2DLayer): def __init__( self, @@ -403,7 +402,24 @@ def _pad_to_mxn_tensor(self, kernel): # return RepConvLayer(**config) -class TextNet(PreTrainedModel): +class FastPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
+ """ + + config_class = FastConfig + base_model_prefix = "fast" + main_input_name = "pixel_values" + + def _init_weights(self, module): + if isinstance(module, (nn.Linear, nn.Conv2d)): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + + +class TextNet(FastPreTrainedModel): def __init__(self, config): super().__init__(config) self.first_conv = ConvLayer( @@ -420,7 +436,7 @@ def __init__(self, config): config.backbone_dropout_rate, config.backbone_ops_order, ) - + self.first_conv.apply(self._init_weights) stage1 = [] for stage_config in zip( config.backbone_stage1_in_channels, @@ -469,15 +485,15 @@ def __init__(self, config): stage4.append(RepConvLayer(*stage_config)) self.stage4 = nn.ModuleList(stage4) - self._initialize_weights() - - def _initialize_weights(self): - for m in self.modules(): - if isinstance(m, nn.Conv2d): - nn.init.kaiming_normal_(m.weight) - elif isinstance(m, nn.BatchNorm2d): - m.weight.data.fill_(1) - m.bias.data.zero_() + # self._initialize_weights() + # + # def _initialize_weights(self): + # for m in self.modules(): + # if isinstance(m, nn.Conv2d): + # nn.init.kaiming_normal_(m.weight) + # elif isinstance(m, nn.BatchNorm2d): + # m.weight.data.fill_(1) + # m.bias.data.zero_() def forward(self, x): x = self.first_conv(x) @@ -502,7 +518,7 @@ def forward(self, x): return output -class FASTNeck(PreTrainedModel): +class FASTNeck(FastPreTrainedModel): def __init__(self, config): super().__init__(config) reduce_layer_configs = list( @@ -515,11 +531,13 @@ def __init__(self, config): config.neck_groups, ) ) - - self.reduce_layer1 = RepConvLayer(*reduce_layer_configs[0]) - self.reduce_layer2 = RepConvLayer(*reduce_layer_configs[1]) - self.reduce_layer3 = RepConvLayer(*reduce_layer_configs[2]) - self.reduce_layer4 = RepConvLayer(*reduce_layer_configs[3]) + self.layers_count = len(reduce_layer_configs) + for layer_ix in range(0, len(reduce_layer_configs)): + setattr(self, f"reduce_layer{layer_ix + 1}", RepConvLayer(*reduce_layer_configs[layer_ix])) + # self.reduce_layer1 = RepConvLayer(*reduce_layer_configs[0]) + # self.reduce_layer2 = RepConvLayer(*reduce_layer_configs[1]) + # self.reduce_layer3 = RepConvLayer(*reduce_layer_configs[2]) + # self.reduce_layer4 = RepConvLayer(*reduce_layer_configs[3]) self._initialize_weights() @@ -536,22 +554,22 @@ def _upsample(self, x, y): return F.upsample(x, size=(H, W), mode="bilinear") def forward(self, x): - f1, f2, f3, f4 = x + f1 = x[0] f1 = self.reduce_layer1(f1) - f2 = self.reduce_layer2(f2) - f3 = self.reduce_layer3(f3) - f4 = self.reduce_layer4(f4) - - f2 = self._upsample(f2, f1) - f3 = self._upsample(f3, f1) - f4 = self._upsample(f4, f1) - f = torch.cat((f1, f2, f3, f4), 1) + output_stages = [f1] + + for layer_ix in range(1, self.layers_count): + layer_out = getattr(self, f"reduce_layer{layer_ix + 1}")(x[layer_ix]) + layer_out = self._upsample(layer_out, f1) + output_stages.append(layer_out) + + f = torch.cat(output_stages, 1) return f -class FASTHead(nn.Module): +class FASTHead(FastPreTrainedModel): def __init__(self, config): - super(FASTHead, self).__init__() + super().__init__(config) self.conv = RepConvLayer( config.head_conv_in_channels, config.head_conv_out_channels, @@ -693,28 +711,274 @@ def _max_pooling(self, x, scale=1): # return bboxes, scores -class FASTForImageCaptioning(PreTrainedModel): +def emb_loss(emb, instance, kernel, training_mask, feature_dim=4, delta_v=0.5, delta_d=1.5, weights=(1.0, 1.0), + bg_sample=False): + training_mask = 
(training_mask > 0.5).long() + kernel = (kernel > 0.5).long() + instance = instance * training_mask + instance_kernel = (instance * kernel).view(-1) + instance = instance.view(-1) + emb = emb.view(feature_dim, -1) + + unique_labels, unique_ids = torch.unique(instance_kernel, sorted=True, return_inverse=True) + num_instance = unique_labels.size(0) + if num_instance <= 1: + return 0 + + emb_mean = emb.new_zeros((feature_dim, num_instance), dtype=torch.float32) + for i, lb in enumerate(unique_labels): + if lb == 0: + continue + ind_k = instance_kernel == lb + emb_mean[:, i] = torch.mean(emb[:, ind_k], dim=1) + + l_agg = emb.new_zeros(num_instance, dtype=torch.float32) # bug + for i, lb in enumerate(unique_labels): + if lb == 0: + continue + ind = instance == lb + emb_ = emb[:, ind] + dist = (emb_ - emb_mean[:, i:i + 1]).norm(p=2, dim=0) + dist = F.relu(dist - delta_v) ** 2 + l_agg[i] = torch.mean(torch.log(dist + 1.0)) + l_agg = torch.mean(l_agg[1:]) + + if num_instance > 2: + emb_interleave = emb_mean.permute(1, 0).repeat(num_instance, 1) + emb_band = emb_mean.permute(1, 0).repeat(1, num_instance).view(-1, feature_dim) + # print(seg_band) + + mask = (1 - torch.eye(num_instance, dtype=torch.int8)).view(-1, 1).repeat(1, feature_dim) + mask = mask.view(num_instance, num_instance, -1) + mask[0, :, :] = 0 + mask[:, 0, :] = 0 + mask = mask.view(num_instance * num_instance, -1) + # print(mask) + + dist = emb_interleave - emb_band + dist = dist[mask > 0].view(-1, feature_dim).norm(p=2, dim=1) + dist = F.relu(2 * delta_d - dist) ** 2 + l_dis = torch.mean(torch.log(dist + 1.0)) + + if bg_sample: + l_dis = [torch.log(dist + 1.0)] + emb_bg = emb[:, instance == 0].view(feature_dim, -1) + if emb_bg.size(1) > 100: + rand_ind = np.random.permutation(emb_bg.size(1))[:100] + emb_bg = emb_bg[:, rand_ind] + if emb_bg.size(1) > 0: + for i, lb in enumerate(unique_labels): + if lb == 0: + continue + dist = (emb_bg - emb_mean[:, i:i + 1]).norm(p=2, dim=0) + dist = F.relu(2 * delta_d - dist) ** 2 + l_dis_bg = torch.mean(torch.log(dist + 1.0), 0, keepdim=True) + l_dis.append(l_dis_bg) + l_dis = torch.mean(torch.cat(l_dis)) + else: + l_dis = 0 + + l_agg = weights[0] * l_agg + l_dis = weights[1] * l_dis + l_reg = torch.mean(torch.log(torch.norm(emb_mean, 2, 0) + 1.0)) * 0.001 + loss = l_agg + l_dis + l_reg + return loss + + +def emb_loss_batch(emb, instance, kernel, training_mask, reduce=True, loss_weight=0.25, bg_sample=False): + loss_batch = emb.new_zeros((emb.size(0)), dtype=torch.float32) + + for i in range(loss_batch.size(0)): + loss_batch[i] = emb_loss(emb[i], instance[i], kernel[i], training_mask[i]) + + loss_batch = loss_weight * loss_batch + + if reduce: + loss_batch = torch.mean(loss_batch) + + return loss_batch + + +def dice_loss_with_masks(input, target, mask, reduce=True): + loss_weight = 0.5 + batch_size = input.size(0) + input = torch.sigmoid(input) + + input = input.contiguous().view(batch_size, -1) + target = target.contiguous().view(batch_size, -1).float() + mask = mask.contiguous().view(batch_size, -1).float() + + input = input * mask + target = target * mask + + a = torch.sum(input * target, dim=1) + b = torch.sum(input * input, dim=1) + 0.001 + c = torch.sum(target * target, dim=1) + 0.001 + d = (2 * a) / (b + c) + loss = 1 - d + + loss = loss_weight * loss + + if reduce: + loss = torch.mean(loss) + + return loss + + +def ohem_single(score, gt_text, training_mask): + pos_num = int(torch.sum(gt_text > 0.5)) - int(torch.sum((gt_text > 0.5) & (training_mask <= 0.5))) + + if pos_num == 0: + # 
selected_mask = gt_text.copy() * 0 # may be not good + selected_mask = training_mask + selected_mask = selected_mask.view(1, selected_mask.shape[0], selected_mask.shape[1]).float() + return selected_mask + + neg_num = int(torch.sum(gt_text <= 0.5)) + neg_num = int(min(pos_num * 3, neg_num)) + + if neg_num == 0: + selected_mask = training_mask + selected_mask = selected_mask.view(1, selected_mask.shape[0], selected_mask.shape[1]).float() + return selected_mask + + neg_score = score[gt_text <= 0.5] + neg_score_sorted, _ = torch.sort(-neg_score) + threshold = -neg_score_sorted[neg_num - 1] + + selected_mask = ((score >= threshold) | (gt_text > 0.5)) & (training_mask > 0.5) + selected_mask = selected_mask.reshape(1, selected_mask.shape[0], selected_mask.shape[1]).float() + return selected_mask + + +def ohem_batch(scores, gt_texts, training_masks): + selected_masks = [] + for i in range(scores.shape[0]): + selected_masks.append(ohem_single(scores[i, :, :], gt_texts[i, :, :], training_masks[i, :, :])) + + selected_masks = torch.cat(selected_masks, 0).float() + return selected_masks + + +def iou_single(a, b, mask, n_class): + EPS = 1e-6 + valid = mask == 1 + a = a[valid] + b = b[valid] + miou = [] + for i in range(n_class): + inter = ((a == i) & (b == i)).float() + union = ((a == i) | (b == i)).float() + + miou.append(torch.sum(inter) / (torch.sum(union) + EPS)) + miou = sum(miou) / len(miou) + return miou + + +def iou(a, b, mask, n_class=2, reduce=True): + batch_size = a.size(0) + + a = a.view(batch_size, -1) + b = b.view(batch_size, -1) + mask = mask.view(batch_size, -1) + + iou = a.new_zeros((batch_size,), dtype=torch.float32) + for i in range(batch_size): + iou[i] = iou_single(a[i], b[i], mask[i], n_class) + + if reduce: + iou = torch.mean(iou) + return iou + + +@dataclass +class FASTForImageCaptioningOutput(ModelOutput): + """ + Adapted from the base class for vision model's outputs that also contains image embeddings of the pooling of the + last hidden states. This class also adds the loss term from the text decoder as well as the image-text similarity + scores. + + Args: + loss (`torch.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Languge modeling loss from the text decoder. + text_hidden (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional*): + The image hidden states. 
+ """ + + loss: Optional[torch.Tensor] = None + hidden_states: Optional[torch.FloatTensor] = None + + +class FASTForImageCaptioning(FastPreTrainedModel): + def __init__(self, config): super().__init__(config) self.backbone = TextNet(config=config) self.neck = FASTNeck(config=config) self.det_head = FASTHead(config=config) + self.loss_bg = config.loss_bg + + self.pooling_1s = nn.MaxPool2d(kernel_size=config.head_pooling_size, stride=1, + padding=(config.head_pooling_size - 1) // 2) + self.pooling_2s = nn.MaxPool2d(kernel_size=config.head_pooling_size // 2 + 1, stride=1, + padding=(config.head_pooling_size // 2) // 2) + self.post_init() def _upsample(self, x, size, scale=1): _, _, H, W = size return F.interpolate(x, size=(H // scale, W // scale), mode="bilinear") - def forward(self, imgs, img_metas=None): - outputs = {} + def _max_pooling(self, x, scale=1): + if scale == 1: + x = self.pooling_1s(x) + elif scale == 2: + x = self.pooling_2s(x) + return x + + def loss(self, hidden, labels): + gt_texts = labels['gt_texts'] + gt_kernels = labels['gt_kernels'] + training_masks = labels['training_masks'] + gt_instances = labels['gt_instances'] + + kernels = hidden[:, 0, :, :] # 4*640*640 + texts = self._max_pooling(kernels, scale=1) # 4*640*640 + embs = hidden[:, 1:, :, :] # 4*4*640*640 + + selected_masks = ohem_batch(texts, gt_texts, training_masks) + loss_text = dice_loss_with_masks(texts, gt_texts, selected_masks, reduce=False) - f = self.backbone(imgs) + selected_masks = gt_texts * training_masks + loss_kernel = dice_loss_with_masks(kernels, gt_kernels, selected_masks, reduce=False) + loss_kernel = torch.mean(loss_kernel, dim=0) + + loss_emb = emb_loss_batch(embs, gt_instances, gt_kernels, training_masks, reduce=False, bg_sample=self.loss_bg) + + return torch.mean(loss_text) + torch.mean(loss_kernel) + torch.mean(loss_emb) + + def forward(self, + pixel_values: torch.FloatTensor, + output_hidden_states: Optional[bool] = True, + return_dict: Optional[bool] = None, + labels: Dict = None + ): + # outputs = {} + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + f = self.backbone(pixel_values) f = self.neck(f) det_out = self.det_head(f) - det_out = self._upsample(det_out, imgs.size(), scale=4) + loss = None + if labels: + out = self._upsample(det_out, pixel_values.size(), scale=1) + loss = self.loss(out, labels) # det_res = self.det_head.get_results(det_out, img_metas, scale=2) # outputs.update(det_res) + det_out = self._upsample(det_out, pixel_values.size(), scale=4) + + if not return_dict: + return (loss, det_out) if loss is not None else (det_out,) - return det_out + return FASTForImageCaptioningOutput(loss, det_out) diff --git a/tests/models/fast/test_modeling_fast.py b/tests/models/fast/test_modeling_fast.py index 26d2fd8e347e..f3790cfb8300 100644 --- a/tests/models/fast/test_modeling_fast.py +++ b/tests/models/fast/test_modeling_fast.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ Testing suite for the PyTorch Falcon model. 
""" - +import inspect import unittest from parameterized import parameterized @@ -55,40 +55,40 @@ def __init__( backbone_act_func="relu", backbone_dropout_rate=0, backbone_ops_order="weight_bn_act", - backbone_stage1_in_channels=[64, 64, 64], - backbone_stage1_out_channels=[64, 64, 64], - backbone_stage1_kernel_size=[[3, 3], [3, 3], [3, 3]], - backbone_stage1_stride=[1, 2, 1], - backbone_stage1_dilation=[1, 1, 1], - backbone_stage1_groups=[1, 1, 1], - backbone_stage2_in_channels=[64, 128, 128, 128], - backbone_stage2_out_channels=[128, 128, 128, 128], - backbone_stage2_kernel_size=[[3, 3], [1, 3], [3, 3], [3, 1]], - backbone_stage2_stride=[2, 1, 1, 1], - backbone_stage2_dilation=[1, 1, 1, 1], - backbone_stage2_groups=[1, 1, 1, 1], - backbone_stage3_in_channels=[128, 256, 256, 256], - backbone_stage3_out_channels=[256, 256, 256, 256], - backbone_stage3_kernel_size=[[3, 3], [3, 3], [3, 1], [1, 3]], - backbone_stage3_stride=[2, 1, 1, 1], - backbone_stage3_dilation=[1, 1, 1, 1], - backbone_stage3_groups=[1, 1, 1, 1], - backbone_stage4_in_channels=[256, 512, 512, 512], - backbone_stage4_out_channels=[512, 512, 512, 512], - backbone_stage4_kernel_size=[[3, 3], [3, 1], [1, 3], [3, 3]], - backbone_stage4_stride=[2, 1, 1, 1], - backbone_stage4_dilation=[1, 1, 1, 1], - backbone_stage4_groups=[1, 1, 1, 1], - neck_in_channels=[64, 128, 256, 512], - neck_out_channels=[128, 128, 128, 128], - neck_kernel_size=[[3, 3], [3, 3], [3, 3], [3, 3]], - neck_stride=[1, 1, 1, 1], - neck_dilation=[1, 1, 1, 1], - neck_groups=[1, 1, 1, 1], + backbone_stage1_in_channels=[64], + backbone_stage1_out_channels=[64], + backbone_stage1_kernel_size=[[3, 3]], + backbone_stage1_stride=[1], + backbone_stage1_dilation=[1], + backbone_stage1_groups=[1], + backbone_stage2_in_channels=[64], + backbone_stage2_out_channels=[128], + backbone_stage2_kernel_size=[ [3, 1]], + backbone_stage2_stride=[2], + backbone_stage2_dilation=[1], + backbone_stage2_groups=[1], + backbone_stage3_in_channels=[128], + backbone_stage3_out_channels=[256], + backbone_stage3_kernel_size=[ [1, 3]], + backbone_stage3_stride=[2], + backbone_stage3_dilation=[1], + backbone_stage3_groups=[1], + backbone_stage4_in_channels=[256], + backbone_stage4_out_channels=[512], + backbone_stage4_kernel_size=[[3, 3]], + backbone_stage4_stride=[2], + backbone_stage4_dilation=[1], + backbone_stage4_groups=[1], + neck_in_channels=[64], + neck_out_channels=[128], + neck_kernel_size=[[3, 3]], + neck_stride=[1], + neck_dilation=[1], + neck_groups=[1], head_pooling_size=9, head_dropout_ratio=0.1, - head_conv_in_channels=512, - head_conv_out_channels=128, + head_conv_in_channels=128, + head_conv_out_channels=4, head_conv_kernel_size=[3, 3], head_conv_stride=1, head_conv_dilation=1, @@ -99,7 +99,7 @@ def __init__( head_final_groups=1, head_final_bias=False, head_final_has_shuffle=False, - head_final_in_channels=128, + head_final_in_channels=4, head_final_out_channels=5, head_final_use_bn=False, head_final_act_func=None, @@ -199,7 +199,7 @@ def prepare_config_and_inputs(self): # config = self.get_config() - return config, {"imgs": pixel_values, "img_metas": pixel_values_meta} + return config, {"pixel_values": pixel_values} def get_config(self): return FastConfig( @@ -271,8 +271,8 @@ def create_and_check_model(self, config, input): model = FASTForImageCaptioning(config=config) model.to(torch_device) model.eval() - result = model(imgs=input['imgs'], imgs_mets=input['img_metas']) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, 
self.hidden_size))
+        result = model(pixel_values=input['pixel_values'])
+        self.parent.assertEqual(result.hidden_states.shape, (self.batch_size, 5, 125, 125))

     def prepare_config_and_inputs_for_common(self):
         config_and_inputs = self.prepare_config_and_inputs()
@@ -325,3 +325,57 @@ def test_inputs_embeds(self):
     @unittest.skip(reason="Fast does not support input and output embeddings")
     def test_model_common_attributes(self):
         pass
+
+    @unittest.skip(reason="Fast is not a generative model")
+    def test_generate_without_input_ids(self):
+        pass
+
+    @unittest.skip(reason="Fast does not have any hidden_states")
+    def test_hidden_states_output(self):
+        pass
+
+    @unittest.skip(reason="Fast does not have any attention")
+    def test_retain_grad_hidden_states_attentions(self):
+        pass
+
+    def test_forward_signature(self):
+        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            signature = inspect.signature(model.forward)
+            # signature.parameters is an OrderedDict => so arg_names order is deterministic
+            arg_names = [*signature.parameters.keys()]
+
+            expected_arg_names = ["pixel_values"]
+            self.assertListEqual(arg_names[:1], expected_arg_names)
+
+    def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
+        to_return = inputs_dict.copy()
+        gt_instances = torch.zeros(self.model_tester.batch_size, self.model_tester.image_size,
+                                   self.model_tester.image_size)
+        gt_kernels = torch.zeros(self.model_tester.batch_size, self.model_tester.image_size,
+                                 self.model_tester.image_size)
+        gt_text = torch.zeros(self.model_tester.batch_size, self.model_tester.image_size, self.model_tester.image_size)
+        training_masks = torch.ones(self.model_tester.batch_size, self.model_tester.image_size,
+                                    self.model_tester.image_size)
+        labels = {}
+        labels["gt_instances"] = gt_instances
+        labels["gt_kernels"] = gt_kernels
+        labels["gt_texts"] = gt_text
+        labels["training_masks"] = training_masks
+
+        to_return["labels"] = labels
+
+        return to_return
+
+    def test_model_is_small(self):
+        # Just a consistency check to make sure we are not running tests on 80M parameter models.
+        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            num_params = model.num_parameters()
+            assert (
+                num_params < 3000000
+            ), f"{model_class} is too big for the common tests ({num_params})! It should have 3M max."
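As a rough usage sketch (not part of the patch series itself), the pieces added so far can be exercised end to end. It assumes the public exports used by the conversion script in the next commit (`FastConfig`, `FASTForImageCaptioning`), the default `FastConfig` hyperparameters, and the labels layout built in `_prepare_for_class` above, so treat the exact output shape as illustrative rather than guaranteed:

    import torch

    from transformers import FastConfig, FASTForImageCaptioning

    config = FastConfig()  # defaults from configuration_fast.py
    model = FASTForImageCaptioning(config)
    model.eval()

    pixel_values = torch.rand(1, 3, 640, 640)
    with torch.no_grad():
        outputs = model(pixel_values=pixel_values)

    # The detection head output is upsampled to a quarter of the input resolution,
    # which is what the (batch_size, 5, 125, 125) assertion above checks for 500x500 inputs.
    print(outputs.hidden_states.shape)

    # Passing the labels dict built as in _prepare_for_class returns the combined
    # text / kernel / embedding loss as outputs.loss.
    labels = {
        "gt_texts": torch.zeros(1, 640, 640),
        "gt_kernels": torch.zeros(1, 640, 640),
        "gt_instances": torch.zeros(1, 640, 640),
        "training_masks": torch.ones(1, 640, 640),
    }
    outputs = model(pixel_values=pixel_values, labels=labels)
    print(outputs.loss)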
From 3b15aa97e791e0beda5cb20a46a4c0dae8caf210 Mon Sep 17 00:00:00 2001 From: raghavanone Date: Sun, 15 Oct 2023 11:49:51 +0530 Subject: [PATCH 007/152] Add conversion script --- .../models/fast/configuration_fast.py | 2 +- .../fast/convert_fast_original_to_pytorch.py | 256 ++++++++++++++++++ 2 files changed, 257 insertions(+), 1 deletion(-) create mode 100644 src/transformers/models/fast/convert_fast_original_to_pytorch.py diff --git a/src/transformers/models/fast/configuration_fast.py b/src/transformers/models/fast/configuration_fast.py index 5b57ac482a0e..ad72054b5cee 100644 --- a/src/transformers/models/fast/configuration_fast.py +++ b/src/transformers/models/fast/configuration_fast.py @@ -47,7 +47,7 @@ def __init__( neck_dilation=[1, 1, 1, 1], neck_groups=[1, 1, 1, 1], head_pooling_size=9, - head_dropout_ratio=0.1, + head_dropout_ratio=0, head_conv_in_channels=512, head_conv_out_channels=128, head_conv_kernel_size=[3, 3], diff --git a/src/transformers/models/fast/convert_fast_original_to_pytorch.py b/src/transformers/models/fast/convert_fast_original_to_pytorch.py new file mode 100644 index 000000000000..24f0c3dd56e0 --- /dev/null +++ b/src/transformers/models/fast/convert_fast_original_to_pytorch.py @@ -0,0 +1,256 @@ +# coding=utf-8 +# Copyright 2023 Microsoft Research and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
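The entry point of this script is `convert_fast_checkpoint`, defined at the bottom of the file and wired to argparse under `__main__`. A hypothetical call is sketched here; the argument names follow the function signature below, the URL and output-path values are placeholders, and at this stage the argparse defaults still point at BEiT checkpoint URLs:

    from transformers.models.fast.convert_fast_original_to_pytorch import convert_fast_checkpoint

    convert_fast_checkpoint(
        checkpoint_url="<URL of the original FAST .pth checkpoint>",
        checkpoint_config_url="<URL of the matching original FAST config .py file>",
        pytorch_dump_folder_path="./fast-converted",
        validate_logits=False,
    )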
+ +import argparse +import copy +import json + +import numpy as np +import pandas as pd +import requests +import torch +from huggingface_hub import hf_hub_download +from PIL import Image + +from transformers import ( + FastConfig, + FASTForImageCaptioning +) +from transformers.utils.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD + +tiny_config_url = "https://raw.githubusercontent.com/czczup/FAST/main/config/fast/nas-configs/fast_tiny.config" +small_config_url = "https://raw.githubusercontent.com/czczup/FAST/main/config/fast/nas-configs/fast_small.config" +base_config_url = "https://raw.githubusercontent.com/czczup/FAST/main/config/fast/nas-configs/fast_base.config" + +rename_key_mappings = { + "head": "classifier", + "text_embed": "text_embedding", + "vision_embed": "vision_embedding", + "k_proj": "key_proj", + "q_proj": "query_proj", + "v_proj": "value_proj", + "A": "text", + "B": "image", + "layer_norm": "fc_norm", + "self_attn_fc_norm": "self_attn_layer_norm", + "final_fc_norm": "final_layer_norm", + "first": "first", +} + + +def prepare_img(): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + im = Image.open(requests.get(url, stream=True).raw) + return im + + +def prepare_config(size_config_url, pooling_size, min_area, min_score, bbox_type, loss_bg): + config_dict = json.loads(requests.get(size_config_url).text) + + backbone_config = {} + for stage_ix in range(1, 5): + stage_config = config_dict[f'stage{stage_ix}'] + + merged_dict = {} + + # Iterate through the list of dictionaries + for layer in stage_config: + for key, value in layer.items(): + if key != "name": + # Check if the key is already in the merged_dict + if key in merged_dict: + merged_dict[key].append(value) + else: + # If the key is not in merged_dict, create a new list with the value + merged_dict[key] = [value] + backbone_config[f'stage{stage_ix}'] = merged_dict + + neck_in_channels = [] + neck_out_channels = [] + neck_kernel_size = [] + neck_stride = [] + neck_dilation = [] + neck_groups = [] + + for i in range(1, 5): + layer_key = f"reduce_layer{i}" + layer_dict = config_dict['neck'].get(layer_key) + + if layer_dict: + # Append values to the corresponding lists + neck_in_channels.append(layer_dict["in_channels"]) + neck_out_channels.append(layer_dict["out_channels"]) + neck_kernel_size.append(layer_dict["kernel_size"]) + neck_stride.append(layer_dict["stride"]) + neck_dilation.append(layer_dict["dilation"]) + neck_groups.append(layer_dict["groups"]) + + return FastConfig( + backbone_kernel_size=config_dict["first_conv"]["kernel_size"], + backbone_stride=config_dict["first_conv"]["stride"], + backbone_dilation=config_dict["first_conv"]["dilation"], + backbone_groups=config_dict["first_conv"]["groups"], + backbone_bias=config_dict["first_conv"]["bias"], + backbone_has_shuffle=config_dict["first_conv"]["has_shuffle"], + backbone_in_channels=config_dict["first_conv"]["in_channels"], + backbone_out_channels=config_dict["first_conv"]["out_channels"], + backbone_use_bn=config_dict["first_conv"]["use_bn"], + backbone_act_func=config_dict["first_conv"]["act_func"], + backbone_dropout_rate=config_dict["first_conv"]["dropout_rate"], + backbone_ops_order=config_dict["first_conv"]["ops_order"], + + backbone_stage1_in_channels=backbone_config['stage1']['in_channels'], + backbone_stage1_out_channels=backbone_config['stage1']['out_channels'], + backbone_stage1_kernel_size=backbone_config['stage1']['kernel_size'], + backbone_stage1_stride=backbone_config['stage1']['stride'], + 
backbone_stage1_dilation=backbone_config['stage1']['dilation'], + backbone_stage1_groups=backbone_config['stage1']['groups'], + + backbone_stage2_in_channels=backbone_config['stage2']['in_channels'], + backbone_stage2_out_channels=backbone_config['stage2']['out_channels'], + backbone_stage2_kernel_size=backbone_config['stage2']['kernel_size'], + backbone_stage2_stride=backbone_config['stage2']['stride'], + backbone_stage2_dilation=backbone_config['stage2']['dilation'], + backbone_stage2_groups=backbone_config['stage2']['groups'], + + backbone_stage3_in_channels=backbone_config['stage3']['in_channels'], + backbone_stage3_out_channels=backbone_config['stage3']['out_channels'], + backbone_stage3_kernel_size=backbone_config['stage3']['kernel_size'], + backbone_stage3_stride=backbone_config['stage3']['stride'], + backbone_stage3_dilation=backbone_config['stage3']['dilation'], + backbone_stage3_groups=backbone_config['stage3']['groups'], + + backbone_stage4_in_channels=backbone_config['stage4']['in_channels'], + backbone_stage4_out_channels=backbone_config['stage4']['out_channels'], + backbone_stage4_kernel_size=backbone_config['stage4']['kernel_size'], + backbone_stage4_stride=backbone_config['stage4']['stride'], + backbone_stage4_dilation=backbone_config['stage4']['dilation'], + backbone_stage4_groups=backbone_config['stage4']['groups'], + + neck_in_channels=neck_in_channels, + neck_out_channels=neck_out_channels, + neck_kernel_size=neck_kernel_size, + neck_stride=neck_stride, + neck_dilation=neck_dilation, + neck_groups=neck_groups, + + head_pooling_size=pooling_size, + head_dropout_ratio=0.1, + head_conv_in_channels=config_dict['head']['conv']['in_channels'], + head_conv_out_channels=config_dict['head']['conv']['out_channels'], + head_conv_kernel_size=config_dict['head']['conv']['kernel_size'], + head_conv_stride=config_dict['head']['conv']['stride'], + head_conv_dilation=config_dict['head']['conv']['dilation'], + head_conv_groups=config_dict['head']['conv']['groups'], + + head_final_kernel_size=config_dict['head']['final']['kernel_size'], + head_final_stride=config_dict['head']['final']['stride'], + head_final_dilation=config_dict['head']['final']['dilation'], + head_final_groups=config_dict['head']['final']['groups'], + head_final_bias=config_dict['head']['final']['bias'], + head_final_has_shuffle=config_dict['head']['final']['has_shuffle'], + head_final_in_channels=config_dict['head']['final']['in_channels'], + head_final_out_channels=config_dict['head']['final']['out_channels'], + head_final_use_bn=config_dict['head']['final']['use_bn'], + head_final_act_func=config_dict['head']['final']['act_func'], + head_final_dropout_rate=config_dict['head']['final']['dropout_rate'], + head_final_ops_order=config_dict['head']['final']['ops_order'], + + min_area=min_area, + min_score=min_score, + bbox_type=bbox_type, + loss_bg=loss_bg, + ) + + +def get_small_model_config(): + pass + + +def get_base_model_config(): + pass + + +def convert_fast_checkpoint(checkpoint_url, checkpoint_config_url, pytorch_dump_folder_path, validate_logits): + response = requests.get(checkpoint_config_url) + content = response.text + + namespace = {} + + exec(content, namespace) + + model_config = namespace.get('model') + test_config = namespace.get('test_cfg', None) + + min_score = 0.88 + min_area = 250 + bbox_type = 'rect' + loss_bg = False + if test_config is not None: + min_area = test_config.get('min_area', min_area) + min_score = test_config.get('min_area', min_score) + bbox_type = test_config.get('min_area', 
bbox_type) + loss_bg = test_config.get('loss_emb', None) == "EmbLoss_v2" + + if 'tiny' in model_config['backbone']['config']: + config = prepare_config(tiny_config_url, model_config['detection_head']['pooling_size'], + min_area, min_score, bbox_type, loss_bg) + elif 'small' in model_config['backbone']['config']: + config = prepare_config(small_config_url, model_config['detection_head']['pooling_size'], + min_area, min_score, bbox_type, loss_bg) + else: + config = prepare_config(base_config_url, model_config['detection_head']['pooling_size'], + min_area, min_score, bbox_type, loss_bg) + + model = FASTForImageCaptioning(config) + state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu", check_hash=True)['ema'] + state_dict_changed = copy.deepcopy(state_dict) + for key in state_dict: + val = state_dict_changed.pop(key) + state_dict_changed[key.replace('module.', '')] = val + model.load_state_dict(state_dict_changed) + + model.save_pretrained(pytorch_dump_folder_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument( + "--checkpoint_url", + default="https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_base_patch16_224_pt22k_ft22kto1k.pth", + type=str, + help="URL to the original PyTorch checkpoint (.pth file).", + ) + parser.add_argument( + "--checkpoint_config_url", + default="https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_base_patch16_224_pt22k_ft22kto1k.pth", + type=str, + help="URL to the original PyTorch checkpoint (.pth file).", + ) + parser.add_argument( + "--pytorch_dump_folder_path", default=None, type=str, help="Path to the folder to output PyTorch model." + ) + parser.add_argument( + "--validate_logits", + default=False, + type=bool, + help="whether to assert logits outputs", + ) + args = parser.parse_args() + + convert_fast_checkpoint( + args.checkpoint_url, args.checkpoint_config_url, args.pytorch_dump_folder_path, args.validate_logits + ) From c565cf334950f8d42a1e5ec61d880c6e0eb46d8b Mon Sep 17 00:00:00 2001 From: raghavanone Date: Fri, 20 Oct 2023 20:47:02 +0530 Subject: [PATCH 008/152] Add conversion scripts, integration tests, image processor --- .../fast/convert_fast_original_to_pytorch.py | 16 +- .../models/fast/image_processing_fast.py | 603 ++++++++++++++++++ src/transformers/models/fast/modeling_fast.py | 236 ++++--- tests/models/fast/test_modeling_fast.py | 66 +- 4 files changed, 789 insertions(+), 132 deletions(-) create mode 100644 src/transformers/models/fast/image_processing_fast.py diff --git a/src/transformers/models/fast/convert_fast_original_to_pytorch.py b/src/transformers/models/fast/convert_fast_original_to_pytorch.py index 24f0c3dd56e0..7ef78a312080 100644 --- a/src/transformers/models/fast/convert_fast_original_to_pytorch.py +++ b/src/transformers/models/fast/convert_fast_original_to_pytorch.py @@ -16,6 +16,7 @@ import argparse import copy import json +import logging import numpy as np import pandas as pd @@ -28,6 +29,7 @@ FastConfig, FASTForImageCaptioning ) +from transformers.models.fast.image_processing_fast import FastImageProcessor from transformers.utils.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD tiny_config_url = "https://raw.githubusercontent.com/czczup/FAST/main/config/fast/nas-configs/fast_tiny.config" @@ -193,6 +195,7 @@ def convert_fast_checkpoint(checkpoint_url, checkpoint_config_url, pytorch_dump_ model_config = namespace.get('model') test_config = namespace.get('test_cfg', None) + data_config = namespace.get('data') 
min_score = 0.88 min_area = 250 @@ -200,8 +203,8 @@ def convert_fast_checkpoint(checkpoint_url, checkpoint_config_url, pytorch_dump_ loss_bg = False if test_config is not None: min_area = test_config.get('min_area', min_area) - min_score = test_config.get('min_area', min_score) - bbox_type = test_config.get('min_area', bbox_type) + min_score = test_config.get('min_score', min_score) + bbox_type = test_config.get('bbox_type', bbox_type) loss_bg = test_config.get('loss_emb', None) == "EmbLoss_v2" if 'tiny' in model_config['backbone']['config']: @@ -213,8 +216,15 @@ def convert_fast_checkpoint(checkpoint_url, checkpoint_config_url, pytorch_dump_ else: config = prepare_config(base_config_url, model_config['detection_head']['pooling_size'], min_area, min_score, bbox_type, loss_bg) + size = 640 + if "train" in data_config: + if "short_size" in data_config['train']: + size = data_config['train']['short_size'] model = FASTForImageCaptioning(config) + fast_image_processor = FastImageProcessor(size={'height': size, 'width': size}, min_score=config.min_score, + min_area=config.min_area, + bbox_type=config.bbox_type, pooling_size=config.head_pooling_size) state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu", check_hash=True)['ema'] state_dict_changed = copy.deepcopy(state_dict) for key in state_dict: @@ -223,6 +233,8 @@ def convert_fast_checkpoint(checkpoint_url, checkpoint_config_url, pytorch_dump_ model.load_state_dict(state_dict_changed) model.save_pretrained(pytorch_dump_folder_path) + fast_image_processor.save_pretrained(pytorch_dump_folder_path) + logging.info("The converted weights are save here : " + pytorch_dump_folder_path) if __name__ == "__main__": diff --git a/src/transformers/models/fast/image_processing_fast.py b/src/transformers/models/fast/image_processing_fast.py new file mode 100644 index 000000000000..637aea38e086 --- /dev/null +++ b/src/transformers/models/fast/image_processing_fast.py @@ -0,0 +1,603 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
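
Once the conversion has run, the saved artifacts can be loaded back and exercised end to end, mirroring the integration tests added later in this series. A minimal sketch, assuming a local dump folder and an arbitrary test image URL (both placeholders):

import requests
import torch
from PIL import Image

from transformers import FASTForImageCaptioning
from transformers.models.fast.image_processing_fast import FastImageProcessor

# "./fast-tiny-converted" stands in for the folder produced by the conversion script.
model = FASTForImageCaptioning.from_pretrained("./fast-tiny-converted")
image_processor = FastImageProcessor.from_pretrained("./fast-tiny-converted")

image = Image.open(requests.get("https://example.com/scene_text.jpg", stream=True).raw).convert("RGB")
inputs = image_processor(image, return_tensors="pt")

with torch.no_grad():
    outputs = model(pixel_values=inputs["pixel_values"])

# get_results maps the text-kernel predictions back to boxes in the coordinate
# system given by target_sizes (here, the original image height and width).
results = image_processor.get_results(outputs, target_sizes=[(image.height, image.width)])
print(results[0]["bboxes"], results[0]["scores"])
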
+"""Image processor class for Beit.""" +import math +import warnings +from typing import Any, Dict, List, Optional, Tuple, Union +import torch.nn.functional as F +import torch.nn as nn + +import cv2 +import numpy as np + +from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +from ...image_transforms import resize, to_channel_dimension_format +from ...image_utils import ( + IMAGENET_STANDARD_MEAN, + IMAGENET_STANDARD_STD, + ChannelDimension, + ImageInput, + PILImageResampling, + infer_channel_dimension_format, + is_scaled_image, + make_list_of_images, + to_numpy_array, + valid_images, +) +from ...utils import TensorType, is_torch_available, is_torch_tensor, is_vision_available, logging, \ + IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD + +if is_vision_available(): + import PIL + +if is_torch_available(): + import torch + +logger = logging.get_logger(__name__) + + +class FastImageProcessor(BaseImageProcessor): + r""" + Constructs a BEiT image processor. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the + `do_resize` parameter in the `preprocess` method. + size (`Dict[str, int]` *optional*, defaults to `{"height": 256, "width": 256}`): + Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess` + method. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): + Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the + `preprocess` method. + do_center_crop (`bool`, *optional*, defaults to `True`): + Whether to center crop the image. If the input size is smaller than `crop_size` along any edge, the image + is padded with 0's and then center cropped. Can be overridden by the `do_center_crop` parameter in the + `preprocess` method. + crop_size (`Dict[str, int]`, *optional*, defaults to `{"height": 224, "width": 224}`): + Desired output size when applying center-cropping. Only has an effect if `do_center_crop` is set to `True`. + Can be overridden by the `crop_size` parameter in the `preprocess` method. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale` + parameter in the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the + `preprocess` method. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess` + method. + image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`): + The mean to use if normalizing the image. This is a float or list of floats of length of the number of + channels of the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`): + The standard deviation to use if normalizing the image. This is a float or list of floats of length of the + number of channels of the image. Can be overridden by the `image_std` parameter in the `preprocess` method. + do_reduce_labels (`bool`, *optional*, defaults to `False`): + Whether or not to reduce all label values of segmentation maps by 1. 
Usually used for datasets where 0 is + used for background, and background itself is not included in all classes of a dataset (e.g. ADE20k). The + background label will be replaced by 255. Can be overridden by the `do_reduce_labels` parameter in the + `preprocess` method. + """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + do_resize: bool = True, + size: Dict[str, int] = None, + resample: PILImageResampling = PILImageResampling.BICUBIC, + do_center_crop: bool = False, + crop_size: Dict[str, int] = None, + rescale_factor: Union[int, float] = 1 / 255, + do_rescale: bool = True, + do_normalize: bool = True, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_reduce_labels: bool = False, + min_area: int = 10, + min_score: float = 0.88, + bbox_type: str = "rect", + pooling_size: int = 9, + **kwargs, + ) -> None: + if "reduce_labels" in kwargs: + warnings.warn( + "The `reduce_labels` parameter is deprecated and will be removed in a future version. Please use" + " `do_reduce_labels` instead.", + FutureWarning, + ) + do_reduce_labels = kwargs.pop("reduce_labels") + super().__init__(**kwargs) + size = size if size is not None else {"height": 640, "width": 640} + size = get_size_dict(size) + crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224} + crop_size = get_size_dict(crop_size, param_name="crop_size") + self.do_resize = do_resize + self.size = size + self.resample = resample + self.do_center_crop = do_center_crop + self.crop_size = crop_size + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN + self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD + self.do_reduce_labels = do_reduce_labels + self.min_area = min_area + self.min_score = min_score + self.bbox_type = bbox_type + self.pooling_size = pooling_size + + @classmethod + def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs): + """ + Overrides the `from_dict` method from the base class to make sure `reduce_labels` is updated if image processor + is created using from_dict and kwargs e.g. `BeitImageProcessor.from_pretrained(checkpoint, reduce_labels=True)` + """ + image_processor_dict = image_processor_dict.copy() + if "reduce_labels" in kwargs: + image_processor_dict["reduce_labels"] = kwargs.pop("reduce_labels") + return super().from_dict(image_processor_dict, **kwargs) + + def resize( + self, + image: np.ndarray, + size: Dict[str, int], + resample: PILImageResampling = PILImageResampling.BICUBIC, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: + """ + Resize an image to (size["height"], size["width"]). + + Args: + image (`np.ndarray`): + Image to resize. + size (`Dict[str, int]`): + Size of the output image. + resample (`PILImageResampling`, *optional*, defaults to `PIL.Image.BICUBIC`): + Resampling filter to use when resiizing the image. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + input_data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred. 
+ """ + size = get_size_dict(size, default_to_square=True, param_name="size") + if "height" not in size or "width" not in size: + raise ValueError(f"The `size` argument must contain `height` and `width` keys. Got {size.keys()}") + return resize( + image, + size=(size["height"], size["width"]), + resample=resample, + data_format=data_format, + input_data_format=input_data_format, + **kwargs, + ) + + def reduce_label(self, label: ImageInput) -> np.ndarray: + label = to_numpy_array(label) + # Avoid using underflow conversion + label[label == 0] = 255 + label = label - 1 + label[label == 254] = 255 + return label + + def _preprocess( + self, + image: ImageInput, + do_reduce_labels: bool = None, + do_resize: bool = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + do_center_crop: bool = None, + crop_size: Dict[str, int] = None, + do_rescale: bool = None, + rescale_factor: float = None, + do_normalize: bool = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ): + if do_reduce_labels: + image = self.reduce_label(image) + + if do_resize: + image = self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format) + + if do_center_crop: + image = self.center_crop(image=image, size=crop_size, input_data_format=input_data_format) + + if do_rescale: + image = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) + + if do_normalize: + image = self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format) + + return image + + def _preprocess_image( + self, + image: ImageInput, + do_resize: bool = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + do_center_crop: bool = None, + crop_size: Dict[str, int] = None, + do_rescale: bool = None, + rescale_factor: float = None, + do_normalize: bool = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> np.ndarray: + """Preprocesses a single image.""" + # All transformations expect numpy arrays. + image = to_numpy_array(image) + if is_scaled_image(image) and do_rescale: + logger.warning_once( + "It looks like you are trying to rescale already rescaled images. If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." 
+ ) + if input_data_format is None: + input_data_format = infer_channel_dimension_format(image) + image = self._preprocess( + image, + do_reduce_labels=False, + do_resize=do_resize, + size=size, + resample=resample, + do_center_crop=do_center_crop, + crop_size=crop_size, + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + input_data_format=input_data_format, + ) + if data_format is not None: + image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) + return image + + def _preprocess_segmentation_map( + self, + segmentation_map: ImageInput, + do_resize: bool = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + do_center_crop: bool = None, + crop_size: Dict[str, int] = None, + do_reduce_labels: bool = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ): + """Preprocesses a single segmentation map.""" + # All transformations expect numpy arrays. + segmentation_map = to_numpy_array(segmentation_map) + # Add an axis to the segmentation maps for transformations. + if segmentation_map.ndim == 2: + segmentation_map = segmentation_map[None, ...] + added_dimension = True + input_data_format = ChannelDimension.FIRST + else: + added_dimension = False + if input_data_format is None: + input_data_format = infer_channel_dimension_format(segmentation_map, num_channels=1) + segmentation_map = self._preprocess( + image=segmentation_map, + do_reduce_labels=do_reduce_labels, + do_resize=do_resize, + resample=resample, + size=size, + do_center_crop=do_center_crop, + crop_size=crop_size, + do_normalize=False, + do_rescale=False, + input_data_format=ChannelDimension.FIRST, + ) + # Remove extra axis if added + if added_dimension: + segmentation_map = np.squeeze(segmentation_map, axis=0) + segmentation_map = segmentation_map.astype(np.int64) + return segmentation_map + + def __call__(self, images, segmentation_maps=None, **kwargs): + # Overrides the `__call__` method of the `Preprocessor` class such that the images and segmentation maps can both + # be passed in as positional arguments. + return super().__call__(images, segmentation_maps=segmentation_maps, **kwargs) + + def preprocess( + self, + images: ImageInput, + segmentation_maps: Optional[ImageInput] = None, + do_resize: bool = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + do_center_crop: bool = None, + crop_size: Dict[str, int] = None, + do_rescale: bool = None, + rescale_factor: float = None, + do_normalize: bool = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_reduce_labels: Optional[bool] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: ChannelDimension = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> PIL.Image.Image: + """ + Preprocess an image or batch of images. + + Args: + images (`ImageInput`): + Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If + passing in images with pixel values between 0 and 1, set `do_rescale=False`. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`Dict[str, int]`, *optional*, defaults to `self.size`): + Size of the image after resizing. + resample (`int`, *optional*, defaults to `self.resample`): + Resampling filter to use if resizing the image. 
This can be one of the enum `PILImageResampling`, Only + has an effect if `do_resize` is set to `True`. + do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`): + Whether to center crop the image. + crop_size (`Dict[str, int]`, *optional*, defaults to `self.crop_size`): + Size of the image after center crop. If one edge the image is smaller than `crop_size`, it will be + padded with zeros and then cropped + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image values between [0 - 1]. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. + image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): + Image mean. + image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation. + do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`): + Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0 + is used for background, and background itself is not included in all classes of a dataset (e.g. + ADE20k). The background label will be replaced by 255. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: Use the channel dimension format of the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. 
+ """ + do_resize = do_resize if do_resize is not None else self.do_resize + size = size if size is not None else self.size + size = get_size_dict(size, default_to_square=True, param_name="size") + resample = resample if resample is not None else self.resample + do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop + crop_size = crop_size if crop_size is not None else self.crop_size + crop_size = get_size_dict(crop_size, default_to_square=True, param_name="crop_size") + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + do_reduce_labels = do_reduce_labels if do_reduce_labels is not None else self.do_reduce_labels + + images = make_list_of_images(images) + if segmentation_maps is not None: + segmentation_maps = make_list_of_images(segmentation_maps, expected_ndims=2) + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + + if segmentation_maps is not None and not valid_images(segmentation_maps): + raise ValueError( + "Invalid segmentation map type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + + if do_resize and size is None or resample is None: + raise ValueError("Size and resample must be specified if do_resize is True.") + + if do_center_crop and crop_size is None: + raise ValueError("Crop size must be specified if do_center_crop is True.") + + if do_rescale and rescale_factor is None: + raise ValueError("Rescale factor must be specified if do_rescale is True.") + + if do_normalize and (image_mean is None or image_std is None): + raise ValueError("Image mean and std must be specified if do_normalize is True.") + + images = [ + self._preprocess_image( + image=img, + do_resize=do_resize, + do_center_crop=do_center_crop, + do_rescale=do_rescale, + do_normalize=do_normalize, + resample=resample, + size=size, + rescale_factor=rescale_factor, + crop_size=crop_size, + image_mean=image_mean, + image_std=image_std, + data_format=data_format, + input_data_format=input_data_format, + ) + for img in images + ] + + data = {"pixel_values": images} + + if segmentation_maps is not None: + segmentation_maps = [ + self._preprocess_segmentation_map( + segmentation_map=segmentation_map, + do_reduce_labels=do_reduce_labels, + do_resize=do_resize, + resample=resample, + size=size, + do_center_crop=do_center_crop, + crop_size=crop_size, + ) + for segmentation_map in segmentation_maps + ] + data["labels"] = segmentation_maps + + return BatchFeature(data=data, tensor_type=return_tensors) + + def post_process_semantic_segmentation(self, outputs, target_sizes: List[Tuple] = None): + """ + Converts the output of [`BeitForSemanticSegmentation`] into semantic segmentation maps. Only supports PyTorch. + + Args: + outputs ([`BeitForSemanticSegmentation`]): + Raw outputs of the model. + target_sizes (`List[Tuple]` of length `batch_size`, *optional*): + List of tuples corresponding to the requested final size (height, width) of each prediction. If unset, + predictions will not be resized. 
+ + Returns: + semantic_segmentation: `List[torch.Tensor]` of length `batch_size`, where each item is a semantic + segmentation map of shape (height, width) corresponding to the target_sizes entry (if `target_sizes` is + specified). Each entry of each `torch.Tensor` correspond to a semantic class id. + """ + # TODO: add support for other frameworks + logits = outputs.logits + + # Resize logits and compute semantic segmentation maps + if target_sizes is not None: + if len(logits) != len(target_sizes): + raise ValueError( + "Make sure that you pass in as many target sizes as the batch dimension of the logits" + ) + + if is_torch_tensor(target_sizes): + target_sizes = target_sizes.numpy() + + semantic_segmentation = [] + + for idx in range(len(logits)): + resized_logits = torch.nn.functional.interpolate( + logits[idx].unsqueeze(dim=0), size=target_sizes[idx], mode="bilinear", align_corners=False + ) + semantic_map = resized_logits[0].argmax(dim=0) + semantic_segmentation.append(semantic_map) + else: + semantic_segmentation = logits.argmax(dim=1) + semantic_segmentation = [semantic_segmentation[i] for i in range(semantic_segmentation.shape[0])] + + return semantic_segmentation + + def _max_pooling(self, x, scale=1): + if scale == 1: + x = nn.MaxPool2d(kernel_size=self.pooling_size, stride=1, + padding=(self.pooling_size - 1) // 2)(x) + elif scale == 2: + x = nn.MaxPool2d(kernel_size=self.pooling_size // 2 + 1, stride=1, + padding=(self.pooling_size // 2) // 2)(x) + return x + + def get_results(self, output, target_sizes): + scale = 2 + img_size = (self.size['height'], self.size['width']) + out = output['hidden_states'] + batch_size = out.size(0) + final_results = dict() + + texts = F.interpolate(out[:, 0:1, :, :], size=(img_size[0] // scale, img_size[1] // scale), + mode='nearest') # B*1*320*320 + texts = self._max_pooling(texts, scale=scale) # B*1*320*320 + score_maps = torch.sigmoid_(texts) # B*1*320*320 + score_maps = F.interpolate(score_maps, size=(img_size[0], img_size[1]), mode='nearest') # B*1*640*640 + score_maps = score_maps.squeeze(1) # B*640*640 + + kernels = (out[:, 0, :, :] > 0).to(torch.uint8) # B*160*160 + labels_ = [] + for kernel in kernels.numpy(): + ret, label_ = cv2.connectedComponents(kernel) + labels_.append(label_) + labels_ = np.array(labels_) + labels_ = torch.from_numpy(labels_) + labels = labels_.unsqueeze(1).to(torch.float32) # B*1*160*160 + labels = F.interpolate(labels, size=(img_size[0] // scale, img_size[1] // scale), mode='nearest') # B*1*320*320 + labels = self._max_pooling(labels, scale=scale) + labels = F.interpolate(labels, size=(img_size[0], img_size[1]), mode='nearest') # B*1*640*640 + labels = labels.squeeze(1).to(torch.int32) # B*640*640 + + keys = [torch.unique(labels_[i], sorted=True) for i in range(batch_size)] + + final_results.update(dict(kernels=kernels.data.cpu())) + + results = [] + for i in range(batch_size): + org_img_size = target_sizes[i] + scales = (float(org_img_size[1]) / float(img_size[1]), + float(org_img_size[0]) / float(img_size[0])) + + bboxes, scores = self.generate_bbox(keys[i], labels[i], score_maps[i], scales) + results.append(dict( + bboxes=bboxes, + scores=scores + )) + final_results.update(dict(results=results)) + + return results + + def generate_bbox(self, keys, label, score, scales): + label_num = len(keys) + bboxes = [] + scores = [] + for index in range(1, label_num): + i = keys[index] + ind = (label == i) + ind_np = ind.data.cpu().numpy() + points = np.array(np.where(ind_np)).transpose((1, 0)) + if points.shape[0] < 
self.min_area: + label[ind] = 0 + continue + score_i = score[ind].mean().item() + if score_i < self.min_score: + label[ind] = 0 + continue + + if self.bbox_type == 'rect': + rect = cv2.minAreaRect(points[:, ::-1]) + alpha = math.sqrt(math.sqrt(points.shape[0] / (rect[1][0] * rect[1][1]))) + rect = (rect[0], (rect[1][0] * alpha, rect[1][1] * alpha), rect[2]) + bbox = cv2.boxPoints(rect) * scales + + elif self.bbox_type == 'poly': + binary = np.zeros(label.shape, dtype='uint8') + binary[ind_np] = 1 + contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + bbox = contours[0] * scales + bbox = bbox.astype('int32') + bboxes.append(bbox.reshape(-1).tolist()) + scores.append(score_i) + return bboxes, scores diff --git a/src/transformers/models/fast/modeling_fast.py b/src/transformers/models/fast/modeling_fast.py index 798ecba93aa2..dc892f0e58ce 100644 --- a/src/transformers/models/fast/modeling_fast.py +++ b/src/transformers/models/fast/modeling_fast.py @@ -103,7 +103,9 @@ def weight_op(self): """ Methods defined in MyModule""" def forward(self, x): - for module in self._modules.values(): + for key, module in self._modules.items(): + if key == 'bn' and not self.training: + continue x = module(x) return x @@ -134,7 +136,7 @@ def is_zero_layer(): return False -class ConvLayer(My2DLayer): +class ConvLayer(nn.Module): def __init__( self, in_channels, @@ -148,18 +150,19 @@ def __init__( use_bn=True, act_func="relu", dropout_rate=0, - ops_order="weight_bn_act", + use_act=True ): + + super().__init__() + self.kernel_size = kernel_size self.stride = stride self.dilation = dilation self.groups = groups self.bias = bias self.has_shuffle = has_shuffle + self.act_func = act_func - super(ConvLayer, self).__init__(in_channels, out_channels, use_bn, act_func, dropout_rate, ops_order) - - def weight_op(self): padding = get_same_padding(self.kernel_size) if isinstance(padding, int): padding *= self.dilation @@ -167,23 +170,61 @@ def weight_op(self): padding[0] *= self.dilation padding[1] *= self.dilation - weight_dict = OrderedDict() - weight_dict["conv"] = nn.Conv2d( - self.in_channels, - self.out_channels, - kernel_size=self.kernel_size, - stride=self.stride, + self.conv = nn.Conv2d( + in_channels, + out_channels, + kernel_size=kernel_size, + stride=stride, padding=padding, - dilation=self.dilation, - groups=self.groups, - bias=self.bias, + dilation=dilation, + groups=groups, + bias=bias, ) + self.bn = nn.Identity() + if use_bn: + self.bn = nn.BatchNorm2d(out_channels) - return weight_dict + self.act = nn.Identity() + if use_act: + act = build_activation(self.act_func, True) + if act is not None: + self.act = act + + def forward(self, x): + if self.training: + if hasattr(self, 'fused_conv'): + delattr(self, 'fused_conv') + x = self.conv(x) + x = self.bn(x) + return self.act(x) + else: + if not hasattr(self, 'fused_conv'): + setattr(self, 'fused_conv', self.fuse_conv_bn(self.conv, self.bn)) + x = self.fused_conv(x) + if self.act is not None: + x = self.act(x) + return x + + def fuse_conv_bn(self, conv, bn): + """During inference, the functionary of batch norm layers is turned off but + only the mean and var alone channels are used, which exposes the chance to + fuse it with the preceding conv layers to save computations and simplify + network structures.""" + if isinstance(bn, nn.Identity): + return conv + conv_w = conv.weight + conv_b = conv.bias if conv.bias is not None else torch.zeros_like( + bn.running_mean) + + factor = bn.weight / torch.sqrt(bn.running_var + bn.eps) + 
conv.weight = nn.Parameter(conv_w * + factor.reshape([conv.out_channels, 1, 1, 1])) + conv.bias = nn.Parameter((conv_b - bn.running_mean) * factor + bn.bias) + return conv class RepConvLayer(nn.Module): - def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, groups=1, deploy=False): + def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, groups=1): super(RepConvLayer, self).__init__() self.in_channels = in_channels @@ -192,78 +233,66 @@ def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, self.stride = stride self.dilation = dilation self.groups = groups - self.deploy = deploy assert len(kernel_size) == 2 padding = (int(((kernel_size[0] - 1) * dilation) / 2), int(((kernel_size[1] - 1) * dilation) / 2)) self.nonlinearity = nn.ReLU(inplace=True) - if deploy: - self.fused_conv = nn.Conv2d( + self.main_conv = nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + bias=False, + ) + self.main_bn = nn.BatchNorm2d(num_features=out_channels) + + ver_pad = (int(((kernel_size[0] - 1) * dilation) / 2), 0) + hor_pad = (0, int(((kernel_size[1] - 1) * dilation) / 2)) + + if kernel_size[1] != 1: + self.ver_conv = nn.Conv2d( in_channels=in_channels, out_channels=out_channels, - kernel_size=kernel_size, + kernel_size=(kernel_size[0], 1), stride=stride, - padding=padding, + padding=ver_pad, dilation=dilation, groups=groups, - bias=True, + bias=False, ) + self.ver_bn = nn.BatchNorm2d(num_features=out_channels) else: - self.main_conv = nn.Conv2d( + self.ver_conv, self.ver_bn = None, None + + if kernel_size[0] != 1: # 卷积核的高大于1 -> 有水平卷积 + self.hor_conv = nn.Conv2d( in_channels=in_channels, out_channels=out_channels, - kernel_size=kernel_size, + kernel_size=(1, kernel_size[1]), stride=stride, - padding=padding, + padding=hor_pad, dilation=dilation, groups=groups, bias=False, ) - self.main_bn = nn.BatchNorm2d(num_features=out_channels) - - ver_pad = (int(((kernel_size[0] - 1) * dilation) / 2), 0) - hor_pad = (0, int(((kernel_size[1] - 1) * dilation) / 2)) - - if kernel_size[1] != 1: # 卷积核的宽大于1 -> 有垂直卷积 - self.ver_conv = nn.Conv2d( - in_channels=in_channels, - out_channels=out_channels, - kernel_size=(kernel_size[0], 1), - stride=stride, - padding=ver_pad, - dilation=dilation, - groups=groups, - bias=False, - ) - self.ver_bn = nn.BatchNorm2d(num_features=out_channels) - else: - self.ver_conv, self.ver_bn = None, None - - if kernel_size[0] != 1: # 卷积核的高大于1 -> 有水平卷积 - self.hor_conv = nn.Conv2d( - in_channels=in_channels, - out_channels=out_channels, - kernel_size=(1, kernel_size[1]), - stride=stride, - padding=hor_pad, - dilation=dilation, - groups=groups, - bias=False, - ) - self.hor_bn = nn.BatchNorm2d(num_features=out_channels) - else: - self.hor_conv, self.hor_bn = None, None + self.hor_bn = nn.BatchNorm2d(num_features=out_channels) + else: + self.hor_conv, self.hor_bn = None, None - self.rbr_identity = ( - nn.BatchNorm2d(num_features=in_channels) if out_channels == in_channels and stride == 1 else None - ) + self.rbr_identity = ( + nn.BatchNorm2d(num_features=in_channels) if out_channels == in_channels and stride == 1 else None + ) def forward(self, input): - if hasattr(self, "fused_conv"): - return self.nonlinearity(self.fused_conv(input)) - else: + if self.training: + if hasattr(self, 'fused_conv'): + self.__delattr__('fused_conv') + main_outputs = self.main_conv(input) main_outputs = self.main_bn(main_outputs) if 
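
To make the batch-norm folding above concrete, here is a self-contained sketch (not part of the patch) that re-derives the same algebra on a toy conv + BN pair; unlike the in-place variant in ConvLayer, it returns a fresh layer and then checks that the fused convolution reproduces the eval-mode output.

import torch
from torch import nn


def fuse_conv_bn(conv: nn.Conv2d, bn: nn.BatchNorm2d) -> nn.Conv2d:
    """Fold eval-mode BatchNorm statistics into the preceding convolution."""
    fused = nn.Conv2d(
        conv.in_channels,
        conv.out_channels,
        kernel_size=conv.kernel_size,
        stride=conv.stride,
        padding=conv.padding,
        dilation=conv.dilation,
        groups=conv.groups,
        bias=True,
    )
    # Per-channel scale applied by BN at inference time.
    factor = bn.weight / torch.sqrt(bn.running_var + bn.eps)
    fused.weight.data = conv.weight * factor.reshape(-1, 1, 1, 1)
    conv_bias = conv.bias if conv.bias is not None else torch.zeros_like(bn.running_mean)
    fused.bias.data = (conv_bias - bn.running_mean) * factor + bn.bias
    return fused


conv = nn.Conv2d(3, 8, kernel_size=3, padding=1, bias=False)
bn = nn.BatchNorm2d(8)
bn.eval()  # use running statistics, as at inference time

x = torch.randn(1, 3, 16, 16)
assert torch.allclose(bn(conv(x)), fuse_conv_bn(conv, bn)(x), atol=1e-5)
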
self.ver_conv is not None: @@ -284,6 +313,10 @@ def forward(self, input): id_out = self.rbr_identity(input) return self.nonlinearity(main_outputs + vertical_outputs + horizontal_outputs + id_out) + else: + if not hasattr(self, 'fused_conv'): + self.prepare_for_eval() + return self.nonlinearity(self.fused_conv(input)) def _identity_to_conv(self, identity): if identity is None: @@ -340,66 +373,17 @@ def _pad_to_mxn_tensor(self, kernel): pad_top_down = (kernel_height - height) // 2 return torch.nn.functional.pad(kernel, [pad_left_right, pad_left_right, pad_top_down, pad_top_down]) - # def switch_to_deploy(self): - # if hasattr(self, 'fused_conv'): - # return - # kernel, bias = self.get_equivalent_kernel_bias() - # self.fused_conv = nn.Conv2d(in_channels=self.main_conv.in_channels, - # out_channels=self.main_conv.out_channels, - # kernel_size=self.main_conv.kernel_size, stride=self.main_conv.stride, - # padding=self.main_conv.padding, dilation=self.main_conv.dilation, - # groups=self.main_conv.groups, bias=True) - # self.fused_conv.weight.data = kernel - # self.fused_conv.bias.data = bias - # self.deploy = True - # for para in self.parameters(): - # para.detach_() - # for attr in ['main_conv', 'main_bn', 'ver_conv', 'ver_bn', 'hor_conv', 'hor_bn']: - # if hasattr(self, attr): - # self.__delattr__(attr) - # - # if hasattr(self, 'rbr_identity'): - # self.__delattr__('rbr_identity') - - # def switch_to_test(self): - # kernel, bias = self.get_equivalent_kernel_bias() - # self.fused_conv = nn.Conv2d(in_channels=self.main_conv.in_channels, - # out_channels=self.main_conv.out_channels, - # kernel_size=self.main_conv.kernel_size, stride=self.main_conv.stride, - # padding=self.main_conv.padding, dilation=self.main_conv.dilation, - # groups=self.main_conv.groups, bias=True) - # self.fused_conv.weight.data = kernel - # self.fused_conv.bias.data = bias - # for para in self.fused_conv.parameters(): - # para.detach_() - # self.deploy = True - - # def switch_to_train(self): - # if hasattr(self, 'fused_conv'): - # self.__delattr__('fused_conv') - # self.deploy = False - - # @staticmethod - # def is_zero_layer(): - # return False - - # @property - # def module_str(self): - # return 'Rep_%dx%d' % (self.kernel_size[0], self.kernel_size[1]) - - # @property - # def config(self): - # return {'name': RepConvLayer.__name__, - # 'in_channels': self.in_channels, - # 'out_channels': self.out_channels, - # 'kernel_size': self.kernel_size, - # 'stride': self.stride, - # 'dilation': self.dilation, - # 'groups': self.groups} - - # @staticmethod - # def build_from_config(config): - # return RepConvLayer(**config) + def prepare_for_eval(self): + kernel, bias = self.get_equivalent_kernel_bias() + self.fused_conv = nn.Conv2d(in_channels=self.main_conv.in_channels, + out_channels=self.main_conv.out_channels, + kernel_size=self.main_conv.kernel_size, stride=self.main_conv.stride, + padding=self.main_conv.padding, dilation=self.main_conv.dilation, + groups=self.main_conv.groups, bias=True) + self.fused_conv.weight.data = kernel + self.fused_conv.bias.data = bias + for para in self.fused_conv.parameters(): + para.detach_() class FastPreTrainedModel(PreTrainedModel): diff --git a/tests/models/fast/test_modeling_fast.py b/tests/models/fast/test_modeling_fast.py index f3790cfb8300..ee84d0e857a8 100644 --- a/tests/models/fast/test_modeling_fast.py +++ b/tests/models/fast/test_modeling_fast.py @@ -16,6 +16,8 @@ import inspect import unittest +import requests +from PIL import Image from parameterized import parameterized from 
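
The kernel merging behind RepConvLayer.prepare_for_eval rests on the fact that parallel convolution branches with compatible padding can be collapsed into a single kernel once the asymmetric kernels are zero-padded to the full size. Below is a simplified sketch of that identity, with BatchNorm and the identity branch left out and shapes chosen arbitrarily.

import torch
import torch.nn.functional as F
from torch import nn


def pad_to_kxk(kernel: torch.Tensor, k: int) -> torch.Tensor:
    """Zero-pad a (out, in, kh, kw) kernel to (out, in, k, k), keeping it centred."""
    kh, kw = kernel.shape[-2:]
    pad_lr = (k - kw) // 2
    pad_tb = (k - kh) // 2
    return F.pad(kernel, [pad_lr, pad_lr, pad_tb, pad_tb])


# Main 3x3 branch plus vertical (3x1) and horizontal (1x3) branches.
main = nn.Conv2d(8, 8, kernel_size=(3, 3), padding=(1, 1), bias=False)
ver = nn.Conv2d(8, 8, kernel_size=(3, 1), padding=(1, 0), bias=False)
hor = nn.Conv2d(8, 8, kernel_size=(1, 3), padding=(0, 1), bias=False)

# Sum the (padded) kernels into one 3x3 convolution.
fused = nn.Conv2d(8, 8, kernel_size=(3, 3), padding=(1, 1), bias=False)
fused.weight.data = main.weight + pad_to_kxk(ver.weight, 3) + pad_to_kxk(hor.weight, 3)

x = torch.randn(1, 8, 16, 16)
assert torch.allclose(main(x) + ver(x) + hor(x), fused(x), atol=1e-5)
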
transformers import ( @@ -23,7 +25,9 @@ is_torch_available, set_seed, ) -from transformers.testing_utils import CaptureLogger, require_bitsandbytes, require_torch, slow, tooslow, torch_device +from transformers.models.fast.image_processing_fast import FastImageProcessor +from transformers.testing_utils import CaptureLogger, require_bitsandbytes, require_torch, slow, tooslow, torch_device, \ + require_vision from transformers.utils import logging as transformers_logging from ...generation.test_utils import GenerationTesterMixin @@ -63,13 +67,13 @@ def __init__( backbone_stage1_groups=[1], backbone_stage2_in_channels=[64], backbone_stage2_out_channels=[128], - backbone_stage2_kernel_size=[ [3, 1]], + backbone_stage2_kernel_size=[[3, 1]], backbone_stage2_stride=[2], backbone_stage2_dilation=[1], backbone_stage2_groups=[1], backbone_stage3_in_channels=[128], backbone_stage3_out_channels=[256], - backbone_stage3_kernel_size=[ [1, 3]], + backbone_stage3_kernel_size=[[1, 3]], backbone_stage3_stride=[2], backbone_stage3_dilation=[1], backbone_stage3_groups=[1], @@ -377,5 +381,59 @@ def test_model_is_small(self): model = model_class(config) num_params = model.num_parameters() assert ( - num_params < 3000000 + num_params < 3000000 ), f"{model_class} is too big for the common tests ({num_params})! It should have 1M max." + + # def prepare_image(): + # image_url = "https://huggingface.co/datasets/Raghavan/fast_model_samples/resolve/main/img_329.jpg" + # raw_image = Image.open(requests.get(image_url, stream=True).raw).convert("RGB") + # return raw_image + + +@require_torch +@require_vision +class FastModelIntegrationTest(unittest.TestCase): + # @slow + def test_inference_fast_tiny_ic17mlt_model(self): + model = FASTForImageCaptioning.from_pretrained("Raghavan/ic17mlt_Fast_T") + + image_processor = FastImageProcessor.from_pretrained("Raghavan/ic17mlt_Fast_T") + + def prepare_image(): + image_url = "https://huggingface.co/datasets/Raghavan/fast_model_samples/resolve/main/img_329.jpg" + raw_image = Image.open(requests.get(image_url, stream=True).raw).convert("RGB") + return raw_image + + image = prepare_image() + input = image_processor(image, return_tensor="np") + + output = model(pixel_values=torch.tensor(input['pixel_values'])) + target_sizes = [(image.shape[1], image.shape[2]) for image in input['pixel_values']] + final_out = image_processor.get_results(output, target_sizes) + + assert ( + final_out[0]['bboxes'][0] == [224, 120, 246, 120, 246, 134, 224, 134] + ) + assert round(float(final_out[0]['scores'][0]), 5) == 0.95541 + + def test_inference_fast_base_800_total_text_ic17mlt_model(self): + model = FASTForImageCaptioning.from_pretrained("Raghavan/fast_base_tt_800_finetune_ic17mlt") + + image_processor = FastImageProcessor.from_pretrained("Raghavan/fast_base_tt_800_finetune_ic17mlt") + + def prepare_image(): + image_url = "https://huggingface.co/datasets/Raghavan/fast_model_samples/resolve/main/img657.jpg" + raw_image = Image.open(requests.get(image_url, stream=True).raw).convert("RGB") + return raw_image + + image = prepare_image() + input = image_processor(image, return_tensor="np") + + output = model(pixel_values=torch.tensor(input['pixel_values'])) + target_sizes = [(image.shape[1], image.shape[2]) for image in input['pixel_values']] + final_out = image_processor.get_results(output, target_sizes) + + assert ( + final_out[0]['bboxes'][0][:10] == [484, 175, 484, 178, 483, 179, 452, 179, 452, 182] + ) + assert round(float(final_out[0]['scores'][0]), 5) == 0.92356 From 
0457e7465ad6b666ec44a57331a6f8533851441d Mon Sep 17 00:00:00 2001 From: raghavanone Date: Tue, 31 Oct 2023 19:19:53 +0530 Subject: [PATCH 009/152] Fix style and copies --- src/transformers/models/fast/__init__.py | 9 +- .../models/fast/configuration_fast.py | 140 +++++------ .../fast/convert_fast_original_to_pytorch.py | 163 ++++++------- .../models/fast/image_processing_fast.py | 225 +++++++++--------- src/transformers/models/fast/modeling_fast.py | 167 ++++++------- tests/models/fast/test_modeling_fast.py | 200 ++++++++-------- 6 files changed, 445 insertions(+), 459 deletions(-) diff --git a/src/transformers/models/fast/__init__.py b/src/transformers/models/fast/__init__.py index 6fad75850bba..133d20bc0c52 100644 --- a/src/transformers/models/fast/__init__.py +++ b/src/transformers/models/fast/__init__.py @@ -20,6 +20,7 @@ is_torch_available, ) + _import_structure = { "configuration_fast": ["FastConfig"], } @@ -30,9 +31,7 @@ except OptionalDependencyNotAvailable: pass else: - _import_structure["modeling_fast"] = [ - "FASTForImageCaptioning" - ] + _import_structure["modeling_fast"] = ["FASTForImageCaptioning"] if TYPE_CHECKING: from .configuration_fast import FastConfig @@ -43,9 +42,7 @@ except OptionalDependencyNotAvailable: pass else: - from .modeling_fast import ( - FASTForImageCaptioning - ) + from .modeling_fast import FASTForImageCaptioning else: diff --git a/src/transformers/models/fast/configuration_fast.py b/src/transformers/models/fast/configuration_fast.py index ad72054b5cee..ee8c27b03a32 100644 --- a/src/transformers/models/fast/configuration_fast.py +++ b/src/transformers/models/fast/configuration_fast.py @@ -3,75 +3,75 @@ class FastConfig(PretrainedConfig): def __init__( - self, - backbone_kernel_size=3, - backbone_stride=2, - backbone_dilation=1, - backbone_groups=1, - backbone_bias=False, - backbone_has_shuffle=False, - backbone_in_channels=3, - backbone_out_channels=64, - backbone_use_bn=True, - backbone_act_func="relu", - backbone_dropout_rate=0, - backbone_ops_order="weight_bn_act", - backbone_stage1_in_channels=[64, 64, 64], - backbone_stage1_out_channels=[64, 64, 64], - backbone_stage1_kernel_size=[[3, 3], [3, 3], [3, 3]], - backbone_stage1_stride=[1, 2, 1], - backbone_stage1_dilation=[1, 1, 1], - backbone_stage1_groups=[1, 1, 1], - backbone_stage2_in_channels=[64, 128, 128, 128], - backbone_stage2_out_channels=[128, 128, 128, 128], - backbone_stage2_kernel_size=[[3, 3], [1, 3], [3, 3], [3, 1]], - backbone_stage2_stride=[2, 1, 1, 1], - backbone_stage2_dilation=[1, 1, 1, 1], - backbone_stage2_groups=[1, 1, 1, 1], - backbone_stage3_in_channels=[128, 256, 256, 256], - backbone_stage3_out_channels=[256, 256, 256, 256], - backbone_stage3_kernel_size=[[3, 3], [3, 3], [3, 1], [1, 3]], - backbone_stage3_stride=[2, 1, 1, 1], - backbone_stage3_dilation=[1, 1, 1, 1], - backbone_stage3_groups=[1, 1, 1, 1], - backbone_stage4_in_channels=[256, 512, 512, 512], - backbone_stage4_out_channels=[512, 512, 512, 512], - backbone_stage4_kernel_size=[[3, 3], [3, 1], [1, 3], [3, 3]], - backbone_stage4_stride=[2, 1, 1, 1], - backbone_stage4_dilation=[1, 1, 1, 1], - backbone_stage4_groups=[1, 1, 1, 1], - neck_in_channels=[64, 128, 256, 512], - neck_out_channels=[128, 128, 128, 128], - neck_kernel_size=[[3, 3], [3, 3], [3, 3], [3, 3]], - neck_stride=[1, 1, 1, 1], - neck_dilation=[1, 1, 1, 1], - neck_groups=[1, 1, 1, 1], - head_pooling_size=9, - head_dropout_ratio=0, - head_conv_in_channels=512, - head_conv_out_channels=128, - head_conv_kernel_size=[3, 3], - head_conv_stride=1, - 
head_conv_dilation=1, - head_conv_groups=1, - head_final_kernel_size=1, - head_final_stride=1, - head_final_dilation=1, - head_final_groups=1, - head_final_bias=False, - head_final_has_shuffle=False, - head_final_in_channels=128, - head_final_out_channels=5, - head_final_use_bn=False, - head_final_act_func=None, - head_final_dropout_rate=0, - head_final_ops_order="weight", - min_area=250, - min_score=0.88, - bbox_type='rect', - loss_bg=False, - initializer_range=0.02, - **kwargs, + self, + backbone_kernel_size=3, + backbone_stride=2, + backbone_dilation=1, + backbone_groups=1, + backbone_bias=False, + backbone_has_shuffle=False, + backbone_in_channels=3, + backbone_out_channels=64, + backbone_use_bn=True, + backbone_act_func="relu", + backbone_dropout_rate=0, + backbone_ops_order="weight_bn_act", + backbone_stage1_in_channels=[64, 64, 64], + backbone_stage1_out_channels=[64, 64, 64], + backbone_stage1_kernel_size=[[3, 3], [3, 3], [3, 3]], + backbone_stage1_stride=[1, 2, 1], + backbone_stage1_dilation=[1, 1, 1], + backbone_stage1_groups=[1, 1, 1], + backbone_stage2_in_channels=[64, 128, 128, 128], + backbone_stage2_out_channels=[128, 128, 128, 128], + backbone_stage2_kernel_size=[[3, 3], [1, 3], [3, 3], [3, 1]], + backbone_stage2_stride=[2, 1, 1, 1], + backbone_stage2_dilation=[1, 1, 1, 1], + backbone_stage2_groups=[1, 1, 1, 1], + backbone_stage3_in_channels=[128, 256, 256, 256], + backbone_stage3_out_channels=[256, 256, 256, 256], + backbone_stage3_kernel_size=[[3, 3], [3, 3], [3, 1], [1, 3]], + backbone_stage3_stride=[2, 1, 1, 1], + backbone_stage3_dilation=[1, 1, 1, 1], + backbone_stage3_groups=[1, 1, 1, 1], + backbone_stage4_in_channels=[256, 512, 512, 512], + backbone_stage4_out_channels=[512, 512, 512, 512], + backbone_stage4_kernel_size=[[3, 3], [3, 1], [1, 3], [3, 3]], + backbone_stage4_stride=[2, 1, 1, 1], + backbone_stage4_dilation=[1, 1, 1, 1], + backbone_stage4_groups=[1, 1, 1, 1], + neck_in_channels=[64, 128, 256, 512], + neck_out_channels=[128, 128, 128, 128], + neck_kernel_size=[[3, 3], [3, 3], [3, 3], [3, 3]], + neck_stride=[1, 1, 1, 1], + neck_dilation=[1, 1, 1, 1], + neck_groups=[1, 1, 1, 1], + head_pooling_size=9, + head_dropout_ratio=0, + head_conv_in_channels=512, + head_conv_out_channels=128, + head_conv_kernel_size=[3, 3], + head_conv_stride=1, + head_conv_dilation=1, + head_conv_groups=1, + head_final_kernel_size=1, + head_final_stride=1, + head_final_dilation=1, + head_final_groups=1, + head_final_bias=False, + head_final_has_shuffle=False, + head_final_in_channels=128, + head_final_out_channels=5, + head_final_use_bn=False, + head_final_act_func=None, + head_final_dropout_rate=0, + head_final_ops_order="weight", + min_area=250, + min_score=0.88, + bbox_type="rect", + loss_bg=False, + initializer_range=0.02, + **kwargs, ): super().__init__(**kwargs) @@ -150,4 +150,4 @@ def __init__( self.min_score = min_score self.bbox_type = bbox_type self.loss_bg = loss_bg - self.initializer_range = initializer_range \ No newline at end of file + self.initializer_range = initializer_range diff --git a/src/transformers/models/fast/convert_fast_original_to_pytorch.py b/src/transformers/models/fast/convert_fast_original_to_pytorch.py index 7ef78a312080..e549294081b8 100644 --- a/src/transformers/models/fast/convert_fast_original_to_pytorch.py +++ b/src/transformers/models/fast/convert_fast_original_to_pytorch.py @@ -18,19 +18,13 @@ import json import logging -import numpy as np -import pandas as pd import requests import torch -from huggingface_hub import hf_hub_download from PIL 
import Image -from transformers import ( - FastConfig, - FASTForImageCaptioning -) +from transformers import FastConfig, FASTForImageCaptioning from transformers.models.fast.image_processing_fast import FastImageProcessor -from transformers.utils.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD + tiny_config_url = "https://raw.githubusercontent.com/czczup/FAST/main/config/fast/nas-configs/fast_tiny.config" small_config_url = "https://raw.githubusercontent.com/czczup/FAST/main/config/fast/nas-configs/fast_small.config" @@ -63,7 +57,7 @@ def prepare_config(size_config_url, pooling_size, min_area, min_score, bbox_type backbone_config = {} for stage_ix in range(1, 5): - stage_config = config_dict[f'stage{stage_ix}'] + stage_config = config_dict[f"stage{stage_ix}"] merged_dict = {} @@ -77,7 +71,7 @@ def prepare_config(size_config_url, pooling_size, min_area, min_score, bbox_type else: # If the key is not in merged_dict, create a new list with the value merged_dict[key] = [value] - backbone_config[f'stage{stage_ix}'] = merged_dict + backbone_config[f"stage{stage_ix}"] = merged_dict neck_in_channels = [] neck_out_channels = [] @@ -88,7 +82,7 @@ def prepare_config(size_config_url, pooling_size, min_area, min_score, bbox_type for i in range(1, 5): layer_key = f"reduce_layer{i}" - layer_dict = config_dict['neck'].get(layer_key) + layer_dict = config_dict["neck"].get(layer_key) if layer_dict: # Append values to the corresponding lists @@ -112,64 +106,56 @@ def prepare_config(size_config_url, pooling_size, min_area, min_score, bbox_type backbone_act_func=config_dict["first_conv"]["act_func"], backbone_dropout_rate=config_dict["first_conv"]["dropout_rate"], backbone_ops_order=config_dict["first_conv"]["ops_order"], - - backbone_stage1_in_channels=backbone_config['stage1']['in_channels'], - backbone_stage1_out_channels=backbone_config['stage1']['out_channels'], - backbone_stage1_kernel_size=backbone_config['stage1']['kernel_size'], - backbone_stage1_stride=backbone_config['stage1']['stride'], - backbone_stage1_dilation=backbone_config['stage1']['dilation'], - backbone_stage1_groups=backbone_config['stage1']['groups'], - - backbone_stage2_in_channels=backbone_config['stage2']['in_channels'], - backbone_stage2_out_channels=backbone_config['stage2']['out_channels'], - backbone_stage2_kernel_size=backbone_config['stage2']['kernel_size'], - backbone_stage2_stride=backbone_config['stage2']['stride'], - backbone_stage2_dilation=backbone_config['stage2']['dilation'], - backbone_stage2_groups=backbone_config['stage2']['groups'], - - backbone_stage3_in_channels=backbone_config['stage3']['in_channels'], - backbone_stage3_out_channels=backbone_config['stage3']['out_channels'], - backbone_stage3_kernel_size=backbone_config['stage3']['kernel_size'], - backbone_stage3_stride=backbone_config['stage3']['stride'], - backbone_stage3_dilation=backbone_config['stage3']['dilation'], - backbone_stage3_groups=backbone_config['stage3']['groups'], - - backbone_stage4_in_channels=backbone_config['stage4']['in_channels'], - backbone_stage4_out_channels=backbone_config['stage4']['out_channels'], - backbone_stage4_kernel_size=backbone_config['stage4']['kernel_size'], - backbone_stage4_stride=backbone_config['stage4']['stride'], - backbone_stage4_dilation=backbone_config['stage4']['dilation'], - backbone_stage4_groups=backbone_config['stage4']['groups'], - + backbone_stage1_in_channels=backbone_config["stage1"]["in_channels"], + backbone_stage1_out_channels=backbone_config["stage1"]["out_channels"], + 
backbone_stage1_kernel_size=backbone_config["stage1"]["kernel_size"], + backbone_stage1_stride=backbone_config["stage1"]["stride"], + backbone_stage1_dilation=backbone_config["stage1"]["dilation"], + backbone_stage1_groups=backbone_config["stage1"]["groups"], + backbone_stage2_in_channels=backbone_config["stage2"]["in_channels"], + backbone_stage2_out_channels=backbone_config["stage2"]["out_channels"], + backbone_stage2_kernel_size=backbone_config["stage2"]["kernel_size"], + backbone_stage2_stride=backbone_config["stage2"]["stride"], + backbone_stage2_dilation=backbone_config["stage2"]["dilation"], + backbone_stage2_groups=backbone_config["stage2"]["groups"], + backbone_stage3_in_channels=backbone_config["stage3"]["in_channels"], + backbone_stage3_out_channels=backbone_config["stage3"]["out_channels"], + backbone_stage3_kernel_size=backbone_config["stage3"]["kernel_size"], + backbone_stage3_stride=backbone_config["stage3"]["stride"], + backbone_stage3_dilation=backbone_config["stage3"]["dilation"], + backbone_stage3_groups=backbone_config["stage3"]["groups"], + backbone_stage4_in_channels=backbone_config["stage4"]["in_channels"], + backbone_stage4_out_channels=backbone_config["stage4"]["out_channels"], + backbone_stage4_kernel_size=backbone_config["stage4"]["kernel_size"], + backbone_stage4_stride=backbone_config["stage4"]["stride"], + backbone_stage4_dilation=backbone_config["stage4"]["dilation"], + backbone_stage4_groups=backbone_config["stage4"]["groups"], neck_in_channels=neck_in_channels, neck_out_channels=neck_out_channels, neck_kernel_size=neck_kernel_size, neck_stride=neck_stride, neck_dilation=neck_dilation, neck_groups=neck_groups, - head_pooling_size=pooling_size, head_dropout_ratio=0.1, - head_conv_in_channels=config_dict['head']['conv']['in_channels'], - head_conv_out_channels=config_dict['head']['conv']['out_channels'], - head_conv_kernel_size=config_dict['head']['conv']['kernel_size'], - head_conv_stride=config_dict['head']['conv']['stride'], - head_conv_dilation=config_dict['head']['conv']['dilation'], - head_conv_groups=config_dict['head']['conv']['groups'], - - head_final_kernel_size=config_dict['head']['final']['kernel_size'], - head_final_stride=config_dict['head']['final']['stride'], - head_final_dilation=config_dict['head']['final']['dilation'], - head_final_groups=config_dict['head']['final']['groups'], - head_final_bias=config_dict['head']['final']['bias'], - head_final_has_shuffle=config_dict['head']['final']['has_shuffle'], - head_final_in_channels=config_dict['head']['final']['in_channels'], - head_final_out_channels=config_dict['head']['final']['out_channels'], - head_final_use_bn=config_dict['head']['final']['use_bn'], - head_final_act_func=config_dict['head']['final']['act_func'], - head_final_dropout_rate=config_dict['head']['final']['dropout_rate'], - head_final_ops_order=config_dict['head']['final']['ops_order'], - + head_conv_in_channels=config_dict["head"]["conv"]["in_channels"], + head_conv_out_channels=config_dict["head"]["conv"]["out_channels"], + head_conv_kernel_size=config_dict["head"]["conv"]["kernel_size"], + head_conv_stride=config_dict["head"]["conv"]["stride"], + head_conv_dilation=config_dict["head"]["conv"]["dilation"], + head_conv_groups=config_dict["head"]["conv"]["groups"], + head_final_kernel_size=config_dict["head"]["final"]["kernel_size"], + head_final_stride=config_dict["head"]["final"]["stride"], + head_final_dilation=config_dict["head"]["final"]["dilation"], + head_final_groups=config_dict["head"]["final"]["groups"], + 
head_final_bias=config_dict["head"]["final"]["bias"], + head_final_has_shuffle=config_dict["head"]["final"]["has_shuffle"], + head_final_in_channels=config_dict["head"]["final"]["in_channels"], + head_final_out_channels=config_dict["head"]["final"]["out_channels"], + head_final_use_bn=config_dict["head"]["final"]["use_bn"], + head_final_act_func=config_dict["head"]["final"]["act_func"], + head_final_dropout_rate=config_dict["head"]["final"]["dropout_rate"], + head_final_ops_order=config_dict["head"]["final"]["ops_order"], min_area=min_area, min_score=min_score, bbox_type=bbox_type, @@ -193,43 +179,50 @@ def convert_fast_checkpoint(checkpoint_url, checkpoint_config_url, pytorch_dump_ exec(content, namespace) - model_config = namespace.get('model') - test_config = namespace.get('test_cfg', None) - data_config = namespace.get('data') + model_config = namespace.get("model") + test_config = namespace.get("test_cfg", None) + data_config = namespace.get("data") min_score = 0.88 min_area = 250 - bbox_type = 'rect' + bbox_type = "rect" loss_bg = False if test_config is not None: - min_area = test_config.get('min_area', min_area) - min_score = test_config.get('min_score', min_score) - bbox_type = test_config.get('bbox_type', bbox_type) - loss_bg = test_config.get('loss_emb', None) == "EmbLoss_v2" - - if 'tiny' in model_config['backbone']['config']: - config = prepare_config(tiny_config_url, model_config['detection_head']['pooling_size'], - min_area, min_score, bbox_type, loss_bg) - elif 'small' in model_config['backbone']['config']: - config = prepare_config(small_config_url, model_config['detection_head']['pooling_size'], - min_area, min_score, bbox_type, loss_bg) + min_area = test_config.get("min_area", min_area) + min_score = test_config.get("min_score", min_score) + bbox_type = test_config.get("bbox_type", bbox_type) + loss_bg = test_config.get("loss_emb", None) == "EmbLoss_v2" + + if "tiny" in model_config["backbone"]["config"]: + config = prepare_config( + tiny_config_url, model_config["detection_head"]["pooling_size"], min_area, min_score, bbox_type, loss_bg + ) + elif "small" in model_config["backbone"]["config"]: + config = prepare_config( + small_config_url, model_config["detection_head"]["pooling_size"], min_area, min_score, bbox_type, loss_bg + ) else: - config = prepare_config(base_config_url, model_config['detection_head']['pooling_size'], - min_area, min_score, bbox_type, loss_bg) + config = prepare_config( + base_config_url, model_config["detection_head"]["pooling_size"], min_area, min_score, bbox_type, loss_bg + ) size = 640 if "train" in data_config: - if "short_size" in data_config['train']: - size = data_config['train']['short_size'] + if "short_size" in data_config["train"]: + size = data_config["train"]["short_size"] model = FASTForImageCaptioning(config) - fast_image_processor = FastImageProcessor(size={'height': size, 'width': size}, min_score=config.min_score, - min_area=config.min_area, - bbox_type=config.bbox_type, pooling_size=config.head_pooling_size) - state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu", check_hash=True)['ema'] + fast_image_processor = FastImageProcessor( + size={"height": size, "width": size}, + min_score=config.min_score, + min_area=config.min_area, + bbox_type=config.bbox_type, + pooling_size=config.head_pooling_size, + ) + state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu", check_hash=True)["ema"] state_dict_changed = copy.deepcopy(state_dict) for key in state_dict: val = 
state_dict_changed.pop(key) - state_dict_changed[key.replace('module.', '')] = val + state_dict_changed[key.replace("module.", "")] = val model.load_state_dict(state_dict_changed) model.save_pretrained(pytorch_dump_folder_path) diff --git a/src/transformers/models/fast/image_processing_fast.py b/src/transformers/models/fast/image_processing_fast.py index 637aea38e086..812c617f073c 100644 --- a/src/transformers/models/fast/image_processing_fast.py +++ b/src/transformers/models/fast/image_processing_fast.py @@ -16,17 +16,15 @@ import math import warnings from typing import Any, Dict, List, Optional, Tuple, Union -import torch.nn.functional as F -import torch.nn as nn import cv2 import numpy as np +import torch.nn as nn +import torch.nn.functional as F from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict from ...image_transforms import resize, to_channel_dimension_format from ...image_utils import ( - IMAGENET_STANDARD_MEAN, - IMAGENET_STANDARD_STD, ChannelDimension, ImageInput, PILImageResampling, @@ -36,8 +34,16 @@ to_numpy_array, valid_images, ) -from ...utils import TensorType, is_torch_available, is_torch_tensor, is_vision_available, logging, \ - IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD +from ...utils import ( + IMAGENET_DEFAULT_MEAN, + IMAGENET_DEFAULT_STD, + TensorType, + is_torch_available, + is_torch_tensor, + is_vision_available, + logging, +) + if is_vision_available(): import PIL @@ -94,23 +100,23 @@ class FastImageProcessor(BaseImageProcessor): model_input_names = ["pixel_values"] def __init__( - self, - do_resize: bool = True, - size: Dict[str, int] = None, - resample: PILImageResampling = PILImageResampling.BICUBIC, - do_center_crop: bool = False, - crop_size: Dict[str, int] = None, - rescale_factor: Union[int, float] = 1 / 255, - do_rescale: bool = True, - do_normalize: bool = True, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - do_reduce_labels: bool = False, - min_area: int = 10, - min_score: float = 0.88, - bbox_type: str = "rect", - pooling_size: int = 9, - **kwargs, + self, + do_resize: bool = True, + size: Dict[str, int] = None, + resample: PILImageResampling = PILImageResampling.BICUBIC, + do_center_crop: bool = False, + crop_size: Dict[str, int] = None, + rescale_factor: Union[int, float] = 1 / 255, + do_rescale: bool = True, + do_normalize: bool = True, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_reduce_labels: bool = False, + min_area: int = 10, + min_score: float = 0.88, + bbox_type: str = "rect", + pooling_size: int = 9, + **kwargs, ) -> None: if "reduce_labels" in kwargs: warnings.warn( @@ -152,13 +158,13 @@ def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs): return super().from_dict(image_processor_dict, **kwargs) def resize( - self, - image: np.ndarray, - size: Dict[str, int], - resample: PILImageResampling = PILImageResampling.BICUBIC, - data_format: Optional[Union[str, ChannelDimension]] = None, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs, + self, + image: np.ndarray, + size: Dict[str, int], + resample: PILImageResampling = PILImageResampling.BICUBIC, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, ) -> np.ndarray: """ Resize an image to (size["height"], size["width"]). 
@@ -196,20 +202,20 @@ def reduce_label(self, label: ImageInput) -> np.ndarray: return label def _preprocess( - self, - image: ImageInput, - do_reduce_labels: bool = None, - do_resize: bool = None, - size: Dict[str, int] = None, - resample: PILImageResampling = None, - do_center_crop: bool = None, - crop_size: Dict[str, int] = None, - do_rescale: bool = None, - rescale_factor: float = None, - do_normalize: bool = None, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - input_data_format: Optional[Union[str, ChannelDimension]] = None, + self, + image: ImageInput, + do_reduce_labels: bool = None, + do_resize: bool = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + do_center_crop: bool = None, + crop_size: Dict[str, int] = None, + do_rescale: bool = None, + rescale_factor: float = None, + do_normalize: bool = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, ): if do_reduce_labels: image = self.reduce_label(image) @@ -229,20 +235,20 @@ def _preprocess( return image def _preprocess_image( - self, - image: ImageInput, - do_resize: bool = None, - size: Dict[str, int] = None, - resample: PILImageResampling = None, - do_center_crop: bool = None, - crop_size: Dict[str, int] = None, - do_rescale: bool = None, - rescale_factor: float = None, - do_normalize: bool = None, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - data_format: Optional[Union[str, ChannelDimension]] = None, - input_data_format: Optional[Union[str, ChannelDimension]] = None, + self, + image: ImageInput, + do_resize: bool = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + do_center_crop: bool = None, + crop_size: Dict[str, int] = None, + do_rescale: bool = None, + rescale_factor: float = None, + do_normalize: bool = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, ) -> np.ndarray: """Preprocesses a single image.""" # All transformations expect numpy arrays. @@ -274,15 +280,15 @@ def _preprocess_image( return image def _preprocess_segmentation_map( - self, - segmentation_map: ImageInput, - do_resize: bool = None, - size: Dict[str, int] = None, - resample: PILImageResampling = None, - do_center_crop: bool = None, - crop_size: Dict[str, int] = None, - do_reduce_labels: bool = None, - input_data_format: Optional[Union[str, ChannelDimension]] = None, + self, + segmentation_map: ImageInput, + do_resize: bool = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + do_center_crop: bool = None, + crop_size: Dict[str, int] = None, + do_reduce_labels: bool = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, ): """Preprocesses a single segmentation map.""" # All transformations expect numpy arrays. 
@@ -320,24 +326,24 @@ def __call__(self, images, segmentation_maps=None, **kwargs): return super().__call__(images, segmentation_maps=segmentation_maps, **kwargs) def preprocess( - self, - images: ImageInput, - segmentation_maps: Optional[ImageInput] = None, - do_resize: bool = None, - size: Dict[str, int] = None, - resample: PILImageResampling = None, - do_center_crop: bool = None, - crop_size: Dict[str, int] = None, - do_rescale: bool = None, - rescale_factor: float = None, - do_normalize: bool = None, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - do_reduce_labels: Optional[bool] = None, - return_tensors: Optional[Union[str, TensorType]] = None, - data_format: ChannelDimension = ChannelDimension.FIRST, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs, + self, + images: ImageInput, + segmentation_maps: Optional[ImageInput] = None, + do_resize: bool = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + do_center_crop: bool = None, + crop_size: Dict[str, int] = None, + do_rescale: bool = None, + rescale_factor: float = None, + do_normalize: bool = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_reduce_labels: Optional[bool] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: ChannelDimension = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, ) -> PIL.Image.Image: """ Preprocess an image or batch of images. @@ -516,25 +522,26 @@ def post_process_semantic_segmentation(self, outputs, target_sizes: List[Tuple] def _max_pooling(self, x, scale=1): if scale == 1: - x = nn.MaxPool2d(kernel_size=self.pooling_size, stride=1, - padding=(self.pooling_size - 1) // 2)(x) + x = nn.MaxPool2d(kernel_size=self.pooling_size, stride=1, padding=(self.pooling_size - 1) // 2)(x) elif scale == 2: - x = nn.MaxPool2d(kernel_size=self.pooling_size // 2 + 1, stride=1, - padding=(self.pooling_size // 2) // 2)(x) + x = nn.MaxPool2d(kernel_size=self.pooling_size // 2 + 1, stride=1, padding=(self.pooling_size // 2) // 2)( + x + ) return x def get_results(self, output, target_sizes): scale = 2 - img_size = (self.size['height'], self.size['width']) - out = output['hidden_states'] + img_size = (self.size["height"], self.size["width"]) + out = output["hidden_states"] batch_size = out.size(0) - final_results = dict() + final_results = {} - texts = F.interpolate(out[:, 0:1, :, :], size=(img_size[0] // scale, img_size[1] // scale), - mode='nearest') # B*1*320*320 + texts = F.interpolate( + out[:, 0:1, :, :], size=(img_size[0] // scale, img_size[1] // scale), mode="nearest" + ) # B*1*320*320 texts = self._max_pooling(texts, scale=scale) # B*1*320*320 score_maps = torch.sigmoid_(texts) # B*1*320*320 - score_maps = F.interpolate(score_maps, size=(img_size[0], img_size[1]), mode='nearest') # B*1*640*640 + score_maps = F.interpolate(score_maps, size=(img_size[0], img_size[1]), mode="nearest") # B*1*640*640 score_maps = score_maps.squeeze(1) # B*640*640 kernels = (out[:, 0, :, :] > 0).to(torch.uint8) # B*160*160 @@ -545,27 +552,25 @@ def get_results(self, output, target_sizes): labels_ = np.array(labels_) labels_ = torch.from_numpy(labels_) labels = labels_.unsqueeze(1).to(torch.float32) # B*1*160*160 - labels = F.interpolate(labels, size=(img_size[0] // scale, img_size[1] // scale), mode='nearest') # B*1*320*320 + labels = F.interpolate( + labels, 
size=(img_size[0] // scale, img_size[1] // scale), mode="nearest" + ) # B*1*320*320 labels = self._max_pooling(labels, scale=scale) - labels = F.interpolate(labels, size=(img_size[0], img_size[1]), mode='nearest') # B*1*640*640 + labels = F.interpolate(labels, size=(img_size[0], img_size[1]), mode="nearest") # B*1*640*640 labels = labels.squeeze(1).to(torch.int32) # B*640*640 keys = [torch.unique(labels_[i], sorted=True) for i in range(batch_size)] - final_results.update(dict(kernels=kernels.data.cpu())) + final_results.update({"kernels": kernels.data.cpu()}) results = [] for i in range(batch_size): org_img_size = target_sizes[i] - scales = (float(org_img_size[1]) / float(img_size[1]), - float(org_img_size[0]) / float(img_size[0])) + scales = (float(org_img_size[1]) / float(img_size[1]), float(org_img_size[0]) / float(img_size[0])) bboxes, scores = self.generate_bbox(keys[i], labels[i], score_maps[i], scales) - results.append(dict( - bboxes=bboxes, - scores=scores - )) - final_results.update(dict(results=results)) + results.append({"bboxes": bboxes, "scores": scores}) + final_results.update({"results": results}) return results @@ -575,7 +580,7 @@ def generate_bbox(self, keys, label, score, scales): scores = [] for index in range(1, label_num): i = keys[index] - ind = (label == i) + ind = label == i ind_np = ind.data.cpu().numpy() points = np.array(np.where(ind_np)).transpose((1, 0)) if points.shape[0] < self.min_area: @@ -586,18 +591,18 @@ def generate_bbox(self, keys, label, score, scales): label[ind] = 0 continue - if self.bbox_type == 'rect': + if self.bbox_type == "rect": rect = cv2.minAreaRect(points[:, ::-1]) alpha = math.sqrt(math.sqrt(points.shape[0] / (rect[1][0] * rect[1][1]))) rect = (rect[0], (rect[1][0] * alpha, rect[1][1] * alpha), rect[2]) bbox = cv2.boxPoints(rect) * scales - elif self.bbox_type == 'poly': - binary = np.zeros(label.shape, dtype='uint8') + elif self.bbox_type == "poly": + binary = np.zeros(label.shape, dtype="uint8") binary[ind_np] = 1 contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) bbox = contours[0] * scales - bbox = bbox.astype('int32') + bbox = bbox.astype("int32") bboxes.append(bbox.reshape(-1).tolist()) scores.append(score_i) return bboxes, scores diff --git a/src/transformers/models/fast/modeling_fast.py b/src/transformers/models/fast/modeling_fast.py index dc892f0e58ce..4557cf4754c4 100644 --- a/src/transformers/models/fast/modeling_fast.py +++ b/src/transformers/models/fast/modeling_fast.py @@ -1,15 +1,12 @@ -import math -from collections import OrderedDict from dataclasses import dataclass -from typing import Optional, Dict +from typing import Dict, Optional -import cv2 import numpy as np import torch import torch.nn as nn import torch.nn.functional as F -from transformers import PreTrainedModel, FastConfig +from transformers import FastConfig, PreTrainedModel from transformers.utils import ModelOutput @@ -41,7 +38,7 @@ def build_activation(act_func, inplace=True): class My2DLayer(nn.Module): def __init__( - self, in_channels, out_channels, use_bn=True, act_func="relu", dropout_rate=0, ops_order="weight_bn_act" + self, in_channels, out_channels, use_bn=True, act_func="relu", dropout_rate=0, ops_order="weight_bn_act" ): super(My2DLayer, self).__init__() self.in_channels = in_channels @@ -104,7 +101,7 @@ def weight_op(self): def forward(self, x): for key, module in self._modules.items(): - if key == 'bn' and not self.training: + if key == "bn" and not self.training: continue x = module(x) return x @@ -138,21 +135,20 
@@ def is_zero_layer(): class ConvLayer(nn.Module): def __init__( - self, - in_channels, - out_channels, - kernel_size=3, - stride=1, - dilation=1, - groups=1, - bias=False, - has_shuffle=False, - use_bn=True, - act_func="relu", - dropout_rate=0, - use_act=True + self, + in_channels, + out_channels, + kernel_size=3, + stride=1, + dilation=1, + groups=1, + bias=False, + has_shuffle=False, + use_bn=True, + act_func="relu", + dropout_rate=0, + use_act=True, ): - super().__init__() self.kernel_size = kernel_size @@ -192,14 +188,14 @@ def __init__( def forward(self, x): if self.training: - if hasattr(self, 'fused_conv'): - delattr(self, 'fused_conv') + if hasattr(self, "fused_conv"): + delattr(self, "fused_conv") x = self.conv(x) x = self.bn(x) return self.act(x) else: - if not hasattr(self, 'fused_conv'): - setattr(self, 'fused_conv', self.fuse_conv_bn(self.conv, self.bn)) + if not hasattr(self, "fused_conv"): + setattr(self, "fused_conv", self.fuse_conv_bn(self.conv, self.bn)) x = self.fused_conv(x) if self.act is not None: x = self.act(x) @@ -207,18 +203,15 @@ def forward(self, x): def fuse_conv_bn(self, conv, bn): """During inference, the functionary of batch norm layers is turned off but - only the mean and var alone channels are used, which exposes the chance to - fuse it with the preceding conv layers to save computations and simplify - network structures.""" + only the mean and var alone channels are used, which exposes the chance to fuse it with the preceding conv + layers to save computations and simplify network structures.""" if isinstance(bn, nn.Identity): return conv conv_w = conv.weight - conv_b = conv.bias if conv.bias is not None else torch.zeros_like( - bn.running_mean) + conv_b = conv.bias if conv.bias is not None else torch.zeros_like(bn.running_mean) factor = bn.weight / torch.sqrt(bn.running_var + bn.eps) - conv.weight = nn.Parameter(conv_w * - factor.reshape([conv.out_channels, 1, 1, 1])) + conv.weight = nn.Parameter(conv_w * factor.reshape([conv.out_channels, 1, 1, 1])) conv.bias = nn.Parameter((conv_b - bn.running_mean) * factor + bn.bias) return conv @@ -290,8 +283,8 @@ def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, def forward(self, input): if self.training: - if hasattr(self, 'fused_conv'): - self.__delattr__('fused_conv') + if hasattr(self, "fused_conv"): + self.__delattr__("fused_conv") main_outputs = self.main_conv(input) main_outputs = self.main_bn(main_outputs) @@ -314,7 +307,7 @@ def forward(self, input): return self.nonlinearity(main_outputs + vertical_outputs + horizontal_outputs + id_out) else: - if not hasattr(self, 'fused_conv'): + if not hasattr(self, "fused_conv"): self.prepare_for_eval() return self.nonlinearity(self.fused_conv(input)) @@ -375,11 +368,16 @@ def _pad_to_mxn_tensor(self, kernel): def prepare_for_eval(self): kernel, bias = self.get_equivalent_kernel_bias() - self.fused_conv = nn.Conv2d(in_channels=self.main_conv.in_channels, - out_channels=self.main_conv.out_channels, - kernel_size=self.main_conv.kernel_size, stride=self.main_conv.stride, - padding=self.main_conv.padding, dilation=self.main_conv.dilation, - groups=self.main_conv.groups, bias=True) + self.fused_conv = nn.Conv2d( + in_channels=self.main_conv.in_channels, + out_channels=self.main_conv.out_channels, + kernel_size=self.main_conv.kernel_size, + stride=self.main_conv.stride, + padding=self.main_conv.padding, + dilation=self.main_conv.dilation, + groups=self.main_conv.groups, + bias=True, + ) self.fused_conv.weight.data = kernel 
self.fused_conv.bias.data = bias for para in self.fused_conv.parameters(): @@ -423,48 +421,48 @@ def __init__(self, config): self.first_conv.apply(self._init_weights) stage1 = [] for stage_config in zip( - config.backbone_stage1_in_channels, - config.backbone_stage1_out_channels, - config.backbone_stage1_kernel_size, - config.backbone_stage1_stride, - config.backbone_stage1_dilation, - config.backbone_stage1_groups, + config.backbone_stage1_in_channels, + config.backbone_stage1_out_channels, + config.backbone_stage1_kernel_size, + config.backbone_stage1_stride, + config.backbone_stage1_dilation, + config.backbone_stage1_groups, ): stage1.append(RepConvLayer(*stage_config)) self.stage1 = nn.ModuleList(stage1) stage2 = [] for stage_config in zip( - config.backbone_stage2_in_channels, - config.backbone_stage2_out_channels, - config.backbone_stage2_kernel_size, - config.backbone_stage2_stride, - config.backbone_stage2_dilation, - config.backbone_stage2_groups, + config.backbone_stage2_in_channels, + config.backbone_stage2_out_channels, + config.backbone_stage2_kernel_size, + config.backbone_stage2_stride, + config.backbone_stage2_dilation, + config.backbone_stage2_groups, ): stage2.append(RepConvLayer(*stage_config)) self.stage2 = nn.ModuleList(stage2) stage3 = [] for stage_config in zip( - config.backbone_stage3_in_channels, - config.backbone_stage3_out_channels, - config.backbone_stage3_kernel_size, - config.backbone_stage3_stride, - config.backbone_stage3_dilation, - config.backbone_stage3_groups, + config.backbone_stage3_in_channels, + config.backbone_stage3_out_channels, + config.backbone_stage3_kernel_size, + config.backbone_stage3_stride, + config.backbone_stage3_dilation, + config.backbone_stage3_groups, ): stage3.append(RepConvLayer(*stage_config)) self.stage3 = nn.ModuleList(stage3) stage4 = [] for stage_config in zip( - config.backbone_stage4_in_channels, - config.backbone_stage4_out_channels, - config.backbone_stage4_kernel_size, - config.backbone_stage4_stride, - config.backbone_stage4_dilation, - config.backbone_stage4_groups, + config.backbone_stage4_in_channels, + config.backbone_stage4_out_channels, + config.backbone_stage4_kernel_size, + config.backbone_stage4_stride, + config.backbone_stage4_dilation, + config.backbone_stage4_groups, ): stage4.append(RepConvLayer(*stage_config)) self.stage4 = nn.ModuleList(stage4) @@ -695,8 +693,9 @@ def _max_pooling(self, x, scale=1): # return bboxes, scores -def emb_loss(emb, instance, kernel, training_mask, feature_dim=4, delta_v=0.5, delta_d=1.5, weights=(1.0, 1.0), - bg_sample=False): +def emb_loss( + emb, instance, kernel, training_mask, feature_dim=4, delta_v=0.5, delta_d=1.5, weights=(1.0, 1.0), bg_sample=False +): training_mask = (training_mask > 0.5).long() kernel = (kernel > 0.5).long() instance = instance * training_mask @@ -722,7 +721,7 @@ def emb_loss(emb, instance, kernel, training_mask, feature_dim=4, delta_v=0.5, d continue ind = instance == lb emb_ = emb[:, ind] - dist = (emb_ - emb_mean[:, i:i + 1]).norm(p=2, dim=0) + dist = (emb_ - emb_mean[:, i : i + 1]).norm(p=2, dim=0) dist = F.relu(dist - delta_v) ** 2 l_agg[i] = torch.mean(torch.log(dist + 1.0)) l_agg = torch.mean(l_agg[1:]) @@ -754,7 +753,7 @@ def emb_loss(emb, instance, kernel, training_mask, feature_dim=4, delta_v=0.5, d for i, lb in enumerate(unique_labels): if lb == 0: continue - dist = (emb_bg - emb_mean[:, i:i + 1]).norm(p=2, dim=0) + dist = (emb_bg - emb_mean[:, i : i + 1]).norm(p=2, dim=0) dist = F.relu(2 * delta_d - dist) ** 2 l_dis_bg = 
torch.mean(torch.log(dist + 1.0), 0, keepdim=True) l_dis.append(l_dis_bg) @@ -894,7 +893,6 @@ class FASTForImageCaptioningOutput(ModelOutput): class FASTForImageCaptioning(FastPreTrainedModel): - def __init__(self, config): super().__init__(config) self.backbone = TextNet(config=config) @@ -902,10 +900,12 @@ def __init__(self, config): self.det_head = FASTHead(config=config) self.loss_bg = config.loss_bg - self.pooling_1s = nn.MaxPool2d(kernel_size=config.head_pooling_size, stride=1, - padding=(config.head_pooling_size - 1) // 2) - self.pooling_2s = nn.MaxPool2d(kernel_size=config.head_pooling_size // 2 + 1, stride=1, - padding=(config.head_pooling_size // 2) // 2) + self.pooling_1s = nn.MaxPool2d( + kernel_size=config.head_pooling_size, stride=1, padding=(config.head_pooling_size - 1) // 2 + ) + self.pooling_2s = nn.MaxPool2d( + kernel_size=config.head_pooling_size // 2 + 1, stride=1, padding=(config.head_pooling_size // 2) // 2 + ) self.post_init() def _upsample(self, x, size, scale=1): @@ -920,10 +920,10 @@ def _max_pooling(self, x, scale=1): return x def loss(self, hidden, labels): - gt_texts = labels['gt_texts'] - gt_kernels = labels['gt_kernels'] - training_masks = labels['training_masks'] - gt_instances = labels['gt_instances'] + gt_texts = labels["gt_texts"] + gt_kernels = labels["gt_kernels"] + training_masks = labels["training_masks"] + gt_instances = labels["gt_instances"] kernels = hidden[:, 0, :, :] # 4*640*640 texts = self._max_pooling(kernels, scale=1) # 4*640*640 @@ -940,12 +940,13 @@ def loss(self, hidden, labels): return torch.mean(loss_text) + torch.mean(loss_kernel) + torch.mean(loss_emb) - def forward(self, - pixel_values: torch.FloatTensor, - output_hidden_states: Optional[bool] = True, - return_dict: Optional[bool] = None, - labels: Dict = None - ): + def forward( + self, + pixel_values: torch.FloatTensor, + output_hidden_states: Optional[bool] = True, + return_dict: Optional[bool] = None, + labels: Dict = None, + ): # outputs = {} return_dict = return_dict if return_dict is not None else self.config.use_return_dict f = self.backbone(pixel_values) diff --git a/tests/models/fast/test_modeling_fast.py b/tests/models/fast/test_modeling_fast.py index ee84d0e857a8..d1a2075a199b 100644 --- a/tests/models/fast/test_modeling_fast.py +++ b/tests/models/fast/test_modeling_fast.py @@ -18,23 +18,24 @@ import requests from PIL import Image -from parameterized import parameterized from transformers import ( FastConfig, is_torch_available, - set_seed, ) from transformers.models.fast.image_processing_fast import FastImageProcessor -from transformers.testing_utils import CaptureLogger, require_bitsandbytes, require_torch, slow, tooslow, torch_device, \ - require_vision -from transformers.utils import logging as transformers_logging +from transformers.testing_utils import ( + require_torch, + require_vision, + torch_device, +) from ...generation.test_utils import GenerationTesterMixin from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask, floats_tensor +from ...test_modeling_common import ModelTesterMixin, floats_tensor from ...test_pipeline_mixin import PipelineTesterMixin + if is_torch_available(): import torch @@ -45,74 +46,74 @@ class FastModelTester: def __init__( - self, - parent, - backbone_kernel_size=3, - backbone_stride=2, - backbone_dilation=1, - backbone_groups=1, - backbone_bias=False, - backbone_has_shuffle=False, - backbone_in_channels=3, - backbone_out_channels=64, - 
backbone_use_bn=True, - backbone_act_func="relu", - backbone_dropout_rate=0, - backbone_ops_order="weight_bn_act", - backbone_stage1_in_channels=[64], - backbone_stage1_out_channels=[64], - backbone_stage1_kernel_size=[[3, 3]], - backbone_stage1_stride=[1], - backbone_stage1_dilation=[1], - backbone_stage1_groups=[1], - backbone_stage2_in_channels=[64], - backbone_stage2_out_channels=[128], - backbone_stage2_kernel_size=[[3, 1]], - backbone_stage2_stride=[2], - backbone_stage2_dilation=[1], - backbone_stage2_groups=[1], - backbone_stage3_in_channels=[128], - backbone_stage3_out_channels=[256], - backbone_stage3_kernel_size=[[1, 3]], - backbone_stage3_stride=[2], - backbone_stage3_dilation=[1], - backbone_stage3_groups=[1], - backbone_stage4_in_channels=[256], - backbone_stage4_out_channels=[512], - backbone_stage4_kernel_size=[[3, 3]], - backbone_stage4_stride=[2], - backbone_stage4_dilation=[1], - backbone_stage4_groups=[1], - neck_in_channels=[64], - neck_out_channels=[128], - neck_kernel_size=[[3, 3]], - neck_stride=[1], - neck_dilation=[1], - neck_groups=[1], - head_pooling_size=9, - head_dropout_ratio=0.1, - head_conv_in_channels=128, - head_conv_out_channels=4, - head_conv_kernel_size=[3, 3], - head_conv_stride=1, - head_conv_dilation=1, - head_conv_groups=1, - head_final_kernel_size=1, - head_final_stride=1, - head_final_dilation=1, - head_final_groups=1, - head_final_bias=False, - head_final_has_shuffle=False, - head_final_in_channels=4, - head_final_out_channels=5, - head_final_use_bn=False, - head_final_act_func=None, - head_final_dropout_rate=0, - head_final_ops_order="weight", - batch_size=3, - num_channels=3, - image_size=500, - is_training=True, + self, + parent, + backbone_kernel_size=3, + backbone_stride=2, + backbone_dilation=1, + backbone_groups=1, + backbone_bias=False, + backbone_has_shuffle=False, + backbone_in_channels=3, + backbone_out_channels=64, + backbone_use_bn=True, + backbone_act_func="relu", + backbone_dropout_rate=0, + backbone_ops_order="weight_bn_act", + backbone_stage1_in_channels=[64], + backbone_stage1_out_channels=[64], + backbone_stage1_kernel_size=[[3, 3]], + backbone_stage1_stride=[1], + backbone_stage1_dilation=[1], + backbone_stage1_groups=[1], + backbone_stage2_in_channels=[64], + backbone_stage2_out_channels=[128], + backbone_stage2_kernel_size=[[3, 1]], + backbone_stage2_stride=[2], + backbone_stage2_dilation=[1], + backbone_stage2_groups=[1], + backbone_stage3_in_channels=[128], + backbone_stage3_out_channels=[256], + backbone_stage3_kernel_size=[[1, 3]], + backbone_stage3_stride=[2], + backbone_stage3_dilation=[1], + backbone_stage3_groups=[1], + backbone_stage4_in_channels=[256], + backbone_stage4_out_channels=[512], + backbone_stage4_kernel_size=[[3, 3]], + backbone_stage4_stride=[2], + backbone_stage4_dilation=[1], + backbone_stage4_groups=[1], + neck_in_channels=[64], + neck_out_channels=[128], + neck_kernel_size=[[3, 3]], + neck_stride=[1], + neck_dilation=[1], + neck_groups=[1], + head_pooling_size=9, + head_dropout_ratio=0.1, + head_conv_in_channels=128, + head_conv_out_channels=4, + head_conv_kernel_size=[3, 3], + head_conv_stride=1, + head_conv_dilation=1, + head_conv_groups=1, + head_final_kernel_size=1, + head_final_stride=1, + head_final_dilation=1, + head_final_groups=1, + head_final_bias=False, + head_final_has_shuffle=False, + head_final_in_channels=4, + head_final_out_channels=5, + head_final_use_bn=False, + head_final_act_func=None, + head_final_dropout_rate=0, + head_final_ops_order="weight", + batch_size=3, + 
num_channels=3, + image_size=500, + is_training=True, ): self.parent = parent self.backbone_kernel_size = backbone_kernel_size @@ -193,10 +194,6 @@ def __init__( def prepare_config_and_inputs(self): pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) - pixel_values_meta = { - "org_img_size": (500, 500), - "img_size": (500, 500) - } # labels = None # if self.use_labels: # labels = ids_tensor([self.batch_size], self.num_labels) @@ -275,7 +272,7 @@ def create_and_check_model(self, config, input): model = FASTForImageCaptioning(config=config) model.to(torch_device) model.eval() - result = model(pixel_values=input['pixel_values']) + result = model(pixel_values=input["pixel_values"]) self.parent.assertEqual(result.hidden_states.shape, (self.batch_size, 5, 125, 125)) def prepare_config_and_inputs_for_common(self): @@ -286,13 +283,7 @@ def prepare_config_and_inputs_for_common(self): @require_torch class FastModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): - all_model_classes = ( - ( - FASTForImageCaptioning, - ) - if is_torch_available() - else () - ) + all_model_classes = (FASTForImageCaptioning,) if is_torch_available() else () pipeline_model_mapping = {} test_headmasking = False @@ -356,13 +347,16 @@ def test_forward_signature(self): def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): to_return = inputs_dict.copy() - gt_instances = torch.zeros(self.model_tester.batch_size, self.model_tester.image_size, - self.model_tester.image_size) - gt_kernels = torch.zeros(self.model_tester.batch_size, self.model_tester.image_size, - self.model_tester.image_size) + gt_instances = torch.zeros( + self.model_tester.batch_size, self.model_tester.image_size, self.model_tester.image_size + ) + gt_kernels = torch.zeros( + self.model_tester.batch_size, self.model_tester.image_size, self.model_tester.image_size + ) gt_text = torch.zeros(self.model_tester.batch_size, self.model_tester.image_size, self.model_tester.image_size) - training_masks = torch.ones(self.model_tester.batch_size, self.model_tester.image_size, - self.model_tester.image_size) + training_masks = torch.ones( + self.model_tester.batch_size, self.model_tester.image_size, self.model_tester.image_size + ) labels = {} labels["gt_instances"] = gt_instances labels["gt_kernels"] = gt_kernels @@ -381,7 +375,7 @@ def test_model_is_small(self): model = model_class(config) num_params = model.num_parameters() assert ( - num_params < 3000000 + num_params < 3000000 ), f"{model_class} is too big for the common tests ({num_params})! It should have 1M max." 
# def prepare_image(): @@ -407,14 +401,12 @@ def prepare_image(): image = prepare_image() input = image_processor(image, return_tensor="np") - output = model(pixel_values=torch.tensor(input['pixel_values'])) - target_sizes = [(image.shape[1], image.shape[2]) for image in input['pixel_values']] + output = model(pixel_values=torch.tensor(input["pixel_values"])) + target_sizes = [(image.shape[1], image.shape[2]) for image in input["pixel_values"]] final_out = image_processor.get_results(output, target_sizes) - assert ( - final_out[0]['bboxes'][0] == [224, 120, 246, 120, 246, 134, 224, 134] - ) - assert round(float(final_out[0]['scores'][0]), 5) == 0.95541 + assert final_out[0]["bboxes"][0] == [224, 120, 246, 120, 246, 134, 224, 134] + assert round(float(final_out[0]["scores"][0]), 5) == 0.95541 def test_inference_fast_base_800_total_text_ic17mlt_model(self): model = FASTForImageCaptioning.from_pretrained("Raghavan/fast_base_tt_800_finetune_ic17mlt") @@ -429,11 +421,9 @@ def prepare_image(): image = prepare_image() input = image_processor(image, return_tensor="np") - output = model(pixel_values=torch.tensor(input['pixel_values'])) - target_sizes = [(image.shape[1], image.shape[2]) for image in input['pixel_values']] + output = model(pixel_values=torch.tensor(input["pixel_values"])) + target_sizes = [(image.shape[1], image.shape[2]) for image in input["pixel_values"]] final_out = image_processor.get_results(output, target_sizes) - assert ( - final_out[0]['bboxes'][0][:10] == [484, 175, 484, 178, 483, 179, 452, 179, 452, 182] - ) - assert round(float(final_out[0]['scores'][0]), 5) == 0.92356 + assert final_out[0]["bboxes"][0][:10] == [484, 175, 484, 178, 483, 179, 452, 179, 452, 182] + assert round(float(final_out[0]["scores"][0]), 5) == 0.92356 From 3fef2616e950889ca7e6ab22f4db68fd7d8ade51 Mon Sep 17 00:00:00 2001 From: raghavanone Date: Tue, 31 Oct 2023 19:37:06 +0530 Subject: [PATCH 010/152] Add fast model to init --- src/transformers/__init__.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 280e824efb89..4941d724455d 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -424,7 +424,6 @@ "models.ernie_m": ["ERNIE_M_PRETRAINED_CONFIG_ARCHIVE_MAP", "ErnieMConfig"], "models.esm": ["ESM_PRETRAINED_CONFIG_ARCHIVE_MAP", "EsmConfig", "EsmTokenizer"], "models.falcon": ["FALCON_PRETRAINED_CONFIG_ARCHIVE_MAP", "FalconConfig"], -<<<<<<< HEAD "models.fastspeech2_conformer": [ "FASTSPEECH2_CONFORMER_HIFIGAN_PRETRAINED_CONFIG_ARCHIVE_MAP", "FASTSPEECH2_CONFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", @@ -434,9 +433,6 @@ "FastSpeech2ConformerTokenizer", "FastSpeech2ConformerWithHifiGanConfig", ], -======= - "models.fast": ["FastConfig"], ->>>>>>> 67fec5b40 (Refactor modeling and add tests) "models.flaubert": ["FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "FlaubertConfig", "FlaubertTokenizer"], "models.flava": [ "FLAVA_PRETRAINED_CONFIG_ARCHIVE_MAP", @@ -5117,7 +5113,6 @@ from .models.ernie_m import ERNIE_M_PRETRAINED_CONFIG_ARCHIVE_MAP, ErnieMConfig from .models.esm import ESM_PRETRAINED_CONFIG_ARCHIVE_MAP, EsmConfig, EsmTokenizer from .models.falcon import FALCON_PRETRAINED_CONFIG_ARCHIVE_MAP, FalconConfig -<<<<<<< HEAD from .models.fastspeech2_conformer import ( FASTSPEECH2_CONFORMER_HIFIGAN_PRETRAINED_CONFIG_ARCHIVE_MAP, FASTSPEECH2_CONFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, @@ -5127,9 +5122,6 @@ FastSpeech2ConformerTokenizer, FastSpeech2ConformerWithHifiGanConfig, ) -======= - from .models.fast import 
FastConfig ->>>>>>> 67fec5b40 (Refactor modeling and add tests) from .models.flaubert import FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, FlaubertConfig, FlaubertTokenizer from .models.flava import ( FLAVA_PRETRAINED_CONFIG_ARCHIVE_MAP, @@ -6706,17 +6698,12 @@ FalconModel, FalconPreTrainedModel, ) -<<<<<<< HEAD from .models.fastspeech2_conformer import ( FASTSPEECH2_CONFORMER_PRETRAINED_MODEL_ARCHIVE_LIST, FastSpeech2ConformerHifiGan, FastSpeech2ConformerModel, FastSpeech2ConformerPreTrainedModel, FastSpeech2ConformerWithHifiGan, -======= - from .models.fast import ( - FASTForImageCaptioning, ->>>>>>> 67fec5b40 (Refactor modeling and add tests) ) from .models.flaubert import ( FLAUBERT_PRETRAINED_MODEL_ARCHIVE_LIST, From 597abe1da92632e471968b3ea60f03725d444a73 Mon Sep 17 00:00:00 2001 From: raghavanone Date: Tue, 31 Oct 2023 19:53:17 +0530 Subject: [PATCH 011/152] Add fast model in docs and other places --- docs/source/en/model_doc/fast.md | 39 +++++++++++++++++++ .../models/auto/image_processing_auto.py | 1 + 2 files changed, 40 insertions(+) create mode 100644 docs/source/en/model_doc/fast.md diff --git a/docs/source/en/model_doc/fast.md b/docs/source/en/model_doc/fast.md new file mode 100644 index 000000000000..ddcc5e1148f8 --- /dev/null +++ b/docs/source/en/model_doc/fast.md @@ -0,0 +1,39 @@ + + +# Fast + +## Overview + +Fast model proposes an accurate and efficient scene text detection framework, termed FAST (i.e., faster +arbitrarily-shaped text detector). + +FAST has two new designs. (1) We design a minimalist kernel representation (only has 1-channel output) to model text +with arbitrary shape, as well as a GPU-parallel post-processing to efficiently assemble text lines with a negligible +time overhead. (2) We search the network architecture tailored for text detection, leading to more powerful features +than most networks that are searched for image classification + +## FastConfig + +[[autodoc]] FastConfig + +## Fast + +[[autodoc]] FASTForImageCaptioningOutput + - forward + + + diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index e41889c5ef81..55a128fe5519 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -62,6 +62,7 @@ ("dpt", "DPTImageProcessor"), ("efficientformer", "EfficientFormerImageProcessor"), ("efficientnet", "EfficientNetImageProcessor"), + ("fast", "FastImageProcessor"), ("flava", "FlavaImageProcessor"), ("focalnet", "BitImageProcessor"), ("fuyu", "FuyuImageProcessor"), From c3b43e739c8601ed55a2f7f0903405ebc02c14d0 Mon Sep 17 00:00:00 2001 From: raghavanone Date: Tue, 31 Oct 2023 20:13:48 +0530 Subject: [PATCH 012/152] Fix import of cv2 --- .../models/fast/image_processing_fast.py | 165 +++++++++--------- 1 file changed, 83 insertions(+), 82 deletions(-) diff --git a/src/transformers/models/fast/image_processing_fast.py b/src/transformers/models/fast/image_processing_fast.py index 812c617f073c..2f8ad3cb4e57 100644 --- a/src/transformers/models/fast/image_processing_fast.py +++ b/src/transformers/models/fast/image_processing_fast.py @@ -16,8 +16,10 @@ import math import warnings from typing import Any, Dict, List, Optional, Tuple, Union +from ...utils.import_utils import is_cv2_available -import cv2 +if is_cv2_available(): + import cv2 import numpy as np import torch.nn as nn import torch.nn.functional as F @@ -41,10 +43,9 @@ is_torch_available, is_torch_tensor, is_vision_available, - logging, + logging, 
is_cv2_available, ) - if is_vision_available(): import PIL @@ -100,23 +101,23 @@ class FastImageProcessor(BaseImageProcessor): model_input_names = ["pixel_values"] def __init__( - self, - do_resize: bool = True, - size: Dict[str, int] = None, - resample: PILImageResampling = PILImageResampling.BICUBIC, - do_center_crop: bool = False, - crop_size: Dict[str, int] = None, - rescale_factor: Union[int, float] = 1 / 255, - do_rescale: bool = True, - do_normalize: bool = True, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - do_reduce_labels: bool = False, - min_area: int = 10, - min_score: float = 0.88, - bbox_type: str = "rect", - pooling_size: int = 9, - **kwargs, + self, + do_resize: bool = True, + size: Dict[str, int] = None, + resample: PILImageResampling = PILImageResampling.BICUBIC, + do_center_crop: bool = False, + crop_size: Dict[str, int] = None, + rescale_factor: Union[int, float] = 1 / 255, + do_rescale: bool = True, + do_normalize: bool = True, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_reduce_labels: bool = False, + min_area: int = 10, + min_score: float = 0.88, + bbox_type: str = "rect", + pooling_size: int = 9, + **kwargs, ) -> None: if "reduce_labels" in kwargs: warnings.warn( @@ -158,13 +159,13 @@ def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs): return super().from_dict(image_processor_dict, **kwargs) def resize( - self, - image: np.ndarray, - size: Dict[str, int], - resample: PILImageResampling = PILImageResampling.BICUBIC, - data_format: Optional[Union[str, ChannelDimension]] = None, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs, + self, + image: np.ndarray, + size: Dict[str, int], + resample: PILImageResampling = PILImageResampling.BICUBIC, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, ) -> np.ndarray: """ Resize an image to (size["height"], size["width"]). 
@@ -202,20 +203,20 @@ def reduce_label(self, label: ImageInput) -> np.ndarray: return label def _preprocess( - self, - image: ImageInput, - do_reduce_labels: bool = None, - do_resize: bool = None, - size: Dict[str, int] = None, - resample: PILImageResampling = None, - do_center_crop: bool = None, - crop_size: Dict[str, int] = None, - do_rescale: bool = None, - rescale_factor: float = None, - do_normalize: bool = None, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - input_data_format: Optional[Union[str, ChannelDimension]] = None, + self, + image: ImageInput, + do_reduce_labels: bool = None, + do_resize: bool = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + do_center_crop: bool = None, + crop_size: Dict[str, int] = None, + do_rescale: bool = None, + rescale_factor: float = None, + do_normalize: bool = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, ): if do_reduce_labels: image = self.reduce_label(image) @@ -235,20 +236,20 @@ def _preprocess( return image def _preprocess_image( - self, - image: ImageInput, - do_resize: bool = None, - size: Dict[str, int] = None, - resample: PILImageResampling = None, - do_center_crop: bool = None, - crop_size: Dict[str, int] = None, - do_rescale: bool = None, - rescale_factor: float = None, - do_normalize: bool = None, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - data_format: Optional[Union[str, ChannelDimension]] = None, - input_data_format: Optional[Union[str, ChannelDimension]] = None, + self, + image: ImageInput, + do_resize: bool = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + do_center_crop: bool = None, + crop_size: Dict[str, int] = None, + do_rescale: bool = None, + rescale_factor: float = None, + do_normalize: bool = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, ) -> np.ndarray: """Preprocesses a single image.""" # All transformations expect numpy arrays. @@ -280,15 +281,15 @@ def _preprocess_image( return image def _preprocess_segmentation_map( - self, - segmentation_map: ImageInput, - do_resize: bool = None, - size: Dict[str, int] = None, - resample: PILImageResampling = None, - do_center_crop: bool = None, - crop_size: Dict[str, int] = None, - do_reduce_labels: bool = None, - input_data_format: Optional[Union[str, ChannelDimension]] = None, + self, + segmentation_map: ImageInput, + do_resize: bool = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + do_center_crop: bool = None, + crop_size: Dict[str, int] = None, + do_reduce_labels: bool = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, ): """Preprocesses a single segmentation map.""" # All transformations expect numpy arrays. 
@@ -326,24 +327,24 @@ def __call__(self, images, segmentation_maps=None, **kwargs): return super().__call__(images, segmentation_maps=segmentation_maps, **kwargs) def preprocess( - self, - images: ImageInput, - segmentation_maps: Optional[ImageInput] = None, - do_resize: bool = None, - size: Dict[str, int] = None, - resample: PILImageResampling = None, - do_center_crop: bool = None, - crop_size: Dict[str, int] = None, - do_rescale: bool = None, - rescale_factor: float = None, - do_normalize: bool = None, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - do_reduce_labels: Optional[bool] = None, - return_tensors: Optional[Union[str, TensorType]] = None, - data_format: ChannelDimension = ChannelDimension.FIRST, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs, + self, + images: ImageInput, + segmentation_maps: Optional[ImageInput] = None, + do_resize: bool = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + do_center_crop: bool = None, + crop_size: Dict[str, int] = None, + do_rescale: bool = None, + rescale_factor: float = None, + do_normalize: bool = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_reduce_labels: Optional[bool] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: ChannelDimension = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, ) -> PIL.Image.Image: """ Preprocess an image or batch of images. From 4903a6914174cbddd62984ca8ecdb4dbfde64c8b Mon Sep 17 00:00:00 2001 From: raghavanone Date: Tue, 31 Oct 2023 20:18:15 +0530 Subject: [PATCH 013/152] Rename image processing method --- .../models/fast/image_processing_fast.py | 50 ++----------------- tests/models/fast/test_modeling_fast.py | 9 ++-- 2 files changed, 9 insertions(+), 50 deletions(-) diff --git a/src/transformers/models/fast/image_processing_fast.py b/src/transformers/models/fast/image_processing_fast.py index 2f8ad3cb4e57..04cb89e1d5cd 100644 --- a/src/transformers/models/fast/image_processing_fast.py +++ b/src/transformers/models/fast/image_processing_fast.py @@ -43,7 +43,7 @@ is_torch_available, is_torch_tensor, is_vision_available, - logging, is_cv2_available, + logging, ) if is_vision_available(): @@ -57,7 +57,7 @@ class FastImageProcessor(BaseImageProcessor): r""" - Constructs a BEiT image processor. + Constructs a Fast image processor. Args: do_resize (`bool`, *optional*, defaults to `True`): @@ -151,7 +151,7 @@ def __init__( def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs): """ Overrides the `from_dict` method from the base class to make sure `reduce_labels` is updated if image processor - is created using from_dict and kwargs e.g. `BeitImageProcessor.from_pretrained(checkpoint, reduce_labels=True)` + is created using from_dict and kwargs e.g. `FastImageProcessor.from_pretrained(checkpoint, reduce_labels=True)` """ image_processor_dict = image_processor_dict.copy() if "reduce_labels" in kwargs: @@ -478,48 +478,6 @@ def preprocess( return BatchFeature(data=data, tensor_type=return_tensors) - def post_process_semantic_segmentation(self, outputs, target_sizes: List[Tuple] = None): - """ - Converts the output of [`BeitForSemanticSegmentation`] into semantic segmentation maps. Only supports PyTorch. - - Args: - outputs ([`BeitForSemanticSegmentation`]): - Raw outputs of the model. 
- target_sizes (`List[Tuple]` of length `batch_size`, *optional*): - List of tuples corresponding to the requested final size (height, width) of each prediction. If unset, - predictions will not be resized. - - Returns: - semantic_segmentation: `List[torch.Tensor]` of length `batch_size`, where each item is a semantic - segmentation map of shape (height, width) corresponding to the target_sizes entry (if `target_sizes` is - specified). Each entry of each `torch.Tensor` correspond to a semantic class id. - """ - # TODO: add support for other frameworks - logits = outputs.logits - - # Resize logits and compute semantic segmentation maps - if target_sizes is not None: - if len(logits) != len(target_sizes): - raise ValueError( - "Make sure that you pass in as many target sizes as the batch dimension of the logits" - ) - - if is_torch_tensor(target_sizes): - target_sizes = target_sizes.numpy() - - semantic_segmentation = [] - - for idx in range(len(logits)): - resized_logits = torch.nn.functional.interpolate( - logits[idx].unsqueeze(dim=0), size=target_sizes[idx], mode="bilinear", align_corners=False - ) - semantic_map = resized_logits[0].argmax(dim=0) - semantic_segmentation.append(semantic_map) - else: - semantic_segmentation = logits.argmax(dim=1) - semantic_segmentation = [semantic_segmentation[i] for i in range(semantic_segmentation.shape[0])] - - return semantic_segmentation def _max_pooling(self, x, scale=1): if scale == 1: @@ -530,7 +488,7 @@ def _max_pooling(self, x, scale=1): ) return x - def get_results(self, output, target_sizes): + def post_process_text_detection(self, output, target_sizes): scale = 2 img_size = (self.size["height"], self.size["width"]) out = output["hidden_states"] diff --git a/tests/models/fast/test_modeling_fast.py b/tests/models/fast/test_modeling_fast.py index d1a2075a199b..7acd27b6e9e8 100644 --- a/tests/models/fast/test_modeling_fast.py +++ b/tests/models/fast/test_modeling_fast.py @@ -27,7 +27,7 @@ from transformers.testing_utils import ( require_torch, require_vision, - torch_device, + torch_device, slow, ) from ...generation.test_utils import GenerationTesterMixin @@ -387,7 +387,7 @@ def test_model_is_small(self): @require_torch @require_vision class FastModelIntegrationTest(unittest.TestCase): - # @slow + @slow def test_inference_fast_tiny_ic17mlt_model(self): model = FASTForImageCaptioning.from_pretrained("Raghavan/ic17mlt_Fast_T") @@ -403,11 +403,12 @@ def prepare_image(): output = model(pixel_values=torch.tensor(input["pixel_values"])) target_sizes = [(image.shape[1], image.shape[2]) for image in input["pixel_values"]] - final_out = image_processor.get_results(output, target_sizes) + final_out = image_processor.post_process_text_detection(output, target_sizes) assert final_out[0]["bboxes"][0] == [224, 120, 246, 120, 246, 134, 224, 134] assert round(float(final_out[0]["scores"][0]), 5) == 0.95541 + @slow def test_inference_fast_base_800_total_text_ic17mlt_model(self): model = FASTForImageCaptioning.from_pretrained("Raghavan/fast_base_tt_800_finetune_ic17mlt") @@ -423,7 +424,7 @@ def prepare_image(): output = model(pixel_values=torch.tensor(input["pixel_values"])) target_sizes = [(image.shape[1], image.shape[2]) for image in input["pixel_values"]] - final_out = image_processor.get_results(output, target_sizes) + final_out = image_processor.post_process_text_detection(output, target_sizes) assert final_out[0]["bboxes"][0][:10] == [484, 175, 484, 178, 483, 179, 452, 179, 452, 182] assert round(float(final_out[0]["scores"][0]), 5) == 0.92356 From 
c391cf6d1e15be8715709181c3142454baab1fbf Mon Sep 17 00:00:00 2001 From: raghavanone Date: Tue, 31 Oct 2023 20:25:23 +0530 Subject: [PATCH 014/152] Fix build --- .../models/fast/image_processing_fast.py | 165 +++++++++--------- tests/models/fast/test_modeling_fast.py | 3 +- 2 files changed, 85 insertions(+), 83 deletions(-) diff --git a/src/transformers/models/fast/image_processing_fast.py b/src/transformers/models/fast/image_processing_fast.py index 04cb89e1d5cd..1c652128a85e 100644 --- a/src/transformers/models/fast/image_processing_fast.py +++ b/src/transformers/models/fast/image_processing_fast.py @@ -15,9 +15,11 @@ """Image processor class for Beit.""" import math import warnings -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Union + from ...utils.import_utils import is_cv2_available + if is_cv2_available(): import cv2 import numpy as np @@ -41,11 +43,11 @@ IMAGENET_DEFAULT_STD, TensorType, is_torch_available, - is_torch_tensor, is_vision_available, logging, ) + if is_vision_available(): import PIL @@ -101,23 +103,23 @@ class FastImageProcessor(BaseImageProcessor): model_input_names = ["pixel_values"] def __init__( - self, - do_resize: bool = True, - size: Dict[str, int] = None, - resample: PILImageResampling = PILImageResampling.BICUBIC, - do_center_crop: bool = False, - crop_size: Dict[str, int] = None, - rescale_factor: Union[int, float] = 1 / 255, - do_rescale: bool = True, - do_normalize: bool = True, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - do_reduce_labels: bool = False, - min_area: int = 10, - min_score: float = 0.88, - bbox_type: str = "rect", - pooling_size: int = 9, - **kwargs, + self, + do_resize: bool = True, + size: Dict[str, int] = None, + resample: PILImageResampling = PILImageResampling.BICUBIC, + do_center_crop: bool = False, + crop_size: Dict[str, int] = None, + rescale_factor: Union[int, float] = 1 / 255, + do_rescale: bool = True, + do_normalize: bool = True, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_reduce_labels: bool = False, + min_area: int = 10, + min_score: float = 0.88, + bbox_type: str = "rect", + pooling_size: int = 9, + **kwargs, ) -> None: if "reduce_labels" in kwargs: warnings.warn( @@ -159,13 +161,13 @@ def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs): return super().from_dict(image_processor_dict, **kwargs) def resize( - self, - image: np.ndarray, - size: Dict[str, int], - resample: PILImageResampling = PILImageResampling.BICUBIC, - data_format: Optional[Union[str, ChannelDimension]] = None, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs, + self, + image: np.ndarray, + size: Dict[str, int], + resample: PILImageResampling = PILImageResampling.BICUBIC, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, ) -> np.ndarray: """ Resize an image to (size["height"], size["width"]). 
@@ -203,20 +205,20 @@ def reduce_label(self, label: ImageInput) -> np.ndarray: return label def _preprocess( - self, - image: ImageInput, - do_reduce_labels: bool = None, - do_resize: bool = None, - size: Dict[str, int] = None, - resample: PILImageResampling = None, - do_center_crop: bool = None, - crop_size: Dict[str, int] = None, - do_rescale: bool = None, - rescale_factor: float = None, - do_normalize: bool = None, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - input_data_format: Optional[Union[str, ChannelDimension]] = None, + self, + image: ImageInput, + do_reduce_labels: bool = None, + do_resize: bool = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + do_center_crop: bool = None, + crop_size: Dict[str, int] = None, + do_rescale: bool = None, + rescale_factor: float = None, + do_normalize: bool = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, ): if do_reduce_labels: image = self.reduce_label(image) @@ -236,20 +238,20 @@ def _preprocess( return image def _preprocess_image( - self, - image: ImageInput, - do_resize: bool = None, - size: Dict[str, int] = None, - resample: PILImageResampling = None, - do_center_crop: bool = None, - crop_size: Dict[str, int] = None, - do_rescale: bool = None, - rescale_factor: float = None, - do_normalize: bool = None, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - data_format: Optional[Union[str, ChannelDimension]] = None, - input_data_format: Optional[Union[str, ChannelDimension]] = None, + self, + image: ImageInput, + do_resize: bool = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + do_center_crop: bool = None, + crop_size: Dict[str, int] = None, + do_rescale: bool = None, + rescale_factor: float = None, + do_normalize: bool = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, ) -> np.ndarray: """Preprocesses a single image.""" # All transformations expect numpy arrays. @@ -281,15 +283,15 @@ def _preprocess_image( return image def _preprocess_segmentation_map( - self, - segmentation_map: ImageInput, - do_resize: bool = None, - size: Dict[str, int] = None, - resample: PILImageResampling = None, - do_center_crop: bool = None, - crop_size: Dict[str, int] = None, - do_reduce_labels: bool = None, - input_data_format: Optional[Union[str, ChannelDimension]] = None, + self, + segmentation_map: ImageInput, + do_resize: bool = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + do_center_crop: bool = None, + crop_size: Dict[str, int] = None, + do_reduce_labels: bool = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, ): """Preprocesses a single segmentation map.""" # All transformations expect numpy arrays. 
@@ -327,24 +329,24 @@ def __call__(self, images, segmentation_maps=None, **kwargs): return super().__call__(images, segmentation_maps=segmentation_maps, **kwargs) def preprocess( - self, - images: ImageInput, - segmentation_maps: Optional[ImageInput] = None, - do_resize: bool = None, - size: Dict[str, int] = None, - resample: PILImageResampling = None, - do_center_crop: bool = None, - crop_size: Dict[str, int] = None, - do_rescale: bool = None, - rescale_factor: float = None, - do_normalize: bool = None, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - do_reduce_labels: Optional[bool] = None, - return_tensors: Optional[Union[str, TensorType]] = None, - data_format: ChannelDimension = ChannelDimension.FIRST, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs, + self, + images: ImageInput, + segmentation_maps: Optional[ImageInput] = None, + do_resize: bool = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + do_center_crop: bool = None, + crop_size: Dict[str, int] = None, + do_rescale: bool = None, + rescale_factor: float = None, + do_normalize: bool = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_reduce_labels: Optional[bool] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: ChannelDimension = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, ) -> PIL.Image.Image: """ Preprocess an image or batch of images. @@ -478,7 +480,6 @@ def preprocess( return BatchFeature(data=data, tensor_type=return_tensors) - def _max_pooling(self, x, scale=1): if scale == 1: x = nn.MaxPool2d(kernel_size=self.pooling_size, stride=1, padding=(self.pooling_size - 1) // 2)(x) diff --git a/tests/models/fast/test_modeling_fast.py b/tests/models/fast/test_modeling_fast.py index 7acd27b6e9e8..17f09befd7cd 100644 --- a/tests/models/fast/test_modeling_fast.py +++ b/tests/models/fast/test_modeling_fast.py @@ -27,7 +27,8 @@ from transformers.testing_utils import ( require_torch, require_vision, - torch_device, slow, + slow, + torch_device, ) from ...generation.test_utils import GenerationTesterMixin From d3bf608b8701578929041b5d295ce984faf6e82d Mon Sep 17 00:00:00 2001 From: raghavanone Date: Wed, 1 Nov 2023 08:03:39 +0530 Subject: [PATCH 015/152] Fix Build --- docs/source/en/model_doc/fast.md | 13 ++++- src/transformers/__init__.py | 28 ++++------ src/transformers/models/fast/__init__.py | 8 +-- src/transformers/models/fast/modeling_fast.py | 51 +++++++++++++++---- utils/check_repo.py | 27 +++++----- 5 files changed, 80 insertions(+), 47 deletions(-) diff --git a/docs/source/en/model_doc/fast.md b/docs/source/en/model_doc/fast.md index ddcc5e1148f8..3c81109380ae 100644 --- a/docs/source/en/model_doc/fast.md +++ b/docs/source/en/model_doc/fast.md @@ -30,10 +30,19 @@ than most networks that are searched for image classification [[autodoc]] FastConfig -## Fast +## FastImageProcessor + +[[autodoc]] FastImageProcessor + +## FASTForImageCaptioning + +[[autodoc]] FASTForImageCaptioning +- forward + +## FASTForImageCaptioningOutput [[autodoc]] FASTForImageCaptioningOutput - - forward +- forward diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 4941d724455d..82cee836cf05 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -47,10 +47,8 @@ logging, ) - logger = logging.get_logger(__name__) # 
pylint: disable=invalid-name - # Base objects, independent of any specific backend _import_structure = { "audio_utils": [], @@ -1200,7 +1198,6 @@ _import_structure["models.xlnet"].append("XLNetTokenizerFast") _import_structure["tokenization_utils_fast"] = ["PreTrainedTokenizerFast"] - try: if not (is_sentencepiece_available() and is_tokenizers_available()): raise OptionalDependencyNotAvailable() @@ -1310,7 +1307,6 @@ _import_structure["models.vivit"].append("VivitImageProcessor") _import_structure["models.yolos"].extend(["YolosFeatureExtractor", "YolosImageProcessor"]) - # PyTorch-backed objects try: if not is_torch_available(): @@ -4402,14 +4398,13 @@ ] _import_structure["tf_utils"] = [] - try: if not ( - is_librosa_available() - and is_essentia_available() - and is_scipy_available() - and is_torch_available() - and is_pretty_midi_available() + is_librosa_available() + and is_essentia_available() + and is_scipy_available() + and is_torch_available() + and is_pretty_midi_available() ): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: @@ -4427,7 +4422,6 @@ _import_structure["models.pop2piano"].append("Pop2PianoTokenizer") _import_structure["models.pop2piano"].append("Pop2PianoProcessor") - # FLAX-backed objects try: if not is_flax_available(): @@ -4752,7 +4746,6 @@ ] ) - # Direct imports for type-checking if TYPE_CHECKING: # Configuration @@ -8561,11 +8554,11 @@ try: if not ( - is_librosa_available() - and is_essentia_available() - and is_scipy_available() - and is_torch_available() - and is_pretty_midi_available() + is_librosa_available() + and is_essentia_available() + and is_scipy_available() + and is_torch_available() + and is_pretty_midi_available() ): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: @@ -8868,7 +8861,6 @@ extra_objects={"__version__": __version__}, ) - if not is_tf_available() and not is_torch_available() and not is_flax_available(): logger.warning( "None of PyTorch, TensorFlow >= 2.0, or Flax have been found. 
" diff --git a/src/transformers/models/fast/__init__.py b/src/transformers/models/fast/__init__.py index 133d20bc0c52..e7e044c5d2ce 100644 --- a/src/transformers/models/fast/__init__.py +++ b/src/transformers/models/fast/__init__.py @@ -20,9 +20,9 @@ is_torch_available, ) - _import_structure = { "configuration_fast": ["FastConfig"], + "image_processing_fast": ["FastImageProcessor"] } try: @@ -31,18 +31,18 @@ except OptionalDependencyNotAvailable: pass else: - _import_structure["modeling_fast"] = ["FASTForImageCaptioning"] + _import_structure["modeling_fast"] = ["FASTForImageCaptioning","FastPreTrainedModel"] if TYPE_CHECKING: from .configuration_fast import FastConfig - + from .image_processing_fast import FastImageProcessor try: if not is_torch_available(): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: pass else: - from .modeling_fast import FASTForImageCaptioning + from .modeling_fast import FASTForImageCaptioning,FastPreTrainedModel else: diff --git a/src/transformers/models/fast/modeling_fast.py b/src/transformers/models/fast/modeling_fast.py index 4557cf4754c4..4d04b8e56ed5 100644 --- a/src/transformers/models/fast/modeling_fast.py +++ b/src/transformers/models/fast/modeling_fast.py @@ -6,9 +6,36 @@ import torch.nn as nn import torch.nn.functional as F -from transformers import FastConfig, PreTrainedModel +from transformers import FastConfig, PreTrainedModel, add_start_docstrings from transformers.utils import ModelOutput +FAST_FOR_CAPTIONING_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `({0})`): + Indices of input sequence tokens in the vocabulary. + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + [What are input IDs?](../glossary#input-ids) + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See + [`BeitImageProcessor.__call__`] for details. + language_masked_pos (`torch.LongTensor` of shape `({0})`): + language_masked_pos for denoting tokens for captioning + - 1 indicates the token is **Present**, + - 0 indicates the token is **absent**. + text_len (`torch.LongTensor` of shape `({0})`): + Length of text for captioning + past_key_value (`Dict`): + A Dictionary containing the incremental states layerwise + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. A + classification loss is computed (Cross-Entropy) against these labels. 
+""" def get_same_padding(kernel_size): if isinstance(kernel_size, tuple): @@ -401,9 +428,9 @@ def _init_weights(self, module): module.bias.data.zero_() -class TextNet(FastPreTrainedModel): +class TextNet(nn.Module): def __init__(self, config): - super().__init__(config) + super().__init__() self.first_conv = ConvLayer( config.backbone_in_channels, config.backbone_out_channels, @@ -418,7 +445,7 @@ def __init__(self, config): config.backbone_dropout_rate, config.backbone_ops_order, ) - self.first_conv.apply(self._init_weights) + # self.first_conv.apply(self._init_weights) stage1 = [] for stage_config in zip( config.backbone_stage1_in_channels, @@ -500,9 +527,9 @@ def forward(self, x): return output -class FASTNeck(FastPreTrainedModel): +class FASTNeck(nn.Module): def __init__(self, config): - super().__init__(config) + super().__init__() reduce_layer_configs = list( zip( config.neck_in_channels, @@ -549,9 +576,9 @@ def forward(self, x): return f -class FASTHead(FastPreTrainedModel): +class FASTHead(nn.Module): def __init__(self, config): - super().__init__(config) + super().__init__() self.conv = RepConvLayer( config.head_conv_in_channels, config.head_conv_out_channels, @@ -891,7 +918,13 @@ class FASTForImageCaptioningOutput(ModelOutput): loss: Optional[torch.Tensor] = None hidden_states: Optional[torch.FloatTensor] = None - +@add_start_docstrings( + """BEiT-3 is a general-purpose multimodal foundation model that excels in both vision and vision-language tasks. It + utilizes [Multiway transformers] (https://arxiv.org/abs/2208.10442) for deep fusion and modality-specific + encoding, and unifies masked modeling on images, texts, and image-text pairs, achieving top performance on + multiple benchmarks.""", + FAST_FOR_CAPTIONING_INPUTS_DOCSTRING, +) class FASTForImageCaptioning(FastPreTrainedModel): def __init__(self, config): super().__init__(config) diff --git a/utils/check_repo.py b/utils/check_repo.py index aa448f32e62d..f7f88615b670 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -50,7 +50,6 @@ from transformers.models.auto.tokenization_auto import TOKENIZER_MAPPING_NAMES from transformers.utils import ENV_VARS_TRUE_VALUES, direct_transformers_import - # All paths are set with the intent you should run this script from the root of the repo with the command # python utils/check_repo.py PATH_TO_TRANSFORMERS = "src/transformers" @@ -223,6 +222,7 @@ "TFCLIPVisionModel", "TFGroupViTTextModel", "TFGroupViTVisionModel", + "FASTForImageCaptioning", "FlaxCLIPTextModel", "FlaxCLIPTextModelWithProjection", "FlaxCLIPVisionModel", @@ -337,7 +337,6 @@ ] ) - # This is to make sure the transformers module imported is the one in the repo. transformers = direct_transformers_import(PATH_TO_TRANSFORMERS) @@ -812,9 +811,9 @@ def check_objects_being_equally_in_main_init(): module_name = module_path.split(".")[-1] module_dir = ".".join(module_path.split(".")[:-1]) if ( - module_name.startswith("modeling_") - and not module_name.startswith("modeling_tf_") - and not module_name.startswith("modeling_flax_") + module_name.startswith("modeling_") + and not module_name.startswith("modeling_tf_") + and not module_name.startswith("modeling_flax_") ): parent_module = sys.modules[module_dir] @@ -1007,17 +1006,17 @@ def ignore_undocumented(name: str) -> bool: return True # PreTrainedModels / Encoders / Decoders / Layers / Embeddings / Attention are not documented. 
if ( - name.endswith("PreTrainedModel") - or name.endswith("Decoder") - or name.endswith("Encoder") - or name.endswith("Layer") - or name.endswith("Embeddings") - or name.endswith("Attention") + name.endswith("PreTrainedModel") + or name.endswith("Decoder") + or name.endswith("Encoder") + or name.endswith("Layer") + or name.endswith("Embeddings") + or name.endswith("Attention") ): return True # Submodules are not documented. if os.path.isdir(os.path.join(PATH_TO_TRANSFORMERS, name)) or os.path.isfile( - os.path.join(PATH_TO_TRANSFORMERS, f"{name}.py") + os.path.join(PATH_TO_TRANSFORMERS, f"{name}.py") ): return True # All load functions are not documented. @@ -1075,7 +1074,7 @@ def check_model_type_doc_match(): "Some model doc pages do not match any existing model type:\n" + "\n".join(errors) + "\nYou can add any missing model type to the `MODEL_NAMES_MAPPING` constant in " - "models/auto/configuration_auto.py." + "models/auto/configuration_auto.py." ) @@ -1119,7 +1118,7 @@ def check_docstrings_are_in_md(): "The following files have docstrings written in rst:\n" + "\n".join([f"- {f}" for f in files_with_rst]) + "\nTo fix this run `doc-builder convert path_to_py_file` after installing `doc-builder`\n" - "(`pip install git+https://github.com/huggingface/doc-builder`)" + "(`pip install git+https://github.com/huggingface/doc-builder`)" ) From 13ea2bbc0120792d228089be2d47073afd4a68bd Mon Sep 17 00:00:00 2001 From: raghavanone Date: Wed, 1 Nov 2023 08:10:46 +0530 Subject: [PATCH 016/152] fix style and fix copies --- src/transformers/__init__.py | 21 ++++++++-------- src/transformers/models/fast/__init__.py | 11 ++++---- .../models/fast/image_processing_fast.py | 14 +++++++---- src/transformers/models/fast/modeling_fast.py | 10 +++++--- src/transformers/utils/dummy_pt_objects.py | 7 ++++++ utils/check_repo.py | 25 ++++++++++--------- 6 files changed, 51 insertions(+), 37 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 82cee836cf05..5e12dc8c3354 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -47,6 +47,7 @@ logging, ) + logger = logging.get_logger(__name__) # pylint: disable=invalid-name # Base objects, independent of any specific backend @@ -4400,11 +4401,11 @@ try: if not ( - is_librosa_available() - and is_essentia_available() - and is_scipy_available() - and is_torch_available() - and is_pretty_midi_available() + is_librosa_available() + and is_essentia_available() + and is_scipy_available() + and is_torch_available() + and is_pretty_midi_available() ): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: @@ -8554,11 +8555,11 @@ try: if not ( - is_librosa_available() - and is_essentia_available() - and is_scipy_available() - and is_torch_available() - and is_pretty_midi_available() + is_librosa_available() + and is_essentia_available() + and is_scipy_available() + and is_torch_available() + and is_pretty_midi_available() ): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: diff --git a/src/transformers/models/fast/__init__.py b/src/transformers/models/fast/__init__.py index e7e044c5d2ce..78bd816d9d0d 100644 --- a/src/transformers/models/fast/__init__.py +++ b/src/transformers/models/fast/__init__.py @@ -20,10 +20,8 @@ is_torch_available, ) -_import_structure = { - "configuration_fast": ["FastConfig"], - "image_processing_fast": ["FastImageProcessor"] -} + +_import_structure = {"configuration_fast": ["FastConfig"], "image_processing_fast": 
["FastImageProcessor"]} try: if not is_torch_available(): @@ -31,18 +29,19 @@ except OptionalDependencyNotAvailable: pass else: - _import_structure["modeling_fast"] = ["FASTForImageCaptioning","FastPreTrainedModel"] + _import_structure["modeling_fast"] = ["FASTForImageCaptioning", "FastPreTrainedModel"] if TYPE_CHECKING: from .configuration_fast import FastConfig from .image_processing_fast import FastImageProcessor + try: if not is_torch_available(): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: pass else: - from .modeling_fast import FASTForImageCaptioning,FastPreTrainedModel + from .modeling_fast import FASTForImageCaptioning, FastPreTrainedModel else: diff --git a/src/transformers/models/fast/image_processing_fast.py b/src/transformers/models/fast/image_processing_fast.py index 1c652128a85e..1f160810c315 100644 --- a/src/transformers/models/fast/image_processing_fast.py +++ b/src/transformers/models/fast/image_processing_fast.py @@ -68,22 +68,22 @@ class FastImageProcessor(BaseImageProcessor): size (`Dict[str, int]` *optional*, defaults to `{"height": 256, "width": 256}`): Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess` method. - resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): + resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`): Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the `preprocess` method. - do_center_crop (`bool`, *optional*, defaults to `True`): + do_center_crop (`bool`, *optional*, defaults to `False`): Whether to center crop the image. If the input size is smaller than `crop_size` along any edge, the image is padded with 0's and then center cropped. Can be overridden by the `do_center_crop` parameter in the `preprocess` method. crop_size (`Dict[str, int]`, *optional*, defaults to `{"height": 224, "width": 224}`): Desired output size when applying center-cropping. Only has an effect if `do_center_crop` is set to `True`. Can be overridden by the `crop_size` parameter in the `preprocess` method. - do_rescale (`bool`, *optional*, defaults to `True`): - Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale` - parameter in the `preprocess` method. rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the `preprocess` method. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale` + parameter in the `preprocess` method. do_normalize (`bool`, *optional*, defaults to `True`): Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess` method. @@ -98,6 +98,10 @@ class FastImageProcessor(BaseImageProcessor): used for background, and background itself is not included in all classes of a dataset (e.g. ADE20k). The background label will be replaced by 255. Can be overridden by the `do_reduce_labels` parameter in the `preprocess` method. 
+ min_area (`int`, *optional*, defaults to 10): + min_score (`float`, *optional*, defaults to 0.88): + bbox_type (`str`, *optional*, defaults to `"rect"`): + pooling_size (`int`, *optional*, defaults to 9): """ model_input_names = ["pixel_values"] diff --git a/src/transformers/models/fast/modeling_fast.py b/src/transformers/models/fast/modeling_fast.py index 4d04b8e56ed5..7820fd24cdd3 100644 --- a/src/transformers/models/fast/modeling_fast.py +++ b/src/transformers/models/fast/modeling_fast.py @@ -9,13 +9,13 @@ from transformers import FastConfig, PreTrainedModel, add_start_docstrings from transformers.utils import ModelOutput + FAST_FOR_CAPTIONING_INPUTS_DOCSTRING = r""" Args: input_ids (`torch.LongTensor` of shape `({0})`): - Indices of input sequence tokens in the vocabulary. - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - [What are input IDs?](../glossary#input-ids) + Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See + [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input + IDs?](../glossary#input-ids) pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`BeitImageProcessor.__call__`] for details. @@ -37,6 +37,7 @@ classification loss is computed (Cross-Entropy) against these labels. """ + def get_same_padding(kernel_size): if isinstance(kernel_size, tuple): assert len(kernel_size) == 2, "invalid kernel size: %s" % kernel_size @@ -918,6 +919,7 @@ class FASTForImageCaptioningOutput(ModelOutput): loss: Optional[torch.Tensor] = None hidden_states: Optional[torch.FloatTensor] = None + @add_start_docstrings( """BEiT-3 is a general-purpose multimodal foundation model that excels in both vision and vision-language tasks. 
It utilizes [Multiway transformers] (https://arxiv.org/abs/2208.10442) for deep fusion and modality-specific diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 4d89b2942f79..06bdee17752b 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -3453,6 +3453,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +class FastPreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + FLAUBERT_PRETRAINED_MODEL_ARCHIVE_LIST = None diff --git a/utils/check_repo.py b/utils/check_repo.py index f7f88615b670..e9419bd78b03 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -50,6 +50,7 @@ from transformers.models.auto.tokenization_auto import TOKENIZER_MAPPING_NAMES from transformers.utils import ENV_VARS_TRUE_VALUES, direct_transformers_import + # All paths are set with the intent you should run this script from the root of the repo with the command # python utils/check_repo.py PATH_TO_TRANSFORMERS = "src/transformers" @@ -811,9 +812,9 @@ def check_objects_being_equally_in_main_init(): module_name = module_path.split(".")[-1] module_dir = ".".join(module_path.split(".")[:-1]) if ( - module_name.startswith("modeling_") - and not module_name.startswith("modeling_tf_") - and not module_name.startswith("modeling_flax_") + module_name.startswith("modeling_") + and not module_name.startswith("modeling_tf_") + and not module_name.startswith("modeling_flax_") ): parent_module = sys.modules[module_dir] @@ -1006,17 +1007,17 @@ def ignore_undocumented(name: str) -> bool: return True # PreTrainedModels / Encoders / Decoders / Layers / Embeddings / Attention are not documented. if ( - name.endswith("PreTrainedModel") - or name.endswith("Decoder") - or name.endswith("Encoder") - or name.endswith("Layer") - or name.endswith("Embeddings") - or name.endswith("Attention") + name.endswith("PreTrainedModel") + or name.endswith("Decoder") + or name.endswith("Encoder") + or name.endswith("Layer") + or name.endswith("Embeddings") + or name.endswith("Attention") ): return True # Submodules are not documented. if os.path.isdir(os.path.join(PATH_TO_TRANSFORMERS, name)) or os.path.isfile( - os.path.join(PATH_TO_TRANSFORMERS, f"{name}.py") + os.path.join(PATH_TO_TRANSFORMERS, f"{name}.py") ): return True # All load functions are not documented. @@ -1074,7 +1075,7 @@ def check_model_type_doc_match(): "Some model doc pages do not match any existing model type:\n" + "\n".join(errors) + "\nYou can add any missing model type to the `MODEL_NAMES_MAPPING` constant in " - "models/auto/configuration_auto.py." + "models/auto/configuration_auto.py." 
) @@ -1118,7 +1119,7 @@ def check_docstrings_are_in_md(): "The following files have docstrings written in rst:\n" + "\n".join([f"- {f}" for f in files_with_rst]) + "\nTo fix this run `doc-builder convert path_to_py_file` after installing `doc-builder`\n" - "(`pip install git+https://github.com/huggingface/doc-builder`)" + "(`pip install git+https://github.com/huggingface/doc-builder`)" ) From 1abfbc0c944e997fcf280b0f48fc819818bfeb2d Mon Sep 17 00:00:00 2001 From: raghavanone Date: Wed, 1 Nov 2023 09:48:33 +0530 Subject: [PATCH 017/152] Fix build --- src/transformers/models/fast/__init__.py | 7 +++-- .../models/fast/configuration_fast.py | 29 +++++++++++++++++++ 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/fast/__init__.py b/src/transformers/models/fast/__init__.py index 78bd816d9d0d..c4ecab2f2c0d 100644 --- a/src/transformers/models/fast/__init__.py +++ b/src/transformers/models/fast/__init__.py @@ -21,7 +21,10 @@ ) -_import_structure = {"configuration_fast": ["FastConfig"], "image_processing_fast": ["FastImageProcessor"]} +_import_structure = { + "configuration_fast": ["FAST_PRETRAINED_CONFIG_ARCHIVE_MAP", "FastConfig"], + "image_processing_fast": ["FastImageProcessor"], +} try: if not is_torch_available(): @@ -32,7 +35,7 @@ _import_structure["modeling_fast"] = ["FASTForImageCaptioning", "FastPreTrainedModel"] if TYPE_CHECKING: - from .configuration_fast import FastConfig + from .configuration_fast import FAST_PRETRAINED_CONFIG_ARCHIVE_MAP, FastConfig from .image_processing_fast import FastImageProcessor try: diff --git a/src/transformers/models/fast/configuration_fast.py b/src/transformers/models/fast/configuration_fast.py index ee8c27b03a32..3f813386507f 100644 --- a/src/transformers/models/fast/configuration_fast.py +++ b/src/transformers/models/fast/configuration_fast.py @@ -1,7 +1,36 @@ +# coding=utf-8 +# Copyright The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" Fast model configuration""" from transformers import PretrainedConfig +from transformers.utils import logging + + +logger = logging.get_logger(__name__) + +FAST_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "fast_base_tt_800_finetune_ic17mlt": ( + "https://huggingface.co/Raghavan/fast_base_tt_800_finetune_ic17mlt/raw/main/config.json" + ), +} class FastConfig(PretrainedConfig): + r""" + [Raghavan/fast_base_tt_800_finetune_ic17mlt](https://huggingface.co/Raghavan/fast_base_tt_800_finetune_ic17mlt) + """ + def __init__( self, backbone_kernel_size=3, From f85fbda8f71bb1726a743cee058b6e5d8fb27ba8 Mon Sep 17 00:00:00 2001 From: raghavanone Date: Wed, 1 Nov 2023 10:00:22 +0530 Subject: [PATCH 018/152] Fix build --- .../models/fast/image_processing_fast.py | 166 +++++++++--------- 1 file changed, 82 insertions(+), 84 deletions(-) diff --git a/src/transformers/models/fast/image_processing_fast.py b/src/transformers/models/fast/image_processing_fast.py index 1f160810c315..320d3cb6cb47 100644 --- a/src/transformers/models/fast/image_processing_fast.py +++ b/src/transformers/models/fast/image_processing_fast.py @@ -12,19 +12,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Image processor class for Beit.""" +"""Image processor class for Fast.""" import math import warnings from typing import Any, Dict, List, Optional, Union from ...utils.import_utils import is_cv2_available - if is_cv2_available(): import cv2 import numpy as np -import torch.nn as nn -import torch.nn.functional as F from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict from ...image_transforms import resize, to_channel_dimension_format @@ -47,12 +44,13 @@ logging, ) - if is_vision_available(): import PIL if is_torch_available(): import torch + import torch.nn as nn + import torch.nn.functional as F logger = logging.get_logger(__name__) @@ -107,23 +105,23 @@ class FastImageProcessor(BaseImageProcessor): model_input_names = ["pixel_values"] def __init__( - self, - do_resize: bool = True, - size: Dict[str, int] = None, - resample: PILImageResampling = PILImageResampling.BICUBIC, - do_center_crop: bool = False, - crop_size: Dict[str, int] = None, - rescale_factor: Union[int, float] = 1 / 255, - do_rescale: bool = True, - do_normalize: bool = True, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - do_reduce_labels: bool = False, - min_area: int = 10, - min_score: float = 0.88, - bbox_type: str = "rect", - pooling_size: int = 9, - **kwargs, + self, + do_resize: bool = True, + size: Dict[str, int] = None, + resample: PILImageResampling = PILImageResampling.BICUBIC, + do_center_crop: bool = False, + crop_size: Dict[str, int] = None, + rescale_factor: Union[int, float] = 1 / 255, + do_rescale: bool = True, + do_normalize: bool = True, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_reduce_labels: bool = False, + min_area: int = 10, + min_score: float = 0.88, + bbox_type: str = "rect", + pooling_size: int = 9, + **kwargs, ) -> None: if "reduce_labels" in kwargs: warnings.warn( @@ -165,13 +163,13 @@ def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs): return super().from_dict(image_processor_dict, **kwargs) def resize( - self, - image: np.ndarray, - size: Dict[str, int], - resample: PILImageResampling = PILImageResampling.BICUBIC, 
- data_format: Optional[Union[str, ChannelDimension]] = None, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs, + self, + image: np.ndarray, + size: Dict[str, int], + resample: PILImageResampling = PILImageResampling.BICUBIC, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, ) -> np.ndarray: """ Resize an image to (size["height"], size["width"]). @@ -209,20 +207,20 @@ def reduce_label(self, label: ImageInput) -> np.ndarray: return label def _preprocess( - self, - image: ImageInput, - do_reduce_labels: bool = None, - do_resize: bool = None, - size: Dict[str, int] = None, - resample: PILImageResampling = None, - do_center_crop: bool = None, - crop_size: Dict[str, int] = None, - do_rescale: bool = None, - rescale_factor: float = None, - do_normalize: bool = None, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - input_data_format: Optional[Union[str, ChannelDimension]] = None, + self, + image: ImageInput, + do_reduce_labels: bool = None, + do_resize: bool = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + do_center_crop: bool = None, + crop_size: Dict[str, int] = None, + do_rescale: bool = None, + rescale_factor: float = None, + do_normalize: bool = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, ): if do_reduce_labels: image = self.reduce_label(image) @@ -242,20 +240,20 @@ def _preprocess( return image def _preprocess_image( - self, - image: ImageInput, - do_resize: bool = None, - size: Dict[str, int] = None, - resample: PILImageResampling = None, - do_center_crop: bool = None, - crop_size: Dict[str, int] = None, - do_rescale: bool = None, - rescale_factor: float = None, - do_normalize: bool = None, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - data_format: Optional[Union[str, ChannelDimension]] = None, - input_data_format: Optional[Union[str, ChannelDimension]] = None, + self, + image: ImageInput, + do_resize: bool = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + do_center_crop: bool = None, + crop_size: Dict[str, int] = None, + do_rescale: bool = None, + rescale_factor: float = None, + do_normalize: bool = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, ) -> np.ndarray: """Preprocesses a single image.""" # All transformations expect numpy arrays. 
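The "All transformations expect numpy arrays" comment above is the contract that makes the `input_data_format` argument necessary: once the input has been converted to a NumPy array, the processor has to know (or guess) whether it is channels-first or channels-last before resizing and normalizing. A rough, standalone stand-in for that inference step; the library's own utility covers more cases, so this helper is only illustrative:

import numpy as np

def guess_channel_dimension(image: np.ndarray) -> str:
    # Heuristic: a leading axis of size 1 or 3 is taken to be the channel axis
    # (channels-first); otherwise assume the PIL-style channels-last layout.
    if image.ndim != 3:
        raise ValueError(f"expected a 3D image array, got shape {image.shape}")
    return "channels_first" if image.shape[0] in (1, 3) else "channels_last"

print(guess_channel_dimension(np.zeros((3, 640, 640))))  # channels_first
print(guess_channel_dimension(np.zeros((640, 640, 3))))  # channels_last
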
@@ -287,15 +285,15 @@ def _preprocess_image( return image def _preprocess_segmentation_map( - self, - segmentation_map: ImageInput, - do_resize: bool = None, - size: Dict[str, int] = None, - resample: PILImageResampling = None, - do_center_crop: bool = None, - crop_size: Dict[str, int] = None, - do_reduce_labels: bool = None, - input_data_format: Optional[Union[str, ChannelDimension]] = None, + self, + segmentation_map: ImageInput, + do_resize: bool = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + do_center_crop: bool = None, + crop_size: Dict[str, int] = None, + do_reduce_labels: bool = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, ): """Preprocesses a single segmentation map.""" # All transformations expect numpy arrays. @@ -333,24 +331,24 @@ def __call__(self, images, segmentation_maps=None, **kwargs): return super().__call__(images, segmentation_maps=segmentation_maps, **kwargs) def preprocess( - self, - images: ImageInput, - segmentation_maps: Optional[ImageInput] = None, - do_resize: bool = None, - size: Dict[str, int] = None, - resample: PILImageResampling = None, - do_center_crop: bool = None, - crop_size: Dict[str, int] = None, - do_rescale: bool = None, - rescale_factor: float = None, - do_normalize: bool = None, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - do_reduce_labels: Optional[bool] = None, - return_tensors: Optional[Union[str, TensorType]] = None, - data_format: ChannelDimension = ChannelDimension.FIRST, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs, + self, + images: ImageInput, + segmentation_maps: Optional[ImageInput] = None, + do_resize: bool = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + do_center_crop: bool = None, + crop_size: Dict[str, int] = None, + do_rescale: bool = None, + rescale_factor: float = None, + do_normalize: bool = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_reduce_labels: Optional[bool] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: ChannelDimension = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, ) -> PIL.Image.Image: """ Preprocess an image or batch of images. 
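Taken together, the pieces touched so far (configuration, image processor, model) are meant to be driven like any other vision checkpoint in the library. A usage sketch, assuming the `Raghavan/fast_base_tt_800_finetune_ic17mlt` checkpoint named in the configuration archive map ships processor and model weights compatible with this in-progress code:

import torch
from PIL import Image
from transformers import FASTForImageCaptioning, FastImageProcessor

checkpoint = "Raghavan/fast_base_tt_800_finetune_ic17mlt"  # from FAST_PRETRAINED_CONFIG_ARCHIVE_MAP
processor = FastImageProcessor.from_pretrained(checkpoint)
model = FASTForImageCaptioning.from_pretrained(checkpoint)
model.eval()

image = Image.new("RGB", (640, 640))           # stand-in for a real document photo
inputs = processor(images=image, return_tensors="pt")

with torch.no_grad():
    outputs = model(pixel_values=inputs["pixel_values"])

# FASTForImageCaptioningOutput carries `loss` (None at inference) and `hidden_states`.
print(outputs.hidden_states.shape)
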
From cd0b45f670865208eedeab366d843768447f1dbc Mon Sep 17 00:00:00 2001 From: raghavanone Date: Wed, 1 Nov 2023 10:22:36 +0530 Subject: [PATCH 019/152] Fix Build --- .../models/fast/image_processing_fast.py | 160 +++++++++--------- 1 file changed, 81 insertions(+), 79 deletions(-) diff --git a/src/transformers/models/fast/image_processing_fast.py b/src/transformers/models/fast/image_processing_fast.py index 320d3cb6cb47..a9ae06694fd6 100644 --- a/src/transformers/models/fast/image_processing_fast.py +++ b/src/transformers/models/fast/image_processing_fast.py @@ -19,6 +19,7 @@ from ...utils.import_utils import is_cv2_available + if is_cv2_available(): import cv2 import numpy as np @@ -44,6 +45,7 @@ logging, ) + if is_vision_available(): import PIL @@ -105,23 +107,23 @@ class FastImageProcessor(BaseImageProcessor): model_input_names = ["pixel_values"] def __init__( - self, - do_resize: bool = True, - size: Dict[str, int] = None, - resample: PILImageResampling = PILImageResampling.BICUBIC, - do_center_crop: bool = False, - crop_size: Dict[str, int] = None, - rescale_factor: Union[int, float] = 1 / 255, - do_rescale: bool = True, - do_normalize: bool = True, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - do_reduce_labels: bool = False, - min_area: int = 10, - min_score: float = 0.88, - bbox_type: str = "rect", - pooling_size: int = 9, - **kwargs, + self, + do_resize: bool = True, + size: Dict[str, int] = None, + resample: PILImageResampling = PILImageResampling.BICUBIC, + do_center_crop: bool = False, + crop_size: Dict[str, int] = None, + rescale_factor: Union[int, float] = 1 / 255, + do_rescale: bool = True, + do_normalize: bool = True, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_reduce_labels: bool = False, + min_area: int = 10, + min_score: float = 0.88, + bbox_type: str = "rect", + pooling_size: int = 9, + **kwargs, ) -> None: if "reduce_labels" in kwargs: warnings.warn( @@ -163,13 +165,13 @@ def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs): return super().from_dict(image_processor_dict, **kwargs) def resize( - self, - image: np.ndarray, - size: Dict[str, int], - resample: PILImageResampling = PILImageResampling.BICUBIC, - data_format: Optional[Union[str, ChannelDimension]] = None, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs, + self, + image: np.ndarray, + size: Dict[str, int], + resample: PILImageResampling = PILImageResampling.BICUBIC, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, ) -> np.ndarray: """ Resize an image to (size["height"], size["width"]). 
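For reference, the `size` dictionary documented above is unpacked into a plain `(height, width)` tuple before the shared `resize` helper imported from `...image_transforms` at the top of this file is called; a small sketch of that call with the processor's default 640x640 size, shown only as an approximation of what the method does internally:

import numpy as np
from transformers.image_transforms import resize
from transformers.image_utils import PILImageResampling

size = {"height": 640, "width": 640}  # FastImageProcessor default
image = np.zeros((480, 800, 3), dtype=np.uint8)

resized = resize(image, size=(size["height"], size["width"]), resample=PILImageResampling.BICUBIC)
print(resized.shape)  # (640, 640, 3)
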
@@ -207,20 +209,20 @@ def reduce_label(self, label: ImageInput) -> np.ndarray: return label def _preprocess( - self, - image: ImageInput, - do_reduce_labels: bool = None, - do_resize: bool = None, - size: Dict[str, int] = None, - resample: PILImageResampling = None, - do_center_crop: bool = None, - crop_size: Dict[str, int] = None, - do_rescale: bool = None, - rescale_factor: float = None, - do_normalize: bool = None, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - input_data_format: Optional[Union[str, ChannelDimension]] = None, + self, + image: ImageInput, + do_reduce_labels: bool = None, + do_resize: bool = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + do_center_crop: bool = None, + crop_size: Dict[str, int] = None, + do_rescale: bool = None, + rescale_factor: float = None, + do_normalize: bool = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, ): if do_reduce_labels: image = self.reduce_label(image) @@ -240,20 +242,20 @@ def _preprocess( return image def _preprocess_image( - self, - image: ImageInput, - do_resize: bool = None, - size: Dict[str, int] = None, - resample: PILImageResampling = None, - do_center_crop: bool = None, - crop_size: Dict[str, int] = None, - do_rescale: bool = None, - rescale_factor: float = None, - do_normalize: bool = None, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - data_format: Optional[Union[str, ChannelDimension]] = None, - input_data_format: Optional[Union[str, ChannelDimension]] = None, + self, + image: ImageInput, + do_resize: bool = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + do_center_crop: bool = None, + crop_size: Dict[str, int] = None, + do_rescale: bool = None, + rescale_factor: float = None, + do_normalize: bool = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, ) -> np.ndarray: """Preprocesses a single image.""" # All transformations expect numpy arrays. @@ -285,15 +287,15 @@ def _preprocess_image( return image def _preprocess_segmentation_map( - self, - segmentation_map: ImageInput, - do_resize: bool = None, - size: Dict[str, int] = None, - resample: PILImageResampling = None, - do_center_crop: bool = None, - crop_size: Dict[str, int] = None, - do_reduce_labels: bool = None, - input_data_format: Optional[Union[str, ChannelDimension]] = None, + self, + segmentation_map: ImageInput, + do_resize: bool = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + do_center_crop: bool = None, + crop_size: Dict[str, int] = None, + do_reduce_labels: bool = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, ): """Preprocesses a single segmentation map.""" # All transformations expect numpy arrays. 
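Worth noting while these signatures are being reshuffled: the `pooling_size` argument (default 9) carried by this class feeds a stride-1 max pool in the post-processing path, and the `(pooling_size - 1) // 2` padding is exactly what keeps the score map's spatial size unchanged while growing each detected text kernel. A small sketch of that effect:

import torch
import torch.nn as nn

pooling_size = 9
pool = nn.MaxPool2d(kernel_size=pooling_size, stride=1, padding=(pooling_size - 1) // 2)

score_map = torch.zeros(1, 1, 160, 160)
score_map[0, 0, 80, 80] = 1.0          # a single high-confidence "text" pixel

dilated = pool(score_map)
print(dilated.shape)                   # torch.Size([1, 1, 160, 160]) -- spatial size preserved
print(int(dilated.sum().item()))       # 81 -- the peak now covers a 9x9 neighbourhood
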
@@ -331,24 +333,24 @@ def __call__(self, images, segmentation_maps=None, **kwargs): return super().__call__(images, segmentation_maps=segmentation_maps, **kwargs) def preprocess( - self, - images: ImageInput, - segmentation_maps: Optional[ImageInput] = None, - do_resize: bool = None, - size: Dict[str, int] = None, - resample: PILImageResampling = None, - do_center_crop: bool = None, - crop_size: Dict[str, int] = None, - do_rescale: bool = None, - rescale_factor: float = None, - do_normalize: bool = None, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - do_reduce_labels: Optional[bool] = None, - return_tensors: Optional[Union[str, TensorType]] = None, - data_format: ChannelDimension = ChannelDimension.FIRST, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs, + self, + images: ImageInput, + segmentation_maps: Optional[ImageInput] = None, + do_resize: bool = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + do_center_crop: bool = None, + crop_size: Dict[str, int] = None, + do_rescale: bool = None, + rescale_factor: float = None, + do_normalize: bool = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_reduce_labels: Optional[bool] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: ChannelDimension = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, ) -> PIL.Image.Image: """ Preprocess an image or batch of images. From 6005f2febcfb14c038b2e47914ef37e7588b10bd Mon Sep 17 00:00:00 2001 From: raghavanone Date: Wed, 1 Nov 2023 15:54:52 +0530 Subject: [PATCH 020/152] Clean up docstrings --- .../models/fast/image_processing_fast.py | 15 +-- src/transformers/models/fast/modeling_fast.py | 106 ++++++++---------- 2 files changed, 48 insertions(+), 73 deletions(-) diff --git a/src/transformers/models/fast/image_processing_fast.py b/src/transformers/models/fast/image_processing_fast.py index a9ae06694fd6..f950f4bca2fa 100644 --- a/src/transformers/models/fast/image_processing_fast.py +++ b/src/transformers/models/fast/image_processing_fast.py @@ -93,11 +93,6 @@ class FastImageProcessor(BaseImageProcessor): image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`): The standard deviation to use if normalizing the image. This is a float or list of floats of length of the number of channels of the image. Can be overridden by the `image_std` parameter in the `preprocess` method. - do_reduce_labels (`bool`, *optional*, defaults to `False`): - Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0 is - used for background, and background itself is not included in all classes of a dataset (e.g. ADE20k). The - background label will be replaced by 255. Can be overridden by the `do_reduce_labels` parameter in the - `preprocess` method. 
min_area (`int`, *optional*, defaults to 10): min_score (`float`, *optional*, defaults to 0.88): bbox_type (`str`, *optional*, defaults to `"rect"`): @@ -118,20 +113,13 @@ def __init__( do_normalize: bool = True, image_mean: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None, - do_reduce_labels: bool = False, min_area: int = 10, min_score: float = 0.88, bbox_type: str = "rect", pooling_size: int = 9, **kwargs, ) -> None: - if "reduce_labels" in kwargs: - warnings.warn( - "The `reduce_labels` parameter is deprecated and will be removed in a future version. Please use" - " `do_reduce_labels` instead.", - FutureWarning, - ) - do_reduce_labels = kwargs.pop("reduce_labels") + super().__init__(**kwargs) size = size if size is not None else {"height": 640, "width": 640} size = get_size_dict(size) @@ -147,7 +135,6 @@ def __init__( self.do_normalize = do_normalize self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD - self.do_reduce_labels = do_reduce_labels self.min_area = min_area self.min_score = min_score self.bbox_type = bbox_type diff --git a/src/transformers/models/fast/modeling_fast.py b/src/transformers/models/fast/modeling_fast.py index 7820fd24cdd3..4d916690bce2 100644 --- a/src/transformers/models/fast/modeling_fast.py +++ b/src/transformers/models/fast/modeling_fast.py @@ -9,24 +9,11 @@ from transformers import FastConfig, PreTrainedModel, add_start_docstrings from transformers.utils import ModelOutput - FAST_FOR_CAPTIONING_INPUTS_DOCSTRING = r""" Args: - input_ids (`torch.LongTensor` of shape `({0})`): - Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See - [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input - IDs?](../glossary#input-ids) pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`BeitImageProcessor.__call__`] for details. - language_masked_pos (`torch.LongTensor` of shape `({0})`): - language_masked_pos for denoting tokens for captioning - - 1 indicates the token is **Present**, - - 0 indicates the token is **absent**. - text_len (`torch.LongTensor` of shape `({0})`): - Length of text for captioning - past_key_value (`Dict`): - A Dictionary containing the incremental states layerwise output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail. 
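After this trim the forward contract is small: `pixel_values`, plus optional `output_hidden_states`, `return_dict` and a `labels` dictionary for training. A smoke-test sketch against randomly initialized weights, on the assumption that the default `FastConfig` is already complete enough at this point in the series to build the model:

import torch
from transformers import FASTForImageCaptioning, FastConfig

config = FastConfig()                      # default config, random weights -- no download needed
model = FASTForImageCaptioning(config)
model.eval()

pixel_values = torch.rand(1, 3, 640, 640)
with torch.no_grad():
    outputs = model(pixel_values=pixel_values)

# Without `labels`, the loss field of the returned FASTForImageCaptioningOutput should be None.
print(outputs.loss, outputs.hidden_states.shape)
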
@@ -66,7 +53,7 @@ def build_activation(act_func, inplace=True): class My2DLayer(nn.Module): def __init__( - self, in_channels, out_channels, use_bn=True, act_func="relu", dropout_rate=0, ops_order="weight_bn_act" + self, in_channels, out_channels, use_bn=True, act_func="relu", dropout_rate=0, ops_order="weight_bn_act" ): super(My2DLayer, self).__init__() self.in_channels = in_channels @@ -163,19 +150,19 @@ def is_zero_layer(): class ConvLayer(nn.Module): def __init__( - self, - in_channels, - out_channels, - kernel_size=3, - stride=1, - dilation=1, - groups=1, - bias=False, - has_shuffle=False, - use_bn=True, - act_func="relu", - dropout_rate=0, - use_act=True, + self, + in_channels, + out_channels, + kernel_size=3, + stride=1, + dilation=1, + groups=1, + bias=False, + has_shuffle=False, + use_bn=True, + act_func="relu", + dropout_rate=0, + use_act=True, ): super().__init__() @@ -449,48 +436,48 @@ def __init__(self, config): # self.first_conv.apply(self._init_weights) stage1 = [] for stage_config in zip( - config.backbone_stage1_in_channels, - config.backbone_stage1_out_channels, - config.backbone_stage1_kernel_size, - config.backbone_stage1_stride, - config.backbone_stage1_dilation, - config.backbone_stage1_groups, + config.backbone_stage1_in_channels, + config.backbone_stage1_out_channels, + config.backbone_stage1_kernel_size, + config.backbone_stage1_stride, + config.backbone_stage1_dilation, + config.backbone_stage1_groups, ): stage1.append(RepConvLayer(*stage_config)) self.stage1 = nn.ModuleList(stage1) stage2 = [] for stage_config in zip( - config.backbone_stage2_in_channels, - config.backbone_stage2_out_channels, - config.backbone_stage2_kernel_size, - config.backbone_stage2_stride, - config.backbone_stage2_dilation, - config.backbone_stage2_groups, + config.backbone_stage2_in_channels, + config.backbone_stage2_out_channels, + config.backbone_stage2_kernel_size, + config.backbone_stage2_stride, + config.backbone_stage2_dilation, + config.backbone_stage2_groups, ): stage2.append(RepConvLayer(*stage_config)) self.stage2 = nn.ModuleList(stage2) stage3 = [] for stage_config in zip( - config.backbone_stage3_in_channels, - config.backbone_stage3_out_channels, - config.backbone_stage3_kernel_size, - config.backbone_stage3_stride, - config.backbone_stage3_dilation, - config.backbone_stage3_groups, + config.backbone_stage3_in_channels, + config.backbone_stage3_out_channels, + config.backbone_stage3_kernel_size, + config.backbone_stage3_stride, + config.backbone_stage3_dilation, + config.backbone_stage3_groups, ): stage3.append(RepConvLayer(*stage_config)) self.stage3 = nn.ModuleList(stage3) stage4 = [] for stage_config in zip( - config.backbone_stage4_in_channels, - config.backbone_stage4_out_channels, - config.backbone_stage4_kernel_size, - config.backbone_stage4_stride, - config.backbone_stage4_dilation, - config.backbone_stage4_groups, + config.backbone_stage4_in_channels, + config.backbone_stage4_out_channels, + config.backbone_stage4_kernel_size, + config.backbone_stage4_stride, + config.backbone_stage4_dilation, + config.backbone_stage4_groups, ): stage4.append(RepConvLayer(*stage_config)) self.stage4 = nn.ModuleList(stage4) @@ -722,7 +709,8 @@ def _max_pooling(self, x, scale=1): def emb_loss( - emb, instance, kernel, training_mask, feature_dim=4, delta_v=0.5, delta_d=1.5, weights=(1.0, 1.0), bg_sample=False + emb, instance, kernel, training_mask, feature_dim=4, delta_v=0.5, delta_d=1.5, weights=(1.0, 1.0), + bg_sample=False ): training_mask = (training_mask > 0.5).long() kernel = 
(kernel > 0.5).long() @@ -749,7 +737,7 @@ def emb_loss( continue ind = instance == lb emb_ = emb[:, ind] - dist = (emb_ - emb_mean[:, i : i + 1]).norm(p=2, dim=0) + dist = (emb_ - emb_mean[:, i: i + 1]).norm(p=2, dim=0) dist = F.relu(dist - delta_v) ** 2 l_agg[i] = torch.mean(torch.log(dist + 1.0)) l_agg = torch.mean(l_agg[1:]) @@ -781,7 +769,7 @@ def emb_loss( for i, lb in enumerate(unique_labels): if lb == 0: continue - dist = (emb_bg - emb_mean[:, i : i + 1]).norm(p=2, dim=0) + dist = (emb_bg - emb_mean[:, i: i + 1]).norm(p=2, dim=0) dist = F.relu(2 * delta_d - dist) ** 2 l_dis_bg = torch.mean(torch.log(dist + 1.0), 0, keepdim=True) l_dis.append(l_dis_bg) @@ -976,11 +964,11 @@ def loss(self, hidden, labels): return torch.mean(loss_text) + torch.mean(loss_kernel) + torch.mean(loss_emb) def forward( - self, - pixel_values: torch.FloatTensor, - output_hidden_states: Optional[bool] = True, - return_dict: Optional[bool] = None, - labels: Dict = None, + self, + pixel_values: torch.FloatTensor, + output_hidden_states: Optional[bool] = True, + return_dict: Optional[bool] = None, + labels: Dict = None, ): # outputs = {} return_dict = return_dict if return_dict is not None else self.config.use_return_dict From e56bff7e4696b3037cf35a24d9b5dd97c33b2195 Mon Sep 17 00:00:00 2001 From: raghavanone Date: Wed, 1 Nov 2023 16:49:32 +0530 Subject: [PATCH 021/152] Fix Build --- .../models/fast/image_processing_fast.py | 12 +-- src/transformers/models/fast/modeling_fast.py | 94 +++++++++---------- 2 files changed, 52 insertions(+), 54 deletions(-) diff --git a/src/transformers/models/fast/image_processing_fast.py b/src/transformers/models/fast/image_processing_fast.py index f950f4bca2fa..ff46ca02b012 100644 --- a/src/transformers/models/fast/image_processing_fast.py +++ b/src/transformers/models/fast/image_processing_fast.py @@ -14,7 +14,6 @@ # limitations under the License. """Image processor class for Fast.""" import math -import warnings from typing import Any, Dict, List, Optional, Union from ...utils.import_utils import is_cv2_available @@ -93,10 +92,10 @@ class FastImageProcessor(BaseImageProcessor): image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`): The standard deviation to use if normalizing the image. This is a float or list of floats of length of the number of channels of the image. Can be overridden by the `image_std` parameter in the `preprocess` method. 
- min_area (`int`, *optional*, defaults to 10): - min_score (`float`, *optional*, defaults to 0.88): - bbox_type (`str`, *optional*, defaults to `"rect"`): - pooling_size (`int`, *optional*, defaults to 9): + min_area (`int`, *optional*, defaults to 200): Threshold for min area for results + min_score (`float`, *optional*, defaults to 0.88): Threshold for min score for results + bbox_type (`str`, *optional*, defaults to `"rect"`): Type of bbox, rect or poly + pooling_size (`int`, *optional*, defaults to 9): Pooling size for text detection """ model_input_names = ["pixel_values"] @@ -113,13 +112,12 @@ def __init__( do_normalize: bool = True, image_mean: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None, - min_area: int = 10, + min_area: int = 200, min_score: float = 0.88, bbox_type: str = "rect", pooling_size: int = 9, **kwargs, ) -> None: - super().__init__(**kwargs) size = size if size is not None else {"height": 640, "width": 640} size = get_size_dict(size) diff --git a/src/transformers/models/fast/modeling_fast.py b/src/transformers/models/fast/modeling_fast.py index 4d916690bce2..e7590614eade 100644 --- a/src/transformers/models/fast/modeling_fast.py +++ b/src/transformers/models/fast/modeling_fast.py @@ -9,6 +9,7 @@ from transformers import FastConfig, PreTrainedModel, add_start_docstrings from transformers.utils import ModelOutput + FAST_FOR_CAPTIONING_INPUTS_DOCSTRING = r""" Args: pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): @@ -53,7 +54,7 @@ def build_activation(act_func, inplace=True): class My2DLayer(nn.Module): def __init__( - self, in_channels, out_channels, use_bn=True, act_func="relu", dropout_rate=0, ops_order="weight_bn_act" + self, in_channels, out_channels, use_bn=True, act_func="relu", dropout_rate=0, ops_order="weight_bn_act" ): super(My2DLayer, self).__init__() self.in_channels = in_channels @@ -150,19 +151,19 @@ def is_zero_layer(): class ConvLayer(nn.Module): def __init__( - self, - in_channels, - out_channels, - kernel_size=3, - stride=1, - dilation=1, - groups=1, - bias=False, - has_shuffle=False, - use_bn=True, - act_func="relu", - dropout_rate=0, - use_act=True, + self, + in_channels, + out_channels, + kernel_size=3, + stride=1, + dilation=1, + groups=1, + bias=False, + has_shuffle=False, + use_bn=True, + act_func="relu", + dropout_rate=0, + use_act=True, ): super().__init__() @@ -436,48 +437,48 @@ def __init__(self, config): # self.first_conv.apply(self._init_weights) stage1 = [] for stage_config in zip( - config.backbone_stage1_in_channels, - config.backbone_stage1_out_channels, - config.backbone_stage1_kernel_size, - config.backbone_stage1_stride, - config.backbone_stage1_dilation, - config.backbone_stage1_groups, + config.backbone_stage1_in_channels, + config.backbone_stage1_out_channels, + config.backbone_stage1_kernel_size, + config.backbone_stage1_stride, + config.backbone_stage1_dilation, + config.backbone_stage1_groups, ): stage1.append(RepConvLayer(*stage_config)) self.stage1 = nn.ModuleList(stage1) stage2 = [] for stage_config in zip( - config.backbone_stage2_in_channels, - config.backbone_stage2_out_channels, - config.backbone_stage2_kernel_size, - config.backbone_stage2_stride, - config.backbone_stage2_dilation, - config.backbone_stage2_groups, + config.backbone_stage2_in_channels, + config.backbone_stage2_out_channels, + config.backbone_stage2_kernel_size, + config.backbone_stage2_stride, + config.backbone_stage2_dilation, + config.backbone_stage2_groups, ): 
stage2.append(RepConvLayer(*stage_config)) self.stage2 = nn.ModuleList(stage2) stage3 = [] for stage_config in zip( - config.backbone_stage3_in_channels, - config.backbone_stage3_out_channels, - config.backbone_stage3_kernel_size, - config.backbone_stage3_stride, - config.backbone_stage3_dilation, - config.backbone_stage3_groups, + config.backbone_stage3_in_channels, + config.backbone_stage3_out_channels, + config.backbone_stage3_kernel_size, + config.backbone_stage3_stride, + config.backbone_stage3_dilation, + config.backbone_stage3_groups, ): stage3.append(RepConvLayer(*stage_config)) self.stage3 = nn.ModuleList(stage3) stage4 = [] for stage_config in zip( - config.backbone_stage4_in_channels, - config.backbone_stage4_out_channels, - config.backbone_stage4_kernel_size, - config.backbone_stage4_stride, - config.backbone_stage4_dilation, - config.backbone_stage4_groups, + config.backbone_stage4_in_channels, + config.backbone_stage4_out_channels, + config.backbone_stage4_kernel_size, + config.backbone_stage4_stride, + config.backbone_stage4_dilation, + config.backbone_stage4_groups, ): stage4.append(RepConvLayer(*stage_config)) self.stage4 = nn.ModuleList(stage4) @@ -709,8 +710,7 @@ def _max_pooling(self, x, scale=1): def emb_loss( - emb, instance, kernel, training_mask, feature_dim=4, delta_v=0.5, delta_d=1.5, weights=(1.0, 1.0), - bg_sample=False + emb, instance, kernel, training_mask, feature_dim=4, delta_v=0.5, delta_d=1.5, weights=(1.0, 1.0), bg_sample=False ): training_mask = (training_mask > 0.5).long() kernel = (kernel > 0.5).long() @@ -737,7 +737,7 @@ def emb_loss( continue ind = instance == lb emb_ = emb[:, ind] - dist = (emb_ - emb_mean[:, i: i + 1]).norm(p=2, dim=0) + dist = (emb_ - emb_mean[:, i : i + 1]).norm(p=2, dim=0) dist = F.relu(dist - delta_v) ** 2 l_agg[i] = torch.mean(torch.log(dist + 1.0)) l_agg = torch.mean(l_agg[1:]) @@ -769,7 +769,7 @@ def emb_loss( for i, lb in enumerate(unique_labels): if lb == 0: continue - dist = (emb_bg - emb_mean[:, i: i + 1]).norm(p=2, dim=0) + dist = (emb_bg - emb_mean[:, i : i + 1]).norm(p=2, dim=0) dist = F.relu(2 * delta_d - dist) ** 2 l_dis_bg = torch.mean(torch.log(dist + 1.0), 0, keepdim=True) l_dis.append(l_dis_bg) @@ -964,11 +964,11 @@ def loss(self, hidden, labels): return torch.mean(loss_text) + torch.mean(loss_kernel) + torch.mean(loss_emb) def forward( - self, - pixel_values: torch.FloatTensor, - output_hidden_states: Optional[bool] = True, - return_dict: Optional[bool] = None, - labels: Dict = None, + self, + pixel_values: torch.FloatTensor, + output_hidden_states: Optional[bool] = True, + return_dict: Optional[bool] = None, + labels: Dict = None, ): # outputs = {} return_dict = return_dict if return_dict is not None else self.config.use_return_dict From ac672f309a3333775621ddd4a52d99c4f9f9484c Mon Sep 17 00:00:00 2001 From: raghavanone Date: Wed, 1 Nov 2023 17:03:26 +0530 Subject: [PATCH 022/152] Fix Build --- src/transformers/models/fast/modeling_fast.py | 122 +++++++++--------- 1 file changed, 64 insertions(+), 58 deletions(-) diff --git a/src/transformers/models/fast/modeling_fast.py b/src/transformers/models/fast/modeling_fast.py index e7590614eade..2db6db54e78b 100644 --- a/src/transformers/models/fast/modeling_fast.py +++ b/src/transformers/models/fast/modeling_fast.py @@ -7,7 +7,20 @@ import torch.nn.functional as F from transformers import FastConfig, PreTrainedModel, add_start_docstrings -from transformers.utils import ModelOutput +from transformers.utils import ModelOutput, add_start_docstrings_to_model_forward, 
replace_return_docstrings + +_CONFIG_FOR_DOC = "FastConfig" + +FAST_START_DOCSTRING = r""" + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it + as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`Beit3Config`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" FAST_FOR_CAPTIONING_INPUTS_DOCSTRING = r""" @@ -54,7 +67,7 @@ def build_activation(act_func, inplace=True): class My2DLayer(nn.Module): def __init__( - self, in_channels, out_channels, use_bn=True, act_func="relu", dropout_rate=0, ops_order="weight_bn_act" + self, in_channels, out_channels, use_bn=True, act_func="relu", dropout_rate=0, ops_order="weight_bn_act" ): super(My2DLayer, self).__init__() self.in_channels = in_channels @@ -151,19 +164,19 @@ def is_zero_layer(): class ConvLayer(nn.Module): def __init__( - self, - in_channels, - out_channels, - kernel_size=3, - stride=1, - dilation=1, - groups=1, - bias=False, - has_shuffle=False, - use_bn=True, - act_func="relu", - dropout_rate=0, - use_act=True, + self, + in_channels, + out_channels, + kernel_size=3, + stride=1, + dilation=1, + groups=1, + bias=False, + has_shuffle=False, + use_bn=True, + act_func="relu", + dropout_rate=0, + use_act=True, ): super().__init__() @@ -437,62 +450,52 @@ def __init__(self, config): # self.first_conv.apply(self._init_weights) stage1 = [] for stage_config in zip( - config.backbone_stage1_in_channels, - config.backbone_stage1_out_channels, - config.backbone_stage1_kernel_size, - config.backbone_stage1_stride, - config.backbone_stage1_dilation, - config.backbone_stage1_groups, + config.backbone_stage1_in_channels, + config.backbone_stage1_out_channels, + config.backbone_stage1_kernel_size, + config.backbone_stage1_stride, + config.backbone_stage1_dilation, + config.backbone_stage1_groups, ): stage1.append(RepConvLayer(*stage_config)) self.stage1 = nn.ModuleList(stage1) stage2 = [] for stage_config in zip( - config.backbone_stage2_in_channels, - config.backbone_stage2_out_channels, - config.backbone_stage2_kernel_size, - config.backbone_stage2_stride, - config.backbone_stage2_dilation, - config.backbone_stage2_groups, + config.backbone_stage2_in_channels, + config.backbone_stage2_out_channels, + config.backbone_stage2_kernel_size, + config.backbone_stage2_stride, + config.backbone_stage2_dilation, + config.backbone_stage2_groups, ): stage2.append(RepConvLayer(*stage_config)) self.stage2 = nn.ModuleList(stage2) stage3 = [] for stage_config in zip( - config.backbone_stage3_in_channels, - config.backbone_stage3_out_channels, - config.backbone_stage3_kernel_size, - config.backbone_stage3_stride, - config.backbone_stage3_dilation, - config.backbone_stage3_groups, + config.backbone_stage3_in_channels, + config.backbone_stage3_out_channels, + config.backbone_stage3_kernel_size, + config.backbone_stage3_stride, + config.backbone_stage3_dilation, + config.backbone_stage3_groups, ): stage3.append(RepConvLayer(*stage_config)) self.stage3 = nn.ModuleList(stage3) stage4 = [] for stage_config in zip( - config.backbone_stage4_in_channels, - config.backbone_stage4_out_channels, - config.backbone_stage4_kernel_size, - config.backbone_stage4_stride, - config.backbone_stage4_dilation, - 
config.backbone_stage4_groups, + config.backbone_stage4_in_channels, + config.backbone_stage4_out_channels, + config.backbone_stage4_kernel_size, + config.backbone_stage4_stride, + config.backbone_stage4_dilation, + config.backbone_stage4_groups, ): stage4.append(RepConvLayer(*stage_config)) self.stage4 = nn.ModuleList(stage4) - # self._initialize_weights() - # - # def _initialize_weights(self): - # for m in self.modules(): - # if isinstance(m, nn.Conv2d): - # nn.init.kaiming_normal_(m.weight) - # elif isinstance(m, nn.BatchNorm2d): - # m.weight.data.fill_(1) - # m.bias.data.zero_() - def forward(self, x): x = self.first_conv(x) output = [] @@ -710,7 +713,8 @@ def _max_pooling(self, x, scale=1): def emb_loss( - emb, instance, kernel, training_mask, feature_dim=4, delta_v=0.5, delta_d=1.5, weights=(1.0, 1.0), bg_sample=False + emb, instance, kernel, training_mask, feature_dim=4, delta_v=0.5, delta_d=1.5, weights=(1.0, 1.0), + bg_sample=False ): training_mask = (training_mask > 0.5).long() kernel = (kernel > 0.5).long() @@ -737,7 +741,7 @@ def emb_loss( continue ind = instance == lb emb_ = emb[:, ind] - dist = (emb_ - emb_mean[:, i : i + 1]).norm(p=2, dim=0) + dist = (emb_ - emb_mean[:, i: i + 1]).norm(p=2, dim=0) dist = F.relu(dist - delta_v) ** 2 l_agg[i] = torch.mean(torch.log(dist + 1.0)) l_agg = torch.mean(l_agg[1:]) @@ -769,7 +773,7 @@ def emb_loss( for i, lb in enumerate(unique_labels): if lb == 0: continue - dist = (emb_bg - emb_mean[:, i : i + 1]).norm(p=2, dim=0) + dist = (emb_bg - emb_mean[:, i: i + 1]).norm(p=2, dim=0) dist = F.relu(2 * delta_d - dist) ** 2 l_dis_bg = torch.mean(torch.log(dist + 1.0), 0, keepdim=True) l_dis.append(l_dis_bg) @@ -913,7 +917,7 @@ class FASTForImageCaptioningOutput(ModelOutput): utilizes [Multiway transformers] (https://arxiv.org/abs/2208.10442) for deep fusion and modality-specific encoding, and unifies masked modeling on images, texts, and image-text pairs, achieving top performance on multiple benchmarks.""", - FAST_FOR_CAPTIONING_INPUTS_DOCSTRING, + FAST_START_DOCSTRING, ) class FASTForImageCaptioning(FastPreTrainedModel): def __init__(self, config): @@ -963,12 +967,14 @@ def loss(self, hidden, labels): return torch.mean(loss_text) + torch.mean(loss_kernel) + torch.mean(loss_emb) + @add_start_docstrings_to_model_forward(FAST_FOR_CAPTIONING_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=FASTForImageCaptioningOutput, config_class=_CONFIG_FOR_DOC) def forward( - self, - pixel_values: torch.FloatTensor, - output_hidden_states: Optional[bool] = True, - return_dict: Optional[bool] = None, - labels: Dict = None, + self, + pixel_values: torch.FloatTensor, + output_hidden_states: Optional[bool] = True, + return_dict: Optional[bool] = None, + labels: Dict = None, ): # outputs = {} return_dict = return_dict if return_dict is not None else self.config.use_return_dict From aa1cc417d3b1361bb37239b94792ed70b4f0c924 Mon Sep 17 00:00:00 2001 From: raghavanone Date: Wed, 1 Nov 2023 17:33:44 +0530 Subject: [PATCH 023/152] Fix Build --- src/transformers/models/fast/modeling_fast.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/fast/modeling_fast.py b/src/transformers/models/fast/modeling_fast.py index 2db6db54e78b..86e6210dab82 100644 --- a/src/transformers/models/fast/modeling_fast.py +++ b/src/transformers/models/fast/modeling_fast.py @@ -22,7 +22,6 @@ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
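The embedding term that `loss()` above adds to `loss_text` and `loss_kernel` follows a pull/push formulation: pixel embeddings are pulled to within `delta_v` of their own instance mean, while the means of different text instances are pushed at least `2 * delta_d` apart, both wrapped in `log(1 + d**2)` as in the hunks above. A minimal sketch of those two terms on toy tensors; the shapes and thresholds follow the `emb_loss` signature, everything else here is made up for illustration:

```python
# Toy sketch of the pull/push terms in `emb_loss`; the instance map and embeddings
# are synthetic, only delta_v / delta_d / feature_dim come from the signature above.
import torch
import torch.nn.functional as F

delta_v, delta_d = 0.5, 1.5
emb = torch.randn(4, 8, 8)              # (feature_dim, H, W)
instance = torch.randint(0, 3, (8, 8))  # 0 = background, 1..N = text instances

labels = instance.unique()
means = torch.stack([emb[:, instance == lb].mean(dim=1) for lb in labels], dim=1)

# pull: pixels of an instance should sit within delta_v of that instance's mean
l_agg = []
for i, lb in enumerate(labels):
    if lb == 0:
        continue
    dist = (emb[:, instance == lb] - means[:, i : i + 1]).norm(p=2, dim=0)
    l_agg.append(torch.log(F.relu(dist - delta_v) ** 2 + 1.0).mean())
l_agg = torch.stack(l_agg).mean() if l_agg else emb.new_zeros(())

# push: means of different instances should stay at least 2 * delta_d apart
fg = means[:, labels != 0]
pairwise = (fg.unsqueeze(2) - fg.unsqueeze(1)).norm(p=2, dim=0)
off_diag = ~torch.eye(pairwise.size(0), dtype=torch.bool)
l_dis = (
    torch.log(F.relu(2 * delta_d - pairwise[off_diag]) ** 2 + 1.0).mean()
    if off_diag.any()
    else emb.new_zeros(())
)

print(float(l_agg), float(l_dis))
```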
""" - FAST_FOR_CAPTIONING_INPUTS_DOCSTRING = r""" Args: pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): @@ -976,6 +975,14 @@ def forward( return_dict: Optional[bool] = None, labels: Dict = None, ): + r""" + labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*): + Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy). + + Returns: + + """ # outputs = {} return_dict = return_dict if return_dict is not None else self.config.use_return_dict f = self.backbone(pixel_values) From 90e0cd8a8d0764fc80d57d11bf6318fccb0b2cf5 Mon Sep 17 00:00:00 2001 From: raghavanone Date: Wed, 1 Nov 2023 17:50:10 +0530 Subject: [PATCH 024/152] Fix build --- src/transformers/models/fast/modeling_fast.py | 94 +++++++++---------- 1 file changed, 47 insertions(+), 47 deletions(-) diff --git a/src/transformers/models/fast/modeling_fast.py b/src/transformers/models/fast/modeling_fast.py index 86e6210dab82..8f36da8528e4 100644 --- a/src/transformers/models/fast/modeling_fast.py +++ b/src/transformers/models/fast/modeling_fast.py @@ -9,6 +9,7 @@ from transformers import FastConfig, PreTrainedModel, add_start_docstrings from transformers.utils import ModelOutput, add_start_docstrings_to_model_forward, replace_return_docstrings + _CONFIG_FOR_DOC = "FastConfig" FAST_START_DOCSTRING = r""" @@ -66,7 +67,7 @@ def build_activation(act_func, inplace=True): class My2DLayer(nn.Module): def __init__( - self, in_channels, out_channels, use_bn=True, act_func="relu", dropout_rate=0, ops_order="weight_bn_act" + self, in_channels, out_channels, use_bn=True, act_func="relu", dropout_rate=0, ops_order="weight_bn_act" ): super(My2DLayer, self).__init__() self.in_channels = in_channels @@ -163,19 +164,19 @@ def is_zero_layer(): class ConvLayer(nn.Module): def __init__( - self, - in_channels, - out_channels, - kernel_size=3, - stride=1, - dilation=1, - groups=1, - bias=False, - has_shuffle=False, - use_bn=True, - act_func="relu", - dropout_rate=0, - use_act=True, + self, + in_channels, + out_channels, + kernel_size=3, + stride=1, + dilation=1, + groups=1, + bias=False, + has_shuffle=False, + use_bn=True, + act_func="relu", + dropout_rate=0, + use_act=True, ): super().__init__() @@ -449,48 +450,48 @@ def __init__(self, config): # self.first_conv.apply(self._init_weights) stage1 = [] for stage_config in zip( - config.backbone_stage1_in_channels, - config.backbone_stage1_out_channels, - config.backbone_stage1_kernel_size, - config.backbone_stage1_stride, - config.backbone_stage1_dilation, - config.backbone_stage1_groups, + config.backbone_stage1_in_channels, + config.backbone_stage1_out_channels, + config.backbone_stage1_kernel_size, + config.backbone_stage1_stride, + config.backbone_stage1_dilation, + config.backbone_stage1_groups, ): stage1.append(RepConvLayer(*stage_config)) self.stage1 = nn.ModuleList(stage1) stage2 = [] for stage_config in zip( - config.backbone_stage2_in_channels, - config.backbone_stage2_out_channels, - config.backbone_stage2_kernel_size, - config.backbone_stage2_stride, - config.backbone_stage2_dilation, - config.backbone_stage2_groups, + config.backbone_stage2_in_channels, + config.backbone_stage2_out_channels, + config.backbone_stage2_kernel_size, + config.backbone_stage2_stride, + config.backbone_stage2_dilation, + config.backbone_stage2_groups, ): stage2.append(RepConvLayer(*stage_config)) self.stage2 = 
nn.ModuleList(stage2) stage3 = [] for stage_config in zip( - config.backbone_stage3_in_channels, - config.backbone_stage3_out_channels, - config.backbone_stage3_kernel_size, - config.backbone_stage3_stride, - config.backbone_stage3_dilation, - config.backbone_stage3_groups, + config.backbone_stage3_in_channels, + config.backbone_stage3_out_channels, + config.backbone_stage3_kernel_size, + config.backbone_stage3_stride, + config.backbone_stage3_dilation, + config.backbone_stage3_groups, ): stage3.append(RepConvLayer(*stage_config)) self.stage3 = nn.ModuleList(stage3) stage4 = [] for stage_config in zip( - config.backbone_stage4_in_channels, - config.backbone_stage4_out_channels, - config.backbone_stage4_kernel_size, - config.backbone_stage4_stride, - config.backbone_stage4_dilation, - config.backbone_stage4_groups, + config.backbone_stage4_in_channels, + config.backbone_stage4_out_channels, + config.backbone_stage4_kernel_size, + config.backbone_stage4_stride, + config.backbone_stage4_dilation, + config.backbone_stage4_groups, ): stage4.append(RepConvLayer(*stage_config)) self.stage4 = nn.ModuleList(stage4) @@ -712,8 +713,7 @@ def _max_pooling(self, x, scale=1): def emb_loss( - emb, instance, kernel, training_mask, feature_dim=4, delta_v=0.5, delta_d=1.5, weights=(1.0, 1.0), - bg_sample=False + emb, instance, kernel, training_mask, feature_dim=4, delta_v=0.5, delta_d=1.5, weights=(1.0, 1.0), bg_sample=False ): training_mask = (training_mask > 0.5).long() kernel = (kernel > 0.5).long() @@ -740,7 +740,7 @@ def emb_loss( continue ind = instance == lb emb_ = emb[:, ind] - dist = (emb_ - emb_mean[:, i: i + 1]).norm(p=2, dim=0) + dist = (emb_ - emb_mean[:, i : i + 1]).norm(p=2, dim=0) dist = F.relu(dist - delta_v) ** 2 l_agg[i] = torch.mean(torch.log(dist + 1.0)) l_agg = torch.mean(l_agg[1:]) @@ -772,7 +772,7 @@ def emb_loss( for i, lb in enumerate(unique_labels): if lb == 0: continue - dist = (emb_bg - emb_mean[:, i: i + 1]).norm(p=2, dim=0) + dist = (emb_bg - emb_mean[:, i : i + 1]).norm(p=2, dim=0) dist = F.relu(2 * delta_d - dist) ** 2 l_dis_bg = torch.mean(torch.log(dist + 1.0), 0, keepdim=True) l_dis.append(l_dis_bg) @@ -969,11 +969,11 @@ def loss(self, hidden, labels): @add_start_docstrings_to_model_forward(FAST_FOR_CAPTIONING_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=FASTForImageCaptioningOutput, config_class=_CONFIG_FOR_DOC) def forward( - self, - pixel_values: torch.FloatTensor, - output_hidden_states: Optional[bool] = True, - return_dict: Optional[bool] = None, - labels: Dict = None, + self, + pixel_values: torch.FloatTensor, + output_hidden_states: Optional[bool] = True, + return_dict: Optional[bool] = None, + labels: Dict = None, ): r""" labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*): From c94fc70b99f5e383e4a4c68dc166dafb2d1effdc Mon Sep 17 00:00:00 2001 From: raghavanone Date: Wed, 1 Nov 2023 21:36:01 +0530 Subject: [PATCH 025/152] Add test for image_processing_fast and add documentation tests --- .../models/fast/image_processing_fast.py | 75 -------- src/transformers/models/fast/modeling_fast.py | 13 +- .../models/fast/test_image_processing_fast.py | 160 ++++++++++++++++++ tests/models/fast/test_modeling_fast.py | 2 +- 4 files changed, 173 insertions(+), 77 deletions(-) create mode 100644 tests/models/fast/test_image_processing_fast.py diff --git a/src/transformers/models/fast/image_processing_fast.py b/src/transformers/models/fast/image_processing_fast.py index ff46ca02b012..03625082c8ee 100644 --- 
a/src/transformers/models/fast/image_processing_fast.py +++ b/src/transformers/models/fast/image_processing_fast.py @@ -185,18 +185,9 @@ def resize( **kwargs, ) - def reduce_label(self, label: ImageInput) -> np.ndarray: - label = to_numpy_array(label) - # Avoid using underflow conversion - label[label == 0] = 255 - label = label - 1 - label[label == 254] = 255 - return label - def _preprocess( self, image: ImageInput, - do_reduce_labels: bool = None, do_resize: bool = None, size: Dict[str, int] = None, resample: PILImageResampling = None, @@ -209,9 +200,6 @@ def _preprocess( image_std: Optional[Union[float, List[float]]] = None, input_data_format: Optional[Union[str, ChannelDimension]] = None, ): - if do_reduce_labels: - image = self.reduce_label(image) - if do_resize: image = self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format) @@ -254,7 +242,6 @@ def _preprocess_image( input_data_format = infer_channel_dimension_format(image) image = self._preprocess( image, - do_reduce_labels=False, do_resize=do_resize, size=size, resample=resample, @@ -271,47 +258,6 @@ def _preprocess_image( image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) return image - def _preprocess_segmentation_map( - self, - segmentation_map: ImageInput, - do_resize: bool = None, - size: Dict[str, int] = None, - resample: PILImageResampling = None, - do_center_crop: bool = None, - crop_size: Dict[str, int] = None, - do_reduce_labels: bool = None, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - ): - """Preprocesses a single segmentation map.""" - # All transformations expect numpy arrays. - segmentation_map = to_numpy_array(segmentation_map) - # Add an axis to the segmentation maps for transformations. - if segmentation_map.ndim == 2: - segmentation_map = segmentation_map[None, ...] - added_dimension = True - input_data_format = ChannelDimension.FIRST - else: - added_dimension = False - if input_data_format is None: - input_data_format = infer_channel_dimension_format(segmentation_map, num_channels=1) - segmentation_map = self._preprocess( - image=segmentation_map, - do_reduce_labels=do_reduce_labels, - do_resize=do_resize, - resample=resample, - size=size, - do_center_crop=do_center_crop, - crop_size=crop_size, - do_normalize=False, - do_rescale=False, - input_data_format=ChannelDimension.FIRST, - ) - # Remove extra axis if added - if added_dimension: - segmentation_map = np.squeeze(segmentation_map, axis=0) - segmentation_map = segmentation_map.astype(np.int64) - return segmentation_map - def __call__(self, images, segmentation_maps=None, **kwargs): # Overrides the `__call__` method of the `Preprocessor` class such that the images and segmentation maps can both # be passed in as positional arguments. @@ -331,7 +277,6 @@ def preprocess( do_normalize: bool = None, image_mean: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None, - do_reduce_labels: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, data_format: ChannelDimension = ChannelDimension.FIRST, input_data_format: Optional[Union[str, ChannelDimension]] = None, @@ -366,10 +311,6 @@ def preprocess( Image mean. image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): Image standard deviation. - do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`): - Whether or not to reduce all label values of segmentation maps by 1. 
Usually used for datasets where 0 - is used for background, and background itself is not included in all classes of a dataset (e.g. - ADE20k). The background label will be replaced by 255. return_tensors (`str` or `TensorType`, *optional*): The type of tensors to return. Can be one of: - Unset: Return a list of `np.ndarray`. @@ -401,7 +342,6 @@ def preprocess( do_normalize = do_normalize if do_normalize is not None else self.do_normalize image_mean = image_mean if image_mean is not None else self.image_mean image_std = image_std if image_std is not None else self.image_std - do_reduce_labels = do_reduce_labels if do_reduce_labels is not None else self.do_reduce_labels images = make_list_of_images(images) if segmentation_maps is not None: @@ -452,21 +392,6 @@ def preprocess( data = {"pixel_values": images} - if segmentation_maps is not None: - segmentation_maps = [ - self._preprocess_segmentation_map( - segmentation_map=segmentation_map, - do_reduce_labels=do_reduce_labels, - do_resize=do_resize, - resample=resample, - size=size, - do_center_crop=do_center_crop, - crop_size=crop_size, - ) - for segmentation_map in segmentation_maps - ] - data["labels"] = segmentation_maps - return BatchFeature(data=data, tensor_type=return_tensors) def _max_pooling(self, x, scale=1): diff --git a/src/transformers/models/fast/modeling_fast.py b/src/transformers/models/fast/modeling_fast.py index 8f36da8528e4..6aad3fa97b45 100644 --- a/src/transformers/models/fast/modeling_fast.py +++ b/src/transformers/models/fast/modeling_fast.py @@ -982,7 +982,18 @@ def forward( Returns: - """ + Examples: + + ```python + >>> from transformers import FastImageProcessor, FASTForImageCaptioning >>> from PIL import Image >>> import + requests >>> url = "https://huggingface.co/datasets/Raghavan/fast_model_samples/resolve/main/img657.jpg" >>> + image = Image.open(requests.get(url, stream=True).raw).convert("RGB") >>> processor = + FastImageProcessor.from_pretrained("Raghavan/fast_base_tt_800_finetune_ic17mlt") >>> model = + FASTForImageCaptioning.from_pretrained("Raghavan/fast_base_tt_800_finetune_ic17mlt") >>> inputs = + processor(image, return_tensor="np") >>> # forward pass >>> outputs = + model(pixel_values=torch.tensor(inputs["pixel_values"])) >>> target_sizes = [(image.shape[1], image.shape[2]) + for image in inputs["pixel_values"]] >>> text_locations = processor.post_process_text_detection(outputs, + target_sizes) >>> print(text_locations[0]["bboxes"][0][:10]) [484, 175, 484, 178, 483, 179, 452, 179, 452, 182]""" # outputs = {} return_dict = return_dict if return_dict is not None else self.config.use_return_dict f = self.backbone(pixel_values) diff --git a/tests/models/fast/test_image_processing_fast.py b/tests/models/fast/test_image_processing_fast.py new file mode 100644 index 000000000000..17b11004b2f2 --- /dev/null +++ b/tests/models/fast/test_image_processing_fast.py @@ -0,0 +1,160 @@ +# coding=utf-8 +# Copyright 2021 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
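With the segmentation-map branch dropped above, the per-image `_preprocess` path reduces to the usual resize, rescale, normalize and channel-first chain. A rough standalone equivalent; the target size and normalization stats below are placeholder assumptions, not necessarily the checkpoint's real defaults:

```python
# Standalone approximation of the resize -> rescale -> normalize -> CHW chain kept in
# `_preprocess`; size / mean / std are assumed values, not read from a real config.
import numpy as np
from PIL import Image

def preprocess(image: Image.Image, size=(640, 640),
               mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)) -> np.ndarray:
    image = image.convert("RGB").resize(size, Image.BILINEAR)  # do_resize
    pixels = np.asarray(image, dtype=np.float32) / 255.0       # do_rescale
    pixels = (pixels - np.array(mean)) / np.array(std)         # do_normalize
    return pixels.transpose(2, 0, 1)                           # HWC -> CHW

batch = preprocess(Image.new("RGB", (800, 600)))[None]         # (1, 3, 640, 640)
print(batch.shape)
```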
+ + +import unittest + +import requests + +from transformers.testing_utils import require_torch, require_vision +from transformers.utils import is_torch_available, is_vision_available + +from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs + + +if is_torch_available(): + import torch + +if is_vision_available(): + from PIL import Image + + from transformers import FASTForImageCaptioning, FastImageProcessor + + +class FastImageProcessingTester(unittest.TestCase): + def __init__( + self, + parent, + batch_size=7, + num_channels=3, + image_size=18, + min_resolution=30, + max_resolution=400, + do_resize=True, + size=None, + do_center_crop=True, + crop_size=None, + do_normalize=True, + image_mean=[0.5, 0.5, 0.5], + image_std=[0.5, 0.5, 0.5], + min_area: int = 200, + min_score: float = 0.88, + bbox_type: str = "rect", + pooling_size: int = 9, + ): + size = size if size is not None else {"height": 20, "width": 20} + crop_size = crop_size if crop_size is not None else {"height": 18, "width": 18} + self.parent = parent + self.batch_size = batch_size + self.num_channels = num_channels + self.image_size = image_size + self.min_resolution = min_resolution + self.max_resolution = max_resolution + self.do_resize = do_resize + self.size = size + self.do_center_crop = do_center_crop + self.crop_size = crop_size + self.do_normalize = do_normalize + self.image_mean = image_mean + self.image_std = image_std + self.min_area = min_area + self.min_score = min_score + self.bbox_type = bbox_type + self.pooling_size = pooling_size + + def prepare_image_processor_dict(self): + return { + "do_resize": self.do_resize, + "size": self.size, + "do_center_crop": self.do_center_crop, + "crop_size": self.crop_size, + "do_normalize": self.do_normalize, + "image_mean": self.image_mean, + "image_std": self.image_std, + "min_area": self.min_area, + "min_score": self.min_score, + "bbox_type": self.bbox_type, + "pooling_size": self.pooling_size, + } + + def expected_output_image_shape(self, images): + return self.num_channels, self.crop_size["height"], self.crop_size["width"] + + def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False): + return prepare_image_inputs( + batch_size=self.batch_size, + num_channels=self.num_channels, + min_resolution=self.min_resolution, + max_resolution=self.max_resolution, + equal_resolution=equal_resolution, + numpify=numpify, + torchify=torchify, + ) + + +@require_torch +@require_vision +class FastImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): + image_processing_class = FastImageProcessor if is_vision_available() else None + + def setUp(self): + self.image_processor_tester = FastImageProcessingTester(self) + + @property + def image_processor_dict(self): + return self.image_processor_tester.prepare_image_processor_dict() + + def test_image_processor_properties(self): + image_processing = self.image_processing_class(**self.image_processor_dict) + self.assertTrue(hasattr(image_processing, "do_resize")) + self.assertTrue(hasattr(image_processing, "size")) + self.assertTrue(hasattr(image_processing, "do_center_crop")) + self.assertTrue(hasattr(image_processing, "center_crop")) + self.assertTrue(hasattr(image_processing, "do_normalize")) + self.assertTrue(hasattr(image_processing, "image_mean")) + self.assertTrue(hasattr(image_processing, "image_std")) + + def test_image_processor_from_dict_with_kwargs(self): + image_processor = self.image_processing_class.from_dict(self.image_processor_dict) + 
self.assertEqual(image_processor.size, {"height": 20, "width": 20}) + self.assertEqual(image_processor.crop_size, {"height": 18, "width": 18}) + + image_processor = self.image_processing_class.from_dict( + self.image_processor_dict, size=42, crop_size=84, reduce_labels=True + ) + self.assertEqual(image_processor.size, {"height": 42, "width": 42}) + self.assertEqual(image_processor.crop_size, {"height": 84, "width": 84}) + + def test_post_process_text_detection(self): + model = FASTForImageCaptioning.from_pretrained("Raghavan/fast_base_tt_800_finetune_ic17mlt") + + image_processor = FastImageProcessor.from_pretrained("Raghavan/fast_base_tt_800_finetune_ic17mlt") + + def prepare_image(): + image_url = "https://huggingface.co/datasets/Raghavan/fast_model_samples/resolve/main/img657.jpg" + raw_image = Image.open(requests.get(image_url, stream=True).raw).convert("RGB") + return raw_image + + image = prepare_image() + inputs = image_processor(image, return_tensor="np") + + output = model(pixel_values=torch.tensor(inputs["pixel_values"])) + target_sizes = [(image.shape[1], image.shape[2]) for image in inputs["pixel_values"]] + final_out = image_processor.post_process_text_detection(output, target_sizes) + + assert len(final_out[0]["bboxes"]) == 2 + assert len(final_out[0]["bboxes"][0]) == 716 + assert final_out[0]["bboxes"][0][:10] == [484, 175, 484, 178, 483, 179, 452, 179, 452, 182] + assert round(float(final_out[0]["scores"][0]), 5) == 0.92356 diff --git a/tests/models/fast/test_modeling_fast.py b/tests/models/fast/test_modeling_fast.py index 17f09befd7cd..6fcc0214c4c3 100644 --- a/tests/models/fast/test_modeling_fast.py +++ b/tests/models/fast/test_modeling_fast.py @@ -409,7 +409,7 @@ def prepare_image(): assert final_out[0]["bboxes"][0] == [224, 120, 246, 120, 246, 134, 224, 134] assert round(float(final_out[0]["scores"][0]), 5) == 0.95541 - @slow + # @slow def test_inference_fast_base_800_total_text_ic17mlt_model(self): model = FASTForImageCaptioning.from_pretrained("Raghavan/fast_base_tt_800_finetune_ic17mlt") From 47409eb40f7841bb28499baa184764db2c197627 Mon Sep 17 00:00:00 2001 From: raghavanone Date: Wed, 1 Nov 2023 21:42:29 +0530 Subject: [PATCH 026/152] some refactorings --- src/transformers/models/fast/modeling_fast.py | 156 ++++-------------- 1 file changed, 36 insertions(+), 120 deletions(-) diff --git a/src/transformers/models/fast/modeling_fast.py b/src/transformers/models/fast/modeling_fast.py index 6aad3fa97b45..135c0f79b0cd 100644 --- a/src/transformers/models/fast/modeling_fast.py +++ b/src/transformers/models/fast/modeling_fast.py @@ -430,7 +430,7 @@ def _init_weights(self, module): module.bias.data.zero_() -class TextNet(nn.Module): +class FastTextNet(nn.Module): def __init__(self, config): super().__init__() self.first_conv = ConvLayer( @@ -447,7 +447,6 @@ def __init__(self, config): config.backbone_dropout_rate, config.backbone_ops_order, ) - # self.first_conv.apply(self._init_weights) stage1 = [] for stage_config in zip( config.backbone_stage1_in_channels, @@ -496,25 +495,25 @@ def __init__(self, config): stage4.append(RepConvLayer(*stage_config)) self.stage4 = nn.ModuleList(stage4) - def forward(self, x): - x = self.first_conv(x) + def forward(self, hidden_states): + hidden_states = self.first_conv(hidden_states) output = [] for block in self.stage1: - x = block(x) - output.append(x) + hidden_states = block(hidden_states) + output.append(hidden_states) for block in self.stage2: - x = block(x) - output.append(x) + hidden_states = block(hidden_states) + 
output.append(hidden_states) for block in self.stage3: - x = block(x) - output.append(x) + hidden_states = block(hidden_states) + output.append(hidden_states) for block in self.stage4: - x = block(x) - output.append(x) + hidden_states = block(hidden_states) + output.append(hidden_states) return output @@ -532,13 +531,9 @@ def __init__(self, config): config.neck_groups, ) ) - self.layers_count = len(reduce_layer_configs) + self.num_layers = len(reduce_layer_configs) for layer_ix in range(0, len(reduce_layer_configs)): setattr(self, f"reduce_layer{layer_ix + 1}", RepConvLayer(*reduce_layer_configs[layer_ix])) - # self.reduce_layer1 = RepConvLayer(*reduce_layer_configs[0]) - # self.reduce_layer2 = RepConvLayer(*reduce_layer_configs[1]) - # self.reduce_layer3 = RepConvLayer(*reduce_layer_configs[2]) - # self.reduce_layer4 = RepConvLayer(*reduce_layer_configs[3]) self._initialize_weights() @@ -550,22 +545,21 @@ def _initialize_weights(self): m.weight.data.fill_(1) m.bias.data.zero_() - def _upsample(self, x, y): - _, _, H, W = y.size() - return F.upsample(x, size=(H, W), mode="bilinear") + def _upsample(self, layer_out, height, width): + return F.upsample(layer_out, size=(height, width), mode="bilinear") - def forward(self, x): - f1 = x[0] - f1 = self.reduce_layer1(f1) - output_stages = [f1] + def forward(self, hidden_states): + first_layer_hidden = hidden_states[0] + first_layer_hidden = self.reduce_layer1(first_layer_hidden) + output_stages = [first_layer_hidden] - for layer_ix in range(1, self.layers_count): - layer_out = getattr(self, f"reduce_layer{layer_ix + 1}")(x[layer_ix]) - layer_out = self._upsample(layer_out, f1) + for layer_ix in range(1, self.num_layers): + layer_out = getattr(self, f"reduce_layer{layer_ix + 1}")(hidden_states[layer_ix]) + layer_out = self._upsample(layer_out, first_layer_hidden[2], first_layer_hidden[3]) output_stages.append(layer_out) - f = torch.cat(output_stages, 1) - return f + combined_hidden_states = torch.cat(output_stages, 1) + return combined_hidden_states class FASTHead(nn.Module): @@ -621,55 +615,12 @@ def _initialize_weights(self): m.weight.data.fill_(1) m.bias.data.zero_() - def forward(self, x): - x = self.conv(x) + def forward(self, hidden_states): + hidden_states = self.conv(hidden_states) if self.dropout is not None: - x = self.dropout(x) - x = self.final(x) - return x - - # def get_results(self, out, img_meta, scale=2): - # org_img_size = img_meta["org_img_size"] - # img_size = img_meta["img_size"] # 640*640 - # batch_size = out.size(0) - # outputs = {} - # - # texts = F.interpolate( - # out[:, 0:1, :, :], size=(img_size[0] // scale, img_size[1] // scale), mode="nearest" - # ) # B*1*320*320 - # texts = self._max_pooling(texts, scale=scale) # B*1*320*320 - # score_maps = torch.sigmoid_(texts) # B*1*320*320~ - # score_maps = F.interpolate(score_maps, size=(img_size[0], img_size[1]), mode="nearest") # B*1*640*640 - # score_maps = score_maps.squeeze(1) # B*640*640 - # - # kernels = (out[:, 0, :, :] > 0).to(torch.uint8) # B*160*160 - # labels_ = [] - # for kernel in kernels.numpy(): - # ret, label_ = cv2.connectedComponents(kernel) - # labels_.append(label_) - # labels_ = np.array(labels_) - # labels_ = torch.from_numpy(labels_) - # labels = labels_.unsqueeze(1).to(torch.float32) # B*1*160*160 - # labels = F.interpolate( - # labels, size=(img_size[0] // scale, img_size[1] // scale), mode="nearest" - # ) # B*1*320*320 - # labels = self._max_pooling(labels, scale=scale) - # labels = F.interpolate(labels, size=(img_size[0], img_size[1]), mode="nearest") 
# B*1*640*640 - # labels = labels.squeeze(1).to(torch.int32) # B*640*640 - # - # keys = [torch.unique(labels_[i], sorted=True) for i in range(batch_size)] - # - # outputs.update({"kernels": kernels.data.cpu()}) - # - # scales = (float(org_img_size[1]) / float(img_size[1]), float(org_img_size[0]) / float(img_size[0])) - # - # results = [] - # for i in range(batch_size): - # bboxes, scores = self.generate_bbox(keys[i], labels[i], score_maps[i], scales) - # results.append({"bboxes": bboxes, "scores": scores}) - # outputs.update({"results": results}) - # - # return outputs + hidden_states = self.dropout(hidden_states) + hidden_states = self.final(hidden_states) + return hidden_states def _max_pooling(self, x, scale=1): if scale == 1: @@ -678,39 +629,6 @@ def _max_pooling(self, x, scale=1): x = self.pooling_2s(x) return x - # def generate_bbox(self, keys, label, score, scales): - # label_num = len(keys) - # bboxes = [] - # scores = [] - # for index in range(1, label_num): - # i = keys[index] - # ind = label == i - # ind_np = ind.data.cpu().numpy() - # points = np.array(np.where(ind_np)).transpose((1, 0)) - # if points.shape[0] < self.min_area: - # label[ind] = 0 - # continue - # score_i = score[ind].mean().item() - # if score_i < self.min_score: - # label[ind] = 0 - # continue - # - # if self.bbox_type == "rect": - # rect = cv2.minAreaRect(points[:, ::-1]) - # alpha = math.sqrt(math.sqrt(points.shape[0] / (rect[1][0] * rect[1][1]))) - # rect = (rect[0], (rect[1][0] * alpha, rect[1][1] * alpha), rect[2]) - # bbox = cv2.boxPoints(rect) * scales - # else: - # binary = np.zeros(label.shape, dtype="uint8") - # binary[ind_np] = 1 - # contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) - # bbox = contours[0] * scales - # - # bbox = bbox.astype("int32") - # bboxes.append(bbox.reshape(-1).tolist()) - # scores.append(score_i) - # return bboxes, scores - def emb_loss( emb, instance, kernel, training_mask, feature_dim=4, delta_v=0.5, delta_d=1.5, weights=(1.0, 1.0), bg_sample=False @@ -921,7 +839,7 @@ class FASTForImageCaptioningOutput(ModelOutput): class FASTForImageCaptioning(FastPreTrainedModel): def __init__(self, config): super().__init__(config) - self.backbone = TextNet(config=config) + self.backbone = FastTextNet(config=config) self.neck = FASTNeck(config=config) self.det_head = FASTHead(config=config) self.loss_bg = config.loss_bg @@ -996,21 +914,19 @@ def forward( target_sizes) >>> print(text_locations[0]["bboxes"][0][:10]) [484, 175, 484, 178, 483, 179, 452, 179, 452, 182]""" # outputs = {} return_dict = return_dict if return_dict is not None else self.config.use_return_dict - f = self.backbone(pixel_values) + hidden_states = self.backbone(pixel_values) - f = self.neck(f) + hidden_states = self.neck(hidden_states) - det_out = self.det_head(f) + text_detection_output = self.det_head(hidden_states) loss = None if labels: - out = self._upsample(det_out, pixel_values.size(), scale=1) + out = self._upsample(text_detection_output, pixel_values.size(), scale=1) loss = self.loss(out, labels) - # det_res = self.det_head.get_results(det_out, img_metas, scale=2) - # outputs.update(det_res) - det_out = self._upsample(det_out, pixel_values.size(), scale=4) + text_detection_output = self._upsample(text_detection_output, pixel_values.size(), scale=4) if not return_dict: - return (loss, det_out) if loss is not None else (det_out,) + return (loss, text_detection_output) if loss is not None else (text_detection_output,) - return FASTForImageCaptioningOutput(loss, det_out) + return 
FASTForImageCaptioningOutput(loss, text_detection_output) From 6b787d687b1ece094a2c9f5d50098243423913eb Mon Sep 17 00:00:00 2001 From: raghavanone Date: Sun, 5 Nov 2023 09:09:42 +0530 Subject: [PATCH 027/152] Fix failing tests --- .../models/fast/image_processing_fast.py | 5 ---- src/transformers/models/fast/modeling_fast.py | 29 ++++++++++++------- tests/models/fast/test_modeling_fast.py | 7 ++--- 3 files changed, 22 insertions(+), 19 deletions(-) diff --git a/src/transformers/models/fast/image_processing_fast.py b/src/transformers/models/fast/image_processing_fast.py index 03625082c8ee..2e58d40c8856 100644 --- a/src/transformers/models/fast/image_processing_fast.py +++ b/src/transformers/models/fast/image_processing_fast.py @@ -258,11 +258,6 @@ def _preprocess_image( image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) return image - def __call__(self, images, segmentation_maps=None, **kwargs): - # Overrides the `__call__` method of the `Preprocessor` class such that the images and segmentation maps can both - # be passed in as positional arguments. - return super().__call__(images, segmentation_maps=segmentation_maps, **kwargs) - def preprocess( self, images: ImageInput, diff --git a/src/transformers/models/fast/modeling_fast.py b/src/transformers/models/fast/modeling_fast.py index 135c0f79b0cd..4d8f1155ad5c 100644 --- a/src/transformers/models/fast/modeling_fast.py +++ b/src/transformers/models/fast/modeling_fast.py @@ -555,7 +555,8 @@ def forward(self, hidden_states): for layer_ix in range(1, self.num_layers): layer_out = getattr(self, f"reduce_layer{layer_ix + 1}")(hidden_states[layer_ix]) - layer_out = self._upsample(layer_out, first_layer_hidden[2], first_layer_hidden[3]) + _, _, height, width = first_layer_hidden.size() + layer_out = self._upsample(layer_out, height, width) output_stages.append(layer_out) combined_hidden_states = torch.cat(output_stages, 1) @@ -903,15 +904,23 @@ def forward( Examples: ```python - >>> from transformers import FastImageProcessor, FASTForImageCaptioning >>> from PIL import Image >>> import - requests >>> url = "https://huggingface.co/datasets/Raghavan/fast_model_samples/resolve/main/img657.jpg" >>> - image = Image.open(requests.get(url, stream=True).raw).convert("RGB") >>> processor = - FastImageProcessor.from_pretrained("Raghavan/fast_base_tt_800_finetune_ic17mlt") >>> model = - FASTForImageCaptioning.from_pretrained("Raghavan/fast_base_tt_800_finetune_ic17mlt") >>> inputs = - processor(image, return_tensor="np") >>> # forward pass >>> outputs = - model(pixel_values=torch.tensor(inputs["pixel_values"])) >>> target_sizes = [(image.shape[1], image.shape[2]) - for image in inputs["pixel_values"]] >>> text_locations = processor.post_process_text_detection(outputs, - target_sizes) >>> print(text_locations[0]["bboxes"][0][:10]) [484, 175, 484, 178, 483, 179, 452, 179, 452, 182]""" + >>> from transformers import FastImageProcessor, FASTForImageCaptioning + >>> from PIL import Image + >>> import requests + + >>> url = "https://huggingface.co/datasets/Raghavan/fast_model_samples/resolve/main/img657.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw).convert("RGB") + >>> processor = FastImageProcessor.from_pretrained("Raghavan/fast_base_tt_800_finetune_ic17mlt") + >>> model = FASTForImageCaptioning.from_pretrained("Raghavan/fast_base_tt_800_finetune_ic17mlt") + >>> inputs = processor(image, return_tensors="pt") + >>> # forward pass + >>> outputs = model(pixel_values=inputs["pixel_values"]) + >>> 
target_sizes = [(image.shape[1], image.shape[2]) for image in inputs["pixel_values"]] + >>> text_locations = processor.post_process_text_detection(outputs, target_sizes) + >>> print(text_locations[0]["bboxes"][0][:10]) + [484, 175, 484, 178, 483, 179, 452, 179, 452, 182] + ``` + """ # outputs = {} return_dict = return_dict if return_dict is not None else self.config.use_return_dict hidden_states = self.backbone(pixel_values) diff --git a/tests/models/fast/test_modeling_fast.py b/tests/models/fast/test_modeling_fast.py index 6fcc0214c4c3..951fc71dcdc3 100644 --- a/tests/models/fast/test_modeling_fast.py +++ b/tests/models/fast/test_modeling_fast.py @@ -27,7 +27,6 @@ from transformers.testing_utils import ( require_torch, require_vision, - slow, torch_device, ) @@ -388,7 +387,7 @@ def test_model_is_small(self): @require_torch @require_vision class FastModelIntegrationTest(unittest.TestCase): - @slow + # @slow def test_inference_fast_tiny_ic17mlt_model(self): model = FASTForImageCaptioning.from_pretrained("Raghavan/ic17mlt_Fast_T") @@ -400,7 +399,7 @@ def prepare_image(): return raw_image image = prepare_image() - input = image_processor(image, return_tensor="np") + input = image_processor(image, return_tensors="pt") output = model(pixel_values=torch.tensor(input["pixel_values"])) target_sizes = [(image.shape[1], image.shape[2]) for image in input["pixel_values"]] @@ -421,7 +420,7 @@ def prepare_image(): return raw_image image = prepare_image() - input = image_processor(image, return_tensor="np") + input = image_processor(image, return_tensors="pt") output = model(pixel_values=torch.tensor(input["pixel_values"])) target_sizes = [(image.shape[1], image.shape[2]) for image in input["pixel_values"]] From 134f4cc37d632949fe2e2f528d24188247024e74 Mon Sep 17 00:00:00 2001 From: raghavanone Date: Sun, 5 Nov 2023 09:30:32 +0530 Subject: [PATCH 028/152] Incorporate PR feedbacks --- docs/source/en/_toctree.yml | 2 + docs/source/en/model_doc/fast.md | 6 +- src/transformers/__init__.py | 6 ++ src/transformers/models/fast/__init__.py | 4 +- .../fast/convert_fast_original_to_pytorch.py | 4 +- src/transformers/models/fast/modeling_fast.py | 67 ++++++++++++------- .../models/fast/test_image_processing_fast.py | 4 +- tests/models/fast/test_modeling_fast.py | 12 ++-- utils/check_repo.py | 2 +- 9 files changed, 65 insertions(+), 42 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 86cffb9a7e35..ca9067c596b0 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -537,6 +537,8 @@ title: EfficientFormer - local: model_doc/efficientnet title: EfficientNet + - local: model_doc/fast + title: Fast - local: model_doc/focalnet title: FocalNet - local: model_doc/glpn diff --git a/docs/source/en/model_doc/fast.md b/docs/source/en/model_doc/fast.md index 3c81109380ae..b8304251f2f9 100644 --- a/docs/source/en/model_doc/fast.md +++ b/docs/source/en/model_doc/fast.md @@ -14,7 +14,7 @@ rendered properly in your Markdown viewer. 
--> -# Fast +# FAST ## Overview @@ -34,9 +34,9 @@ than most networks that are searched for image classification [[autodoc]] FastImageProcessor -## FASTForImageCaptioning +## FastForSceneTextRecognition -[[autodoc]] FASTForImageCaptioning +[[autodoc]] FastForSceneTextRecognition - forward ## FASTForImageCaptioningOutput diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 5e12dc8c3354..37333f4ed67f 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -1199,6 +1199,7 @@ _import_structure["models.xlnet"].append("XLNetTokenizerFast") _import_structure["tokenization_utils_fast"] = ["PreTrainedTokenizerFast"] + try: if not (is_sentencepiece_available() and is_tokenizers_available()): raise OptionalDependencyNotAvailable() @@ -1308,6 +1309,7 @@ _import_structure["models.vivit"].append("VivitImageProcessor") _import_structure["models.yolos"].extend(["YolosFeatureExtractor", "YolosImageProcessor"]) + # PyTorch-backed objects try: if not is_torch_available(): @@ -4399,6 +4401,7 @@ ] _import_structure["tf_utils"] = [] + try: if not ( is_librosa_available() @@ -4423,6 +4426,7 @@ _import_structure["models.pop2piano"].append("Pop2PianoTokenizer") _import_structure["models.pop2piano"].append("Pop2PianoProcessor") + # FLAX-backed objects try: if not is_flax_available(): @@ -4747,6 +4751,7 @@ ] ) + # Direct imports for type-checking if TYPE_CHECKING: # Configuration @@ -8862,6 +8867,7 @@ extra_objects={"__version__": __version__}, ) + if not is_tf_available() and not is_torch_available() and not is_flax_available(): logger.warning( "None of PyTorch, TensorFlow >= 2.0, or Flax have been found. " diff --git a/src/transformers/models/fast/__init__.py b/src/transformers/models/fast/__init__.py index c4ecab2f2c0d..dedc491f6c59 100644 --- a/src/transformers/models/fast/__init__.py +++ b/src/transformers/models/fast/__init__.py @@ -32,7 +32,7 @@ except OptionalDependencyNotAvailable: pass else: - _import_structure["modeling_fast"] = ["FASTForImageCaptioning", "FastPreTrainedModel"] + _import_structure["modeling_fast"] = ["FastForSceneTextRecognition", "FastPreTrainedModel"] if TYPE_CHECKING: from .configuration_fast import FAST_PRETRAINED_CONFIG_ARCHIVE_MAP, FastConfig @@ -44,7 +44,7 @@ except OptionalDependencyNotAvailable: pass else: - from .modeling_fast import FASTForImageCaptioning, FastPreTrainedModel + from .modeling_fast import FastForSceneTextRecognition, FastPreTrainedModel else: diff --git a/src/transformers/models/fast/convert_fast_original_to_pytorch.py b/src/transformers/models/fast/convert_fast_original_to_pytorch.py index e549294081b8..45522f429ec2 100644 --- a/src/transformers/models/fast/convert_fast_original_to_pytorch.py +++ b/src/transformers/models/fast/convert_fast_original_to_pytorch.py @@ -22,7 +22,7 @@ import torch from PIL import Image -from transformers import FastConfig, FASTForImageCaptioning +from transformers import FastConfig, FastForSceneTextRecognition from transformers.models.fast.image_processing_fast import FastImageProcessor @@ -210,7 +210,7 @@ def convert_fast_checkpoint(checkpoint_url, checkpoint_config_url, pytorch_dump_ if "short_size" in data_config["train"]: size = data_config["train"]["short_size"] - model = FASTForImageCaptioning(config) + model = FastForSceneTextRecognition(config) fast_image_processor = FastImageProcessor( size={"height": size, "width": size}, min_score=config.min_score, diff --git a/src/transformers/models/fast/modeling_fast.py b/src/transformers/models/fast/modeling_fast.py index 
4d8f1155ad5c..b88dc6043a72 100644 --- a/src/transformers/models/fast/modeling_fast.py +++ b/src/transformers/models/fast/modeling_fast.py @@ -1,3 +1,19 @@ +# coding=utf-8 +# Copyright 2021 Microsoft Research and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch FAST model.""" + from dataclasses import dataclass from typing import Dict, Optional @@ -18,7 +34,7 @@ behavior. Parameters: - config ([`Beit3Config`]): Model configuration class with all the parameters of the model. + config ([`FastConfig`]): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. """ @@ -27,7 +43,7 @@ Args: pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See - [`BeitImageProcessor.__call__`] for details. + [`FastImageProcessor.__call__`] for details. output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail. @@ -162,7 +178,7 @@ def is_zero_layer(): return False -class ConvLayer(nn.Module): +class FASTConvLayer(nn.Module): def __init__( self, in_channels, @@ -245,9 +261,9 @@ def fuse_conv_bn(self, conv, bn): return conv -class RepConvLayer(nn.Module): +class FASTRepConvLayer(nn.Module): def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, groups=1): - super(RepConvLayer, self).__init__() + super().__init__() self.in_channels = in_channels self.out_channels = out_channels @@ -256,7 +272,6 @@ def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, self.dilation = dilation self.groups = groups - assert len(kernel_size) == 2 padding = (int(((kernel_size[0] - 1) * dilation) / 2), int(((kernel_size[1] - 1) * dilation) / 2)) self.nonlinearity = nn.ReLU(inplace=True) @@ -310,21 +325,21 @@ def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, nn.BatchNorm2d(num_features=in_channels) if out_channels == in_channels and stride == 1 else None ) - def forward(self, input): + def forward(self, hidden_states): if self.training: if hasattr(self, "fused_conv"): self.__delattr__("fused_conv") - main_outputs = self.main_conv(input) + main_outputs = self.main_conv(hidden_states) main_outputs = self.main_bn(main_outputs) if self.ver_conv is not None: - vertical_outputs = self.ver_conv(input) + vertical_outputs = self.ver_conv(hidden_states) vertical_outputs = self.ver_bn(vertical_outputs) else: vertical_outputs = 0 if self.hor_conv is not None: - horizontal_outputs = self.hor_conv(input) + horizontal_outputs = self.hor_conv(hidden_states) horizontal_outputs = self.hor_bn(horizontal_outputs) else: horizontal_outputs = 0 @@ -332,13 +347,13 @@ def forward(self, input): if self.rbr_identity is 
None: id_out = 0 else: - id_out = self.rbr_identity(input) + id_out = self.rbr_identity(hidden_states) return self.nonlinearity(main_outputs + vertical_outputs + horizontal_outputs + id_out) else: if not hasattr(self, "fused_conv"): self.prepare_for_eval() - return self.nonlinearity(self.fused_conv(input)) + return self.nonlinearity(self.fused_conv(hidden_states)) def _identity_to_conv(self, identity): if identity is None: @@ -433,7 +448,7 @@ def _init_weights(self, module): class FastTextNet(nn.Module): def __init__(self, config): super().__init__() - self.first_conv = ConvLayer( + self.first_conv = FASTConvLayer( config.backbone_in_channels, config.backbone_out_channels, config.backbone_kernel_size, @@ -456,7 +471,7 @@ def __init__(self, config): config.backbone_stage1_dilation, config.backbone_stage1_groups, ): - stage1.append(RepConvLayer(*stage_config)) + stage1.append(FASTRepConvLayer(*stage_config)) self.stage1 = nn.ModuleList(stage1) stage2 = [] @@ -468,7 +483,7 @@ def __init__(self, config): config.backbone_stage2_dilation, config.backbone_stage2_groups, ): - stage2.append(RepConvLayer(*stage_config)) + stage2.append(FASTRepConvLayer(*stage_config)) self.stage2 = nn.ModuleList(stage2) stage3 = [] @@ -480,7 +495,7 @@ def __init__(self, config): config.backbone_stage3_dilation, config.backbone_stage3_groups, ): - stage3.append(RepConvLayer(*stage_config)) + stage3.append(FASTRepConvLayer(*stage_config)) self.stage3 = nn.ModuleList(stage3) stage4 = [] @@ -492,7 +507,7 @@ def __init__(self, config): config.backbone_stage4_dilation, config.backbone_stage4_groups, ): - stage4.append(RepConvLayer(*stage_config)) + stage4.append(FASTRepConvLayer(*stage_config)) self.stage4 = nn.ModuleList(stage4) def forward(self, hidden_states): @@ -533,7 +548,7 @@ def __init__(self, config): ) self.num_layers = len(reduce_layer_configs) for layer_ix in range(0, len(reduce_layer_configs)): - setattr(self, f"reduce_layer{layer_ix + 1}", RepConvLayer(*reduce_layer_configs[layer_ix])) + setattr(self, f"reduce_layer{layer_ix + 1}", FASTRepConvLayer(*reduce_layer_configs[layer_ix])) self._initialize_weights() @@ -566,7 +581,7 @@ def forward(self, hidden_states): class FASTHead(nn.Module): def __init__(self, config): super().__init__() - self.conv = RepConvLayer( + self.conv = FASTRepConvLayer( config.head_conv_in_channels, config.head_conv_out_channels, config.head_conv_kernel_size, @@ -575,7 +590,7 @@ def __init__(self, config): config.head_conv_groups, ) - self.final = ConvLayer( + self.final = FASTConvLayer( config.head_final_in_channels, config.head_final_out_channels, config.head_final_kernel_size, @@ -813,7 +828,7 @@ def iou(a, b, mask, n_class=2, reduce=True): @dataclass -class FASTForImageCaptioningOutput(ModelOutput): +class FastForSceneTextRecognitionOutput(ModelOutput): """ Adapted from the base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states. 
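The training-time branch sum in `FASTRepConvLayer.forward` above (a main 3x3 convolution plus asymmetric 3x1 and 1x3 branches and an optional identity) is what `prepare_for_eval` later collapses into the single `fused_conv`. A small equivalence check of that re-parameterization idea, with batch-norm folding left out and arbitrary layer sizes:

```python
# Padding an asymmetric branch kernel to 3x3 and adding it to the main kernel gives one
# convolution equivalent to the branch sum; BN folding is omitted to keep the check short.
import torch
import torch.nn as nn
import torch.nn.functional as F

x = torch.randn(1, 8, 16, 16)
main = nn.Conv2d(8, 8, (3, 3), padding=(1, 1), bias=False)
vertical = nn.Conv2d(8, 8, (3, 1), padding=(1, 0), bias=False)

branch_sum = main(x) + vertical(x)

fused_weight = main.weight + F.pad(vertical.weight, [1, 1, 0, 0])  # (3, 1) kernel -> (3, 3)
fused_out = F.conv2d(x, fused_weight, padding=(1, 1))

print(torch.allclose(branch_sum, fused_out, atol=1e-6))  # True
```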
This class also adds the loss term from the text decoder as well as the image-text similarity @@ -837,7 +852,7 @@ class FASTForImageCaptioningOutput(ModelOutput): multiple benchmarks.""", FAST_START_DOCSTRING, ) -class FASTForImageCaptioning(FastPreTrainedModel): +class FastForSceneTextRecognition(FastPreTrainedModel): def __init__(self, config): super().__init__(config) self.backbone = FastTextNet(config=config) @@ -886,7 +901,7 @@ def loss(self, hidden, labels): return torch.mean(loss_text) + torch.mean(loss_kernel) + torch.mean(loss_emb) @add_start_docstrings_to_model_forward(FAST_FOR_CAPTIONING_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=FASTForImageCaptioningOutput, config_class=_CONFIG_FOR_DOC) + @replace_return_docstrings(output_type=FastForSceneTextRecognitionOutput, config_class=_CONFIG_FOR_DOC) def forward( self, pixel_values: torch.FloatTensor, @@ -904,14 +919,14 @@ def forward( Examples: ```python - >>> from transformers import FastImageProcessor, FASTForImageCaptioning + >>> from transformers import FastImageProcessor, FastForSceneTextRecognition >>> from PIL import Image >>> import requests >>> url = "https://huggingface.co/datasets/Raghavan/fast_model_samples/resolve/main/img657.jpg" >>> image = Image.open(requests.get(url, stream=True).raw).convert("RGB") >>> processor = FastImageProcessor.from_pretrained("Raghavan/fast_base_tt_800_finetune_ic17mlt") - >>> model = FASTForImageCaptioning.from_pretrained("Raghavan/fast_base_tt_800_finetune_ic17mlt") + >>> model = FastForSceneTextRecognition.from_pretrained("Raghavan/fast_base_tt_800_finetune_ic17mlt") >>> inputs = processor(image, return_tensors="pt") >>> # forward pass >>> outputs = model(pixel_values=inputs["pixel_values"]) @@ -938,4 +953,4 @@ def forward( if not return_dict: return (loss, text_detection_output) if loss is not None else (text_detection_output,) - return FASTForImageCaptioningOutput(loss, text_detection_output) + return FastForSceneTextRecognitionOutput(loss, text_detection_output) diff --git a/tests/models/fast/test_image_processing_fast.py b/tests/models/fast/test_image_processing_fast.py index 17b11004b2f2..8aa523dc03f3 100644 --- a/tests/models/fast/test_image_processing_fast.py +++ b/tests/models/fast/test_image_processing_fast.py @@ -30,7 +30,7 @@ if is_vision_available(): from PIL import Image - from transformers import FASTForImageCaptioning, FastImageProcessor + from transformers import FastForSceneTextRecognition, FastImageProcessor class FastImageProcessingTester(unittest.TestCase): @@ -138,7 +138,7 @@ def test_image_processor_from_dict_with_kwargs(self): self.assertEqual(image_processor.crop_size, {"height": 84, "width": 84}) def test_post_process_text_detection(self): - model = FASTForImageCaptioning.from_pretrained("Raghavan/fast_base_tt_800_finetune_ic17mlt") + model = FastForSceneTextRecognition.from_pretrained("Raghavan/fast_base_tt_800_finetune_ic17mlt") image_processor = FastImageProcessor.from_pretrained("Raghavan/fast_base_tt_800_finetune_ic17mlt") diff --git a/tests/models/fast/test_modeling_fast.py b/tests/models/fast/test_modeling_fast.py index 951fc71dcdc3..409f579eed0f 100644 --- a/tests/models/fast/test_modeling_fast.py +++ b/tests/models/fast/test_modeling_fast.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch Falcon model. """ +""" Testing suite for the PyTorch FAST model. 
""" import inspect import unittest @@ -40,7 +40,7 @@ import torch from transformers import ( - FASTForImageCaptioning, + FastForSceneTextRecognition, ) @@ -269,7 +269,7 @@ def get_config(self): ) def create_and_check_model(self, config, input): - model = FASTForImageCaptioning(config=config) + model = FastForSceneTextRecognition(config=config) model.to(torch_device) model.eval() result = model(pixel_values=input["pixel_values"]) @@ -283,7 +283,7 @@ def prepare_config_and_inputs_for_common(self): @require_torch class FastModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): - all_model_classes = (FASTForImageCaptioning,) if is_torch_available() else () + all_model_classes = (FastForSceneTextRecognition,) if is_torch_available() else () pipeline_model_mapping = {} test_headmasking = False @@ -389,7 +389,7 @@ def test_model_is_small(self): class FastModelIntegrationTest(unittest.TestCase): # @slow def test_inference_fast_tiny_ic17mlt_model(self): - model = FASTForImageCaptioning.from_pretrained("Raghavan/ic17mlt_Fast_T") + model = FastForSceneTextRecognition.from_pretrained("Raghavan/ic17mlt_Fast_T") image_processor = FastImageProcessor.from_pretrained("Raghavan/ic17mlt_Fast_T") @@ -410,7 +410,7 @@ def prepare_image(): # @slow def test_inference_fast_base_800_total_text_ic17mlt_model(self): - model = FASTForImageCaptioning.from_pretrained("Raghavan/fast_base_tt_800_finetune_ic17mlt") + model = FastForSceneTextRecognition.from_pretrained("Raghavan/fast_base_tt_800_finetune_ic17mlt") image_processor = FastImageProcessor.from_pretrained("Raghavan/fast_base_tt_800_finetune_ic17mlt") diff --git a/utils/check_repo.py b/utils/check_repo.py index e9419bd78b03..3af3a05a8aa6 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -223,7 +223,7 @@ "TFCLIPVisionModel", "TFGroupViTTextModel", "TFGroupViTVisionModel", - "FASTForImageCaptioning", + "FastForSceneTextRecognition", "FlaxCLIPTextModel", "FlaxCLIPTextModelWithProjection", "FlaxCLIPVisionModel", From 5b9608b2b62f26886a9447b6d9be3717124bb5ce Mon Sep 17 00:00:00 2001 From: raghavanone Date: Sun, 5 Nov 2023 09:44:55 +0530 Subject: [PATCH 029/152] Incorporate PR feedbacks --- .../models/fast/image_processing_fast.py | 15 +-- src/transformers/models/fast/modeling_fast.py | 124 ++---------------- 2 files changed, 15 insertions(+), 124 deletions(-) diff --git a/src/transformers/models/fast/image_processing_fast.py b/src/transformers/models/fast/image_processing_fast.py index 2e58d40c8856..d03f8b542ae0 100644 --- a/src/transformers/models/fast/image_processing_fast.py +++ b/src/transformers/models/fast/image_processing_fast.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Image processor class for Fast.""" +"""Image processor class for FAST.""" import math from typing import Any, Dict, List, Optional, Union @@ -58,7 +58,7 @@ class FastImageProcessor(BaseImageProcessor): r""" - Constructs a Fast image processor. + Constructs a FAST image processor. 
Args: do_resize (`bool`, *optional*, defaults to `True`): @@ -261,7 +261,6 @@ def _preprocess_image( def preprocess( self, images: ImageInput, - segmentation_maps: Optional[ImageInput] = None, do_resize: bool = None, size: Dict[str, int] = None, resample: PILImageResampling = None, @@ -339,8 +338,6 @@ def preprocess( image_std = image_std if image_std is not None else self.image_std images = make_list_of_images(images) - if segmentation_maps is not None: - segmentation_maps = make_list_of_images(segmentation_maps, expected_ndims=2) if not valid_images(images): raise ValueError( @@ -348,12 +345,6 @@ def preprocess( "torch.Tensor, tf.Tensor or jax.ndarray." ) - if segmentation_maps is not None and not valid_images(segmentation_maps): - raise ValueError( - "Invalid segmentation map type. Must be of type PIL.Image.Image, numpy.ndarray, " - "torch.Tensor, tf.Tensor or jax.ndarray." - ) - if do_resize and size is None or resample is None: raise ValueError("Size and resample must be specified if do_resize is True.") diff --git a/src/transformers/models/fast/modeling_fast.py b/src/transformers/models/fast/modeling_fast.py index b88dc6043a72..b209edcdceb4 100644 --- a/src/transformers/models/fast/modeling_fast.py +++ b/src/transformers/models/fast/modeling_fast.py @@ -57,12 +57,9 @@ def get_same_padding(kernel_size): if isinstance(kernel_size, tuple): - assert len(kernel_size) == 2, "invalid kernel size: %s" % kernel_size p1 = get_same_padding(kernel_size[0]) p2 = get_same_padding(kernel_size[1]) return p1, p2 - assert isinstance(kernel_size, int), "kernel size should be either `int` or `tuple`" - assert kernel_size % 2 > 0, "kernel size should be odd number" return kernel_size // 2 @@ -81,103 +78,6 @@ def build_activation(act_func, inplace=True): raise ValueError("do not support: %s" % act_func) -class My2DLayer(nn.Module): - def __init__( - self, in_channels, out_channels, use_bn=True, act_func="relu", dropout_rate=0, ops_order="weight_bn_act" - ): - super(My2DLayer, self).__init__() - self.in_channels = in_channels - self.out_channels = out_channels - - self.use_bn = use_bn - self.act_func = act_func - self.dropout_rate = dropout_rate - self.ops_order = ops_order - - """ modules""" - modules = {} - # batch norm - if self.use_bn: - if self.bn_before_weight: - modules["bn"] = nn.BatchNorm2d(in_channels) - else: - modules["bn"] = nn.BatchNorm2d(out_channels) - else: - modules["bn"] = None - # activation - modules["act"] = build_activation(self.act_func, self.ops_list[0] != "act") - # dropout - if self.dropout_rate > 0: - modules["dropout"] = nn.Dropout2d(self.dropout_rate, inplace=True) - else: - modules["dropout"] = None - # weight - modules["weight"] = self.weight_op() - - # add modules - for op in self.ops_list: - if modules[op] is None: - continue - elif op == "weight": - if modules["dropout"] is not None: - self.add_module("dropout", modules["dropout"]) - for key in modules["weight"]: - self.add_module(key, modules["weight"][key]) - else: - self.add_module(op, modules[op]) - - @property - def ops_list(self): - return self.ops_order.split("_") - - @property - def bn_before_weight(self): - for op in self.ops_list: - if op == "bn": - return True - elif op == "weight": - return False - raise ValueError("Invalid ops_order: %s" % self.ops_order) - - def weight_op(self): - raise NotImplementedError - - """ Methods defined in MyModule""" - - def forward(self, x): - for key, module in self._modules.items(): - if key == "bn" and not self.training: - continue - x = module(x) - return x - - @property - 
def module_str(self): - raise NotImplementedError - - @property - def config(self): - return { - "in_channels": self.in_channels, - "out_channels": self.out_channels, - "use_bn": self.use_bn, - "act_func": self.act_func, - "dropout_rate": self.dropout_rate, - "ops_order": self.ops_order, - } - - @staticmethod - def build_from_config(config): - raise NotImplementedError - - def get_flops(self, x): - raise NotImplementedError - - @staticmethod - def is_zero_layer(): - return False - - class FASTConvLayer(nn.Module): def __init__( self, @@ -202,7 +102,7 @@ def __init__( self.groups = groups self.bias = bias self.has_shuffle = has_shuffle - self.act_func = act_func + self.activation_function = act_func padding = get_same_padding(self.kernel_size) if isinstance(padding, int): @@ -225,11 +125,11 @@ def __init__( if use_bn: self.bn = nn.BatchNorm2d(out_channels) - self.act = nn.Identity() + self.activation = nn.Identity() if use_act: - act = build_activation(self.act_func, True) + act = build_activation(self.activation_function, True) if act is not None: - self.act = act + self.activation = act def forward(self, x): if self.training: @@ -237,27 +137,27 @@ def forward(self, x): delattr(self, "fused_conv") x = self.conv(x) x = self.bn(x) - return self.act(x) + return self.activation(x) else: if not hasattr(self, "fused_conv"): setattr(self, "fused_conv", self.fuse_conv_bn(self.conv, self.bn)) x = self.fused_conv(x) - if self.act is not None: - x = self.act(x) + if self.activation is not None: + x = self.activation(x) return x - def fuse_conv_bn(self, conv, bn): + def fuse_conv_bn(self, conv, batch_norm): """During inference, the functionary of batch norm layers is turned off but only the mean and var alone channels are used, which exposes the chance to fuse it with the preceding conv layers to save computations and simplify network structures.""" - if isinstance(bn, nn.Identity): + if isinstance(batch_norm, nn.Identity): return conv conv_w = conv.weight - conv_b = conv.bias if conv.bias is not None else torch.zeros_like(bn.running_mean) + conv_b = conv.bias if conv.bias is not None else torch.zeros_like(batch_norm.running_mean) - factor = bn.weight / torch.sqrt(bn.running_var + bn.eps) + factor = batch_norm.weight / torch.sqrt(batch_norm.running_var + batch_norm.eps) conv.weight = nn.Parameter(conv_w * factor.reshape([conv.out_channels, 1, 1, 1])) - conv.bias = nn.Parameter((conv_b - bn.running_mean) * factor + bn.bias) + conv.bias = nn.Parameter((conv_b - batch_norm.running_mean) * factor + batch_norm.bias) return conv From 344dc6eafa9dc714c12fca53a5853d189cc5ce6c Mon Sep 17 00:00:00 2001 From: raghavanone Date: Sun, 5 Nov 2023 15:43:30 +0530 Subject: [PATCH 030/152] Incorporate PR feedbacks --- docs/source/en/_toctree.yml | 2 +- docs/source/en/model_doc/fast.md | 2 +- src/transformers/__init__.py | 1 + .../fast/convert_fast_original_to_pytorch.py | 21 ++-- src/transformers/models/fast/modeling_fast.py | 95 +++++++++---------- 5 files changed, 57 insertions(+), 64 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index ca9067c596b0..51602dc805d4 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -538,7 +538,7 @@ - local: model_doc/efficientnet title: EfficientNet - local: model_doc/fast - title: Fast + title: FAST - local: model_doc/focalnet title: FocalNet - local: model_doc/glpn diff --git a/docs/source/en/model_doc/fast.md b/docs/source/en/model_doc/fast.md index b8304251f2f9..e5c8c58f1856 100644 --- a/docs/source/en/model_doc/fast.md 
+++ b/docs/source/en/model_doc/fast.md @@ -24,7 +24,7 @@ arbitrarily-shaped text detector). FAST has two new designs. (1) We design a minimalist kernel representation (only has 1-channel output) to model text with arbitrary shape, as well as a GPU-parallel post-processing to efficiently assemble text lines with a negligible time overhead. (2) We search the network architecture tailored for text detection, leading to more powerful features -than most networks that are searched for image classification +than most networks that are searched for image classification. ## FastConfig diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 37333f4ed67f..4941d724455d 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -50,6 +50,7 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name + # Base objects, independent of any specific backend _import_structure = { "audio_utils": [], diff --git a/src/transformers/models/fast/convert_fast_original_to_pytorch.py b/src/transformers/models/fast/convert_fast_original_to_pytorch.py index 45522f429ec2..b64263b6df9f 100644 --- a/src/transformers/models/fast/convert_fast_original_to_pytorch.py +++ b/src/transformers/models/fast/convert_fast_original_to_pytorch.py @@ -31,18 +31,9 @@ base_config_url = "https://raw.githubusercontent.com/czczup/FAST/main/config/fast/nas-configs/fast_base.config" rename_key_mappings = { - "head": "classifier", - "text_embed": "text_embedding", - "vision_embed": "vision_embedding", - "k_proj": "key_proj", - "q_proj": "query_proj", - "v_proj": "value_proj", - "A": "text", - "B": "image", - "layer_norm": "fc_norm", - "self_attn_fc_norm": "self_attn_layer_norm", - "final_fc_norm": "final_layer_norm", - "first": "first", + "bn": "batch_norm", + "hor": "horizontal", + "ver": "vertical", } @@ -222,7 +213,11 @@ def convert_fast_checkpoint(checkpoint_url, checkpoint_config_url, pytorch_dump_ state_dict_changed = copy.deepcopy(state_dict) for key in state_dict: val = state_dict_changed.pop(key) - state_dict_changed[key.replace("module.", "")] = val + new_key = key.replace("module.", "") + for search, replacement in rename_key_mappings.items(): + if search in new_key: + new_key = new_key.replace(search, replacement) + state_dict_changed[new_key] = val model.load_state_dict(state_dict_changed) model.save_pretrained(pytorch_dump_folder_path) diff --git a/src/transformers/models/fast/modeling_fast.py b/src/transformers/models/fast/modeling_fast.py index b209edcdceb4..dd1e9be971c3 100644 --- a/src/transformers/models/fast/modeling_fast.py +++ b/src/transformers/models/fast/modeling_fast.py @@ -49,9 +49,6 @@ more detail. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. A - classification loss is computed (Cross-Entropy) against these labels. 
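The conversion change above strips the `module.` prefix that DataParallel checkpoints carry and then applies the substring renames from `rename_key_mappings` (`bn` → `batch_norm`, `hor` → `horizontal`, `ver` → `vertical`). A minimal sketch of that remapping on a made-up state dict (the key names and tensor shapes below are illustrative only, not taken from a real checkpoint):

```python
import copy

import torch

rename_key_mappings = {"bn": "batch_norm", "hor": "horizontal", "ver": "vertical"}

# toy state dict standing in for the original FAST checkpoint weights
state_dict = {
    "module.backbone.stage1.0.conv.weight": torch.zeros(64, 3, 3, 3),
    "module.backbone.stage1.0.bn.weight": torch.zeros(64),
    "module.neck.reduce_layer1.hor_conv.weight": torch.zeros(128, 128, 1, 3),
}

state_dict_changed = copy.deepcopy(state_dict)
for key in state_dict:
    val = state_dict_changed.pop(key)
    new_key = key.replace("module.", "")  # drop the DataParallel wrapper prefix
    for search, replacement in rename_key_mappings.items():
        if search in new_key:
            new_key = new_key.replace(search, replacement)
    state_dict_changed[new_key] = val

print(sorted(state_dict_changed))
# ['backbone.stage1.0.batch_norm.weight',
#  'backbone.stage1.0.conv.weight',
#  'neck.reduce_layer1.horizontal_conv.weight']
```

Because the renames are plain substring replacements, they rely on the original checkpoint keys not containing `bn`, `hor`, or `ver` in any other context.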
""" @@ -89,7 +86,7 @@ def __init__( groups=1, bias=False, has_shuffle=False, - use_bn=True, + use_batch_norm=True, act_func="relu", dropout_rate=0, use_act=True, @@ -121,9 +118,9 @@ def __init__( groups=groups, bias=bias, ) - self.bn = nn.Identity() - if use_bn: - self.bn = nn.BatchNorm2d(out_channels) + self.batch_norm = nn.Identity() + if use_batch_norm: + self.batch_norm = nn.BatchNorm2d(out_channels) self.activation = nn.Identity() if use_act: @@ -131,22 +128,22 @@ def __init__( if act is not None: self.activation = act - def forward(self, x): + def forward(self, hidden_states): if self.training: if hasattr(self, "fused_conv"): delattr(self, "fused_conv") - x = self.conv(x) - x = self.bn(x) - return self.activation(x) + hidden_states = self.conv(hidden_states) + hidden_states = self.batch_norm(hidden_states) + return self.activation(hidden_states) else: if not hasattr(self, "fused_conv"): - setattr(self, "fused_conv", self.fuse_conv_bn(self.conv, self.bn)) - x = self.fused_conv(x) + setattr(self, "fused_conv", self.fuse_conv_batch_norm(self.conv, self.batch_norm)) + hidden_states = self.fused_conv(hidden_states) if self.activation is not None: - x = self.activation(x) - return x + hidden_states = self.activation(hidden_states) + return hidden_states - def fuse_conv_bn(self, conv, batch_norm): + def fuse_conv_batch_norm(self, conv, batch_norm): """During inference, the functionary of batch norm layers is turned off but only the mean and var alone channels are used, which exposes the chance to fuse it with the preceding conv layers to save computations and simplify network structures.""" @@ -186,13 +183,13 @@ def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, groups=groups, bias=False, ) - self.main_bn = nn.BatchNorm2d(num_features=out_channels) + self.main_batch_norm = nn.BatchNorm2d(num_features=out_channels) ver_pad = (int(((kernel_size[0] - 1) * dilation) / 2), 0) hor_pad = (0, int(((kernel_size[1] - 1) * dilation) / 2)) if kernel_size[1] != 1: - self.ver_conv = nn.Conv2d( + self.vertical_conv = nn.Conv2d( in_channels=in_channels, out_channels=out_channels, kernel_size=(kernel_size[0], 1), @@ -202,12 +199,12 @@ def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, groups=groups, bias=False, ) - self.ver_bn = nn.BatchNorm2d(num_features=out_channels) + self.vertical_batch_norm = nn.BatchNorm2d(num_features=out_channels) else: - self.ver_conv, self.ver_bn = None, None + self.vertical_conv, self.vertical_batch_norm = None, None if kernel_size[0] != 1: # 卷积核的高大于1 -> 有水平卷积 - self.hor_conv = nn.Conv2d( + self.horizontal_conv = nn.Conv2d( in_channels=in_channels, out_channels=out_channels, kernel_size=(1, kernel_size[1]), @@ -217,9 +214,9 @@ def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, groups=groups, bias=False, ) - self.hor_bn = nn.BatchNorm2d(num_features=out_channels) + self.horizontal_batch_norm = nn.BatchNorm2d(num_features=out_channels) else: - self.hor_conv, self.hor_bn = None, None + self.horizontal_conv, self.horizontal_batch_norm = None, None self.rbr_identity = ( nn.BatchNorm2d(num_features=in_channels) if out_channels == in_channels and stride == 1 else None @@ -231,16 +228,16 @@ def forward(self, hidden_states): self.__delattr__("fused_conv") main_outputs = self.main_conv(hidden_states) - main_outputs = self.main_bn(main_outputs) - if self.ver_conv is not None: - vertical_outputs = self.ver_conv(hidden_states) - vertical_outputs = self.ver_bn(vertical_outputs) + main_outputs = 
self.main_batch_norm(main_outputs) + if self.vertical_conv is not None: + vertical_outputs = self.vertical_conv(hidden_states) + vertical_outputs = self.vertical_batch_norm(vertical_outputs) else: vertical_outputs = 0 - if self.hor_conv is not None: - horizontal_outputs = self.hor_conv(hidden_states) - horizontal_outputs = self.hor_bn(horizontal_outputs) + if self.horizontal_conv is not None: + horizontal_outputs = self.horizontal_conv(hidden_states) + horizontal_outputs = self.horizontal_batch_norm(horizontal_outputs) else: horizontal_outputs = 0 @@ -258,7 +255,6 @@ def forward(self, hidden_states): def _identity_to_conv(self, identity): if identity is None: return 0, 0 - assert isinstance(identity, nn.BatchNorm2d) if not hasattr(self, "id_tensor"): input_dim = self.in_channels // self.groups kernel_value = np.zeros((self.in_channels, input_dim, 1, 1), dtype=np.float32) @@ -276,26 +272,26 @@ def _identity_to_conv(self, identity): t = (gamma / std).reshape(-1, 1, 1, 1) return kernel * t, beta - running_mean * gamma / std - def _fuse_bn_tensor(self, conv, bn): + def _fuse_batch_norm_tensor(self, conv, batch_norm): kernel = conv.weight kernel = self._pad_to_mxn_tensor(kernel) - running_mean = bn.running_mean - running_var = bn.running_var - gamma = bn.weight - beta = bn.bias - eps = bn.eps + running_mean = batch_norm.running_mean + running_var = batch_norm.running_var + gamma = batch_norm.weight + beta = batch_norm.bias + eps = batch_norm.eps std = (running_var + eps).sqrt() t = (gamma / std).reshape(-1, 1, 1, 1) return kernel * t, beta - running_mean * gamma / std def get_equivalent_kernel_bias(self): - kernel_mxn, bias_mxn = self._fuse_bn_tensor(self.main_conv, self.main_bn) - if self.ver_conv is not None: - kernel_mx1, bias_mx1 = self._fuse_bn_tensor(self.ver_conv, self.ver_bn) + kernel_mxn, bias_mxn = self._fuse_batch_norm_tensor(self.main_conv, self.main_batch_norm) + if self.vertical_conv is not None: + kernel_mx1, bias_mx1 = self._fuse_batch_norm_tensor(self.vertical_conv, self.vertical_batch_norm) else: kernel_mx1, bias_mx1 = 0, 0 - if self.hor_conv is not None: - kernel_1xn, bias_1xn = self._fuse_bn_tensor(self.hor_conv, self.hor_bn) + if self.horizontal_conv is not None: + kernel_1xn, bias_1xn = self._fuse_batch_norm_tensor(self.horizontal_conv, self.horizontal_batch_norm) else: kernel_1xn, bias_1xn = 0, 0 kernel_id, bias_id = self._identity_to_conv(self.rbr_identity) @@ -746,10 +742,12 @@ class FastForSceneTextRecognitionOutput(ModelOutput): @add_start_docstrings( - """BEiT-3 is a general-purpose multimodal foundation model that excels in both vision and vision-language tasks. It - utilizes [Multiway transformers] (https://arxiv.org/abs/2208.10442) for deep fusion and modality-specific - encoding, and unifies masked modeling on images, texts, and image-text pairs, achieving top performance on - multiple benchmarks.""", + """FAST (faster arbitararily-shaped text detector) proposes an accurate and efficient scene text detection + framework, termed FAST (i.e., faster arbitrarily-shaped text detector).FAST has two new designs. (1) They design a + minimalist kernel representation (only has 1-channel output) to model text with arbitrary shape, as well as a + GPU-parallel post-processing to efficiently assemble text lines with a negligible time overhead. 
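The reparameterization above relies on folding an inference-mode batch norm into the preceding convolution: the kernel is scaled by `gamma / sqrt(running_var + eps)` and the mean/beta terms are folded into a bias. A standalone check of that algebra with plain `nn.Conv2d` / `nn.BatchNorm2d` layers (toy shapes, unrelated to the model's actual configuration):

```python
import torch
from torch import nn

conv = nn.Conv2d(8, 16, kernel_size=3, padding=1, bias=False)
batch_norm = nn.BatchNorm2d(16).eval()

with torch.no_grad():
    # give the batch norm non-trivial statistics so the check is meaningful
    batch_norm.running_mean.uniform_(-1.0, 1.0)
    batch_norm.running_var.uniform_(0.5, 2.0)
    batch_norm.weight.uniform_(0.5, 1.5)
    batch_norm.bias.uniform_(-1.0, 1.0)

    # same algebra as the fusion above: scale the kernel, fold mean/beta into a bias
    factor = batch_norm.weight / torch.sqrt(batch_norm.running_var + batch_norm.eps)
    fused = nn.Conv2d(8, 16, kernel_size=3, padding=1, bias=True)
    fused.weight.copy_(conv.weight * factor.reshape(16, 1, 1, 1))
    # torch.zeros(16) stands in for the missing conv bias, as in the patch
    fused.bias.copy_((torch.zeros(16) - batch_norm.running_mean) * factor + batch_norm.bias)

    pixel_values = torch.randn(2, 8, 32, 32)
    assert torch.allclose(fused(pixel_values), batch_norm(conv(pixel_values)), atol=1e-5)
```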
(2) We search the + network architecture tailored for text detection, leading to more powerful features than most networks that are + searched for image classification.""", FAST_START_DOCSTRING, ) class FastForSceneTextRecognition(FastPreTrainedModel): @@ -810,9 +808,8 @@ def forward( labels: Dict = None, ): r""" - labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*): - Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy). + labels (`Dict[str, torch.Tensor]`, *optional*): + Should contain 3 keys: gt_texts,gt_kernels,gt_instances Returns: From 932d59233544f7e37276cd6e8fee73aa0b0e7343 Mon Sep 17 00:00:00 2001 From: raghavanone Date: Sun, 5 Nov 2023 16:39:20 +0530 Subject: [PATCH 031/152] Incorporate PR feedbacks --- .../models/fast/configuration_fast.py | 2 -- .../fast/convert_fast_original_to_pytorch.py | 12 ++++------- .../models/fast/image_processing_fast.py | 21 ++++++++++--------- src/transformers/models/fast/modeling_fast.py | 7 ++----- .../models/fast/test_image_processing_fast.py | 3 ++- tests/models/fast/test_modeling_fast.py | 6 ++++-- 6 files changed, 23 insertions(+), 28 deletions(-) diff --git a/src/transformers/models/fast/configuration_fast.py b/src/transformers/models/fast/configuration_fast.py index 3f813386507f..186b398a4745 100644 --- a/src/transformers/models/fast/configuration_fast.py +++ b/src/transformers/models/fast/configuration_fast.py @@ -96,7 +96,6 @@ def __init__( head_final_dropout_rate=0, head_final_ops_order="weight", min_area=250, - min_score=0.88, bbox_type="rect", loss_bg=False, initializer_range=0.02, @@ -176,7 +175,6 @@ def __init__( self.head_final_ops_order = head_final_ops_order self.min_area = min_area - self.min_score = min_score self.bbox_type = bbox_type self.loss_bg = loss_bg self.initializer_range = initializer_range diff --git a/src/transformers/models/fast/convert_fast_original_to_pytorch.py b/src/transformers/models/fast/convert_fast_original_to_pytorch.py index b64263b6df9f..0207f123b257 100644 --- a/src/transformers/models/fast/convert_fast_original_to_pytorch.py +++ b/src/transformers/models/fast/convert_fast_original_to_pytorch.py @@ -43,7 +43,7 @@ def prepare_img(): return im -def prepare_config(size_config_url, pooling_size, min_area, min_score, bbox_type, loss_bg): +def prepare_config(size_config_url, pooling_size, min_area, bbox_type, loss_bg): config_dict = json.loads(requests.get(size_config_url).text) backbone_config = {} @@ -148,7 +148,6 @@ def prepare_config(size_config_url, pooling_size, min_area, min_score, bbox_type head_final_dropout_rate=config_dict["head"]["final"]["dropout_rate"], head_final_ops_order=config_dict["head"]["final"]["ops_order"], min_area=min_area, - min_score=min_score, bbox_type=bbox_type, loss_bg=loss_bg, ) @@ -174,27 +173,25 @@ def convert_fast_checkpoint(checkpoint_url, checkpoint_config_url, pytorch_dump_ test_config = namespace.get("test_cfg", None) data_config = namespace.get("data") - min_score = 0.88 min_area = 250 bbox_type = "rect" loss_bg = False if test_config is not None: min_area = test_config.get("min_area", min_area) - min_score = test_config.get("min_score", min_score) bbox_type = test_config.get("bbox_type", bbox_type) loss_bg = test_config.get("loss_emb", None) == "EmbLoss_v2" if "tiny" in model_config["backbone"]["config"]: config = prepare_config( - tiny_config_url, model_config["detection_head"]["pooling_size"], 
min_area, min_score, bbox_type, loss_bg + tiny_config_url, model_config["detection_head"]["pooling_size"], min_area, bbox_type, loss_bg ) elif "small" in model_config["backbone"]["config"]: config = prepare_config( - small_config_url, model_config["detection_head"]["pooling_size"], min_area, min_score, bbox_type, loss_bg + small_config_url, model_config["detection_head"]["pooling_size"], min_area, bbox_type, loss_bg ) else: config = prepare_config( - base_config_url, model_config["detection_head"]["pooling_size"], min_area, min_score, bbox_type, loss_bg + base_config_url, model_config["detection_head"]["pooling_size"], min_area, bbox_type, loss_bg ) size = 640 if "train" in data_config: @@ -204,7 +201,6 @@ def convert_fast_checkpoint(checkpoint_url, checkpoint_config_url, pytorch_dump_ model = FastForSceneTextRecognition(config) fast_image_processor = FastImageProcessor( size={"height": size, "width": size}, - min_score=config.min_score, min_area=config.min_area, bbox_type=config.bbox_type, pooling_size=config.head_pooling_size, diff --git a/src/transformers/models/fast/image_processing_fast.py b/src/transformers/models/fast/image_processing_fast.py index d03f8b542ae0..8aeb1e6f0334 100644 --- a/src/transformers/models/fast/image_processing_fast.py +++ b/src/transformers/models/fast/image_processing_fast.py @@ -92,10 +92,12 @@ class FastImageProcessor(BaseImageProcessor): image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`): The standard deviation to use if normalizing the image. This is a float or list of floats of length of the number of channels of the image. Can be overridden by the `image_std` parameter in the `preprocess` method. - min_area (`int`, *optional*, defaults to 200): Threshold for min area for results - min_score (`float`, *optional*, defaults to 0.88): Threshold for min score for results - bbox_type (`str`, *optional*, defaults to `"rect"`): Type of bbox, rect or poly - pooling_size (`int`, *optional*, defaults to 9): Pooling size for text detection + min_area (`int`, *optional*, defaults to 200): + Threshold for min area for results + bbox_type (`str`, *optional*, defaults to `"rect"`): + Type of bbox, rect or poly + pooling_size (`int`, *optional*, defaults to 9): + Pooling size for text detection """ model_input_names = ["pixel_values"] @@ -113,7 +115,6 @@ def __init__( image_mean: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None, min_area: int = 200, - min_score: float = 0.88, bbox_type: str = "rect", pooling_size: int = 9, **kwargs, @@ -134,7 +135,7 @@ def __init__( self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD self.min_area = min_area - self.min_score = min_score + # self.threshold = threshold self.bbox_type = bbox_type self.pooling_size = pooling_size @@ -389,7 +390,7 @@ def _max_pooling(self, x, scale=1): ) return x - def post_process_text_detection(self, output, target_sizes): + def post_process_text_detection(self, output, target_sizes, threshold): scale = 2 img_size = (self.size["height"], self.size["width"]) out = output["hidden_states"] @@ -428,13 +429,13 @@ def post_process_text_detection(self, output, target_sizes): org_img_size = target_sizes[i] scales = (float(org_img_size[1]) / float(img_size[1]), float(org_img_size[0]) / float(img_size[0])) - bboxes, scores = self.generate_bbox(keys[i], labels[i], score_maps[i], scales) + bboxes, scores = 
self.generate_bbox(keys[i], labels[i], score_maps[i], scales, threshold) results.append({"bboxes": bboxes, "scores": scores}) final_results.update({"results": results}) return results - def generate_bbox(self, keys, label, score, scales): + def generate_bbox(self, keys, label, score, scales, threshold): label_num = len(keys) bboxes = [] scores = [] @@ -447,7 +448,7 @@ def generate_bbox(self, keys, label, score, scales): label[ind] = 0 continue score_i = score[ind].mean().item() - if score_i < self.min_score: + if score_i < threshold: label[ind] = 0 continue diff --git a/src/transformers/models/fast/modeling_fast.py b/src/transformers/models/fast/modeling_fast.py index dd1e9be971c3..761f7a9066f7 100644 --- a/src/transformers/models/fast/modeling_fast.py +++ b/src/transformers/models/fast/modeling_fast.py @@ -501,10 +501,6 @@ def __init__(self, config): config.head_final_ops_order, ) - self.min_area = config.min_area - self.min_score = config.min_score - self.bbox_type = config.bbox_type - self.pooling_size = config.head_pooling_size self.pooling_1s = nn.MaxPool2d(kernel_size=self.pooling_size, stride=1, padding=(self.pooling_size - 1) // 2) @@ -828,7 +824,8 @@ def forward( >>> # forward pass >>> outputs = model(pixel_values=inputs["pixel_values"]) >>> target_sizes = [(image.shape[1], image.shape[2]) for image in inputs["pixel_values"]] - >>> text_locations = processor.post_process_text_detection(outputs, target_sizes) + >>> threshold = 0.88 + >>> text_locations = processor.post_process_text_detection(outputs, target_sizes, threshold) >>> print(text_locations[0]["bboxes"][0][:10]) [484, 175, 484, 178, 483, 179, 452, 179, 452, 182] ``` diff --git a/tests/models/fast/test_image_processing_fast.py b/tests/models/fast/test_image_processing_fast.py index 8aa523dc03f3..f8192856849b 100644 --- a/tests/models/fast/test_image_processing_fast.py +++ b/tests/models/fast/test_image_processing_fast.py @@ -152,7 +152,8 @@ def prepare_image(): output = model(pixel_values=torch.tensor(inputs["pixel_values"])) target_sizes = [(image.shape[1], image.shape[2]) for image in inputs["pixel_values"]] - final_out = image_processor.post_process_text_detection(output, target_sizes) + threshold = 0.88 + final_out = image_processor.post_process_text_detection(output, target_sizes, threshold) assert len(final_out[0]["bboxes"]) == 2 assert len(final_out[0]["bboxes"][0]) == 716 diff --git a/tests/models/fast/test_modeling_fast.py b/tests/models/fast/test_modeling_fast.py index 409f579eed0f..f97481436676 100644 --- a/tests/models/fast/test_modeling_fast.py +++ b/tests/models/fast/test_modeling_fast.py @@ -403,7 +403,8 @@ def prepare_image(): output = model(pixel_values=torch.tensor(input["pixel_values"])) target_sizes = [(image.shape[1], image.shape[2]) for image in input["pixel_values"]] - final_out = image_processor.post_process_text_detection(output, target_sizes) + threshold = 0.88 + final_out = image_processor.post_process_text_detection(output, target_sizes, threshold) assert final_out[0]["bboxes"][0] == [224, 120, 246, 120, 246, 134, 224, 134] assert round(float(final_out[0]["scores"][0]), 5) == 0.95541 @@ -424,7 +425,8 @@ def prepare_image(): output = model(pixel_values=torch.tensor(input["pixel_values"])) target_sizes = [(image.shape[1], image.shape[2]) for image in input["pixel_values"]] - final_out = image_processor.post_process_text_detection(output, target_sizes) + threshold = 0.88 + final_out = image_processor.post_process_text_detection(output, target_sizes, threshold) assert final_out[0]["bboxes"][0][:10] 
== [484, 175, 484, 178, 483, 179, 452, 179, 452, 182] assert round(float(final_out[0]["scores"][0]), 5) == 0.92356 From 5f1af193c5b63f07c9997de5059d20a297fc2069 Mon Sep 17 00:00:00 2001 From: raghavanone Date: Sun, 5 Nov 2023 18:58:31 +0530 Subject: [PATCH 032/152] Incorporate PR feedbacks --- src/transformers/models/fast/modeling_fast.py | 20 ------------------- 1 file changed, 20 deletions(-) diff --git a/src/transformers/models/fast/modeling_fast.py b/src/transformers/models/fast/modeling_fast.py index 761f7a9066f7..cfd3506de0fc 100644 --- a/src/transformers/models/fast/modeling_fast.py +++ b/src/transformers/models/fast/modeling_fast.py @@ -446,16 +446,6 @@ def __init__(self, config): for layer_ix in range(0, len(reduce_layer_configs)): setattr(self, f"reduce_layer{layer_ix + 1}", FASTRepConvLayer(*reduce_layer_configs[layer_ix])) - self._initialize_weights() - - def _initialize_weights(self): - for m in self.modules(): - if isinstance(m, nn.Conv2d): - nn.init.kaiming_normal_(m.weight) - elif isinstance(m, nn.BatchNorm2d): - m.weight.data.fill_(1) - m.bias.data.zero_() - def _upsample(self, layer_out, height, width): return F.upsample(layer_out, size=(height, width), mode="bilinear") @@ -513,16 +503,6 @@ def __init__(self, config): else: self.dropout = None - self._initialize_weights() - - def _initialize_weights(self): - for m in self.modules(): - if isinstance(m, nn.Conv2d): - nn.init.kaiming_normal_(m.weight) - elif isinstance(m, nn.BatchNorm2d): - m.weight.data.fill_(1) - m.bias.data.zero_() - def forward(self, hidden_states): hidden_states = self.conv(hidden_states) if self.dropout is not None: From c9a354320ae308486bd0d560fb9f340736bf33bf Mon Sep 17 00:00:00 2001 From: raghavanone Date: Wed, 8 Nov 2023 09:09:57 +0530 Subject: [PATCH 033/152] Introduce TextNet --- README.md | 1 + README_es.md | 1 + README_hd.md | 1 + README_ja.md | 1 + README_ko.md | 1 + README_zh-hans.md | 1 + README_zh-hant.md | 1 + docs/source/en/index.md | 1 + docs/source/en/model_doc/textnet.md | 42 ++ docs/source/en/tasks/image_classification.md | 2 +- src/transformers/__init__.py | 23 + .../models/auto/configuration_auto.py | 2 + src/transformers/models/auto/modeling_auto.py | 3 + src/transformers/models/textnet/__init__.py | 53 ++ .../models/textnet/configuration_textnet.py | 134 ++++ .../textnet/image_processing_textnet.py | 323 +++++++++ .../models/textnet/modeling_textnet.py | 614 ++++++++++++++++++ src/transformers/utils/dummy_pt_objects.py | 24 + .../utils/dummy_vision_objects.py | 7 + tests/models/textnet/__init__.py | 0 tests/models/textnet/test_modeling_textnet.py | 407 ++++++++++++ 21 files changed, 1641 insertions(+), 1 deletion(-) create mode 100644 docs/source/en/model_doc/textnet.md create mode 100644 src/transformers/models/textnet/__init__.py create mode 100644 src/transformers/models/textnet/configuration_textnet.py create mode 100644 src/transformers/models/textnet/image_processing_textnet.py create mode 100644 src/transformers/models/textnet/modeling_textnet.py create mode 100644 tests/models/textnet/__init__.py create mode 100644 tests/models/textnet/test_modeling_textnet.py diff --git a/README.md b/README.md index daab3d1f9d6b..cb1beeec315c 100644 --- a/README.md +++ b/README.md @@ -491,6 +491,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. 
**[Table Transformer](https://huggingface.co/docs/transformers/model_doc/table-transformer)** (from Microsoft Research) released with the paper [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) by Brandon Smock, Rohith Pesala, Robin Abraham. 1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos. 1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou. +1. **[TextNet](https://huggingface.co/docs/transformers/main/model_doc/textnet)** (from ) released with the paper []() by . 1. **[Time Series Transformer](https://huggingface.co/docs/transformers/model_doc/time_series_transformer)** (from HuggingFace). 1. **[TimeSformer](https://huggingface.co/docs/transformers/model_doc/timesformer)** (from Facebook) released with the paper [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) by Gedas Bertasius, Heng Wang, Lorenzo Torresani. 1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine diff --git a/README_es.md b/README_es.md index 9e1ac93b4a99..2d8279f5b0fe 100644 --- a/README_es.md +++ b/README_es.md @@ -466,6 +466,7 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[Table Transformer](https://huggingface.co/docs/transformers/model_doc/table-transformer)** (from Microsoft Research) released with the paper [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) by Brandon Smock, Rohith Pesala, Robin Abraham. 1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos. 1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou. +1. **[TextNet](https://huggingface.co/docs/transformers/main/model_doc/textnet)** (from ) released with the paper []() by . 1. **[Time Series Transformer](https://huggingface.co/docs/transformers/model_doc/time_series_transformer)** (from HuggingFace). 1. **[TimeSformer](https://huggingface.co/docs/transformers/model_doc/timesformer)** (from Facebook) released with the paper [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) by Gedas Bertasius, Heng Wang, Lorenzo Torresani. 1. 
**[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine diff --git a/README_hd.md b/README_hd.md index 92935efb589c..ef97795ebbff 100644 --- a/README_hd.md +++ b/README_hd.md @@ -440,6 +440,7 @@ conda install conda-forge::transformers 1. **[Table Transformer](https://huggingface.co/docs/transformers/model_doc/table-transformer)** (माइक्रोसॉफ्ट रिसर्च से) साथ में पेपर [पबटेबल्स-1एम: टूवर्ड्स कॉम्प्रिहेंसिव टेबल एक्सट्रैक्शन फ्रॉम अनस्ट्रक्चर्ड डॉक्यूमेंट्स ](https://arxiv.org/abs/2110.00061) ब्रैंडन स्मॉक, रोहित पेसाला, रॉबिन अब्राहम द्वारा पोस्ट किया गया। 1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (Google AI से) साथ में कागज [TAPAS: पूर्व-प्रशिक्षण के माध्यम से कमजोर पर्यवेक्षण तालिका पार्सिंग](https:// arxiv.org/abs/2004.02349) जोनाथन हर्ज़िग, पावेल क्रिज़िस्तोफ़ नोवाक, थॉमस मुलर, फ्रांसेस्को पिकिन्नो और जूलियन मार्टिन ईसेन्च्लोस द्वारा। 1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (माइक्रोसॉफ्ट रिसर्च से) साथ में पेपर [TAPEX: टेबल प्री-ट्रेनिंग थ्रू लर्निंग अ न्यूरल SQL एक्ज़ीक्यूटर](https: //arxiv.org/abs/2107.07653) कियान लियू, बेई चेन, जियाकी गुओ, मोर्टेज़ा ज़ियादी, ज़ेकी लिन, वीज़ू चेन, जियान-गुआंग लू द्वारा पोस्ट किया गया। +1. **[TextNet](https://huggingface.co/docs/transformers/main/model_doc/textnet)** (from ) released with the paper []() by . 1. **[Time Series Transformer](https://huggingface.co/docs/transformers/model_doc/time_series_transformer)** (from HuggingFace). 1. **[TimeSformer](https://huggingface.co/docs/transformers/model_doc/timesformer)** (from Facebook) released with the paper [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) by Gedas Bertasius, Heng Wang, Lorenzo Torresani. 1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine diff --git a/README_ja.md b/README_ja.md index f43dda021c6f..cf9b70b1ba7f 100644 --- a/README_ja.md +++ b/README_ja.md @@ -500,6 +500,7 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ 1. **[Table Transformer](https://huggingface.co/docs/transformers/model_doc/table-transformer)** (Microsoft Research から) Brandon Smock, Rohith Pesala, Robin Abraham から公開された研究論文: [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) 1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (Google AI から) Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos から公開された研究論文: [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) 1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (Microsoft Research から) Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou から公開された研究論文: [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) +1. **[TextNet](https://huggingface.co/docs/transformers/main/model_doc/textnet)** (from ) released with the paper []() by . 1. 
**[Time Series Transformer](https://huggingface.co/docs/transformers/model_doc/time_series_transformer)** (HuggingFace から). 1. **[TimeSformer](https://huggingface.co/docs/transformers/model_doc/timesformer)** (Facebook から) Gedas Bertasius, Heng Wang, Lorenzo Torresani から公開された研究論文: [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) 1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (the University of California at Berkeley から) Michael Janner, Qiyang Li, Sergey Levine から公開された研究論文: [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) diff --git a/README_ko.md b/README_ko.md index c2e53a1b81ce..6b256cba5aa0 100644 --- a/README_ko.md +++ b/README_ko.md @@ -415,6 +415,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[Table Transformer](https://huggingface.co/docs/transformers/model_doc/table-transformer)** (Microsoft Research 에서) Brandon Smock, Rohith Pesala, Robin Abraham 의 [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) 논문과 함께 발표했습니다. 1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (Google AI 에서) Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos 의 [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) 논문과 함께 발표했습니다. 1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (Microsoft Research 에서) Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou 의 [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) 논문과 함께 발표했습니다. +1. **[TextNet](https://huggingface.co/docs/transformers/main/model_doc/textnet)** (from ) released with the paper []() by . 1. **[Time Series Transformer](https://huggingface.co/docs/transformers/model_doc/time_series_transformer)** (from HuggingFace). 1. **[TimeSformer](https://huggingface.co/docs/transformers/model_doc/timesformer)** (Facebook 에서) Gedas Bertasius, Heng Wang, Lorenzo Torresani 의 [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) 논문과 함께 발표했습니다. 1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (the University of California at Berkeley 에서) Michael Janner, Qiyang Li, Sergey Levin 의 [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) 논문과 함께 발표했습니다. diff --git a/README_zh-hans.md b/README_zh-hans.md index 972f3a386f42..4b2950743ce7 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -439,6 +439,7 @@ conda install conda-forge::transformers 1. **[Table Transformer](https://huggingface.co/docs/transformers/model_doc/table-transformer)** (来自 Microsoft Research) 伴随论文 [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) 由 Brandon Smock, Rohith Pesala, Robin Abraham 发布。 1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (来自 Google AI) 伴随论文 [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) 由 Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos 发布。 1. 
**[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (来自 Microsoft Research) 伴随论文 [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) 由 Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou 发布。 +1. **[TextNet](https://huggingface.co/docs/transformers/main/model_doc/textnet)** (from ) released with the paper []() by . 1. **[Time Series Transformer](https://huggingface.co/docs/transformers/model_doc/time_series_transformer)** (from HuggingFace). 1. **[TimeSformer](https://huggingface.co/docs/transformers/model_doc/timesformer)** (from Facebook) released with the paper [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) by Gedas Bertasius, Heng Wang, Lorenzo Torresani. 1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine diff --git a/README_zh-hant.md b/README_zh-hant.md index b17c8946bc3e..511630eb58fa 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -451,6 +451,7 @@ conda install conda-forge::transformers 1. **[Table Transformer](https://huggingface.co/docs/transformers/model_doc/table-transformer)** (from Microsoft Research) released with the paper [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) by Brandon Smock, Rohith Pesala, Robin Abraham. 1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos. 1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou. +1. **[TextNet](https://huggingface.co/docs/transformers/main/model_doc/textnet)** (from ) released with the paper []() by . 1. **[Time Series Transformer](https://huggingface.co/docs/transformers/model_doc/time_series_transformer)** (from HuggingFace). 1. **[TimeSformer](https://huggingface.co/docs/transformers/model_doc/timesformer)** (from Facebook) released with the paper [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) by Gedas Bertasius, Heng Wang, Lorenzo Torresani. 1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine diff --git a/docs/source/en/index.md b/docs/source/en/index.md index 52b5df6e59ba..76cdfee42f6d 100644 --- a/docs/source/en/index.md +++ b/docs/source/en/index.md @@ -266,6 +266,7 @@ Flax), PyTorch, and/or TensorFlow. 
| [Table Transformer](model_doc/table-transformer) | ✅ | ❌ | ❌ | | [TAPAS](model_doc/tapas) | ✅ | ✅ | ❌ | | [TAPEX](model_doc/tapex) | ✅ | ✅ | ✅ | +| [TextNet](model_doc/textnet) | ✅ | ❌ | ❌ | | [Time Series Transformer](model_doc/time_series_transformer) | ✅ | ❌ | ❌ | | [TimeSformer](model_doc/timesformer) | ✅ | ❌ | ❌ | | [Trajectory Transformer](model_doc/trajectory_transformer) | ✅ | ❌ | ❌ | diff --git a/docs/source/en/model_doc/textnet.md b/docs/source/en/model_doc/textnet.md new file mode 100644 index 000000000000..088adb572bdb --- /dev/null +++ b/docs/source/en/model_doc/textnet.md @@ -0,0 +1,42 @@ + + +# TextNet + +## Overview + +The TextNet model was proposed in [FAST: Faster Arbitrarily-Shaped Text Detector with Minimalist Kernel Representation](https://arxiv.org/abs/2111.02394) by Zhe Chen, Jiahao Wang, Wenhai Wang, Guo Chen, Enze Xie, Ping Luo, Tong Lu. +TextNet was results of NAS for efficient text detection task. + +## TextNetConfig + +[[autodoc]] TextNetConfig + +## TextNetImageProcessor + +[[autodoc]] TextNetImageProcessor + - preprocess + +## TextNetModel + +[[autodoc]] TextNetModel + - forward + +## TextNetForImageClassification + +[[autodoc]] TextNetForImageClassification + - forward + diff --git a/docs/source/en/tasks/image_classification.md b/docs/source/en/tasks/image_classification.md index 489ec59ddf6a..55949c68ee14 100644 --- a/docs/source/en/tasks/image_classification.md +++ b/docs/source/en/tasks/image_classification.md @@ -34,7 +34,7 @@ The task illustrated in this tutorial is supported by the following model archit -[BEiT](../model_doc/beit), [BiT](../model_doc/bit), [ConvNeXT](../model_doc/convnext), [ConvNeXTV2](../model_doc/convnextv2), [CvT](../model_doc/cvt), [Data2VecVision](../model_doc/data2vec-vision), [DeiT](../model_doc/deit), [DiNAT](../model_doc/dinat), [DINOv2](../model_doc/dinov2), [EfficientFormer](../model_doc/efficientformer), [EfficientNet](../model_doc/efficientnet), [FocalNet](../model_doc/focalnet), [ImageGPT](../model_doc/imagegpt), [LeViT](../model_doc/levit), [MobileNetV1](../model_doc/mobilenet_v1), [MobileNetV2](../model_doc/mobilenet_v2), [MobileViT](../model_doc/mobilevit), [MobileViTV2](../model_doc/mobilevitv2), [NAT](../model_doc/nat), [Perceiver](../model_doc/perceiver), [PoolFormer](../model_doc/poolformer), [PVT](../model_doc/pvt), [RegNet](../model_doc/regnet), [ResNet](../model_doc/resnet), [SegFormer](../model_doc/segformer), [SwiftFormer](../model_doc/swiftformer), [Swin Transformer](../model_doc/swin), [Swin Transformer V2](../model_doc/swinv2), [VAN](../model_doc/van), [ViT](../model_doc/vit), [ViT Hybrid](../model_doc/vit_hybrid), [ViTMSN](../model_doc/vit_msn) +[BEiT](../model_doc/beit), [BiT](../model_doc/bit), [ConvNeXT](../model_doc/convnext), [ConvNeXTV2](../model_doc/convnextv2), [CvT](../model_doc/cvt), [Data2VecVision](../model_doc/data2vec-vision), [DeiT](../model_doc/deit), [DiNAT](../model_doc/dinat), [DINOv2](../model_doc/dinov2), [EfficientFormer](../model_doc/efficientformer), [EfficientNet](../model_doc/efficientnet), [FocalNet](../model_doc/focalnet), [ImageGPT](../model_doc/imagegpt), [LeViT](../model_doc/levit), [MobileNetV1](../model_doc/mobilenet_v1), [MobileNetV2](../model_doc/mobilenet_v2), [MobileViT](../model_doc/mobilevit), [MobileViTV2](../model_doc/mobilevitv2), [NAT](../model_doc/nat), [Perceiver](../model_doc/perceiver), [PoolFormer](../model_doc/poolformer), [PVT](../model_doc/pvt), [RegNet](../model_doc/regnet), [ResNet](../model_doc/resnet), [SegFormer](../model_doc/segformer), 
[SwiftFormer](../model_doc/swiftformer), [Swin Transformer](../model_doc/swin), [Swin Transformer V2](../model_doc/swinv2), [TextNet](../model_doc/textnet), [VAN](../model_doc/van), [ViT](../model_doc/vit), [ViT Hybrid](../model_doc/vit_hybrid), [ViTMSN](../model_doc/vit_msn) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 4941d724455d..90270a9e406f 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -822,6 +822,10 @@ "TapasConfig", "TapasTokenizer", ], + "models.textnet": [ + "TEXTNET_PRETRAINED_CONFIG_ARCHIVE_MAP", + "TextNetConfig" + ], "models.time_series_transformer": [ "TIME_SERIES_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "TimeSeriesTransformerConfig", @@ -1300,6 +1304,7 @@ _import_structure["models.segformer"].extend(["SegformerFeatureExtractor", "SegformerImageProcessor"]) _import_structure["models.siglip"].append("SiglipImageProcessor") _import_structure["models.swin2sr"].append("Swin2SRImageProcessor") + _import_structure["models.textnet"].append("TextNetImageProcessor") _import_structure["models.tvlt"].append("TvltImageProcessor") _import_structure["models.tvp"].append("TvpImageProcessor") _import_structure["models.videomae"].extend(["VideoMAEFeatureExtractor", "VideoMAEImageProcessor"]) @@ -3295,6 +3300,13 @@ "load_tf_weights_in_tapas", ] ) + _import_structure["models.textnet"].extend( + [ + "TEXTNET_PRETRAINED_CONFIG_ARCHIVE_MAP", + "TextNetBackbone", + "TextNetModel", + ] + ) _import_structure["models.time_series_transformer"].extend( [ "TIME_SERIES_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -5518,6 +5530,10 @@ TapasConfig, TapasTokenizer, ) + from .models.textnet import ( + TEXTNET_PRETRAINED_CONFIG_ARCHIVE_MAP, + TextNetConfig, + ) from .models.time_series_transformer import ( TIME_SERIES_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, TimeSeriesTransformerConfig, @@ -5993,6 +6009,7 @@ from .models.segformer import SegformerFeatureExtractor, SegformerImageProcessor from .models.siglip import SiglipImageProcessor from .models.swin2sr import Swin2SRImageProcessor + from .models.textnet import TextNetImageProcessor from .models.tvlt import TvltImageProcessor from .models.tvp import TvpImageProcessor from .models.videomae import VideoMAEFeatureExtractor, VideoMAEImageProcessor @@ -7642,6 +7659,12 @@ TapasPreTrainedModel, load_tf_weights_in_tapas, ) + from .models.textnet import ( + TEXTNET_PRETRAINED_CONFIG_ARCHIVE_MAP, + TextNetBackbone, + TextNetModel, + TextNetPreTrainedModel, + ) from .models.time_series_transformer import ( TIME_SERIES_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST, TimeSeriesTransformerForPrediction, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 9eb3f1985c85..8ac6a1912c3a 100755 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -216,6 +216,7 @@ ("t5", "T5Config"), ("table-transformer", "TableTransformerConfig"), ("tapas", "TapasConfig"), + ("textnet", "TextNetConfig"), ("time_series_transformer", "TimeSeriesTransformerConfig"), ("timesformer", "TimesformerConfig"), ("timm_backbone", "TimmBackboneConfig"), @@ -685,6 +686,7 @@ ("table-transformer", "Table Transformer"), ("tapas", "TAPAS"), ("tapex", "TAPEX"), + ("textnet", "TextNet"), ("time_series_transformer", "Time Series Transformer"), ("timesformer", "TimeSformer"), ("timm_backbone", "TimmBackbone"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py 
index 7bf50a4518fa..c1ecdee1578e 100755 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -207,6 +207,7 @@ ("t5", "T5Model"), ("table-transformer", "TableTransformerModel"), ("tapas", "TapasModel"), + ("textnet", "TextNetModel"), ("time_series_transformer", "TimeSeriesTransformerModel"), ("timesformer", "TimesformerModel"), ("timm_backbone", "TimmBackbone"), @@ -538,6 +539,7 @@ ("swiftformer", "SwiftFormerForImageClassification"), ("swin", "SwinForImageClassification"), ("swinv2", "Swinv2ForImageClassification"), + ("textnet", "TextNetForImageClassification"), ("van", "VanForImageClassification"), ("vit", "ViTForImageClassification"), ("vit_hybrid", "ViTHybridForImageClassification"), @@ -1123,6 +1125,7 @@ ("resnet", "ResNetBackbone"), ("swin", "SwinBackbone"), ("swinv2", "Swinv2Backbone"), + ("textnet", "TextNetBackbone"), ("timm_backbone", "TimmBackbone"), ("vitdet", "VitDetBackbone"), ] diff --git a/src/transformers/models/textnet/__init__.py b/src/transformers/models/textnet/__init__.py new file mode 100644 index 000000000000..21e26f387817 --- /dev/null +++ b/src/transformers/models/textnet/__init__.py @@ -0,0 +1,53 @@ +# coding=utf-8 +# Copyright 2023 the Fast authors and HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import ( + OptionalDependencyNotAvailable, + _LazyModule, + is_torch_available, +) + + +_import_structure = { + "configuration_textnet": ["TEXTNET_PRETRAINED_CONFIG_ARCHIVE_MAP", "TextNetConfig"], + "image_processing_textnet": ["TextNetImageProcessor"], +} + +try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_textnet"] = ["TextNetBackbone", "TextNetModel", "TextNetPreTrainedModel"] + +if TYPE_CHECKING: + from .configuration_textnet import TEXTNET_PRETRAINED_CONFIG_ARCHIVE_MAP, TextNetConfig + from .image_processing_textnet import TextNetImageProcessor + + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_textnet import TextNetBackbone, TextNetModel, TextNetPreTrainedModel + + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/src/transformers/models/textnet/configuration_textnet.py b/src/transformers/models/textnet/configuration_textnet.py new file mode 100644 index 000000000000..9c7fe907aa13 --- /dev/null +++ b/src/transformers/models/textnet/configuration_textnet.py @@ -0,0 +1,134 @@ +# coding=utf-8 +# Copyright The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" TextNet model configuration""" +from transformers import PretrainedConfig +from transformers.utils import logging +from transformers.utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices + + +logger = logging.get_logger(__name__) + +TEXTNET_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "fast_base_tt_800_finetune_ic17mlt": ( + "https://huggingface.co/Raghavan/fast_base_tt_800_finetune_ic17mlt/raw/main/config.json" + ), +} + + +class TextNetConfig(BackboneConfigMixin, PretrainedConfig): + r""" + [Raghavan/fast_base_tt_800_finetune_ic17mlt](https://huggingface.co/Raghavan/fast_base_tt_800_finetune_ic17mlt) + """ + + def __init__( + self, + backbone_kernel_size=3, + backbone_stride=2, + backbone_dilation=1, + backbone_groups=1, + backbone_bias=False, + backbone_has_shuffle=False, + backbone_in_channels=3, + backbone_out_channels=64, + backbone_use_bn=True, + backbone_act_func="relu", + backbone_dropout_rate=0, + backbone_ops_order="weight_bn_act", + backbone_stage1_in_channels=[64, 64, 64], + backbone_stage1_out_channels=[64, 64, 64], + backbone_stage1_kernel_size=[[3, 3], [3, 3], [3, 3]], + backbone_stage1_stride=[1, 2, 1], + backbone_stage1_dilation=[1, 1, 1], + backbone_stage1_groups=[1, 1, 1], + backbone_stage2_in_channels=[64, 128, 128, 128], + backbone_stage2_out_channels=[128, 128, 128, 128], + backbone_stage2_kernel_size=[[3, 3], [1, 3], [3, 3], [3, 1]], + backbone_stage2_stride=[2, 1, 1, 1], + backbone_stage2_dilation=[1, 1, 1, 1], + backbone_stage2_groups=[1, 1, 1, 1], + backbone_stage3_in_channels=[128, 256, 256, 256], + backbone_stage3_out_channels=[256, 256, 256, 256], + backbone_stage3_kernel_size=[[3, 3], [3, 3], [3, 1], [1, 3]], + backbone_stage3_stride=[2, 1, 1, 1], + backbone_stage3_dilation=[1, 1, 1, 1], + backbone_stage3_groups=[1, 1, 1, 1], + backbone_stage4_in_channels=[256, 512, 512, 512], + backbone_stage4_out_channels=[512, 512, 512, 512], + backbone_stage4_kernel_size=[[3, 3], [3, 1], [1, 3], [3, 3]], + backbone_stage4_stride=[2, 1, 1, 1], + backbone_stage4_dilation=[1, 1, 1, 1], + backbone_stage4_groups=[1, 1, 1, 1], + hidden_sizes=[64, 64, 128, 256, 512], + initializer_range=0.02, + out_features=None, + out_indices=None, + **kwargs, + ): + super().__init__(**kwargs) + + self.backbone_kernel_size = backbone_kernel_size + self.backbone_stride = backbone_stride + self.backbone_dilation = backbone_dilation + self.backbone_groups = backbone_groups + self.backbone_bias = backbone_bias + self.backbone_has_shuffle = backbone_has_shuffle + self.backbone_in_channels = backbone_in_channels + self.backbone_out_channels = backbone_out_channels + self.backbone_use_bn = backbone_use_bn + self.backbone_act_func = backbone_act_func + self.backbone_dropout_rate = backbone_dropout_rate + self.backbone_ops_order = backbone_ops_order + + self.backbone_stage1_in_channels = backbone_stage1_in_channels + self.backbone_stage1_out_channels = backbone_stage1_out_channels + self.backbone_stage1_kernel_size = backbone_stage1_kernel_size + self.backbone_stage1_stride = backbone_stage1_stride + self.backbone_stage1_dilation = backbone_stage1_dilation + 
self.backbone_stage1_groups = backbone_stage1_groups + + self.backbone_stage2_in_channels = backbone_stage2_in_channels + self.backbone_stage2_out_channels = backbone_stage2_out_channels + self.backbone_stage2_kernel_size = backbone_stage2_kernel_size + self.backbone_stage2_stride = backbone_stage2_stride + self.backbone_stage2_dilation = backbone_stage2_dilation + self.backbone_stage2_groups = backbone_stage2_groups + + self.backbone_stage3_in_channels = backbone_stage3_in_channels + self.backbone_stage3_out_channels = backbone_stage3_out_channels + self.backbone_stage3_kernel_size = backbone_stage3_kernel_size + self.backbone_stage3_stride = backbone_stage3_stride + self.backbone_stage3_dilation = backbone_stage3_dilation + self.backbone_stage3_groups = backbone_stage3_groups + + self.backbone_stage4_in_channels = backbone_stage4_in_channels + self.backbone_stage4_out_channels = backbone_stage4_out_channels + self.backbone_stage4_kernel_size = backbone_stage4_kernel_size + self.backbone_stage4_stride = backbone_stage4_stride + self.backbone_stage4_dilation = backbone_stage4_dilation + self.backbone_stage4_groups = backbone_stage4_groups + + self.initializer_range = initializer_range + self.hidden_sizes = hidden_sizes + + self.depths = [ + len(self.backbone_stage1_out_channels), + len(self.backbone_stage2_out_channels), + len(self.backbone_stage3_out_channels), + len(self.backbone_stage4_out_channels), + ] + self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, 5)] + self._out_features, self._out_indices = get_aligned_output_features_output_indices( + out_features=out_features, out_indices=out_indices, stage_names=self.stage_names + ) diff --git a/src/transformers/models/textnet/image_processing_textnet.py b/src/transformers/models/textnet/image_processing_textnet.py new file mode 100644 index 000000000000..32975e13c7a8 --- /dev/null +++ b/src/transformers/models/textnet/image_processing_textnet.py @@ -0,0 +1,323 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Image processor class for TextNet.""" + +from typing import Dict, List, Optional, Union + +import numpy as np + +from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +from ...image_transforms import ( + convert_to_rgb, + get_resize_output_image_size, + resize, + to_channel_dimension_format, +) +from ...image_utils import ( + OPENAI_CLIP_MEAN, + OPENAI_CLIP_STD, + ChannelDimension, + ImageInput, + PILImageResampling, + infer_channel_dimension_format, + is_scaled_image, + make_list_of_images, + to_numpy_array, + valid_images, +) +from ...utils import TensorType, is_vision_available, logging + + +logger = logging.get_logger(__name__) + + +if is_vision_available(): + import PIL + + +class TextNetImageProcessor(BaseImageProcessor): + r""" + Constructs a TextNet image processor. 
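Assuming the config lands as defined in this patch, a short sketch of how its backbone-related fields line up; the commented values are what the defaults above imply, not output captured from a run:

```python
from transformers import TextNetConfig

config = TextNetConfig(out_features=["stage1", "stage2", "stage3", "stage4"])

print(config.stage_names)   # ['stem', 'stage1', 'stage2', 'stage3', 'stage4']
print(config.depths)        # [3, 4, 4, 4] -- one entry per conv block in each stage
print(config.out_indices)   # indices aligned with out_features, i.e. the four stages
print(config.hidden_sizes)  # [64, 64, 128, 256, 512]
```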
+
+    Args:
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by
+            `do_resize` in the `preprocess` method.
+        size (`Dict[str, int]`, *optional*, defaults to `{"shortest_edge": 224}`):
+            Size of the image after resizing. The shortest edge of the image is resized to size["shortest_edge"], with
+            the longest edge resized to keep the input aspect ratio. Can be overridden by `size` in the `preprocess`
+            method.
+        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
+            Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method.
+        do_center_crop (`bool`, *optional*, defaults to `True`):
+            Whether to center crop the image to the specified `crop_size`. Can be overridden by `do_center_crop` in the
+            `preprocess` method.
+        crop_size (`Dict[str, int]`, *optional*, defaults to `{"height": 224, "width": 224}`):
+            Size of the output image after applying `center_crop`. Can be overridden by `crop_size` in the `preprocess`
+            method.
+        do_rescale (`bool`, *optional*, defaults to `True`):
+            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in
+            the `preprocess` method.
+        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
+            Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess`
+            method.
+            Whether to normalize the image. Can be overridden by `do_normalize` in the `preprocess` method.
+        do_normalize (`bool`, *optional*, defaults to `True`):
+        image_mean (`float` or `List[float]`, *optional*, defaults to `OPENAI_CLIP_MEAN`):
+            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
+            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
+        image_std (`float` or `List[float]`, *optional*, defaults to `OPENAI_CLIP_STD`):
+            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
+            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
+        do_convert_rgb (`bool`, *optional*, defaults to `True`):
+            Whether to convert the image to RGB.
+        use_square_size (`bool`, *optional*, defaults to `False`):
+            The value to be passed to `get_size_dict` as `default_to_square` when computing the image size. If the
+            `size` argument in `get_size_dict` is an `int`, it determines whether to default to a square image or not.
+            Note that this attribute is not used in computing `crop_size` via calling `get_size_dict`.
+ """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + do_resize: bool = True, + size: Dict[str, int] = None, + resample: PILImageResampling = PILImageResampling.BICUBIC, + do_center_crop: bool = True, + crop_size: Dict[str, int] = None, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_convert_rgb: bool = True, + use_square_size: bool = False, + **kwargs, + ) -> None: + super().__init__(**kwargs) + size = size if size is not None else {"shortest_edge": 224} + size = get_size_dict(size, default_to_square=use_square_size) + crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224} + crop_size = get_size_dict(crop_size, default_to_square=True, param_name="crop_size") + + self.do_resize = do_resize + self.size = size + self.resample = resample + self.do_center_crop = do_center_crop + self.crop_size = crop_size + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN + self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD + self.do_convert_rgb = do_convert_rgb + self.use_square_size = use_square_size + + # Copied from transformers.models.clip.image_processing_clip.CLIPImageProcessor.resize + def resize( + self, + image: np.ndarray, + size: Dict[str, int], + resample: PILImageResampling = PILImageResampling.BICUBIC, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: + """ + Resize an image. The shortest edge of the image is resized to size["shortest_edge"], with the longest edge + resized to keep the input aspect ratio. + + Args: + image (`np.ndarray`): + Image to resize. + size (`Dict[str, int]`): + Size of the output image. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): + Resampling filter to use when resiizing the image. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred. + """ + size = get_size_dict(size, default_to_square=self.use_square_size) + if "shortest_edge" not in size: + raise ValueError(f"The `size` parameter must contain the key `shortest_edge`. 
Got {size.keys()}") + output_size = get_resize_output_image_size( + image, + size=size["shortest_edge"], + default_to_square=self.use_square_size, + input_data_format=input_data_format, + ) + return resize( + image, + size=output_size, + resample=resample, + data_format=data_format, + input_data_format=input_data_format, + **kwargs, + ) + + def preprocess( + self, + images: ImageInput, + do_resize: bool = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + do_center_crop: bool = None, + crop_size: int = None, + do_rescale: bool = None, + rescale_factor: float = None, + do_normalize: bool = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_convert_rgb: bool = None, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> PIL.Image.Image: + """ + Preprocess an image or batch of images. + + Args: + images (`ImageInput`): + Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If + passing in images with pixel values between 0 and 1, set `do_rescale=False`. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`Dict[str, int]`, *optional*, defaults to `self.size`): + Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with + the longest edge resized to keep the input aspect ratio. + resample (`int`, *optional*, defaults to `self.resample`): + Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only + has an effect if `do_resize` is set to `True`. + do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`): + Whether to center crop the image. + crop_size (`Dict[str, int]`, *optional*, defaults to `self.crop_size`): + Size of the center crop. Only has an effect if `do_center_crop` is set to `True`. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. + image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): + Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`. + image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to + `True`. + do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`): + Whether to convert the image to RGB. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. 
Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: Use the channel dimension format of the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + """ + do_resize = do_resize if do_resize is not None else self.do_resize + size = size if size is not None else self.size + size = get_size_dict(size, param_name="size", default_to_square=self.use_square_size) + resample = resample if resample is not None else self.resample + do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop + crop_size = crop_size if crop_size is not None else self.crop_size + crop_size = get_size_dict(crop_size, param_name="crop_size", default_to_square=True) + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb + + images = make_list_of_images(images) + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + + if do_resize and size is None: + raise ValueError("Size must be specified if do_resize is True.") + + if do_center_crop and crop_size is None: + raise ValueError("Crop size must be specified if do_center_crop is True.") + + if do_rescale and rescale_factor is None: + raise ValueError("Rescale factor must be specified if do_rescale is True.") + + if do_normalize and (image_mean is None or image_std is None): + raise ValueError("Image mean and std must be specified if do_normalize is True.") + + # PIL RGBA images are converted to RGB + if do_convert_rgb: + images = [convert_to_rgb(image) for image in images] + + # All transformations expect numpy arrays. + images = [to_numpy_array(image) for image in images] + + if is_scaled_image(images[0]) and do_rescale: + logger.warning_once( + "It looks like you are trying to rescale already rescaled images. If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." + ) + + if input_data_format is None: + # We assume that all images have the same channel dimension format. 
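+            # `infer_channel_dimension_format` looks at the shape of the first image to decide between
+            # channels-first (num_channels, height, width) and channels-last (height, width, num_channels);
+            # pass `input_data_format` explicitly when the shape is ambiguous (e.g. a 3x3 RGB image).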
+ input_data_format = infer_channel_dimension_format(images[0]) + + if do_resize: + images = [ + self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format) + for image in images + ] + + if do_center_crop: + images = [ + self.center_crop(image=image, size=crop_size, input_data_format=input_data_format) for image in images + ] + + if do_rescale: + images = [ + self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) + for image in images + ] + + if do_normalize: + images = [ + self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format) + for image in images + ] + + images = [ + to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images + ] + + data = {"pixel_values": images} + return BatchFeature(data=data, tensor_type=return_tensors) diff --git a/src/transformers/models/textnet/modeling_textnet.py b/src/transformers/models/textnet/modeling_textnet.py new file mode 100644 index 000000000000..119ee2c7418d --- /dev/null +++ b/src/transformers/models/textnet/modeling_textnet.py @@ -0,0 +1,614 @@ +# coding=utf-8 +# Copyright 2021 Microsoft Research and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch TextNet model.""" +from typing import Any, List, Optional, Tuple, Union + +import numpy as np +import torch +import torch.nn as nn +from torch import Tensor +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from transformers import PreTrainedModel, add_start_docstrings +from transformers.modeling_outputs import ( + BackboneOutput, + BaseModelOutputWithPoolingAndNoAttention, + ImageClassifierOutputWithNoAttention, +) +from transformers.models.textnet.configuration_textnet import TextNetConfig +from transformers.utils import add_start_docstrings_to_model_forward, logging, replace_return_docstrings +from transformers.utils.backbone_utils import BackboneMixin + + +logger = logging.get_logger(__name__) + +# General docstring +_CONFIG_FOR_DOC = "BitConfig" + +TEXTNET_START_DOCSTRING = r""" + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it + as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`BitConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +BIT_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`BitImageProcessor.__call__`] + for details. + + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. 
See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+BIT_PRETRAINED_MODEL_ARCHIVE_LIST = [
+    # "google/bit-50",
+    # See all BiT models at https://huggingface.co/models?filter=bit
+]
+
+
+def get_same_padding(kernel_size):
+    # "Same" padding for odd kernels: pad by kernel_size // 2, computed per dimension for tuple kernels.
+    if isinstance(kernel_size, tuple):
+        p1 = get_same_padding(kernel_size[0])
+        p2 = get_same_padding(kernel_size[1])
+        return p1, p2
+    return kernel_size // 2
+
+
+def build_activation(act_func, inplace=True):
+    if act_func == "relu":
+        return nn.ReLU(inplace=inplace)
+    elif act_func == "relu6":
+        return nn.ReLU6(inplace=inplace)
+    elif act_func == "tanh":
+        return nn.Tanh()
+    elif act_func == "sigmoid":
+        return nn.Sigmoid()
+    elif act_func is None:
+        return None
+    else:
+        raise ValueError(f"Unsupported activation function: {act_func}")
+
+
+class TextNetConvLayer(nn.Module):
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size=3,
+        stride=1,
+        dilation=1,
+        groups=1,
+        bias=False,
+        has_shuffle=False,
+        use_batch_norm=True,
+        act_func="relu",
+        dropout_rate=0,
+        use_act=True,
+    ):
+        super().__init__()
+
+        self.kernel_size = kernel_size
+        self.stride = stride
+        self.dilation = dilation
+        self.groups = groups
+        self.bias = bias
+        self.has_shuffle = has_shuffle
+        self.activation_function = act_func
+
+        padding = get_same_padding(self.kernel_size)
+        if isinstance(padding, int):
+            padding *= self.dilation
+        else:
+            # `get_same_padding` returns a tuple for tuple kernels; tuples are immutable, so build a new one.
+            padding = (padding[0] * self.dilation, padding[1] * self.dilation)
+
+        self.conv = nn.Conv2d(
+            in_channels,
+            out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            groups=groups,
+            bias=bias,
+        )
+        self.batch_norm = nn.Identity()
+        if use_batch_norm:
+            self.batch_norm = nn.BatchNorm2d(out_channels)
+
+        self.activation = nn.Identity()
+        if use_act:
+            act = build_activation(self.activation_function, True)
+            if act is not None:
+                self.activation = act
+
+    def forward(self, hidden_states):
+        if self.training:
+            if hasattr(self, "fused_conv"):
+                delattr(self, "fused_conv")
+            hidden_states = self.conv(hidden_states)
+            hidden_states = self.batch_norm(hidden_states)
+            return self.activation(hidden_states)
+        else:
+            if not hasattr(self, "fused_conv"):
+                setattr(self, "fused_conv", self.fuse_conv_batch_norm(self.conv, self.batch_norm))
+            hidden_states = self.fused_conv(hidden_states)
+            if self.activation is not None:
+                hidden_states = self.activation(hidden_states)
+            return hidden_states
+
+    def fuse_conv_batch_norm(self, conv, batch_norm):
+        """During inference, a batch norm layer reduces to a fixed per-channel affine transform based on its running
+        mean and variance, which makes it possible to fuse it into the preceding conv layer to save computation and
+        simplify the network structure."""
+        if isinstance(batch_norm, nn.Identity):
+            return conv
+        conv_w = conv.weight
+        conv_b = conv.bias if conv.bias is not None else torch.zeros_like(batch_norm.running_mean)
+
+        # w' = w * gamma / sqrt(var + eps) and b' = (b - mean) * gamma / sqrt(var + eps) + beta
+        factor = batch_norm.weight / torch.sqrt(batch_norm.running_var + batch_norm.eps)
+        conv.weight = nn.Parameter(conv_w * factor.reshape([conv.out_channels, 1, 1, 1]))
+        conv.bias = nn.Parameter((conv_b - batch_norm.running_mean) * factor + batch_norm.bias)
+        return conv
+
+
+class TestNetRepConvLayer(nn.Module):
+    def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, groups=1):
+        super().__init__()
+
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.kernel_size = kernel_size
+        self.stride = stride
+        self.dilation = dilation
+        self.groups = groups
+
+        padding =
(int(((kernel_size[0] - 1) * dilation) / 2), int(((kernel_size[1] - 1) * dilation) / 2))
+
+        self.nonlinearity = nn.ReLU(inplace=True)
+
+        self.main_conv = nn.Conv2d(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            groups=groups,
+            bias=False,
+        )
+        self.main_batch_norm = nn.BatchNorm2d(num_features=out_channels)
+
+        ver_pad = (int(((kernel_size[0] - 1) * dilation) / 2), 0)
+        hor_pad = (0, int(((kernel_size[1] - 1) * dilation) / 2))
+
+        if kernel_size[1] != 1:  # kernel width > 1 -> add the vertical (n x 1) conv branch
+            self.vertical_conv = nn.Conv2d(
+                in_channels=in_channels,
+                out_channels=out_channels,
+                kernel_size=(kernel_size[0], 1),
+                stride=stride,
+                padding=ver_pad,
+                dilation=dilation,
+                groups=groups,
+                bias=False,
+            )
+            self.vertical_batch_norm = nn.BatchNorm2d(num_features=out_channels)
+        else:
+            self.vertical_conv, self.vertical_batch_norm = None, None
+
+        if kernel_size[0] != 1:  # kernel height > 1 -> add the horizontal (1 x n) conv branch
+            self.horizontal_conv = nn.Conv2d(
+                in_channels=in_channels,
+                out_channels=out_channels,
+                kernel_size=(1, kernel_size[1]),
+                stride=stride,
+                padding=hor_pad,
+                dilation=dilation,
+                groups=groups,
+                bias=False,
+            )
+            self.horizontal_batch_norm = nn.BatchNorm2d(num_features=out_channels)
+        else:
+            self.horizontal_conv, self.horizontal_batch_norm = None, None
+
+        # Identity (batch-norm only) branch, only possible when the block keeps the input shape.
+        self.rbr_identity = (
+            nn.BatchNorm2d(num_features=in_channels) if out_channels == in_channels and stride == 1 else None
+        )
+
+    def forward(self, hidden_states):
+        if self.training:
+            if hasattr(self, "fused_conv"):
+                self.__delattr__("fused_conv")
+
+            main_outputs = self.main_conv(hidden_states)
+            main_outputs = self.main_batch_norm(main_outputs)
+            if self.vertical_conv is not None:
+                vertical_outputs = self.vertical_conv(hidden_states)
+                vertical_outputs = self.vertical_batch_norm(vertical_outputs)
+            else:
+                vertical_outputs = 0
+
+            if self.horizontal_conv is not None:
+                horizontal_outputs = self.horizontal_conv(hidden_states)
+                horizontal_outputs = self.horizontal_batch_norm(horizontal_outputs)
+            else:
+                horizontal_outputs = 0
+
+            if self.rbr_identity is None:
+                id_out = 0
+            else:
+                id_out = self.rbr_identity(hidden_states)
+
+            return self.nonlinearity(main_outputs + vertical_outputs + horizontal_outputs + id_out)
+        else:
+            if not hasattr(self, "fused_conv"):
+                self.prepare_for_eval()
+            return self.nonlinearity(self.fused_conv(hidden_states))
+
+    def _identity_to_conv(self, identity):
+        if identity is None:
+            return 0, 0
+        if not hasattr(self, "id_tensor"):
+            input_dim = self.in_channels // self.groups
+            kernel_value = np.zeros((self.in_channels, input_dim, 1, 1), dtype=np.float32)
+            for i in range(self.in_channels):
+                kernel_value[i, i % input_dim, 0, 0] = 1
+            id_tensor = torch.from_numpy(kernel_value).to(identity.weight.device)
+            self.id_tensor = self._pad_to_mxn_tensor(id_tensor)
+        kernel = self.id_tensor
+        running_mean = identity.running_mean
+        running_var = identity.running_var
+        gamma = identity.weight
+        beta = identity.bias
+        eps = identity.eps
+        std = (running_var + eps).sqrt()
+        t = (gamma / std).reshape(-1, 1, 1, 1)
+        return kernel * t, beta - running_mean * gamma / std
+
+    def _fuse_batch_norm_tensor(self, conv, batch_norm):
+        kernel = conv.weight
+        kernel = self._pad_to_mxn_tensor(kernel)
+        running_mean = batch_norm.running_mean
+        running_var = batch_norm.running_var
+        gamma = batch_norm.weight
+        beta = batch_norm.bias
+        eps = batch_norm.eps
+        std = (running_var + eps).sqrt()
+        t = (gamma / std).reshape(-1, 1, 1, 1)
+        return kernel * t, beta - running_mean * gamma /
std + + def get_equivalent_kernel_bias(self): + kernel_mxn, bias_mxn = self._fuse_batch_norm_tensor(self.main_conv, self.main_batch_norm) + if self.vertical_conv is not None: + kernel_mx1, bias_mx1 = self._fuse_batch_norm_tensor(self.vertical_conv, self.vertical_batch_norm) + else: + kernel_mx1, bias_mx1 = 0, 0 + if self.horizontal_conv is not None: + kernel_1xn, bias_1xn = self._fuse_batch_norm_tensor(self.horizontal_conv, self.horizontal_batch_norm) + else: + kernel_1xn, bias_1xn = 0, 0 + kernel_id, bias_id = self._identity_to_conv(self.rbr_identity) + kernel_mxn = kernel_mxn + kernel_mx1 + kernel_1xn + kernel_id + bias_mxn = bias_mxn + bias_mx1 + bias_1xn + bias_id + return kernel_mxn, bias_mxn + + def _pad_to_mxn_tensor(self, kernel): + kernel_height, kernel_width = self.kernel_size + height, width = kernel.shape[2:] + pad_left_right = (kernel_width - width) // 2 + pad_top_down = (kernel_height - height) // 2 + return torch.nn.functional.pad(kernel, [pad_left_right, pad_left_right, pad_top_down, pad_top_down]) + + def prepare_for_eval(self): + kernel, bias = self.get_equivalent_kernel_bias() + self.fused_conv = nn.Conv2d( + in_channels=self.main_conv.in_channels, + out_channels=self.main_conv.out_channels, + kernel_size=self.main_conv.kernel_size, + stride=self.main_conv.stride, + padding=self.main_conv.padding, + dilation=self.main_conv.dilation, + groups=self.main_conv.groups, + bias=True, + ) + self.fused_conv.weight.data = kernel + self.fused_conv.bias.data = bias + for para in self.fused_conv.parameters(): + para.detach_() + + +class TextNetPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = TextNetConfig + base_model_prefix = "textnet" + main_input_name = "pixel_values" + + def _init_weights(self, module): + if isinstance(module, (nn.Linear, nn.Conv2d)): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + + +@add_start_docstrings( + "The bare Textnet model outputting raw features without any specific head on top.", + TEXTNET_START_DOCSTRING, +) +class TextNetModel(TextNetPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.first_conv = TextNetConvLayer( + config.backbone_in_channels, + config.backbone_out_channels, + config.backbone_kernel_size, + config.backbone_stride, + config.backbone_dilation, + config.backbone_groups, + config.backbone_bias, + config.backbone_has_shuffle, + config.backbone_use_bn, + config.backbone_act_func, + config.backbone_dropout_rate, + config.backbone_ops_order, + ) + stage1 = [] + for stage_config in zip( + config.backbone_stage1_in_channels, + config.backbone_stage1_out_channels, + config.backbone_stage1_kernel_size, + config.backbone_stage1_stride, + config.backbone_stage1_dilation, + config.backbone_stage1_groups, + ): + stage1.append(TestNetRepConvLayer(*stage_config)) + self.stage1 = nn.ModuleList(stage1) + + stage2 = [] + for stage_config in zip( + config.backbone_stage2_in_channels, + config.backbone_stage2_out_channels, + config.backbone_stage2_kernel_size, + config.backbone_stage2_stride, + config.backbone_stage2_dilation, + config.backbone_stage2_groups, + ): + stage2.append(TestNetRepConvLayer(*stage_config)) + self.stage2 = nn.ModuleList(stage2) + + stage3 = [] + for stage_config in zip( + config.backbone_stage3_in_channels, + config.backbone_stage3_out_channels, + 
config.backbone_stage3_kernel_size, + config.backbone_stage3_stride, + config.backbone_stage3_dilation, + config.backbone_stage3_groups, + ): + stage3.append(TestNetRepConvLayer(*stage_config)) + self.stage3 = nn.ModuleList(stage3) + + stage4 = [] + for stage_config in zip( + config.backbone_stage4_in_channels, + config.backbone_stage4_out_channels, + config.backbone_stage4_kernel_size, + config.backbone_stage4_stride, + config.backbone_stage4_dilation, + config.backbone_stage4_groups, + ): + stage4.append(TestNetRepConvLayer(*stage_config)) + self.stage4 = nn.ModuleList(stage4) + + self.pooler = nn.AdaptiveAvgPool2d((2, 2)) + + self.init_weights() + + def forward( + self, pixel_values: Tensor, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None + ) -> Union[Tuple[Any, List[Any]], Tuple[Any], BaseModelOutputWithPoolingAndNoAttention]: + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + + hidden_state = self.first_conv(pixel_values) + hidden_states = [hidden_state] + + for block in self.stage1: + hidden_state = block(hidden_state) + hidden_states.append(hidden_state) + + for block in self.stage2: + hidden_state = block(hidden_state) + hidden_states.append(hidden_state) + + for block in self.stage3: + hidden_state = block(hidden_state) + hidden_states.append(hidden_state) + + for block in self.stage4: + hidden_state = block(hidden_state) + hidden_states.append(hidden_state) + + pooled_output = self.pooler(hidden_state) + + if not return_dict: + output = (pooled_output, hidden_state) + return output + (hidden_states,) if output_hidden_states else output + + return BaseModelOutputWithPoolingAndNoAttention( + pooler_output=pooled_output, + last_hidden_state=hidden_state, + hidden_states=tuple(hidden_states) if output_hidden_states else None, + ) + + +@add_start_docstrings( + """ + TextNet backbone, to be used with frameworks like DETR and MaskFormer. 
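+
+    The backbone returns the feature maps of the stages selected through the `out_features` / `out_indices`
+    arguments of [`TextNetConfig`]; see the usage example in the `forward` docstring.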
+ """, + TEXTNET_START_DOCSTRING, +) +class TextNetBackbone(TextNetPreTrainedModel, BackboneMixin): + def __init__(self, config): + super().__init__(config) + super()._init_backbone(config) + + self.textnet = TextNetModel(config) + self.num_features = [ + config.backbone_out_channels, + config.backbone_stage1_out_channels[-1], + config.backbone_stage2_out_channels[-1], + config.backbone_stage3_out_channels[-1], + config.backbone_stage4_out_channels[-1], + ] + + # initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward("BIT_INPUTS_DOCSTRING") + @replace_return_docstrings(output_type=BackboneOutput, config_class="") + def forward( + self, pixel_values: Tensor, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None + ) -> BackboneOutput: + """ + Returns: + + Examples: + + ```python + >>> from transformers import AutoImageProcessor, AutoBackbone + >>> import torch + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> processor = AutoImageProcessor.from_pretrained("google/resnetnv2-50") + >>> model = AutoBackbone.from_pretrained("google/resnetnv2-50") + + >>> inputs = processor(image, return_tensors="pt") + >>> outputs = model(**inputs) + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + + outputs = self.textnet(pixel_values, output_hidden_states=True, return_dict=True) + + hidden_states = outputs.hidden_states + + feature_maps = () + for idx, stage in enumerate(self.stage_names): + if stage in self.out_features: + feature_maps += (hidden_states[idx],) + + if not return_dict: + output = (feature_maps,) + if output_hidden_states: + output += (outputs.hidden_states,) + return output + + return BackboneOutput( + feature_maps=feature_maps, + hidden_states=outputs.hidden_states if output_hidden_states else None, + attentions=None, + ) + + +@add_start_docstrings( + """ + TextNet Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for + ImageNet. + """, + TEXTNET_START_DOCSTRING, +) +class TextNetForImageClassification(TextNetPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.textnet = TextNetModel(config) + # classification head + self.classifier = nn.Sequential( + nn.Flatten(), + nn.Linear(config.hidden_sizes[-1] * 2 * 2, config.num_labels) if config.num_labels > 0 else nn.Identity(), + ) + # initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward("BIT_INPUTS_DOCSTRING") + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> ImageClassifierOutputWithNoAttention: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the image classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.textnet(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict) + + last_hidden_state = outputs.last_hidden_state if return_dict else outputs[0] + + logits = self.classifier(last_hidden_state) + + loss = None + + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + if not return_dict: + output = (logits,) + outputs[2:] + return (loss,) + output if loss is not None else output + + return ImageClassifierOutputWithNoAttention(loss=loss, logits=logits, hidden_states=outputs.hidden_states) diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 06bdee17752b..095c9a6f4189 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -8023,6 +8023,30 @@ def load_tf_weights_in_tapas(*args, **kwargs): requires_backends(load_tf_weights_in_tapas, ["torch"]) +TEXTNET_PRETRAINED_CONFIG_ARCHIVE_MAP = None + + +class TextNetBackbone(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class TextNetModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class TextNetPreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + TIME_SERIES_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = None diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py index 89366aba5081..18c6a27bd7dc 100644 --- a/src/transformers/utils/dummy_vision_objects.py +++ b/src/transformers/utils/dummy_vision_objects.py @@ -485,6 +485,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["vision"]) +class TextNetImageProcessor(metaclass=DummyObject): + _backends = ["vision"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["vision"]) + + class TvltImageProcessor(metaclass=DummyObject): _backends = ["vision"] diff --git a/tests/models/textnet/__init__.py b/tests/models/textnet/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/models/textnet/test_modeling_textnet.py b/tests/models/textnet/test_modeling_textnet.py new file mode 100644 index 000000000000..d7ebe31f6021 --- /dev/null +++ b/tests/models/textnet/test_modeling_textnet.py @@ -0,0 +1,407 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch TextNet model. """ +import inspect +import unittest + +import torch.nn as nn + +from transformers import ( + TextNetBackbone, + TextNetConfig, + is_torch_available, +) +from transformers.models.textnet.modeling_textnet import TextNetForImageClassification +from transformers.testing_utils import ( + require_torch, + torch_device, +) + +from ...test_backbone_common import BackboneTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + from transformers import ( + TextNetModel, + ) + + +class TextNetModelTester: + def __init__( + self, + parent, + backbone_kernel_size=3, + backbone_stride=2, + backbone_dilation=1, + backbone_groups=1, + backbone_bias=False, + backbone_has_shuffle=False, + backbone_in_channels=3, + backbone_out_channels=64, + backbone_use_bn=True, + backbone_act_func="relu", + backbone_dropout_rate=0, + backbone_ops_order="weight_bn_act", + backbone_stage1_in_channels=[64], + backbone_stage1_out_channels=[64], + backbone_stage1_kernel_size=[[3, 3]], + backbone_stage1_stride=[1], + backbone_stage1_dilation=[1], + backbone_stage1_groups=[1], + backbone_stage2_in_channels=[64], + backbone_stage2_out_channels=[128], + backbone_stage2_kernel_size=[[3, 1]], + backbone_stage2_stride=[2], + backbone_stage2_dilation=[1], + backbone_stage2_groups=[1], + backbone_stage3_in_channels=[128], + backbone_stage3_out_channels=[256], + backbone_stage3_kernel_size=[[1, 3]], + backbone_stage3_stride=[2], + backbone_stage3_dilation=[1], + backbone_stage3_groups=[1], + backbone_stage4_in_channels=[256], + backbone_stage4_out_channels=[512], + backbone_stage4_kernel_size=[[3, 3]], + backbone_stage4_stride=[2], + backbone_stage4_dilation=[1], + backbone_stage4_groups=[1], + out_features=["stage1", "stage2", "stage3", "stage4"], + out_indices=[1, 2, 3, 4], + batch_size=3, + num_channels=3, + image_size=32, + is_training=True, + use_labels=True, + hidden_act="relu", + num_labels=3, + hidden_sizes=[64, 64, 128, 256, 512], + ): + self.parent = parent + self.backbone_kernel_size = backbone_kernel_size + self.backbone_stride = backbone_stride + self.backbone_dilation = backbone_dilation + self.backbone_groups = backbone_groups + self.backbone_bias = backbone_bias + self.backbone_has_shuffle = backbone_has_shuffle + self.backbone_in_channels = backbone_in_channels + self.backbone_out_channels = backbone_out_channels + self.backbone_use_bn = backbone_use_bn + self.backbone_act_func = backbone_act_func + self.backbone_dropout_rate = backbone_dropout_rate + self.backbone_ops_order = backbone_ops_order + + self.backbone_stage1_in_channels = backbone_stage1_in_channels + self.backbone_stage1_out_channels = backbone_stage1_out_channels + self.backbone_stage1_kernel_size = backbone_stage1_kernel_size + self.backbone_stage1_stride = backbone_stage1_stride + self.backbone_stage1_dilation = backbone_stage1_dilation + self.backbone_stage1_groups = backbone_stage1_groups + + 
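+        # Each stage list holds a single entry here, so the tester builds a one-block-per-stage TextNet;
+        # this keeps the model below the 3M-parameter bound asserted in `test_model_is_small`.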
self.backbone_stage2_in_channels = backbone_stage2_in_channels + self.backbone_stage2_out_channels = backbone_stage2_out_channels + self.backbone_stage2_kernel_size = backbone_stage2_kernel_size + self.backbone_stage2_stride = backbone_stage2_stride + self.backbone_stage2_dilation = backbone_stage2_dilation + self.backbone_stage2_groups = backbone_stage2_groups + + self.backbone_stage3_in_channels = backbone_stage3_in_channels + self.backbone_stage3_out_channels = backbone_stage3_out_channels + self.backbone_stage3_kernel_size = backbone_stage3_kernel_size + self.backbone_stage3_stride = backbone_stage3_stride + self.backbone_stage3_dilation = backbone_stage3_dilation + self.backbone_stage3_groups = backbone_stage3_groups + + self.backbone_stage4_in_channels = backbone_stage4_in_channels + self.backbone_stage4_out_channels = backbone_stage4_out_channels + self.backbone_stage4_kernel_size = backbone_stage4_kernel_size + self.backbone_stage4_stride = backbone_stage4_stride + self.backbone_stage4_dilation = backbone_stage4_dilation + self.backbone_stage4_groups = backbone_stage4_groups + + self.out_features = out_features + self.out_indices = out_indices + + self.batch_size = batch_size + self.num_channels = num_channels + self.image_size = image_size + self.is_training = is_training + self.use_labels = use_labels + self.num_labels = num_labels + self.hidden_sizes = hidden_sizes + + self.num_stages = 5 + + def get_config(self): + return TextNetConfig( + backbone_kernel_size=self.backbone_kernel_size, + backbone_stride=self.backbone_stride, + backbone_dilation=self.backbone_dilation, + backbone_groups=self.backbone_groups, + backbone_bias=self.backbone_bias, + backbone_has_shuffle=self.backbone_has_shuffle, + backbone_in_channels=self.backbone_in_channels, + backbone_out_channels=self.backbone_out_channels, + backbone_use_bn=self.backbone_use_bn, + backbone_act_func=self.backbone_act_func, + backbone_dropout_rate=self.backbone_dropout_rate, + backbone_ops_order=self.backbone_ops_order, + backbone_stage1_in_channels=self.backbone_stage1_in_channels, + backbone_stage1_out_channels=self.backbone_stage1_out_channels, + backbone_stage1_kernel_size=self.backbone_stage1_kernel_size, + backbone_stage1_stride=self.backbone_stage1_stride, + backbone_stage1_dilation=self.backbone_stage1_dilation, + backbone_stage1_groups=self.backbone_stage1_groups, + backbone_stage2_in_channels=self.backbone_stage2_in_channels, + backbone_stage2_out_channels=self.backbone_stage2_out_channels, + backbone_stage2_kernel_size=self.backbone_stage2_kernel_size, + backbone_stage2_stride=self.backbone_stage2_stride, + backbone_stage2_dilation=self.backbone_stage2_dilation, + backbone_stage2_groups=self.backbone_stage2_groups, + backbone_stage3_in_channels=self.backbone_stage3_in_channels, + backbone_stage3_out_channels=self.backbone_stage3_out_channels, + backbone_stage3_kernel_size=self.backbone_stage3_kernel_size, + backbone_stage3_stride=self.backbone_stage3_stride, + backbone_stage3_dilation=self.backbone_stage3_dilation, + backbone_stage3_groups=self.backbone_stage3_groups, + backbone_stage4_in_channels=self.backbone_stage4_in_channels, + backbone_stage4_out_channels=self.backbone_stage4_out_channels, + backbone_stage4_kernel_size=self.backbone_stage4_kernel_size, + backbone_stage4_stride=self.backbone_stage4_stride, + backbone_stage4_dilation=self.backbone_stage4_dilation, + backbone_stage4_groups=self.backbone_stage4_groups, + out_features=self.out_features, + out_indices=self.out_indices, + 
hidden_sizes=self.hidden_sizes, + ) + + def create_and_check_model(self, config, pixel_values, labels): + model = TextNetModel(config=config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + self.parent.assertEqual( + result.last_hidden_state.shape, + (self.batch_size, self.hidden_sizes[-1], 2, 2), + ) + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + + labels = None + if self.use_labels: + labels = ids_tensor([self.batch_size], self.num_labels) + + config = self.get_config() + + return config, pixel_values, labels + + def create_and_check_backbone(self, config, pixel_values, labels): + model = TextNetBackbone(config=config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + + # verify feature maps + self.parent.assertEqual(len(result.feature_maps), len(config.out_features)) + self.parent.assertListEqual( + list(result.feature_maps[0].shape), [self.batch_size, self.backbone_stage1_out_channels[-1], 16, 16] + ) + + # verify channels + self.parent.assertEqual(len(model.channels), len(config.out_features)) + self.parent.assertListEqual(model.channels, config.hidden_sizes[1:]) + + # verify backbone works with out_features=None + config.out_features = None + model = TextNetBackbone(config=config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + + # verify feature maps + self.parent.assertEqual(len(result.feature_maps), 1) + self.parent.assertListEqual(list(result.feature_maps[0].shape), [self.batch_size, 512, 2, 2]) + + # verify channels + self.parent.assertEqual(len(model.channels), 1) + self.parent.assertListEqual(model.channels, [config.hidden_sizes[-1]]) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values, labels = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_torch +class TextNetModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = (TextNetModel, TextNetForImageClassification, TextNetBackbone) if is_torch_available() else () + + pipeline_model_mapping = ( + {"feature-extraction": TextNetModel, "image-classification": TextNetForImageClassification} + if is_torch_available() + else {} + ) + # fx_compatible = False + # test_pruning = False + # test_resize_embeddings = False + # test_head_masking = False + # has_attentions = False + + fx_compatible = False + test_pruning = False + test_resize_embeddings = False + test_head_masking = False + has_attentions = False + + def setUp(self): + self.model_tester = TextNetModelTester(self) + self.config_tester = ConfigTester(self, config_class=TextNetConfig, hidden_size=37) + + def test_config(self): + self.create_and_test_config_common_properties() + self.config_tester.create_and_test_config_to_json_string() + self.config_tester.create_and_test_config_to_json_file() + self.config_tester.create_and_test_config_from_and_save_pretrained() + self.config_tester.create_and_test_config_with_num_labels() + self.config_tester.check_config_can_be_init_without_params() + self.config_tester.check_config_arguments_init() + + def create_and_test_config_common_properties(self): + return + + @unittest.skip(reason="Bit does not output attentions") + def test_attention_outputs(self): + pass + + @unittest.skip(reason="Bit does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + @unittest.skip(reason="Bit does not 
support input and output embeddings") + def test_model_common_attributes(self): + pass + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["pixel_values"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_backbone(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_backbone(*config_and_inputs) + + def test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config=config) + for name, module in model.named_modules(): + if isinstance(module, (nn.BatchNorm2d, nn.GroupNorm)): + self.assertTrue( + torch.all(module.weight == 1), + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + self.assertTrue( + torch.all(module.bias == 0), + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + def test_hidden_states_output(self): + def check_hidden_states_output(inputs_dict, config, model_class): + model = model_class(config) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states + + expected_num_stages = self.model_tester.num_stages - 1 + self.assertEqual(len(hidden_states), expected_num_stages + 1) + + # Bit's feature maps are of shape (batch_size, num_channels, height, width) + self.assertListEqual( + list(hidden_states[0].shape[-2:]), + [self.model_tester.image_size // 2, self.model_tester.image_size // 2], + ) + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + layers_type = ["preactivation", "bottleneck"] + for model_class in self.all_model_classes: + for layer_type in layers_type: + config.layer_type = layer_type + inputs_dict["output_hidden_states"] = True + check_hidden_states_output(inputs_dict, config, model_class) + + # check that output_hidden_states also work using config + del inputs_dict["output_hidden_states"] + config.output_hidden_states = True + + check_hidden_states_output(inputs_dict, config, model_class) + + def test_model_is_small(self): + # Just a consistency check to make sure we are not running tests on 80M parameter models. + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + num_params = model.num_parameters() + assert ( + num_params < 3000000 + ), f"{model_class} is too big for the common tests ({num_params})! It should have 1M max." 
+ + @unittest.skip(reason="Bit does not use feedforward chunking") + def test_feed_forward_chunking(self): + pass + + # def test_for_image_classification(self): + # config_and_inputs = self.model_tester.prepare_config_and_inputs() + # self.model_tester.create_and_check_for_image_classification(*config_and_inputs) + + # @slow + # def test_model_from_pretrained(self): + # for model_name in BIT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + # model = BitModel.from_pretrained(model_name) + # self.assertIsNotNone(model) + + +@require_torch +class BitBackboneTest(BackboneTesterMixin, unittest.TestCase): + all_model_classes = (TextNetBackbone,) if is_torch_available() else () + config_class = TextNetConfig + + has_attentions = False + + def setUp(self): + self.model_tester = TextNetModelTester(self) From 12941e6aaad3acd281017e27649e58142b2c527b Mon Sep 17 00:00:00 2001 From: raghavanone Date: Wed, 8 Nov 2023 15:37:46 +0530 Subject: [PATCH 034/152] Fix failures --- src/transformers/models/textnet/image_processing_textnet.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/transformers/models/textnet/image_processing_textnet.py b/src/transformers/models/textnet/image_processing_textnet.py index 32975e13c7a8..0455e8199adf 100644 --- a/src/transformers/models/textnet/image_processing_textnet.py +++ b/src/transformers/models/textnet/image_processing_textnet.py @@ -72,8 +72,7 @@ class TextNetImageProcessor(BaseImageProcessor): the `preprocess` method. rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess` - method. - Whether to normalize the image. Can be overridden by `do_normalize` in the `preprocess` method. + method. Whether to normalize the image. Can be overridden by `do_normalize` in the `preprocess` method. do_normalize (`bool`, *optional*, defaults to `True`): image_mean (`float` or `List[float]`, *optional*, defaults to `OPENAI_CLIP_MEAN`): Mean to use if normalizing the image. 
This is a float or list of floats the length of the number of From 30568ef68e0745ea8db5cb83fc3e3b57b4fb52cf Mon Sep 17 00:00:00 2001 From: raghavanone Date: Wed, 8 Nov 2023 15:50:22 +0530 Subject: [PATCH 035/152] Refactor textnet model --- .../models/textnet/configuration_textnet.py | 152 ++++++------ .../models/textnet/modeling_textnet.py | 82 +++---- tests/models/textnet/test_modeling_textnet.py | 226 +++++++++--------- 3 files changed, 230 insertions(+), 230 deletions(-) diff --git a/src/transformers/models/textnet/configuration_textnet.py b/src/transformers/models/textnet/configuration_textnet.py index 9c7fe907aa13..6bcb961a0f97 100644 --- a/src/transformers/models/textnet/configuration_textnet.py +++ b/src/transformers/models/textnet/configuration_textnet.py @@ -34,42 +34,42 @@ class TextNetConfig(BackboneConfigMixin, PretrainedConfig): def __init__( self, - backbone_kernel_size=3, - backbone_stride=2, - backbone_dilation=1, - backbone_groups=1, - backbone_bias=False, - backbone_has_shuffle=False, - backbone_in_channels=3, - backbone_out_channels=64, - backbone_use_bn=True, - backbone_act_func="relu", - backbone_dropout_rate=0, - backbone_ops_order="weight_bn_act", - backbone_stage1_in_channels=[64, 64, 64], - backbone_stage1_out_channels=[64, 64, 64], - backbone_stage1_kernel_size=[[3, 3], [3, 3], [3, 3]], - backbone_stage1_stride=[1, 2, 1], - backbone_stage1_dilation=[1, 1, 1], - backbone_stage1_groups=[1, 1, 1], - backbone_stage2_in_channels=[64, 128, 128, 128], - backbone_stage2_out_channels=[128, 128, 128, 128], - backbone_stage2_kernel_size=[[3, 3], [1, 3], [3, 3], [3, 1]], - backbone_stage2_stride=[2, 1, 1, 1], - backbone_stage2_dilation=[1, 1, 1, 1], - backbone_stage2_groups=[1, 1, 1, 1], - backbone_stage3_in_channels=[128, 256, 256, 256], - backbone_stage3_out_channels=[256, 256, 256, 256], - backbone_stage3_kernel_size=[[3, 3], [3, 3], [3, 1], [1, 3]], - backbone_stage3_stride=[2, 1, 1, 1], - backbone_stage3_dilation=[1, 1, 1, 1], - backbone_stage3_groups=[1, 1, 1, 1], - backbone_stage4_in_channels=[256, 512, 512, 512], - backbone_stage4_out_channels=[512, 512, 512, 512], - backbone_stage4_kernel_size=[[3, 3], [3, 1], [1, 3], [3, 3]], - backbone_stage4_stride=[2, 1, 1, 1], - backbone_stage4_dilation=[1, 1, 1, 1], - backbone_stage4_groups=[1, 1, 1, 1], + kernel_size=3, + stride=2, + dilation=1, + groups=1, + bias=False, + has_shuffle=False, + in_channels=3, + out_channels=64, + use_bn=True, + act_func="relu", + dropout_rate=0, + ops_order="weight_bn_act", + stage1_in_channels=[64, 64, 64], + stage1_out_channels=[64, 64, 64], + stage1_kernel_size=[[3, 3], [3, 3], [3, 3]], + stage1_stride=[1, 2, 1], + stage1_dilation=[1, 1, 1], + stage1_groups=[1, 1, 1], + stage2_in_channels=[64, 128, 128, 128], + stage2_out_channels=[128, 128, 128, 128], + stage2_kernel_size=[[3, 3], [1, 3], [3, 3], [3, 1]], + stage2_stride=[2, 1, 1, 1], + stage2_dilation=[1, 1, 1, 1], + stage2_groups=[1, 1, 1, 1], + stage3_in_channels=[128, 256, 256, 256], + stage3_out_channels=[256, 256, 256, 256], + stage3_kernel_size=[[3, 3], [3, 3], [3, 1], [1, 3]], + stage3_stride=[2, 1, 1, 1], + stage3_dilation=[1, 1, 1, 1], + stage3_groups=[1, 1, 1, 1], + stage4_in_channels=[256, 512, 512, 512], + stage4_out_channels=[512, 512, 512, 512], + stage4_kernel_size=[[3, 3], [3, 1], [1, 3], [3, 3]], + stage4_stride=[2, 1, 1, 1], + stage4_dilation=[1, 1, 1, 1], + stage4_groups=[1, 1, 1, 1], hidden_sizes=[64, 64, 128, 256, 512], initializer_range=0.02, out_features=None, @@ -78,55 +78,55 @@ def __init__( ): 
super().__init__(**kwargs) - self.backbone_kernel_size = backbone_kernel_size - self.backbone_stride = backbone_stride - self.backbone_dilation = backbone_dilation - self.backbone_groups = backbone_groups - self.backbone_bias = backbone_bias - self.backbone_has_shuffle = backbone_has_shuffle - self.backbone_in_channels = backbone_in_channels - self.backbone_out_channels = backbone_out_channels - self.backbone_use_bn = backbone_use_bn - self.backbone_act_func = backbone_act_func - self.backbone_dropout_rate = backbone_dropout_rate - self.backbone_ops_order = backbone_ops_order + self.kernel_size = kernel_size + self.stride = stride + self.dilation = dilation + self.groups = groups + self.bias = bias + self.has_shuffle = has_shuffle + self.in_channels = in_channels + self.out_channels = out_channels + self.use_bn = use_bn + self.act_func = act_func + self.dropout_rate = dropout_rate + self.ops_order = ops_order - self.backbone_stage1_in_channels = backbone_stage1_in_channels - self.backbone_stage1_out_channels = backbone_stage1_out_channels - self.backbone_stage1_kernel_size = backbone_stage1_kernel_size - self.backbone_stage1_stride = backbone_stage1_stride - self.backbone_stage1_dilation = backbone_stage1_dilation - self.backbone_stage1_groups = backbone_stage1_groups + self.stage1_in_channels = stage1_in_channels + self.stage1_out_channels = stage1_out_channels + self.stage1_kernel_size = stage1_kernel_size + self.stage1_stride = stage1_stride + self.stage1_dilation = stage1_dilation + self.stage1_groups = stage1_groups - self.backbone_stage2_in_channels = backbone_stage2_in_channels - self.backbone_stage2_out_channels = backbone_stage2_out_channels - self.backbone_stage2_kernel_size = backbone_stage2_kernel_size - self.backbone_stage2_stride = backbone_stage2_stride - self.backbone_stage2_dilation = backbone_stage2_dilation - self.backbone_stage2_groups = backbone_stage2_groups + self.stage2_in_channels = stage2_in_channels + self.stage2_out_channels = stage2_out_channels + self.stage2_kernel_size = stage2_kernel_size + self.stage2_stride = stage2_stride + self.stage2_dilation = stage2_dilation + self.stage2_groups = stage2_groups - self.backbone_stage3_in_channels = backbone_stage3_in_channels - self.backbone_stage3_out_channels = backbone_stage3_out_channels - self.backbone_stage3_kernel_size = backbone_stage3_kernel_size - self.backbone_stage3_stride = backbone_stage3_stride - self.backbone_stage3_dilation = backbone_stage3_dilation - self.backbone_stage3_groups = backbone_stage3_groups + self.stage3_in_channels = stage3_in_channels + self.stage3_out_channels = stage3_out_channels + self.stage3_kernel_size = stage3_kernel_size + self.stage3_stride = stage3_stride + self.stage3_dilation = stage3_dilation + self.stage3_groups = stage3_groups - self.backbone_stage4_in_channels = backbone_stage4_in_channels - self.backbone_stage4_out_channels = backbone_stage4_out_channels - self.backbone_stage4_kernel_size = backbone_stage4_kernel_size - self.backbone_stage4_stride = backbone_stage4_stride - self.backbone_stage4_dilation = backbone_stage4_dilation - self.backbone_stage4_groups = backbone_stage4_groups + self.stage4_in_channels = stage4_in_channels + self.stage4_out_channels = stage4_out_channels + self.stage4_kernel_size = stage4_kernel_size + self.stage4_stride = stage4_stride + self.stage4_dilation = stage4_dilation + self.stage4_groups = stage4_groups self.initializer_range = initializer_range self.hidden_sizes = hidden_sizes self.depths = [ - len(self.backbone_stage1_out_channels), - 
len(self.backbone_stage2_out_channels), - len(self.backbone_stage3_out_channels), - len(self.backbone_stage4_out_channels), + len(self.stage1_out_channels), + len(self.stage2_out_channels), + len(self.stage3_out_channels), + len(self.stage4_out_channels), ] self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, 5)] self._out_features, self._out_indices = get_aligned_output_features_output_indices( diff --git a/src/transformers/models/textnet/modeling_textnet.py b/src/transformers/models/textnet/modeling_textnet.py index 119ee2c7418d..cc0b834f4262 100644 --- a/src/transformers/models/textnet/modeling_textnet.py +++ b/src/transformers/models/textnet/modeling_textnet.py @@ -363,63 +363,63 @@ class TextNetModel(TextNetPreTrainedModel): def __init__(self, config): super().__init__(config) self.first_conv = TextNetConvLayer( - config.backbone_in_channels, - config.backbone_out_channels, - config.backbone_kernel_size, - config.backbone_stride, - config.backbone_dilation, - config.backbone_groups, - config.backbone_bias, - config.backbone_has_shuffle, - config.backbone_use_bn, - config.backbone_act_func, - config.backbone_dropout_rate, - config.backbone_ops_order, + config.in_channels, + config.out_channels, + config.kernel_size, + config.stride, + config.dilation, + config.groups, + config.bias, + config.has_shuffle, + config.use_bn, + config.act_func, + config.dropout_rate, + config.ops_order, ) stage1 = [] for stage_config in zip( - config.backbone_stage1_in_channels, - config.backbone_stage1_out_channels, - config.backbone_stage1_kernel_size, - config.backbone_stage1_stride, - config.backbone_stage1_dilation, - config.backbone_stage1_groups, + config.stage1_in_channels, + config.stage1_out_channels, + config.stage1_kernel_size, + config.stage1_stride, + config.stage1_dilation, + config.stage1_groups, ): stage1.append(TestNetRepConvLayer(*stage_config)) self.stage1 = nn.ModuleList(stage1) stage2 = [] for stage_config in zip( - config.backbone_stage2_in_channels, - config.backbone_stage2_out_channels, - config.backbone_stage2_kernel_size, - config.backbone_stage2_stride, - config.backbone_stage2_dilation, - config.backbone_stage2_groups, + config.stage2_in_channels, + config.stage2_out_channels, + config.stage2_kernel_size, + config.stage2_stride, + config.stage2_dilation, + config.stage2_groups, ): stage2.append(TestNetRepConvLayer(*stage_config)) self.stage2 = nn.ModuleList(stage2) stage3 = [] for stage_config in zip( - config.backbone_stage3_in_channels, - config.backbone_stage3_out_channels, - config.backbone_stage3_kernel_size, - config.backbone_stage3_stride, - config.backbone_stage3_dilation, - config.backbone_stage3_groups, + config.stage3_in_channels, + config.stage3_out_channels, + config.stage3_kernel_size, + config.stage3_stride, + config.stage3_dilation, + config.stage3_groups, ): stage3.append(TestNetRepConvLayer(*stage_config)) self.stage3 = nn.ModuleList(stage3) stage4 = [] for stage_config in zip( - config.backbone_stage4_in_channels, - config.backbone_stage4_out_channels, - config.backbone_stage4_kernel_size, - config.backbone_stage4_stride, - config.backbone_stage4_dilation, - config.backbone_stage4_groups, + config.stage4_in_channels, + config.stage4_out_channels, + config.stage4_kernel_size, + config.stage4_stride, + config.stage4_dilation, + config.stage4_groups, ): stage4.append(TestNetRepConvLayer(*stage_config)) self.stage4 = nn.ModuleList(stage4) @@ -481,11 +481,11 @@ def __init__(self, config): self.textnet = TextNetModel(config) self.num_features = [ - 
config.backbone_out_channels, - config.backbone_stage1_out_channels[-1], - config.backbone_stage2_out_channels[-1], - config.backbone_stage3_out_channels[-1], - config.backbone_stage4_out_channels[-1], + config.out_channels, + config.stage1_out_channels[-1], + config.stage2_out_channels[-1], + config.stage3_out_channels[-1], + config.stage4_out_channels[-1], ] # initialize weights and apply final processing diff --git a/tests/models/textnet/test_modeling_textnet.py b/tests/models/textnet/test_modeling_textnet.py index d7ebe31f6021..bd18111f582c 100644 --- a/tests/models/textnet/test_modeling_textnet.py +++ b/tests/models/textnet/test_modeling_textnet.py @@ -47,42 +47,42 @@ class TextNetModelTester: def __init__( self, parent, - backbone_kernel_size=3, - backbone_stride=2, - backbone_dilation=1, - backbone_groups=1, - backbone_bias=False, - backbone_has_shuffle=False, - backbone_in_channels=3, - backbone_out_channels=64, - backbone_use_bn=True, - backbone_act_func="relu", - backbone_dropout_rate=0, - backbone_ops_order="weight_bn_act", - backbone_stage1_in_channels=[64], - backbone_stage1_out_channels=[64], - backbone_stage1_kernel_size=[[3, 3]], - backbone_stage1_stride=[1], - backbone_stage1_dilation=[1], - backbone_stage1_groups=[1], - backbone_stage2_in_channels=[64], - backbone_stage2_out_channels=[128], - backbone_stage2_kernel_size=[[3, 1]], - backbone_stage2_stride=[2], - backbone_stage2_dilation=[1], - backbone_stage2_groups=[1], - backbone_stage3_in_channels=[128], - backbone_stage3_out_channels=[256], - backbone_stage3_kernel_size=[[1, 3]], - backbone_stage3_stride=[2], - backbone_stage3_dilation=[1], - backbone_stage3_groups=[1], - backbone_stage4_in_channels=[256], - backbone_stage4_out_channels=[512], - backbone_stage4_kernel_size=[[3, 3]], - backbone_stage4_stride=[2], - backbone_stage4_dilation=[1], - backbone_stage4_groups=[1], + kernel_size=3, + stride=2, + dilation=1, + groups=1, + bias=False, + has_shuffle=False, + in_channels=3, + out_channels=64, + use_bn=True, + act_func="relu", + dropout_rate=0, + ops_order="weight_bn_act", + stage1_in_channels=[64], + stage1_out_channels=[64], + stage1_kernel_size=[[3, 3]], + stage1_stride=[1], + stage1_dilation=[1], + stage1_groups=[1], + stage2_in_channels=[64], + stage2_out_channels=[128], + stage2_kernel_size=[[3, 1]], + stage2_stride=[2], + stage2_dilation=[1], + stage2_groups=[1], + stage3_in_channels=[128], + stage3_out_channels=[256], + stage3_kernel_size=[[1, 3]], + stage3_stride=[2], + stage3_dilation=[1], + stage3_groups=[1], + stage4_in_channels=[256], + stage4_out_channels=[512], + stage4_kernel_size=[[3, 3]], + stage4_stride=[2], + stage4_dilation=[1], + stage4_groups=[1], out_features=["stage1", "stage2", "stage3", "stage4"], out_indices=[1, 2, 3, 4], batch_size=3, @@ -95,46 +95,46 @@ def __init__( hidden_sizes=[64, 64, 128, 256, 512], ): self.parent = parent - self.backbone_kernel_size = backbone_kernel_size - self.backbone_stride = backbone_stride - self.backbone_dilation = backbone_dilation - self.backbone_groups = backbone_groups - self.backbone_bias = backbone_bias - self.backbone_has_shuffle = backbone_has_shuffle - self.backbone_in_channels = backbone_in_channels - self.backbone_out_channels = backbone_out_channels - self.backbone_use_bn = backbone_use_bn - self.backbone_act_func = backbone_act_func - self.backbone_dropout_rate = backbone_dropout_rate - self.backbone_ops_order = backbone_ops_order - - self.backbone_stage1_in_channels = backbone_stage1_in_channels - self.backbone_stage1_out_channels = 
backbone_stage1_out_channels - self.backbone_stage1_kernel_size = backbone_stage1_kernel_size - self.backbone_stage1_stride = backbone_stage1_stride - self.backbone_stage1_dilation = backbone_stage1_dilation - self.backbone_stage1_groups = backbone_stage1_groups - - self.backbone_stage2_in_channels = backbone_stage2_in_channels - self.backbone_stage2_out_channels = backbone_stage2_out_channels - self.backbone_stage2_kernel_size = backbone_stage2_kernel_size - self.backbone_stage2_stride = backbone_stage2_stride - self.backbone_stage2_dilation = backbone_stage2_dilation - self.backbone_stage2_groups = backbone_stage2_groups - - self.backbone_stage3_in_channels = backbone_stage3_in_channels - self.backbone_stage3_out_channels = backbone_stage3_out_channels - self.backbone_stage3_kernel_size = backbone_stage3_kernel_size - self.backbone_stage3_stride = backbone_stage3_stride - self.backbone_stage3_dilation = backbone_stage3_dilation - self.backbone_stage3_groups = backbone_stage3_groups - - self.backbone_stage4_in_channels = backbone_stage4_in_channels - self.backbone_stage4_out_channels = backbone_stage4_out_channels - self.backbone_stage4_kernel_size = backbone_stage4_kernel_size - self.backbone_stage4_stride = backbone_stage4_stride - self.backbone_stage4_dilation = backbone_stage4_dilation - self.backbone_stage4_groups = backbone_stage4_groups + self.kernel_size = kernel_size + self.stride = stride + self.dilation = dilation + self.groups = groups + self.bias = bias + self.has_shuffle = has_shuffle + self.in_channels = in_channels + self.out_channels = out_channels + self.use_bn = use_bn + self.act_func = act_func + self.dropout_rate = dropout_rate + self.ops_order = ops_order + + self.stage1_in_channels = stage1_in_channels + self.stage1_out_channels = stage1_out_channels + self.stage1_kernel_size = stage1_kernel_size + self.stage1_stride = stage1_stride + self.stage1_dilation = stage1_dilation + self.stage1_groups = stage1_groups + + self.stage2_in_channels = stage2_in_channels + self.stage2_out_channels = stage2_out_channels + self.stage2_kernel_size = stage2_kernel_size + self.stage2_stride = stage2_stride + self.stage2_dilation = stage2_dilation + self.stage2_groups = stage2_groups + + self.stage3_in_channels = stage3_in_channels + self.stage3_out_channels = stage3_out_channels + self.stage3_kernel_size = stage3_kernel_size + self.stage3_stride = stage3_stride + self.stage3_dilation = stage3_dilation + self.stage3_groups = stage3_groups + + self.stage4_in_channels = stage4_in_channels + self.stage4_out_channels = stage4_out_channels + self.stage4_kernel_size = stage4_kernel_size + self.stage4_stride = stage4_stride + self.stage4_dilation = stage4_dilation + self.stage4_groups = stage4_groups self.out_features = out_features self.out_indices = out_indices @@ -151,42 +151,42 @@ def __init__( def get_config(self): return TextNetConfig( - backbone_kernel_size=self.backbone_kernel_size, - backbone_stride=self.backbone_stride, - backbone_dilation=self.backbone_dilation, - backbone_groups=self.backbone_groups, - backbone_bias=self.backbone_bias, - backbone_has_shuffle=self.backbone_has_shuffle, - backbone_in_channels=self.backbone_in_channels, - backbone_out_channels=self.backbone_out_channels, - backbone_use_bn=self.backbone_use_bn, - backbone_act_func=self.backbone_act_func, - backbone_dropout_rate=self.backbone_dropout_rate, - backbone_ops_order=self.backbone_ops_order, - backbone_stage1_in_channels=self.backbone_stage1_in_channels, - 
backbone_stage1_out_channels=self.backbone_stage1_out_channels, - backbone_stage1_kernel_size=self.backbone_stage1_kernel_size, - backbone_stage1_stride=self.backbone_stage1_stride, - backbone_stage1_dilation=self.backbone_stage1_dilation, - backbone_stage1_groups=self.backbone_stage1_groups, - backbone_stage2_in_channels=self.backbone_stage2_in_channels, - backbone_stage2_out_channels=self.backbone_stage2_out_channels, - backbone_stage2_kernel_size=self.backbone_stage2_kernel_size, - backbone_stage2_stride=self.backbone_stage2_stride, - backbone_stage2_dilation=self.backbone_stage2_dilation, - backbone_stage2_groups=self.backbone_stage2_groups, - backbone_stage3_in_channels=self.backbone_stage3_in_channels, - backbone_stage3_out_channels=self.backbone_stage3_out_channels, - backbone_stage3_kernel_size=self.backbone_stage3_kernel_size, - backbone_stage3_stride=self.backbone_stage3_stride, - backbone_stage3_dilation=self.backbone_stage3_dilation, - backbone_stage3_groups=self.backbone_stage3_groups, - backbone_stage4_in_channels=self.backbone_stage4_in_channels, - backbone_stage4_out_channels=self.backbone_stage4_out_channels, - backbone_stage4_kernel_size=self.backbone_stage4_kernel_size, - backbone_stage4_stride=self.backbone_stage4_stride, - backbone_stage4_dilation=self.backbone_stage4_dilation, - backbone_stage4_groups=self.backbone_stage4_groups, + kernel_size=self.kernel_size, + stride=self.stride, + dilation=self.dilation, + groups=self.groups, + bias=self.bias, + has_shuffle=self.has_shuffle, + in_channels=self.in_channels, + out_channels=self.out_channels, + use_bn=self.use_bn, + act_func=self.act_func, + dropout_rate=self.dropout_rate, + ops_order=self.ops_order, + stage1_in_channels=self.stage1_in_channels, + stage1_out_channels=self.stage1_out_channels, + stage1_kernel_size=self.stage1_kernel_size, + stage1_stride=self.stage1_stride, + stage1_dilation=self.stage1_dilation, + stage1_groups=self.stage1_groups, + stage2_in_channels=self.stage2_in_channels, + stage2_out_channels=self.stage2_out_channels, + stage2_kernel_size=self.stage2_kernel_size, + stage2_stride=self.stage2_stride, + stage2_dilation=self.stage2_dilation, + stage2_groups=self.stage2_groups, + stage3_in_channels=self.stage3_in_channels, + stage3_out_channels=self.stage3_out_channels, + stage3_kernel_size=self.stage3_kernel_size, + stage3_stride=self.stage3_stride, + stage3_dilation=self.stage3_dilation, + stage3_groups=self.stage3_groups, + stage4_in_channels=self.stage4_in_channels, + stage4_out_channels=self.stage4_out_channels, + stage4_kernel_size=self.stage4_kernel_size, + stage4_stride=self.stage4_stride, + stage4_dilation=self.stage4_dilation, + stage4_groups=self.stage4_groups, out_features=self.out_features, out_indices=self.out_indices, hidden_sizes=self.hidden_sizes, @@ -222,7 +222,7 @@ def create_and_check_backbone(self, config, pixel_values, labels): # verify feature maps self.parent.assertEqual(len(result.feature_maps), len(config.out_features)) self.parent.assertListEqual( - list(result.feature_maps[0].shape), [self.batch_size, self.backbone_stage1_out_channels[-1], 16, 16] + list(result.feature_maps[0].shape), [self.batch_size, self.stage1_out_channels[-1], 16, 16] ) # verify channels From 1f99e8485fe177ad0523b69d9e51ecd869cc10cf Mon Sep 17 00:00:00 2001 From: raghavanone Date: Wed, 8 Nov 2023 16:08:53 +0530 Subject: [PATCH 036/152] Fix failures --- src/transformers/__init__.py | 3 +++ src/transformers/models/__init__.py | 1 + src/transformers/models/textnet/__init__.py | 4 ++-- 
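With the `backbone_` prefix dropped from the TextNet arguments above, a configuration is now built with the bare names. A minimal sketch mirroring the tester defaults shown in this patch (all values illustrative, construction only):

```python
from transformers import TextNetConfig, TextNetModel

# Same fields as before, minus the backbone_ prefix; values mirror the tester defaults.
config = TextNetConfig(
    kernel_size=3,
    stride=2,
    in_channels=3,
    out_channels=64,
    act_func="relu",
    ops_order="weight_bn_act",
    out_features=["stage1", "stage2", "stage3", "stage4"],
    out_indices=[1, 2, 3, 4],
    hidden_sizes=[64, 64, 128, 256, 512],
)
model = TextNetModel(config)  # stage-level lists are left at their defaults here
```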
src/transformers/utils/dummy_pt_objects.py | 7 +++++++ utils/check_repo.py | 1 + 5 files changed, 14 insertions(+), 2 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 90270a9e406f..e3110a1d72ba 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -3305,6 +3305,8 @@ "TEXTNET_PRETRAINED_CONFIG_ARCHIVE_MAP", "TextNetBackbone", "TextNetModel", + "TextNetForImageClassification", + "TextNetPreTrainedModel" ] ) _import_structure["models.time_series_transformer"].extend( @@ -7664,6 +7666,7 @@ TextNetBackbone, TextNetModel, TextNetPreTrainedModel, + TextNetForImageClassification, ) from .models.time_series_transformer import ( TIME_SERIES_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST, diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 2c20873c2ed7..8ca5c9ae27ba 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -206,6 +206,7 @@ swinv2, switch_transformers, t5, + textnet, table_transformer, tapas, time_series_transformer, diff --git a/src/transformers/models/textnet/__init__.py b/src/transformers/models/textnet/__init__.py index 21e26f387817..9a4832c7caa9 100644 --- a/src/transformers/models/textnet/__init__.py +++ b/src/transformers/models/textnet/__init__.py @@ -20,7 +20,6 @@ is_torch_available, ) - _import_structure = { "configuration_textnet": ["TEXTNET_PRETRAINED_CONFIG_ARCHIVE_MAP", "TextNetConfig"], "image_processing_textnet": ["TextNetImageProcessor"], @@ -32,7 +31,8 @@ except OptionalDependencyNotAvailable: pass else: - _import_structure["modeling_textnet"] = ["TextNetBackbone", "TextNetModel", "TextNetPreTrainedModel"] + _import_structure["modeling_textnet"] = ["TextNetBackbone", "TextNetModel", "TextNetPreTrainedModel", + "TextNetForImageClassification"] if TYPE_CHECKING: from .configuration_textnet import TEXTNET_PRETRAINED_CONFIG_ARCHIVE_MAP, TextNetConfig diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 095c9a6f4189..70047baa2931 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -8047,6 +8047,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +class TextNetForImageClassification(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + TIME_SERIES_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = None diff --git a/utils/check_repo.py b/utils/check_repo.py index 3af3a05a8aa6..66f9d7f2b757 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -993,6 +993,7 @@ def find_all_documented_objects() -> List[str]: "ResNetBackbone", "SwinBackbone", "Swinv2Backbone", + "TextNetBackbone", "TimmBackbone", "TimmBackboneConfig", "VitDetBackbone", From 02e85ed1f2527370d3dec313d3e7765a6355e493 Mon Sep 17 00:00:00 2001 From: raghavanone Date: Wed, 8 Nov 2023 17:38:14 +0530 Subject: [PATCH 037/152] Add cv2 to setup --- setup.py | 1 + src/transformers/__init__.py | 6 +++--- src/transformers/models/__init__.py | 2 +- src/transformers/models/textnet/__init__.py | 9 +++++++-- src/transformers/utils/dummy_pt_objects.py | 6 +++--- 5 files changed, 15 insertions(+), 9 deletions(-) diff --git a/setup.py b/setup.py index 65b84fe938f7..cde79041adb3 100644 --- a/setup.py +++ b/setup.py @@ -296,6 +296,7 @@ def run(self): extras["natten"] = deps_list("natten") extras["codecarbon"] = deps_list("codecarbon") extras["video"] = deps_list("decord", "av") 
+extras["opencv-python"] = deps_list("opencv-python") extras["sentencepiece"] = deps_list("sentencepiece", "protobuf") extras["testing"] = ( diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index e3110a1d72ba..7cfffec8463b 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -3304,9 +3304,9 @@ [ "TEXTNET_PRETRAINED_CONFIG_ARCHIVE_MAP", "TextNetBackbone", - "TextNetModel", "TextNetForImageClassification", - "TextNetPreTrainedModel" + "TextNetModel", + "TextNetPreTrainedModel", ] ) _import_structure["models.time_series_transformer"].extend( @@ -7664,9 +7664,9 @@ from .models.textnet import ( TEXTNET_PRETRAINED_CONFIG_ARCHIVE_MAP, TextNetBackbone, + TextNetForImageClassification, TextNetModel, TextNetPreTrainedModel, - TextNetForImageClassification, ) from .models.time_series_transformer import ( TIME_SERIES_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST, diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 8ca5c9ae27ba..b63e845c7060 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -206,9 +206,9 @@ swinv2, switch_transformers, t5, - textnet, table_transformer, tapas, + textnet, time_series_transformer, timesformer, timm_backbone, diff --git a/src/transformers/models/textnet/__init__.py b/src/transformers/models/textnet/__init__.py index 9a4832c7caa9..cd8c1fa3276b 100644 --- a/src/transformers/models/textnet/__init__.py +++ b/src/transformers/models/textnet/__init__.py @@ -20,6 +20,7 @@ is_torch_available, ) + _import_structure = { "configuration_textnet": ["TEXTNET_PRETRAINED_CONFIG_ARCHIVE_MAP", "TextNetConfig"], "image_processing_textnet": ["TextNetImageProcessor"], @@ -31,8 +32,12 @@ except OptionalDependencyNotAvailable: pass else: - _import_structure["modeling_textnet"] = ["TextNetBackbone", "TextNetModel", "TextNetPreTrainedModel", - "TextNetForImageClassification"] + _import_structure["modeling_textnet"] = [ + "TextNetBackbone", + "TextNetModel", + "TextNetPreTrainedModel", + "TextNetForImageClassification", + ] if TYPE_CHECKING: from .configuration_textnet import TEXTNET_PRETRAINED_CONFIG_ARCHIVE_MAP, TextNetConfig diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 70047baa2931..f45bbda68f5f 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -8033,21 +8033,21 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class TextNetModel(metaclass=DummyObject): +class TextNetForImageClassification(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class TextNetPreTrainedModel(metaclass=DummyObject): +class TextNetModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class TextNetForImageClassification(metaclass=DummyObject): +class TextNetPreTrainedModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): From 632ef069bb0f99d0a122f0e075a6dad3d2fc9ac7 Mon Sep 17 00:00:00 2001 From: raghavanone Date: Wed, 8 Nov 2023 17:56:16 +0530 Subject: [PATCH 038/152] Fix failures --- src/transformers/models/textnet/__init__.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/textnet/__init__.py b/src/transformers/models/textnet/__init__.py index cd8c1fa3276b..6ac78b0bce02 100644 --- a/src/transformers/models/textnet/__init__.py +++ 
b/src/transformers/models/textnet/__init__.py @@ -49,7 +49,12 @@ except OptionalDependencyNotAvailable: pass else: - from .modeling_textnet import TextNetBackbone, TextNetModel, TextNetPreTrainedModel + from .modeling_textnet import ( + TextNetBackbone, + TextNetForImageClassification, + TextNetModel, + TextNetPreTrainedModel, + ) else: From 8c25e477cd96e8d48251377e76a234f7c7a9c699 Mon Sep 17 00:00:00 2001 From: raghavanone Date: Wed, 8 Nov 2023 18:05:44 +0530 Subject: [PATCH 039/152] Fix failures --- .../models/textnet/modeling_textnet.py | 26 +++++++++---------- tests/models/textnet/test_modeling_textnet.py | 3 +-- 2 files changed, 14 insertions(+), 15 deletions(-) diff --git a/src/transformers/models/textnet/modeling_textnet.py b/src/transformers/models/textnet/modeling_textnet.py index cc0b834f4262..72950f0776ec 100644 --- a/src/transformers/models/textnet/modeling_textnet.py +++ b/src/transformers/models/textnet/modeling_textnet.py @@ -502,19 +502,19 @@ def forward( Examples: ```python - >>> from transformers import AutoImageProcessor, AutoBackbone - >>> import torch - >>> from PIL import Image - >>> import requests - - >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" - >>> image = Image.open(requests.get(url, stream=True).raw) - - >>> processor = AutoImageProcessor.from_pretrained("google/resnetnv2-50") - >>> model = AutoBackbone.from_pretrained("google/resnetnv2-50") - - >>> inputs = processor(image, return_tensors="pt") - >>> outputs = model(**inputs) + # >>> from transformers import AutoImageProcessor, AutoBackbone + # >>> import torch + # >>> from PIL import Image + # >>> import requests + # + # >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + # >>> image = Image.open(requests.get(url, stream=True).raw) + # + # >>> processor = AutoImageProcessor.from_pretrained("google/resnetnv2-50") + # >>> model = AutoBackbone.from_pretrained("google/resnetnv2-50") + # + # >>> inputs = processor(image, return_tensors="pt") + # >>> outputs = model(**inputs) ```""" return_dict = return_dict if return_dict is not None else self.config.use_return_dict output_hidden_states = ( diff --git a/tests/models/textnet/test_modeling_textnet.py b/tests/models/textnet/test_modeling_textnet.py index bd18111f582c..a73833f20a16 100644 --- a/tests/models/textnet/test_modeling_textnet.py +++ b/tests/models/textnet/test_modeling_textnet.py @@ -16,8 +16,6 @@ import inspect import unittest -import torch.nn as nn - from transformers import ( TextNetBackbone, TextNetConfig, @@ -37,6 +35,7 @@ if is_torch_available(): import torch + import torch.nn as nn from transformers import ( TextNetModel, From 1537643c48265ebe0c4336661e6ab36beec9cf82 Mon Sep 17 00:00:00 2001 From: raghavanone Date: Wed, 8 Nov 2023 18:14:28 +0530 Subject: [PATCH 040/152] Add CV2 dependency --- setup.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/setup.py b/setup.py index cde79041adb3..cc354db415ef 100644 --- a/setup.py +++ b/setup.py @@ -290,13 +290,12 @@ def run(self): extras["torch-speech"] = deps_list("torchaudio") + extras["audio"] extras["tf-speech"] = extras["audio"] extras["flax-speech"] = extras["audio"] -extras["vision"] = deps_list("Pillow") +extras["vision"] = deps_list("Pillow", "opencv-python") extras["timm"] = deps_list("timm") extras["torch-vision"] = deps_list("torchvision") + extras["vision"] extras["natten"] = deps_list("natten") extras["codecarbon"] = deps_list("codecarbon") extras["video"] = deps_list("decord", "av") -extras["opencv-python"] = deps_list("opencv-python") 
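Since the doctest in `modeling_textnet.py` is commented out above while its checkpoint names are placeholders, a checkpoint-free sketch of the backbone path can stand in. The one-block-per-stage layout is borrowed from the model tester, and the 224x224 input size is an arbitrary choice:

```python
import torch

from transformers import TextNetBackbone, TextNetConfig

# Compact one-block-per-stage layout taken from the model tester; input size is arbitrary.
config = TextNetConfig(
    stage1_in_channels=[64], stage1_out_channels=[64], stage1_kernel_size=[[3, 3]], stage1_stride=[1],
    stage2_in_channels=[64], stage2_out_channels=[128], stage2_kernel_size=[[3, 1]], stage2_stride=[2],
    stage3_in_channels=[128], stage3_out_channels=[256], stage3_kernel_size=[[1, 3]], stage3_stride=[2],
    stage4_in_channels=[256], stage4_out_channels=[512], stage4_kernel_size=[[3, 3]], stage4_stride=[2],
    out_features=["stage1", "stage2", "stage3", "stage4"],
    out_indices=[1, 2, 3, 4],
)
backbone = TextNetBackbone(config)

with torch.no_grad():
    outputs = backbone(torch.randn(1, 3, 224, 224))
print([tuple(fm.shape) for fm in outputs.feature_maps])  # one feature map per requested stage
```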
extras["sentencepiece"] = deps_list("sentencepiece", "protobuf") extras["testing"] = ( From 9718ca1cdcfb4f1fbec9ff19beebccc757ed7d35 Mon Sep 17 00:00:00 2001 From: raghavanone Date: Wed, 8 Nov 2023 18:32:42 +0530 Subject: [PATCH 041/152] Fix bugs --- src/transformers/models/fast/configuration_fast.py | 4 ---- .../models/fast/image_processing_fast.py | 12 +++++++----- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/src/transformers/models/fast/configuration_fast.py b/src/transformers/models/fast/configuration_fast.py index 186b398a4745..3d499d756c5e 100644 --- a/src/transformers/models/fast/configuration_fast.py +++ b/src/transformers/models/fast/configuration_fast.py @@ -95,8 +95,6 @@ def __init__( head_final_act_func=None, head_final_dropout_rate=0, head_final_ops_order="weight", - min_area=250, - bbox_type="rect", loss_bg=False, initializer_range=0.02, **kwargs, @@ -174,7 +172,5 @@ def __init__( self.head_final_dropout_rate = head_final_dropout_rate self.head_final_ops_order = head_final_ops_order - self.min_area = min_area - self.bbox_type = bbox_type self.loss_bg = loss_bg self.initializer_range = initializer_range diff --git a/src/transformers/models/fast/image_processing_fast.py b/src/transformers/models/fast/image_processing_fast.py index 8aeb1e6f0334..ae4505d4fffa 100644 --- a/src/transformers/models/fast/image_processing_fast.py +++ b/src/transformers/models/fast/image_processing_fast.py @@ -390,7 +390,7 @@ def _max_pooling(self, x, scale=1): ) return x - def post_process_text_detection(self, output, target_sizes, threshold): + def post_process_text_detection(self, output, target_sizes, threshold, bbox_type="rect"): scale = 2 img_size = (self.size["height"], self.size["width"]) out = output["hidden_states"] @@ -429,13 +429,15 @@ def post_process_text_detection(self, output, target_sizes, threshold): org_img_size = target_sizes[i] scales = (float(org_img_size[1]) / float(img_size[1]), float(org_img_size[0]) / float(img_size[0])) - bboxes, scores = self.generate_bbox(keys[i], labels[i], score_maps[i], scales, threshold) + bboxes, scores = self.generate_bbox( + keys[i], labels[i], score_maps[i], scales, threshold, bbox_type=bbox_type + ) results.append({"bboxes": bboxes, "scores": scores}) final_results.update({"results": results}) return results - def generate_bbox(self, keys, label, score, scales, threshold): + def generate_bbox(self, keys, label, score, scales, threshold, bbox_type): label_num = len(keys) bboxes = [] scores = [] @@ -452,13 +454,13 @@ def generate_bbox(self, keys, label, score, scales, threshold): label[ind] = 0 continue - if self.bbox_type == "rect": + if bbox_type == "rect": rect = cv2.minAreaRect(points[:, ::-1]) alpha = math.sqrt(math.sqrt(points.shape[0] / (rect[1][0] * rect[1][1]))) rect = (rect[0], (rect[1][0] * alpha, rect[1][1] * alpha), rect[2]) bbox = cv2.boxPoints(rect) * scales - elif self.bbox_type == "poly": + elif bbox_type == "poly": binary = np.zeros(label.shape, dtype="uint8") binary[ind_np] = 1 contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) From 26c7542a844ca3d405c4dc870d18f79fd9c69d4d Mon Sep 17 00:00:00 2001 From: raghavanone Date: Wed, 8 Nov 2023 19:07:00 +0530 Subject: [PATCH 042/152] Fix build issue --- tests/models/textnet/test_modeling_textnet.py | 11 +++++------ utils/check_docstrings.py | 1 + 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/models/textnet/test_modeling_textnet.py b/tests/models/textnet/test_modeling_textnet.py index a73833f20a16..6fb31172adc8 100644 
--- a/tests/models/textnet/test_modeling_textnet.py +++ b/tests/models/textnet/test_modeling_textnet.py @@ -16,16 +16,11 @@ import inspect import unittest -from transformers import ( - TextNetBackbone, - TextNetConfig, - is_torch_available, -) -from transformers.models.textnet.modeling_textnet import TextNetForImageClassification from transformers.testing_utils import ( require_torch, torch_device, ) +from transformers.utils import is_torch_available from ...test_backbone_common import BackboneTesterMixin from ...test_configuration_common import ConfigTester @@ -38,7 +33,11 @@ import torch.nn as nn from transformers import ( + TextNetBackbone, + TextNetConfig, + TextNetForImageClassification, TextNetModel, + is_torch_available, ) diff --git a/utils/check_docstrings.py b/utils/check_docstrings.py index 3c4663103979..a867e46ce64e 100644 --- a/utils/check_docstrings.py +++ b/utils/check_docstrings.py @@ -487,6 +487,7 @@ "TapasConfig", "TapasModel", "TapasTokenizer", + "TextNetImageProcessor", "Text2TextGenerationPipeline", "TextClassificationPipeline", "TextGenerationPipeline", From ed85312ccede591ee03eecc9f507463489928df6 Mon Sep 17 00:00:00 2001 From: raghavanone Date: Wed, 8 Nov 2023 19:37:44 +0530 Subject: [PATCH 043/152] Fix failures --- src/transformers/models/fast/modeling_fast.py | 4 ++-- tests/models/fast/test_image_processing_fast.py | 7 ++++--- tests/models/fast/test_modeling_fast.py | 4 ++-- tests/models/textnet/test_modeling_textnet.py | 2 +- 4 files changed, 9 insertions(+), 8 deletions(-) diff --git a/src/transformers/models/fast/modeling_fast.py b/src/transformers/models/fast/modeling_fast.py index cfd3506de0fc..b7f5f45f1f00 100644 --- a/src/transformers/models/fast/modeling_fast.py +++ b/src/transformers/models/fast/modeling_fast.py @@ -804,8 +804,8 @@ def forward( >>> # forward pass >>> outputs = model(pixel_values=inputs["pixel_values"]) >>> target_sizes = [(image.shape[1], image.shape[2]) for image in inputs["pixel_values"]] - >>> threshold = 0.88 - >>> text_locations = processor.post_process_text_detection(outputs, target_sizes, threshold) + >>> threshold = 0.85 + >>> text_locations = processor.post_process_text_detection(outputs, target_sizes, threshold, bbox_type="poly") >>> print(text_locations[0]["bboxes"][0][:10]) [484, 175, 484, 178, 483, 179, 452, 179, 452, 182] ``` diff --git a/tests/models/fast/test_image_processing_fast.py b/tests/models/fast/test_image_processing_fast.py index f8192856849b..667ce191d43a 100644 --- a/tests/models/fast/test_image_processing_fast.py +++ b/tests/models/fast/test_image_processing_fast.py @@ -18,7 +18,7 @@ import requests -from transformers.testing_utils import require_torch, require_vision +from transformers.testing_utils import require_torch, require_vision, slow from transformers.utils import is_torch_available, is_vision_available from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs @@ -137,6 +137,7 @@ def test_image_processor_from_dict_with_kwargs(self): self.assertEqual(image_processor.size, {"height": 42, "width": 42}) self.assertEqual(image_processor.crop_size, {"height": 84, "width": 84}) + @slow def test_post_process_text_detection(self): model = FastForSceneTextRecognition.from_pretrained("Raghavan/fast_base_tt_800_finetune_ic17mlt") @@ -152,8 +153,8 @@ def prepare_image(): output = model(pixel_values=torch.tensor(inputs["pixel_values"])) target_sizes = [(image.shape[1], image.shape[2]) for image in inputs["pixel_values"]] - threshold = 0.88 - final_out = 
image_processor.post_process_text_detection(output, target_sizes, threshold) + threshold = 0.85 + final_out = image_processor.post_process_text_detection(output, target_sizes, threshold, bbox_type="poly") assert len(final_out[0]["bboxes"]) == 2 assert len(final_out[0]["bboxes"][0]) == 716 diff --git a/tests/models/fast/test_modeling_fast.py b/tests/models/fast/test_modeling_fast.py index f97481436676..44168b853961 100644 --- a/tests/models/fast/test_modeling_fast.py +++ b/tests/models/fast/test_modeling_fast.py @@ -425,8 +425,8 @@ def prepare_image(): output = model(pixel_values=torch.tensor(input["pixel_values"])) target_sizes = [(image.shape[1], image.shape[2]) for image in input["pixel_values"]] - threshold = 0.88 - final_out = image_processor.post_process_text_detection(output, target_sizes, threshold) + threshold = 0.85 + final_out = image_processor.post_process_text_detection(output, target_sizes, threshold, bbox_type="poly") assert final_out[0]["bboxes"][0][:10] == [484, 175, 484, 178, 483, 179, 452, 179, 452, 182] assert round(float(final_out[0]["scores"][0]), 5) == 0.92356 diff --git a/tests/models/textnet/test_modeling_textnet.py b/tests/models/textnet/test_modeling_textnet.py index 6fb31172adc8..01337be50b7c 100644 --- a/tests/models/textnet/test_modeling_textnet.py +++ b/tests/models/textnet/test_modeling_textnet.py @@ -395,7 +395,7 @@ def test_feed_forward_chunking(self): @require_torch -class BitBackboneTest(BackboneTesterMixin, unittest.TestCase): +class TextNetBackboneTest(BackboneTesterMixin, unittest.TestCase): all_model_classes = (TextNetBackbone,) if is_torch_available() else () config_class = TextNetConfig From 3f8be4dd7d9bc8cb8be74d01a0b299bdd6f9fce2 Mon Sep 17 00:00:00 2001 From: raghavanone Date: Thu, 9 Nov 2023 06:20:03 +0530 Subject: [PATCH 044/152] Remove textnet from modeling fast --- .../models/fast/configuration_fast.py | 119 +++++++----------- .../fast/convert_fast_original_to_pytorch.py | 87 +++++++------ src/transformers/models/fast/modeling_fast.py | 118 ++++------------- .../models/textnet/configuration_textnet.py | 1 + tests/models/fast/test_modeling_fast.py | 85 +++++++------ tests/models/textnet/test_modeling_textnet.py | 2 +- 6 files changed, 164 insertions(+), 248 deletions(-) diff --git a/src/transformers/models/fast/configuration_fast.py b/src/transformers/models/fast/configuration_fast.py index 3d499d756c5e..0ed87373049e 100644 --- a/src/transformers/models/fast/configuration_fast.py +++ b/src/transformers/models/fast/configuration_fast.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
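With the post-processing signature settled above (threshold 0.85, `bbox_type="poly"`), end-to-end inference looks roughly like the sketch below; the image URL is just a convenient test picture, and it is assumed the converted checkpoint also ships its image-processor configuration:

```python
import requests
import torch
from PIL import Image

from transformers import FastForSceneTextRecognition
from transformers.models.fast.image_processing_fast import FastImageProcessor

checkpoint = "Raghavan/fast_base_tt_800_finetune_ic17mlt"
image_processor = FastImageProcessor.from_pretrained(checkpoint)
model = FastForSceneTextRecognition.from_pretrained(checkpoint)

url = "http://images.cocodataset.org/val2017/000000039769.jpg"  # any test image works
image = Image.open(requests.get(url, stream=True).raw)

inputs = image_processor(image, return_tensors="pt")
with torch.no_grad():
    outputs = model(pixel_values=inputs["pixel_values"])

# target sizes are (height, width) of the preprocessed images, as in the updated docstring
target_sizes = [(img.shape[1], img.shape[2]) for img in inputs["pixel_values"]]
results = image_processor.post_process_text_detection(outputs, target_sizes, threshold=0.85, bbox_type="poly")
print(results[0]["bboxes"][0][:10], results[0]["scores"][0])
```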
""" Fast model configuration""" -from transformers import PretrainedConfig +from transformers import CONFIG_MAPPING, PretrainedConfig from transformers.utils import logging @@ -33,42 +33,9 @@ class FastConfig(PretrainedConfig): def __init__( self, - backbone_kernel_size=3, - backbone_stride=2, - backbone_dilation=1, - backbone_groups=1, - backbone_bias=False, - backbone_has_shuffle=False, - backbone_in_channels=3, - backbone_out_channels=64, - backbone_use_bn=True, - backbone_act_func="relu", - backbone_dropout_rate=0, - backbone_ops_order="weight_bn_act", - backbone_stage1_in_channels=[64, 64, 64], - backbone_stage1_out_channels=[64, 64, 64], - backbone_stage1_kernel_size=[[3, 3], [3, 3], [3, 3]], - backbone_stage1_stride=[1, 2, 1], - backbone_stage1_dilation=[1, 1, 1], - backbone_stage1_groups=[1, 1, 1], - backbone_stage2_in_channels=[64, 128, 128, 128], - backbone_stage2_out_channels=[128, 128, 128, 128], - backbone_stage2_kernel_size=[[3, 3], [1, 3], [3, 3], [3, 1]], - backbone_stage2_stride=[2, 1, 1, 1], - backbone_stage2_dilation=[1, 1, 1, 1], - backbone_stage2_groups=[1, 1, 1, 1], - backbone_stage3_in_channels=[128, 256, 256, 256], - backbone_stage3_out_channels=[256, 256, 256, 256], - backbone_stage3_kernel_size=[[3, 3], [3, 3], [3, 1], [1, 3]], - backbone_stage3_stride=[2, 1, 1, 1], - backbone_stage3_dilation=[1, 1, 1, 1], - backbone_stage3_groups=[1, 1, 1, 1], - backbone_stage4_in_channels=[256, 512, 512, 512], - backbone_stage4_out_channels=[512, 512, 512, 512], - backbone_stage4_kernel_size=[[3, 3], [3, 1], [1, 3], [3, 3]], - backbone_stage4_stride=[2, 1, 1, 1], - backbone_stage4_dilation=[1, 1, 1, 1], - backbone_stage4_groups=[1, 1, 1, 1], + use_timm_backbone=True, + backbone_config=None, + num_channels=3, neck_in_channels=[64, 128, 256, 512], neck_out_channels=[128, 128, 128, 128], neck_kernel_size=[[3, 3], [3, 3], [3, 3], [3, 3]], @@ -96,51 +63,33 @@ def __init__( head_final_dropout_rate=0, head_final_ops_order="weight", loss_bg=False, + backbone="resnet50", + use_pretrained_backbone=True, + dilation=False, initializer_range=0.02, **kwargs, ): super().__init__(**kwargs) - self.backbone_kernel_size = backbone_kernel_size - self.backbone_stride = backbone_stride - self.backbone_dilation = backbone_dilation - self.backbone_groups = backbone_groups - self.backbone_bias = backbone_bias - self.backbone_has_shuffle = backbone_has_shuffle - self.backbone_in_channels = backbone_in_channels - self.backbone_out_channels = backbone_out_channels - self.backbone_use_bn = backbone_use_bn - self.backbone_act_func = backbone_act_func - self.backbone_dropout_rate = backbone_dropout_rate - self.backbone_ops_order = backbone_ops_order - - self.backbone_stage1_in_channels = backbone_stage1_in_channels - self.backbone_stage1_out_channels = backbone_stage1_out_channels - self.backbone_stage1_kernel_size = backbone_stage1_kernel_size - self.backbone_stage1_stride = backbone_stage1_stride - self.backbone_stage1_dilation = backbone_stage1_dilation - self.backbone_stage1_groups = backbone_stage1_groups - - self.backbone_stage2_in_channels = backbone_stage2_in_channels - self.backbone_stage2_out_channels = backbone_stage2_out_channels - self.backbone_stage2_kernel_size = backbone_stage2_kernel_size - self.backbone_stage2_stride = backbone_stage2_stride - self.backbone_stage2_dilation = backbone_stage2_dilation - self.backbone_stage2_groups = backbone_stage2_groups - - self.backbone_stage3_in_channels = backbone_stage3_in_channels - self.backbone_stage3_out_channels = backbone_stage3_out_channels - 
self.backbone_stage3_kernel_size = backbone_stage3_kernel_size - self.backbone_stage3_stride = backbone_stage3_stride - self.backbone_stage3_dilation = backbone_stage3_dilation - self.backbone_stage3_groups = backbone_stage3_groups - - self.backbone_stage4_in_channels = backbone_stage4_in_channels - self.backbone_stage4_out_channels = backbone_stage4_out_channels - self.backbone_stage4_kernel_size = backbone_stage4_kernel_size - self.backbone_stage4_stride = backbone_stage4_stride - self.backbone_stage4_dilation = backbone_stage4_dilation - self.backbone_stage4_groups = backbone_stage4_groups + if backbone_config is not None and use_timm_backbone: + raise ValueError("You can't specify both `backbone_config` and `use_timm_backbone`.") + + if not use_timm_backbone: + if backbone_config is None: + logger.info( + "`backbone_config` is `None`. Initializing the config with the default `TextNet` backbone." + ) + backbone_config = CONFIG_MAPPING["textnet"](out_features=["stage1", "stage2", "stage3", "stage4"]) + elif isinstance(backbone_config, dict): + backbone_model_type = backbone_config.get("model_type") + config_class = CONFIG_MAPPING[backbone_model_type] + backbone_config = config_class.from_dict(backbone_config) + # set timm attributes to None + dilation, backbone, use_pretrained_backbone = None, None, None + + self.use_timm_backbone = use_timm_backbone + self.backbone_config = backbone_config + self.num_channels = num_channels self.neck_in_channels = neck_in_channels self.neck_out_channels = neck_out_channels @@ -173,4 +122,20 @@ def __init__( self.head_final_ops_order = head_final_ops_order self.loss_bg = loss_bg + self.backbone = backbone + self.use_pretrained_backbone = use_pretrained_backbone + self.dilation = dilation + self.initializer_range = initializer_range + + @classmethod + def from_backbone_config(cls, backbone_config: PretrainedConfig, **kwargs): + """Instantiate a [`FastConfig`] (or a derived class) from a pre-trained backbone model configuration. + + Args: + backbone_config ([`PretrainedConfig`]): + The backbone configuration. 
+ Returns: + [`DetrConfig`]: An instance of a configuration object + """ + return cls(backbone_config=backbone_config, **kwargs) diff --git a/src/transformers/models/fast/convert_fast_original_to_pytorch.py b/src/transformers/models/fast/convert_fast_original_to_pytorch.py index 0207f123b257..c624440bc0cb 100644 --- a/src/transformers/models/fast/convert_fast_original_to_pytorch.py +++ b/src/transformers/models/fast/convert_fast_original_to_pytorch.py @@ -22,7 +22,7 @@ import torch from PIL import Image -from transformers import FastConfig, FastForSceneTextRecognition +from transformers import FastConfig, FastForSceneTextRecognition, TextNetConfig from transformers.models.fast.image_processing_fast import FastImageProcessor @@ -84,43 +84,50 @@ def prepare_config(size_config_url, pooling_size, min_area, bbox_type, loss_bg): neck_dilation.append(layer_dict["dilation"]) neck_groups.append(layer_dict["groups"]) + textnet_config = TextNetConfig( + kernel_size=config_dict["first_conv"]["kernel_size"], + stride=config_dict["first_conv"]["stride"], + dilation=config_dict["first_conv"]["dilation"], + groups=config_dict["first_conv"]["groups"], + bias=config_dict["first_conv"]["bias"], + has_shuffle=config_dict["first_conv"]["has_shuffle"], + in_channels=config_dict["first_conv"]["in_channels"], + out_channels=config_dict["first_conv"]["out_channels"], + use_bn=config_dict["first_conv"]["use_bn"], + act_func=config_dict["first_conv"]["act_func"], + dropout_rate=config_dict["first_conv"]["dropout_rate"], + ops_order=config_dict["first_conv"]["ops_order"], + stage1_in_channels=backbone_config["stage1"]["in_channels"], + stage1_out_channels=backbone_config["stage1"]["out_channels"], + stage1_kernel_size=backbone_config["stage1"]["kernel_size"], + stage1_stride=backbone_config["stage1"]["stride"], + stage1_dilation=backbone_config["stage1"]["dilation"], + stage1_groups=backbone_config["stage1"]["groups"], + stage2_in_channels=backbone_config["stage2"]["in_channels"], + stage2_out_channels=backbone_config["stage2"]["out_channels"], + stage2_kernel_size=backbone_config["stage2"]["kernel_size"], + stage2_stride=backbone_config["stage2"]["stride"], + stage2_dilation=backbone_config["stage2"]["dilation"], + stage2_groups=backbone_config["stage2"]["groups"], + stage3_in_channels=backbone_config["stage3"]["in_channels"], + stage3_out_channels=backbone_config["stage3"]["out_channels"], + stage3_kernel_size=backbone_config["stage3"]["kernel_size"], + stage3_stride=backbone_config["stage3"]["stride"], + stage3_dilation=backbone_config["stage3"]["dilation"], + stage3_groups=backbone_config["stage3"]["groups"], + stage4_in_channels=backbone_config["stage4"]["in_channels"], + stage4_out_channels=backbone_config["stage4"]["out_channels"], + stage4_kernel_size=backbone_config["stage4"]["kernel_size"], + stage4_stride=backbone_config["stage4"]["stride"], + stage4_dilation=backbone_config["stage4"]["dilation"], + stage4_groups=backbone_config["stage4"]["groups"], + out_features=["stage1", "stage2", "stage3", "stage4"], + out_indices=[1, 2, 3, 4], + ) + return FastConfig( - backbone_kernel_size=config_dict["first_conv"]["kernel_size"], - backbone_stride=config_dict["first_conv"]["stride"], - backbone_dilation=config_dict["first_conv"]["dilation"], - backbone_groups=config_dict["first_conv"]["groups"], - backbone_bias=config_dict["first_conv"]["bias"], - backbone_has_shuffle=config_dict["first_conv"]["has_shuffle"], - backbone_in_channels=config_dict["first_conv"]["in_channels"], - 
backbone_out_channels=config_dict["first_conv"]["out_channels"], - backbone_use_bn=config_dict["first_conv"]["use_bn"], - backbone_act_func=config_dict["first_conv"]["act_func"], - backbone_dropout_rate=config_dict["first_conv"]["dropout_rate"], - backbone_ops_order=config_dict["first_conv"]["ops_order"], - backbone_stage1_in_channels=backbone_config["stage1"]["in_channels"], - backbone_stage1_out_channels=backbone_config["stage1"]["out_channels"], - backbone_stage1_kernel_size=backbone_config["stage1"]["kernel_size"], - backbone_stage1_stride=backbone_config["stage1"]["stride"], - backbone_stage1_dilation=backbone_config["stage1"]["dilation"], - backbone_stage1_groups=backbone_config["stage1"]["groups"], - backbone_stage2_in_channels=backbone_config["stage2"]["in_channels"], - backbone_stage2_out_channels=backbone_config["stage2"]["out_channels"], - backbone_stage2_kernel_size=backbone_config["stage2"]["kernel_size"], - backbone_stage2_stride=backbone_config["stage2"]["stride"], - backbone_stage2_dilation=backbone_config["stage2"]["dilation"], - backbone_stage2_groups=backbone_config["stage2"]["groups"], - backbone_stage3_in_channels=backbone_config["stage3"]["in_channels"], - backbone_stage3_out_channels=backbone_config["stage3"]["out_channels"], - backbone_stage3_kernel_size=backbone_config["stage3"]["kernel_size"], - backbone_stage3_stride=backbone_config["stage3"]["stride"], - backbone_stage3_dilation=backbone_config["stage3"]["dilation"], - backbone_stage3_groups=backbone_config["stage3"]["groups"], - backbone_stage4_in_channels=backbone_config["stage4"]["in_channels"], - backbone_stage4_out_channels=backbone_config["stage4"]["out_channels"], - backbone_stage4_kernel_size=backbone_config["stage4"]["kernel_size"], - backbone_stage4_stride=backbone_config["stage4"]["stride"], - backbone_stage4_dilation=backbone_config["stage4"]["dilation"], - backbone_stage4_groups=backbone_config["stage4"]["groups"], + use_timm_backbone=False, + backbone_config=textnet_config, neck_in_channels=neck_in_channels, neck_out_channels=neck_out_channels, neck_kernel_size=neck_kernel_size, @@ -164,7 +171,7 @@ def get_base_model_config(): def convert_fast_checkpoint(checkpoint_url, checkpoint_config_url, pytorch_dump_folder_path, validate_logits): response = requests.get(checkpoint_config_url) content = response.text - + print("Got respose") namespace = {} exec(content, namespace) @@ -197,7 +204,7 @@ def convert_fast_checkpoint(checkpoint_url, checkpoint_config_url, pytorch_dump_ if "train" in data_config: if "short_size" in data_config["train"]: size = data_config["train"]["short_size"] - + print("we got config") model = FastForSceneTextRecognition(config) fast_image_processor = FastImageProcessor( size={"height": size, "width": size}, @@ -209,7 +216,7 @@ def convert_fast_checkpoint(checkpoint_url, checkpoint_config_url, pytorch_dump_ state_dict_changed = copy.deepcopy(state_dict) for key in state_dict: val = state_dict_changed.pop(key) - new_key = key.replace("module.", "") + new_key = key.replace("module.", "").replace("backbone.", "backbone.textnet.") for search, replacement in rename_key_mappings.items(): if search in new_key: new_key = new_key.replace(search, replacement) diff --git a/src/transformers/models/fast/modeling_fast.py b/src/transformers/models/fast/modeling_fast.py index b7f5f45f1f00..ce5f2aab384f 100644 --- a/src/transformers/models/fast/modeling_fast.py +++ b/src/transformers/models/fast/modeling_fast.py @@ -21,8 +21,9 @@ import torch import torch.nn as nn import torch.nn.functional as F 
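The conversion script above now wraps the backbone hyper-parameters in a `TextNetConfig` and hands it to `FastConfig`. Condensed, the composition looks like this sketch (channel and kernel lists left at their defaults for brevity):

```python
from transformers import FastConfig, TextNetConfig

textnet_config = TextNetConfig(out_features=["stage1", "stage2", "stage3", "stage4"], out_indices=[1, 2, 3, 4])

# Explicit composition, as the conversion script does it.
config = FastConfig(use_timm_backbone=False, backbone_config=textnet_config)

# The convenience constructor forwards extra kwargs to __init__; use_timm_backbone has to be
# switched off here too, because its default of True rejects an explicit backbone_config.
config = FastConfig.from_backbone_config(textnet_config, use_timm_backbone=False)
```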
+from timm import create_model -from transformers import FastConfig, PreTrainedModel, add_start_docstrings +from transformers import AutoBackbone, FastConfig, PreTrainedModel, add_start_docstrings, requires_backends from transformers.utils import ModelOutput, add_start_docstrings_to_model_forward, replace_return_docstrings @@ -341,94 +342,6 @@ def _init_weights(self, module): module.bias.data.zero_() -class FastTextNet(nn.Module): - def __init__(self, config): - super().__init__() - self.first_conv = FASTConvLayer( - config.backbone_in_channels, - config.backbone_out_channels, - config.backbone_kernel_size, - config.backbone_stride, - config.backbone_dilation, - config.backbone_groups, - config.backbone_bias, - config.backbone_has_shuffle, - config.backbone_use_bn, - config.backbone_act_func, - config.backbone_dropout_rate, - config.backbone_ops_order, - ) - stage1 = [] - for stage_config in zip( - config.backbone_stage1_in_channels, - config.backbone_stage1_out_channels, - config.backbone_stage1_kernel_size, - config.backbone_stage1_stride, - config.backbone_stage1_dilation, - config.backbone_stage1_groups, - ): - stage1.append(FASTRepConvLayer(*stage_config)) - self.stage1 = nn.ModuleList(stage1) - - stage2 = [] - for stage_config in zip( - config.backbone_stage2_in_channels, - config.backbone_stage2_out_channels, - config.backbone_stage2_kernel_size, - config.backbone_stage2_stride, - config.backbone_stage2_dilation, - config.backbone_stage2_groups, - ): - stage2.append(FASTRepConvLayer(*stage_config)) - self.stage2 = nn.ModuleList(stage2) - - stage3 = [] - for stage_config in zip( - config.backbone_stage3_in_channels, - config.backbone_stage3_out_channels, - config.backbone_stage3_kernel_size, - config.backbone_stage3_stride, - config.backbone_stage3_dilation, - config.backbone_stage3_groups, - ): - stage3.append(FASTRepConvLayer(*stage_config)) - self.stage3 = nn.ModuleList(stage3) - - stage4 = [] - for stage_config in zip( - config.backbone_stage4_in_channels, - config.backbone_stage4_out_channels, - config.backbone_stage4_kernel_size, - config.backbone_stage4_stride, - config.backbone_stage4_dilation, - config.backbone_stage4_groups, - ): - stage4.append(FASTRepConvLayer(*stage_config)) - self.stage4 = nn.ModuleList(stage4) - - def forward(self, hidden_states): - hidden_states = self.first_conv(hidden_states) - output = [] - - for block in self.stage1: - hidden_states = block(hidden_states) - output.append(hidden_states) - - for block in self.stage2: - hidden_states = block(hidden_states) - output.append(hidden_states) - - for block in self.stage3: - hidden_states = block(hidden_states) - output.append(hidden_states) - - for block in self.stage4: - hidden_states = block(hidden_states) - output.append(hidden_states) - - return output - - class FASTNeck(nn.Module): def __init__(self, config): super().__init__() @@ -729,7 +642,26 @@ class FastForSceneTextRecognitionOutput(ModelOutput): class FastForSceneTextRecognition(FastPreTrainedModel): def __init__(self, config): super().__init__(config) - self.backbone = FastTextNet(config=config) + # self.backbone = FastTextNet(config=config) + self.config = config + + if config.use_timm_backbone: + requires_backends(self, ["timm"]) + kwargs = {} + if config.dilation: + kwargs["output_stride"] = 16 + backbone = create_model( + config.backbone, + pretrained=config.use_pretrained_backbone, + features_only=True, + out_indices=(1, 2, 3, 4), + in_chans=config.num_channels, + **kwargs, + ) + else: + backbone = 
AutoBackbone.from_config(config.backbone_config) + + self.backbone = backbone self.neck = FASTNeck(config=config) self.det_head = FASTHead(config=config) self.loss_bg = config.loss_bg @@ -812,9 +744,11 @@ def forward( """ # outputs = {} return_dict = return_dict if return_dict is not None else self.config.use_return_dict - hidden_states = self.backbone(pixel_values) + features = ( + self.backbone(pixel_values) if self.config.use_timm_backbone else self.backbone(pixel_values).feature_maps + ) - hidden_states = self.neck(hidden_states) + hidden_states = self.neck(features) text_detection_output = self.det_head(hidden_states) diff --git a/src/transformers/models/textnet/configuration_textnet.py b/src/transformers/models/textnet/configuration_textnet.py index 6bcb961a0f97..1f059550f50a 100644 --- a/src/transformers/models/textnet/configuration_textnet.py +++ b/src/transformers/models/textnet/configuration_textnet.py @@ -31,6 +31,7 @@ class TextNetConfig(BackboneConfigMixin, PretrainedConfig): r""" [Raghavan/fast_base_tt_800_finetune_ic17mlt](https://huggingface.co/Raghavan/fast_base_tt_800_finetune_ic17mlt) """ + model_type = "textnet" def __init__( self, diff --git a/tests/models/fast/test_modeling_fast.py b/tests/models/fast/test_modeling_fast.py index 44168b853961..b39a7b8e7e88 100644 --- a/tests/models/fast/test_modeling_fast.py +++ b/tests/models/fast/test_modeling_fast.py @@ -21,12 +21,14 @@ from transformers import ( FastConfig, + TextNetConfig, is_torch_available, ) from transformers.models.fast.image_processing_fast import FastImageProcessor from transformers.testing_utils import ( require_torch, require_vision, + slow, torch_device, ) @@ -203,43 +205,50 @@ def prepare_config_and_inputs(self): return config, {"pixel_values": pixel_values} def get_config(self): + textnet_config = TextNetConfig( + kernel_size=self.backbone_kernel_size, + stride=self.backbone_stride, + dilation=self.backbone_dilation, + groups=self.backbone_groups, + bias=self.backbone_bias, + has_shuffle=self.backbone_has_shuffle, + in_channels=self.backbone_in_channels, + out_channels=self.backbone_out_channels, + use_bn=self.backbone_use_bn, + act_func=self.backbone_act_func, + dropout_rate=self.backbone_dropout_rate, + ops_order=self.backbone_ops_order, + stage1_in_channels=self.backbone_stage1_in_channels, + stage1_out_channels=self.backbone_stage1_out_channels, + stage1_kernel_size=self.backbone_stage1_kernel_size, + stage1_stride=self.backbone_stage1_stride, + stage1_dilation=self.backbone_stage1_dilation, + stage1_groups=self.backbone_stage1_groups, + stage2_in_channels=self.backbone_stage2_in_channels, + stage2_out_channels=self.backbone_stage2_out_channels, + stage2_kernel_size=self.backbone_stage2_kernel_size, + stage2_stride=self.backbone_stage2_stride, + stage2_dilation=self.backbone_stage2_dilation, + stage2_groups=self.backbone_stage2_groups, + stage3_in_channels=self.backbone_stage3_in_channels, + stage3_out_channels=self.backbone_stage3_out_channels, + stage3_kernel_size=self.backbone_stage3_kernel_size, + stage3_stride=self.backbone_stage3_stride, + stage3_dilation=self.backbone_stage3_dilation, + stage3_groups=self.backbone_stage3_groups, + stage4_in_channels=self.backbone_stage4_in_channels, + stage4_out_channels=self.backbone_stage4_out_channels, + stage4_kernel_size=self.backbone_stage4_kernel_size, + stage4_stride=self.backbone_stage4_stride, + stage4_dilation=self.backbone_stage4_dilation, + stage4_groups=self.backbone_stage4_groups, + out_features=["stage1", "stage2", "stage3", "stage4"], + 
out_indices=[1, 2, 3, 4], + ) + return FastConfig( - backbone_kernel_size=self.backbone_kernel_size, - backbone_stride=self.backbone_stride, - backbone_dilation=self.backbone_dilation, - backbone_groups=self.backbone_groups, - backbone_bias=self.backbone_bias, - backbone_has_shuffle=self.backbone_has_shuffle, - backbone_in_channels=self.backbone_in_channels, - backbone_out_channels=self.backbone_out_channels, - backbone_use_bn=self.backbone_use_bn, - backbone_act_func=self.backbone_act_func, - backbone_dropout_rate=self.backbone_dropout_rate, - backbone_ops_order=self.backbone_ops_order, - backbone_stage1_in_channels=self.backbone_stage1_in_channels, - backbone_stage1_out_channels=self.backbone_stage1_out_channels, - backbone_stage1_kernel_size=self.backbone_stage1_kernel_size, - backbone_stage1_stride=self.backbone_stage1_stride, - backbone_stage1_dilation=self.backbone_stage1_dilation, - backbone_stage1_groups=self.backbone_stage1_groups, - backbone_stage2_in_channels=self.backbone_stage2_in_channels, - backbone_stage2_out_channels=self.backbone_stage2_out_channels, - backbone_stage2_kernel_size=self.backbone_stage2_kernel_size, - backbone_stage2_stride=self.backbone_stage2_stride, - backbone_stage2_dilation=self.backbone_stage2_dilation, - backbone_stage2_groups=self.backbone_stage2_groups, - backbone_stage3_in_channels=self.backbone_stage3_in_channels, - backbone_stage3_out_channels=self.backbone_stage3_out_channels, - backbone_stage3_kernel_size=self.backbone_stage3_kernel_size, - backbone_stage3_stride=self.backbone_stage3_stride, - backbone_stage3_dilation=self.backbone_stage3_dilation, - backbone_stage3_groups=self.backbone_stage3_groups, - backbone_stage4_in_channels=self.backbone_stage4_in_channels, - backbone_stage4_out_channels=self.backbone_stage4_out_channels, - backbone_stage4_kernel_size=self.backbone_stage4_kernel_size, - backbone_stage4_stride=self.backbone_stage4_stride, - backbone_stage4_dilation=self.backbone_stage4_dilation, - backbone_stage4_groups=self.backbone_stage4_groups, + use_timm_backbone=False, + backbone_config=textnet_config, neck_in_channels=self.neck_in_channels, neck_out_channels=self.neck_out_channels, neck_kernel_size=self.neck_kernel_size, @@ -387,7 +396,7 @@ def test_model_is_small(self): @require_torch @require_vision class FastModelIntegrationTest(unittest.TestCase): - # @slow + @slow def test_inference_fast_tiny_ic17mlt_model(self): model = FastForSceneTextRecognition.from_pretrained("Raghavan/ic17mlt_Fast_T") @@ -409,7 +418,7 @@ def prepare_image(): assert final_out[0]["bboxes"][0] == [224, 120, 246, 120, 246, 134, 224, 134] assert round(float(final_out[0]["scores"][0]), 5) == 0.95541 - # @slow + @slow def test_inference_fast_base_800_total_text_ic17mlt_model(self): model = FastForSceneTextRecognition.from_pretrained("Raghavan/fast_base_tt_800_finetune_ic17mlt") diff --git a/tests/models/textnet/test_modeling_textnet.py b/tests/models/textnet/test_modeling_textnet.py index 01337be50b7c..a95c072fba95 100644 --- a/tests/models/textnet/test_modeling_textnet.py +++ b/tests/models/textnet/test_modeling_textnet.py @@ -16,6 +16,7 @@ import inspect import unittest +from transformers import TextNetConfig from transformers.testing_utils import ( require_torch, torch_device, @@ -34,7 +35,6 @@ from transformers import ( TextNetBackbone, - TextNetConfig, TextNetForImageClassification, TextNetModel, is_torch_available, From 45ebd1eb8769df5842bbeb07ab5b46dacdf1e305 Mon Sep 17 00:00:00 2001 From: raghavanone Date: Thu, 9 Nov 2023 08:14:37 +0530 Subject: [PATCH 
045/152] Fix build and other things --- .../models/auto/image_processing_auto.py | 1 + .../models/fast/configuration_fast.py | 76 ++++++++++++++++++- .../fast/convert_fast_original_to_pytorch.py | 2 - .../models/fast/image_processing_fast.py | 4 - src/transformers/models/fast/modeling_fast.py | 17 ++++- .../utils/dummy_vision_objects.py | 7 ++ 6 files changed, 97 insertions(+), 10 deletions(-) diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index 55a128fe5519..2fac0833c940 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -104,6 +104,7 @@ ("swin2sr", "Swin2SRImageProcessor"), ("swinv2", "ViTImageProcessor"), ("table-transformer", "DetrImageProcessor"), + ("textnet", "TextNetImageProcessor"), ("timesformer", "VideoMAEImageProcessor"), ("tvlt", "TvltImageProcessor"), ("tvp", "TvpImageProcessor"), diff --git a/src/transformers/models/fast/configuration_fast.py b/src/transformers/models/fast/configuration_fast.py index 0ed87373049e..ce5e05b319f8 100644 --- a/src/transformers/models/fast/configuration_fast.py +++ b/src/transformers/models/fast/configuration_fast.py @@ -27,6 +27,78 @@ class FastConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`FastForSceneTextRecognition`]. It is used to + instantiate a FastForSceneTextRecognition model according to the specified arguments, defining the model + architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the + FastForSceneTextRecognition. + [Raghavan/fast_base_tt_800_finetune_ic17mlt](https://huggingface.co/Raghavan/fast_base_tt_800_finetune_ic17mlt) + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + use_timm_backbone (`bool`, *optional*, defaults to `True`): + Whether or not to use the `timm` library for the backbone. If set to `False`, will use the [`AutoBackbone`] + API. + backbone_config (`PretrainedConfig` or `dict`, *optional*): + The configuration of the backbone model. Only used in case `use_timm_backbone` is set to `False` in which + case it will default to `ResNetConfig()`. + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. 
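The `("textnet", "TextNetImageProcessor")` entry added to `image_processing_auto.py` above is what lets the auto class resolve TextNet checkpoints by `model_type`. A minimal, hypothetical usage sketch (the repo id below is illustrative, not a real checkpoint):

```python
# Illustrative only: once this mapping is in place, any checkpoint whose config.json
# carries model_type="textnet" resolves to TextNetImageProcessor automatically.
from transformers import AutoImageProcessor

image_processor = AutoImageProcessor.from_pretrained("username/textnet-checkpoint")  # hypothetical repo id
```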
+ neck_in_channels (`List[int]`, *optional*, defaults to `[64, 128, 256, 512]`): + neck_out_channels (`List[int]`, *optional*, defaults to `[128, 128, 128, 128]`): + neck_kernel_size (`List[int]`, *optional*, defaults to `[[3, 3], [3, 3], [3, 3], [3, 3]]`): + neck_stride (`List[int]`, *optional*, defaults to `[1, 1, 1, 1]`): + neck_dilation (`List[int]`, *optional*, defaults to `[1, 1, 1, 1]`): + neck_groups (`List[int]`, *optional*, defaults to `[1, 1, 1, 1]`): + head_pooling_size (`int`, *optional*, defaults to 9): + head_dropout_ratio (`int`, *optional*, defaults to 0): + head_conv_in_channels (`int`, *optional*, defaults to 512): + head_conv_out_channels (`int`, *optional*, defaults to 128): + head_conv_kernel_size (`List[int]`, *optional*, defaults to `[3, 3]`): + head_conv_stride (`int`, *optional*, defaults to 1): + head_conv_dilation (`int`, *optional*, defaults to 1): + head_conv_groups (`int`, *optional*, defaults to 1): + head_final_kernel_size (`int`, *optional*, defaults to 1): + head_final_stride (`int`, *optional*, defaults to 1): + head_final_dilation (`int`, *optional*, defaults to 1): + head_final_groups (`int`, *optional*, defaults to 1): + head_final_bias (`bool`, *optional*, defaults to `False`): + head_final_has_shuffle (`bool`, *optional*, defaults to `False`): + head_final_in_channels (`int`, *optional*, defaults to 128): + head_final_out_channels (`int`, *optional*, defaults to 5): + head_final_use_bn (`bool`, *optional*, defaults to `False`): + head_final_act_func (`str`, *optional*): + head_final_dropout_rate (`int`, *optional*, defaults to 0): + head_final_ops_order (`str`, *optional*, defaults to `"weight"`): + loss_bg (`bool`, *optional*, defaults to `False`): + backbone (`str`, *optional*, defaults to `"textnet"`): + Name of convolutional backbone to use in case `use_timm_backbone` = `True`. Supports any convolutional + backbone from the timm package. For a list of all available models, see [this + page](https://rwightman.github.io/pytorch-image-models/#load-a-pretrained-model). + use_pretrained_backbone (`bool`, *optional*, defaults to `True`): + Whether to use pretrained weights for the backbone. Only supported when `use_timm_backbone` = `True`. + dilation (`bool`, *optional*, defaults to `False`): + Whether to replace stride with dilation in the last convolutional block (DC5). Only supported when + `use_timm_backbone` = `True`. + initializer_range (`float`, *optional*, defaults to 0.02): + Relative weight of the generalized IoU loss of the bounding box in the Hungarian matching cost. + + Examples: + + ```python + >>> from transformers import FastConfig, FastForSceneTextRecognition + + >>> # Initializing a Fast Config + >>> configuration = FastConfig() + + >>> # Initializing a model (with random weights) + >>> model = FastForSceneTextRecognition(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" r""" [Raghavan/fast_base_tt_800_finetune_ic17mlt](https://huggingface.co/Raghavan/fast_base_tt_800_finetune_ic17mlt) """ @@ -63,7 +135,7 @@ def __init__( head_final_dropout_rate=0, head_final_ops_order="weight", loss_bg=False, - backbone="resnet50", + backbone="textnet", use_pretrained_backbone=True, dilation=False, initializer_range=0.02, @@ -136,6 +208,6 @@ def from_backbone_config(cls, backbone_config: PretrainedConfig, **kwargs): backbone_config ([`PretrainedConfig`]): The backbone configuration. 
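A hypothetical call to `from_backbone_config`, mirroring how the tests in this series build the nested config; the `out_features` values follow the test helpers, and any extra kwargs are simply forwarded to `FastConfig.__init__`:

```python
from transformers import FastConfig, TextNetConfig

# Hypothetical usage of the classmethod documented above.
backbone_config = TextNetConfig(out_features=["stage1", "stage2", "stage3", "stage4"])
config = FastConfig.from_backbone_config(backbone_config, use_timm_backbone=False)
```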
Returns: - [`DetrConfig`]: An instance of a configuration object + [`FastConfig`]: An instance of a configuration object """ return cls(backbone_config=backbone_config, **kwargs) diff --git a/src/transformers/models/fast/convert_fast_original_to_pytorch.py b/src/transformers/models/fast/convert_fast_original_to_pytorch.py index c624440bc0cb..c7a8e622aae7 100644 --- a/src/transformers/models/fast/convert_fast_original_to_pytorch.py +++ b/src/transformers/models/fast/convert_fast_original_to_pytorch.py @@ -171,7 +171,6 @@ def get_base_model_config(): def convert_fast_checkpoint(checkpoint_url, checkpoint_config_url, pytorch_dump_folder_path, validate_logits): response = requests.get(checkpoint_config_url) content = response.text - print("Got respose") namespace = {} exec(content, namespace) @@ -204,7 +203,6 @@ def convert_fast_checkpoint(checkpoint_url, checkpoint_config_url, pytorch_dump_ if "train" in data_config: if "short_size" in data_config["train"]: size = data_config["train"]["short_size"] - print("we got config") model = FastForSceneTextRecognition(config) fast_image_processor = FastImageProcessor( size={"height": size, "width": size}, diff --git a/src/transformers/models/fast/image_processing_fast.py b/src/transformers/models/fast/image_processing_fast.py index ae4505d4fffa..eb5020195f2d 100644 --- a/src/transformers/models/fast/image_processing_fast.py +++ b/src/transformers/models/fast/image_processing_fast.py @@ -94,8 +94,6 @@ class FastImageProcessor(BaseImageProcessor): number of channels of the image. Can be overridden by the `image_std` parameter in the `preprocess` method. min_area (`int`, *optional*, defaults to 200): Threshold for min area for results - bbox_type (`str`, *optional*, defaults to `"rect"`): - Type of bbox, rect or poly pooling_size (`int`, *optional*, defaults to 9): Pooling size for text detection """ @@ -115,7 +113,6 @@ def __init__( image_mean: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None, min_area: int = 200, - bbox_type: str = "rect", pooling_size: int = 9, **kwargs, ) -> None: @@ -136,7 +133,6 @@ def __init__( self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD self.min_area = min_area # self.threshold = threshold - self.bbox_type = bbox_type self.pooling_size = pooling_size @classmethod diff --git a/src/transformers/models/fast/modeling_fast.py b/src/transformers/models/fast/modeling_fast.py index ce5f2aab384f..ba9460e2d029 100644 --- a/src/transformers/models/fast/modeling_fast.py +++ b/src/transformers/models/fast/modeling_fast.py @@ -21,9 +21,22 @@ import torch import torch.nn as nn import torch.nn.functional as F -from timm import create_model -from transformers import AutoBackbone, FastConfig, PreTrainedModel, add_start_docstrings, requires_backends +from ...utils import is_timm_available + + +if is_timm_available(): + from timm import create_model + + +from transformers import ( + AutoBackbone, + FastConfig, + PreTrainedModel, + add_start_docstrings, + is_timm_available, + requires_backends, +) from transformers.utils import ModelOutput, add_start_docstrings_to_model_forward, replace_return_docstrings diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py index 18c6a27bd7dc..4ee5d2c9c296 100644 --- a/src/transformers/utils/dummy_vision_objects.py +++ b/src/transformers/utils/dummy_vision_objects.py @@ -198,6 +198,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["vision"]) +class 
FastImageProcessor(metaclass=DummyObject): + _backends = ["vision"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["vision"]) + + class FlavaFeatureExtractor(metaclass=DummyObject): _backends = ["vision"] From 5c6dbaf32f24d63d115a8feee7e20b9ba1ebd198 Mon Sep 17 00:00:00 2001 From: raghavanone Date: Thu, 9 Nov 2023 08:56:40 +0530 Subject: [PATCH 046/152] Fix build --- src/transformers/__init__.py | 1 + src/transformers/models/fast/configuration_fast.py | 4 ++-- src/transformers/models/fast/modeling_fast.py | 1 - 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 7cfffec8463b..5c0d2bed5b5f 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -1279,6 +1279,7 @@ _import_structure["models.dpt"].extend(["DPTFeatureExtractor", "DPTImageProcessor"]) _import_structure["models.efficientformer"].append("EfficientFormerImageProcessor") _import_structure["models.efficientnet"].append("EfficientNetImageProcessor") + _import_structure["models.fast"].extend(["FastImageProcessor"]) _import_structure["models.flava"].extend(["FlavaFeatureExtractor", "FlavaImageProcessor", "FlavaProcessor"]) _import_structure["models.fuyu"].extend(["FuyuImageProcessor", "FuyuProcessor"]) _import_structure["models.glpn"].extend(["GLPNFeatureExtractor", "GLPNImageProcessor"]) diff --git a/src/transformers/models/fast/configuration_fast.py b/src/transformers/models/fast/configuration_fast.py index ce5e05b319f8..e0f88dabe16f 100644 --- a/src/transformers/models/fast/configuration_fast.py +++ b/src/transformers/models/fast/configuration_fast.py @@ -73,7 +73,7 @@ class FastConfig(PretrainedConfig): head_final_dropout_rate (`int`, *optional*, defaults to 0): head_final_ops_order (`str`, *optional*, defaults to `"weight"`): loss_bg (`bool`, *optional*, defaults to `False`): - backbone (`str`, *optional*, defaults to `"textnet"`): + backbone (`str`, *optional*, defaults to `"resnet50"`): Name of convolutional backbone to use in case `use_timm_backbone` = `True`. Supports any convolutional backbone from the timm package. For a list of all available models, see [this page](https://rwightman.github.io/pytorch-image-models/#load-a-pretrained-model). 
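The dummy `FastImageProcessor` object and the guarded `timm` import from the previous hunks both follow the library's soft-dependency pattern: optional packages are only imported when available, and user-facing classes fall back to placeholders that raise an informative error. A rough, self-contained sketch of the import side (helper names as in the diff; the error path in the real model goes through `requires_backends`):

```python
from transformers.utils import is_timm_available

if is_timm_available():
    from timm import create_model  # only imported when the optional dependency is present


def load_timm_backbone(name: str, pretrained: bool = True):
    # The explicit check here is just to keep the sketch self-contained.
    if not is_timm_available():
        raise ImportError("`use_timm_backbone=True` requires the optional `timm` package.")
    return create_model(name, pretrained=pretrained, features_only=True)
```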
@@ -135,7 +135,7 @@ def __init__( head_final_dropout_rate=0, head_final_ops_order="weight", loss_bg=False, - backbone="textnet", + backbone="resnet50", use_pretrained_backbone=True, dilation=False, initializer_range=0.02, diff --git a/src/transformers/models/fast/modeling_fast.py b/src/transformers/models/fast/modeling_fast.py index ba9460e2d029..644bf67ca4f4 100644 --- a/src/transformers/models/fast/modeling_fast.py +++ b/src/transformers/models/fast/modeling_fast.py @@ -655,7 +655,6 @@ class FastForSceneTextRecognitionOutput(ModelOutput): class FastForSceneTextRecognition(FastPreTrainedModel): def __init__(self, config): super().__init__(config) - # self.backbone = FastTextNet(config=config) self.config = config if config.use_timm_backbone: From 643ccacda1021d769a02b3d0029a741cbb7450dd Mon Sep 17 00:00:00 2001 From: raghavanone Date: Thu, 9 Nov 2023 10:16:15 +0530 Subject: [PATCH 047/152] some cleanups --- .../models/fast/configuration_fast.py | 37 +++++++++++--- src/transformers/models/fast/modeling_fast.py | 51 +++---------------- .../models/textnet/configuration_textnet.py | 6 --- .../models/textnet/modeling_textnet.py | 32 ++---------- tests/models/fast/test_modeling_fast.py | 20 +++----- 5 files changed, 48 insertions(+), 98 deletions(-) diff --git a/src/transformers/models/fast/configuration_fast.py b/src/transformers/models/fast/configuration_fast.py index e0f88dabe16f..5bfc9ee6fb2a 100644 --- a/src/transformers/models/fast/configuration_fast.py +++ b/src/transformers/models/fast/configuration_fast.py @@ -47,32 +47,59 @@ class FastConfig(PretrainedConfig): num_channels (`int`, *optional*, defaults to 3): The number of input channels. neck_in_channels (`List[int]`, *optional*, defaults to `[64, 128, 256, 512]`): + Denotes the in channels of FASTRepConvLayer in neck module. neck_out_channels (`List[int]`, *optional*, defaults to `[128, 128, 128, 128]`): + Denotes the out channels of FASTRepConvLayer in neck module. Should be of same length of `neck_in_channels` neck_kernel_size (`List[int]`, *optional*, defaults to `[[3, 3], [3, 3], [3, 3], [3, 3]]`): + Denotes the kernel_size of FASTRepConvLayer in neck module. Should be of same length of `neck_in_channels` neck_stride (`List[int]`, *optional*, defaults to `[1, 1, 1, 1]`): + Denotes the neck_stride of FASTRepConvLayer in neck module. Should be of same length of `neck_in_channels` neck_dilation (`List[int]`, *optional*, defaults to `[1, 1, 1, 1]`): + Denotes the neck_dilation of FASTRepConvLayer in neck module. Should be of same length of + `neck_in_channels` neck_groups (`List[int]`, *optional*, defaults to `[1, 1, 1, 1]`): + Denotes the groups of FASTRepConvLayer in neck module. Should be of same length of `neck_in_channels` head_pooling_size (`int`, *optional*, defaults to 9): + Denotes the pooling size of head layer head_dropout_ratio (`int`, *optional*, defaults to 0): + Denotes the dropout ratio used in dropout layer of head layer.. head_conv_in_channels (`int`, *optional*, defaults to 512): + Denotes the in channels of first conv layer in head layer. head_conv_out_channels (`int`, *optional*, defaults to 128): + Denotes the out channels of first conv layer in head layer. head_conv_kernel_size (`List[int]`, *optional*, defaults to `[3, 3]`): + Denotes the conv kernel size of first conv layer in head layer. head_conv_stride (`int`, *optional*, defaults to 1): + Denotes the conv stride of first conv layer in head layer. 
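Because each `neck_*` entry is consumed positionally, all of these lists must share one length. A simplified sketch of how the neck constructor zips them (a plain `Conv2d` stands in for `FASTRepConvLayer` here; the zip itself mirrors `FASTNeck` later in the series):

```python
import torch.nn as nn

# Simplified sketch of how the per-layer neck lists are consumed.
def build_neck(config):
    layers = []
    for in_channels, out_channels, kernel_size, stride in zip(
        config.neck_in_channels, config.neck_out_channels, config.neck_kernel_size, config.neck_stride
    ):
        layers.append(
            nn.Conv2d(
                in_channels,
                out_channels,
                kernel_size=tuple(kernel_size),
                stride=stride,
                padding=(kernel_size[0] // 2, kernel_size[1] // 2),
                bias=False,
            )
        )
    return nn.ModuleList(layers)
```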
head_conv_dilation (`int`, *optional*, defaults to 1): + Denotes the conv dilation of first conv layer in head layer. head_conv_groups (`int`, *optional*, defaults to 1): + Denotes the conv groups of first conv layer in head layer. head_final_kernel_size (`int`, *optional*, defaults to 1): + Denotes the conv kernel size of final conv layer in head layer. head_final_stride (`int`, *optional*, defaults to 1): + Denotes the conv stride of final conv layer in head layer. head_final_dilation (`int`, *optional*, defaults to 1): + Denotes the conv dilation of final conv layer in head layer. head_final_groups (`int`, *optional*, defaults to 1): + Denotes the conv groups of final conv layer in head layer. head_final_bias (`bool`, *optional*, defaults to `False`): + Denotes the conv bais of final conv layer in head layer. head_final_has_shuffle (`bool`, *optional*, defaults to `False`): + Denotes the conv shuffle of final conv layer in head layer. head_final_in_channels (`int`, *optional*, defaults to 128): + Denotes the in channels of final conv layer in head layer. head_final_out_channels (`int`, *optional*, defaults to 5): - head_final_use_bn (`bool`, *optional*, defaults to `False`): + Denotes the out channels of final conv layer in head layer. + head_final_use_batch_norm (`bool`, *optional*, defaults to `False`): + Denotes to use or not to use batch norm of final conv layer in head layer. head_final_act_func (`str`, *optional*): + Denotes to activation function of final conv layer in head layer. head_final_dropout_rate (`int`, *optional*, defaults to 0): + Denotes to dropout_rate of dropout layer of final conv layer in head layer. head_final_ops_order (`str`, *optional*, defaults to `"weight"`): - loss_bg (`bool`, *optional*, defaults to `False`): + Denotes to dropout_rate of dropout layer of final conv layer in head layer. + loss_bg (``, *optional*, defaults to `False`): backbone (`str`, *optional*, defaults to `"resnet50"`): Name of convolutional backbone to use in case `use_timm_backbone` = `True`. Supports any convolutional backbone from the timm package. 
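Taken together, the `head_conv_*` and `head_final_*` values documented above describe a small two-stage detection head. A toy stand-in using the documented defaults (the real `FASTHead` uses a re-parameterizable conv plus pooling and upsampling, so this is only an approximation of the shapes involved):

```python
import torch
import torch.nn as nn

# Toy stand-in for the detection head with the documented defaults: a 3x3 conv
# (512 -> 128), dropout, then a 1x1 projection to head_final_out_channels=5 maps.
toy_head = nn.Sequential(
    nn.Conv2d(512, 128, kernel_size=3, padding=1, bias=False),  # head_conv_*
    nn.Dropout2d(p=0.0),                                        # head_dropout_ratio
    nn.Conv2d(128, 5, kernel_size=1, bias=False),               # head_final_*
)

features = torch.randn(1, 512, 160, 160)  # e.g. concatenated neck feature maps
print(toy_head(features).shape)  # torch.Size([1, 5, 160, 160])
```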
For a list of all available models, see [this @@ -130,7 +157,7 @@ def __init__( head_final_has_shuffle=False, head_final_in_channels=128, head_final_out_channels=5, - head_final_use_bn=False, + head_final_use_batch_norm=False, head_final_act_func=None, head_final_dropout_rate=0, head_final_ops_order="weight", @@ -188,10 +215,6 @@ def __init__( self.head_final_has_shuffle = head_final_has_shuffle self.head_final_in_channels = head_final_in_channels self.head_final_out_channels = head_final_out_channels - self.head_final_use_bn = head_final_use_bn - self.head_final_act_func = head_final_act_func - self.head_final_dropout_rate = head_final_dropout_rate - self.head_final_ops_order = head_final_ops_order self.loss_bg = loss_bg self.backbone = backbone diff --git a/src/transformers/models/fast/modeling_fast.py b/src/transformers/models/fast/modeling_fast.py index 644bf67ca4f4..207f1115bc8b 100644 --- a/src/transformers/models/fast/modeling_fast.py +++ b/src/transformers/models/fast/modeling_fast.py @@ -68,27 +68,12 @@ def get_same_padding(kernel_size): if isinstance(kernel_size, tuple): - p1 = get_same_padding(kernel_size[0]) - p2 = get_same_padding(kernel_size[1]) - return p1, p2 + padding1 = get_same_padding(kernel_size[0]) + padding2 = get_same_padding(kernel_size[1]) + return padding1, padding2 return kernel_size // 2 -def build_activation(act_func, inplace=True): - if act_func == "relu": - return nn.ReLU(inplace=inplace) - elif act_func == "relu6": - return nn.ReLU6(inplace=inplace) - elif act_func == "tanh": - return nn.Tanh() - elif act_func == "sigmoid": - return nn.Sigmoid() - elif act_func is None: - return None - else: - raise ValueError("do not support: %s" % act_func) - - class FASTConvLayer(nn.Module): def __init__( self, @@ -100,10 +85,6 @@ def __init__( groups=1, bias=False, has_shuffle=False, - use_batch_norm=True, - act_func="relu", - dropout_rate=0, - use_act=True, ): super().__init__() @@ -113,7 +94,6 @@ def __init__( self.groups = groups self.bias = bias self.has_shuffle = has_shuffle - self.activation_function = act_func padding = get_same_padding(self.kernel_size) if isinstance(padding, int): @@ -132,29 +112,17 @@ def __init__( groups=groups, bias=bias, ) - self.batch_norm = nn.Identity() - if use_batch_norm: - self.batch_norm = nn.BatchNorm2d(out_channels) - - self.activation = nn.Identity() - if use_act: - act = build_activation(self.activation_function, True) - if act is not None: - self.activation = act def forward(self, hidden_states): if self.training: if hasattr(self, "fused_conv"): delattr(self, "fused_conv") hidden_states = self.conv(hidden_states) - hidden_states = self.batch_norm(hidden_states) - return self.activation(hidden_states) + return hidden_states else: if not hasattr(self, "fused_conv"): - setattr(self, "fused_conv", self.fuse_conv_batch_norm(self.conv, self.batch_norm)) + setattr(self, "fused_conv", self.fuse_conv_batch_norm(self.conv, nn.Identity())) hidden_states = self.fused_conv(hidden_states) - if self.activation is not None: - hidden_states = self.activation(hidden_states) return hidden_states def fuse_conv_batch_norm(self, conv, batch_norm): @@ -411,10 +379,6 @@ def __init__(self, config): config.head_final_groups, config.head_final_bias, config.head_final_has_shuffle, - config.head_final_use_bn, - config.head_final_act_func, - config.head_final_dropout_rate, - config.head_final_ops_order, ) self.pooling_size = config.head_pooling_size @@ -519,7 +483,7 @@ def emb_loss( return loss -def emb_loss_batch(emb, instance, kernel, training_mask, 
reduce=True, loss_weight=0.25, bg_sample=False): +def emb_loss_batch(emb, instance, kernel, training_mask, reduce=True, loss_weight=0.25): loss_batch = emb.new_zeros((emb.size(0)), dtype=torch.float32) for i in range(loss_batch.size(0)): @@ -676,7 +640,6 @@ def __init__(self, config): self.backbone = backbone self.neck = FASTNeck(config=config) self.det_head = FASTHead(config=config) - self.loss_bg = config.loss_bg self.pooling_1s = nn.MaxPool2d( kernel_size=config.head_pooling_size, stride=1, padding=(config.head_pooling_size - 1) // 2 @@ -714,7 +677,7 @@ def loss(self, hidden, labels): loss_kernel = dice_loss_with_masks(kernels, gt_kernels, selected_masks, reduce=False) loss_kernel = torch.mean(loss_kernel, dim=0) - loss_emb = emb_loss_batch(embs, gt_instances, gt_kernels, training_masks, reduce=False, bg_sample=self.loss_bg) + loss_emb = emb_loss_batch(embs, gt_instances, gt_kernels, training_masks, reduce=False) return torch.mean(loss_text) + torch.mean(loss_kernel) + torch.mean(loss_emb) diff --git a/src/transformers/models/textnet/configuration_textnet.py b/src/transformers/models/textnet/configuration_textnet.py index 1f059550f50a..4ad83123f4ff 100644 --- a/src/transformers/models/textnet/configuration_textnet.py +++ b/src/transformers/models/textnet/configuration_textnet.py @@ -43,10 +43,7 @@ def __init__( has_shuffle=False, in_channels=3, out_channels=64, - use_bn=True, act_func="relu", - dropout_rate=0, - ops_order="weight_bn_act", stage1_in_channels=[64, 64, 64], stage1_out_channels=[64, 64, 64], stage1_kernel_size=[[3, 3], [3, 3], [3, 3]], @@ -87,10 +84,7 @@ def __init__( self.has_shuffle = has_shuffle self.in_channels = in_channels self.out_channels = out_channels - self.use_bn = use_bn self.act_func = act_func - self.dropout_rate = dropout_rate - self.ops_order = ops_order self.stage1_in_channels = stage1_in_channels self.stage1_out_channels = stage1_out_channels diff --git a/src/transformers/models/textnet/modeling_textnet.py b/src/transformers/models/textnet/modeling_textnet.py index 72950f0776ec..8e9cd5335569 100644 --- a/src/transformers/models/textnet/modeling_textnet.py +++ b/src/transformers/models/textnet/modeling_textnet.py @@ -22,6 +22,7 @@ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from transformers import PreTrainedModel, add_start_docstrings +from transformers.activations import ACT2CLS from transformers.modeling_outputs import ( BackboneOutput, BaseModelOutputWithPoolingAndNoAttention, @@ -74,21 +75,6 @@ def get_same_padding(kernel_size): return kernel_size // 2 -def build_activation(act_func, inplace=True): - if act_func == "relu": - return nn.ReLU(inplace=inplace) - elif act_func == "relu6": - return nn.ReLU6(inplace=inplace) - elif act_func == "tanh": - return nn.Tanh() - elif act_func == "sigmoid": - return nn.Sigmoid() - elif act_func is None: - return None - else: - raise ValueError("do not support: %s" % act_func) - - class TextNetConvLayer(nn.Module): def __init__( self, @@ -100,10 +86,7 @@ def __init__( groups=1, bias=False, has_shuffle=False, - use_batch_norm=True, act_func="relu", - dropout_rate=0, - use_act=True, ): super().__init__() @@ -133,14 +116,12 @@ def __init__( bias=bias, ) self.batch_norm = nn.Identity() - if use_batch_norm: - self.batch_norm = nn.BatchNorm2d(out_channels) + + self.batch_norm = nn.BatchNorm2d(out_channels) self.activation = nn.Identity() - if use_act: - act = build_activation(self.activation_function, True) - if act is not None: - self.activation = act + if self.activation_function is not None: + 
self.activation = ACT2CLS[self.activation_function](inplace=True) def forward(self, hidden_states): if self.training: @@ -371,10 +352,7 @@ def __init__(self, config): config.groups, config.bias, config.has_shuffle, - config.use_bn, config.act_func, - config.dropout_rate, - config.ops_order, ) stage1 = [] for stage_config in zip( diff --git a/tests/models/fast/test_modeling_fast.py b/tests/models/fast/test_modeling_fast.py index b39a7b8e7e88..31656261a426 100644 --- a/tests/models/fast/test_modeling_fast.py +++ b/tests/models/fast/test_modeling_fast.py @@ -28,7 +28,6 @@ from transformers.testing_utils import ( require_torch, require_vision, - slow, torch_device, ) @@ -59,7 +58,7 @@ def __init__( backbone_in_channels=3, backbone_out_channels=64, backbone_use_bn=True, - backbone_act_func="relu", + backbone_activation_func="relu", backbone_dropout_rate=0, backbone_ops_order="weight_bn_act", backbone_stage1_in_channels=[64], @@ -108,7 +107,7 @@ def __init__( head_final_has_shuffle=False, head_final_in_channels=4, head_final_out_channels=5, - head_final_use_bn=False, + head_final_use_batch_norm=False, head_final_act_func=None, head_final_dropout_rate=0, head_final_ops_order="weight", @@ -127,7 +126,7 @@ def __init__( self.backbone_in_channels = backbone_in_channels self.backbone_out_channels = backbone_out_channels self.backbone_use_bn = backbone_use_bn - self.backbone_act_func = backbone_act_func + self.backbone_act_func = backbone_activation_func self.backbone_dropout_rate = backbone_dropout_rate self.backbone_ops_order = backbone_ops_order @@ -184,7 +183,7 @@ def __init__( self.head_final_has_shuffle = head_final_has_shuffle self.head_final_in_channels = head_final_in_channels self.head_final_out_channels = head_final_out_channels - self.head_final_use_bn = head_final_use_bn + self.head_final_use_bn = head_final_use_batch_norm self.head_final_act_func = head_final_act_func self.head_final_dropout_rate = head_final_dropout_rate self.head_final_ops_order = head_final_ops_order @@ -214,10 +213,7 @@ def get_config(self): has_shuffle=self.backbone_has_shuffle, in_channels=self.backbone_in_channels, out_channels=self.backbone_out_channels, - use_bn=self.backbone_use_bn, act_func=self.backbone_act_func, - dropout_rate=self.backbone_dropout_rate, - ops_order=self.backbone_ops_order, stage1_in_channels=self.backbone_stage1_in_channels, stage1_out_channels=self.backbone_stage1_out_channels, stage1_kernel_size=self.backbone_stage1_kernel_size, @@ -271,10 +267,6 @@ def get_config(self): head_final_has_shuffle=self.head_final_has_shuffle, head_final_in_channels=self.head_final_in_channels, head_final_out_channels=self.head_final_out_channels, - head_final_use_bn=self.head_final_use_bn, - head_final_act_func=self.head_final_act_func, - head_final_dropout_rate=self.head_final_dropout_rate, - head_final_ops_order=self.head_final_ops_order, ) def create_and_check_model(self, config, input): @@ -396,7 +388,7 @@ def test_model_is_small(self): @require_torch @require_vision class FastModelIntegrationTest(unittest.TestCase): - @slow + # @slow def test_inference_fast_tiny_ic17mlt_model(self): model = FastForSceneTextRecognition.from_pretrained("Raghavan/ic17mlt_Fast_T") @@ -418,7 +410,7 @@ def prepare_image(): assert final_out[0]["bboxes"][0] == [224, 120, 246, 120, 246, 134, 224, 134] assert round(float(final_out[0]["scores"][0]), 5) == 0.95541 - @slow + # @slow def test_inference_fast_base_800_total_text_ic17mlt_model(self): model = 
FastForSceneTextRecognition.from_pretrained("Raghavan/fast_base_tt_800_finetune_ic17mlt") From d50df437fcd0f1730bb4b7b0051bfdef1f9ec2f9 Mon Sep 17 00:00:00 2001 From: raghavanone Date: Thu, 9 Nov 2023 12:10:04 +0530 Subject: [PATCH 048/152] some cleanups --- .../models/fast/configuration_fast.py | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/src/transformers/models/fast/configuration_fast.py b/src/transformers/models/fast/configuration_fast.py index 5bfc9ee6fb2a..734b0eeede0d 100644 --- a/src/transformers/models/fast/configuration_fast.py +++ b/src/transformers/models/fast/configuration_fast.py @@ -91,15 +91,6 @@ class FastConfig(PretrainedConfig): Denotes the in channels of final conv layer in head layer. head_final_out_channels (`int`, *optional*, defaults to 5): Denotes the out channels of final conv layer in head layer. - head_final_use_batch_norm (`bool`, *optional*, defaults to `False`): - Denotes to use or not to use batch norm of final conv layer in head layer. - head_final_act_func (`str`, *optional*): - Denotes to activation function of final conv layer in head layer. - head_final_dropout_rate (`int`, *optional*, defaults to 0): - Denotes to dropout_rate of dropout layer of final conv layer in head layer. - head_final_ops_order (`str`, *optional*, defaults to `"weight"`): - Denotes to dropout_rate of dropout layer of final conv layer in head layer. - loss_bg (``, *optional*, defaults to `False`): backbone (`str`, *optional*, defaults to `"resnet50"`): Name of convolutional backbone to use in case `use_timm_backbone` = `True`. Supports any convolutional backbone from the timm package. For a list of all available models, see [this @@ -157,11 +148,6 @@ def __init__( head_final_has_shuffle=False, head_final_in_channels=128, head_final_out_channels=5, - head_final_use_batch_norm=False, - head_final_act_func=None, - head_final_dropout_rate=0, - head_final_ops_order="weight", - loss_bg=False, backbone="resnet50", use_pretrained_backbone=True, dilation=False, @@ -216,7 +202,6 @@ def __init__( self.head_final_in_channels = head_final_in_channels self.head_final_out_channels = head_final_out_channels - self.loss_bg = loss_bg self.backbone = backbone self.use_pretrained_backbone = use_pretrained_backbone self.dilation = dilation From 6acd3bafc9a52befe2ab42611824c461d69d1cc4 Mon Sep 17 00:00:00 2001 From: raghavanone Date: Thu, 9 Nov 2023 13:02:24 +0530 Subject: [PATCH 049/152] Some more cleanups --- src/transformers/models/fast/modeling_fast.py | 56 +++++-------------- .../models/textnet/modeling_textnet.py | 56 ++++--------------- tests/models/fast/test_modeling_fast.py | 5 +- 3 files changed, 30 insertions(+), 87 deletions(-) diff --git a/src/transformers/models/fast/modeling_fast.py b/src/transformers/models/fast/modeling_fast.py index 207f1115bc8b..d244d1889aa5 100644 --- a/src/transformers/models/fast/modeling_fast.py +++ b/src/transformers/models/fast/modeling_fast.py @@ -81,26 +81,19 @@ def __init__( out_channels, kernel_size=3, stride=1, - dilation=1, - groups=1, bias=False, - has_shuffle=False, ): super().__init__() self.kernel_size = kernel_size self.stride = stride - self.dilation = dilation - self.groups = groups - self.bias = bias - self.has_shuffle = has_shuffle padding = get_same_padding(self.kernel_size) - if isinstance(padding, int): - padding *= self.dilation - else: - padding[0] *= self.dilation - padding[1] *= self.dilation + # if isinstance(padding, int): + # padding *= self.dilation + # else: + # padding[0] *= self.dilation + # padding[1] *= 
self.dilation self.conv = nn.Conv2d( in_channels, @@ -108,9 +101,7 @@ def __init__( kernel_size=kernel_size, stride=stride, padding=padding, - dilation=dilation, - groups=groups, - bias=bias, + bias=False, ) def forward(self, hidden_states): @@ -121,7 +112,7 @@ def forward(self, hidden_states): return hidden_states else: if not hasattr(self, "fused_conv"): - setattr(self, "fused_conv", self.fuse_conv_batch_norm(self.conv, nn.Identity())) + setattr(self, "fused_conv", self.conv) hidden_states = self.fused_conv(hidden_states) return hidden_states @@ -141,19 +132,17 @@ def fuse_conv_batch_norm(self, conv, batch_norm): class FASTRepConvLayer(nn.Module): - def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, groups=1): + def __init__(self, in_channels, out_channels, kernel_size, stride=1): super().__init__() self.in_channels = in_channels self.out_channels = out_channels self.kernel_size = kernel_size self.stride = stride - self.dilation = dilation - self.groups = groups - padding = (int(((kernel_size[0] - 1) * dilation) / 2), int(((kernel_size[1] - 1) * dilation) / 2)) + padding = (int((kernel_size[0] - 1) / 2), int((kernel_size[1] - 1) / 2)) - self.nonlinearity = nn.ReLU(inplace=True) + self.activation = nn.ReLU(inplace=True) self.main_conv = nn.Conv2d( in_channels=in_channels, @@ -161,14 +150,12 @@ def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, kernel_size=kernel_size, stride=stride, padding=padding, - dilation=dilation, - groups=groups, bias=False, ) self.main_batch_norm = nn.BatchNorm2d(num_features=out_channels) - ver_pad = (int(((kernel_size[0] - 1) * dilation) / 2), 0) - hor_pad = (0, int(((kernel_size[1] - 1) * dilation) / 2)) + ver_pad = (int((kernel_size[0] - 1) / 2), 0) + hor_pad = (0, int((kernel_size[1] - 1) / 2)) if kernel_size[1] != 1: self.vertical_conv = nn.Conv2d( @@ -177,8 +164,6 @@ def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, kernel_size=(kernel_size[0], 1), stride=stride, padding=ver_pad, - dilation=dilation, - groups=groups, bias=False, ) self.vertical_batch_norm = nn.BatchNorm2d(num_features=out_channels) @@ -192,8 +177,6 @@ def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, kernel_size=(1, kernel_size[1]), stride=stride, padding=hor_pad, - dilation=dilation, - groups=groups, bias=False, ) self.horizontal_batch_norm = nn.BatchNorm2d(num_features=out_channels) @@ -228,17 +211,17 @@ def forward(self, hidden_states): else: id_out = self.rbr_identity(hidden_states) - return self.nonlinearity(main_outputs + vertical_outputs + horizontal_outputs + id_out) + return self.activation(main_outputs + vertical_outputs + horizontal_outputs + id_out) else: if not hasattr(self, "fused_conv"): self.prepare_for_eval() - return self.nonlinearity(self.fused_conv(hidden_states)) + return self.activation(self.fused_conv(hidden_states)) def _identity_to_conv(self, identity): if identity is None: return 0, 0 if not hasattr(self, "id_tensor"): - input_dim = self.in_channels // self.groups + input_dim = self.in_channels kernel_value = np.zeros((self.in_channels, input_dim, 1, 1), dtype=np.float32) for i in range(self.in_channels): kernel_value[i, i % input_dim, 0, 0] = 1 @@ -296,8 +279,6 @@ def prepare_for_eval(self): kernel_size=self.main_conv.kernel_size, stride=self.main_conv.stride, padding=self.main_conv.padding, - dilation=self.main_conv.dilation, - groups=self.main_conv.groups, bias=True, ) self.fused_conv.weight.data = kernel @@ -332,8 +313,6 @@ def 
__init__(self, config): config.neck_out_channels, config.neck_kernel_size, config.neck_stride, - config.neck_dilation, - config.neck_groups, ) ) self.num_layers = len(reduce_layer_configs) @@ -366,8 +345,6 @@ def __init__(self, config): config.head_conv_out_channels, config.head_conv_kernel_size, config.head_conv_stride, - config.head_conv_dilation, - config.head_conv_groups, ) self.final = FASTConvLayer( @@ -375,10 +352,7 @@ def __init__(self, config): config.head_final_out_channels, config.head_final_kernel_size, config.head_final_stride, - config.head_final_dilation, - config.head_final_groups, config.head_final_bias, - config.head_final_has_shuffle, ) self.pooling_size = config.head_pooling_size diff --git a/src/transformers/models/textnet/modeling_textnet.py b/src/transformers/models/textnet/modeling_textnet.py index 8e9cd5335569..daf76ba2667e 100644 --- a/src/transformers/models/textnet/modeling_textnet.py +++ b/src/transformers/models/textnet/modeling_textnet.py @@ -82,28 +82,20 @@ def __init__( out_channels, kernel_size=3, stride=1, - dilation=1, - groups=1, - bias=False, - has_shuffle=False, act_func="relu", ): super().__init__() self.kernel_size = kernel_size self.stride = stride - self.dilation = dilation - self.groups = groups - self.bias = bias - self.has_shuffle = has_shuffle self.activation_function = act_func padding = get_same_padding(self.kernel_size) - if isinstance(padding, int): - padding *= self.dilation - else: - padding[0] *= self.dilation - padding[1] *= self.dilation + # if isinstance(padding, int): + # padding *= self.dilation + # else: + # padding[0] *= self.dilation + # padding[1] *= self.dilation self.conv = nn.Conv2d( in_channels, @@ -111,9 +103,7 @@ def __init__( kernel_size=kernel_size, stride=stride, padding=padding, - dilation=dilation, - groups=groups, - bias=bias, + bias=False, ) self.batch_norm = nn.Identity() @@ -154,17 +144,15 @@ def fuse_conv_batch_norm(self, conv, batch_norm): class TestNetRepConvLayer(nn.Module): - def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, groups=1): + def __init__(self, in_channels, out_channels, kernel_size, stride=1): super().__init__() self.in_channels = in_channels self.out_channels = out_channels self.kernel_size = kernel_size self.stride = stride - self.dilation = dilation - self.groups = groups - padding = (int(((kernel_size[0] - 1) * dilation) / 2), int(((kernel_size[1] - 1) * dilation) / 2)) + padding = (int((kernel_size[0] - 1) / 2), int((kernel_size[1] - 1) / 2)) self.nonlinearity = nn.ReLU(inplace=True) @@ -174,14 +162,12 @@ def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, kernel_size=kernel_size, stride=stride, padding=padding, - dilation=dilation, - groups=groups, bias=False, ) self.main_batch_norm = nn.BatchNorm2d(num_features=out_channels) - ver_pad = (int(((kernel_size[0] - 1) * dilation) / 2), 0) - hor_pad = (0, int(((kernel_size[1] - 1) * dilation) / 2)) + ver_pad = (int((kernel_size[0] - 1) / 2), 0) + hor_pad = (0, int((kernel_size[1] - 1) / 2)) if kernel_size[1] != 1: self.vertical_conv = nn.Conv2d( @@ -190,23 +176,19 @@ def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, kernel_size=(kernel_size[0], 1), stride=stride, padding=ver_pad, - dilation=dilation, - groups=groups, bias=False, ) self.vertical_batch_norm = nn.BatchNorm2d(num_features=out_channels) else: self.vertical_conv, self.vertical_batch_norm = None, None - if kernel_size[0] != 1: # 卷积核的高大于1 -> 有水平卷积 + if kernel_size[0] != 1: self.horizontal_conv = 
nn.Conv2d( in_channels=in_channels, out_channels=out_channels, kernel_size=(1, kernel_size[1]), stride=stride, padding=hor_pad, - dilation=dilation, - groups=groups, bias=False, ) self.horizontal_batch_norm = nn.BatchNorm2d(num_features=out_channels) @@ -251,7 +233,7 @@ def _identity_to_conv(self, identity): if identity is None: return 0, 0 if not hasattr(self, "id_tensor"): - input_dim = self.in_channels // self.groups + input_dim = self.in_channels kernel_value = np.zeros((self.in_channels, input_dim, 1, 1), dtype=np.float32) for i in range(self.in_channels): kernel_value[i, i % input_dim, 0, 0] = 1 @@ -309,8 +291,6 @@ def prepare_for_eval(self): kernel_size=self.main_conv.kernel_size, stride=self.main_conv.stride, padding=self.main_conv.padding, - dilation=self.main_conv.dilation, - groups=self.main_conv.groups, bias=True, ) self.fused_conv.weight.data = kernel @@ -348,10 +328,6 @@ def __init__(self, config): config.out_channels, config.kernel_size, config.stride, - config.dilation, - config.groups, - config.bias, - config.has_shuffle, config.act_func, ) stage1 = [] @@ -360,8 +336,6 @@ def __init__(self, config): config.stage1_out_channels, config.stage1_kernel_size, config.stage1_stride, - config.stage1_dilation, - config.stage1_groups, ): stage1.append(TestNetRepConvLayer(*stage_config)) self.stage1 = nn.ModuleList(stage1) @@ -372,8 +346,6 @@ def __init__(self, config): config.stage2_out_channels, config.stage2_kernel_size, config.stage2_stride, - config.stage2_dilation, - config.stage2_groups, ): stage2.append(TestNetRepConvLayer(*stage_config)) self.stage2 = nn.ModuleList(stage2) @@ -384,8 +356,6 @@ def __init__(self, config): config.stage3_out_channels, config.stage3_kernel_size, config.stage3_stride, - config.stage3_dilation, - config.stage3_groups, ): stage3.append(TestNetRepConvLayer(*stage_config)) self.stage3 = nn.ModuleList(stage3) @@ -396,8 +366,6 @@ def __init__(self, config): config.stage4_out_channels, config.stage4_kernel_size, config.stage4_stride, - config.stage4_dilation, - config.stage4_groups, ): stage4.append(TestNetRepConvLayer(*stage_config)) self.stage4 = nn.ModuleList(stage4) diff --git a/tests/models/fast/test_modeling_fast.py b/tests/models/fast/test_modeling_fast.py index 31656261a426..2a4fe0ab7fd3 100644 --- a/tests/models/fast/test_modeling_fast.py +++ b/tests/models/fast/test_modeling_fast.py @@ -28,6 +28,7 @@ from transformers.testing_utils import ( require_torch, require_vision, + slow, torch_device, ) @@ -388,7 +389,7 @@ def test_model_is_small(self): @require_torch @require_vision class FastModelIntegrationTest(unittest.TestCase): - # @slow + @slow def test_inference_fast_tiny_ic17mlt_model(self): model = FastForSceneTextRecognition.from_pretrained("Raghavan/ic17mlt_Fast_T") @@ -410,7 +411,7 @@ def prepare_image(): assert final_out[0]["bboxes"][0] == [224, 120, 246, 120, 246, 134, 224, 134] assert round(float(final_out[0]["scores"][0]), 5) == 0.95541 - # @slow + @slow def test_inference_fast_base_800_total_text_ic17mlt_model(self): model = FastForSceneTextRecognition.from_pretrained("Raghavan/fast_base_tt_800_finetune_ic17mlt") From 85c128a97402a57db57db4b9278c91d1a575d989 Mon Sep 17 00:00:00 2001 From: raghavanone Date: Thu, 9 Nov 2023 13:22:25 +0530 Subject: [PATCH 050/152] Fix build --- .../models/fast/configuration_fast.py | 29 ------------ .../models/textnet/configuration_textnet.py | 24 ---------- tests/models/fast/test_modeling_fast.py | 44 ------------------- tests/models/textnet/test_modeling_textnet.py | 36 --------------- 4 files changed, 
133 deletions(-) diff --git a/src/transformers/models/fast/configuration_fast.py b/src/transformers/models/fast/configuration_fast.py index 734b0eeede0d..6a5f3a425fa6 100644 --- a/src/transformers/models/fast/configuration_fast.py +++ b/src/transformers/models/fast/configuration_fast.py @@ -54,11 +54,6 @@ class FastConfig(PretrainedConfig): Denotes the kernel_size of FASTRepConvLayer in neck module. Should be of same length of `neck_in_channels` neck_stride (`List[int]`, *optional*, defaults to `[1, 1, 1, 1]`): Denotes the neck_stride of FASTRepConvLayer in neck module. Should be of same length of `neck_in_channels` - neck_dilation (`List[int]`, *optional*, defaults to `[1, 1, 1, 1]`): - Denotes the neck_dilation of FASTRepConvLayer in neck module. Should be of same length of - `neck_in_channels` - neck_groups (`List[int]`, *optional*, defaults to `[1, 1, 1, 1]`): - Denotes the groups of FASTRepConvLayer in neck module. Should be of same length of `neck_in_channels` head_pooling_size (`int`, *optional*, defaults to 9): Denotes the pooling size of head layer head_dropout_ratio (`int`, *optional*, defaults to 0): @@ -71,22 +66,12 @@ class FastConfig(PretrainedConfig): Denotes the conv kernel size of first conv layer in head layer. head_conv_stride (`int`, *optional*, defaults to 1): Denotes the conv stride of first conv layer in head layer. - head_conv_dilation (`int`, *optional*, defaults to 1): - Denotes the conv dilation of first conv layer in head layer. - head_conv_groups (`int`, *optional*, defaults to 1): - Denotes the conv groups of first conv layer in head layer. head_final_kernel_size (`int`, *optional*, defaults to 1): Denotes the conv kernel size of final conv layer in head layer. head_final_stride (`int`, *optional*, defaults to 1): Denotes the conv stride of final conv layer in head layer. - head_final_dilation (`int`, *optional*, defaults to 1): - Denotes the conv dilation of final conv layer in head layer. - head_final_groups (`int`, *optional*, defaults to 1): - Denotes the conv groups of final conv layer in head layer. head_final_bias (`bool`, *optional*, defaults to `False`): Denotes the conv bais of final conv layer in head layer. - head_final_has_shuffle (`bool`, *optional*, defaults to `False`): - Denotes the conv shuffle of final conv layer in head layer. head_final_in_channels (`int`, *optional*, defaults to 128): Denotes the in channels of final conv layer in head layer. 
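Several of the parameters dropped in this commit only fed the re-parameterizable conv layers touched in the previous one. The inference-time trick those layers rely on (`fuse_conv_batch_norm` / `prepare_for_eval`) can be summarized with a toy helper; this is a sketch of the general conv + batch-norm folding identity, not the exact code in `modeling_fast.py`:

```python
import torch
import torch.nn as nn

# Toy conv + batch-norm fusion: y = gamma * (W*x + b - mean) / sqrt(var + eps) + beta
# folds into a single convolution with adjusted weight and bias.
def fuse_conv_bn(conv: nn.Conv2d, bn: nn.BatchNorm2d) -> nn.Conv2d:
    fused = nn.Conv2d(
        conv.in_channels,
        conv.out_channels,
        conv.kernel_size,
        stride=conv.stride,
        padding=conv.padding,
        bias=True,
    )
    with torch.no_grad():
        std = (bn.running_var + bn.eps).sqrt()
        scale = bn.weight / std
        fused.weight.copy_(conv.weight * scale.reshape(-1, 1, 1, 1))
        conv_bias = torch.zeros(conv.out_channels) if conv.bias is None else conv.bias
        fused.bias.copy_(bn.bias + (conv_bias - bn.running_mean) * scale)
    return fused
```

After fusion, only the single cached convolution has to run in eval mode, which is why the RepConv layers stash a `fused_conv` attribute instead of recomputing the three branches.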
head_final_out_channels (`int`, *optional*, defaults to 5): @@ -130,22 +115,15 @@ def __init__( neck_out_channels=[128, 128, 128, 128], neck_kernel_size=[[3, 3], [3, 3], [3, 3], [3, 3]], neck_stride=[1, 1, 1, 1], - neck_dilation=[1, 1, 1, 1], - neck_groups=[1, 1, 1, 1], head_pooling_size=9, head_dropout_ratio=0, head_conv_in_channels=512, head_conv_out_channels=128, head_conv_kernel_size=[3, 3], head_conv_stride=1, - head_conv_dilation=1, - head_conv_groups=1, head_final_kernel_size=1, head_final_stride=1, - head_final_dilation=1, - head_final_groups=1, head_final_bias=False, - head_final_has_shuffle=False, head_final_in_channels=128, head_final_out_channels=5, backbone="resnet50", @@ -180,8 +158,6 @@ def __init__( self.neck_out_channels = neck_out_channels self.neck_kernel_size = neck_kernel_size self.neck_stride = neck_stride - self.neck_dilation = neck_dilation - self.neck_groups = neck_groups self.head_pooling_size = head_pooling_size self.head_dropout_ratio = head_dropout_ratio @@ -190,15 +166,10 @@ def __init__( self.head_conv_out_channels = head_conv_out_channels self.head_conv_kernel_size = head_conv_kernel_size self.head_conv_stride = head_conv_stride - self.head_conv_dilation = head_conv_dilation - self.head_conv_groups = head_conv_groups self.head_final_kernel_size = head_final_kernel_size self.head_final_stride = head_final_stride - self.head_final_dilation = head_final_dilation - self.head_final_groups = head_final_groups self.head_final_bias = head_final_bias - self.head_final_has_shuffle = head_final_has_shuffle self.head_final_in_channels = head_final_in_channels self.head_final_out_channels = head_final_out_channels diff --git a/src/transformers/models/textnet/configuration_textnet.py b/src/transformers/models/textnet/configuration_textnet.py index 4ad83123f4ff..e67d02a21bac 100644 --- a/src/transformers/models/textnet/configuration_textnet.py +++ b/src/transformers/models/textnet/configuration_textnet.py @@ -37,10 +37,6 @@ def __init__( self, kernel_size=3, stride=2, - dilation=1, - groups=1, - bias=False, - has_shuffle=False, in_channels=3, out_channels=64, act_func="relu", @@ -48,26 +44,18 @@ def __init__( stage1_out_channels=[64, 64, 64], stage1_kernel_size=[[3, 3], [3, 3], [3, 3]], stage1_stride=[1, 2, 1], - stage1_dilation=[1, 1, 1], - stage1_groups=[1, 1, 1], stage2_in_channels=[64, 128, 128, 128], stage2_out_channels=[128, 128, 128, 128], stage2_kernel_size=[[3, 3], [1, 3], [3, 3], [3, 1]], stage2_stride=[2, 1, 1, 1], - stage2_dilation=[1, 1, 1, 1], - stage2_groups=[1, 1, 1, 1], stage3_in_channels=[128, 256, 256, 256], stage3_out_channels=[256, 256, 256, 256], stage3_kernel_size=[[3, 3], [3, 3], [3, 1], [1, 3]], stage3_stride=[2, 1, 1, 1], - stage3_dilation=[1, 1, 1, 1], - stage3_groups=[1, 1, 1, 1], stage4_in_channels=[256, 512, 512, 512], stage4_out_channels=[512, 512, 512, 512], stage4_kernel_size=[[3, 3], [3, 1], [1, 3], [3, 3]], stage4_stride=[2, 1, 1, 1], - stage4_dilation=[1, 1, 1, 1], - stage4_groups=[1, 1, 1, 1], hidden_sizes=[64, 64, 128, 256, 512], initializer_range=0.02, out_features=None, @@ -78,10 +66,6 @@ def __init__( self.kernel_size = kernel_size self.stride = stride - self.dilation = dilation - self.groups = groups - self.bias = bias - self.has_shuffle = has_shuffle self.in_channels = in_channels self.out_channels = out_channels self.act_func = act_func @@ -90,29 +74,21 @@ def __init__( self.stage1_out_channels = stage1_out_channels self.stage1_kernel_size = stage1_kernel_size self.stage1_stride = stage1_stride - self.stage1_dilation = 
stage1_dilation - self.stage1_groups = stage1_groups self.stage2_in_channels = stage2_in_channels self.stage2_out_channels = stage2_out_channels self.stage2_kernel_size = stage2_kernel_size self.stage2_stride = stage2_stride - self.stage2_dilation = stage2_dilation - self.stage2_groups = stage2_groups self.stage3_in_channels = stage3_in_channels self.stage3_out_channels = stage3_out_channels self.stage3_kernel_size = stage3_kernel_size self.stage3_stride = stage3_stride - self.stage3_dilation = stage3_dilation - self.stage3_groups = stage3_groups self.stage4_in_channels = stage4_in_channels self.stage4_out_channels = stage4_out_channels self.stage4_kernel_size = stage4_kernel_size self.stage4_stride = stage4_stride - self.stage4_dilation = stage4_dilation - self.stage4_groups = stage4_groups self.initializer_range = initializer_range self.hidden_sizes = hidden_sizes diff --git a/tests/models/fast/test_modeling_fast.py b/tests/models/fast/test_modeling_fast.py index 2a4fe0ab7fd3..50a2effa2eb9 100644 --- a/tests/models/fast/test_modeling_fast.py +++ b/tests/models/fast/test_modeling_fast.py @@ -54,7 +54,6 @@ def __init__( backbone_stride=2, backbone_dilation=1, backbone_groups=1, - backbone_bias=False, backbone_has_shuffle=False, backbone_in_channels=3, backbone_out_channels=64, @@ -90,22 +89,15 @@ def __init__( neck_out_channels=[128], neck_kernel_size=[[3, 3]], neck_stride=[1], - neck_dilation=[1], - neck_groups=[1], head_pooling_size=9, head_dropout_ratio=0.1, head_conv_in_channels=128, head_conv_out_channels=4, head_conv_kernel_size=[3, 3], head_conv_stride=1, - head_conv_dilation=1, - head_conv_groups=1, head_final_kernel_size=1, head_final_stride=1, - head_final_dilation=1, - head_final_groups=1, head_final_bias=False, - head_final_has_shuffle=False, head_final_in_channels=4, head_final_out_channels=5, head_final_use_batch_norm=False, @@ -120,9 +112,6 @@ def __init__( self.parent = parent self.backbone_kernel_size = backbone_kernel_size self.backbone_stride = backbone_stride - self.backbone_dilation = backbone_dilation - self.backbone_groups = backbone_groups - self.backbone_bias = backbone_bias self.backbone_has_shuffle = backbone_has_shuffle self.backbone_in_channels = backbone_in_channels self.backbone_out_channels = backbone_out_channels @@ -135,36 +124,26 @@ def __init__( self.backbone_stage1_out_channels = backbone_stage1_out_channels self.backbone_stage1_kernel_size = backbone_stage1_kernel_size self.backbone_stage1_stride = backbone_stage1_stride - self.backbone_stage1_dilation = backbone_stage1_dilation - self.backbone_stage1_groups = backbone_stage1_groups self.backbone_stage2_in_channels = backbone_stage2_in_channels self.backbone_stage2_out_channels = backbone_stage2_out_channels self.backbone_stage2_kernel_size = backbone_stage2_kernel_size self.backbone_stage2_stride = backbone_stage2_stride - self.backbone_stage2_dilation = backbone_stage2_dilation - self.backbone_stage2_groups = backbone_stage2_groups self.backbone_stage3_in_channels = backbone_stage3_in_channels self.backbone_stage3_out_channels = backbone_stage3_out_channels self.backbone_stage3_kernel_size = backbone_stage3_kernel_size self.backbone_stage3_stride = backbone_stage3_stride - self.backbone_stage3_dilation = backbone_stage3_dilation - self.backbone_stage3_groups = backbone_stage3_groups self.backbone_stage4_in_channels = backbone_stage4_in_channels self.backbone_stage4_out_channels = backbone_stage4_out_channels self.backbone_stage4_kernel_size = backbone_stage4_kernel_size self.backbone_stage4_stride = 
backbone_stage4_stride - self.backbone_stage4_dilation = backbone_stage4_dilation - self.backbone_stage4_groups = backbone_stage4_groups self.neck_in_channels = neck_in_channels self.neck_out_channels = neck_out_channels self.neck_kernel_size = neck_kernel_size self.neck_stride = neck_stride - self.neck_dilation = neck_dilation - self.neck_groups = neck_groups self.head_pooling_size = head_pooling_size self.head_dropout_ratio = head_dropout_ratio @@ -173,15 +152,10 @@ def __init__( self.head_conv_out_channels = head_conv_out_channels self.head_conv_kernel_size = head_conv_kernel_size self.head_conv_stride = head_conv_stride - self.head_conv_dilation = head_conv_dilation - self.head_conv_groups = head_conv_groups self.head_final_kernel_size = head_final_kernel_size self.head_final_stride = head_final_stride - self.head_final_dilation = head_final_dilation - self.head_final_groups = head_final_groups self.head_final_bias = head_final_bias - self.head_final_has_shuffle = head_final_has_shuffle self.head_final_in_channels = head_final_in_channels self.head_final_out_channels = head_final_out_channels self.head_final_use_bn = head_final_use_batch_norm @@ -208,9 +182,6 @@ def get_config(self): textnet_config = TextNetConfig( kernel_size=self.backbone_kernel_size, stride=self.backbone_stride, - dilation=self.backbone_dilation, - groups=self.backbone_groups, - bias=self.backbone_bias, has_shuffle=self.backbone_has_shuffle, in_channels=self.backbone_in_channels, out_channels=self.backbone_out_channels, @@ -219,26 +190,18 @@ def get_config(self): stage1_out_channels=self.backbone_stage1_out_channels, stage1_kernel_size=self.backbone_stage1_kernel_size, stage1_stride=self.backbone_stage1_stride, - stage1_dilation=self.backbone_stage1_dilation, - stage1_groups=self.backbone_stage1_groups, stage2_in_channels=self.backbone_stage2_in_channels, stage2_out_channels=self.backbone_stage2_out_channels, stage2_kernel_size=self.backbone_stage2_kernel_size, stage2_stride=self.backbone_stage2_stride, - stage2_dilation=self.backbone_stage2_dilation, - stage2_groups=self.backbone_stage2_groups, stage3_in_channels=self.backbone_stage3_in_channels, stage3_out_channels=self.backbone_stage3_out_channels, stage3_kernel_size=self.backbone_stage3_kernel_size, stage3_stride=self.backbone_stage3_stride, - stage3_dilation=self.backbone_stage3_dilation, - stage3_groups=self.backbone_stage3_groups, stage4_in_channels=self.backbone_stage4_in_channels, stage4_out_channels=self.backbone_stage4_out_channels, stage4_kernel_size=self.backbone_stage4_kernel_size, stage4_stride=self.backbone_stage4_stride, - stage4_dilation=self.backbone_stage4_dilation, - stage4_groups=self.backbone_stage4_groups, out_features=["stage1", "stage2", "stage3", "stage4"], out_indices=[1, 2, 3, 4], ) @@ -250,22 +213,15 @@ def get_config(self): neck_out_channels=self.neck_out_channels, neck_kernel_size=self.neck_kernel_size, neck_stride=self.neck_stride, - neck_dilation=self.neck_dilation, - neck_groups=self.neck_groups, head_pooling_size=self.head_pooling_size, head_dropout_ratio=self.head_dropout_ratio, head_conv_in_channels=self.head_conv_in_channels, head_conv_out_channels=self.head_conv_out_channels, head_conv_kernel_size=self.head_conv_kernel_size, head_conv_stride=self.head_conv_stride, - head_conv_dilation=self.head_conv_dilation, - head_conv_groups=self.head_conv_groups, head_final_kernel_size=self.head_final_kernel_size, head_final_stride=self.head_final_stride, - head_final_dilation=self.head_final_dilation, - 
head_final_groups=self.head_final_groups, head_final_bias=self.head_final_bias, - head_final_has_shuffle=self.head_final_has_shuffle, head_final_in_channels=self.head_final_in_channels, head_final_out_channels=self.head_final_out_channels, ) diff --git a/tests/models/textnet/test_modeling_textnet.py b/tests/models/textnet/test_modeling_textnet.py index a95c072fba95..c19e5c8c2536 100644 --- a/tests/models/textnet/test_modeling_textnet.py +++ b/tests/models/textnet/test_modeling_textnet.py @@ -47,10 +47,6 @@ def __init__( parent, kernel_size=3, stride=2, - dilation=1, - groups=1, - bias=False, - has_shuffle=False, in_channels=3, out_channels=64, use_bn=True, @@ -61,26 +57,18 @@ def __init__( stage1_out_channels=[64], stage1_kernel_size=[[3, 3]], stage1_stride=[1], - stage1_dilation=[1], - stage1_groups=[1], stage2_in_channels=[64], stage2_out_channels=[128], stage2_kernel_size=[[3, 1]], stage2_stride=[2], - stage2_dilation=[1], - stage2_groups=[1], stage3_in_channels=[128], stage3_out_channels=[256], stage3_kernel_size=[[1, 3]], stage3_stride=[2], - stage3_dilation=[1], - stage3_groups=[1], stage4_in_channels=[256], stage4_out_channels=[512], stage4_kernel_size=[[3, 3]], stage4_stride=[2], - stage4_dilation=[1], - stage4_groups=[1], out_features=["stage1", "stage2", "stage3", "stage4"], out_indices=[1, 2, 3, 4], batch_size=3, @@ -95,10 +83,6 @@ def __init__( self.parent = parent self.kernel_size = kernel_size self.stride = stride - self.dilation = dilation - self.groups = groups - self.bias = bias - self.has_shuffle = has_shuffle self.in_channels = in_channels self.out_channels = out_channels self.use_bn = use_bn @@ -110,29 +94,21 @@ def __init__( self.stage1_out_channels = stage1_out_channels self.stage1_kernel_size = stage1_kernel_size self.stage1_stride = stage1_stride - self.stage1_dilation = stage1_dilation - self.stage1_groups = stage1_groups self.stage2_in_channels = stage2_in_channels self.stage2_out_channels = stage2_out_channels self.stage2_kernel_size = stage2_kernel_size self.stage2_stride = stage2_stride - self.stage2_dilation = stage2_dilation - self.stage2_groups = stage2_groups self.stage3_in_channels = stage3_in_channels self.stage3_out_channels = stage3_out_channels self.stage3_kernel_size = stage3_kernel_size self.stage3_stride = stage3_stride - self.stage3_dilation = stage3_dilation - self.stage3_groups = stage3_groups self.stage4_in_channels = stage4_in_channels self.stage4_out_channels = stage4_out_channels self.stage4_kernel_size = stage4_kernel_size self.stage4_stride = stage4_stride - self.stage4_dilation = stage4_dilation - self.stage4_groups = stage4_groups self.out_features = out_features self.out_indices = out_indices @@ -151,10 +127,6 @@ def get_config(self): return TextNetConfig( kernel_size=self.kernel_size, stride=self.stride, - dilation=self.dilation, - groups=self.groups, - bias=self.bias, - has_shuffle=self.has_shuffle, in_channels=self.in_channels, out_channels=self.out_channels, use_bn=self.use_bn, @@ -165,26 +137,18 @@ def get_config(self): stage1_out_channels=self.stage1_out_channels, stage1_kernel_size=self.stage1_kernel_size, stage1_stride=self.stage1_stride, - stage1_dilation=self.stage1_dilation, - stage1_groups=self.stage1_groups, stage2_in_channels=self.stage2_in_channels, stage2_out_channels=self.stage2_out_channels, stage2_kernel_size=self.stage2_kernel_size, stage2_stride=self.stage2_stride, - stage2_dilation=self.stage2_dilation, - stage2_groups=self.stage2_groups, stage3_in_channels=self.stage3_in_channels, 
stage3_out_channels=self.stage3_out_channels, stage3_kernel_size=self.stage3_kernel_size, stage3_stride=self.stage3_stride, - stage3_dilation=self.stage3_dilation, - stage3_groups=self.stage3_groups, stage4_in_channels=self.stage4_in_channels, stage4_out_channels=self.stage4_out_channels, stage4_kernel_size=self.stage4_kernel_size, stage4_stride=self.stage4_stride, - stage4_dilation=self.stage4_dilation, - stage4_groups=self.stage4_groups, out_features=self.out_features, out_indices=self.out_indices, hidden_sizes=self.hidden_sizes, From c22ba88c123af31c26edd07535f56e1a36d74f98 Mon Sep 17 00:00:00 2001 From: raghavanone Date: Thu, 9 Nov 2023 13:54:09 +0530 Subject: [PATCH 051/152] Incorporate PR feedbacks --- .../fast/convert_fast_original_to_pytorch.py | 6 +----- .../models/fast/image_processing_fast.py | 2 +- src/transformers/models/fast/modeling_fast.py | 20 +++++++++++++------ tests/models/fast/test_modeling_fast.py | 7 +++---- 4 files changed, 19 insertions(+), 16 deletions(-) diff --git a/src/transformers/models/fast/convert_fast_original_to_pytorch.py b/src/transformers/models/fast/convert_fast_original_to_pytorch.py index c7a8e622aae7..c98243c16457 100644 --- a/src/transformers/models/fast/convert_fast_original_to_pytorch.py +++ b/src/transformers/models/fast/convert_fast_original_to_pytorch.py @@ -30,11 +30,7 @@ small_config_url = "https://raw.githubusercontent.com/czczup/FAST/main/config/fast/nas-configs/fast_small.config" base_config_url = "https://raw.githubusercontent.com/czczup/FAST/main/config/fast/nas-configs/fast_base.config" -rename_key_mappings = { - "bn": "batch_norm", - "hor": "horizontal", - "ver": "vertical", -} +rename_key_mappings = {"bn": "batch_norm", "hor": "horizontal", "ver": "vertical", "det_head": "text_detection_head"} def prepare_img(): diff --git a/src/transformers/models/fast/image_processing_fast.py b/src/transformers/models/fast/image_processing_fast.py index eb5020195f2d..5e70a83ac58a 100644 --- a/src/transformers/models/fast/image_processing_fast.py +++ b/src/transformers/models/fast/image_processing_fast.py @@ -389,7 +389,7 @@ def _max_pooling(self, x, scale=1): def post_process_text_detection(self, output, target_sizes, threshold, bbox_type="rect"): scale = 2 img_size = (self.size["height"], self.size["width"]) - out = output["hidden_states"] + out = output["last_hidden_state"] batch_size = out.size(0) final_results = {} diff --git a/src/transformers/models/fast/modeling_fast.py b/src/transformers/models/fast/modeling_fast.py index d244d1889aa5..179aa9eb6402 100644 --- a/src/transformers/models/fast/modeling_fast.py +++ b/src/transformers/models/fast/modeling_fast.py @@ -15,7 +15,7 @@ """ PyTorch FAST model.""" from dataclasses import dataclass -from typing import Dict, Optional +from typing import Dict, Optional, Tuple import numpy as np import torch @@ -578,7 +578,8 @@ class FastForSceneTextRecognitionOutput(ModelOutput): """ loss: Optional[torch.Tensor] = None - hidden_states: Optional[torch.FloatTensor] = None + last_hidden_state: Optional[torch.FloatTensor] = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None @add_start_docstrings( @@ -613,7 +614,7 @@ def __init__(self, config): self.backbone = backbone self.neck = FASTNeck(config=config) - self.det_head = FASTHead(config=config) + self.text_detection_head = FASTHead(config=config) self.pooling_1s = nn.MaxPool2d( kernel_size=config.head_pooling_size, stride=1, padding=(config.head_pooling_size - 1) // 2 @@ -699,7 +700,9 @@ def forward( hidden_states = self.neck(features) - 
text_detection_output = self.det_head(hidden_states) + text_detection_output = self.text_detection_head(hidden_states) + + all_hidden_states = (features, hidden_states) loss = None if labels: @@ -708,6 +711,11 @@ def forward( text_detection_output = self._upsample(text_detection_output, pixel_values.size(), scale=4) if not return_dict: - return (loss, text_detection_output) if loss is not None else (text_detection_output,) + output = (loss, text_detection_output) if loss is not None else (text_detection_output,) + return output + (all_hidden_states,) if output_hidden_states else output - return FastForSceneTextRecognitionOutput(loss, text_detection_output) + return FastForSceneTextRecognitionOutput( + loss=loss, + last_hidden_state=text_detection_output, + hidden_states=all_hidden_states if output_hidden_states else None, + ) diff --git a/tests/models/fast/test_modeling_fast.py b/tests/models/fast/test_modeling_fast.py index 50a2effa2eb9..4fb17cf824a3 100644 --- a/tests/models/fast/test_modeling_fast.py +++ b/tests/models/fast/test_modeling_fast.py @@ -28,7 +28,6 @@ from transformers.testing_utils import ( require_torch, require_vision, - slow, torch_device, ) @@ -231,7 +230,7 @@ def create_and_check_model(self, config, input): model.to(torch_device) model.eval() result = model(pixel_values=input["pixel_values"]) - self.parent.assertEqual(result.hidden_states.shape, (self.batch_size, 5, 125, 125)) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, 5, 125, 125)) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() @@ -345,7 +344,7 @@ def test_model_is_small(self): @require_torch @require_vision class FastModelIntegrationTest(unittest.TestCase): - @slow + # @slow def test_inference_fast_tiny_ic17mlt_model(self): model = FastForSceneTextRecognition.from_pretrained("Raghavan/ic17mlt_Fast_T") @@ -367,7 +366,7 @@ def prepare_image(): assert final_out[0]["bboxes"][0] == [224, 120, 246, 120, 246, 134, 224, 134] assert round(float(final_out[0]["scores"][0]), 5) == 0.95541 - @slow + # @slow def test_inference_fast_base_800_total_text_ic17mlt_model(self): model = FastForSceneTextRecognition.from_pretrained("Raghavan/fast_base_tt_800_finetune_ic17mlt") From 2ee0440fd35d3fca8e8671293043fb4ffa0c3ddc Mon Sep 17 00:00:00 2001 From: raghavanone Date: Thu, 9 Nov 2023 19:08:15 +0530 Subject: [PATCH 052/152] More cleanup --- .../fast/convert_fast_original_to_pytorch.py | 18 +++- .../models/textnet/configuration_textnet.py | 89 ++++++++++++++++++- .../models/textnet/modeling_textnet.py | 18 ++-- tests/models/fast/test_modeling_fast.py | 5 +- 4 files changed, 109 insertions(+), 21 deletions(-) diff --git a/src/transformers/models/fast/convert_fast_original_to_pytorch.py b/src/transformers/models/fast/convert_fast_original_to_pytorch.py index c98243c16457..6c36af421153 100644 --- a/src/transformers/models/fast/convert_fast_original_to_pytorch.py +++ b/src/transformers/models/fast/convert_fast_original_to_pytorch.py @@ -164,7 +164,9 @@ def get_base_model_config(): pass -def convert_fast_checkpoint(checkpoint_url, checkpoint_config_url, pytorch_dump_folder_path, validate_logits): +def convert_fast_checkpoint( + checkpoint_url, checkpoint_config_url, pytorch_dump_folder_path, validate_logits, save_backbone_separately +): response = requests.get(checkpoint_config_url) content = response.text namespace = {} @@ -218,6 +220,8 @@ def convert_fast_checkpoint(checkpoint_url, checkpoint_config_url, pytorch_dump_ 
model.load_state_dict(state_dict_changed) model.save_pretrained(pytorch_dump_folder_path) + if save_backbone_separately: + model.backbone.save_pretrained(pytorch_dump_folder_path + "/textnet/") fast_image_processor.save_pretrained(pytorch_dump_folder_path) logging.info("The converted weights are save here : " + pytorch_dump_folder_path) @@ -246,8 +250,18 @@ def convert_fast_checkpoint(checkpoint_url, checkpoint_config_url, pytorch_dump_ type=bool, help="whether to assert logits outputs", ) + parser.add_argument( + "--save_backbone_separately", + default=False, + type=bool, + help="whether to assert logits outputs", + ) args = parser.parse_args() convert_fast_checkpoint( - args.checkpoint_url, args.checkpoint_config_url, args.pytorch_dump_folder_path, args.validate_logits + args.checkpoint_url, + args.checkpoint_config_url, + args.pytorch_dump_folder_path, + args.validate_logits, + args.save_backbone_separately, ) diff --git a/src/transformers/models/textnet/configuration_textnet.py b/src/transformers/models/textnet/configuration_textnet.py index e67d02a21bac..33e2b4c3b25c 100644 --- a/src/transformers/models/textnet/configuration_textnet.py +++ b/src/transformers/models/textnet/configuration_textnet.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright The HuggingFace Inc. team. All rights reserved. +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -21,13 +21,94 @@ logger = logging.get_logger(__name__) TEXTNET_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "fast_base_tt_800_finetune_ic17mlt": ( - "https://huggingface.co/Raghavan/fast_base_tt_800_finetune_ic17mlt/raw/main/config.json" - ), + "textnet-base": ("https://huggingface.co/Raghavan/textnet-base/blob/main/config.json"), } class TextNetConfig(BackboneConfigMixin, PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`FastForSceneTextRecognition`]. It is used to + instantiate a FastForSceneTextRecognition model according to the specified arguments, defining the model + architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the + FastForSceneTextRecognition. + [Raghavan/fast_base_tt_800_finetune_ic17mlt](https://huggingface.co/Raghavan/fast_base_tt_800_finetune_ic17mlt) + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + kernel_size (`int`, *optional*, defaults to 3): + The kernel size for the initial convolution layer. + stride (`int`, *optional*, defaults to 2): + The stride for the initial convolution layer. + in_channels (`int`, *optional*, defaults to 3): + The num of channels in input for the initial convolution layer. + out_channels (`int`, *optional*, defaults to 64): + The num of channels in out for the initial convolution layer. + act_func (`str`, *optional*, defaults to `"relu"`): + The activation function for the initial convolution layer. + stage1_in_channels (`List[int]`, *optional*, defaults to `[64, 64, 64]`): + The num of channels in input for list of conv in stage 1. 
+ stage1_out_channels (`List[int]`, *optional*, defaults to `[64, 64, 64]`): + The num of channels in output for list of conv in stage 1.Should be of same length os `stage1_in_channels` + stage1_kernel_size (`List[int]`, *optional*, defaults to `[[3, 3], [3, 3], [3, 3]]`): + The kernel sizes for list of conv in stage 1.Should be of same length os `stage1_in_channels` + stage1_stride (`List[int]`, *optional*, defaults to `[1, 2, 1]`): + The strides for list of conv in stage 1.Should be of same length os `stage1_in_channels` + stage2_in_channels (`List[int]`, *optional*, defaults to `[64, 128, 128, 128]`): + The num of channels in input for list of conv in stage 2. + stage2_out_channels (`List[int]`, *optional*, defaults to `[128, 128, 128, 128]`): + The num of channels in output for list of conv in stage 2.Should be of same length os `stage2_in_channels` + stage2_kernel_size (`List[List[int]]`, *optional*, defaults to `[[3, 3], [1, 3], [3, 3], [3, 1]]`): + The kernel sizes for list of conv in stage 2.Should be of same length os + `stage2_in_channels` + stage2_stride (`List[int]`, *optional*, defaults to `[2, 1, 1, 1]`): + The strides for list of conv in stage 2.Should be of same length os `stage2_in_channels` + stage3_in_channels (`List[int]`, *optional*, defaults to `[128, 256, 256, 256]`): + The num of channels in input for list of conv in stage 3. + stage3_out_channels (`List[int]`, *optional*, defaults to `[256, 256, 256, 256]`): + The num of channels in output for list of conv in stage 3.Should be of same length os `stage3_in_channels` + stage3_kernel_size (`List[List[int]]`, *optional*, defaults to `[[3, 3], [3, 3], [3, 1], [1, 3]]`): + The kernel sizes for list of conv in stage 3.Should be of same length os + `stage3_in_channels` + stage3_stride (`List[int]`, *optional*, defaults to `[2, 1, 1, 1]`): + The strides for list of conv in stage 3.Should be of same length os `stage3_in_channels` + stage4_in_channels (`List[int]`, *optional*, defaults to `[256, 512, 512, 512]`): + The strides for list of conv in stage 4.Should be of same length os `stage4_in_channels` + stage4_out_channels (`List[int]`, *optional*, defaults to `[512, 512, 512, 512]`): + The num of channels in output for list of conv in stage 4.Should be of same length os `stage4_in_channels` + stage4_kernel_size (`List[List[int]]`, *optional*, defaults to `[[3, 3], [3, 1], [1, 3], [3, 3]]`): + The kernel sizes for list of conv in stage 4.Should be of same length os + `stage4_in_channels` + stage4_stride (`List[int]`, *optional*, defaults to `[2, 1, 1, 1]`): + The strides for list of conv in stage 4.Should be of same length os `stage4_in_channels` + hidden_sizes (`List[int]`, *optional*, defaults to `[64, 64, 128, 256, 512]`): + Dimensionality (hidden size) at each stage. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + out_features (`List[str]`, *optional*): + If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc. + (depending on how many stages the model has). If unset and `out_indices` is set, will default to the + corresponding stages. If unset and `out_indices` is unset, will default to the last stage. + out_indices (`List[int]`, *optional*): + If used as backbone, list of indices of features to output. Can be any of 0, 1, 2, etc. (depending on how + many stages the model has). If unset and `out_features` is set, will default to the corresponding stages. 
+ If unset and `out_features` is unset, will default to the last stage. + + Examples: + + ```python + >>> from transformers import FastConfig, FastForSceneTextRecognition + + >>> # Initializing a Fast Config + >>> configuration = FastConfig() + + >>> # Initializing a model (with random weights) + >>> model = FastForSceneTextRecognition(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" r""" [Raghavan/fast_base_tt_800_finetune_ic17mlt](https://huggingface.co/Raghavan/fast_base_tt_800_finetune_ic17mlt) """ diff --git a/src/transformers/models/textnet/modeling_textnet.py b/src/transformers/models/textnet/modeling_textnet.py index daf76ba2667e..13091ad9d389 100644 --- a/src/transformers/models/textnet/modeling_textnet.py +++ b/src/transformers/models/textnet/modeling_textnet.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2021 Microsoft Research and The HuggingFace Inc. team. All rights reserved. +# Copyright 2023 Microsoft Research and The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -61,17 +61,14 @@ return_dict (`bool`, *optional*): """ -BIT_PRETRAINED_MODEL_ARCHIVE_LIST = [ - # "google/bit-50", - # See all BiT models at https://huggingface.co/models?filter=bit -] +BIT_PRETRAINED_MODEL_ARCHIVE_LIST = ["Raghavan/textnet-base"] def get_same_padding(kernel_size): if isinstance(kernel_size, tuple): - p1 = get_same_padding(kernel_size[0]) - p2 = get_same_padding(kernel_size[1]) - return p1, p2 + padding1 = get_same_padding(kernel_size[0]) + padding2 = get_same_padding(kernel_size[1]) + return padding1, padding2 return kernel_size // 2 @@ -91,11 +88,6 @@ def __init__( self.activation_function = act_func padding = get_same_padding(self.kernel_size) - # if isinstance(padding, int): - # padding *= self.dilation - # else: - # padding[0] *= self.dilation - # padding[1] *= self.dilation self.conv = nn.Conv2d( in_channels, diff --git a/tests/models/fast/test_modeling_fast.py b/tests/models/fast/test_modeling_fast.py index 4fb17cf824a3..07c3f9b24b20 100644 --- a/tests/models/fast/test_modeling_fast.py +++ b/tests/models/fast/test_modeling_fast.py @@ -28,6 +28,7 @@ from transformers.testing_utils import ( require_torch, require_vision, + slow, torch_device, ) @@ -344,7 +345,7 @@ def test_model_is_small(self): @require_torch @require_vision class FastModelIntegrationTest(unittest.TestCase): - # @slow + @slow def test_inference_fast_tiny_ic17mlt_model(self): model = FastForSceneTextRecognition.from_pretrained("Raghavan/ic17mlt_Fast_T") @@ -366,7 +367,7 @@ def prepare_image(): assert final_out[0]["bboxes"][0] == [224, 120, 246, 120, 246, 134, 224, 134] assert round(float(final_out[0]["scores"][0]), 5) == 0.95541 - # @slow + @slow def test_inference_fast_base_800_total_text_ic17mlt_model(self): model = FastForSceneTextRecognition.from_pretrained("Raghavan/fast_base_tt_800_finetune_ic17mlt") From 04d761d7c6d547e5a455ebdb0a14e41b8732c3b6 Mon Sep 17 00:00:00 2001 From: raghavanone Date: Thu, 9 Nov 2023 19:17:21 +0530 Subject: [PATCH 053/152] More cleanup --- .../models/textnet/modeling_textnet.py | 6 ++-- tests/models/textnet/test_modeling_textnet.py | 31 +++++++++++-------- 2 files changed, 21 insertions(+), 16 deletions(-) diff --git a/src/transformers/models/textnet/modeling_textnet.py b/src/transformers/models/textnet/modeling_textnet.py index 13091ad9d389..1943f2343c67 100644 --- 
a/src/transformers/models/textnet/modeling_textnet.py +++ b/src/transformers/models/textnet/modeling_textnet.py @@ -49,7 +49,7 @@ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. """ -BIT_INPUTS_DOCSTRING = r""" +TEXTNET_INPUTS_DOCSTRING = r""" Args: pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`BitImageProcessor.__call__`] @@ -61,7 +61,7 @@ return_dict (`bool`, *optional*): """ -BIT_PRETRAINED_MODEL_ARCHIVE_LIST = ["Raghavan/textnet-base"] +TEXTNET_PRETRAINED_MODEL_ARCHIVE_LIST = ["Raghavan/textnet-base"] def get_same_padding(kernel_size): @@ -429,7 +429,7 @@ def __init__(self, config): # initialize weights and apply final processing self.post_init() - @add_start_docstrings_to_model_forward("BIT_INPUTS_DOCSTRING") + @add_start_docstrings_to_model_forward(TEXTNET_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=BackboneOutput, config_class="") def forward( self, pixel_values: Tensor, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None diff --git a/tests/models/textnet/test_modeling_textnet.py b/tests/models/textnet/test_modeling_textnet.py index c19e5c8c2536..999ddc23c7c1 100644 --- a/tests/models/textnet/test_modeling_textnet.py +++ b/tests/models/textnet/test_modeling_textnet.py @@ -17,8 +17,10 @@ import unittest from transformers import TextNetConfig +from transformers.models.textnet.modeling_textnet import TEXTNET_PRETRAINED_MODEL_ARCHIVE_LIST from transformers.testing_utils import ( require_torch, + slow, torch_device, ) from transformers.utils import is_torch_available @@ -164,6 +166,14 @@ def create_and_check_model(self, config, pixel_values, labels): (self.batch_size, self.hidden_sizes[-1], 2, 2), ) + def create_and_check_for_image_classification(self, config, pixel_values, labels): + config.num_labels = self.num_labels + model = TextNetForImageClassification(config) + model.to(torch_device) + model.eval() + result = model(pixel_values, labels=labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + def prepare_config_and_inputs(self): pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) @@ -222,11 +232,6 @@ class TextNetModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase) if is_torch_available() else {} ) - # fx_compatible = False - # test_pruning = False - # test_resize_embeddings = False - # test_head_masking = False - # has_attentions = False fx_compatible = False test_pruning = False @@ -347,15 +352,15 @@ def test_model_is_small(self): def test_feed_forward_chunking(self): pass - # def test_for_image_classification(self): - # config_and_inputs = self.model_tester.prepare_config_and_inputs() - # self.model_tester.create_and_check_for_image_classification(*config_and_inputs) + def test_for_image_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_image_classification(*config_and_inputs) - # @slow - # def test_model_from_pretrained(self): - # for model_name in BIT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: - # model = BitModel.from_pretrained(model_name) - # self.assertIsNotNone(model) + @slow + def test_model_from_pretrained(self): + for model_name in TEXTNET_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = TextNetModel.from_pretrained(model_name) + self.assertIsNotNone(model) @require_torch From 
25724619e5e6a435022b862cc377ec7dcd23d9fd Mon Sep 17 00:00:00 2001 From: raghavanone Date: Thu, 9 Nov 2023 19:37:29 +0530 Subject: [PATCH 054/152] More cleanup --- .../models/textnet/configuration_textnet.py | 47 +++++++++---------- 1 file changed, 22 insertions(+), 25 deletions(-) diff --git a/src/transformers/models/textnet/configuration_textnet.py b/src/transformers/models/textnet/configuration_textnet.py index 33e2b4c3b25c..650c1bc4858f 100644 --- a/src/transformers/models/textnet/configuration_textnet.py +++ b/src/transformers/models/textnet/configuration_textnet.py @@ -37,52 +37,49 @@ class TextNetConfig(BackboneConfigMixin, PretrainedConfig): documentation from [`PretrainedConfig`] for more information. Args: - kernel_size (`int`, *optional*, defaults to 3): + kernel_size (`int`, *optional*, defaults to 3): The kernel size for the initial convolution layer. - stride (`int`, *optional*, defaults to 2): + stride (`int`, *optional*, defaults to 2): The stride for the initial convolution layer. - in_channels (`int`, *optional*, defaults to 3): + in_channels (`int`, *optional*, defaults to 3): The num of channels in input for the initial convolution layer. - out_channels (`int`, *optional*, defaults to 64): + out_channels (`int`, *optional*, defaults to 64): The num of channels in out for the initial convolution layer. - act_func (`str`, *optional*, defaults to `"relu"`): + act_func (`str`, *optional*, defaults to `"relu"`): The activation function for the initial convolution layer. - stage1_in_channels (`List[int]`, *optional*, defaults to `[64, 64, 64]`): + stage1_in_channels (`List[int]`, *optional*, defaults to `[64, 64, 64]`): The num of channels in input for list of conv in stage 1. - stage1_out_channels (`List[int]`, *optional*, defaults to `[64, 64, 64]`): + stage1_out_channels (`List[int]`, *optional*, defaults to `[64, 64, 64]`): The num of channels in output for list of conv in stage 1.Should be of same length os `stage1_in_channels` - stage1_kernel_size (`List[int]`, *optional*, defaults to `[[3, 3], [3, 3], [3, 3]]`): + stage1_kernel_size (`List[int]`, *optional*, defaults to `[[3, 3], [3, 3], [3, 3]]`): The kernel sizes for list of conv in stage 1.Should be of same length os `stage1_in_channels` - stage1_stride (`List[int]`, *optional*, defaults to `[1, 2, 1]`): + stage1_stride (`List[int]`, *optional*, defaults to `[1, 2, 1]`): The strides for list of conv in stage 1.Should be of same length os `stage1_in_channels` - stage2_in_channels (`List[int]`, *optional*, defaults to `[64, 128, 128, 128]`): + stage2_in_channels (`List[int]`, *optional*, defaults to `[64, 128, 128, 128]`): The num of channels in input for list of conv in stage 2. 
- stage2_out_channels (`List[int]`, *optional*, defaults to `[128, 128, 128, 128]`): + stage2_out_channels (`List[int]`, *optional*, defaults to `[128, 128, 128, 128]`): The num of channels in output for list of conv in stage 2.Should be of same length os `stage2_in_channels` stage2_kernel_size (`List[List[int]]`, *optional*, defaults to `[[3, 3], [1, 3], [3, 3], [3, 1]]`): - The kernel sizes for list of conv in stage 2.Should be of same length os - `stage2_in_channels` - stage2_stride (`List[int]`, *optional*, defaults to `[2, 1, 1, 1]`): + The kernel sizes for list of conv in stage 2.Should be of same length os `stage2_in_channels` + stage2_stride (`List[int]`, *optional*, defaults to `[2, 1, 1, 1]`): The strides for list of conv in stage 2.Should be of same length os `stage2_in_channels` - stage3_in_channels (`List[int]`, *optional*, defaults to `[128, 256, 256, 256]`): + stage3_in_channels (`List[int]`, *optional*, defaults to `[128, 256, 256, 256]`): The num of channels in input for list of conv in stage 3. - stage3_out_channels (`List[int]`, *optional*, defaults to `[256, 256, 256, 256]`): + stage3_out_channels (`List[int]`, *optional*, defaults to `[256, 256, 256, 256]`): The num of channels in output for list of conv in stage 3.Should be of same length os `stage3_in_channels` stage3_kernel_size (`List[List[int]]`, *optional*, defaults to `[[3, 3], [3, 3], [3, 1], [1, 3]]`): - The kernel sizes for list of conv in stage 3.Should be of same length os - `stage3_in_channels` - stage3_stride (`List[int]`, *optional*, defaults to `[2, 1, 1, 1]`): + The kernel sizes for list of conv in stage 3.Should be of same length os `stage3_in_channels` + stage3_stride (`List[int]`, *optional*, defaults to `[2, 1, 1, 1]`): The strides for list of conv in stage 3.Should be of same length os `stage3_in_channels` - stage4_in_channels (`List[int]`, *optional*, defaults to `[256, 512, 512, 512]`): + stage4_in_channels (`List[int]`, *optional*, defaults to `[256, 512, 512, 512]`): The strides for list of conv in stage 4.Should be of same length os `stage4_in_channels` - stage4_out_channels (`List[int]`, *optional*, defaults to `[512, 512, 512, 512]`): + stage4_out_channels (`List[int]`, *optional*, defaults to `[512, 512, 512, 512]`): The num of channels in output for list of conv in stage 4.Should be of same length os `stage4_in_channels` stage4_kernel_size (`List[List[int]]`, *optional*, defaults to `[[3, 3], [3, 1], [1, 3], [3, 3]]`): - The kernel sizes for list of conv in stage 4.Should be of same length os - `stage4_in_channels` - stage4_stride (`List[int]`, *optional*, defaults to `[2, 1, 1, 1]`): + The kernel sizes for list of conv in stage 4.Should be of same length os `stage4_in_channels` + stage4_stride (`List[int]`, *optional*, defaults to `[2, 1, 1, 1]`): The strides for list of conv in stage 4.Should be of same length os `stage4_in_channels` - hidden_sizes (`List[int]`, *optional*, defaults to `[64, 64, 128, 256, 512]`): + hidden_sizes (`List[int]`, *optional*, defaults to `[64, 64, 128, 256, 512]`): Dimensionality (hidden size) at each stage. initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 
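
A minimal usage sketch, not part of the patch series: it assumes the `TextNetConfig`, `FastConfig` and `FastForSceneTextRecognition` classes introduced by the commits above are importable from a local `transformers` build of this branch (note that PATCH 056 below removes the `fast` module again, so the sketch only applies to the tree as of the earlier commits). The `out_features`/`out_indices` values mirror the conversion script in this series; every other argument falls back to the documented defaults.

```python
from transformers import FastConfig, FastForSceneTextRecognition, TextNetConfig

# Backbone config using the documented defaults; expose all four stages,
# as the conversion script in this patch series does.
textnet_config = TextNetConfig(
    out_features=["stage1", "stage2", "stage3", "stage4"],
    out_indices=[1, 2, 3, 4],
)

# Wrap the backbone config in a FastConfig (backbone_config is only accepted
# together with use_timm_backbone=False) and build a randomly initialised model.
config = FastConfig(use_timm_backbone=False, backbone_config=textnet_config)
model = FastForSceneTextRecognition(config)
```
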
From c576f4a51c0a49b862d20e313036950bcd8d2e8f Mon Sep 17 00:00:00 2001 From: raghavanone Date: Thu, 9 Nov 2023 20:44:27 +0530 Subject: [PATCH 055/152] Fix build --- tests/models/textnet/test_modeling_textnet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/textnet/test_modeling_textnet.py b/tests/models/textnet/test_modeling_textnet.py index 999ddc23c7c1..957661e61144 100644 --- a/tests/models/textnet/test_modeling_textnet.py +++ b/tests/models/textnet/test_modeling_textnet.py @@ -17,7 +17,6 @@ import unittest from transformers import TextNetConfig -from transformers.models.textnet.modeling_textnet import TEXTNET_PRETRAINED_MODEL_ARCHIVE_LIST from transformers.testing_utils import ( require_torch, slow, @@ -41,6 +40,7 @@ TextNetModel, is_torch_available, ) + from transformers.models.textnet.modeling_textnet import TEXTNET_PRETRAINED_MODEL_ARCHIVE_LIST class TextNetModelTester: From 5d58c6767d8313fed9fd4fcf648a53bf7fb21b58 Mon Sep 17 00:00:00 2001 From: raghavanone Date: Fri, 10 Nov 2023 14:06:01 +0530 Subject: [PATCH 056/152] Remove all the references of fast model --- src/transformers/__init__.py | 1 - .../models/auto/image_processing_auto.py | 1 - src/transformers/models/auto/modeling_auto.py | 3 + src/transformers/models/fast/__init__.py | 53 -- .../models/fast/configuration_fast.py | 192 ----- .../fast/convert_fast_original_to_pytorch.py | 267 ------- .../models/fast/image_processing_fast.py | 467 ------------ src/transformers/models/fast/modeling_fast.py | 721 ------------------ .../utils/dummy_vision_objects.py | 7 - tests/models/fast/__init__.py | 0 .../models/fast/test_image_processing_fast.py | 162 ---- tests/models/fast/test_modeling_fast.py | 390 ---------- utils/check_repo.py | 1 - 13 files changed, 3 insertions(+), 2262 deletions(-) delete mode 100644 src/transformers/models/fast/__init__.py delete mode 100644 src/transformers/models/fast/configuration_fast.py delete mode 100644 src/transformers/models/fast/convert_fast_original_to_pytorch.py delete mode 100644 src/transformers/models/fast/image_processing_fast.py delete mode 100644 src/transformers/models/fast/modeling_fast.py delete mode 100644 tests/models/fast/__init__.py delete mode 100644 tests/models/fast/test_image_processing_fast.py delete mode 100644 tests/models/fast/test_modeling_fast.py diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 5c0d2bed5b5f..7cfffec8463b 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -1279,7 +1279,6 @@ _import_structure["models.dpt"].extend(["DPTFeatureExtractor", "DPTImageProcessor"]) _import_structure["models.efficientformer"].append("EfficientFormerImageProcessor") _import_structure["models.efficientnet"].append("EfficientNetImageProcessor") - _import_structure["models.fast"].extend(["FastImageProcessor"]) _import_structure["models.flava"].extend(["FlavaFeatureExtractor", "FlavaImageProcessor", "FlavaProcessor"]) _import_structure["models.fuyu"].extend(["FuyuImageProcessor", "FuyuProcessor"]) _import_structure["models.glpn"].extend(["GLPNFeatureExtractor", "GLPNImageProcessor"]) diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index 2fac0833c940..6244276b1d0b 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -62,7 +62,6 @@ ("dpt", "DPTImageProcessor"), ("efficientformer", "EfficientFormerImageProcessor"), ("efficientnet", 
"EfficientNetImageProcessor"), - ("fast", "FastImageProcessor"), ("flava", "FlavaImageProcessor"), ("focalnet", "BitImageProcessor"), ("fuyu", "FuyuImageProcessor"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index c1ecdee1578e..d396ccb21c4a 100755 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -95,7 +95,10 @@ ("ernie_m", "ErnieMModel"), ("esm", "EsmModel"), ("falcon", "FalconModel"), +<<<<<<< HEAD ("fastspeech2_conformer", "FastSpeech2ConformerModel"), +======= +>>>>>>> ae576e088 (Remove all the references of fast model) ("flaubert", "FlaubertModel"), ("flava", "FlavaModel"), ("fnet", "FNetModel"), diff --git a/src/transformers/models/fast/__init__.py b/src/transformers/models/fast/__init__.py deleted file mode 100644 index dedc491f6c59..000000000000 --- a/src/transformers/models/fast/__init__.py +++ /dev/null @@ -1,53 +0,0 @@ -# coding=utf-8 -# Copyright 2023 the Fast authors and HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from typing import TYPE_CHECKING - -from ...utils import ( - OptionalDependencyNotAvailable, - _LazyModule, - is_torch_available, -) - - -_import_structure = { - "configuration_fast": ["FAST_PRETRAINED_CONFIG_ARCHIVE_MAP", "FastConfig"], - "image_processing_fast": ["FastImageProcessor"], -} - -try: - if not is_torch_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["modeling_fast"] = ["FastForSceneTextRecognition", "FastPreTrainedModel"] - -if TYPE_CHECKING: - from .configuration_fast import FAST_PRETRAINED_CONFIG_ARCHIVE_MAP, FastConfig - from .image_processing_fast import FastImageProcessor - - try: - if not is_torch_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .modeling_fast import FastForSceneTextRecognition, FastPreTrainedModel - - -else: - import sys - - sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/src/transformers/models/fast/configuration_fast.py b/src/transformers/models/fast/configuration_fast.py deleted file mode 100644 index 6a5f3a425fa6..000000000000 --- a/src/transformers/models/fast/configuration_fast.py +++ /dev/null @@ -1,192 +0,0 @@ -# coding=utf-8 -# Copyright The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-""" Fast model configuration""" -from transformers import CONFIG_MAPPING, PretrainedConfig -from transformers.utils import logging - - -logger = logging.get_logger(__name__) - -FAST_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "fast_base_tt_800_finetune_ic17mlt": ( - "https://huggingface.co/Raghavan/fast_base_tt_800_finetune_ic17mlt/raw/main/config.json" - ), -} - - -class FastConfig(PretrainedConfig): - r""" - This is the configuration class to store the configuration of a [`FastForSceneTextRecognition`]. It is used to - instantiate a FastForSceneTextRecognition model according to the specified arguments, defining the model - architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the - FastForSceneTextRecognition. - [Raghavan/fast_base_tt_800_finetune_ic17mlt](https://huggingface.co/Raghavan/fast_base_tt_800_finetune_ic17mlt) - - Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PretrainedConfig`] for more information. - - Args: - use_timm_backbone (`bool`, *optional*, defaults to `True`): - Whether or not to use the `timm` library for the backbone. If set to `False`, will use the [`AutoBackbone`] - API. - backbone_config (`PretrainedConfig` or `dict`, *optional*): - The configuration of the backbone model. Only used in case `use_timm_backbone` is set to `False` in which - case it will default to `ResNetConfig()`. - num_channels (`int`, *optional*, defaults to 3): - The number of input channels. - neck_in_channels (`List[int]`, *optional*, defaults to `[64, 128, 256, 512]`): - Denotes the in channels of FASTRepConvLayer in neck module. - neck_out_channels (`List[int]`, *optional*, defaults to `[128, 128, 128, 128]`): - Denotes the out channels of FASTRepConvLayer in neck module. Should be of same length of `neck_in_channels` - neck_kernel_size (`List[int]`, *optional*, defaults to `[[3, 3], [3, 3], [3, 3], [3, 3]]`): - Denotes the kernel_size of FASTRepConvLayer in neck module. Should be of same length of `neck_in_channels` - neck_stride (`List[int]`, *optional*, defaults to `[1, 1, 1, 1]`): - Denotes the neck_stride of FASTRepConvLayer in neck module. Should be of same length of `neck_in_channels` - head_pooling_size (`int`, *optional*, defaults to 9): - Denotes the pooling size of head layer - head_dropout_ratio (`int`, *optional*, defaults to 0): - Denotes the dropout ratio used in dropout layer of head layer.. - head_conv_in_channels (`int`, *optional*, defaults to 512): - Denotes the in channels of first conv layer in head layer. - head_conv_out_channels (`int`, *optional*, defaults to 128): - Denotes the out channels of first conv layer in head layer. - head_conv_kernel_size (`List[int]`, *optional*, defaults to `[3, 3]`): - Denotes the conv kernel size of first conv layer in head layer. - head_conv_stride (`int`, *optional*, defaults to 1): - Denotes the conv stride of first conv layer in head layer. - head_final_kernel_size (`int`, *optional*, defaults to 1): - Denotes the conv kernel size of final conv layer in head layer. - head_final_stride (`int`, *optional*, defaults to 1): - Denotes the conv stride of final conv layer in head layer. - head_final_bias (`bool`, *optional*, defaults to `False`): - Denotes the conv bais of final conv layer in head layer. - head_final_in_channels (`int`, *optional*, defaults to 128): - Denotes the in channels of final conv layer in head layer. 
- head_final_out_channels (`int`, *optional*, defaults to 5): - Denotes the out channels of final conv layer in head layer. - backbone (`str`, *optional*, defaults to `"resnet50"`): - Name of convolutional backbone to use in case `use_timm_backbone` = `True`. Supports any convolutional - backbone from the timm package. For a list of all available models, see [this - page](https://rwightman.github.io/pytorch-image-models/#load-a-pretrained-model). - use_pretrained_backbone (`bool`, *optional*, defaults to `True`): - Whether to use pretrained weights for the backbone. Only supported when `use_timm_backbone` = `True`. - dilation (`bool`, *optional*, defaults to `False`): - Whether to replace stride with dilation in the last convolutional block (DC5). Only supported when - `use_timm_backbone` = `True`. - initializer_range (`float`, *optional*, defaults to 0.02): - Relative weight of the generalized IoU loss of the bounding box in the Hungarian matching cost. - - Examples: - - ```python - >>> from transformers import FastConfig, FastForSceneTextRecognition - - >>> # Initializing a Fast Config - >>> configuration = FastConfig() - - >>> # Initializing a model (with random weights) - >>> model = FastForSceneTextRecognition(configuration) - - >>> # Accessing the model configuration - >>> configuration = model.config - ```""" - r""" - [Raghavan/fast_base_tt_800_finetune_ic17mlt](https://huggingface.co/Raghavan/fast_base_tt_800_finetune_ic17mlt) - """ - - def __init__( - self, - use_timm_backbone=True, - backbone_config=None, - num_channels=3, - neck_in_channels=[64, 128, 256, 512], - neck_out_channels=[128, 128, 128, 128], - neck_kernel_size=[[3, 3], [3, 3], [3, 3], [3, 3]], - neck_stride=[1, 1, 1, 1], - head_pooling_size=9, - head_dropout_ratio=0, - head_conv_in_channels=512, - head_conv_out_channels=128, - head_conv_kernel_size=[3, 3], - head_conv_stride=1, - head_final_kernel_size=1, - head_final_stride=1, - head_final_bias=False, - head_final_in_channels=128, - head_final_out_channels=5, - backbone="resnet50", - use_pretrained_backbone=True, - dilation=False, - initializer_range=0.02, - **kwargs, - ): - super().__init__(**kwargs) - - if backbone_config is not None and use_timm_backbone: - raise ValueError("You can't specify both `backbone_config` and `use_timm_backbone`.") - - if not use_timm_backbone: - if backbone_config is None: - logger.info( - "`backbone_config` is `None`. Initializing the config with the default `TextNet` backbone." 
- ) - backbone_config = CONFIG_MAPPING["textnet"](out_features=["stage1", "stage2", "stage3", "stage4"]) - elif isinstance(backbone_config, dict): - backbone_model_type = backbone_config.get("model_type") - config_class = CONFIG_MAPPING[backbone_model_type] - backbone_config = config_class.from_dict(backbone_config) - # set timm attributes to None - dilation, backbone, use_pretrained_backbone = None, None, None - - self.use_timm_backbone = use_timm_backbone - self.backbone_config = backbone_config - self.num_channels = num_channels - - self.neck_in_channels = neck_in_channels - self.neck_out_channels = neck_out_channels - self.neck_kernel_size = neck_kernel_size - self.neck_stride = neck_stride - - self.head_pooling_size = head_pooling_size - self.head_dropout_ratio = head_dropout_ratio - - self.head_conv_in_channels = head_conv_in_channels - self.head_conv_out_channels = head_conv_out_channels - self.head_conv_kernel_size = head_conv_kernel_size - self.head_conv_stride = head_conv_stride - - self.head_final_kernel_size = head_final_kernel_size - self.head_final_stride = head_final_stride - self.head_final_bias = head_final_bias - self.head_final_in_channels = head_final_in_channels - self.head_final_out_channels = head_final_out_channels - - self.backbone = backbone - self.use_pretrained_backbone = use_pretrained_backbone - self.dilation = dilation - - self.initializer_range = initializer_range - - @classmethod - def from_backbone_config(cls, backbone_config: PretrainedConfig, **kwargs): - """Instantiate a [`FastConfig`] (or a derived class) from a pre-trained backbone model configuration. - - Args: - backbone_config ([`PretrainedConfig`]): - The backbone configuration. - Returns: - [`FastConfig`]: An instance of a configuration object - """ - return cls(backbone_config=backbone_config, **kwargs) diff --git a/src/transformers/models/fast/convert_fast_original_to_pytorch.py b/src/transformers/models/fast/convert_fast_original_to_pytorch.py deleted file mode 100644 index 6c36af421153..000000000000 --- a/src/transformers/models/fast/convert_fast_original_to_pytorch.py +++ /dev/null @@ -1,267 +0,0 @@ -# coding=utf-8 -# Copyright 2023 Microsoft Research and The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import argparse -import copy -import json -import logging - -import requests -import torch -from PIL import Image - -from transformers import FastConfig, FastForSceneTextRecognition, TextNetConfig -from transformers.models.fast.image_processing_fast import FastImageProcessor - - -tiny_config_url = "https://raw.githubusercontent.com/czczup/FAST/main/config/fast/nas-configs/fast_tiny.config" -small_config_url = "https://raw.githubusercontent.com/czczup/FAST/main/config/fast/nas-configs/fast_small.config" -base_config_url = "https://raw.githubusercontent.com/czczup/FAST/main/config/fast/nas-configs/fast_base.config" - -rename_key_mappings = {"bn": "batch_norm", "hor": "horizontal", "ver": "vertical", "det_head": "text_detection_head"} - - -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -def prepare_config(size_config_url, pooling_size, min_area, bbox_type, loss_bg): - config_dict = json.loads(requests.get(size_config_url).text) - - backbone_config = {} - for stage_ix in range(1, 5): - stage_config = config_dict[f"stage{stage_ix}"] - - merged_dict = {} - - # Iterate through the list of dictionaries - for layer in stage_config: - for key, value in layer.items(): - if key != "name": - # Check if the key is already in the merged_dict - if key in merged_dict: - merged_dict[key].append(value) - else: - # If the key is not in merged_dict, create a new list with the value - merged_dict[key] = [value] - backbone_config[f"stage{stage_ix}"] = merged_dict - - neck_in_channels = [] - neck_out_channels = [] - neck_kernel_size = [] - neck_stride = [] - neck_dilation = [] - neck_groups = [] - - for i in range(1, 5): - layer_key = f"reduce_layer{i}" - layer_dict = config_dict["neck"].get(layer_key) - - if layer_dict: - # Append values to the corresponding lists - neck_in_channels.append(layer_dict["in_channels"]) - neck_out_channels.append(layer_dict["out_channels"]) - neck_kernel_size.append(layer_dict["kernel_size"]) - neck_stride.append(layer_dict["stride"]) - neck_dilation.append(layer_dict["dilation"]) - neck_groups.append(layer_dict["groups"]) - - textnet_config = TextNetConfig( - kernel_size=config_dict["first_conv"]["kernel_size"], - stride=config_dict["first_conv"]["stride"], - dilation=config_dict["first_conv"]["dilation"], - groups=config_dict["first_conv"]["groups"], - bias=config_dict["first_conv"]["bias"], - has_shuffle=config_dict["first_conv"]["has_shuffle"], - in_channels=config_dict["first_conv"]["in_channels"], - out_channels=config_dict["first_conv"]["out_channels"], - use_bn=config_dict["first_conv"]["use_bn"], - act_func=config_dict["first_conv"]["act_func"], - dropout_rate=config_dict["first_conv"]["dropout_rate"], - ops_order=config_dict["first_conv"]["ops_order"], - stage1_in_channels=backbone_config["stage1"]["in_channels"], - stage1_out_channels=backbone_config["stage1"]["out_channels"], - stage1_kernel_size=backbone_config["stage1"]["kernel_size"], - stage1_stride=backbone_config["stage1"]["stride"], - stage1_dilation=backbone_config["stage1"]["dilation"], - stage1_groups=backbone_config["stage1"]["groups"], - stage2_in_channels=backbone_config["stage2"]["in_channels"], - stage2_out_channels=backbone_config["stage2"]["out_channels"], - stage2_kernel_size=backbone_config["stage2"]["kernel_size"], - stage2_stride=backbone_config["stage2"]["stride"], - stage2_dilation=backbone_config["stage2"]["dilation"], - stage2_groups=backbone_config["stage2"]["groups"], - 
stage3_in_channels=backbone_config["stage3"]["in_channels"], - stage3_out_channels=backbone_config["stage3"]["out_channels"], - stage3_kernel_size=backbone_config["stage3"]["kernel_size"], - stage3_stride=backbone_config["stage3"]["stride"], - stage3_dilation=backbone_config["stage3"]["dilation"], - stage3_groups=backbone_config["stage3"]["groups"], - stage4_in_channels=backbone_config["stage4"]["in_channels"], - stage4_out_channels=backbone_config["stage4"]["out_channels"], - stage4_kernel_size=backbone_config["stage4"]["kernel_size"], - stage4_stride=backbone_config["stage4"]["stride"], - stage4_dilation=backbone_config["stage4"]["dilation"], - stage4_groups=backbone_config["stage4"]["groups"], - out_features=["stage1", "stage2", "stage3", "stage4"], - out_indices=[1, 2, 3, 4], - ) - - return FastConfig( - use_timm_backbone=False, - backbone_config=textnet_config, - neck_in_channels=neck_in_channels, - neck_out_channels=neck_out_channels, - neck_kernel_size=neck_kernel_size, - neck_stride=neck_stride, - neck_dilation=neck_dilation, - neck_groups=neck_groups, - head_pooling_size=pooling_size, - head_dropout_ratio=0.1, - head_conv_in_channels=config_dict["head"]["conv"]["in_channels"], - head_conv_out_channels=config_dict["head"]["conv"]["out_channels"], - head_conv_kernel_size=config_dict["head"]["conv"]["kernel_size"], - head_conv_stride=config_dict["head"]["conv"]["stride"], - head_conv_dilation=config_dict["head"]["conv"]["dilation"], - head_conv_groups=config_dict["head"]["conv"]["groups"], - head_final_kernel_size=config_dict["head"]["final"]["kernel_size"], - head_final_stride=config_dict["head"]["final"]["stride"], - head_final_dilation=config_dict["head"]["final"]["dilation"], - head_final_groups=config_dict["head"]["final"]["groups"], - head_final_bias=config_dict["head"]["final"]["bias"], - head_final_has_shuffle=config_dict["head"]["final"]["has_shuffle"], - head_final_in_channels=config_dict["head"]["final"]["in_channels"], - head_final_out_channels=config_dict["head"]["final"]["out_channels"], - head_final_use_bn=config_dict["head"]["final"]["use_bn"], - head_final_act_func=config_dict["head"]["final"]["act_func"], - head_final_dropout_rate=config_dict["head"]["final"]["dropout_rate"], - head_final_ops_order=config_dict["head"]["final"]["ops_order"], - min_area=min_area, - bbox_type=bbox_type, - loss_bg=loss_bg, - ) - - -def get_small_model_config(): - pass - - -def get_base_model_config(): - pass - - -def convert_fast_checkpoint( - checkpoint_url, checkpoint_config_url, pytorch_dump_folder_path, validate_logits, save_backbone_separately -): - response = requests.get(checkpoint_config_url) - content = response.text - namespace = {} - - exec(content, namespace) - - model_config = namespace.get("model") - test_config = namespace.get("test_cfg", None) - data_config = namespace.get("data") - - min_area = 250 - bbox_type = "rect" - loss_bg = False - if test_config is not None: - min_area = test_config.get("min_area", min_area) - bbox_type = test_config.get("bbox_type", bbox_type) - loss_bg = test_config.get("loss_emb", None) == "EmbLoss_v2" - - if "tiny" in model_config["backbone"]["config"]: - config = prepare_config( - tiny_config_url, model_config["detection_head"]["pooling_size"], min_area, bbox_type, loss_bg - ) - elif "small" in model_config["backbone"]["config"]: - config = prepare_config( - small_config_url, model_config["detection_head"]["pooling_size"], min_area, bbox_type, loss_bg - ) - else: - config = prepare_config( - base_config_url, 
model_config["detection_head"]["pooling_size"], min_area, bbox_type, loss_bg - ) - size = 640 - if "train" in data_config: - if "short_size" in data_config["train"]: - size = data_config["train"]["short_size"] - model = FastForSceneTextRecognition(config) - fast_image_processor = FastImageProcessor( - size={"height": size, "width": size}, - min_area=config.min_area, - bbox_type=config.bbox_type, - pooling_size=config.head_pooling_size, - ) - state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu", check_hash=True)["ema"] - state_dict_changed = copy.deepcopy(state_dict) - for key in state_dict: - val = state_dict_changed.pop(key) - new_key = key.replace("module.", "").replace("backbone.", "backbone.textnet.") - for search, replacement in rename_key_mappings.items(): - if search in new_key: - new_key = new_key.replace(search, replacement) - state_dict_changed[new_key] = val - model.load_state_dict(state_dict_changed) - - model.save_pretrained(pytorch_dump_folder_path) - if save_backbone_separately: - model.backbone.save_pretrained(pytorch_dump_folder_path + "/textnet/") - fast_image_processor.save_pretrained(pytorch_dump_folder_path) - logging.info("The converted weights are save here : " + pytorch_dump_folder_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - parser.add_argument( - "--checkpoint_url", - default="https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_base_patch16_224_pt22k_ft22kto1k.pth", - type=str, - help="URL to the original PyTorch checkpoint (.pth file).", - ) - parser.add_argument( - "--checkpoint_config_url", - default="https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_base_patch16_224_pt22k_ft22kto1k.pth", - type=str, - help="URL to the original PyTorch checkpoint (.pth file).", - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the folder to output PyTorch model." - ) - parser.add_argument( - "--validate_logits", - default=False, - type=bool, - help="whether to assert logits outputs", - ) - parser.add_argument( - "--save_backbone_separately", - default=False, - type=bool, - help="whether to assert logits outputs", - ) - args = parser.parse_args() - - convert_fast_checkpoint( - args.checkpoint_url, - args.checkpoint_config_url, - args.pytorch_dump_folder_path, - args.validate_logits, - args.save_backbone_separately, - ) diff --git a/src/transformers/models/fast/image_processing_fast.py b/src/transformers/models/fast/image_processing_fast.py deleted file mode 100644 index 5e70a83ac58a..000000000000 --- a/src/transformers/models/fast/image_processing_fast.py +++ /dev/null @@ -1,467 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Image processor class for FAST.""" -import math -from typing import Any, Dict, List, Optional, Union - -from ...utils.import_utils import is_cv2_available - - -if is_cv2_available(): - import cv2 -import numpy as np - -from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict -from ...image_transforms import resize, to_channel_dimension_format -from ...image_utils import ( - ChannelDimension, - ImageInput, - PILImageResampling, - infer_channel_dimension_format, - is_scaled_image, - make_list_of_images, - to_numpy_array, - valid_images, -) -from ...utils import ( - IMAGENET_DEFAULT_MEAN, - IMAGENET_DEFAULT_STD, - TensorType, - is_torch_available, - is_vision_available, - logging, -) - - -if is_vision_available(): - import PIL - -if is_torch_available(): - import torch - import torch.nn as nn - import torch.nn.functional as F - -logger = logging.get_logger(__name__) - - -class FastImageProcessor(BaseImageProcessor): - r""" - Constructs a FAST image processor. - - Args: - do_resize (`bool`, *optional*, defaults to `True`): - Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the - `do_resize` parameter in the `preprocess` method. - size (`Dict[str, int]` *optional*, defaults to `{"height": 256, "width": 256}`): - Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess` - method. - resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`): - Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the - `preprocess` method. - do_center_crop (`bool`, *optional*, defaults to `False`): - Whether to center crop the image. If the input size is smaller than `crop_size` along any edge, the image - is padded with 0's and then center cropped. Can be overridden by the `do_center_crop` parameter in the - `preprocess` method. - crop_size (`Dict[str, int]`, *optional*, defaults to `{"height": 224, "width": 224}`): - Desired output size when applying center-cropping. Only has an effect if `do_center_crop` is set to `True`. - Can be overridden by the `crop_size` parameter in the `preprocess` method. - rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): - Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the - `preprocess` method. - do_rescale (`bool`, *optional*, defaults to `True`): - Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale` - parameter in the `preprocess` method. - do_normalize (`bool`, *optional*, defaults to `True`): - Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess` - method. - image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`): - The mean to use if normalizing the image. This is a float or list of floats of length of the number of - channels of the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. - image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`): - The standard deviation to use if normalizing the image. This is a float or list of floats of length of the - number of channels of the image. Can be overridden by the `image_std` parameter in the `preprocess` method. 
- min_area (`int`, *optional*, defaults to 200): - Threshold for min area for results - pooling_size (`int`, *optional*, defaults to 9): - Pooling size for text detection - """ - - model_input_names = ["pixel_values"] - - def __init__( - self, - do_resize: bool = True, - size: Dict[str, int] = None, - resample: PILImageResampling = PILImageResampling.BICUBIC, - do_center_crop: bool = False, - crop_size: Dict[str, int] = None, - rescale_factor: Union[int, float] = 1 / 255, - do_rescale: bool = True, - do_normalize: bool = True, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - min_area: int = 200, - pooling_size: int = 9, - **kwargs, - ) -> None: - super().__init__(**kwargs) - size = size if size is not None else {"height": 640, "width": 640} - size = get_size_dict(size) - crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224} - crop_size = get_size_dict(crop_size, param_name="crop_size") - self.do_resize = do_resize - self.size = size - self.resample = resample - self.do_center_crop = do_center_crop - self.crop_size = crop_size - self.do_rescale = do_rescale - self.rescale_factor = rescale_factor - self.do_normalize = do_normalize - self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN - self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD - self.min_area = min_area - # self.threshold = threshold - self.pooling_size = pooling_size - - @classmethod - def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs): - """ - Overrides the `from_dict` method from the base class to make sure `reduce_labels` is updated if image processor - is created using from_dict and kwargs e.g. `FastImageProcessor.from_pretrained(checkpoint, reduce_labels=True)` - """ - image_processor_dict = image_processor_dict.copy() - if "reduce_labels" in kwargs: - image_processor_dict["reduce_labels"] = kwargs.pop("reduce_labels") - return super().from_dict(image_processor_dict, **kwargs) - - def resize( - self, - image: np.ndarray, - size: Dict[str, int], - resample: PILImageResampling = PILImageResampling.BICUBIC, - data_format: Optional[Union[str, ChannelDimension]] = None, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs, - ) -> np.ndarray: - """ - Resize an image to (size["height"], size["width"]). - - Args: - image (`np.ndarray`): - Image to resize. - size (`Dict[str, int]`): - Size of the output image. - resample (`PILImageResampling`, *optional*, defaults to `PIL.Image.BICUBIC`): - Resampling filter to use when resiizing the image. - data_format (`str` or `ChannelDimension`, *optional*): - The channel dimension format of the image. If not provided, it will be the same as the input image. - input_data_format (`str` or `ChannelDimension`, *optional*): - The channel dimension format of the input image. If not provided, it will be inferred. - """ - size = get_size_dict(size, default_to_square=True, param_name="size") - if "height" not in size or "width" not in size: - raise ValueError(f"The `size` argument must contain `height` and `width` keys. 
Got {size.keys()}") - return resize( - image, - size=(size["height"], size["width"]), - resample=resample, - data_format=data_format, - input_data_format=input_data_format, - **kwargs, - ) - - def _preprocess( - self, - image: ImageInput, - do_resize: bool = None, - size: Dict[str, int] = None, - resample: PILImageResampling = None, - do_center_crop: bool = None, - crop_size: Dict[str, int] = None, - do_rescale: bool = None, - rescale_factor: float = None, - do_normalize: bool = None, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - ): - if do_resize: - image = self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format) - - if do_center_crop: - image = self.center_crop(image=image, size=crop_size, input_data_format=input_data_format) - - if do_rescale: - image = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) - - if do_normalize: - image = self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format) - - return image - - def _preprocess_image( - self, - image: ImageInput, - do_resize: bool = None, - size: Dict[str, int] = None, - resample: PILImageResampling = None, - do_center_crop: bool = None, - crop_size: Dict[str, int] = None, - do_rescale: bool = None, - rescale_factor: float = None, - do_normalize: bool = None, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - data_format: Optional[Union[str, ChannelDimension]] = None, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - ) -> np.ndarray: - """Preprocesses a single image.""" - # All transformations expect numpy arrays. - image = to_numpy_array(image) - if is_scaled_image(image) and do_rescale: - logger.warning_once( - "It looks like you are trying to rescale already rescaled images. If the input" - " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." - ) - if input_data_format is None: - input_data_format = infer_channel_dimension_format(image) - image = self._preprocess( - image, - do_resize=do_resize, - size=size, - resample=resample, - do_center_crop=do_center_crop, - crop_size=crop_size, - do_rescale=do_rescale, - rescale_factor=rescale_factor, - do_normalize=do_normalize, - image_mean=image_mean, - image_std=image_std, - input_data_format=input_data_format, - ) - if data_format is not None: - image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) - return image - - def preprocess( - self, - images: ImageInput, - do_resize: bool = None, - size: Dict[str, int] = None, - resample: PILImageResampling = None, - do_center_crop: bool = None, - crop_size: Dict[str, int] = None, - do_rescale: bool = None, - rescale_factor: float = None, - do_normalize: bool = None, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - return_tensors: Optional[Union[str, TensorType]] = None, - data_format: ChannelDimension = ChannelDimension.FIRST, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs, - ) -> PIL.Image.Image: - """ - Preprocess an image or batch of images. - - Args: - images (`ImageInput`): - Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. 
If - passing in images with pixel values between 0 and 1, set `do_rescale=False`. - do_resize (`bool`, *optional*, defaults to `self.do_resize`): - Whether to resize the image. - size (`Dict[str, int]`, *optional*, defaults to `self.size`): - Size of the image after resizing. - resample (`int`, *optional*, defaults to `self.resample`): - Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`, Only - has an effect if `do_resize` is set to `True`. - do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`): - Whether to center crop the image. - crop_size (`Dict[str, int]`, *optional*, defaults to `self.crop_size`): - Size of the image after center crop. If one edge the image is smaller than `crop_size`, it will be - padded with zeros and then cropped - do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): - Whether to rescale the image values between [0 - 1]. - rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): - Rescale factor to rescale the image by if `do_rescale` is set to `True`. - do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): - Whether to normalize the image. - image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): - Image mean. - image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): - Image standard deviation. - return_tensors (`str` or `TensorType`, *optional*): - The type of tensors to return. Can be one of: - - Unset: Return a list of `np.ndarray`. - - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. - - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. - - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. - - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. - data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): - The channel dimension format for the output image. Can be one of: - - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - - Unset: Use the channel dimension format of the input image. - input_data_format (`ChannelDimension` or `str`, *optional*): - The channel dimension format for the input image. If unset, the channel dimension format is inferred - from the input image. Can be one of: - - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. 
- """ - do_resize = do_resize if do_resize is not None else self.do_resize - size = size if size is not None else self.size - size = get_size_dict(size, default_to_square=True, param_name="size") - resample = resample if resample is not None else self.resample - do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop - crop_size = crop_size if crop_size is not None else self.crop_size - crop_size = get_size_dict(crop_size, default_to_square=True, param_name="crop_size") - do_rescale = do_rescale if do_rescale is not None else self.do_rescale - rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor - do_normalize = do_normalize if do_normalize is not None else self.do_normalize - image_mean = image_mean if image_mean is not None else self.image_mean - image_std = image_std if image_std is not None else self.image_std - - images = make_list_of_images(images) - - if not valid_images(images): - raise ValueError( - "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " - "torch.Tensor, tf.Tensor or jax.ndarray." - ) - - if do_resize and size is None or resample is None: - raise ValueError("Size and resample must be specified if do_resize is True.") - - if do_center_crop and crop_size is None: - raise ValueError("Crop size must be specified if do_center_crop is True.") - - if do_rescale and rescale_factor is None: - raise ValueError("Rescale factor must be specified if do_rescale is True.") - - if do_normalize and (image_mean is None or image_std is None): - raise ValueError("Image mean and std must be specified if do_normalize is True.") - - images = [ - self._preprocess_image( - image=img, - do_resize=do_resize, - do_center_crop=do_center_crop, - do_rescale=do_rescale, - do_normalize=do_normalize, - resample=resample, - size=size, - rescale_factor=rescale_factor, - crop_size=crop_size, - image_mean=image_mean, - image_std=image_std, - data_format=data_format, - input_data_format=input_data_format, - ) - for img in images - ] - - data = {"pixel_values": images} - - return BatchFeature(data=data, tensor_type=return_tensors) - - def _max_pooling(self, x, scale=1): - if scale == 1: - x = nn.MaxPool2d(kernel_size=self.pooling_size, stride=1, padding=(self.pooling_size - 1) // 2)(x) - elif scale == 2: - x = nn.MaxPool2d(kernel_size=self.pooling_size // 2 + 1, stride=1, padding=(self.pooling_size // 2) // 2)( - x - ) - return x - - def post_process_text_detection(self, output, target_sizes, threshold, bbox_type="rect"): - scale = 2 - img_size = (self.size["height"], self.size["width"]) - out = output["last_hidden_state"] - batch_size = out.size(0) - final_results = {} - - texts = F.interpolate( - out[:, 0:1, :, :], size=(img_size[0] // scale, img_size[1] // scale), mode="nearest" - ) # B*1*320*320 - texts = self._max_pooling(texts, scale=scale) # B*1*320*320 - score_maps = torch.sigmoid_(texts) # B*1*320*320 - score_maps = F.interpolate(score_maps, size=(img_size[0], img_size[1]), mode="nearest") # B*1*640*640 - score_maps = score_maps.squeeze(1) # B*640*640 - - kernels = (out[:, 0, :, :] > 0).to(torch.uint8) # B*160*160 - labels_ = [] - for kernel in kernels.numpy(): - ret, label_ = cv2.connectedComponents(kernel) - labels_.append(label_) - labels_ = np.array(labels_) - labels_ = torch.from_numpy(labels_) - labels = labels_.unsqueeze(1).to(torch.float32) # B*1*160*160 - labels = F.interpolate( - labels, size=(img_size[0] // scale, img_size[1] // scale), mode="nearest" - ) # B*1*320*320 - labels = self._max_pooling(labels, 
scale=scale) - labels = F.interpolate(labels, size=(img_size[0], img_size[1]), mode="nearest") # B*1*640*640 - labels = labels.squeeze(1).to(torch.int32) # B*640*640 - - keys = [torch.unique(labels_[i], sorted=True) for i in range(batch_size)] - - final_results.update({"kernels": kernels.data.cpu()}) - - results = [] - for i in range(batch_size): - org_img_size = target_sizes[i] - scales = (float(org_img_size[1]) / float(img_size[1]), float(org_img_size[0]) / float(img_size[0])) - - bboxes, scores = self.generate_bbox( - keys[i], labels[i], score_maps[i], scales, threshold, bbox_type=bbox_type - ) - results.append({"bboxes": bboxes, "scores": scores}) - final_results.update({"results": results}) - - return results - - def generate_bbox(self, keys, label, score, scales, threshold, bbox_type): - label_num = len(keys) - bboxes = [] - scores = [] - for index in range(1, label_num): - i = keys[index] - ind = label == i - ind_np = ind.data.cpu().numpy() - points = np.array(np.where(ind_np)).transpose((1, 0)) - if points.shape[0] < self.min_area: - label[ind] = 0 - continue - score_i = score[ind].mean().item() - if score_i < threshold: - label[ind] = 0 - continue - - if bbox_type == "rect": - rect = cv2.minAreaRect(points[:, ::-1]) - alpha = math.sqrt(math.sqrt(points.shape[0] / (rect[1][0] * rect[1][1]))) - rect = (rect[0], (rect[1][0] * alpha, rect[1][1] * alpha), rect[2]) - bbox = cv2.boxPoints(rect) * scales - - elif bbox_type == "poly": - binary = np.zeros(label.shape, dtype="uint8") - binary[ind_np] = 1 - contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) - bbox = contours[0] * scales - bbox = bbox.astype("int32") - bboxes.append(bbox.reshape(-1).tolist()) - scores.append(score_i) - return bboxes, scores diff --git a/src/transformers/models/fast/modeling_fast.py b/src/transformers/models/fast/modeling_fast.py deleted file mode 100644 index 179aa9eb6402..000000000000 --- a/src/transformers/models/fast/modeling_fast.py +++ /dev/null @@ -1,721 +0,0 @@ -# coding=utf-8 -# Copyright 2021 Microsoft Research and The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" PyTorch FAST model.""" - -from dataclasses import dataclass -from typing import Dict, Optional, Tuple - -import numpy as np -import torch -import torch.nn as nn -import torch.nn.functional as F - -from ...utils import is_timm_available - - -if is_timm_available(): - from timm import create_model - - -from transformers import ( - AutoBackbone, - FastConfig, - PreTrainedModel, - add_start_docstrings, - is_timm_available, - requires_backends, -) -from transformers.utils import ModelOutput, add_start_docstrings_to_model_forward, replace_return_docstrings - - -_CONFIG_FOR_DOC = "FastConfig" - -FAST_START_DOCSTRING = r""" - This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it - as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and - behavior. 
- - Parameters: - config ([`FastConfig`]): Model configuration class with all the parameters of the model. - Initializing with a config file does not load the weights associated with the model, only the - configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. -""" - -FAST_FOR_CAPTIONING_INPUTS_DOCSTRING = r""" - Args: - pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): - Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See - [`FastImageProcessor.__call__`] for details. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. -""" - - -def get_same_padding(kernel_size): - if isinstance(kernel_size, tuple): - padding1 = get_same_padding(kernel_size[0]) - padding2 = get_same_padding(kernel_size[1]) - return padding1, padding2 - return kernel_size // 2 - - -class FASTConvLayer(nn.Module): - def __init__( - self, - in_channels, - out_channels, - kernel_size=3, - stride=1, - bias=False, - ): - super().__init__() - - self.kernel_size = kernel_size - self.stride = stride - - padding = get_same_padding(self.kernel_size) - # if isinstance(padding, int): - # padding *= self.dilation - # else: - # padding[0] *= self.dilation - # padding[1] *= self.dilation - - self.conv = nn.Conv2d( - in_channels, - out_channels, - kernel_size=kernel_size, - stride=stride, - padding=padding, - bias=False, - ) - - def forward(self, hidden_states): - if self.training: - if hasattr(self, "fused_conv"): - delattr(self, "fused_conv") - hidden_states = self.conv(hidden_states) - return hidden_states - else: - if not hasattr(self, "fused_conv"): - setattr(self, "fused_conv", self.conv) - hidden_states = self.fused_conv(hidden_states) - return hidden_states - - def fuse_conv_batch_norm(self, conv, batch_norm): - """During inference, the functionary of batch norm layers is turned off but - only the mean and var alone channels are used, which exposes the chance to fuse it with the preceding conv - layers to save computations and simplify network structures.""" - if isinstance(batch_norm, nn.Identity): - return conv - conv_w = conv.weight - conv_b = conv.bias if conv.bias is not None else torch.zeros_like(batch_norm.running_mean) - - factor = batch_norm.weight / torch.sqrt(batch_norm.running_var + batch_norm.eps) - conv.weight = nn.Parameter(conv_w * factor.reshape([conv.out_channels, 1, 1, 1])) - conv.bias = nn.Parameter((conv_b - batch_norm.running_mean) * factor + batch_norm.bias) - return conv - - -class FASTRepConvLayer(nn.Module): - def __init__(self, in_channels, out_channels, kernel_size, stride=1): - super().__init__() - - self.in_channels = in_channels - self.out_channels = out_channels - self.kernel_size = kernel_size - self.stride = stride - - padding = (int((kernel_size[0] - 1) / 2), int((kernel_size[1] - 1) / 2)) - - self.activation = nn.ReLU(inplace=True) - - self.main_conv = nn.Conv2d( - in_channels=in_channels, - out_channels=out_channels, - kernel_size=kernel_size, - stride=stride, - padding=padding, - bias=False, - ) - self.main_batch_norm = nn.BatchNorm2d(num_features=out_channels) - - ver_pad = (int((kernel_size[0] - 1) / 2), 0) - hor_pad = (0, int((kernel_size[1] - 1) / 2)) - - if kernel_size[1] != 1: - self.vertical_conv = nn.Conv2d( - in_channels=in_channels, - 
out_channels=out_channels, - kernel_size=(kernel_size[0], 1), - stride=stride, - padding=ver_pad, - bias=False, - ) - self.vertical_batch_norm = nn.BatchNorm2d(num_features=out_channels) - else: - self.vertical_conv, self.vertical_batch_norm = None, None - - if kernel_size[0] != 1: # kernel height > 1 -> add a horizontal convolution branch - self.horizontal_conv = nn.Conv2d( - in_channels=in_channels, - out_channels=out_channels, - kernel_size=(1, kernel_size[1]), - stride=stride, - padding=hor_pad, - bias=False, - ) - self.horizontal_batch_norm = nn.BatchNorm2d(num_features=out_channels) - else: - self.horizontal_conv, self.horizontal_batch_norm = None, None - - self.rbr_identity = ( - nn.BatchNorm2d(num_features=in_channels) if out_channels == in_channels and stride == 1 else None - ) - - def forward(self, hidden_states): - if self.training: - if hasattr(self, "fused_conv"): - self.__delattr__("fused_conv") - - main_outputs = self.main_conv(hidden_states) - main_outputs = self.main_batch_norm(main_outputs) - if self.vertical_conv is not None: - vertical_outputs = self.vertical_conv(hidden_states) - vertical_outputs = self.vertical_batch_norm(vertical_outputs) - else: - vertical_outputs = 0 - - if self.horizontal_conv is not None: - horizontal_outputs = self.horizontal_conv(hidden_states) - horizontal_outputs = self.horizontal_batch_norm(horizontal_outputs) - else: - horizontal_outputs = 0 - - if self.rbr_identity is None: - id_out = 0 - else: - id_out = self.rbr_identity(hidden_states) - - return self.activation(main_outputs + vertical_outputs + horizontal_outputs + id_out) - else: - if not hasattr(self, "fused_conv"): - self.prepare_for_eval() - return self.activation(self.fused_conv(hidden_states)) - - def _identity_to_conv(self, identity): - if identity is None: - return 0, 0 - if not hasattr(self, "id_tensor"): - input_dim = self.in_channels - kernel_value = np.zeros((self.in_channels, input_dim, 1, 1), dtype=np.float32) - for i in range(self.in_channels): - kernel_value[i, i % input_dim, 0, 0] = 1 - id_tensor = torch.from_numpy(kernel_value).to(identity.weight.device) - self.id_tensor = self._pad_to_mxn_tensor(id_tensor) - kernel = self.id_tensor - running_mean = identity.running_mean - running_var = identity.running_var - gamma = identity.weight - beta = identity.bias - eps = identity.eps - std = (running_var + eps).sqrt() - t = (gamma / std).reshape(-1, 1, 1, 1) - return kernel * t, beta - running_mean * gamma / std - - def _fuse_batch_norm_tensor(self, conv, batch_norm): - kernel = conv.weight - kernel = self._pad_to_mxn_tensor(kernel) - running_mean = batch_norm.running_mean - running_var = batch_norm.running_var - gamma = batch_norm.weight - beta = batch_norm.bias - eps = batch_norm.eps - std = (running_var + eps).sqrt() - t = (gamma / std).reshape(-1, 1, 1, 1) - return kernel * t, beta - running_mean * gamma / std - - def get_equivalent_kernel_bias(self): - kernel_mxn, bias_mxn = self._fuse_batch_norm_tensor(self.main_conv, self.main_batch_norm) - if self.vertical_conv is not None: - kernel_mx1, bias_mx1 = self._fuse_batch_norm_tensor(self.vertical_conv, self.vertical_batch_norm) - else: - kernel_mx1, bias_mx1 = 0, 0 - if self.horizontal_conv is not None: - kernel_1xn, bias_1xn = self._fuse_batch_norm_tensor(self.horizontal_conv, self.horizontal_batch_norm) - else: - kernel_1xn, bias_1xn = 0, 0 - kernel_id, bias_id = self._identity_to_conv(self.rbr_identity) - kernel_mxn = kernel_mxn + kernel_mx1 + kernel_1xn + kernel_id - bias_mxn = bias_mxn + bias_mx1 + bias_1xn + bias_id - return kernel_mxn, bias_mxn - - def 
_pad_to_mxn_tensor(self, kernel): - kernel_height, kernel_width = self.kernel_size - height, width = kernel.shape[2:] - pad_left_right = (kernel_width - width) // 2 - pad_top_down = (kernel_height - height) // 2 - return torch.nn.functional.pad(kernel, [pad_left_right, pad_left_right, pad_top_down, pad_top_down]) - - def prepare_for_eval(self): - kernel, bias = self.get_equivalent_kernel_bias() - self.fused_conv = nn.Conv2d( - in_channels=self.main_conv.in_channels, - out_channels=self.main_conv.out_channels, - kernel_size=self.main_conv.kernel_size, - stride=self.main_conv.stride, - padding=self.main_conv.padding, - bias=True, - ) - self.fused_conv.weight.data = kernel - self.fused_conv.bias.data = bias - for para in self.fused_conv.parameters(): - para.detach_() - - -class FastPreTrainedModel(PreTrainedModel): - """ - An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained - models. - """ - - config_class = FastConfig - base_model_prefix = "fast" - main_input_name = "pixel_values" - - def _init_weights(self, module): - if isinstance(module, (nn.Linear, nn.Conv2d)): - module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) - if module.bias is not None: - module.bias.data.zero_() - - -class FASTNeck(nn.Module): - def __init__(self, config): - super().__init__() - reduce_layer_configs = list( - zip( - config.neck_in_channels, - config.neck_out_channels, - config.neck_kernel_size, - config.neck_stride, - ) - ) - self.num_layers = len(reduce_layer_configs) - for layer_ix in range(0, len(reduce_layer_configs)): - setattr(self, f"reduce_layer{layer_ix + 1}", FASTRepConvLayer(*reduce_layer_configs[layer_ix])) - - def _upsample(self, layer_out, height, width): - return F.upsample(layer_out, size=(height, width), mode="bilinear") - - def forward(self, hidden_states): - first_layer_hidden = hidden_states[0] - first_layer_hidden = self.reduce_layer1(first_layer_hidden) - output_stages = [first_layer_hidden] - - for layer_ix in range(1, self.num_layers): - layer_out = getattr(self, f"reduce_layer{layer_ix + 1}")(hidden_states[layer_ix]) - _, _, height, width = first_layer_hidden.size() - layer_out = self._upsample(layer_out, height, width) - output_stages.append(layer_out) - - combined_hidden_states = torch.cat(output_stages, 1) - return combined_hidden_states - - -class FASTHead(nn.Module): - def __init__(self, config): - super().__init__() - self.conv = FASTRepConvLayer( - config.head_conv_in_channels, - config.head_conv_out_channels, - config.head_conv_kernel_size, - config.head_conv_stride, - ) - - self.final = FASTConvLayer( - config.head_final_in_channels, - config.head_final_out_channels, - config.head_final_kernel_size, - config.head_final_stride, - config.head_final_bias, - ) - - self.pooling_size = config.head_pooling_size - - self.pooling_1s = nn.MaxPool2d(kernel_size=self.pooling_size, stride=1, padding=(self.pooling_size - 1) // 2) - self.pooling_2s = nn.MaxPool2d( - kernel_size=self.pooling_size // 2 + 1, stride=1, padding=(self.pooling_size // 2) // 2 - ) - - if config.head_dropout_ratio > 0: - self.dropout = nn.Dropout2d(config.head_dropout_ratio) - else: - self.dropout = None - - def forward(self, hidden_states): - hidden_states = self.conv(hidden_states) - if self.dropout is not None: - hidden_states = self.dropout(hidden_states) - hidden_states = self.final(hidden_states) - return hidden_states - - def _max_pooling(self, x, scale=1): - if scale == 1: - x = self.pooling_1s(x) - elif scale == 2: - x = 
self.pooling_2s(x) - return x - - -def emb_loss( - emb, instance, kernel, training_mask, feature_dim=4, delta_v=0.5, delta_d=1.5, weights=(1.0, 1.0), bg_sample=False -): - training_mask = (training_mask > 0.5).long() - kernel = (kernel > 0.5).long() - instance = instance * training_mask - instance_kernel = (instance * kernel).view(-1) - instance = instance.view(-1) - emb = emb.view(feature_dim, -1) - - unique_labels, unique_ids = torch.unique(instance_kernel, sorted=True, return_inverse=True) - num_instance = unique_labels.size(0) - if num_instance <= 1: - return 0 - - emb_mean = emb.new_zeros((feature_dim, num_instance), dtype=torch.float32) - for i, lb in enumerate(unique_labels): - if lb == 0: - continue - ind_k = instance_kernel == lb - emb_mean[:, i] = torch.mean(emb[:, ind_k], dim=1) - - l_agg = emb.new_zeros(num_instance, dtype=torch.float32) # bug - for i, lb in enumerate(unique_labels): - if lb == 0: - continue - ind = instance == lb - emb_ = emb[:, ind] - dist = (emb_ - emb_mean[:, i : i + 1]).norm(p=2, dim=0) - dist = F.relu(dist - delta_v) ** 2 - l_agg[i] = torch.mean(torch.log(dist + 1.0)) - l_agg = torch.mean(l_agg[1:]) - - if num_instance > 2: - emb_interleave = emb_mean.permute(1, 0).repeat(num_instance, 1) - emb_band = emb_mean.permute(1, 0).repeat(1, num_instance).view(-1, feature_dim) - # print(seg_band) - - mask = (1 - torch.eye(num_instance, dtype=torch.int8)).view(-1, 1).repeat(1, feature_dim) - mask = mask.view(num_instance, num_instance, -1) - mask[0, :, :] = 0 - mask[:, 0, :] = 0 - mask = mask.view(num_instance * num_instance, -1) - # print(mask) - - dist = emb_interleave - emb_band - dist = dist[mask > 0].view(-1, feature_dim).norm(p=2, dim=1) - dist = F.relu(2 * delta_d - dist) ** 2 - l_dis = torch.mean(torch.log(dist + 1.0)) - - if bg_sample: - l_dis = [torch.log(dist + 1.0)] - emb_bg = emb[:, instance == 0].view(feature_dim, -1) - if emb_bg.size(1) > 100: - rand_ind = np.random.permutation(emb_bg.size(1))[:100] - emb_bg = emb_bg[:, rand_ind] - if emb_bg.size(1) > 0: - for i, lb in enumerate(unique_labels): - if lb == 0: - continue - dist = (emb_bg - emb_mean[:, i : i + 1]).norm(p=2, dim=0) - dist = F.relu(2 * delta_d - dist) ** 2 - l_dis_bg = torch.mean(torch.log(dist + 1.0), 0, keepdim=True) - l_dis.append(l_dis_bg) - l_dis = torch.mean(torch.cat(l_dis)) - else: - l_dis = 0 - - l_agg = weights[0] * l_agg - l_dis = weights[1] * l_dis - l_reg = torch.mean(torch.log(torch.norm(emb_mean, 2, 0) + 1.0)) * 0.001 - loss = l_agg + l_dis + l_reg - return loss - - -def emb_loss_batch(emb, instance, kernel, training_mask, reduce=True, loss_weight=0.25): - loss_batch = emb.new_zeros((emb.size(0)), dtype=torch.float32) - - for i in range(loss_batch.size(0)): - loss_batch[i] = emb_loss(emb[i], instance[i], kernel[i], training_mask[i]) - - loss_batch = loss_weight * loss_batch - - if reduce: - loss_batch = torch.mean(loss_batch) - - return loss_batch - - -def dice_loss_with_masks(input, target, mask, reduce=True): - loss_weight = 0.5 - batch_size = input.size(0) - input = torch.sigmoid(input) - - input = input.contiguous().view(batch_size, -1) - target = target.contiguous().view(batch_size, -1).float() - mask = mask.contiguous().view(batch_size, -1).float() - - input = input * mask - target = target * mask - - a = torch.sum(input * target, dim=1) - b = torch.sum(input * input, dim=1) + 0.001 - c = torch.sum(target * target, dim=1) + 0.001 - d = (2 * a) / (b + c) - loss = 1 - d - - loss = loss_weight * loss - - if reduce: - loss = torch.mean(loss) - - return loss - - -def 
ohem_single(score, gt_text, training_mask): - pos_num = int(torch.sum(gt_text > 0.5)) - int(torch.sum((gt_text > 0.5) & (training_mask <= 0.5))) - - if pos_num == 0: - # selected_mask = gt_text.copy() * 0 # may be not good - selected_mask = training_mask - selected_mask = selected_mask.view(1, selected_mask.shape[0], selected_mask.shape[1]).float() - return selected_mask - - neg_num = int(torch.sum(gt_text <= 0.5)) - neg_num = int(min(pos_num * 3, neg_num)) - - if neg_num == 0: - selected_mask = training_mask - selected_mask = selected_mask.view(1, selected_mask.shape[0], selected_mask.shape[1]).float() - return selected_mask - - neg_score = score[gt_text <= 0.5] - neg_score_sorted, _ = torch.sort(-neg_score) - threshold = -neg_score_sorted[neg_num - 1] - - selected_mask = ((score >= threshold) | (gt_text > 0.5)) & (training_mask > 0.5) - selected_mask = selected_mask.reshape(1, selected_mask.shape[0], selected_mask.shape[1]).float() - return selected_mask - - -def ohem_batch(scores, gt_texts, training_masks): - selected_masks = [] - for i in range(scores.shape[0]): - selected_masks.append(ohem_single(scores[i, :, :], gt_texts[i, :, :], training_masks[i, :, :])) - - selected_masks = torch.cat(selected_masks, 0).float() - return selected_masks - - -def iou_single(a, b, mask, n_class): - EPS = 1e-6 - valid = mask == 1 - a = a[valid] - b = b[valid] - miou = [] - for i in range(n_class): - inter = ((a == i) & (b == i)).float() - union = ((a == i) | (b == i)).float() - - miou.append(torch.sum(inter) / (torch.sum(union) + EPS)) - miou = sum(miou) / len(miou) - return miou - - -def iou(a, b, mask, n_class=2, reduce=True): - batch_size = a.size(0) - - a = a.view(batch_size, -1) - b = b.view(batch_size, -1) - mask = mask.view(batch_size, -1) - - iou = a.new_zeros((batch_size,), dtype=torch.float32) - for i in range(batch_size): - iou[i] = iou_single(a[i], b[i], mask[i], n_class) - - if reduce: - iou = torch.mean(iou) - return iou - - -@dataclass -class FastForSceneTextRecognitionOutput(ModelOutput): - """ - Output type of [`FastForSceneTextRecognition`]. - - Args: - loss (`torch.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): - Text detection loss. - last_hidden_state (`torch.FloatTensor`): - Output of the text detection head. - hidden_states (`tuple(torch.FloatTensor)`, *optional*): - Backbone feature maps and the neck output. - """ - - loss: Optional[torch.Tensor] = None - last_hidden_state: Optional[torch.FloatTensor] = None - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - - -@add_start_docstrings( - """FAST (faster arbitrarily-shaped text detector) is an accurate and efficient scene text detection - framework. FAST has two new designs. (1) We design a - minimalist kernel representation (only has 1-channel output) to model text with arbitrary shape, as well as a - GPU-parallel post-processing to efficiently assemble text lines with a negligible time overhead. 
(2) We search the - network architecture tailored for text detection, leading to more powerful features than most networks that are - searched for image classification.""", - FAST_START_DOCSTRING, -) -class FastForSceneTextRecognition(FastPreTrainedModel): - def __init__(self, config): - super().__init__(config) - self.config = config - - if config.use_timm_backbone: - requires_backends(self, ["timm"]) - kwargs = {} - if config.dilation: - kwargs["output_stride"] = 16 - backbone = create_model( - config.backbone, - pretrained=config.use_pretrained_backbone, - features_only=True, - out_indices=(1, 2, 3, 4), - in_chans=config.num_channels, - **kwargs, - ) - else: - backbone = AutoBackbone.from_config(config.backbone_config) - - self.backbone = backbone - self.neck = FASTNeck(config=config) - self.text_detection_head = FASTHead(config=config) - - self.pooling_1s = nn.MaxPool2d( - kernel_size=config.head_pooling_size, stride=1, padding=(config.head_pooling_size - 1) // 2 - ) - self.pooling_2s = nn.MaxPool2d( - kernel_size=config.head_pooling_size // 2 + 1, stride=1, padding=(config.head_pooling_size // 2) // 2 - ) - self.post_init() - - def _upsample(self, x, size, scale=1): - _, _, H, W = size - return F.interpolate(x, size=(H // scale, W // scale), mode="bilinear") - - def _max_pooling(self, x, scale=1): - if scale == 1: - x = self.pooling_1s(x) - elif scale == 2: - x = self.pooling_2s(x) - return x - - def loss(self, hidden, labels): - gt_texts = labels["gt_texts"] - gt_kernels = labels["gt_kernels"] - training_masks = labels["training_masks"] - gt_instances = labels["gt_instances"] - - kernels = hidden[:, 0, :, :] # 4*640*640 - texts = self._max_pooling(kernels, scale=1) # 4*640*640 - embs = hidden[:, 1:, :, :] # 4*4*640*640 - - selected_masks = ohem_batch(texts, gt_texts, training_masks) - loss_text = dice_loss_with_masks(texts, gt_texts, selected_masks, reduce=False) - - selected_masks = gt_texts * training_masks - loss_kernel = dice_loss_with_masks(kernels, gt_kernels, selected_masks, reduce=False) - loss_kernel = torch.mean(loss_kernel, dim=0) - - loss_emb = emb_loss_batch(embs, gt_instances, gt_kernels, training_masks, reduce=False) - - return torch.mean(loss_text) + torch.mean(loss_kernel) + torch.mean(loss_emb) - - @add_start_docstrings_to_model_forward(FAST_FOR_CAPTIONING_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=FastForSceneTextRecognitionOutput, config_class=_CONFIG_FOR_DOC) - def forward( - self, - pixel_values: torch.FloatTensor, - output_hidden_states: Optional[bool] = True, - return_dict: Optional[bool] = None, - labels: Dict = None, - ): - r""" - labels (`Dict[str, torch.Tensor]`, *optional*): - Should contain 3 keys: gt_texts,gt_kernels,gt_instances - - Returns: - - Examples: - - ```python - >>> from transformers import FastImageProcessor, FastForSceneTextRecognition - >>> from PIL import Image - >>> import requests - - >>> url = "https://huggingface.co/datasets/Raghavan/fast_model_samples/resolve/main/img657.jpg" - >>> image = Image.open(requests.get(url, stream=True).raw).convert("RGB") - >>> processor = FastImageProcessor.from_pretrained("Raghavan/fast_base_tt_800_finetune_ic17mlt") - >>> model = FastForSceneTextRecognition.from_pretrained("Raghavan/fast_base_tt_800_finetune_ic17mlt") - >>> inputs = processor(image, return_tensors="pt") - >>> # forward pass - >>> outputs = model(pixel_values=inputs["pixel_values"]) - >>> target_sizes = [(image.shape[1], image.shape[2]) for image in inputs["pixel_values"]] - >>> threshold = 0.85 - >>> text_locations = 
processor.post_process_text_detection(outputs, target_sizes, threshold, bbox_type="poly") - >>> print(text_locations[0]["bboxes"][0][:10]) - [484, 175, 484, 178, 483, 179, 452, 179, 452, 182] - ``` - """ - # outputs = {} - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - features = ( - self.backbone(pixel_values) if self.config.use_timm_backbone else self.backbone(pixel_values).feature_maps - ) - - hidden_states = self.neck(features) - - text_detection_output = self.text_detection_head(hidden_states) - - all_hidden_states = (features, hidden_states) - - loss = None - if labels: - out = self._upsample(text_detection_output, pixel_values.size(), scale=1) - loss = self.loss(out, labels) - text_detection_output = self._upsample(text_detection_output, pixel_values.size(), scale=4) - - if not return_dict: - output = (loss, text_detection_output) if loss is not None else (text_detection_output,) - return output + (all_hidden_states,) if output_hidden_states else output - - return FastForSceneTextRecognitionOutput( - loss=loss, - last_hidden_state=text_detection_output, - hidden_states=all_hidden_states if output_hidden_states else None, - ) diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py index 4ee5d2c9c296..18c6a27bd7dc 100644 --- a/src/transformers/utils/dummy_vision_objects.py +++ b/src/transformers/utils/dummy_vision_objects.py @@ -198,13 +198,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["vision"]) -class FastImageProcessor(metaclass=DummyObject): - _backends = ["vision"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["vision"]) - - class FlavaFeatureExtractor(metaclass=DummyObject): _backends = ["vision"] diff --git a/tests/models/fast/__init__.py b/tests/models/fast/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/tests/models/fast/test_image_processing_fast.py b/tests/models/fast/test_image_processing_fast.py deleted file mode 100644 index 667ce191d43a..000000000000 --- a/tests/models/fast/test_image_processing_fast.py +++ /dev/null @@ -1,162 +0,0 @@ -# coding=utf-8 -# Copyright 2021 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import unittest - -import requests - -from transformers.testing_utils import require_torch, require_vision, slow -from transformers.utils import is_torch_available, is_vision_available - -from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs - - -if is_torch_available(): - import torch - -if is_vision_available(): - from PIL import Image - - from transformers import FastForSceneTextRecognition, FastImageProcessor - - -class FastImageProcessingTester(unittest.TestCase): - def __init__( - self, - parent, - batch_size=7, - num_channels=3, - image_size=18, - min_resolution=30, - max_resolution=400, - do_resize=True, - size=None, - do_center_crop=True, - crop_size=None, - do_normalize=True, - image_mean=[0.5, 0.5, 0.5], - image_std=[0.5, 0.5, 0.5], - min_area: int = 200, - min_score: float = 0.88, - bbox_type: str = "rect", - pooling_size: int = 9, - ): - size = size if size is not None else {"height": 20, "width": 20} - crop_size = crop_size if crop_size is not None else {"height": 18, "width": 18} - self.parent = parent - self.batch_size = batch_size - self.num_channels = num_channels - self.image_size = image_size - self.min_resolution = min_resolution - self.max_resolution = max_resolution - self.do_resize = do_resize - self.size = size - self.do_center_crop = do_center_crop - self.crop_size = crop_size - self.do_normalize = do_normalize - self.image_mean = image_mean - self.image_std = image_std - self.min_area = min_area - self.min_score = min_score - self.bbox_type = bbox_type - self.pooling_size = pooling_size - - def prepare_image_processor_dict(self): - return { - "do_resize": self.do_resize, - "size": self.size, - "do_center_crop": self.do_center_crop, - "crop_size": self.crop_size, - "do_normalize": self.do_normalize, - "image_mean": self.image_mean, - "image_std": self.image_std, - "min_area": self.min_area, - "min_score": self.min_score, - "bbox_type": self.bbox_type, - "pooling_size": self.pooling_size, - } - - def expected_output_image_shape(self, images): - return self.num_channels, self.crop_size["height"], self.crop_size["width"] - - def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False): - return prepare_image_inputs( - batch_size=self.batch_size, - num_channels=self.num_channels, - min_resolution=self.min_resolution, - max_resolution=self.max_resolution, - equal_resolution=equal_resolution, - numpify=numpify, - torchify=torchify, - ) - - -@require_torch -@require_vision -class FastImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): - image_processing_class = FastImageProcessor if is_vision_available() else None - - def setUp(self): - self.image_processor_tester = FastImageProcessingTester(self) - - @property - def image_processor_dict(self): - return self.image_processor_tester.prepare_image_processor_dict() - - def test_image_processor_properties(self): - image_processing = self.image_processing_class(**self.image_processor_dict) - self.assertTrue(hasattr(image_processing, "do_resize")) - self.assertTrue(hasattr(image_processing, "size")) - self.assertTrue(hasattr(image_processing, "do_center_crop")) - self.assertTrue(hasattr(image_processing, "center_crop")) - self.assertTrue(hasattr(image_processing, "do_normalize")) - self.assertTrue(hasattr(image_processing, "image_mean")) - self.assertTrue(hasattr(image_processing, "image_std")) - - def test_image_processor_from_dict_with_kwargs(self): - image_processor = self.image_processing_class.from_dict(self.image_processor_dict) - 
self.assertEqual(image_processor.size, {"height": 20, "width": 20}) - self.assertEqual(image_processor.crop_size, {"height": 18, "width": 18}) - - image_processor = self.image_processing_class.from_dict( - self.image_processor_dict, size=42, crop_size=84, reduce_labels=True - ) - self.assertEqual(image_processor.size, {"height": 42, "width": 42}) - self.assertEqual(image_processor.crop_size, {"height": 84, "width": 84}) - - @slow - def test_post_process_text_detection(self): - model = FastForSceneTextRecognition.from_pretrained("Raghavan/fast_base_tt_800_finetune_ic17mlt") - - image_processor = FastImageProcessor.from_pretrained("Raghavan/fast_base_tt_800_finetune_ic17mlt") - - def prepare_image(): - image_url = "https://huggingface.co/datasets/Raghavan/fast_model_samples/resolve/main/img657.jpg" - raw_image = Image.open(requests.get(image_url, stream=True).raw).convert("RGB") - return raw_image - - image = prepare_image() - inputs = image_processor(image, return_tensor="np") - - output = model(pixel_values=torch.tensor(inputs["pixel_values"])) - target_sizes = [(image.shape[1], image.shape[2]) for image in inputs["pixel_values"]] - threshold = 0.85 - final_out = image_processor.post_process_text_detection(output, target_sizes, threshold, bbox_type="poly") - - assert len(final_out[0]["bboxes"]) == 2 - assert len(final_out[0]["bboxes"][0]) == 716 - assert final_out[0]["bboxes"][0][:10] == [484, 175, 484, 178, 483, 179, 452, 179, 452, 182] - assert round(float(final_out[0]["scores"][0]), 5) == 0.92356 diff --git a/tests/models/fast/test_modeling_fast.py b/tests/models/fast/test_modeling_fast.py deleted file mode 100644 index 07c3f9b24b20..000000000000 --- a/tests/models/fast/test_modeling_fast.py +++ /dev/null @@ -1,390 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Testing suite for the PyTorch FAST model. 
""" -import inspect -import unittest - -import requests -from PIL import Image - -from transformers import ( - FastConfig, - TextNetConfig, - is_torch_available, -) -from transformers.models.fast.image_processing_fast import FastImageProcessor -from transformers.testing_utils import ( - require_torch, - require_vision, - slow, - torch_device, -) - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor -from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_torch_available(): - import torch - - from transformers import ( - FastForSceneTextRecognition, - ) - - -class FastModelTester: - def __init__( - self, - parent, - backbone_kernel_size=3, - backbone_stride=2, - backbone_dilation=1, - backbone_groups=1, - backbone_has_shuffle=False, - backbone_in_channels=3, - backbone_out_channels=64, - backbone_use_bn=True, - backbone_activation_func="relu", - backbone_dropout_rate=0, - backbone_ops_order="weight_bn_act", - backbone_stage1_in_channels=[64], - backbone_stage1_out_channels=[64], - backbone_stage1_kernel_size=[[3, 3]], - backbone_stage1_stride=[1], - backbone_stage1_dilation=[1], - backbone_stage1_groups=[1], - backbone_stage2_in_channels=[64], - backbone_stage2_out_channels=[128], - backbone_stage2_kernel_size=[[3, 1]], - backbone_stage2_stride=[2], - backbone_stage2_dilation=[1], - backbone_stage2_groups=[1], - backbone_stage3_in_channels=[128], - backbone_stage3_out_channels=[256], - backbone_stage3_kernel_size=[[1, 3]], - backbone_stage3_stride=[2], - backbone_stage3_dilation=[1], - backbone_stage3_groups=[1], - backbone_stage4_in_channels=[256], - backbone_stage4_out_channels=[512], - backbone_stage4_kernel_size=[[3, 3]], - backbone_stage4_stride=[2], - backbone_stage4_dilation=[1], - backbone_stage4_groups=[1], - neck_in_channels=[64], - neck_out_channels=[128], - neck_kernel_size=[[3, 3]], - neck_stride=[1], - head_pooling_size=9, - head_dropout_ratio=0.1, - head_conv_in_channels=128, - head_conv_out_channels=4, - head_conv_kernel_size=[3, 3], - head_conv_stride=1, - head_final_kernel_size=1, - head_final_stride=1, - head_final_bias=False, - head_final_in_channels=4, - head_final_out_channels=5, - head_final_use_batch_norm=False, - head_final_act_func=None, - head_final_dropout_rate=0, - head_final_ops_order="weight", - batch_size=3, - num_channels=3, - image_size=500, - is_training=True, - ): - self.parent = parent - self.backbone_kernel_size = backbone_kernel_size - self.backbone_stride = backbone_stride - self.backbone_has_shuffle = backbone_has_shuffle - self.backbone_in_channels = backbone_in_channels - self.backbone_out_channels = backbone_out_channels - self.backbone_use_bn = backbone_use_bn - self.backbone_act_func = backbone_activation_func - self.backbone_dropout_rate = backbone_dropout_rate - self.backbone_ops_order = backbone_ops_order - - self.backbone_stage1_in_channels = backbone_stage1_in_channels - self.backbone_stage1_out_channels = backbone_stage1_out_channels - self.backbone_stage1_kernel_size = backbone_stage1_kernel_size - self.backbone_stage1_stride = backbone_stage1_stride - - self.backbone_stage2_in_channels = backbone_stage2_in_channels - self.backbone_stage2_out_channels = backbone_stage2_out_channels - self.backbone_stage2_kernel_size = backbone_stage2_kernel_size - self.backbone_stage2_stride = backbone_stage2_stride - - self.backbone_stage3_in_channels = backbone_stage3_in_channels - 
self.backbone_stage3_out_channels = backbone_stage3_out_channels - self.backbone_stage3_kernel_size = backbone_stage3_kernel_size - self.backbone_stage3_stride = backbone_stage3_stride - - self.backbone_stage4_in_channels = backbone_stage4_in_channels - self.backbone_stage4_out_channels = backbone_stage4_out_channels - self.backbone_stage4_kernel_size = backbone_stage4_kernel_size - self.backbone_stage4_stride = backbone_stage4_stride - - self.neck_in_channels = neck_in_channels - self.neck_out_channels = neck_out_channels - self.neck_kernel_size = neck_kernel_size - self.neck_stride = neck_stride - - self.head_pooling_size = head_pooling_size - self.head_dropout_ratio = head_dropout_ratio - - self.head_conv_in_channels = head_conv_in_channels - self.head_conv_out_channels = head_conv_out_channels - self.head_conv_kernel_size = head_conv_kernel_size - self.head_conv_stride = head_conv_stride - - self.head_final_kernel_size = head_final_kernel_size - self.head_final_stride = head_final_stride - self.head_final_bias = head_final_bias - self.head_final_in_channels = head_final_in_channels - self.head_final_out_channels = head_final_out_channels - self.head_final_use_bn = head_final_use_batch_norm - self.head_final_act_func = head_final_act_func - self.head_final_dropout_rate = head_final_dropout_rate - self.head_final_ops_order = head_final_ops_order - - self.batch_size = batch_size - self.num_channels = num_channels - self.image_size = image_size - self.is_training = is_training - - def prepare_config_and_inputs(self): - pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) - # labels = None - # if self.use_labels: - # labels = ids_tensor([self.batch_size], self.num_labels) - # - config = self.get_config() - - return config, {"pixel_values": pixel_values} - - def get_config(self): - textnet_config = TextNetConfig( - kernel_size=self.backbone_kernel_size, - stride=self.backbone_stride, - has_shuffle=self.backbone_has_shuffle, - in_channels=self.backbone_in_channels, - out_channels=self.backbone_out_channels, - act_func=self.backbone_act_func, - stage1_in_channels=self.backbone_stage1_in_channels, - stage1_out_channels=self.backbone_stage1_out_channels, - stage1_kernel_size=self.backbone_stage1_kernel_size, - stage1_stride=self.backbone_stage1_stride, - stage2_in_channels=self.backbone_stage2_in_channels, - stage2_out_channels=self.backbone_stage2_out_channels, - stage2_kernel_size=self.backbone_stage2_kernel_size, - stage2_stride=self.backbone_stage2_stride, - stage3_in_channels=self.backbone_stage3_in_channels, - stage3_out_channels=self.backbone_stage3_out_channels, - stage3_kernel_size=self.backbone_stage3_kernel_size, - stage3_stride=self.backbone_stage3_stride, - stage4_in_channels=self.backbone_stage4_in_channels, - stage4_out_channels=self.backbone_stage4_out_channels, - stage4_kernel_size=self.backbone_stage4_kernel_size, - stage4_stride=self.backbone_stage4_stride, - out_features=["stage1", "stage2", "stage3", "stage4"], - out_indices=[1, 2, 3, 4], - ) - - return FastConfig( - use_timm_backbone=False, - backbone_config=textnet_config, - neck_in_channels=self.neck_in_channels, - neck_out_channels=self.neck_out_channels, - neck_kernel_size=self.neck_kernel_size, - neck_stride=self.neck_stride, - head_pooling_size=self.head_pooling_size, - head_dropout_ratio=self.head_dropout_ratio, - head_conv_in_channels=self.head_conv_in_channels, - head_conv_out_channels=self.head_conv_out_channels, - head_conv_kernel_size=self.head_conv_kernel_size, - 
head_conv_stride=self.head_conv_stride, - head_final_kernel_size=self.head_final_kernel_size, - head_final_stride=self.head_final_stride, - head_final_bias=self.head_final_bias, - head_final_in_channels=self.head_final_in_channels, - head_final_out_channels=self.head_final_out_channels, - ) - - def create_and_check_model(self, config, input): - model = FastForSceneTextRecognition(config=config) - model.to(torch_device) - model.eval() - result = model(pixel_values=input["pixel_values"]) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, 5, 125, 125)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, inputs_dict = config_and_inputs - return config, inputs_dict - - -@require_torch -class FastModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): - all_model_classes = (FastForSceneTextRecognition,) if is_torch_available() else () - - pipeline_model_mapping = {} - test_headmasking = False - test_pruning = False - test_attention_outputs = False - test_resize_embeddings = False - test_head_masking = False - has_attentions = False - - def setUp(self): - self.model_tester = FastModelTester(self) - self.config_tester = ConfigTester(self, config_class=FastConfig, hidden_size=37) - - def test_config(self): - self.create_and_test_config_common_properties() - self.config_tester.create_and_test_config_to_json_string() - self.config_tester.create_and_test_config_to_json_file() - self.config_tester.create_and_test_config_from_and_save_pretrained() - self.config_tester.create_and_test_config_with_num_labels() - self.config_tester.check_config_can_be_init_without_params() - self.config_tester.check_config_arguments_init() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def create_and_test_config_common_properties(self): - return - - @unittest.skip(reason="Fast does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="Fast does not support input and output embeddings") - def test_model_common_attributes(self): - pass - - @unittest.skip(reason="Fast is not a generative model") - def test_generate_without_input_ids(self): - pass - - @unittest.skip(reason="Fast is does not have any hidden_states") - def test_hidden_states_output(self): - pass - - @unittest.skip(reason="Fast is does not have any attention") - def test_retain_grad_hidden_states_attentions(self): - pass - - def test_forward_signature(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - signature = inspect.signature(model.forward) - # signature.parameters is an OrderedDict => so arg_names order is deterministic - arg_names = [*signature.parameters.keys()] - - expected_arg_names = ["pixel_values"] - self.assertListEqual(arg_names[:1], expected_arg_names) - - def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): - to_return = inputs_dict.copy() - gt_instances = torch.zeros( - self.model_tester.batch_size, self.model_tester.image_size, self.model_tester.image_size - ) - gt_kernels = torch.zeros( - self.model_tester.batch_size, self.model_tester.image_size, self.model_tester.image_size - ) - gt_text = torch.zeros(self.model_tester.batch_size, self.model_tester.image_size, self.model_tester.image_size) - training_masks = torch.ones( - 
self.model_tester.batch_size, self.model_tester.image_size, self.model_tester.image_size - ) - labels = {} - labels["gt_instances"] = gt_instances - labels["gt_kernels"] = gt_kernels - labels["gt_texts"] = gt_text - labels["training_masks"] = training_masks - - to_return["labels"] = labels - - return to_return - - def test_model_is_small(self): - # Just a consistency check to make sure we are not running tests on 80M parameter models. - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - num_params = model.num_parameters() - assert ( - num_params < 3000000 - ), f"{model_class} is too big for the common tests ({num_params})! It should have 1M max." - - # def prepare_image(): - # image_url = "https://huggingface.co/datasets/Raghavan/fast_model_samples/resolve/main/img_329.jpg" - # raw_image = Image.open(requests.get(image_url, stream=True).raw).convert("RGB") - # return raw_image - - -@require_torch -@require_vision -class FastModelIntegrationTest(unittest.TestCase): - @slow - def test_inference_fast_tiny_ic17mlt_model(self): - model = FastForSceneTextRecognition.from_pretrained("Raghavan/ic17mlt_Fast_T") - - image_processor = FastImageProcessor.from_pretrained("Raghavan/ic17mlt_Fast_T") - - def prepare_image(): - image_url = "https://huggingface.co/datasets/Raghavan/fast_model_samples/resolve/main/img_329.jpg" - raw_image = Image.open(requests.get(image_url, stream=True).raw).convert("RGB") - return raw_image - - image = prepare_image() - input = image_processor(image, return_tensors="pt") - - output = model(pixel_values=torch.tensor(input["pixel_values"])) - target_sizes = [(image.shape[1], image.shape[2]) for image in input["pixel_values"]] - threshold = 0.88 - final_out = image_processor.post_process_text_detection(output, target_sizes, threshold) - - assert final_out[0]["bboxes"][0] == [224, 120, 246, 120, 246, 134, 224, 134] - assert round(float(final_out[0]["scores"][0]), 5) == 0.95541 - - @slow - def test_inference_fast_base_800_total_text_ic17mlt_model(self): - model = FastForSceneTextRecognition.from_pretrained("Raghavan/fast_base_tt_800_finetune_ic17mlt") - - image_processor = FastImageProcessor.from_pretrained("Raghavan/fast_base_tt_800_finetune_ic17mlt") - - def prepare_image(): - image_url = "https://huggingface.co/datasets/Raghavan/fast_model_samples/resolve/main/img657.jpg" - raw_image = Image.open(requests.get(image_url, stream=True).raw).convert("RGB") - return raw_image - - image = prepare_image() - input = image_processor(image, return_tensors="pt") - - output = model(pixel_values=torch.tensor(input["pixel_values"])) - target_sizes = [(image.shape[1], image.shape[2]) for image in input["pixel_values"]] - threshold = 0.85 - final_out = image_processor.post_process_text_detection(output, target_sizes, threshold, bbox_type="poly") - - assert final_out[0]["bboxes"][0][:10] == [484, 175, 484, 178, 483, 179, 452, 179, 452, 182] - assert round(float(final_out[0]["scores"][0]), 5) == 0.92356 diff --git a/utils/check_repo.py b/utils/check_repo.py index 66f9d7f2b757..10a9fd83e0c9 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -223,7 +223,6 @@ "TFCLIPVisionModel", "TFGroupViTTextModel", "TFGroupViTVisionModel", - "FastForSceneTextRecognition", "FlaxCLIPTextModel", "FlaxCLIPTextModelWithProjection", "FlaxCLIPVisionModel", From cbf6c81b85fd2c8465423374ead5b1b9d484b815 Mon Sep 17 00:00:00 2001 From: raghavanone Date: Fri, 10 Nov 2023 14:12:42 +0530 Subject: [PATCH 057/152] 
 More cleanup

---
 docs/source/en/_toctree.yml                   |  2 -
 docs/source/en/model_doc/fast.md              | 48 ------------------
 docs/source/en/model_doc/textnet.md           |  2 +-
 tests/models/textnet/test_modeling_textnet.py |  9 ++--
 utils/check_repo.py                           |  1 +
 5 files changed, 6 insertions(+), 56 deletions(-)
 delete mode 100644 docs/source/en/model_doc/fast.md

diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
index 51602dc805d4..86cffb9a7e35 100644
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -537,8 +537,6 @@
         title: EfficientFormer
       - local: model_doc/efficientnet
         title: EfficientNet
-      - local: model_doc/fast
-        title: FAST
       - local: model_doc/focalnet
         title: FocalNet
       - local: model_doc/glpn
diff --git a/docs/source/en/model_doc/fast.md b/docs/source/en/model_doc/fast.md
deleted file mode 100644
index e5c8c58f1856..000000000000
--- a/docs/source/en/model_doc/fast.md
+++ /dev/null
@@ -1,48 +0,0 @@
-
-
-# FAST
-
-## Overview
-
-Fast model proposes an accurate and efficient scene text detection framework, termed FAST (i.e., faster
-arbitrarily-shaped text detector).
-
-FAST has two new designs. (1) We design a minimalist kernel representation (only has 1-channel output) to model text
-with arbitrary shape, as well as a GPU-parallel post-processing to efficiently assemble text lines with a negligible
-time overhead. (2) We search the network architecture tailored for text detection, leading to more powerful features
-than most networks that are searched for image classification.
-
-## FastConfig
-
-[[autodoc]] FastConfig
-
-## FastImageProcessor
-
-[[autodoc]] FastImageProcessor
-
-## FastForSceneTextRecognition
-
-[[autodoc]] FastForSceneTextRecognition
-- forward
-
-## FASTForImageCaptioningOutput
-
-[[autodoc]] FASTForImageCaptioningOutput
-- forward
-
-
-
diff --git a/docs/source/en/model_doc/textnet.md b/docs/source/en/model_doc/textnet.md
index 088adb572bdb..1c50cfa71016 100644
--- a/docs/source/en/model_doc/textnet.md
+++ b/docs/source/en/model_doc/textnet.md
@@ -19,7 +19,7 @@ rendered properly in your Markdown viewer.
 ## Overview
 
 The TextNet model was proposed in [FAST: Faster Arbitrarily-Shaped Text Detector with Minimalist Kernel Representation](https://arxiv.org/abs/2111.02394) by Zhe Chen, Jiahao Wang, Wenhai Wang, Guo Chen, Enze Xie, Ping Luo, Tong Lu.
-TextNet was results of NAS for efficient text detection task.
+The TextNet model is the result of neural architecture search (NAS) for efficient text detection.
 It is used in fast model as backbone.
 
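A minimal usage sketch of the backbone described above; it assumes the `TextNetConfig` and `TextNetBackbone` classes referenced later in this PR, and that the backbone follows the usual Transformers backbone convention of returning `feature_maps` (shapes below are only indicative):

```python
import torch

from transformers import TextNetBackbone, TextNetConfig

# Randomly initialized backbone from the default configuration (no pretrained weights).
config = TextNetConfig()
model = TextNetBackbone(config)
model.eval()

# Dummy image batch: (batch_size, num_channels, height, width).
pixel_values = torch.randn(1, 3, 224, 224)

with torch.no_grad():
    outputs = model(pixel_values)

# Multi-scale feature maps; assumed `feature_maps` attribute as on other backbone classes.
for feature_map in outputs.feature_maps:
    print(feature_map.shape)
```

These multi-scale feature maps are what a text detector such as FAST would consume from the backbone.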
 ## TextNetConfig
diff --git a/tests/models/textnet/test_modeling_textnet.py b/tests/models/textnet/test_modeling_textnet.py
index 957661e61144..5cf9bdeaa423 100644
--- a/tests/models/textnet/test_modeling_textnet.py
+++ b/tests/models/textnet/test_modeling_textnet.py
@@ -255,15 +255,15 @@ def test_config(self):
     def create_and_test_config_common_properties(self):
         return
 
-    @unittest.skip(reason="Bit does not output attentions")
+    @unittest.skip(reason="TextNet does not output attentions")
     def test_attention_outputs(self):
         pass
 
-    @unittest.skip(reason="Bit does not use inputs_embeds")
+    @unittest.skip(reason="TextNet does not use inputs_embeds")
     def test_inputs_embeds(self):
         pass
 
-    @unittest.skip(reason="Bit does not support input and output embeddings")
+    @unittest.skip(reason="TextNet does not support input and output embeddings")
     def test_model_common_attributes(self):
         pass
 
@@ -317,7 +317,6 @@ def check_hidden_states_output(inputs_dict, config, model_class):
             expected_num_stages = self.model_tester.num_stages - 1
             self.assertEqual(len(hidden_states), expected_num_stages + 1)
 
-            # Bit's feature maps are of shape (batch_size, num_channels, height, width)
             self.assertListEqual(
                 list(hidden_states[0].shape[-2:]),
                 [self.model_tester.image_size // 2, self.model_tester.image_size // 2],
@@ -348,7 +347,7 @@ def test_model_is_small(self):
                 num_params < 3000000
             ), f"{model_class} is too big for the common tests ({num_params})! It should have 1M max."
 
-    @unittest.skip(reason="Bit does not use feedforward chunking")
+    @unittest.skip(reason="TextNet does not use feedforward chunking")
     def test_feed_forward_chunking(self):
         pass
 
diff --git a/utils/check_repo.py b/utils/check_repo.py
index 10a9fd83e0c9..b0d9cffe8c46 100644
--- a/utils/check_repo.py
+++ b/utils/check_repo.py
@@ -337,6 +337,7 @@
     ]
 )
 
+
 # This is to make sure the transformers module imported is the one in the repo.
transformers = direct_transformers_import(PATH_TO_TRANSFORMERS) From 1db7bd97799c1391bd008577a39456c420a3dd45 Mon Sep 17 00:00:00 2001 From: raghavanone Date: Fri, 10 Nov 2023 14:22:29 +0530 Subject: [PATCH 058/152] Fix build --- src/transformers/models/textnet/configuration_textnet.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/textnet/configuration_textnet.py b/src/transformers/models/textnet/configuration_textnet.py index 650c1bc4858f..9ca9748050cc 100644 --- a/src/transformers/models/textnet/configuration_textnet.py +++ b/src/transformers/models/textnet/configuration_textnet.py @@ -95,19 +95,19 @@ class TextNetConfig(BackboneConfigMixin, PretrainedConfig): Examples: ```python - >>> from transformers import FastConfig, FastForSceneTextRecognition + >>> from transformers import TextNetConfig, TextNetBackbone >>> # Initializing a Fast Config - >>> configuration = FastConfig() + >>> configuration = TextNetConfig() >>> # Initializing a model (with random weights) - >>> model = FastForSceneTextRecognition(configuration) + >>> model = TextNetBackbone(configuration) >>> # Accessing the model configuration >>> configuration = model.config ```""" r""" - [Raghavan/fast_base_tt_800_finetune_ic17mlt](https://huggingface.co/Raghavan/fast_base_tt_800_finetune_ic17mlt) + [Raghavan/textnet-base](https://huggingface.co/Raghavan/textnet-base) """ model_type = "textnet" From bb4ac611391286cb935f56651c9d554d9cd90e91 Mon Sep 17 00:00:00 2001 From: raghavanone Date: Tue, 14 Nov 2023 11:32:21 +0530 Subject: [PATCH 059/152] Incorporate PR feedbacks --- docs/source/en/model_doc/textnet.md | 5 +- setup.py | 2 +- src/transformers/models/textnet/__init__.py | 18 +- .../models/textnet/configuration_textnet.py | 24 ++- .../textnet/image_processing_textnet.py | 14 +- .../models/textnet/modeling_textnet.py | 180 ++++++++++-------- 6 files changed, 135 insertions(+), 108 deletions(-) diff --git a/docs/source/en/model_doc/textnet.md b/docs/source/en/model_doc/textnet.md index 1c50cfa71016..a9947c777f2e 100644 --- a/docs/source/en/model_doc/textnet.md +++ b/docs/source/en/model_doc/textnet.md @@ -1,4 +1,4 @@ -