[models] ViT add checkpoints and some rework to use pretrained ViT backbone in ViTSTR #1072

Merged: 15 commits, Sep 26, 2022
48 changes: 23 additions & 25 deletions doctr/models/classification/vit/pytorch.py
@@ -24,14 +24,14 @@
"std": (0.299, 0.296, 0.301),
"input_shape": (3, 32, 32),
"classes": list(VOCABS["french"]),
"url": None,
"url": "https://github.com/mindee/doctr/releases/download/v0.5.1/vit_b-103002d1.pt",
},
"vit_s": {
"mean": (0.694, 0.695, 0.693),
"std": (0.299, 0.296, 0.301),
"input_shape": (3, 32, 32),
"classes": list(VOCABS["french"]),
"url": None,
"url": "https://github.com/mindee/doctr/releases/download/v0.5.1/vit_s-cd3472bd.pt",
},
}
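With the released checkpoints wired into `default_cfgs`, the backbones can now be built with pretrained weights. A minimal usage sketch, assuming the release assets stay at the URLs above; the printed shape is the classification head output:

```python
import torch
from doctr.models import vit_s

# sketch only: pretrained=True downloads the checkpoint registered in default_cfgs
model = vit_s(pretrained=True).eval()
with torch.no_grad():
    out = model(torch.rand((1, 3, 32, 32), dtype=torch.float32))
print(out.shape)  # (1, num_classes), one logit per entry of the French vocab
```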

@@ -69,7 +69,6 @@ class VisionTransformer(nn.Sequential):
num_heads: number of attention heads
ffd_ratio: multiplier for the hidden dimension of the feedforward layer
input_shape: size of the input image
patch_size: size of the patches to be extracted from the input
dropout: dropout rate
num_classes: number of output classes
include_top: whether the classifier head should be instantiated
@@ -82,15 +81,14 @@ def __init__(
num_heads: int,
ffd_ratio: int,
input_shape: Tuple[int, int, int] = (3, 32, 32),
patch_size: Tuple[int, int] = (4, 4),
dropout: float = 0.0,
num_classes: int = 1000,
include_top: bool = True,
cfg: Optional[Dict[str, Any]] = None,
) -> None:

_layers: List[nn.Module] = [
PatchEmbedding(input_shape, patch_size, d_model),
PatchEmbedding(input_shape, d_model),
EncoderBlock(num_layers, num_heads, d_model, d_model * ffd_ratio, dropout, nn.GELU()),
]
if include_top:
@@ -129,14 +127,16 @@ def _vit(
return model


def vit_b(pretrained: bool = False, **kwargs: Any) -> VisionTransformer:
"""VisionTransformer-B architecture as described in
def vit_s(pretrained: bool = False, **kwargs: Any) -> VisionTransformer:
"""VisionTransformer-S architecture
`"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale",
<https://arxiv.org/pdf/2010.11929.pdf>`_.
<https://arxiv.org/pdf/2010.11929.pdf>`_. Patches: (H, W) -> (H/8, W/8)

NOTE: unofficial config used in ViTSTR and ParSeq

>>> import torch
>>> from doctr.models import vit_b
>>> model = vit_b(pretrained=False)
>>> from doctr.models import vit_s
>>> model = vit_s(pretrained=False)
>>> input_tensor = torch.rand((1, 3, 32, 32), dtype=torch.float32)
>>> out = model(input_tensor)

@@ -148,27 +148,25 @@ def vit_b(pretrained: bool = False, **kwargs: Any) -> VisionTransformer:
"""

return _vit(
"vit_b",
"vit_s",
pretrained,
d_model=768,
d_model=384,
num_layers=12,
num_heads=12,
num_heads=6,
ffd_ratio=4,
ignore_keys=["head.weight", "head.bias"],
ignore_keys=["2.head.weight", "2.head.bias"],
**kwargs,
)
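The updated `ignore_keys` reflect how parameters are named inside the `nn.Sequential` container: each positionally registered sub-module prefixes its parameters with its index, so the classifier head (third module in the stack) yields `2.head.*` keys. A minimal sketch of that naming behaviour with a toy stack:

```python
import torch.nn as nn

# toy stand-in for the ViT stack: patch embedding, encoder, classifier head
seq = nn.Sequential(nn.Linear(4, 4), nn.ReLU(), nn.Linear(4, 2))
print(list(seq.state_dict().keys()))
# ['0.weight', '0.bias', '2.weight', '2.bias'] -- each key is prefixed with
# the positional index of the sub-module that owns the parameter
```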


def vit_s(pretrained: bool = False, **kwargs: Any) -> VisionTransformer:
"""VisionTransformer-S architecture
def vit_b(pretrained: bool = False, **kwargs: Any) -> VisionTransformer:
"""VisionTransformer-B architecture as described in
`"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale",
<https://arxiv.org/pdf/2010.11929.pdf>`_.

NOTE: unofficial config used in ViTSTR and ParSeq
<https://arxiv.org/pdf/2010.11929.pdf>`_. Patches: (H, W) -> (H/8, W/8)

>>> import torch
>>> from doctr.models import vit_s
>>> model = vit_s(pretrained=False)
>>> from doctr.models import vit_b
>>> model = vit_b(pretrained=False)
>>> input_tensor = torch.rand((1, 3, 32, 32), dtype=torch.float32)
>>> out = model(input_tensor)

@@ -180,12 +178,12 @@ def vit_s(pretrained: bool = False, **kwargs: Any) -> VisionTransformer:
"""

return _vit(
"vit_s",
"vit_b",
pretrained,
d_model=384,
d_model=768,
num_layers=12,
num_heads=6,
num_heads=12,
ffd_ratio=4,
ignore_keys=["head.weight", "head.bias"],
ignore_keys=["2.head.weight", "2.head.bias"],
**kwargs,
)
16 changes: 7 additions & 9 deletions doctr/models/classification/vit/tensorflow.py
@@ -26,14 +26,14 @@
"std": (0.299, 0.296, 0.301),
"input_shape": (3, 32, 32),
"classes": list(VOCABS["french"]),
"url": None,
"url": "https://github.com/mindee/doctr/releases/download/v0.5.1/vit_s-7a23bea4.zip",
},
"vit_b": {
"mean": (0.694, 0.695, 0.693),
"std": (0.299, 0.296, 0.301),
"input_shape": (32, 32, 3),
"classes": list(VOCABS["french"]),
"url": None,
"url": "https://github.com/mindee/doctr/releases/download/v0.5.1/vit_b-983c86b5.zip",
},
}

@@ -48,7 +48,7 @@ class ClassifierHead(layers.Layer, NestedObject):
def __init__(self, num_classes: int) -> None:
super().__init__()

self.head = layers.Dense(num_classes, kernel_initializer="he_normal")
self.head = layers.Dense(num_classes, kernel_initializer="he_normal", name="dense")

def call(self, x: tf.Tensor) -> tf.Tensor:
# (batch_size, num_classes) cls token
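Naming the layer explicitly keeps its variable names stable (e.g. `dense/kernel:0`) instead of auto-generated ones like `dense_3`, which makes matching weights from the released checkpoint archives more reliable. A quick illustration with a standalone layer; the feature dimension here is made up:

```python
from tensorflow.keras import layers

head = layers.Dense(10, kernel_initializer="he_normal", name="dense")
head.build((None, 384))  # hypothetical feature dimension
print([w.name for w in head.weights])
# ['dense/kernel:0', 'dense/bias:0'] -- the explicit layer name prefixes every variable
```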
@@ -66,7 +66,6 @@ class VisionTransformer(Sequential):
num_heads: number of attention heads
ffd_ratio: multiplier for the hidden dimension of the feedforward layer
input_shape: size of the input image
patch_size: size of the patches to be extracted from the input
dropout: dropout rate
num_classes: number of output classes
include_top: whether the classifier head should be instantiated
@@ -79,15 +78,14 @@ def __init__(
num_heads: int,
ffd_ratio: int,
input_shape: Tuple[int, int, int] = (32, 32, 3),
patch_size: Tuple[int, int] = (4, 4),
dropout: float = 0.0,
num_classes: int = 1000,
include_top: bool = True,
cfg: Optional[Dict[str, Any]] = None,
) -> None:

_layers = [
PatchEmbedding(input_shape, patch_size, d_model),
PatchEmbedding(input_shape, d_model),
EncoderBlock(num_layers, num_heads, d_model, d_model * ffd_ratio, dropout, activation_fct=GELU()),
]
if include_top:
@@ -125,11 +123,11 @@ def _vit(
def vit_s(pretrained: bool = False, **kwargs: Any) -> VisionTransformer:
"""VisionTransformer-S architecture
`"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale",
<https://arxiv.org/pdf/2010.11929.pdf>`_.
<https://arxiv.org/pdf/2010.11929.pdf>`_. Patches: (H, W) -> (H/8, W/8)

NOTE: unofficial config used in ViTSTR and ParSeq

>>> import tf
>>> import tensorflow as tf
>>> from doctr.models import vit_s
>>> model = vit_s(pretrained=False)
>>> input_tensor = tf.random.uniform(shape=[1, 32, 32, 3], maxval=1, dtype=tf.float32)
@@ -156,7 +154,7 @@ def vit_s(pretrained: bool = False, **kwargs: Any) -> VisionTransformer:
def vit_b(pretrained: bool = False, **kwargs: Any) -> VisionTransformer:
"""VisionTransformer-B architecture as described in
`"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale",
<https://arxiv.org/pdf/2010.11929.pdf>`_.
<https://arxiv.org/pdf/2010.11929.pdf>`_. Patches: (H, W) -> (H/8, W/8)

>>> import tensorflow as tf
>>> from doctr.models import vit_b
22 changes: 10 additions & 12 deletions doctr/models/modules/vision_transformer/pytorch.py
@@ -16,19 +16,16 @@
class PatchEmbedding(nn.Module):
"""Compute 2D patch embeddings with cls token and positional encoding"""

def __init__(
self,
input_shape: Tuple[int, int, int],
patch_size: Tuple[int, int],
embed_dim: int,
) -> None:
def __init__(self, input_shape: Tuple[int, int, int], embed_dim: int) -> None:

super().__init__()
channels, height, width = input_shape
# fix patch size if recognition task with 32x128 input
self.patch_size = (4, 8) if height != width else patch_size
self.grid_size = (height // patch_size[0], width // patch_size[1])
self.num_patches = (height // patch_size[0]) * (width // patch_size[1])
# calculate patch size
# NOTE: this is different from the original implementation
self.patch_size = (height // (height // 8), width // (width // 8))

self.grid_size = (self.patch_size[0], self.patch_size[1])
self.num_patches = self.patch_size[0] * self.patch_size[1]

self.cls_token = nn.Parameter(torch.randn(1, 1, embed_dim)) # type: ignore[attr-defined]
self.positions = nn.Parameter(torch.randn(1, self.num_patches + 1, embed_dim)) # type: ignore[attr-defined]
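For the input sizes doctr actually uses, the formula above resolves to (8, 8) in every case, so despite the name `patch_size` effectively holds an 8x8 patch grid, each patch covering (H/8, W/8) pixels; that is what the "Patches: (H, W) -> (H/8, W/8)" note in the docstrings refers to. A quick sanity check of the arithmetic in plain Python:

```python
# same expressions as in PatchEmbedding.__init__, evaluated for the two
# input sizes used by doctr (32x32 classification, 32x128 recognition)
for height, width in [(32, 32), (32, 128)]:
    patch_size = (height // (height // 8), width // (width // 8))
    num_patches = patch_size[0] * patch_size[1]
    print((height, width), patch_size, num_patches)
# (32, 32) (8, 8) 64
# (32, 128) (8, 8) 64
```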
@@ -62,9 +59,10 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width:
patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
patch_pos_embed = nn.functional.interpolate(
patch_pos_embed,
mode="bicubic",
align_corners=False,
scale_factor=(h0 / math.sqrt(num_positions), w0 / math.sqrt(num_positions)),
mode="bilinear",
align_corners=False,
recompute_scale_factor=True,
)
assert int(h0) == patch_pos_embed.shape[-2], "height of interpolated patch embedding doesn't match"
assert int(w0) == patch_pos_embed.shape[-1], "width of interpolated patch embedding doesn't match"
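Both backbones now resize the learned positional grid with bilinear interpolation; with `recompute_scale_factor=True`, PyTorch re-derives the scale from the rounded output size rather than the raw floats. A small sketch of the resize with made-up sizes (64 learned positions, embedding dim 384, target grid 4x16):

```python
import math
import torch
import torch.nn.functional as F

num_positions, dim = 64, 384  # hypothetical: 8x8 learned position grid
pos_embed = torch.randn(1, num_positions, dim)
h0, w0 = 4.0, 16.0            # hypothetical target patch grid

grid = int(math.sqrt(num_positions))
patch_pos_embed = pos_embed.reshape(1, grid, grid, dim).permute(0, 3, 1, 2)
patch_pos_embed = F.interpolate(
    patch_pos_embed,
    scale_factor=(h0 / grid, w0 / grid),
    mode="bilinear",
    align_corners=False,
    recompute_scale_factor=True,
)
print(patch_pos_embed.shape)  # torch.Size([1, 384, 4, 16])
```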
21 changes: 9 additions & 12 deletions doctr/models/modules/vision_transformer/tensorflow.py
@@ -17,19 +17,16 @@
class PatchEmbedding(layers.Layer, NestedObject):
"""Compute 2D patch embeddings with cls token and positional encoding"""

def __init__(
self,
input_shape: Tuple[int, int, int],
patch_size: Tuple[int, int],
embed_dim: int,
) -> None:
def __init__(self, input_shape: Tuple[int, int, int], embed_dim: int) -> None:

super().__init__()
height, width, _ = input_shape
# fix patch size if recognition task with 32x128 input
self.patch_size = (4, 8) if height != width else patch_size
self.grid_size = (height // patch_size[0], width // patch_size[1])
self.num_patches = (height // patch_size[0]) * (width // patch_size[1])
# calculate patch size
# NOTE: this is different from the original implementation
self.patch_size = (height // (height // 8), width // (width // 8))

self.grid_size = (self.patch_size[0], self.patch_size[1])
self.num_patches = self.patch_size[0] * self.patch_size[1]

self.cls_token = self.add_weight(shape=(1, 1, embed_dim), initializer="zeros", trainable=True, name="cls_token")
self.positions = self.add_weight(
@@ -38,7 +35,7 @@ def __init__(
trainable=True,
name="positions",
)
self.proj = layers.Dense(embed_dim, kernel_initializer="he_normal")
self.proj = layers.Dense(embed_dim, kernel_initializer="he_normal", name="projection")

def interpolate_pos_encoding(self, embeddings: tf.Tensor, height: int, width: int) -> tf.Tensor:
"""
@@ -68,7 +65,7 @@ def interpolate_pos_encoding(self, embeddings: tf.Tensor, height: int, width: in
patch_pos_embed, shape=(1, int(math.sqrt(num_positions)), int(math.sqrt(num_positions)), dim)
),
size=(h0, w0),
method="bicubic",
method="bilinear",
)

shape = patch_pos_embed.shape
2 changes: 0 additions & 2 deletions doctr/models/recognition/vitstr/pytorch.py
@@ -48,7 +48,6 @@ class ViTSTR(_ViTSTR, nn.Module):
max_length: maximum word length handled by the model
dropout_prob: dropout probability of the encoder
input_shape: input shape of the image
patch_size: size of the patches
exportable: onnx exportable returns only logits
cfg: dictionary containing information about the model
"""
@@ -60,7 +59,6 @@ def __init__(
embedding_units: int,
max_length: int = 25,
input_shape: Tuple[int, int, int] = (3, 32, 128), # different from paper
patch_size: Tuple[int, int] = (4, 8), # different from paper to match our size
exportable: bool = False,
cfg: Optional[Dict[str, Any]] = None,
) -> None:
5 changes: 1 addition & 4 deletions doctr/models/recognition/vitstr/tensorflow.py
@@ -46,7 +46,6 @@ class ViTSTR(_ViTSTR, Model):
max_length: maximum word length handled by the model
dropout_prob: dropout probability for the encoder and decoder
input_shape: input shape of the image
patch_size: size of the patches
exportable: onnx exportable returns only logits
cfg: dictionary containing information about the model
"""
@@ -61,7 +60,6 @@ def __init__(
max_length: int = 25,
dropout_prob: float = 0.0,
input_shape: Tuple[int, int, int] = (32, 128, 3), # different from paper
patch_size: Tuple[int, int] = (4, 8), # different from paper to match our size
exportable: bool = False,
cfg: Optional[Dict[str, Any]] = None,
) -> None:
@@ -74,7 +72,7 @@ def __init__(
self.max_length = max_length + 3 # Add 1 step for EOS, 1 for SOS, 1 for PAD

self.feat_extractor = feature_extractor
self.head = layers.Dense(len(self.vocab) + 3)
self.head = layers.Dense(len(self.vocab) + 3, name="head")

self.postprocessor = ViTSTRPostProcessor(vocab=self.vocab)

@@ -203,7 +201,6 @@ def _vitstr(
kwargs["vocab"] = _cfg["vocab"]

# Feature extractor
# NOTE: switch to IntermediateLayerGetter if pretrained vit models are available
feat_extractor = backbone_fn(
pretrained=pretrained_backbone,
input_shape=_cfg["input_shape"],