[models] ViT add checkpoints and some rework to use pretrained ViT backbone in ViTSTR #1072

Merged: 15 commits, Sep 26, 2022
54 changes: 29 additions & 25 deletions doctr/models/classification/vit/pytorch.py
@@ -24,14 +24,14 @@
"std": (0.299, 0.296, 0.301),
"input_shape": (3, 32, 32),
"classes": list(VOCABS["french"]),
"url": None,
"url": "https://github.com/mindee/doctr/releases/download/v0.5.1/vit_b-13bbe405.pt",
},
"vit_s": {
"mean": (0.694, 0.695, 0.693),
"std": (0.299, 0.296, 0.301),
"input_shape": (3, 32, 32),
"classes": list(VOCABS["french"]),
"url": None,
"url": "https://github.com/mindee/doctr/releases/download/v0.5.1/vit_s-ff3c4666.pt",
},
}

@@ -69,7 +69,6 @@ class VisionTransformer(nn.Sequential):
num_heads: number of attention heads
ffd_ratio: multiplier for the hidden dimension of the feedforward layer
input_shape: size of the input image
patch_size: size of the patches to be extracted from the input
dropout: dropout rate
num_classes: number of output classes
include_top: whether the classifier head should be instantiated
@@ -82,15 +81,14 @@ def __init__(
num_heads: int,
ffd_ratio: int,
input_shape: Tuple[int, int, int] = (3, 32, 32),
patch_size: Tuple[int, int] = (4, 4),
dropout: float = 0.0,
num_classes: int = 1000,
include_top: bool = True,
cfg: Optional[Dict[str, Any]] = None,
) -> None:

_layers: List[nn.Module] = [
PatchEmbedding(input_shape, patch_size, d_model),
PatchEmbedding(input_shape, d_model),
EncoderBlock(num_layers, num_heads, d_model, d_model * ffd_ratio, dropout, nn.GELU()),
]
if include_top:
@@ -103,7 +101,7 @@ def __init__(
def _vit(
arch: str,
pretrained: bool,
ignore_keys: Optional[List[str]] = None,
ignore_keys: Optional[List[str]] = [],
**kwargs: Any,
) -> VisionTransformer:

@@ -123,20 +121,28 @@ def _vit(
if pretrained:
# The number of classes is not the same as the number of classes in the pretrained model =>
# remove the last layer weights
_ignore_keys = ignore_keys if kwargs["num_classes"] != len(default_cfgs[arch]["classes"]) else None
_ignore_keys = ignore_keys if kwargs["num_classes"] != len(default_cfgs[arch]["classes"]) else []
# The model is used as a feature extractor => remove the patch embedding and position weights
_ignore_keys = (
_ignore_keys + ["0.positions", "0.proj.weight"] # type: ignore
if kwargs["input_shape"] != default_cfgs[arch]["input_shape"]
else _ignore_keys
)
load_pretrained_params(model, default_cfgs[arch]["url"], ignore_keys=_ignore_keys)

return model


def vit_b(pretrained: bool = False, **kwargs: Any) -> VisionTransformer:
"""VisionTransformer-B architecture as described in
def vit_s(pretrained: bool = False, **kwargs: Any) -> VisionTransformer:
"""VisionTransformer-S architecture
`"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale",
<https://arxiv.org/pdf/2010.11929.pdf>`_.

NOTE: unofficial config used in ViTSTR and ParSeq

>>> import torch
>>> from doctr.models import vit_b
>>> model = vit_b(pretrained=False)
>>> from doctr.models import vit_s
>>> model = vit_s(pretrained=False)
>>> input_tensor = torch.rand((1, 3, 32, 32), dtype=torch.float32)
>>> out = model(input_tensor)

@@ -148,27 +154,25 @@ def vit_b(pretrained: bool = False, **kwargs: Any) -> VisionTransformer:
"""

return _vit(
"vit_b",
"vit_s",
pretrained,
d_model=768,
d_model=384,
num_layers=12,
num_heads=12,
num_heads=6,
ffd_ratio=4,
ignore_keys=["head.weight", "head.bias"],
ignore_keys=["2.head.weight", "2.head.bias"],
**kwargs,
)


def vit_s(pretrained: bool = False, **kwargs: Any) -> VisionTransformer:
"""VisionTransformer-S architecture
def vit_b(pretrained: bool = False, **kwargs: Any) -> VisionTransformer:
"""VisionTransformer-B architecture as described in
`"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale",
<https://arxiv.org/pdf/2010.11929.pdf>`_.

NOTE: unofficial config used in ViTSTR and ParSeq

>>> import torch
>>> from doctr.models import vit_s
>>> model = vit_s(pretrained=False)
>>> from doctr.models import vit_b
>>> model = vit_b(pretrained=False)
>>> input_tensor = torch.rand((1, 3, 32, 32), dtype=torch.float32)
>>> out = model(input_tensor)

Expand All @@ -180,12 +184,12 @@ def vit_s(pretrained: bool = False, **kwargs: Any) -> VisionTransformer:
"""

return _vit(
"vit_s",
"vit_b",
pretrained,
d_model=384,
d_model=768,
num_layers=12,
num_heads=6,
num_heads=12,
ffd_ratio=4,
ignore_keys=["head.weight", "head.bias"],
ignore_keys=["2.head.weight", "2.head.bias"],
**kwargs,
)
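To make the ignore_keys mechanic above concrete, here is a minimal sketch of what skipping mismatched weights amounts to. It is illustrative only, not doctr's actual load_pretrained_params: the helper name and the plain state-dict input are assumptions, while the key names come from the hunks above.

```python
from typing import Dict, List

import torch
import torch.nn as nn


def load_filtered_state_dict(
    model: nn.Module, state_dict: Dict[str, torch.Tensor], ignore_keys: List[str]
) -> None:
    """Load pretrained weights while skipping keys whose shapes cannot match.

    "2.head.weight"/"2.head.bias" are dropped when num_classes differs from the
    pretrained config, "0.positions"/"0.proj.weight" when the input shape (and
    hence the patch projection) differs.
    """
    filtered = {k: v for k, v in state_dict.items() if k not in ignore_keys}
    # strict=False keeps the freshly initialised values for the skipped keys
    model.load_state_dict(filtered, strict=False)
```

The `2.` prefix reflects the classifier head sitting at index 2 of the Sequential built by VisionTransformer (patch embedding, encoder block, head), which is why the PR renames the ignore_keys from head.* to 2.head.*.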
10 changes: 4 additions & 6 deletions doctr/models/classification/vit/tensorflow.py
@@ -26,14 +26,14 @@
"std": (0.299, 0.296, 0.301),
"input_shape": (3, 32, 32),
"classes": list(VOCABS["french"]),
"url": None,
"url": "https://github.com/mindee/doctr/releases/download/v0.5.1/vit_s-f87ad69c.zip",
},
"vit_b": {
"mean": (0.694, 0.695, 0.693),
"std": (0.299, 0.296, 0.301),
"input_shape": (32, 32, 3),
"classes": list(VOCABS["french"]),
"url": None,
"url": "https://github.com/mindee/doctr/releases/download/v0.5.1/vit_b-71da99f5.zip",
},
}

@@ -66,7 +66,6 @@ class VisionTransformer(Sequential):
num_heads: number of attention heads
ffd_ratio: multiplier for the hidden dimension of the feedforward layer
input_shape: size of the input image
patch_size: size of the patches to be extracted from the input
dropout: dropout rate
num_classes: number of output classes
include_top: whether the classifier head should be instantiated
@@ -79,15 +78,14 @@ def __init__(
num_heads: int,
ffd_ratio: int,
input_shape: Tuple[int, int, int] = (32, 32, 3),
patch_size: Tuple[int, int] = (4, 4),
dropout: float = 0.0,
num_classes: int = 1000,
include_top: bool = True,
cfg: Optional[Dict[str, Any]] = None,
) -> None:

_layers = [
PatchEmbedding(input_shape, patch_size, d_model),
PatchEmbedding(input_shape, d_model),
EncoderBlock(num_layers, num_heads, d_model, d_model * ffd_ratio, dropout, activation_fct=GELU()),
]
if include_top:
@@ -129,7 +127,7 @@ def vit_s(pretrained: bool = False, **kwargs: Any) -> VisionTransformer:

NOTE: unofficial config used in ViTSTR and ParSeq

>>> import tf
>>> import tensorflow as tf
>>> from doctr.models import vit_s
>>> model = vit_s(pretrained=False)
>>> input_tensor = tf.random.uniform(shape=[1, 32, 32, 3], maxval=1, dtype=tf.float32)
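A short usage sketch for the TensorFlow checkpoints added above. Treat it as a hypothetical example rather than documented behaviour: it assumes vit_s accepts include_top and input_shape as keyword arguments (as the hunks suggest) and that the released weights load cleanly in this backbone configuration.

```python
import tensorflow as tf

from doctr.models import vit_s

# Backbone-style usage at the recognition input shape (32, 128, 3): the classifier
# head is dropped and the mismatched patch-embedding/position weights are skipped.
model = vit_s(pretrained=True, include_top=False, input_shape=(32, 128, 3))

dummy = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1.0, dtype=tf.float32)
features = model(dummy, training=False)
```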
16 changes: 6 additions & 10 deletions doctr/models/modules/vision_transformer/pytorch.py
@@ -16,19 +16,15 @@
class PatchEmbedding(nn.Module):
"""Compute 2D patch embeddings with cls token and positional encoding"""

def __init__(
self,
input_shape: Tuple[int, int, int],
patch_size: Tuple[int, int],
embed_dim: int,
) -> None:
def __init__(self, input_shape: Tuple[int, int, int], embed_dim: int) -> None:

super().__init__()
channels, height, width = input_shape
# fix patch size if recognition task with 32x128 input
self.patch_size = (4, 8) if height != width else patch_size
self.grid_size = (height // patch_size[0], width // patch_size[1])
self.num_patches = (height // patch_size[0]) * (width // patch_size[1])
# calculate patch size 32x32 -> (4, 4) 32x128 -> (4, 16)
# NOTE: this is different from the original implementation
self.patch_size = (height // 8, width // 8)
self.grid_size = (height // self.patch_size[0], width // self.patch_size[1])
self.num_patches = (height // self.patch_size[0]) * (width // self.patch_size[1])

self.cls_token = nn.Parameter(torch.randn(1, 1, embed_dim)) # type: ignore[attr-defined]
self.positions = nn.Parameter(torch.randn(1, self.num_patches + 1, embed_dim)) # type: ignore[attr-defined]
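The reworked PatchEmbedding drops the explicit patch_size argument and instead always cuts the image into an 8x8 grid of patches; a standalone check of that arithmetic (plain Python, no doctr imports):

```python
from typing import Tuple


def patch_grid(height: int, width: int) -> Tuple[Tuple[int, int], Tuple[int, int], int]:
    # Patch size follows from the input resolution: the image is always split
    # into an 8x8 grid, matching the comment in PatchEmbedding above.
    patch_size = (height // 8, width // 8)
    grid_size = (height // patch_size[0], width // patch_size[1])
    num_patches = grid_size[0] * grid_size[1]
    return patch_size, grid_size, num_patches


print(patch_grid(32, 32))   # ((4, 4), (8, 8), 64)  -> classification input
print(patch_grid(32, 128))  # ((4, 16), (8, 8), 64) -> recognition input
```

Note that the patch count is 64 in both cases, so the positional embedding has shape (1, 65, embed_dim) (64 patches plus the cls token) regardless of the input resolution.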
16 changes: 6 additions & 10 deletions doctr/models/modules/vision_transformer/tensorflow.py
@@ -17,19 +17,15 @@
class PatchEmbedding(layers.Layer, NestedObject):
"""Compute 2D patch embeddings with cls token and positional encoding"""

def __init__(
self,
input_shape: Tuple[int, int, int],
patch_size: Tuple[int, int],
embed_dim: int,
) -> None:
def __init__(self, input_shape: Tuple[int, int, int], embed_dim: int) -> None:

super().__init__()
height, width, _ = input_shape
# fix patch size if recognition task with 32x128 input
self.patch_size = (4, 8) if height != width else patch_size
self.grid_size = (height // patch_size[0], width // patch_size[1])
self.num_patches = (height // patch_size[0]) * (width // patch_size[1])
# calculate patch size 32x32 -> (4, 4) 32x128 -> (4, 16)
# NOTE: this is different from the original implementation
self.patch_size = (height // 8, width // 8)
self.grid_size = (height // self.patch_size[0], width // self.patch_size[1])
self.num_patches = (height // self.patch_size[0]) * (width // self.patch_size[1])

self.cls_token = self.add_weight(shape=(1, 1, embed_dim), initializer="zeros", trainable=True, name="cls_token")
self.positions = self.add_weight(
2 changes: 0 additions & 2 deletions doctr/models/recognition/vitstr/pytorch.py
@@ -48,7 +48,6 @@ class ViTSTR(_ViTSTR, nn.Module):
max_length: maximum word length handled by the model
dropout_prob: dropout probability of the encoder
input_shape: input shape of the image
patch_size: size of the patches
exportable: onnx exportable returns only logits
cfg: dictionary containing information about the model
"""
@@ -60,7 +59,6 @@ def __init__(
embedding_units: int,
max_length: int = 25,
input_shape: Tuple[int, int, int] = (3, 32, 128), # different from paper
patch_size: Tuple[int, int] = (4, 8), # different from paper to match our size
exportable: bool = False,
cfg: Optional[Dict[str, Any]] = None,
) -> None:
7 changes: 2 additions & 5 deletions doctr/models/recognition/vitstr/tensorflow.py
@@ -46,7 +46,6 @@ class ViTSTR(_ViTSTR, Model):
max_length: maximum word length handled by the model
dropout_prob: dropout probability for the encoder and decoder
input_shape: input shape of the image
patch_size: size of the patches
exportable: onnx exportable returns only logits
cfg: dictionary containing information about the model
"""
@@ -61,7 +60,6 @@ def __init__(
max_length: int = 25,
dropout_prob: float = 0.0,
input_shape: Tuple[int, int, int] = (32, 128, 3), # different from paper
patch_size: Tuple[int, int] = (4, 8), # different from paper to match our size
exportable: bool = False,
cfg: Optional[Dict[str, Any]] = None,
) -> None:
@@ -202,10 +200,9 @@ def _vitstr(

kwargs["vocab"] = _cfg["vocab"]

# Feature extractor
# NOTE: switch to IntermediateLayerGetter if pretrained vit models are available
# feature extractor
feat_extractor = backbone_fn(
pretrained=pretrained_backbone,
pretrained=False, # TODO: pretrained_backbone, solve weights shape mismatch
input_shape=_cfg["input_shape"],
include_top=False,
)
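To illustrate the feature-extractor pattern in _vitstr, a minimal sketch of plugging the TensorFlow vit_s backbone into a small recognition-style head. The Dense head, its output size, and the dummy shapes are purely hypothetical, and pretrained is left at False to mirror the TODO above.

```python
import tensorflow as tf
from tensorflow.keras import layers

from doctr.models import vit_s

# Headless ViT backbone at the recognition input shape, as _vitstr builds it.
feat_extractor = vit_s(
    pretrained=False,  # mirrors the TODO above: the pretrained backbone still hits a weight shape mismatch
    input_shape=(32, 128, 3),
    include_top=False,
)

# Hypothetical per-token classification head (the vocab size 126 is arbitrary for this sketch).
head = layers.Dense(126)

features = feat_extractor(tf.zeros((1, 32, 128, 3)), training=False)  # (batch, num_patches + 1, d_model)
logits = head(features)
```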