From cd486df2971d0abe43835f6e2ae8ecc5e4cc6b23 Mon Sep 17 00:00:00 2001 From: Abhirath Anand <74202102+theabhirath@users.noreply.github.com> Date: Mon, 1 Aug 2022 07:48:20 +0530 Subject: [PATCH 1/8] Refine `invertedresidual` --- src/convnets/efficientnet.jl | 5 +++-- src/convnets/mobilenet/mobilenetv2.jl | 2 +- src/convnets/mobilenet/mobilenetv3.jl | 2 +- src/layers/conv.jl | 19 ++++++++++++------- 4 files changed, 17 insertions(+), 11 deletions(-) diff --git a/src/convnets/efficientnet.jl b/src/convnets/efficientnet.jl index 4321e9443..71e6f8f0a 100644 --- a/src/convnets/efficientnet.jl +++ b/src/convnets/efficientnet.jl @@ -36,11 +36,12 @@ function efficientnet(scalings, block_configs; out_channels = _round_channels(scalew(o), 8) repeats = scaled(n) push!(blocks, - invertedresidual(k, in_channels, in_channels * e, out_channels, swish; + invertedresidual((k, k), in_channels, in_channels * e, out_channels, swish; stride = s, reduction = 4)) for _ in 1:(repeats - 1) push!(blocks, - invertedresidual(k, out_channels, out_channels * e, out_channels, swish; + invertedresidual((k, k), out_channels, out_channels * e, out_channels, + swish; stride = 1, reduction = 4)) end end diff --git a/src/convnets/mobilenet/mobilenetv2.jl b/src/convnets/mobilenet/mobilenetv2.jl index a97e7dda1..b97fc16ff 100644 --- a/src/convnets/mobilenet/mobilenetv2.jl +++ b/src/convnets/mobilenet/mobilenetv2.jl @@ -30,7 +30,7 @@ function mobilenetv2(width_mult, configs; inchannels = 3, max_width = 1280, ncla outplanes = _round_channels(c * width_mult, width_mult == 0.1 ? 4 : 8) for i in 1:n push!(layers, - invertedresidual(3, inplanes, inplanes * t, outplanes, a; + invertedresidual((3, 3), inplanes, inplanes * t, outplanes, a; stride = i == 1 ? s : 1)) inplanes = outplanes end diff --git a/src/convnets/mobilenet/mobilenetv3.jl b/src/convnets/mobilenet/mobilenetv3.jl index d8666c5f3..d6873ac57 100644 --- a/src/convnets/mobilenet/mobilenetv3.jl +++ b/src/convnets/mobilenet/mobilenetv3.jl @@ -36,7 +36,7 @@ function mobilenetv3(width_mult, configs; inchannels = 3, max_width = 1024, ncla outplanes = _round_channels(c * width_mult, 8) explanes = _round_channels(inplanes * t, 8) push!(layers, - invertedresidual(k, inplanes, explanes, outplanes, a; + invertedresidual((k, k), inplanes, explanes, outplanes, a; stride = s, reduction = r)) inplanes = outplanes end diff --git a/src/layers/conv.jl b/src/layers/conv.jl index 5610d3be2..557db23a7 100644 --- a/src/layers/conv.jl +++ b/src/layers/conv.jl @@ -114,16 +114,17 @@ Create a basic inverted residual block for MobileNet variants - `reduction`: The reduction factor for the number of hidden feature maps in a squeeze and excite layer (see [`squeeze_excite`](#)). """ -function invertedresidual(kernel_size, inplanes, hidden_planes, outplanes, - activation = relu; stride, reduction = nothing) +function invertedresidual(kernel_size, inplanes::Integer, hidden_planes::Integer, + outplanes::Integer, activation = relu; stride::Integer, + reduction::Union{Nothing, Integer} = nothing) @assert stride in [1, 2] "`stride` has to be 1 or 2" pad = @. (kernel_size - 1) ÷ 2 - conv1 = (inplanes == hidden_planes) ? identity : - Chain(conv_norm((1, 1), inplanes, hidden_planes, activation; bias = false)) + conv1 = (inplanes == hidden_planes) ? (identity,) : + conv_norm((1, 1), inplanes, hidden_planes, activation; bias = false) selayer = isnothing(reduction) ? 
identity : squeeze_excite(hidden_planes; reduction, activation, gate_activation = hardσ, norm_layer = BatchNorm) - invres = Chain(conv1, + invres = Chain(conv1..., conv_norm(kernel_size, hidden_planes, hidden_planes, activation; bias = false, stride, pad = pad, groups = hidden_planes)..., selayer, @@ -131,6 +132,10 @@ function invertedresidual(kernel_size, inplanes, hidden_planes, outplanes, return (stride == 1 && inplanes == outplanes) ? SkipConnection(invres, +) : invres end -function invertedresidual(kernel_size::Integer, args...; kwargs...) - return invertedresidual((kernel_size, kernel_size), args...; kwargs...) +function invertedresidual(kernel_size, inplanes::Integer, outplanes::Integer, + activation = relu; stride::Integer, expansion, + reduction::Union{Nothing, Integer} = nothing) + hidden_planes = Int(inplanes * expansion) + return invertedresidual(kernel_size, inplanes, hidden_planes, outplanes, activation; + stride, reduction) end From e9306c3d48775e683f323ca7561b8646310e7cc0 Mon Sep 17 00:00:00 2001 From: Abhirath Anand <74202102+theabhirath@users.noreply.github.com> Date: Mon, 1 Aug 2022 11:10:48 +0530 Subject: [PATCH 2/8] Expose `inchannels` and `nclasses` for every model Also a. more type annotations b. Expose only configurations vital to the model API in terms of pretraining at the highest level --- .github/workflows/CI.yml | 3 +- src/convnets/alexnet.jl | 12 ++--- src/convnets/convmixer.jl | 20 +++---- src/convnets/convnext.jl | 42 ++++++++------- src/convnets/densenet.jl | 49 +++++++++-------- src/convnets/efficientnet.jl | 36 ++++++------- src/convnets/inception/googlenet.jl | 8 +-- src/convnets/inception/inceptionresnetv2.jl | 12 +++-- src/convnets/inception/inceptionv3.jl | 14 ++--- src/convnets/inception/inceptionv4.jl | 14 ++--- src/convnets/inception/xception.jl | 33 ++++++------ src/convnets/mobilenet/mobilenetv1.jl | 19 +++---- src/convnets/mobilenet/mobilenetv2.jl | 29 +++++----- src/convnets/mobilenet/mobilenetv3.jl | 29 +++++----- src/convnets/resnets/core.jl | 2 +- src/convnets/resnets/resnet.jl | 10 ++-- src/convnets/resnets/resnext.jl | 8 +-- src/convnets/resnets/seresnet.jl | 13 ++--- src/convnets/squeezenet.jl | 34 ++++++++---- src/convnets/vgg.jl | 60 +++++++++++---------- src/layers/conv.jl | 18 ++++--- src/layers/drop.jl | 9 ++-- src/layers/embeddings.jl | 22 ++++---- src/layers/mlp.jl | 10 ++-- src/layers/pool.jl | 5 +- src/layers/scale.jl | 2 +- src/layers/selayers.jl | 8 +-- src/mixers/core.jl | 11 ++-- src/mixers/gmlp.jl | 37 +++++++------ src/mixers/mlpmixer.jl | 34 ++++++------ src/mixers/resmlp.jl | 36 ++++++------- src/vit-based/vit.jl | 24 +++++---- test/convnets.jl | 11 ++-- test/mixers.jl | 36 +++---------- 34 files changed, 363 insertions(+), 347 deletions(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index d43e61da4..316b7a422 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -34,8 +34,7 @@ jobs: - '"Inception"' - '"DenseNet"' - '["ConvNeXt", "ConvMixer"]' - - 'r"ViTs"' - - 'r"Mixers"' + - '[r"ViTs", r"Mixers"]' steps: - uses: actions/checkout@v2 - uses: julia-actions/setup-julia@v1 diff --git a/src/convnets/alexnet.jl b/src/convnets/alexnet.jl index 8ff65ffef..75ba5ad48 100644 --- a/src/convnets/alexnet.jl +++ b/src/convnets/alexnet.jl @@ -1,5 +1,5 @@ """ - alexnet(; nclasses = 1000) + alexnet(; nclasses::Integer = 1000) Create an AlexNet model ([reference](https://papers.nips.cc/paper/2012/file/c399862d3b9d6b76c8436e924a68c45b-Paper.pdf)). 
@@ -8,8 +8,8 @@ Create an AlexNet model - `nclasses`: the number of output classes """ -function alexnet(; nclasses = 1000) - layers = Chain(Chain(Conv((11, 11), 3 => 64, relu; stride = (4, 4), pad = (2, 2)), +function alexnet(; inchannels::Integer = 3, nclasses::Integer = 1000) + layers = Chain(Chain(Conv((11, 11), inchannels => 64, relu; stride = (4, 4), pad = (2, 2)), MaxPool((3, 3); stride = (2, 2)), Conv((5, 5), 64 => 192, relu; pad = (2, 2)), MaxPool((3, 3); stride = (2, 2)), @@ -28,7 +28,7 @@ function alexnet(; nclasses = 1000) end """ - AlexNet(; pretrain = false, nclasses = 1000) + AlexNet(; pretrain::Bool = false, nclasses::Integer = 1000) Create a `AlexNet`. See also [`alexnet`](#). @@ -47,8 +47,8 @@ struct AlexNet end @functor AlexNet -function AlexNet(; pretrain = false, nclasses = 1000) - layers = alexnet(; nclasses = nclasses) +function AlexNet(; pretrain::Bool = false, inchannels::Integer = 3, nclasses::Integer = 1000) + layers = alexnet(; inchannels, nclasses) if pretrain loadpretrain!(layers, "AlexNet") end diff --git a/src/convnets/convmixer.jl b/src/convnets/convmixer.jl index aa3d144d2..c75303184 100644 --- a/src/convnets/convmixer.jl +++ b/src/convnets/convmixer.jl @@ -1,6 +1,7 @@ """ - convmixer(planes, depth; inchannels = 3, kernel_size = (9, 9), patch_size::Dims{2} = 7, - activation = gelu, nclasses = 1000) + convmixer(planes::Integer, depth::Integer; kernel_size = (9, 9), + patch_size::Dims{2} = (7, 7), activation = gelu, + inchannels::Integer = 3, nclasses::Integer = 1000) Creates a ConvMixer model. ([reference](https://arxiv.org/abs/2201.09792)) @@ -9,14 +10,15 @@ Creates a ConvMixer model. - `planes`: number of planes in the output of each block - `depth`: number of layers - - `inchannels`: The number of channels in the input. - `kernel_size`: kernel size of the convolutional layers - `patch_size`: size of the patches - `activation`: activation function used after the convolutional layers + - `inchannels`: The number of channels in the input. - `nclasses`: number of classes in the output """ -function convmixer(planes, depth; inchannels = 3, kernel_size = (9, 9), - patch_size::Dims{2} = (7, 7), activation = gelu, nclasses = 1000) +function convmixer(planes::Integer, depth::Integer; kernel_size = (9, 9), + patch_size::Dims{2} = (7, 7), activation = gelu, + inchannels::Integer = 3, nclasses::Integer = 1000) stem = conv_norm(patch_size, inchannels, planes, activation; preact = true, stride = patch_size[1]) blocks = [Chain(SkipConnection(Chain(conv_norm(kernel_size, planes, planes, activation; @@ -39,7 +41,7 @@ const CONVMIXER_CONFIGS = Dict(:base => Dict(:planes => 1536, :depth => 20, :patch_size => (7, 7))) """ - ConvMixer(mode::Symbol = :base; inchannels = 3, activation = gelu, nclasses = 1000) + ConvMixer(mode::Symbol; inchannels::Integer = 3, nclasses::Integer = 1000) Creates a ConvMixer model. ([reference](https://arxiv.org/abs/2201.09792)) @@ -48,7 +50,6 @@ Creates a ConvMixer model. - `mode`: the mode of the model, either `:base`, `:small` or `:large` - `inchannels`: The number of channels in the input. 
- - `activation`: activation function used after the convolutional layers - `nclasses`: number of classes in the output """ struct ConvMixer @@ -56,14 +57,13 @@ struct ConvMixer end @functor ConvMixer -function ConvMixer(mode::Symbol = :base; inchannels = 3, activation = gelu, nclasses = 1000) +function ConvMixer(mode::Symbol; inchannels::Integer = 3, nclasses::Integer = 1000) _checkconfig(mode, keys(CONVMIXER_CONFIGS)) planes = CONVMIXER_CONFIGS[mode][:planes] depth = CONVMIXER_CONFIGS[mode][:depth] kernel_size = CONVMIXER_CONFIGS[mode][:kernel_size] patch_size = CONVMIXER_CONFIGS[mode][:patch_size] - layers = convmixer(planes, depth; inchannels, kernel_size, patch_size, activation, - nclasses) + layers = convmixer(planes, depth; inchannels, kernel_size, patch_size, nclasses) return ConvMixer(layers) end diff --git a/src/convnets/convnext.jl b/src/convnets/convnext.jl index e6ccee16a..d7c39cc04 100644 --- a/src/convnets/convnext.jl +++ b/src/convnets/convnext.jl @@ -1,5 +1,5 @@ """ - convnextblock(planes, drop_path_rate = 0., λ = 1f-6) + convnextblock(planes::Integer, drop_path_rate = 0.0, layerscale_init = 1.0f-6) Creates a single block of ConvNeXt. ([reference](https://arxiv.org/abs/2201.03545)) @@ -8,21 +8,23 @@ Creates a single block of ConvNeXt. - `planes`: number of input channels. - `drop_path_rate`: Stochastic depth rate. - - `λ`: Initial value for [`LayerScale`](#) + - `layerscale_init`: Initial value for [`LayerScale`](#) """ -function convnextblock(planes, drop_path_rate = 0.0, λ = 1.0f-6) +function convnextblock(planes::Integer, drop_path_rate = 0.0, layerscale_init = 1.0f-6) layers = SkipConnection(Chain(DepthwiseConv((7, 7), planes => planes; pad = 3), swapdims((3, 1, 2, 4)), LayerNorm(planes; ϵ = 1.0f-6), mlp_block(planes, 4 * planes), - LayerScale(planes, λ), + LayerScale(planes, layerscale_init), swapdims((2, 3, 1, 4)), DropPath(drop_path_rate)), +) return layers end """ - convnext(depths, planes; inchannels = 3, drop_path_rate = 0., λ = 1f-6, nclasses = 1000) + convnext(depths::Vector{<:Integer}, planes::Vector{<:Integer}; + drop_path_rate = 0.0, layerscale_init = 1.0f-6, inchannels::Integer = 3, + nclasses::Integer = 1000) Creates the layers for a ConvNeXt model. ([reference](https://arxiv.org/abs/2201.03545)) @@ -33,12 +35,13 @@ Creates the layers for a ConvNeXt model. - `depths`: list with configuration for depth of each block - `planes`: list with configuration for number of output channels in each block - `drop_path_rate`: Stochastic depth rate. 
- - `λ`: Initial value for [`LayerScale`](#) + - `layerscale_init`: Initial value for [`LayerScale`](#) ([reference](https://arxiv.org/abs/2103.17239)) - `nclasses`: number of output classes """ -function convnext(depths, planes; inchannels = 3, drop_path_rate = 0.0, λ = 1.0f-6, - nclasses = 1000) +function convnext(depths::Vector{<:Integer}, planes::Vector{<:Integer}; + drop_path_rate = 0.0, layerscale_init = 1.0f-6, inchannels::Integer = 3, + nclasses::Integer = 1000) @assert length(depths) == length(planes) "`planes` should have exactly one value for each block" downsample_layers = [] @@ -54,7 +57,9 @@ function convnext(depths, planes; inchannels = 3, drop_path_rate = 0.0, λ = 1.0 dp_rates = linear_scheduler(drop_path_rate; depth = sum(depths)) cur = 0 for i in eachindex(depths) - push!(stages, [convnextblock(planes[i], dp_rates[cur + j], λ) for j in 1:depths[i]]) + push!(stages, + [convnextblock(planes[i], dp_rates[cur + j], layerscale_init) + for j in 1:depths[i]]) cur += depths[i] end backbone = collect(Iterators.flatten(Iterators.flatten(zip(downsample_layers, stages)))) @@ -72,13 +77,8 @@ const CONVNEXT_CONFIGS = Dict(:tiny => ([3, 3, 9, 3], [96, 192, 384, 768]), :large => ([3, 3, 27, 3], [192, 384, 768, 1536]), :xlarge => ([3, 3, 27, 3], [256, 512, 1024, 2048])) -struct ConvNeXt - layers::Any -end -@functor ConvNeXt - """ - ConvNeXt(mode::Symbol = :base; inchannels = 3, drop_path_rate = 0., λ = 1f-6, nclasses = 1000) + ConvNeXt(mode::Symbol; inchannels::Integer = 3, nclasses::Integer = 1000) Creates a ConvNeXt model. ([reference](https://arxiv.org/abs/2201.03545)) @@ -86,16 +86,18 @@ Creates a ConvNeXt model. # Arguments - `inchannels`: The number of channels in the input. - - `drop_path_rate`: Stochastic depth rate. - - `λ`: Init value for [LayerScale](https://arxiv.org/abs/2103.17239) - `nclasses`: number of output classes See also [`Metalhead.convnext`](#). 
""" -function ConvNeXt(mode::Symbol = :base; inchannels = 3, drop_path_rate = 0.0, λ = 1.0f-6, - nclasses = 1000) +struct ConvNeXt + layers::Any +end +@functor ConvNeXt + +function ConvNeXt(mode::Symbol; inchannels::Integer = 3, nclasses::Integer = 1000) _checkconfig(mode, keys(CONVNEXT_CONFIGS)) - layers = convnext(CONVNEXT_CONFIGS[mode]...; inchannels, drop_path_rate, λ, nclasses) + layers = convnext(CONVNEXT_CONFIGS[mode]...; inchannels, nclasses) return ConvNeXt(layers) end diff --git a/src/convnets/densenet.jl b/src/convnets/densenet.jl index 332b5551f..0b164e2ab 100644 --- a/src/convnets/densenet.jl +++ b/src/convnets/densenet.jl @@ -10,7 +10,7 @@ Create a Densenet bottleneck layer - `outplanes`: number of output feature maps on bottleneck branch (and scaling factor for inner feature maps; see ref) """ -function dense_bottleneck(inplanes, outplanes) +function dense_bottleneck(inplanes::Integer, outplanes::Integer) inner_channels = 4 * outplanes return SkipConnection(Chain(conv_norm((1, 1), inplanes, inner_channels; bias = false, revnorm = true)..., @@ -30,7 +30,7 @@ Create a DenseNet transition sequence - `inplanes`: number of input feature maps - `outplanes`: number of output feature maps """ -function transition(inplanes, outplanes) +function transition(inplanes::Integer, outplanes::Integer) return Chain(conv_norm((1, 1), inplanes, outplanes; bias = false, revnorm = true)..., MeanPool((2, 2))) end @@ -48,14 +48,14 @@ the number of output feature maps by `growth_rates` with each block - `growth_rates`: the growth (additive) rates of output feature maps after each block (a vector of `k`s from the ref) """ -function dense_block(inplanes, growth_rates) +function dense_block(inplanes::Integer, growth_rates) return [dense_bottleneck(i, o) for (i, o) in zip(inplanes .+ cumsum([0, growth_rates[1:(end - 1)]...]), growth_rates)] end """ - densenet(inplanes, growth_rates; reduction = 0.5, nclasses = 1000) + densenet(inplanes, growth_rates; reduction = 0.5, nclasses::Integer = 1000) Create a DenseNet model ([reference](https://arxiv.org/abs/1608.06993)). @@ -68,9 +68,11 @@ Create a DenseNet model - `reduction`: the factor by which the number of feature maps is scaled across each transition - `nclasses`: the number of output classes """ -function densenet(inplanes, growth_rates; reduction = 0.5, nclasses = 1000) +function densenet(inplanes::Integer, growth_rates; reduction = 0.5, inchannels::Integer = 3, + nclasses::Integer = 1000) layers = [] - append!(layers, conv_norm((7, 7), 3, inplanes; stride = 2, pad = (3, 3), bias = false)) + append!(layers, + conv_norm((7, 7), inchannels, inplanes; stride = 2, pad = (3, 3), bias = false)) push!(layers, MaxPool((3, 3); stride = 2, pad = (1, 1))) outplanes = 0 for (i, rates) in enumerate(growth_rates) @@ -88,7 +90,7 @@ function densenet(inplanes, growth_rates; reduction = 0.5, nclasses = 1000) end """ - densenet(nblocks; growth_rate = 32, reduction = 0.5, nclasses = 1000) + densenet(nblocks; growth_rate = 32, reduction = 0.5, nclasses::Integer = 1000) Create a DenseNet model ([reference](https://arxiv.org/abs/1608.06993)). 
@@ -100,15 +102,15 @@ Create a DenseNet model - `reduction`: the factor by which the number of feature maps is scaled across each transition - `nclasses`: the number of output classes """ -function densenet(nblocks::NTuple{N, <:Integer}; growth_rate = 32, reduction = 0.5, - nclasses = 1000) where {N} +function densenet(nblocks::Vector{<:Integer}; growth_rate::Integer = 32, reduction = 0.5, + inchannels::Integer = 3, nclasses::Integer = 1000) return densenet(2 * growth_rate, [fill(growth_rate, n) for n in nblocks]; - reduction = reduction, nclasses = nclasses) + reduction, inchannels, nclasses) end """ - DenseNet(nblocks::NTuple{N, <:Integer}; - growth_rate = 32, reduction = 0.5, nclasses = 1000) + DenseNet(nblocks::Vector{<:Integer}; growth_rate::Integer = 32, reduction = 0.5, + inchannels = 3, nclasses::Integer = 1000) Create a DenseNet model ([reference](https://arxiv.org/abs/1608.06993)). @@ -124,29 +126,26 @@ See also [`densenet`](#). struct DenseNet layers::Any end +@functor DenseNet -function DenseNet(nblocks::NTuple{N, <:Integer}; - growth_rate = 32, reduction = 0.5, nclasses = 1000) where {N} - layers = densenet(nblocks; growth_rate = growth_rate, - reduction = reduction, - nclasses = nclasses) +function DenseNet(nblocks::Vector{<:Integer}; growth_rate::Integer = 32, reduction = 0.5, + inchannels = 3, nclasses::Integer = 1000) + layers = densenet(nblocks; growth_rate, reduction, inchannels, nclasses) return DenseNet(layers) end -@functor DenseNet - (m::DenseNet)(x) = m.layers(x) backbone(m::DenseNet) = m.layers[1] classifier(m::DenseNet) = m.layers[2] -const DENSENET_CONFIGS = Dict(121 => (6, 12, 24, 16), - 161 => (6, 12, 36, 24), - 169 => (6, 12, 32, 32), - 201 => (6, 12, 48, 32)) +const DENSENET_CONFIGS = Dict(121 => [6, 12, 24, 16], + 161 => [6, 12, 36, 24], + 169 => [6, 12, 32, 32], + 201 => [6, 12, 48, 32]) """ - DenseNet(config::Integer = 121; pretrain = false, nclasses = 1000) + DenseNet(config::Integer = 121; pretrain::Bool = false, nclasses::Integer = 1000) DenseNet(transition_configs::NTuple{N,Integer}) Create a DenseNet model with specified configuration. Currently supported values are (121, 161, 169, 201) @@ -159,7 +158,7 @@ Set `pretrain = true` to load the model with pre-trained weights for ImageNet. See also [`Metalhead.densenet`](#). """ -function DenseNet(config::Integer = 121; pretrain = false, nclasses = 1000) +function DenseNet(config::Integer = 121; pretrain::Bool = false, nclasses::Integer = 1000) _checkconfig(config, keys(DENSENET_CONFIGS)) model = DenseNet(DENSENET_CONFIGS[config]; nclasses = nclasses) if pretrain diff --git a/src/convnets/efficientnet.jl b/src/convnets/efficientnet.jl index 71e6f8f0a..730840fa4 100644 --- a/src/convnets/efficientnet.jl +++ b/src/convnets/efficientnet.jl @@ -1,6 +1,6 @@ """ - efficientnet(scalings, block_configs; - inchannels = 3, nclasses = 1000, max_width = 1280) + efficientnet(scalings, block_configs; max_width::Integer = 1280, + inchannels::Integer = 3, nclasses::Integer = 1000) Create an EfficientNet model ([reference](https://arxiv.org/abs/1905.11946v5)). @@ -22,8 +22,8 @@ Create an EfficientNet model ([reference](https://arxiv.org/abs/1905.11946v5)). 
- `max_width`: maximum number of output channels before the fully connected classification blocks """ -function efficientnet(scalings, block_configs; - inchannels = 3, nclasses = 1000, max_width = 1280) +function efficientnet(scalings, block_configs; max_width::Integer = 1280, + inchannels::Integer = 3, nclasses::Integer = 1000) wscale, dscale = scalings scalew(w) = wscale ≈ 1 ? w : ceil(Int64, wscale * w) scaled(d) = dscale ≈ 1 ? d : ceil(Int64, dscale * d) @@ -36,12 +36,11 @@ function efficientnet(scalings, block_configs; out_channels = _round_channels(scalew(o), 8) repeats = scaled(n) push!(blocks, - invertedresidual((k, k), in_channels, in_channels * e, out_channels, swish; + invertedresidual((k, k), in_channels, out_channels, swish; expansion = e, stride = s, reduction = 4)) for _ in 1:(repeats - 1) push!(blocks, - invertedresidual((k, k), out_channels, out_channels * e, out_channels, - swish; + invertedresidual((k, k), out_channels, out_channels, swish; expansion = e, stride = 1, reduction = 4)) end end @@ -74,6 +73,7 @@ const EFFICIENTNET_BLOCK_CONFIGS = [ # w: width scaling # d: depth scaling # r: image resolution +# Data is organised as (r, (w, d)) const EFFICIENTNET_GLOBAL_CONFIGS = Dict(:b0 => (224, (1.0, 1.0)), :b1 => (240, (1.0, 1.1)), :b2 => (260, (1.1, 1.2)), @@ -84,14 +84,9 @@ const EFFICIENTNET_GLOBAL_CONFIGS = Dict(:b0 => (224, (1.0, 1.0)), :b7 => (600, (2.0, 3.1)), :b8 => (672, (2.2, 3.6))) -struct EfficientNet - layers::Any -end -@functor EfficientNet - """ - EfficientNet(scalings, block_configs; - inchannels = 3, nclasses = 1000, max_width = 1280) + EfficientNet(scalings, block_configs; max_width::Integer = 1280, + inchannels::Integer = 3, nclasses::Integer = 1000) Create an EfficientNet model ([reference](https://arxiv.org/abs/1905.11946v5)). See also [`efficientnet`](#). @@ -114,8 +109,13 @@ See also [`efficientnet`](#). - `max_width`: maximum number of output channels before the fully connected classification blocks """ -function EfficientNet(scalings, block_configs; - inchannels = 3, nclasses = 1000, max_width = 1280) +struct EfficientNet + layers::Any +end +@functor EfficientNet + +function EfficientNet(scalings, block_configs; max_width::Integer = 1280, + inchannels::Integer = 3, nclasses::Integer = 1000) layers = efficientnet(scalings, block_configs; inchannels, nclasses, max_width) return EfficientNet(layers) end @@ -126,7 +126,7 @@ backbone(m::EfficientNet) = m.layers[1] classifier(m::EfficientNet) = m.layers[2] """ - EfficientNet(name::Symbol; pretrain = false) + EfficientNet(name::Symbol; pretrain::Bool = false) Create an EfficientNet model ([reference](https://arxiv.org/abs/1905.11946v5)). See also [`efficientnet`](#). @@ -137,7 +137,7 @@ See also [`efficientnet`](#). 
(can be `:b0`, `:b1`, `:b2`, `:b3`, `:b4`, `:b5`, `:b6`, `:b7`, `:b8`) - `pretrain`: set to `true` to load the pre-trained weights for ImageNet """ -function EfficientNet(name::Symbol; pretrain = false) +function EfficientNet(name::Symbol; pretrain::Bool = false) _checkconfig(name, keys(EFFICIENTNET_GLOBAL_CONFIGS)) model = EfficientNet(EFFICIENTNET_GLOBAL_CONFIGS[name][2], EFFICIENTNET_BLOCK_CONFIGS) pretrain && loadpretrain!(model, string("efficientnet-", name)) diff --git a/src/convnets/inception/googlenet.jl b/src/convnets/inception/googlenet.jl index 8a88ca943..90f92ddfc 100644 --- a/src/convnets/inception/googlenet.jl +++ b/src/convnets/inception/googlenet.jl @@ -27,7 +27,7 @@ function _inceptionblock(inplanes, out_1x1, red_3x3, out_3x3, red_5x5, out_5x5, end """ - googlenet(; nclasses = 1000) + googlenet(; nclasses::Integer = 1000) Create an Inception-v1 model (commonly referred to as GoogLeNet) ([reference](https://arxiv.org/abs/1409.4842v1)). @@ -36,7 +36,7 @@ Create an Inception-v1 model (commonly referred to as GoogLeNet) - `nclasses`: the number of output classes """ -function googlenet(; nclasses = 1000) +function googlenet(; nclasses::Integer = 1000) layers = Chain(Chain(Conv((7, 7), 3 => 64; stride = 2, pad = 3), MaxPool((3, 3); stride = 2, pad = 1), Conv((1, 1), 64 => 64), @@ -61,7 +61,7 @@ function googlenet(; nclasses = 1000) end """ - GoogLeNet(; pretrain = false, nclasses = 1000) + GoogLeNet(; pretrain::Bool = false, nclasses::Integer = 1000) Create an Inception-v1 model (commonly referred to as `GoogLeNet`) ([reference](https://arxiv.org/abs/1409.4842v1)). @@ -82,7 +82,7 @@ struct GoogLeNet end @functor GoogLeNet -function GoogLeNet(; pretrain = false, nclasses = 1000) +function GoogLeNet(; pretrain::Bool = false, nclasses::Integer = 1000) layers = googlenet(; nclasses = nclasses) if pretrain loadpretrain!(layers, "GoogLeNet") diff --git a/src/convnets/inception/inceptionresnetv2.jl b/src/convnets/inception/inceptionresnetv2.jl index 4b4b78706..747da2fb2 100644 --- a/src/convnets/inception/inceptionresnetv2.jl +++ b/src/convnets/inception/inceptionresnetv2.jl @@ -64,7 +64,7 @@ function block8(scale = 1.0f0; activation = identity) end """ - inceptionresnetv2(; inchannels = 3, dropout_rate = 0.0, nclasses = 1000) + inceptionresnetv2(; inchannels::Integer = 3, dropout_rate = 0.0, nclasses::Integer = 1000) Creates an InceptionResNetv2 model. ([reference](https://arxiv.org/abs/1602.07261)) @@ -75,7 +75,8 @@ Creates an InceptionResNetv2 model. - `dropout_rate`: rate of dropout in classifier head. - `nclasses`: the number of output classes. """ -function inceptionresnetv2(; inchannels = 3, dropout_rate = 0.0, nclasses = 1000) +function inceptionresnetv2(; inchannels::Integer = 3, dropout_rate = 0.0, + nclasses::Integer = 1000) body = Chain(conv_norm((3, 3), inchannels, 32; stride = 2)..., conv_norm((3, 3), 32, 32)..., conv_norm((3, 3), 32, 64; pad = 1)..., @@ -97,7 +98,7 @@ function inceptionresnetv2(; inchannels = 3, dropout_rate = 0.0, nclasses = 1000 end """ - InceptionResNetv2(; pretrain = false, inchannels = 3, dropout_rate = 0.0, nclasses = 1000) + InceptionResNetv2(; pretrain::Bool = false, inchannels::Integer = 3, dropout_rate = 0.0, nclasses::Integer = 1000) Creates an InceptionResNetv2 model. 
([reference](https://arxiv.org/abs/1602.07261)) @@ -118,8 +119,9 @@ struct InceptionResNetv2 end @functor InceptionResNetv2 -function InceptionResNetv2(; pretrain = false, inchannels = 3, dropout_rate = 0.0, - nclasses = 1000) +function InceptionResNetv2(; pretrain::Bool = false, inchannels::Integer = 3, + dropout_rate = 0.0, + nclasses::Integer = 1000) layers = inceptionresnetv2(; inchannels, dropout_rate, nclasses) if pretrain loadpretrain!(layers, "InceptionResNetv2") diff --git a/src/convnets/inception/inceptionv3.jl b/src/convnets/inception/inceptionv3.jl index 68b283838..8d9977d80 100644 --- a/src/convnets/inception/inceptionv3.jl +++ b/src/convnets/inception/inceptionv3.jl @@ -127,7 +127,7 @@ function inceptionv3_e(inplanes) end """ - inceptionv3(; nclasses = 1000) + inceptionv3(; inchannels::Integer = 3, nclasses::Integer = 1000) Create an Inception-v3 model ([reference](https://arxiv.org/abs/1512.00567v3)). @@ -135,8 +135,8 @@ Create an Inception-v3 model ([reference](https://arxiv.org/abs/1512.00567v3)). - `nclasses`: the number of output classes """ -function inceptionv3(; nclasses = 1000) - layer = Chain(Chain(conv_norm((3, 3), 3, 32; stride = 2)..., +function inceptionv3(; inchannels::Integer = 3, nclasses::Integer = 1000) + layer = Chain(Chain(conv_norm((3, 3), inchannels, 32; stride = 2)..., conv_norm((3, 3), 32, 32)..., conv_norm((3, 3), 32, 64; pad = 1)..., MaxPool((3, 3); stride = 2), @@ -162,7 +162,7 @@ function inceptionv3(; nclasses = 1000) end """ - Inceptionv3(; pretrain = false, nclasses = 1000) + Inceptionv3(; pretrain::Bool = false, inchannels::Integer = 3, nclasses::Integer = 1000) Create an Inception-v3 model ([reference](https://arxiv.org/abs/1512.00567v3)). See also [`inceptionv3`](#). @@ -170,6 +170,7 @@ See also [`inceptionv3`](#). # Arguments - `pretrain`: set to `true` to load the pre-trained weights for ImageNet + - `inchannels`: number of input channels - `nclasses`: the number of output classes !!! warning @@ -180,8 +181,9 @@ struct Inceptionv3 layers::Any end -function Inceptionv3(; pretrain = false, nclasses = 1000) - layers = inceptionv3(; nclasses = nclasses) +function Inceptionv3(; pretrain::Bool = false, inchannels::Integer = 3, + nclasses::Integer = 1000) + layers = inceptionv3(; inchannels, nclasses) if pretrain loadpretrain!(layers, "Inceptionv3") end diff --git a/src/convnets/inception/inceptionv4.jl b/src/convnets/inception/inceptionv4.jl index bb03646ec..b84232fb8 100644 --- a/src/convnets/inception/inceptionv4.jl +++ b/src/convnets/inception/inceptionv4.jl @@ -82,7 +82,7 @@ function inceptionv4_c() end """ - inceptionv4(; inchannels = 3, dropout_rate = 0.0, nclasses = 1000) + inceptionv4(; inchannels::Integer = 3, dropout_rate = 0.0, nclasses::Integer = 1000) Create an Inceptionv4 model. ([reference](https://arxiv.org/abs/1602.07261)) @@ -93,7 +93,8 @@ Create an Inceptionv4 model. - `dropout_rate`: rate of dropout in classifier head. - `nclasses`: the number of output classes. 
""" -function inceptionv4(; inchannels = 3, dropout_rate = 0.0, nclasses = 1000) +function inceptionv4(; dropout_rate = 0.0, inchannels::Integer = 3, + nclasses::Integer = 1000) body = Chain(conv_norm((3, 3), inchannels, 32; stride = 2)..., conv_norm((3, 3), 32, 32)..., conv_norm((3, 3), 32, 64; pad = 1)..., @@ -122,7 +123,7 @@ function inceptionv4(; inchannels = 3, dropout_rate = 0.0, nclasses = 1000) end """ - Inceptionv4(; pretrain = false, inchannels = 3, dropout_rate = 0.0, nclasses = 1000) + Inceptionv4(; pretrain::Bool = false, inchannels::Integer = 3, nclasses::Integer = 1000) Creates an Inceptionv4 model. ([reference](https://arxiv.org/abs/1602.07261)) @@ -131,7 +132,6 @@ Creates an Inceptionv4 model. - `pretrain`: set to `true` to load the pre-trained weights for ImageNet - `inchannels`: number of input channels. - - `dropout_rate`: rate of dropout in classifier head. - `nclasses`: the number of output classes. !!! warning @@ -143,9 +143,9 @@ struct Inceptionv4 end @functor Inceptionv4 -function Inceptionv4(; pretrain = false, inchannels = 3, dropout_rate = 0.0, - nclasses = 1000) - layers = inceptionv4(; inchannels, dropout_rate, nclasses) +function Inceptionv4(; pretrain::Bool = false, inchannels::Integer = 3, + nclasses::Integer = 1000) + layers = inceptionv4(; inchannels, nclasses) if pretrain loadpretrain!(layers, "Inceptionv4") end diff --git a/src/convnets/inception/xception.jl b/src/convnets/inception/xception.jl index 3c6d8331a..8d2ad13d8 100644 --- a/src/convnets/inception/xception.jl +++ b/src/convnets/inception/xception.jl @@ -1,6 +1,7 @@ """ - xception_block(inchannels, outchannels, nrepeats; stride = 1, start_with_relu = true, - grow_at_start = true) + xception_block(inchannels::Integer, outchannels::Integer, nrepeats::Integer; + stride::Integer = 1, start_with_relu::Bool = true, + grow_at_start::Bool = true) Create an Xception block. ([reference](https://arxiv.org/abs/1610.02357)) @@ -14,9 +15,9 @@ Create an Xception block. - `start_with_relu`: if true, start the block with a ReLU activation. - `grow_at_start`: if true, increase the number of channels at the first convolution. """ -function xception_block(inchannels, outchannels, nrepeats; stride = 1, - start_with_relu = true, - grow_at_start = true) +function xception_block(inchannels::Integer, outchannels::Integer, nrepeats::Integer; + stride::Integer = 1, start_with_relu::Bool = true, + grow_at_start::Bool = true) if outchannels != inchannels || stride != 1 skip = conv_norm((1, 1), inchannels, outchannels, identity; stride = stride, bias = false) @@ -44,7 +45,7 @@ function xception_block(inchannels, outchannels, nrepeats; stride = 1, end """ - xception(; inchannels = 3, dropout_rate = 0.0, nclasses = 1000) + xception(; inchannels::Integer = 3, dropout_rate = 0.0, nclasses::Integer = 1000) Creates an Xception model. ([reference](https://arxiv.org/abs/1610.02357)) @@ -55,7 +56,7 @@ Creates an Xception model. - `dropout_rate`: rate of dropout in classifier head. - `nclasses`: the number of output classes. 
""" -function xception(; inchannels = 3, dropout_rate = 0.0, nclasses = 1000) +function xception(; dropout_rate = 0.0, inchannels::Integer = 3, nclasses::Integer = 1000) body = Chain(conv_norm((3, 3), inchannels, 32; stride = 2, bias = false)..., conv_norm((3, 3), 32, 64; bias = false)..., xception_block(64, 128, 2; stride = 2, start_with_relu = false), @@ -70,13 +71,8 @@ function xception(; inchannels = 3, dropout_rate = 0.0, nclasses = 1000) return Chain(body, head) end -struct Xception - layers::Any -end -@functor Xception - """ - Xception(; pretrain = false, inchannels = 3, dropout_rate = 0.0, nclasses = 1000) + Xception(; pretrain::Bool = false, inchannels::Integer = 3, nclasses::Integer = 1000) Creates an Xception model. ([reference](https://arxiv.org/abs/1610.02357)) @@ -85,15 +81,20 @@ Creates an Xception model. - `pretrain`: set to `true` to load the pre-trained weights for ImageNet. - `inchannels`: number of input channels. - - `dropout_rate`: rate of dropout in classifier head. - `nclasses`: the number of output classes. !!! warning `Xception` does not currently support pretrained weights. """ -function Xception(; pretrain = false, inchannels = 3, dropout_rate = 0.0, nclasses = 1000) - layers = xception(; inchannels, dropout_rate, nclasses) +struct Xception + layers::Any +end +@functor Xception + +function Xception(; pretrain::Bool = false, inchannels::Integer = 3, + nclasses::Integer = 1000) + layers = xception(; inchannels, nclasses) if pretrain loadpretrain!(layers, "xception") end diff --git a/src/convnets/mobilenet/mobilenetv1.jl b/src/convnets/mobilenet/mobilenetv1.jl index fffa93a4d..e31f8835b 100644 --- a/src/convnets/mobilenet/mobilenetv1.jl +++ b/src/convnets/mobilenet/mobilenetv1.jl @@ -1,8 +1,6 @@ """ - mobilenetv1(width_mult, config; - activation = relu, - inchannels = 3, - nclasses = 1000) + mobilenetv1(width_mult::Number, config::Vector{<:Tuple}; activation = relu, + inchannels::Integer = 3, nclasses::Integer = 1000) Create a MobileNetv1 model ([reference](https://arxiv.org/abs/1704.04861v1)). @@ -21,10 +19,8 @@ Create a MobileNetv1 model ([reference](https://arxiv.org/abs/1704.04861v1)). - `inchannels`: The number of input channels. The default value is 3. - `nclasses`: The number of output classes """ -function mobilenetv1(width_mult, config; - activation = relu, - inchannels = 3, - nclasses = 1000) +function mobilenetv1(width_mult::Number, config::Vector{<:Tuple}; activation = relu, + inchannels::Integer = 3, nclasses::Integer = 1000) layers = [] for (dw, outch, stride, nrepeats) in config outch = Int(outch * width_mult) @@ -61,7 +57,8 @@ const MOBILENETV1_CONFIGS = [ ] """ - MobileNetv1(width_mult = 1; inchannels = 3, pretrain = false, nclasses = 1000) + MobileNetv1(width_mult = 1; inchannels::Integer = 3, pretrain::Bool = false, + nclasses::Integer = 1000) Create a MobileNetv1 model with the baseline configuration ([reference](https://arxiv.org/abs/1704.04861v1)). 
@@ -83,8 +80,8 @@ struct MobileNetv1 end @functor MobileNetv1 -function MobileNetv1(width_mult::Number = 1; inchannels = 3, pretrain = false, - nclasses = 1000) +function MobileNetv1(width_mult::Number = 1; pretrain::Bool = false, + inchannels::Integer = 3, nclasses::Integer = 1000) layers = mobilenetv1(width_mult, MOBILENETV1_CONFIGS; inchannels, nclasses) if pretrain loadpretrain!(layers, string("MobileNetv1")) diff --git a/src/convnets/mobilenet/mobilenetv2.jl b/src/convnets/mobilenet/mobilenetv2.jl index b97fc16ff..9dd35e9f9 100644 --- a/src/convnets/mobilenet/mobilenetv2.jl +++ b/src/convnets/mobilenet/mobilenetv2.jl @@ -1,5 +1,7 @@ """ - mobilenetv2(width_mult, configs; inchannels = 3, max_width = 1280, nclasses = 1000) + mobilenetv2(width_mult::Number, configs::Vector{<:Tuple}; + max_width::Integer = 1280, inchannels::Integer = 3, + nclasses::Integer = 1000) Create a MobileNetv2 model. ([reference](https://arxiv.org/abs/1801.04381)). @@ -20,7 +22,9 @@ Create a MobileNetv2 model. - `max_width`: The maximum number of feature maps in any layer of the network - `nclasses`: The number of output classes """ -function mobilenetv2(width_mult, configs; inchannels = 3, max_width = 1280, nclasses = 1000) +function mobilenetv2(width_mult::Number, configs::Vector{<:Tuple}; + max_width::Integer = 1280, inchannels::Integer = 3, + nclasses::Integer = 1000) # building first layer inplanes = _round_channels(32 * width_mult, width_mult == 0.1 ? 4 : 8) layers = [] @@ -30,7 +34,7 @@ function mobilenetv2(width_mult, configs; inchannels = 3, max_width = 1280, ncla outplanes = _round_channels(c * width_mult, width_mult == 0.1 ? 4 : 8) for i in 1:n push!(layers, - invertedresidual((3, 3), inplanes, inplanes * t, outplanes, a; + invertedresidual((3, 3), inplanes, outplanes, a; expansion = t, stride = i == 1 ? s : 1)) inplanes = outplanes end @@ -57,13 +61,9 @@ const MOBILENETV2_CONFIGS = [ (6, 320, 1, 1, relu6), ] -struct MobileNetv2 - layers::Any -end -@functor MobileNetv2 - """ - MobileNetv2(width_mult = 1.0; inchannels = 3, pretrain = false, nclasses = 1000) + MobileNetv2(width_mult = 1.0; inchannels::Integer = 3, pretrain::Bool = false, + nclasses::Integer = 1000) Create a MobileNetv2 model with the specified configuration. ([reference](https://arxiv.org/abs/1801.04381)). @@ -74,14 +74,19 @@ Set `pretrain` to `true` to load the pretrained weights for ImageNet. - `width_mult`: Controls the number of output feature maps in each block (with 1.0 being the default in the paper; this is usually a value between 0.1 and 1.4) - - `inchannels`: The number of input channels. - `pretrain`: Whether to load the pre-trained weights for ImageNet + - `inchannels`: The number of input channels. - `nclasses`: The number of output classes See also [`Metalhead.mobilenetv2`](#). 
""" -function MobileNetv2(width_mult::Number = 1; inchannels = 3, pretrain = false, - nclasses = 1000) +struct MobileNetv2 + layers::Any +end +@functor MobileNetv2 + +function MobileNetv2(width_mult::Number = 1; pretrain::Bool = false, + inchannels::Integer = 3, nclasses::Integer = 1000) layers = mobilenetv2(width_mult, MOBILENETV2_CONFIGS; inchannels, nclasses) pretrain && loadpretrain!(layers, string("MobileNetv2")) if pretrain diff --git a/src/convnets/mobilenet/mobilenetv3.jl b/src/convnets/mobilenet/mobilenetv3.jl index d6873ac57..00c0e0139 100644 --- a/src/convnets/mobilenet/mobilenetv3.jl +++ b/src/convnets/mobilenet/mobilenetv3.jl @@ -1,5 +1,7 @@ """ - mobilenetv3(width_mult, configs; inchannels = 3, max_width = 1024, nclasses = 1000) + mobilenetv3(width_mult::Number, configs::Vector{<:Tuple}; + max_width::Integer = 1024, inchannels::Integer = 3, + nclasses::Integer = 1000) Create a MobileNetv3 model. ([reference](https://arxiv.org/abs/1905.02244)). @@ -22,7 +24,9 @@ Create a MobileNetv3 model. - `max_width`: The maximum number of feature maps in any layer of the network - `nclasses`: the number of output classes """ -function mobilenetv3(width_mult, configs; inchannels = 3, max_width = 1024, nclasses = 1000) +function mobilenetv3(width_mult::Number, configs::Vector{<:Tuple}; + max_width::Integer = 1024, inchannels::Integer = 3, + nclasses::Integer = 1000) # building first layer inplanes = _round_channels(16 * width_mult, 8) layers = [] @@ -86,13 +90,9 @@ const MOBILENETV3_CONFIGS = Dict(:small => [ (5, 6, 160, 4, hardswish, 1), ]) -struct MobileNetv3 - layers::Any -end -@functor MobileNetv3 - """ - MobileNetv3(mode::Symbol = :small, width_mult::Number = 1; inchannels = 3, pretrain = false, nclasses = 1000) + MobileNetv3(mode::Symbol = :small, width_mult::Number = 1; pretrain::Bool = false, + inchannels::Integer = 3, nclasses::Integer = 1000) Create a MobileNetv3 model with the specified configuration. ([reference](https://arxiv.org/abs/1905.02244)). @@ -104,15 +104,20 @@ Set `pretrain = true` to load the model with pre-trained weights for ImageNet. - `width_mult`: Controls the number of output feature maps in each block (with 1.0 being the default in the paper; this is usually a value between 0.1 and 1.4) - - `inchannels`: The number of channels in the input. - `pretrain`: whether to load the pre-trained weights for ImageNet + - `inchannels`: The number of channels in the input. - `nclasses`: the number of output classes See also [`Metalhead.mobilenetv3`](#). """ -function MobileNetv3(mode::Symbol = :small, width_mult::Number = 1; inchannels = 3, - pretrain = false, nclasses = 1000) - @assert mode in [:large, :small] "`mode` has to be either :large or :small" +struct MobileNetv3 + layers::Any +end +@functor MobileNetv3 + +function MobileNetv3(mode::Symbol = :small, width_mult::Number = 1; pretrain::Bool = false, + inchannels::Integer = 3, nclasses::Integer = 1000) + _checkconfig(mode, [:small, :large]) max_width = (mode == :large) ? 
1280 : 1024 layers = mobilenetv3(width_mult, MOBILENETV3_CONFIGS[mode]; inchannels, max_width, nclasses) diff --git a/src/convnets/resnets/core.jl b/src/convnets/resnets/core.jl index 03d96d6db..42f61c54a 100644 --- a/src/convnets/resnets/core.jl +++ b/src/convnets/resnets/core.jl @@ -132,7 +132,7 @@ end # end """ - resnet_stem(; stem_type = :default, inchannels = 3, replace_stem_pool = false, + resnet_stem(; stem_type = :default, inchannels::Integer = 3, replace_stem_pool = false, norm_layer = BatchNorm, activation = relu) Builds a stem to be used in a ResNet model. See the `stem` argument of [`resnet`](#) for details diff --git a/src/convnets/resnets/resnet.jl b/src/convnets/resnets/resnet.jl index fac7e7415..9bf9cd82c 100644 --- a/src/convnets/resnets/resnet.jl +++ b/src/convnets/resnets/resnet.jl @@ -1,5 +1,5 @@ """ - ResNet(depth::Integer; pretrain = false, inchannels = 3, nclasses = 1000) + ResNet(depth::Integer; pretrain::Bool = false, inchannels::Integer = 3, nclasses::Integer = 1000) Creates a ResNet model with the specified depth. ((reference)[https://arxiv.org/abs/1512.03385]) @@ -22,7 +22,8 @@ struct ResNet end @functor ResNet -function ResNet(depth::Integer; pretrain = false, inchannels = 3, nclasses = 1000) +function ResNet(depth::Integer; pretrain::Bool = false, inchannels::Integer = 3, + nclasses::Integer = 1000) _checkconfig(depth, keys(RESNET_CONFIGS)) layers = resnet(RESNET_CONFIGS[depth]...; inchannels, nclasses) if pretrain @@ -37,7 +38,7 @@ backbone(m::ResNet) = m.layers[1] classifier(m::ResNet) = m.layers[2] """ - WideResNet(depth::Integer; pretrain = false, inchannels = 3, nclasses = 1000) + WideResNet(depth::Integer; pretrain::Bool = false, inchannels::Integer = 3, nclasses::Integer = 1000) Creates a Wide ResNet model with the specified depth. The model is the same as ResNet except for the bottleneck number of channels which is twice larger in every block. @@ -62,7 +63,8 @@ struct WideResNet end @functor WideResNet -function WideResNet(depth::Integer; pretrain = false, inchannels = 3, nclasses = 1000) +function WideResNet(depth::Integer; pretrain::Bool = false, inchannels::Integer = 3, + nclasses::Integer = 1000) _checkconfig(depth, [50, 101]) layers = resnet(RESNET_CONFIGS[depth]...; base_width = 128, inchannels, nclasses) if pretrain diff --git a/src/convnets/resnets/resnext.jl b/src/convnets/resnets/resnext.jl index 8032df5ab..29d89e3f1 100644 --- a/src/convnets/resnets/resnext.jl +++ b/src/convnets/resnets/resnext.jl @@ -1,6 +1,6 @@ """ - ResNeXt(depth::Integer; pretrain = false, cardinality = 32, - base_width = 4, inchannels = 3, nclasses = 1000) + ResNeXt(depth::Integer; pretrain::Bool = false, cardinality = 32, + base_width = 4, inchannels::Integer = 3, nclasses::Integer = 1000) Creates a ResNeXt model with the specified depth, cardinality, and base width. 
((reference)[https://arxiv.org/abs/1611.05431]) @@ -27,8 +27,8 @@ end (m::ResNeXt)(x) = m.layers(x) -function ResNeXt(depth::Integer; pretrain = false, cardinality = 32, - base_width = 4, inchannels = 3, nclasses = 1000) +function ResNeXt(depth::Integer; pretrain::Bool = false, cardinality = 32, + base_width = 4, inchannels::Integer = 3, nclasses::Integer = 1000) _checkconfig(depth, sort(collect(keys(RESNET_CONFIGS)))[3:end]) layers = resnet(RESNET_CONFIGS[depth]...; inchannels, nclasses, cardinality, base_width) if pretrain diff --git a/src/convnets/resnets/seresnet.jl b/src/convnets/resnets/seresnet.jl index 05d842173..61eee3aad 100644 --- a/src/convnets/resnets/seresnet.jl +++ b/src/convnets/resnets/seresnet.jl @@ -1,5 +1,5 @@ """ - SEResNet(depth::Integer; pretrain = false, inchannels = 3, nclasses = 1000) + SEResNet(depth::Integer; pretrain::Bool = false, inchannels::Integer = 3, nclasses::Integer = 1000) Creates a SEResNet model with the specified depth. ((reference)[https://arxiv.org/pdf/1709.01507.pdf]) @@ -24,7 +24,8 @@ end (m::SEResNet)(x) = m.layers(x) -function SEResNet(depth::Integer; pretrain = false, inchannels = 3, nclasses = 1000) +function SEResNet(depth::Integer; pretrain::Bool = false, inchannels::Integer = 3, + nclasses::Integer = 1000) _checkconfig(depth, keys(RESNET_CONFIGS)) layers = resnet(RESNET_CONFIGS[depth]...; inchannels, nclasses, attn_fn = squeeze_excite) @@ -38,8 +39,8 @@ backbone(m::SEResNet) = m.layers[1] classifier(m::SEResNet) = m.layers[2] """ - SEResNeXt(depth::Integer; pretrain = false, cardinality = 32, base_width = 4, - inchannels = 3, nclasses = 1000) + SEResNeXt(depth::Integer; pretrain::Bool = false, cardinality = 32, base_width = 4, + inchannels::Integer = 3, nclasses::Integer = 1000) Creates a SEResNeXt model with the specified depth, cardinality, and base width. ((reference)[https://arxiv.org/pdf/1709.01507.pdf]) @@ -66,8 +67,8 @@ end (m::SEResNeXt)(x) = m.layers(x) -function SEResNeXt(depth::Integer; pretrain = false, cardinality = 32, base_width = 4, - inchannels = 3, nclasses = 1000) +function SEResNeXt(depth::Integer; pretrain::Bool = false, cardinality = 32, base_width = 4, + inchannels::Integer = 3, nclasses::Integer = 1000) _checkconfig(depth, sort(collect(keys(RESNET_CONFIGS)))[3:end]) layers = resnet(RESNET_CONFIGS[depth]...; inchannels, nclasses, cardinality, base_width, attn_fn = squeeze_excite) diff --git a/src/convnets/squeezenet.jl b/src/convnets/squeezenet.jl index abcdd63f8..3ee6653bc 100644 --- a/src/convnets/squeezenet.jl +++ b/src/convnets/squeezenet.jl @@ -1,5 +1,6 @@ """ - fire(inplanes, squeeze_planes, expand1x1_planes, expand3x3_planes) + fire(inplanes::Integer, squeeze_planes::Integer, expand1x1_planes::Integer, + expand3x3_planes::Integer) Create a fire module ([reference](https://arxiv.org/abs/1602.07360v4)). 
@@ -11,7 +12,8 @@ Create a fire module - `expand1x1_planes`: number of output feature maps for the 1x1 expansion convolution - `expand3x3_planes`: number of output feature maps for the 3x3 expansion convolution """ -function fire(inplanes, squeeze_planes, expand1x1_planes, expand3x3_planes) +function fire(inplanes::Integer, squeeze_planes::Integer, expand1x1_planes::Integer, + expand3x3_planes::Integer) branch_1 = Conv((1, 1), inplanes => squeeze_planes, relu) branch_2 = Conv((1, 1), squeeze_planes => expand1x1_planes, relu) branch_3 = Conv((3, 3), squeeze_planes => expand3x3_planes, relu; pad = 1) @@ -19,13 +21,18 @@ function fire(inplanes, squeeze_planes, expand1x1_planes, expand3x3_planes) end """ - squeezenet() + squeezenet(; inchannels::Integer = 3, nclasses::Integer = 1000) Create a SqueezeNet ([reference](https://arxiv.org/abs/1602.07360v4)). + +# Arguments + + - `inchannels`: number of input channels. + - `nclasses`: the number of output classes. """ -function squeezenet() - return Chain(Chain(Conv((3, 3), 3 => 64, relu; stride = 2), +function squeezenet(; inchannels::Integer = 3, nclasses::Integer = 1000) + return Chain(Chain(Conv((3, 3), inchannels => 64, relu; stride = 2), MaxPool((3, 3); stride = 2), fire(64, 16, 64, 64), fire(128, 16, 64, 64), @@ -38,17 +45,23 @@ function squeezenet() fire(384, 64, 256, 256), fire(512, 64, 256, 256), Dropout(0.5), - Conv((1, 1), 512 => 1000, relu)), + Conv((1, 1), 512 => nclasses, relu)), AdaptiveMeanPool((1, 1)), MLUtils.flatten) end """ - SqueezeNet(; pretrain = false) + SqueezeNet(; pretrain::Bool = false, inchannels::Integer = 3, + nclasses::Integer = 1000) Create a SqueezeNet ([reference](https://arxiv.org/abs/1602.07360v4)). -Set `pretrain=true` to load the model with pre-trained weights for ImageNet. + +# Arguments + + - `pretrain`: set to `true` to load the pre-trained weights for ImageNet + - `inchannels`: number of input channels. + - `nclasses`: the number of output classes. !!! 
warning @@ -61,8 +74,9 @@ struct SqueezeNet end @functor SqueezeNet -function SqueezeNet(; pretrain = false) - layers = squeezenet() +function SqueezeNet(; pretrain::Bool = false, inchannels::Integer = 3, + nclasses::Integer = 1000) + layers = squeezenet(; inchannels, nclasses) if pretrain loadpretrain!(layers, "SqueezeNet") end diff --git a/src/convnets/vgg.jl b/src/convnets/vgg.jl index ccfdd2cff..0b6026eb8 100644 --- a/src/convnets/vgg.jl +++ b/src/convnets/vgg.jl @@ -11,7 +11,7 @@ A VGG block of convolution layers - `depth`: number of convolution/convolution + batch norm layers - `batchnorm`: set to `true` to include batch normalization after each convolution """ -function vgg_block(ifilters, ofilters, depth, batchnorm) +function vgg_block(ifilters::Integer, ofilters::Integer, depth::Integer, batchnorm::Bool) k = (3, 3) p = (1, 1) layers = [] @@ -40,7 +40,8 @@ Create VGG convolution layers - `batchnorm`: set to `true` to include batch normalization after each convolution - `inchannels`: number of input channels """ -function vgg_convolutional_layers(config, batchnorm, inchannels) +function vgg_convolutional_layers(config::Vector{<:Tuple}, batchnorm::Bool, + inchannels::Integer) layers = [] ifilters = inchannels for c in config @@ -65,7 +66,8 @@ Create VGG classifier (fully connected) layers - `fcsize`: input and output size of the intermediate fully connected layer - `dropout_rate`: the dropout level between each fully connected layer """ -function vgg_classifier_layers(imsize, nclasses, fcsize, dropout_rate) +function vgg_classifier_layers(imsize::NTuple{3, <:Integer}, nclasses::Integer, + fcsize::Integer, dropout_rate) return Chain(MLUtils.flatten, Dense(Int(prod(imsize)), fcsize, relu), Dropout(dropout_rate), @@ -92,7 +94,8 @@ Create a VGG model (see [`Metalhead.vgg_classifier_layers`](#)) - `dropout_rate`: dropout level between fully connected layers """ -function vgg(imsize; config, inchannels, batchnorm = false, nclasses, fcsize, dropout_rate) +function vgg(imsize::Dims{2}; config, batchnorm::Bool = false, fcsize::Integer = 4096, + dropout_rate = 0.0, inchannels::Integer = 3, nclasses::Integer = 1000) conv = vgg_convolutional_layers(config, batchnorm, inchannels) imsize = outputsize(conv, (imsize..., inchannels); padbatch = true)[1:3] class = vgg_classifier_layers(imsize, nclasses, fcsize, dropout_rate) @@ -109,10 +112,6 @@ const VGG_CONFIGS = Dict(11 => :A, 16 => :D, 19 => :E) -struct VGG - layers::Any -end - """ VGG(imsize::Dims{2}; config, inchannels, batchnorm = false, nclasses, fcsize, dropout_rate) @@ -120,46 +119,53 @@ Construct a VGG model with the specified input image size. Typically, the image ## Keyword Arguments: - - `config` : VGG convolutional block configuration. It is defined as a vector of tuples `(output_channels, num_convolutions)` for each block - - `inchannels`::Integer : number of input channels - - `batchnorm`::Bool : set to `true` to use batch normalization after each convolution - - `nclasses`::Integer : number of output classes + - `config` : VGG convolutional block configuration. 
It is defined as a vector of tuples + `(output_channels, num_convolutions)` for each block + - `inchannels`: number of input channels + - `batchnorm`: set to `true` to use batch normalization after each convolution + - `nclasses`: number of output classes - `fcsize`: intermediate fully connected layer size (see [`Metalhead.vgg_classifier_layers`](#)) - `dropout_rate`: dropout level between fully connected layers """ -function VGG(imsize::Dims{2}; config, inchannels, batchnorm = false, nclasses, fcsize, - dropout_rate) - layers = vgg(imsize; config, inchannels, batchnorm, nclasses, fcsize, dropout_rate) - return VGG(layers) +struct VGG + layers::Any end - @functor VGG +function VGG(imsize::Dims{2}; config, batchnorm::Bool = false, dropout_rate = 0.5, + inchannels::Integer = 3, nclasses::Integer = 1000) + layers = vgg(imsize; config, inchannels, batchnorm, nclasses, dropout_rate) + return VGG(layers) +end + (m::VGG)(x) = m.layers(x) backbone(m::VGG) = m.layers[1] classifier(m::VGG) = m.layers[2] """ - VGG(depth::Integer = 16; pretrain = false, batchnorm = false) + VGG(depth::Integer; pretrain::Bool = false, batchnorm::Bool = false, + inchannels::Integer = 3, nclasses::Integer = 1000) -Create a VGG style model with specified `depth`. Available values include (11, 13, 16, 19). +Create a VGG style model with specified `depth`. ([reference](https://arxiv.org/abs/1409.1556v6)). -See also [`VGG`](#). # Arguments + - `depth`: the depth of the VGG model. Must be one of [11, 13, 16, 19]. - `pretrain`: set to `true` to load pre-trained model weights for ImageNet + - `batchnorm`: set to `true` to use batch normalization after each convolution + - `inchannels`: number of input channels + - `nclasses`: number of output classes + +See also [`vgg`](#). """ -function VGG(depth::Integer = 16; pretrain = false, batchnorm = false, nclasses = 1000) +function VGG(depth::Integer; pretrain::Bool = false, batchnorm::Bool = false, + inchannels::Integer = 3, nclasses::Integer = 1000) _checkconfig(depth, keys(VGG_CONFIGS)) - model = VGG((224, 224); config = VGG_CONV_CONFIGS[VGG_CONFIGS[depth]], - inchannels = 3, - batchnorm = batchnorm, - nclasses = nclasses, - fcsize = 4096, - dropout_rate = 0.5) + model = VGG((224, 224); config = VGG_CONV_CONFIGS[VGG_CONFIGS[depth]], batchnorm, + inchannels, nclasses) if pretrain && !batchnorm loadpretrain!(model, string("vgg", depth)) elseif pretrain diff --git a/src/layers/conv.jl b/src/layers/conv.jl index 557db23a7..75b40708c 100644 --- a/src/layers/conv.jl +++ b/src/layers/conv.jl @@ -23,9 +23,9 @@ Create a convolution + batch normalization pair with activation. - `groups`: groups for the convolution kernel - `bias`, `weight`, `init`: initialization for the convolution kernel (see [`Flux.Conv`](#)) """ -function conv_norm(kernel_size, inplanes::Int, outplanes::Int, activation = relu; - norm_layer = BatchNorm, revnorm = false, preact = false, use_norm = true, - kwargs...) +function conv_norm(kernel_size, inplanes::Integer, outplanes::Integer, activation = relu; + norm_layer = BatchNorm, revnorm::Bool = false, preact::Bool = false, + use_norm::Bool = true, kwargs...) 
if !use_norm if (preact || revnorm) throw(ArgumentError("`preact` only supported with `use_norm = true`")) @@ -60,8 +60,8 @@ end """ depthwise_sep_conv_norm(kernel_size, inplanes, outplanes, activation = relu; - revnorm = false, use_norm = (true, true), - stride = 1, pad = 0, dilation = 1, [bias, weight, init]) + revnorm = false, use_norm = (true, true), + stride = 1, pad = 0, dilation = 1, [bias, weight, init]) Create a depthwise separable convolution chain as used in MobileNetv1. This is sequence of layers: @@ -86,9 +86,11 @@ See Fig. 3 in [reference](https://arxiv.org/abs/1704.04861v1). - `dilation`: dilation of the first convolution kernel - `bias`, `weight`, `init`: initialization for the convolution kernel (see [`Flux.Conv`](#)) """ -function depthwise_sep_conv_norm(kernel_size, inplanes, outplanes, activation = relu; - norm_layer = BatchNorm, revnorm = false, - use_norm = (true, true), stride = 1, kwargs...) +function depthwise_sep_conv_norm(kernel_size, inplanes::Integer, outplanes::Integer, + activation = relu; norm_layer = BatchNorm, + revnorm::Bool = false, + use_norm::NTuple{2, Bool} = (true, true), + stride::Integer = 1, kwargs...) return vcat(conv_norm(kernel_size, inplanes, inplanes, activation; norm_layer, revnorm, use_norm = use_norm[1], stride, groups = inplanes, kwargs...), diff --git a/src/layers/drop.jl b/src/layers/drop.jl index b4a882cff..f823d5c22 100644 --- a/src/layers/drop.jl +++ b/src/layers/drop.jl @@ -1,5 +1,6 @@ # Generates the mask to be used for `DropBlock` -@inline function _dropblock_mask(rng, x, gamma, clipped_block_size) +@inline function _dropblock_mask(rng, x::AbstractArray{T, 4}, gamma, + clipped_block_size::Integer) where {T} block_mask = rand_like(rng, x) block_mask .= block_mask .< gamma return 1 .- maxpool(block_mask, (clipped_block_size, clipped_block_size); @@ -28,8 +29,8 @@ If you are an end-user, you do not want this function. Use [`DropBlock`](#) inst """ # TODO add experimental `DropBlock` options from timm such as gaussian noise and # more precise `DropBlock` to deal with edges (#188) -function dropblock(rng::AbstractRNG, x::AbstractArray{T, 4}, drop_block_prob, block_size, - gamma_scale) where {T} +function dropblock(rng::AbstractRNG, x::AbstractArray{T, 4}, drop_block_prob, + block_size::Integer, gamma_scale) where {T} H, W, _, _ = size(x) total_size = H * W clipped_block_size = min(block_size, min(H, W)) @@ -100,7 +101,7 @@ size `block_size` in the input. During inference, it simply returns the input `x - `rng`: can be used to pass in a custom RNG instead of the default. Custom RNGs are only supported on the CPU. """ -function DropBlock(drop_block_prob = 0.1, block_size = 7, gamma_scale = 1.0, +function DropBlock(drop_block_prob = 0.1, block_size::Integer = 7, gamma_scale = 1.0, rng = rng_from_array()) if drop_block_prob == 0.0 return identity diff --git a/src/layers/embeddings.jl b/src/layers/embeddings.jl index 3e85f18d9..560ac074d 100644 --- a/src/layers/embeddings.jl +++ b/src/layers/embeddings.jl @@ -1,7 +1,7 @@ _flatten_spatial(x) = permutedims(reshape(x, (:, size(x, 3), size(x, 4))), (2, 1, 3)) """ - PatchEmbedding(imsize::Dims{2} = (224, 224); inchannels = 3, + PatchEmbedding(imsize::Dims{2} = (224, 224); inchannels::Integer = 3, patch_size::Dims{2} = (16, 16), embedplanes = 768, norm_layer = planes -> identity, flatten = true) @@ -19,8 +19,8 @@ patches. 
- `flatten`: set true to flatten the input spatial dimensions after the embedding """ function PatchEmbedding(imsize::Dims{2} = (224, 224); inchannels::Integer = 3, - patch_size::Dims{2} = (16, 16), embedplanes = 768, - norm_layer = planes -> identity, flatten = true) + patch_size::Dims{2} = (16, 16), embedplanes::Integer = 768, + norm_layer = planes -> identity, flatten::Bool = true) im_height, im_width = imsize patch_height, patch_width = patch_size @@ -33,13 +33,15 @@ function PatchEmbedding(imsize::Dims{2} = (224, 224); inchannels::Integer = 3, end """ - ViPosEmbedding(embedsize::Integer, npatches::Integer; init = (dims::Dims{2}) -> rand(Float32, dims)) + ViPosEmbedding(embedsize::Integer, npatches::Integer; + init = (dims::Dims{2}) -> rand(Float32, dims)) Positional embedding layer used by many vision transformer-like models. """ struct ViPosEmbedding{T} vectors::T end +@functor ViPosEmbedding function ViPosEmbedding(embedsize::Integer, npatches::Integer; init = (dims::Dims{2}) -> rand(Float32, dims)) @@ -48,22 +50,20 @@ end (p::ViPosEmbedding)(x) = x .+ p.vectors -@functor ViPosEmbedding - """ - ClassTokens(dim; init = Flux.zeros32) + ClassTokens(planes::Integer; init = Flux.zeros32) -Appends class tokens to an input with embedding dimension `dim` for use in many vision transformer models. +Appends class tokens to an input with embedding dimension `planes` for use in many +vision transformer models. """ struct ClassTokens{T} token::T end +@functor ClassTokens -ClassTokens(dim::Integer; init = Flux.zeros32) = ClassTokens(init(dim, 1, 1)) +ClassTokens(planes::Integer; init = Flux.zeros32) = ClassTokens(init(planes, 1, 1)) function (m::ClassTokens)(x::AbstractArray{T, 3}) where {T} tokens = m.token .* MLUtils.ones_like(x, T, (1, 1, size(x, 3))) return hcat(tokens, x) end - -@functor ClassTokens diff --git a/src/layers/mlp.jl b/src/layers/mlp.jl index a3bdb0fb5..3a1c27413 100644 --- a/src/layers/mlp.jl +++ b/src/layers/mlp.jl @@ -47,8 +47,9 @@ end gated_mlp_block(::typeof(identity), args...; kwargs...) = mlp_block(args...; kwargs...) """ - create_classifier(inplanes, nclasses; pool_layer = AdaptiveMeanPool((1, 1)), - dropout_rate = 0.0, use_conv = false) + create_classifier(inplanes::Integer, nclasses::Integer; + pool_layer = AdaptiveMeanPool((1, 1)), + dropout_rate = 0.0, use_conv::Bool = false) Creates a classifier head to be used for models. @@ -61,8 +62,9 @@ Creates a classifier head to be used for models. - `dropout_rate`: dropout rate used in the classifier head. - `use_conv`: whether to use a 1x1 convolutional layer instead of a `Dense` layer. """ -function create_classifier(inplanes, nclasses; pool_layer = AdaptiveMeanPool((1, 1)), - dropout_rate = 0.0, use_conv = false) +function create_classifier(inplanes::Integer, nclasses::Integer; + pool_layer = AdaptiveMeanPool((1, 1)), + dropout_rate = 0.0, use_conv::Bool = false) # Pooling if pool_layer === identity @assert use_conv diff --git a/src/layers/pool.jl b/src/layers/pool.jl index 1962ab0fb..049c06451 100644 --- a/src/layers/pool.jl +++ b/src/layers/pool.jl @@ -1,5 +1,6 @@ """ - AdaptiveMeanMaxPool(output_size = (1, 1); connection = +) + AdaptiveMeanMaxPool(connection = +, output_size::Tuple = (1, 1)) + AdaptiveMeanMaxPool(output_size::Tuple = (1, 1)) A type of adaptive pooling layer which uses both mean and max pooling and combines them to produce a single output. Note that this is equivalent to @@ -10,7 +11,7 @@ produce a single output. Note that this is equivalent to - `output_size`: The size of the output after pooling. 
- `connection`: The connection type to use. """ -function AdaptiveMeanMaxPool(connection, output_size = (1, 1)) +function AdaptiveMeanMaxPool(connection, output_size::Tuple = (1, 1)) return Parallel(connection, AdaptiveMeanPool(output_size), AdaptiveMaxPool(output_size)) end AdaptiveMeanMaxPool(output_size::Tuple = (1, 1)) = AdaptiveMeanMaxPool(+, output_size) diff --git a/src/layers/scale.jl b/src/layers/scale.jl index 965b50f38..f3a555b76 100644 --- a/src/layers/scale.jl +++ b/src/layers/scale.jl @@ -9,7 +9,7 @@ _input_scale(λ, activation, x) = activation.(λ .* x) _input_scale(λ, ::typeof(identity), x) = λ .* x """ - LayerScale(λ, planes::Integer) + LayerScale(planes::Integer, λ) Creates a `Flux.Scale` layer that performs "`LayerScale`" ([reference](https://arxiv.org/abs/2103.17239)). diff --git a/src/layers/selayers.jl b/src/layers/selayers.jl index db0f3715d..0756225ba 100644 --- a/src/layers/selayers.jl +++ b/src/layers/selayers.jl @@ -15,9 +15,9 @@ Creates a squeeze-and-excitation layer used in MobileNets and SE-Nets. - `norm_layer`: The normalization layer to be used after the convolution layers - `rd_planes`: The number of hidden feature maps in a squeeze and excite layer """ -function squeeze_excite(inplanes; reduction = 16, rd_divisor = 8, - activation = relu, gate_activation = sigmoid, - norm_layer = planes -> identity, +function squeeze_excite(inplanes::Integer; reduction::Integer = 16, + rd_divisor::Integer = 8, activation = relu, + gate_activation = sigmoid, norm_layer = planes -> identity, rd_planes = _round_channels(inplanes ÷ reduction, rd_divisor, 0)) layers = [AdaptiveMeanPool((1, 1)), Conv((1, 1), inplanes => rd_planes), @@ -40,7 +40,7 @@ Effective squeeze-and-excitation layer. - `inplanes`: The number of input feature maps - `gate_activation`: The activation function for the gate layer """ -function effective_squeeze_excite(inplanes; gate_activation = sigmoid, kwargs...) +function effective_squeeze_excite(inplanes::Integer; gate_activation = sigmoid) return SkipConnection(Chain(AdaptiveMeanPool((1, 1)), Conv((1, 1), inplanes, inplanes), gate_activation), .*) diff --git a/src/mixers/core.jl b/src/mixers/core.jl index 9f9d3b305..18f66aaa8 100644 --- a/src/mixers/core.jl +++ b/src/mixers/core.jl @@ -1,7 +1,7 @@ """ - mlpmixer(block, imsize::Dims{2} = (224, 224); inchannels = 3, norm_layer = LayerNorm, + mlpmixer(block, imsize::Dims{2} = (224, 224); inchannels::Integer = 3, norm_layer = LayerNorm, patch_size::Dims{2} = (16, 16), embedplanes = 512, drop_path_rate = 0., - depth = 12, nclasses = 1000, kwargs...) + depth = 12, nclasses::Integer = 1000, kwargs...) Creates a model with the MLPMixer architecture. ([reference](https://arxiv.org/pdf/2105.01601)). @@ -21,10 +21,9 @@ Creates a model with the MLPMixer architecture. - `kwargs`: additional arguments (if any) to pass to the mixer block. Will use the defaults if not specified. """ -function mlpmixer(block, imsize::Dims{2} = (224, 224); inchannels = 3, - norm_layer = LayerNorm, patch_size::Dims{2} = (16, 16), - embedplanes = 512, drop_path_rate = 0.0, - depth = 12, nclasses = 1000, kwargs...) +function mlpmixer(block, imsize::Dims{2} = (224, 224); norm_layer = LayerNorm, + patch_size::Dims{2} = (16, 16), embedplanes = 512, drop_path_rate = 0.0, + depth = 12, inchannels::Integer = 3, nclasses::Integer = 1000, kwargs...) 
npatches = prod(imsize .÷ patch_size) dp_rates = linear_scheduler(drop_path_rate; depth) layers = Chain(PatchEmbedding(imsize; inchannels, patch_size, embedplanes), diff --git a/src/mixers/gmlp.jl b/src/mixers/gmlp.jl index 9ebd2dce3..df4a52b70 100644 --- a/src/mixers/gmlp.jl +++ b/src/mixers/gmlp.jl @@ -42,9 +42,9 @@ function (m::SpatialGatingUnit)(x) end """ - spatial_gating_block(planes, npatches; mlp_ratio = 4.0, mlp_layer = gated_mlp_block, - norm_layer = LayerNorm, dropout_rate = 0.0, drop_path_rate = 0.0, - activation = gelu) + spatial_gating_block(planes::Integer, npatches::Integer; mlp_ratio = 4.0, + norm_layer = LayerNorm, mlp_layer = gated_mlp_block, + dropout_rate = 0.0, drop_path_rate = 0.0, activation = gelu) Creates a feedforward block based on the gMLP model architecture described in the paper. ([reference](https://arxiv.org/abs/2105.08050)) @@ -60,10 +60,9 @@ Creates a feedforward block based on the gMLP model architecture described in th - `drop_path_rate`: Stochastic depth rate - `activation`: the activation function to use in the MLP blocks """ -function spatial_gating_block(planes, npatches; mlp_ratio = 4.0, norm_layer = LayerNorm, - mlp_layer = gated_mlp_block, dropout_rate = 0.0, - drop_path_rate = 0.0, - activation = gelu) +function spatial_gating_block(planes::Integer, npatches::Integer; mlp_ratio = 4.0, + norm_layer = LayerNorm, mlp_layer = gated_mlp_block, + dropout_rate = 0.0, drop_path_rate = 0.0, activation = gelu) channelplanes = Int(mlp_ratio * planes) sgu = inplanes -> SpatialGatingUnit(inplanes, npatches; norm_layer) return SkipConnection(Chain(norm_layer(planes), @@ -72,14 +71,9 @@ function spatial_gating_block(planes, npatches; mlp_ratio = 4.0, norm_layer = La DropPath(drop_path_rate)), +) end -struct gMLP - layers::Any -end -@functor gMLP - """ - gMLP(size::Symbol = :base; patch_size::Dims{2} = (16, 16), - imsize::Dims{2} = (224, 224), drop_path_rate = 0., nclasses = 1000) + gMLP(size::Symbol; patch_size::Dims{2} = (16, 16), imsize::Dims{2} = (224, 224), + inchannels::Integer = 3, nclasses::Integer = 1000) Creates a model with the gMLP architecture. ([reference](https://arxiv.org/abs/2105.08050)). @@ -89,18 +83,23 @@ Creates a model with the gMLP architecture. - `size`: the size of the model - one of `small`, `base`, `large` or `huge` - `patch_size`: the size of the patches - `imsize`: the size of the input image - - `drop_path_rate`: Stochastic depth rate + - `inchannels`: the number of input channels - `nclasses`: number of output classes See also [`Metalhead.mlpmixer`](#). 
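For orientation, a minimal usage sketch of the constructor documented here (illustrative only, not part of the diff; it assumes `gMLP` is exported, as the tests in this series imply):

```julia
using Metalhead

# :small is one of the size symbols looked up in MIXER_CONFIGS
m = gMLP(:small; imsize = (224, 224), patch_size = (16, 16), nclasses = 1000)
```

Note that with this change the size symbol no longer has a default, so it must always be passed explicitly.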
""" -function gMLP(size::Symbol = :base; patch_size::Dims{2} = (16, 16), - imsize::Dims{2} = (224, 224), drop_path_rate = 0.0, nclasses = 1000) +struct gMLP + layers::Any +end +@functor gMLP + +function gMLP(size::Symbol; imsize::Dims{2} = (224, 224), patch_size::Dims{2} = (16, 16), + inchannels::Integer = 3, nclasses::Integer = 1000) _checkconfig(size, keys(MIXER_CONFIGS)) depth = MIXER_CONFIGS[size][:depth] embedplanes = MIXER_CONFIGS[size][:planes] - layers = mlpmixer(spatial_gating_block, imsize; mlp_layer = gated_mlp_block, - patch_size, embedplanes, drop_path_rate, depth, nclasses) + layers = mlpmixer(spatial_gating_block, imsize; mlp_layer = gated_mlp_block, patch_size, + embedplanes, depth, inchannels, nclasses) return gMLP(layers) end diff --git a/src/mixers/mlpmixer.jl b/src/mixers/mlpmixer.jl index 7b6d4aa09..06aefbd48 100644 --- a/src/mixers/mlpmixer.jl +++ b/src/mixers/mlpmixer.jl @@ -1,6 +1,7 @@ """ - mixerblock(planes, npatches; mlp_ratio = (0.5, 4.0), mlp_layer = mlp_block, - dropout_rate = 0., drop_path_rate = 0., activation = gelu) + mixerblock(planes::Integer, npatches::Integer; mlp_layer = mlp_block, + mlp_ratio = (0.5, 4.0), dropout_rate = 0.0, drop_path_rate = 0.0, + activation = gelu) Creates a feedforward block for the MLPMixer architecture. ([reference](https://arxiv.org/pdf/2105.01601)) @@ -16,9 +17,10 @@ Creates a feedforward block for the MLPMixer architecture. - `drop_path_rate`: Stochastic depth rate - `activation`: the activation function to use in the MLP blocks """ -function mixerblock(planes, npatches; mlp_ratio = (0.5, 4.0), mlp_layer = mlp_block, - dropout_rate = 0.0, drop_path_rate = 0.0, activation = gelu) - tokenplanes, channelplanes = [Int(r * planes) for r in mlp_ratio] +function mixerblock(planes::Integer, npatches::Integer; mlp_layer = mlp_block, + mlp_ratio::NTuple{2, Number} = (0.5, 4.0), dropout_rate = 0.0, + drop_path_rate = 0.0, activation = gelu) + tokenplanes, channelplanes = Int.(planes .* mlp_ratio) return Chain(SkipConnection(Chain(LayerNorm(planes), swapdims((2, 1, 3)), mlp_layer(npatches, tokenplanes; activation, @@ -31,14 +33,9 @@ function mixerblock(planes, npatches; mlp_ratio = (0.5, 4.0), mlp_layer = mlp_bl DropPath(drop_path_rate)), +)) end -struct MLPMixer - layers::Any -end -@functor MLPMixer - """ - MLPMixer(size::Symbol = :base; patch_size::Dims{2} = (16, 16), - imsize::Dims{2} = (224, 224), drop_path_rate = 0., nclasses = 1000) +MLPMixer(size::Symbol; patch_size::Dims{2} = (16, 16), imsize::Dims{2} = (224, 224), + inchannels::Integer = 3, nclasses::Integer = 1000) Creates a model with the MLPMixer architecture. ([reference](https://arxiv.org/pdf/2105.01601)). @@ -49,17 +46,22 @@ Creates a model with the MLPMixer architecture. - `patch_size`: the size of the patches - `imsize`: the size of the input image - `drop_path_rate`: Stochastic depth rate + - `inchannels`: the number of input channels - `nclasses`: number of output classes See also [`Metalhead.mlpmixer`](#). 
""" -function MLPMixer(size::Symbol = :base; patch_size::Dims{2} = (16, 16), - imsize::Dims{2} = (224, 224), drop_path_rate = 0.0, nclasses = 1000) +struct MLPMixer + layers::Any +end +@functor MLPMixer + +function MLPMixer(size::Symbol; imsize::Dims{2} = (224, 224), patch_size::Dims{2} = (16, 16), + inchannels::Integer = 3, nclasses::Integer = 1000) _checkconfig(size, keys(MIXER_CONFIGS)) depth = MIXER_CONFIGS[size][:depth] embedplanes = MIXER_CONFIGS[size][:planes] - layers = mlpmixer(mixerblock, imsize; patch_size, embedplanes, depth, drop_path_rate, - nclasses) + layers = mlpmixer(mixerblock, imsize; patch_size, embedplanes, depth, inchannels,nclasses) return MLPMixer(layers) end diff --git a/src/mixers/resmlp.jl b/src/mixers/resmlp.jl index 17e340310..f2c9ece15 100644 --- a/src/mixers/resmlp.jl +++ b/src/mixers/resmlp.jl @@ -1,6 +1,6 @@ """ resmixerblock(planes, npatches; dropout_rate = 0., drop_path_rate = 0., mlp_ratio = 4.0, - activation = gelu, λ = 1e-4) + activation = gelu, layerscale_init = 1e-4) Creates a block for the ResMixer architecture. ([reference](https://arxiv.org/abs/2105.03404)). @@ -15,33 +15,28 @@ Creates a block for the ResMixer architecture. - `dropout_rate`: the dropout rate to use in the MLP blocks - `drop_path_rate`: Stochastic depth rate - `activation`: the activation function to use in the MLP blocks - - `λ`: initialisation constant for the LayerScale + - `layerscale_init`: initialisation constant for the LayerScale """ -function resmixerblock(planes, npatches; mlp_ratio = 4.0, mlp_layer = mlp_block, - dropout_rate = 0.0, drop_path_rate = 0.0, activation = gelu, - λ = 1e-4) +function resmixerblock(planes::Integer, npatches::Integer; mlp_layer = mlp_block, + mlp_ratio = 4.0, layerscale_init = 1e-4, dropout_rate = 0.0, + drop_path_rate = 0.0, activation = gelu) return Chain(SkipConnection(Chain(Flux.Scale(planes), swapdims((2, 1, 3)), Dense(npatches, npatches), swapdims((2, 1, 3)), - LayerScale(planes, λ), + LayerScale(planes, layerscale_init), DropPath(drop_path_rate)), +), SkipConnection(Chain(Flux.Scale(planes), mlp_layer(planes, Int(mlp_ratio * planes); dropout_rate, activation), - LayerScale(planes, λ), + LayerScale(planes, layerscale_init), DropPath(drop_path_rate)), +)) end -struct ResMLP - layers::Any -end -@functor ResMLP - """ - ResMLP(size::Symbol = :base; patch_size::Dims{2} = (16, 16), imsize::Dims{2} = (224, 224), - drop_path_rate = 0., nclasses = 1000) + ResMLP(size::Symbol; patch_size::Dims{2} = (16, 16), imsize::Dims{2} = (224, 224), + inchannels::Integer = 3, nclasses::Integer = 1000) Creates a model with the ResMLP architecture. ([reference](https://arxiv.org/abs/2105.03404)). @@ -51,18 +46,23 @@ Creates a model with the ResMLP architecture. - `size`: the size of the model - one of `small`, `base`, `large` or `huge` - `patch_size`: the size of the patches - `imsize`: the size of the input image - - `drop_path_rate`: Stochastic depth rate + - `inchannels`: the number of input channels - `nclasses`: number of output classes See also [`Metalhead.mlpmixer`](#). 
""" -function ResMLP(size::Symbol = :base; patch_size::Dims{2} = (16, 16), - imsize::Dims{2} = (224, 224), drop_path_rate = 0.0, nclasses = 1000) +struct ResMLP + layers::Any +end +@functor ResMLP + +function ResMLP(size::Symbol; imsize::Dims{2} = (224, 224), patch_size::Dims{2} = (16, 16), + inchannels::Integer = 3, nclasses::Integer = 1000) _checkconfig(size, keys(MIXER_CONFIGS)) depth = MIXER_CONFIGS[size][:depth] embedplanes = MIXER_CONFIGS[size][:planes] layers = mlpmixer(resmixerblock, imsize; mlp_ratio = 4.0, patch_size, embedplanes, - drop_path_rate, depth, nclasses) + depth, inchannels, nclasses) return ResMLP(layers) end diff --git a/src/vit-based/vit.jl b/src/vit-based/vit.jl index 1fece2191..1c049e46e 100644 --- a/src/vit-based/vit.jl +++ b/src/vit-based/vit.jl @@ -12,7 +12,8 @@ Transformer as used in the base ViT architecture. - `mlp_ratio`: ratio of MLP layers to the number of input channels - `dropout_rate`: dropout rate """ -function transformer_encoder(planes, depth, nheads; mlp_ratio = 4.0, dropout_rate = 0.0) +function transformer_encoder(planes::Integer, depth::Integer, nheads::Integer; + mlp_ratio = 4.0, dropout_rate = 0.0) layers = [Chain(SkipConnection(prenorm(planes, MHAttention(planes, nheads; attn_dropout_rate = dropout_rate, @@ -26,9 +27,9 @@ function transformer_encoder(planes, depth, nheads; mlp_ratio = 4.0, dropout_rat end """ - vit(imsize::Dims{2} = (256, 256); inchannels = 3, patch_size::Dims{2} = (16, 16), + vit(imsize::Dims{2} = (256, 256); inchannels::Integer = 3, patch_size::Dims{2} = (16, 16), embedplanes = 768, depth = 6, nheads = 16, mlp_ratio = 4.0, dropout_rate = 0.1, - emb_dropout_rate = 0.1, pool = :class, nclasses = 1000) + emb_dropout_rate = 0.1, pool = :class, nclasses::Integer = 1000) Creates a Vision Transformer (ViT) model. ([reference](https://arxiv.org/abs/2010.11929)). @@ -47,9 +48,10 @@ Creates a Vision Transformer (ViT) model. - `pool`: pooling type, either :class or :mean - `nclasses`: number of classes in the output """ -function vit(imsize::Dims{2} = (256, 256); inchannels = 3, patch_size::Dims{2} = (16, 16), - embedplanes = 768, depth = 6, nheads = 16, mlp_ratio = 4.0, dropout_rate = 0.1, - emb_dropout_rate = 0.1, pool = :class, nclasses = 1000) +function vit(imsize::Dims{2} = (256, 256); inchannels::Integer = 3, + patch_size::Dims{2} = (16, 16), embedplanes::Integer = 768, + depth::Integer = 6, nheads::Integer = 16, mlp_ratio = 4.0, dropout_rate = 0.1, + emb_dropout_rate = 0.1, pool::Symbol = :class, nclasses::Integer = 1000) @assert pool in [:class, :mean] "Pool type must be either `:class` (class token) or `:mean` (mean pooling)" npatches = prod(imsize .÷ patch_size) @@ -74,8 +76,8 @@ const VIT_CONFIGS = Dict(:tiny => (depth = 12, embedplanes = 192, nheads = 3), mlp_ratio = 64 // 13)) """ - ViT(mode::Symbol = base; imsize::Dims{2} = (256, 256), inchannels = 3, - patch_size::Dims{2} = (16, 16), pool = :class, nclasses = 1000) + ViT(mode::Symbol = base; imsize::Dims{2} = (256, 256), inchannels::Integer = 3, + patch_size::Dims{2} = (16, 16), pool = :class, nclasses::Integer = 1000) Creates a Vision Transformer (ViT) model. ([reference](https://arxiv.org/abs/2010.11929)). 
@@ -97,11 +99,11 @@ struct ViT end @functor ViT -function ViT(mode::Symbol = :base; imsize::Dims{2} = (256, 256), inchannels = 3, - patch_size::Dims{2} = (16, 16), pool = :class, nclasses = 1000) +function ViT(mode::Symbol = :base; imsize::Dims{2} = (256, 256), patch_size::Dims{2} = (16, 16), + inchannels::Integer = 3, nclasses::Integer = 1000) _checkconfig(mode, keys(VIT_CONFIGS)) kwargs = VIT_CONFIGS[mode] - layers = vit(imsize; inchannels, patch_size, nclasses, pool, kwargs...) + layers = vit(imsize; inchannels, patch_size, nclasses, kwargs...) return ViT(layers) end diff --git a/test/convnets.jl b/test/convnets.jl index 40f5ec75a..501ff1be4 100644 --- a/test/convnets.jl +++ b/test/convnets.jl @@ -258,15 +258,12 @@ end end end - @testset "ConvNeXt" verbose = true begin @testset for mode in [:small, :base, :large, :tiny, :xlarge] - @testset for drop_path_rate in [0.0, 0.5] - m = ConvNeXt(mode; drop_path_rate) - @test size(m(x_224)) == (1000, 1) - @test gradtest(m, x_224) - _gc() - end + m = ConvNeXt(mode) + @test size(m(x_224)) == (1000, 1) + @test gradtest(m, x_224) + _gc() end end diff --git a/test/mixers.jl b/test/mixers.jl index 885ff5838..51cdd736e 100644 --- a/test/mixers.jl +++ b/test/mixers.jl @@ -1,32 +1,8 @@ -@testset "MLPMixer" begin - @testset for mode in [:small, :base, :large] #:huge] - @testset for drop_path_rate in [0.0, 0.5] - m = MLPMixer(mode; drop_path_rate) - @test size(m(x_224)) == (1000, 1) - @test gradtest(m, x_224) - _gc() - end - end -end - -@testset "ResMLP" begin - @testset for mode in [:small, :base, :large] #:huge] - @testset for drop_path_rate in [0.0, 0.5] - m = ResMLP(mode; drop_path_rate) - @test size(m(x_224)) == (1000, 1) - @test gradtest(m, x_224) - _gc() - end - end -end - -@testset "gMLP" begin - @testset for mode in [:small, :base, :large] #:huge] - @testset for drop_path_rate in [0.0, 0.5] - m = gMLP(mode; drop_path_rate) - @test size(m(x_224)) == (1000, 1) - @test gradtest(m, x_224) - _gc() - end +@testset for model in [MLPMixer, ResMLP, gMLP] + @testset for mode in [:small, :base, :large] + m = model(mode) + @test size(m(x_224)) == (1000, 1) + @test gradtest(m, x_224) + _gc() end end From 061b1331c1ce9001f061068f48a2a7c6f4fc604d Mon Sep 17 00:00:00 2001 From: Abhirath Anand <74202102+theabhirath@users.noreply.github.com> Date: Mon, 1 Aug 2022 07:48:20 +0530 Subject: [PATCH 3/8] Refine `invertedresidual` --- src/convnets/efficientnet.jl | 5 +++-- src/convnets/mobilenet/mobilenetv2.jl | 2 +- src/convnets/mobilenet/mobilenetv3.jl | 2 +- src/layers/conv.jl | 19 ++++++++++++------- 4 files changed, 17 insertions(+), 11 deletions(-) diff --git a/src/convnets/efficientnet.jl b/src/convnets/efficientnet.jl index 4321e9443..71e6f8f0a 100644 --- a/src/convnets/efficientnet.jl +++ b/src/convnets/efficientnet.jl @@ -36,11 +36,12 @@ function efficientnet(scalings, block_configs; out_channels = _round_channels(scalew(o), 8) repeats = scaled(n) push!(blocks, - invertedresidual(k, in_channels, in_channels * e, out_channels, swish; + invertedresidual((k, k), in_channels, in_channels * e, out_channels, swish; stride = s, reduction = 4)) for _ in 1:(repeats - 1) push!(blocks, - invertedresidual(k, out_channels, out_channels * e, out_channels, swish; + invertedresidual((k, k), out_channels, out_channels * e, out_channels, + swish; stride = 1, reduction = 4)) end end diff --git a/src/convnets/mobilenet/mobilenetv2.jl b/src/convnets/mobilenet/mobilenetv2.jl index a97e7dda1..b97fc16ff 100644 --- a/src/convnets/mobilenet/mobilenetv2.jl +++ 
b/src/convnets/mobilenet/mobilenetv2.jl @@ -30,7 +30,7 @@ function mobilenetv2(width_mult, configs; inchannels = 3, max_width = 1280, ncla outplanes = _round_channels(c * width_mult, width_mult == 0.1 ? 4 : 8) for i in 1:n push!(layers, - invertedresidual(3, inplanes, inplanes * t, outplanes, a; + invertedresidual((3, 3), inplanes, inplanes * t, outplanes, a; stride = i == 1 ? s : 1)) inplanes = outplanes end diff --git a/src/convnets/mobilenet/mobilenetv3.jl b/src/convnets/mobilenet/mobilenetv3.jl index d8666c5f3..d6873ac57 100644 --- a/src/convnets/mobilenet/mobilenetv3.jl +++ b/src/convnets/mobilenet/mobilenetv3.jl @@ -36,7 +36,7 @@ function mobilenetv3(width_mult, configs; inchannels = 3, max_width = 1024, ncla outplanes = _round_channels(c * width_mult, 8) explanes = _round_channels(inplanes * t, 8) push!(layers, - invertedresidual(k, inplanes, explanes, outplanes, a; + invertedresidual((k, k), inplanes, explanes, outplanes, a; stride = s, reduction = r)) inplanes = outplanes end diff --git a/src/layers/conv.jl b/src/layers/conv.jl index 5610d3be2..557db23a7 100644 --- a/src/layers/conv.jl +++ b/src/layers/conv.jl @@ -114,16 +114,17 @@ Create a basic inverted residual block for MobileNet variants - `reduction`: The reduction factor for the number of hidden feature maps in a squeeze and excite layer (see [`squeeze_excite`](#)). """ -function invertedresidual(kernel_size, inplanes, hidden_planes, outplanes, - activation = relu; stride, reduction = nothing) +function invertedresidual(kernel_size, inplanes::Integer, hidden_planes::Integer, + outplanes::Integer, activation = relu; stride::Integer, + reduction::Union{Nothing, Integer} = nothing) @assert stride in [1, 2] "`stride` has to be 1 or 2" pad = @. (kernel_size - 1) ÷ 2 - conv1 = (inplanes == hidden_planes) ? identity : - Chain(conv_norm((1, 1), inplanes, hidden_planes, activation; bias = false)) + conv1 = (inplanes == hidden_planes) ? (identity,) : + conv_norm((1, 1), inplanes, hidden_planes, activation; bias = false) selayer = isnothing(reduction) ? identity : squeeze_excite(hidden_planes; reduction, activation, gate_activation = hardσ, norm_layer = BatchNorm) - invres = Chain(conv1, + invres = Chain(conv1..., conv_norm(kernel_size, hidden_planes, hidden_planes, activation; bias = false, stride, pad = pad, groups = hidden_planes)..., selayer, @@ -131,6 +132,10 @@ function invertedresidual(kernel_size, inplanes, hidden_planes, outplanes, return (stride == 1 && inplanes == outplanes) ? SkipConnection(invres, +) : invres end -function invertedresidual(kernel_size::Integer, args...; kwargs...) - return invertedresidual((kernel_size, kernel_size), args...; kwargs...) +function invertedresidual(kernel_size, inplanes::Integer, outplanes::Integer, + activation = relu; stride::Integer, expansion, + reduction::Union{Nothing, Integer} = nothing) + hidden_planes = Int(inplanes * expansion) + return invertedresidual(kernel_size, inplanes, hidden_planes, outplanes, activation; + stride, reduction) end From 4e46d7b9db108e95417a104388d981bb0a71fe92 Mon Sep 17 00:00:00 2001 From: Abhirath Anand <74202102+theabhirath@users.noreply.github.com> Date: Mon, 1 Aug 2022 11:10:48 +0530 Subject: [PATCH 4/8] Expose `inchannels` and `nclasses` for every model Also a. more type annotations b. 
Expose only configurations vital to the model API in terms of pretraining at the highest level --- .github/workflows/CI.yml | 3 +- src/convnets/alexnet.jl | 12 ++--- src/convnets/convmixer.jl | 20 +++---- src/convnets/convnext.jl | 42 ++++++++------- src/convnets/densenet.jl | 49 +++++++++-------- src/convnets/efficientnet.jl | 36 ++++++------- src/convnets/inception/googlenet.jl | 8 +-- src/convnets/inception/inceptionresnetv2.jl | 12 +++-- src/convnets/inception/inceptionv3.jl | 14 ++--- src/convnets/inception/inceptionv4.jl | 14 ++--- src/convnets/inception/xception.jl | 33 ++++++------ src/convnets/mobilenet/mobilenetv1.jl | 19 +++---- src/convnets/mobilenet/mobilenetv2.jl | 29 +++++----- src/convnets/mobilenet/mobilenetv3.jl | 29 +++++----- src/convnets/resnets/core.jl | 2 +- src/convnets/resnets/resnet.jl | 10 ++-- src/convnets/resnets/resnext.jl | 8 +-- src/convnets/resnets/seresnet.jl | 13 ++--- src/convnets/squeezenet.jl | 34 ++++++++---- src/convnets/vgg.jl | 60 +++++++++++---------- src/layers/conv.jl | 18 ++++--- src/layers/drop.jl | 9 ++-- src/layers/embeddings.jl | 22 ++++---- src/layers/mlp.jl | 10 ++-- src/layers/pool.jl | 5 +- src/layers/scale.jl | 2 +- src/layers/selayers.jl | 8 +-- src/mixers/core.jl | 11 ++-- src/mixers/gmlp.jl | 37 +++++++------ src/mixers/mlpmixer.jl | 34 ++++++------ src/mixers/resmlp.jl | 36 ++++++------- src/vit-based/vit.jl | 24 +++++---- test/convnets.jl | 11 ++-- test/mixers.jl | 36 +++---------- 34 files changed, 363 insertions(+), 347 deletions(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 8de5bd6e0..c13f1c2d6 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -34,8 +34,7 @@ jobs: - '"Inception"' - '"DenseNet"' - '["ConvNeXt", "ConvMixer"]' - - 'r"ViTs"' - - 'r"Mixers"' + - '[r"ViTs", r"Mixers"]' steps: - uses: actions/checkout@v2 - uses: julia-actions/setup-julia@v1 diff --git a/src/convnets/alexnet.jl b/src/convnets/alexnet.jl index 8ff65ffef..75ba5ad48 100644 --- a/src/convnets/alexnet.jl +++ b/src/convnets/alexnet.jl @@ -1,5 +1,5 @@ """ - alexnet(; nclasses = 1000) + alexnet(; nclasses::Integer = 1000) Create an AlexNet model ([reference](https://papers.nips.cc/paper/2012/file/c399862d3b9d6b76c8436e924a68c45b-Paper.pdf)). @@ -8,8 +8,8 @@ Create an AlexNet model - `nclasses`: the number of output classes """ -function alexnet(; nclasses = 1000) - layers = Chain(Chain(Conv((11, 11), 3 => 64, relu; stride = (4, 4), pad = (2, 2)), +function alexnet(; inchannels::Integer = 3, nclasses::Integer = 1000) + layers = Chain(Chain(Conv((11, 11), inchannels => 64, relu; stride = (4, 4), pad = (2, 2)), MaxPool((3, 3); stride = (2, 2)), Conv((5, 5), 64 => 192, relu; pad = (2, 2)), MaxPool((3, 3); stride = (2, 2)), @@ -28,7 +28,7 @@ function alexnet(; nclasses = 1000) end """ - AlexNet(; pretrain = false, nclasses = 1000) + AlexNet(; pretrain::Bool = false, nclasses::Integer = 1000) Create a `AlexNet`. See also [`alexnet`](#). 
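A minimal sketch of the updated `AlexNet` interface (illustrative, not part of the diff; AlexNet's fully connected head is sized for 224×224 inputs):

```julia
using Metalhead

m = AlexNet(; nclasses = 10)
x = rand(Float32, 224, 224, 3, 8)   # WHCN batch of eight 224×224 RGB images
size(m(x))                          # expected: (10, 8)
```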
@@ -47,8 +47,8 @@ struct AlexNet end @functor AlexNet -function AlexNet(; pretrain = false, nclasses = 1000) - layers = alexnet(; nclasses = nclasses) +function AlexNet(; pretrain::Bool = false, inchannels::Integer = 3, nclasses::Integer = 1000) + layers = alexnet(; inchannels, nclasses) if pretrain loadpretrain!(layers, "AlexNet") end diff --git a/src/convnets/convmixer.jl b/src/convnets/convmixer.jl index aa3d144d2..c75303184 100644 --- a/src/convnets/convmixer.jl +++ b/src/convnets/convmixer.jl @@ -1,6 +1,7 @@ """ - convmixer(planes, depth; inchannels = 3, kernel_size = (9, 9), patch_size::Dims{2} = 7, - activation = gelu, nclasses = 1000) + convmixer(planes::Integer, depth::Integer; kernel_size = (9, 9), + patch_size::Dims{2} = (7, 7), activation = gelu, + inchannels::Integer = 3, nclasses::Integer = 1000) Creates a ConvMixer model. ([reference](https://arxiv.org/abs/2201.09792)) @@ -9,14 +10,15 @@ Creates a ConvMixer model. - `planes`: number of planes in the output of each block - `depth`: number of layers - - `inchannels`: The number of channels in the input. - `kernel_size`: kernel size of the convolutional layers - `patch_size`: size of the patches - `activation`: activation function used after the convolutional layers + - `inchannels`: The number of channels in the input. - `nclasses`: number of classes in the output """ -function convmixer(planes, depth; inchannels = 3, kernel_size = (9, 9), - patch_size::Dims{2} = (7, 7), activation = gelu, nclasses = 1000) +function convmixer(planes::Integer, depth::Integer; kernel_size = (9, 9), + patch_size::Dims{2} = (7, 7), activation = gelu, + inchannels::Integer = 3, nclasses::Integer = 1000) stem = conv_norm(patch_size, inchannels, planes, activation; preact = true, stride = patch_size[1]) blocks = [Chain(SkipConnection(Chain(conv_norm(kernel_size, planes, planes, activation; @@ -39,7 +41,7 @@ const CONVMIXER_CONFIGS = Dict(:base => Dict(:planes => 1536, :depth => 20, :patch_size => (7, 7))) """ - ConvMixer(mode::Symbol = :base; inchannels = 3, activation = gelu, nclasses = 1000) + ConvMixer(mode::Symbol; inchannels::Integer = 3, nclasses::Integer = 1000) Creates a ConvMixer model. ([reference](https://arxiv.org/abs/2201.09792)) @@ -48,7 +50,6 @@ Creates a ConvMixer model. - `mode`: the mode of the model, either `:base`, `:small` or `:large` - `inchannels`: The number of channels in the input. 
- - `activation`: activation function used after the convolutional layers - `nclasses`: number of classes in the output """ struct ConvMixer @@ -56,14 +57,13 @@ struct ConvMixer end @functor ConvMixer -function ConvMixer(mode::Symbol = :base; inchannels = 3, activation = gelu, nclasses = 1000) +function ConvMixer(mode::Symbol; inchannels::Integer = 3, nclasses::Integer = 1000) _checkconfig(mode, keys(CONVMIXER_CONFIGS)) planes = CONVMIXER_CONFIGS[mode][:planes] depth = CONVMIXER_CONFIGS[mode][:depth] kernel_size = CONVMIXER_CONFIGS[mode][:kernel_size] patch_size = CONVMIXER_CONFIGS[mode][:patch_size] - layers = convmixer(planes, depth; inchannels, kernel_size, patch_size, activation, - nclasses) + layers = convmixer(planes, depth; inchannels, kernel_size, patch_size, nclasses) return ConvMixer(layers) end diff --git a/src/convnets/convnext.jl b/src/convnets/convnext.jl index e6ccee16a..d7c39cc04 100644 --- a/src/convnets/convnext.jl +++ b/src/convnets/convnext.jl @@ -1,5 +1,5 @@ """ - convnextblock(planes, drop_path_rate = 0., λ = 1f-6) + convnextblock(planes::Integer, drop_path_rate = 0.0, layerscale_init = 1.0f-6) Creates a single block of ConvNeXt. ([reference](https://arxiv.org/abs/2201.03545)) @@ -8,21 +8,23 @@ Creates a single block of ConvNeXt. - `planes`: number of input channels. - `drop_path_rate`: Stochastic depth rate. - - `λ`: Initial value for [`LayerScale`](#) + - `layerscale_init`: Initial value for [`LayerScale`](#) """ -function convnextblock(planes, drop_path_rate = 0.0, λ = 1.0f-6) +function convnextblock(planes::Integer, drop_path_rate = 0.0, layerscale_init = 1.0f-6) layers = SkipConnection(Chain(DepthwiseConv((7, 7), planes => planes; pad = 3), swapdims((3, 1, 2, 4)), LayerNorm(planes; ϵ = 1.0f-6), mlp_block(planes, 4 * planes), - LayerScale(planes, λ), + LayerScale(planes, layerscale_init), swapdims((2, 3, 1, 4)), DropPath(drop_path_rate)), +) return layers end """ - convnext(depths, planes; inchannels = 3, drop_path_rate = 0., λ = 1f-6, nclasses = 1000) + convnext(depths::Vector{<:Integer}, planes::Vector{<:Integer}; + drop_path_rate = 0.0, layerscale_init = 1.0f-6, inchannels::Integer = 3, + nclasses::Integer = 1000) Creates the layers for a ConvNeXt model. ([reference](https://arxiv.org/abs/2201.03545)) @@ -33,12 +35,13 @@ Creates the layers for a ConvNeXt model. - `depths`: list with configuration for depth of each block - `planes`: list with configuration for number of output channels in each block - `drop_path_rate`: Stochastic depth rate. 
- - `λ`: Initial value for [`LayerScale`](#) + - `layerscale_init`: Initial value for [`LayerScale`](#) ([reference](https://arxiv.org/abs/2103.17239)) - `nclasses`: number of output classes """ -function convnext(depths, planes; inchannels = 3, drop_path_rate = 0.0, λ = 1.0f-6, - nclasses = 1000) +function convnext(depths::Vector{<:Integer}, planes::Vector{<:Integer}; + drop_path_rate = 0.0, layerscale_init = 1.0f-6, inchannels::Integer = 3, + nclasses::Integer = 1000) @assert length(depths) == length(planes) "`planes` should have exactly one value for each block" downsample_layers = [] @@ -54,7 +57,9 @@ function convnext(depths, planes; inchannels = 3, drop_path_rate = 0.0, λ = 1.0 dp_rates = linear_scheduler(drop_path_rate; depth = sum(depths)) cur = 0 for i in eachindex(depths) - push!(stages, [convnextblock(planes[i], dp_rates[cur + j], λ) for j in 1:depths[i]]) + push!(stages, + [convnextblock(planes[i], dp_rates[cur + j], layerscale_init) + for j in 1:depths[i]]) cur += depths[i] end backbone = collect(Iterators.flatten(Iterators.flatten(zip(downsample_layers, stages)))) @@ -72,13 +77,8 @@ const CONVNEXT_CONFIGS = Dict(:tiny => ([3, 3, 9, 3], [96, 192, 384, 768]), :large => ([3, 3, 27, 3], [192, 384, 768, 1536]), :xlarge => ([3, 3, 27, 3], [256, 512, 1024, 2048])) -struct ConvNeXt - layers::Any -end -@functor ConvNeXt - """ - ConvNeXt(mode::Symbol = :base; inchannels = 3, drop_path_rate = 0., λ = 1f-6, nclasses = 1000) + ConvNeXt(mode::Symbol; inchannels::Integer = 3, nclasses::Integer = 1000) Creates a ConvNeXt model. ([reference](https://arxiv.org/abs/2201.03545)) @@ -86,16 +86,18 @@ Creates a ConvNeXt model. # Arguments - `inchannels`: The number of channels in the input. - - `drop_path_rate`: Stochastic depth rate. - - `λ`: Init value for [LayerScale](https://arxiv.org/abs/2103.17239) - `nclasses`: number of output classes See also [`Metalhead.convnext`](#). 
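A short illustration of the slimmed-down public interface (a sketch; `:tiny` is one of the `CONVNEXT_CONFIGS` keys listed above):

```julia
using Metalhead

m = ConvNeXt(:tiny; nclasses = 100)
```

With `drop_path_rate` and `layerscale_init` no longer exposed at this level, non-default values for them would be passed to `Metalhead.convnext` instead.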
""" -function ConvNeXt(mode::Symbol = :base; inchannels = 3, drop_path_rate = 0.0, λ = 1.0f-6, - nclasses = 1000) +struct ConvNeXt + layers::Any +end +@functor ConvNeXt + +function ConvNeXt(mode::Symbol; inchannels::Integer = 3, nclasses::Integer = 1000) _checkconfig(mode, keys(CONVNEXT_CONFIGS)) - layers = convnext(CONVNEXT_CONFIGS[mode]...; inchannels, drop_path_rate, λ, nclasses) + layers = convnext(CONVNEXT_CONFIGS[mode]...; inchannels, nclasses) return ConvNeXt(layers) end diff --git a/src/convnets/densenet.jl b/src/convnets/densenet.jl index 332b5551f..0b164e2ab 100644 --- a/src/convnets/densenet.jl +++ b/src/convnets/densenet.jl @@ -10,7 +10,7 @@ Create a Densenet bottleneck layer - `outplanes`: number of output feature maps on bottleneck branch (and scaling factor for inner feature maps; see ref) """ -function dense_bottleneck(inplanes, outplanes) +function dense_bottleneck(inplanes::Integer, outplanes::Integer) inner_channels = 4 * outplanes return SkipConnection(Chain(conv_norm((1, 1), inplanes, inner_channels; bias = false, revnorm = true)..., @@ -30,7 +30,7 @@ Create a DenseNet transition sequence - `inplanes`: number of input feature maps - `outplanes`: number of output feature maps """ -function transition(inplanes, outplanes) +function transition(inplanes::Integer, outplanes::Integer) return Chain(conv_norm((1, 1), inplanes, outplanes; bias = false, revnorm = true)..., MeanPool((2, 2))) end @@ -48,14 +48,14 @@ the number of output feature maps by `growth_rates` with each block - `growth_rates`: the growth (additive) rates of output feature maps after each block (a vector of `k`s from the ref) """ -function dense_block(inplanes, growth_rates) +function dense_block(inplanes::Integer, growth_rates) return [dense_bottleneck(i, o) for (i, o) in zip(inplanes .+ cumsum([0, growth_rates[1:(end - 1)]...]), growth_rates)] end """ - densenet(inplanes, growth_rates; reduction = 0.5, nclasses = 1000) + densenet(inplanes, growth_rates; reduction = 0.5, nclasses::Integer = 1000) Create a DenseNet model ([reference](https://arxiv.org/abs/1608.06993)). @@ -68,9 +68,11 @@ Create a DenseNet model - `reduction`: the factor by which the number of feature maps is scaled across each transition - `nclasses`: the number of output classes """ -function densenet(inplanes, growth_rates; reduction = 0.5, nclasses = 1000) +function densenet(inplanes::Integer, growth_rates; reduction = 0.5, inchannels::Integer = 3, + nclasses::Integer = 1000) layers = [] - append!(layers, conv_norm((7, 7), 3, inplanes; stride = 2, pad = (3, 3), bias = false)) + append!(layers, + conv_norm((7, 7), inchannels, inplanes; stride = 2, pad = (3, 3), bias = false)) push!(layers, MaxPool((3, 3); stride = 2, pad = (1, 1))) outplanes = 0 for (i, rates) in enumerate(growth_rates) @@ -88,7 +90,7 @@ function densenet(inplanes, growth_rates; reduction = 0.5, nclasses = 1000) end """ - densenet(nblocks; growth_rate = 32, reduction = 0.5, nclasses = 1000) + densenet(nblocks; growth_rate = 32, reduction = 0.5, nclasses::Integer = 1000) Create a DenseNet model ([reference](https://arxiv.org/abs/1608.06993)). 
@@ -100,15 +102,15 @@ Create a DenseNet model - `reduction`: the factor by which the number of feature maps is scaled across each transition - `nclasses`: the number of output classes """ -function densenet(nblocks::NTuple{N, <:Integer}; growth_rate = 32, reduction = 0.5, - nclasses = 1000) where {N} +function densenet(nblocks::Vector{<:Integer}; growth_rate::Integer = 32, reduction = 0.5, + inchannels::Integer = 3, nclasses::Integer = 1000) return densenet(2 * growth_rate, [fill(growth_rate, n) for n in nblocks]; - reduction = reduction, nclasses = nclasses) + reduction, inchannels, nclasses) end """ - DenseNet(nblocks::NTuple{N, <:Integer}; - growth_rate = 32, reduction = 0.5, nclasses = 1000) + DenseNet(nblocks::Vector{<:Integer}; growth_rate::Integer = 32, reduction = 0.5, + inchannels = 3, nclasses::Integer = 1000) Create a DenseNet model ([reference](https://arxiv.org/abs/1608.06993)). @@ -124,29 +126,26 @@ See also [`densenet`](#). struct DenseNet layers::Any end +@functor DenseNet -function DenseNet(nblocks::NTuple{N, <:Integer}; - growth_rate = 32, reduction = 0.5, nclasses = 1000) where {N} - layers = densenet(nblocks; growth_rate = growth_rate, - reduction = reduction, - nclasses = nclasses) +function DenseNet(nblocks::Vector{<:Integer}; growth_rate::Integer = 32, reduction = 0.5, + inchannels = 3, nclasses::Integer = 1000) + layers = densenet(nblocks; growth_rate, reduction, inchannels, nclasses) return DenseNet(layers) end -@functor DenseNet - (m::DenseNet)(x) = m.layers(x) backbone(m::DenseNet) = m.layers[1] classifier(m::DenseNet) = m.layers[2] -const DENSENET_CONFIGS = Dict(121 => (6, 12, 24, 16), - 161 => (6, 12, 36, 24), - 169 => (6, 12, 32, 32), - 201 => (6, 12, 48, 32)) +const DENSENET_CONFIGS = Dict(121 => [6, 12, 24, 16], + 161 => [6, 12, 36, 24], + 169 => [6, 12, 32, 32], + 201 => [6, 12, 48, 32]) """ - DenseNet(config::Integer = 121; pretrain = false, nclasses = 1000) + DenseNet(config::Integer = 121; pretrain::Bool = false, nclasses::Integer = 1000) DenseNet(transition_configs::NTuple{N,Integer}) Create a DenseNet model with specified configuration. Currently supported values are (121, 161, 169, 201) @@ -159,7 +158,7 @@ Set `pretrain = true` to load the model with pre-trained weights for ImageNet. See also [`Metalhead.densenet`](#). """ -function DenseNet(config::Integer = 121; pretrain = false, nclasses = 1000) +function DenseNet(config::Integer = 121; pretrain::Bool = false, nclasses::Integer = 1000) _checkconfig(config, keys(DENSENET_CONFIGS)) model = DenseNet(DENSENET_CONFIGS[config]; nclasses = nclasses) if pretrain diff --git a/src/convnets/efficientnet.jl b/src/convnets/efficientnet.jl index 71e6f8f0a..730840fa4 100644 --- a/src/convnets/efficientnet.jl +++ b/src/convnets/efficientnet.jl @@ -1,6 +1,6 @@ """ - efficientnet(scalings, block_configs; - inchannels = 3, nclasses = 1000, max_width = 1280) + efficientnet(scalings, block_configs; max_width::Integer = 1280, + inchannels::Integer = 3, nclasses::Integer = 1000) Create an EfficientNet model ([reference](https://arxiv.org/abs/1905.11946v5)). @@ -22,8 +22,8 @@ Create an EfficientNet model ([reference](https://arxiv.org/abs/1905.11946v5)). 
- `max_width`: maximum number of output channels before the fully connected classification blocks """ -function efficientnet(scalings, block_configs; - inchannels = 3, nclasses = 1000, max_width = 1280) +function efficientnet(scalings, block_configs; max_width::Integer = 1280, + inchannels::Integer = 3, nclasses::Integer = 1000) wscale, dscale = scalings scalew(w) = wscale ≈ 1 ? w : ceil(Int64, wscale * w) scaled(d) = dscale ≈ 1 ? d : ceil(Int64, dscale * d) @@ -36,12 +36,11 @@ function efficientnet(scalings, block_configs; out_channels = _round_channels(scalew(o), 8) repeats = scaled(n) push!(blocks, - invertedresidual((k, k), in_channels, in_channels * e, out_channels, swish; + invertedresidual((k, k), in_channels, out_channels, swish; expansion = e, stride = s, reduction = 4)) for _ in 1:(repeats - 1) push!(blocks, - invertedresidual((k, k), out_channels, out_channels * e, out_channels, - swish; + invertedresidual((k, k), out_channels, out_channels, swish; expansion = e, stride = 1, reduction = 4)) end end @@ -74,6 +73,7 @@ const EFFICIENTNET_BLOCK_CONFIGS = [ # w: width scaling # d: depth scaling # r: image resolution +# Data is organised as (r, (w, d)) const EFFICIENTNET_GLOBAL_CONFIGS = Dict(:b0 => (224, (1.0, 1.0)), :b1 => (240, (1.0, 1.1)), :b2 => (260, (1.1, 1.2)), @@ -84,14 +84,9 @@ const EFFICIENTNET_GLOBAL_CONFIGS = Dict(:b0 => (224, (1.0, 1.0)), :b7 => (600, (2.0, 3.1)), :b8 => (672, (2.2, 3.6))) -struct EfficientNet - layers::Any -end -@functor EfficientNet - """ - EfficientNet(scalings, block_configs; - inchannels = 3, nclasses = 1000, max_width = 1280) + EfficientNet(scalings, block_configs; max_width::Integer = 1280, + inchannels::Integer = 3, nclasses::Integer = 1000) Create an EfficientNet model ([reference](https://arxiv.org/abs/1905.11946v5)). See also [`efficientnet`](#). @@ -114,8 +109,13 @@ See also [`efficientnet`](#). - `max_width`: maximum number of output channels before the fully connected classification blocks """ -function EfficientNet(scalings, block_configs; - inchannels = 3, nclasses = 1000, max_width = 1280) +struct EfficientNet + layers::Any +end +@functor EfficientNet + +function EfficientNet(scalings, block_configs; max_width::Integer = 1280, + inchannels::Integer = 3, nclasses::Integer = 1000) layers = efficientnet(scalings, block_configs; inchannels, nclasses, max_width) return EfficientNet(layers) end @@ -126,7 +126,7 @@ backbone(m::EfficientNet) = m.layers[1] classifier(m::EfficientNet) = m.layers[2] """ - EfficientNet(name::Symbol; pretrain = false) + EfficientNet(name::Symbol; pretrain::Bool = false) Create an EfficientNet model ([reference](https://arxiv.org/abs/1905.11946v5)). See also [`efficientnet`](#). @@ -137,7 +137,7 @@ See also [`efficientnet`](#). 
(can be `:b0`, `:b1`, `:b2`, `:b3`, `:b4`, `:b5`, `:b6`, `:b7`, `:b8`) - `pretrain`: set to `true` to load the pre-trained weights for ImageNet """ -function EfficientNet(name::Symbol; pretrain = false) +function EfficientNet(name::Symbol; pretrain::Bool = false) _checkconfig(name, keys(EFFICIENTNET_GLOBAL_CONFIGS)) model = EfficientNet(EFFICIENTNET_GLOBAL_CONFIGS[name][2], EFFICIENTNET_BLOCK_CONFIGS) pretrain && loadpretrain!(model, string("efficientnet-", name)) diff --git a/src/convnets/inception/googlenet.jl b/src/convnets/inception/googlenet.jl index 8a88ca943..90f92ddfc 100644 --- a/src/convnets/inception/googlenet.jl +++ b/src/convnets/inception/googlenet.jl @@ -27,7 +27,7 @@ function _inceptionblock(inplanes, out_1x1, red_3x3, out_3x3, red_5x5, out_5x5, end """ - googlenet(; nclasses = 1000) + googlenet(; nclasses::Integer = 1000) Create an Inception-v1 model (commonly referred to as GoogLeNet) ([reference](https://arxiv.org/abs/1409.4842v1)). @@ -36,7 +36,7 @@ Create an Inception-v1 model (commonly referred to as GoogLeNet) - `nclasses`: the number of output classes """ -function googlenet(; nclasses = 1000) +function googlenet(; nclasses::Integer = 1000) layers = Chain(Chain(Conv((7, 7), 3 => 64; stride = 2, pad = 3), MaxPool((3, 3); stride = 2, pad = 1), Conv((1, 1), 64 => 64), @@ -61,7 +61,7 @@ function googlenet(; nclasses = 1000) end """ - GoogLeNet(; pretrain = false, nclasses = 1000) + GoogLeNet(; pretrain::Bool = false, nclasses::Integer = 1000) Create an Inception-v1 model (commonly referred to as `GoogLeNet`) ([reference](https://arxiv.org/abs/1409.4842v1)). @@ -82,7 +82,7 @@ struct GoogLeNet end @functor GoogLeNet -function GoogLeNet(; pretrain = false, nclasses = 1000) +function GoogLeNet(; pretrain::Bool = false, nclasses::Integer = 1000) layers = googlenet(; nclasses = nclasses) if pretrain loadpretrain!(layers, "GoogLeNet") diff --git a/src/convnets/inception/inceptionresnetv2.jl b/src/convnets/inception/inceptionresnetv2.jl index 4b4b78706..747da2fb2 100644 --- a/src/convnets/inception/inceptionresnetv2.jl +++ b/src/convnets/inception/inceptionresnetv2.jl @@ -64,7 +64,7 @@ function block8(scale = 1.0f0; activation = identity) end """ - inceptionresnetv2(; inchannels = 3, dropout_rate = 0.0, nclasses = 1000) + inceptionresnetv2(; inchannels::Integer = 3, dropout_rate = 0.0, nclasses::Integer = 1000) Creates an InceptionResNetv2 model. ([reference](https://arxiv.org/abs/1602.07261)) @@ -75,7 +75,8 @@ Creates an InceptionResNetv2 model. - `dropout_rate`: rate of dropout in classifier head. - `nclasses`: the number of output classes. """ -function inceptionresnetv2(; inchannels = 3, dropout_rate = 0.0, nclasses = 1000) +function inceptionresnetv2(; inchannels::Integer = 3, dropout_rate = 0.0, + nclasses::Integer = 1000) body = Chain(conv_norm((3, 3), inchannels, 32; stride = 2)..., conv_norm((3, 3), 32, 32)..., conv_norm((3, 3), 32, 64; pad = 1)..., @@ -97,7 +98,7 @@ function inceptionresnetv2(; inchannels = 3, dropout_rate = 0.0, nclasses = 1000 end """ - InceptionResNetv2(; pretrain = false, inchannels = 3, dropout_rate = 0.0, nclasses = 1000) + InceptionResNetv2(; pretrain::Bool = false, inchannels::Integer = 3, dropout_rate = 0.0, nclasses::Integer = 1000) Creates an InceptionResNetv2 model. 
([reference](https://arxiv.org/abs/1602.07261)) @@ -118,8 +119,9 @@ struct InceptionResNetv2 end @functor InceptionResNetv2 -function InceptionResNetv2(; pretrain = false, inchannels = 3, dropout_rate = 0.0, - nclasses = 1000) +function InceptionResNetv2(; pretrain::Bool = false, inchannels::Integer = 3, + dropout_rate = 0.0, + nclasses::Integer = 1000) layers = inceptionresnetv2(; inchannels, dropout_rate, nclasses) if pretrain loadpretrain!(layers, "InceptionResNetv2") diff --git a/src/convnets/inception/inceptionv3.jl b/src/convnets/inception/inceptionv3.jl index 68b283838..8d9977d80 100644 --- a/src/convnets/inception/inceptionv3.jl +++ b/src/convnets/inception/inceptionv3.jl @@ -127,7 +127,7 @@ function inceptionv3_e(inplanes) end """ - inceptionv3(; nclasses = 1000) + inceptionv3(; inchannels::Integer = 3, nclasses::Integer = 1000) Create an Inception-v3 model ([reference](https://arxiv.org/abs/1512.00567v3)). @@ -135,8 +135,8 @@ Create an Inception-v3 model ([reference](https://arxiv.org/abs/1512.00567v3)). - `nclasses`: the number of output classes """ -function inceptionv3(; nclasses = 1000) - layer = Chain(Chain(conv_norm((3, 3), 3, 32; stride = 2)..., +function inceptionv3(; inchannels::Integer = 3, nclasses::Integer = 1000) + layer = Chain(Chain(conv_norm((3, 3), inchannels, 32; stride = 2)..., conv_norm((3, 3), 32, 32)..., conv_norm((3, 3), 32, 64; pad = 1)..., MaxPool((3, 3); stride = 2), @@ -162,7 +162,7 @@ function inceptionv3(; nclasses = 1000) end """ - Inceptionv3(; pretrain = false, nclasses = 1000) + Inceptionv3(; pretrain::Bool = false, inchannels::Integer = 3, nclasses::Integer = 1000) Create an Inception-v3 model ([reference](https://arxiv.org/abs/1512.00567v3)). See also [`inceptionv3`](#). @@ -170,6 +170,7 @@ See also [`inceptionv3`](#). # Arguments - `pretrain`: set to `true` to load the pre-trained weights for ImageNet + - `inchannels`: number of input channels - `nclasses`: the number of output classes !!! warning @@ -180,8 +181,9 @@ struct Inceptionv3 layers::Any end -function Inceptionv3(; pretrain = false, nclasses = 1000) - layers = inceptionv3(; nclasses = nclasses) +function Inceptionv3(; pretrain::Bool = false, inchannels::Integer = 3, + nclasses::Integer = 1000) + layers = inceptionv3(; inchannels, nclasses) if pretrain loadpretrain!(layers, "Inceptionv3") end diff --git a/src/convnets/inception/inceptionv4.jl b/src/convnets/inception/inceptionv4.jl index bb03646ec..b84232fb8 100644 --- a/src/convnets/inception/inceptionv4.jl +++ b/src/convnets/inception/inceptionv4.jl @@ -82,7 +82,7 @@ function inceptionv4_c() end """ - inceptionv4(; inchannels = 3, dropout_rate = 0.0, nclasses = 1000) + inceptionv4(; inchannels::Integer = 3, dropout_rate = 0.0, nclasses::Integer = 1000) Create an Inceptionv4 model. ([reference](https://arxiv.org/abs/1602.07261)) @@ -93,7 +93,8 @@ Create an Inceptionv4 model. - `dropout_rate`: rate of dropout in classifier head. - `nclasses`: the number of output classes. 
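A hedged sketch of the split this change creates between the exported type and the lower-level builder (the `Metalhead.`-qualified call assumes the builder stays unexported, as the module-qualified cross-references elsewhere in these docstrings suggest):

```julia
using Metalhead

m = Inceptionv4(; nclasses = 100)                     # exported wrapper, no dropout_rate kwarg
layers = Metalhead.inceptionv4(; dropout_rate = 0.2)  # the builder keeps the classifier dropout knob
```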
""" -function inceptionv4(; inchannels = 3, dropout_rate = 0.0, nclasses = 1000) +function inceptionv4(; dropout_rate = 0.0, inchannels::Integer = 3, + nclasses::Integer = 1000) body = Chain(conv_norm((3, 3), inchannels, 32; stride = 2)..., conv_norm((3, 3), 32, 32)..., conv_norm((3, 3), 32, 64; pad = 1)..., @@ -122,7 +123,7 @@ function inceptionv4(; inchannels = 3, dropout_rate = 0.0, nclasses = 1000) end """ - Inceptionv4(; pretrain = false, inchannels = 3, dropout_rate = 0.0, nclasses = 1000) + Inceptionv4(; pretrain::Bool = false, inchannels::Integer = 3, nclasses::Integer = 1000) Creates an Inceptionv4 model. ([reference](https://arxiv.org/abs/1602.07261)) @@ -131,7 +132,6 @@ Creates an Inceptionv4 model. - `pretrain`: set to `true` to load the pre-trained weights for ImageNet - `inchannels`: number of input channels. - - `dropout_rate`: rate of dropout in classifier head. - `nclasses`: the number of output classes. !!! warning @@ -143,9 +143,9 @@ struct Inceptionv4 end @functor Inceptionv4 -function Inceptionv4(; pretrain = false, inchannels = 3, dropout_rate = 0.0, - nclasses = 1000) - layers = inceptionv4(; inchannels, dropout_rate, nclasses) +function Inceptionv4(; pretrain::Bool = false, inchannels::Integer = 3, + nclasses::Integer = 1000) + layers = inceptionv4(; inchannels, nclasses) if pretrain loadpretrain!(layers, "Inceptionv4") end diff --git a/src/convnets/inception/xception.jl b/src/convnets/inception/xception.jl index 3c6d8331a..8d2ad13d8 100644 --- a/src/convnets/inception/xception.jl +++ b/src/convnets/inception/xception.jl @@ -1,6 +1,7 @@ """ - xception_block(inchannels, outchannels, nrepeats; stride = 1, start_with_relu = true, - grow_at_start = true) + xception_block(inchannels::Integer, outchannels::Integer, nrepeats::Integer; + stride::Integer = 1, start_with_relu::Bool = true, + grow_at_start::Bool = true) Create an Xception block. ([reference](https://arxiv.org/abs/1610.02357)) @@ -14,9 +15,9 @@ Create an Xception block. - `start_with_relu`: if true, start the block with a ReLU activation. - `grow_at_start`: if true, increase the number of channels at the first convolution. """ -function xception_block(inchannels, outchannels, nrepeats; stride = 1, - start_with_relu = true, - grow_at_start = true) +function xception_block(inchannels::Integer, outchannels::Integer, nrepeats::Integer; + stride::Integer = 1, start_with_relu::Bool = true, + grow_at_start::Bool = true) if outchannels != inchannels || stride != 1 skip = conv_norm((1, 1), inchannels, outchannels, identity; stride = stride, bias = false) @@ -44,7 +45,7 @@ function xception_block(inchannels, outchannels, nrepeats; stride = 1, end """ - xception(; inchannels = 3, dropout_rate = 0.0, nclasses = 1000) + xception(; inchannels::Integer = 3, dropout_rate = 0.0, nclasses::Integer = 1000) Creates an Xception model. ([reference](https://arxiv.org/abs/1610.02357)) @@ -55,7 +56,7 @@ Creates an Xception model. - `dropout_rate`: rate of dropout in classifier head. - `nclasses`: the number of output classes. 
""" -function xception(; inchannels = 3, dropout_rate = 0.0, nclasses = 1000) +function xception(; dropout_rate = 0.0, inchannels::Integer = 3, nclasses::Integer = 1000) body = Chain(conv_norm((3, 3), inchannels, 32; stride = 2, bias = false)..., conv_norm((3, 3), 32, 64; bias = false)..., xception_block(64, 128, 2; stride = 2, start_with_relu = false), @@ -70,13 +71,8 @@ function xception(; inchannels = 3, dropout_rate = 0.0, nclasses = 1000) return Chain(body, head) end -struct Xception - layers::Any -end -@functor Xception - """ - Xception(; pretrain = false, inchannels = 3, dropout_rate = 0.0, nclasses = 1000) + Xception(; pretrain::Bool = false, inchannels::Integer = 3, nclasses::Integer = 1000) Creates an Xception model. ([reference](https://arxiv.org/abs/1610.02357)) @@ -85,15 +81,20 @@ Creates an Xception model. - `pretrain`: set to `true` to load the pre-trained weights for ImageNet. - `inchannels`: number of input channels. - - `dropout_rate`: rate of dropout in classifier head. - `nclasses`: the number of output classes. !!! warning `Xception` does not currently support pretrained weights. """ -function Xception(; pretrain = false, inchannels = 3, dropout_rate = 0.0, nclasses = 1000) - layers = xception(; inchannels, dropout_rate, nclasses) +struct Xception + layers::Any +end +@functor Xception + +function Xception(; pretrain::Bool = false, inchannels::Integer = 3, + nclasses::Integer = 1000) + layers = xception(; inchannels, nclasses) if pretrain loadpretrain!(layers, "xception") end diff --git a/src/convnets/mobilenet/mobilenetv1.jl b/src/convnets/mobilenet/mobilenetv1.jl index fffa93a4d..e31f8835b 100644 --- a/src/convnets/mobilenet/mobilenetv1.jl +++ b/src/convnets/mobilenet/mobilenetv1.jl @@ -1,8 +1,6 @@ """ - mobilenetv1(width_mult, config; - activation = relu, - inchannels = 3, - nclasses = 1000) + mobilenetv1(width_mult::Number, config::Vector{<:Tuple}; activation = relu, + inchannels::Integer = 3, nclasses::Integer = 1000) Create a MobileNetv1 model ([reference](https://arxiv.org/abs/1704.04861v1)). @@ -21,10 +19,8 @@ Create a MobileNetv1 model ([reference](https://arxiv.org/abs/1704.04861v1)). - `inchannels`: The number of input channels. The default value is 3. - `nclasses`: The number of output classes """ -function mobilenetv1(width_mult, config; - activation = relu, - inchannels = 3, - nclasses = 1000) +function mobilenetv1(width_mult::Number, config::Vector{<:Tuple}; activation = relu, + inchannels::Integer = 3, nclasses::Integer = 1000) layers = [] for (dw, outch, stride, nrepeats) in config outch = Int(outch * width_mult) @@ -61,7 +57,8 @@ const MOBILENETV1_CONFIGS = [ ] """ - MobileNetv1(width_mult = 1; inchannels = 3, pretrain = false, nclasses = 1000) + MobileNetv1(width_mult = 1; inchannels::Integer = 3, pretrain::Bool = false, + nclasses::Integer = 1000) Create a MobileNetv1 model with the baseline configuration ([reference](https://arxiv.org/abs/1704.04861v1)). 
@@ -83,8 +80,8 @@ struct MobileNetv1 end @functor MobileNetv1 -function MobileNetv1(width_mult::Number = 1; inchannels = 3, pretrain = false, - nclasses = 1000) +function MobileNetv1(width_mult::Number = 1; pretrain::Bool = false, + inchannels::Integer = 3, nclasses::Integer = 1000) layers = mobilenetv1(width_mult, MOBILENETV1_CONFIGS; inchannels, nclasses) if pretrain loadpretrain!(layers, string("MobileNetv1")) diff --git a/src/convnets/mobilenet/mobilenetv2.jl b/src/convnets/mobilenet/mobilenetv2.jl index b97fc16ff..9dd35e9f9 100644 --- a/src/convnets/mobilenet/mobilenetv2.jl +++ b/src/convnets/mobilenet/mobilenetv2.jl @@ -1,5 +1,7 @@ """ - mobilenetv2(width_mult, configs; inchannels = 3, max_width = 1280, nclasses = 1000) + mobilenetv2(width_mult::Number, configs::Vector{<:Tuple}; + max_width::Integer = 1280, inchannels::Integer = 3, + nclasses::Integer = 1000) Create a MobileNetv2 model. ([reference](https://arxiv.org/abs/1801.04381)). @@ -20,7 +22,9 @@ Create a MobileNetv2 model. - `max_width`: The maximum number of feature maps in any layer of the network - `nclasses`: The number of output classes """ -function mobilenetv2(width_mult, configs; inchannels = 3, max_width = 1280, nclasses = 1000) +function mobilenetv2(width_mult::Number, configs::Vector{<:Tuple}; + max_width::Integer = 1280, inchannels::Integer = 3, + nclasses::Integer = 1000) # building first layer inplanes = _round_channels(32 * width_mult, width_mult == 0.1 ? 4 : 8) layers = [] @@ -30,7 +34,7 @@ function mobilenetv2(width_mult, configs; inchannels = 3, max_width = 1280, ncla outplanes = _round_channels(c * width_mult, width_mult == 0.1 ? 4 : 8) for i in 1:n push!(layers, - invertedresidual((3, 3), inplanes, inplanes * t, outplanes, a; + invertedresidual((3, 3), inplanes, outplanes, a; expansion = t, stride = i == 1 ? s : 1)) inplanes = outplanes end @@ -57,13 +61,9 @@ const MOBILENETV2_CONFIGS = [ (6, 320, 1, 1, relu6), ] -struct MobileNetv2 - layers::Any -end -@functor MobileNetv2 - """ - MobileNetv2(width_mult = 1.0; inchannels = 3, pretrain = false, nclasses = 1000) + MobileNetv2(width_mult = 1.0; inchannels::Integer = 3, pretrain::Bool = false, + nclasses::Integer = 1000) Create a MobileNetv2 model with the specified configuration. ([reference](https://arxiv.org/abs/1801.04381)). @@ -74,14 +74,19 @@ Set `pretrain` to `true` to load the pretrained weights for ImageNet. - `width_mult`: Controls the number of output feature maps in each block (with 1.0 being the default in the paper; this is usually a value between 0.1 and 1.4) - - `inchannels`: The number of input channels. - `pretrain`: Whether to load the pre-trained weights for ImageNet + - `inchannels`: The number of input channels. - `nclasses`: The number of output classes See also [`Metalhead.mobilenetv2`](#). 
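For completeness, an illustrative call with a non-default width multiplier (a sketch; 1.4 sits at the top of the 0.1–1.4 range mentioned above):

```julia
using Metalhead

m = MobileNetv2(1.4; nclasses = 10)
```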
""" -function MobileNetv2(width_mult::Number = 1; inchannels = 3, pretrain = false, - nclasses = 1000) +struct MobileNetv2 + layers::Any +end +@functor MobileNetv2 + +function MobileNetv2(width_mult::Number = 1; pretrain::Bool = false, + inchannels::Integer = 3, nclasses::Integer = 1000) layers = mobilenetv2(width_mult, MOBILENETV2_CONFIGS; inchannels, nclasses) pretrain && loadpretrain!(layers, string("MobileNetv2")) if pretrain diff --git a/src/convnets/mobilenet/mobilenetv3.jl b/src/convnets/mobilenet/mobilenetv3.jl index d6873ac57..00c0e0139 100644 --- a/src/convnets/mobilenet/mobilenetv3.jl +++ b/src/convnets/mobilenet/mobilenetv3.jl @@ -1,5 +1,7 @@ """ - mobilenetv3(width_mult, configs; inchannels = 3, max_width = 1024, nclasses = 1000) + mobilenetv3(width_mult::Number, configs::Vector{<:Tuple}; + max_width::Integer = 1024, inchannels::Integer = 3, + nclasses::Integer = 1000) Create a MobileNetv3 model. ([reference](https://arxiv.org/abs/1905.02244)). @@ -22,7 +24,9 @@ Create a MobileNetv3 model. - `max_width`: The maximum number of feature maps in any layer of the network - `nclasses`: the number of output classes """ -function mobilenetv3(width_mult, configs; inchannels = 3, max_width = 1024, nclasses = 1000) +function mobilenetv3(width_mult::Number, configs::Vector{<:Tuple}; + max_width::Integer = 1024, inchannels::Integer = 3, + nclasses::Integer = 1000) # building first layer inplanes = _round_channels(16 * width_mult, 8) layers = [] @@ -86,13 +90,9 @@ const MOBILENETV3_CONFIGS = Dict(:small => [ (5, 6, 160, 4, hardswish, 1), ]) -struct MobileNetv3 - layers::Any -end -@functor MobileNetv3 - """ - MobileNetv3(mode::Symbol = :small, width_mult::Number = 1; inchannels = 3, pretrain = false, nclasses = 1000) + MobileNetv3(mode::Symbol = :small, width_mult::Number = 1; pretrain::Bool = false, + inchannels::Integer = 3, nclasses::Integer = 1000) Create a MobileNetv3 model with the specified configuration. ([reference](https://arxiv.org/abs/1905.02244)). @@ -104,15 +104,20 @@ Set `pretrain = true` to load the model with pre-trained weights for ImageNet. - `width_mult`: Controls the number of output feature maps in each block (with 1.0 being the default in the paper; this is usually a value between 0.1 and 1.4) - - `inchannels`: The number of channels in the input. - `pretrain`: whether to load the pre-trained weights for ImageNet + - `inchannels`: The number of channels in the input. - `nclasses`: the number of output classes See also [`Metalhead.mobilenetv3`](#). """ -function MobileNetv3(mode::Symbol = :small, width_mult::Number = 1; inchannels = 3, - pretrain = false, nclasses = 1000) - @assert mode in [:large, :small] "`mode` has to be either :large or :small" +struct MobileNetv3 + layers::Any +end +@functor MobileNetv3 + +function MobileNetv3(mode::Symbol = :small, width_mult::Number = 1; pretrain::Bool = false, + inchannels::Integer = 3, nclasses::Integer = 1000) + _checkconfig(mode, [:small, :large]) max_width = (mode == :large) ? 
1280 : 1024 layers = mobilenetv3(width_mult, MOBILENETV3_CONFIGS[mode]; inchannels, max_width, nclasses) diff --git a/src/convnets/resnets/core.jl b/src/convnets/resnets/core.jl index 329663c13..940565f3a 100644 --- a/src/convnets/resnets/core.jl +++ b/src/convnets/resnets/core.jl @@ -132,7 +132,7 @@ end # end """ - resnet_stem(; stem_type = :default, inchannels = 3, replace_stem_pool = false, + resnet_stem(; stem_type = :default, inchannels::Integer = 3, replace_stem_pool = false, norm_layer = BatchNorm, activation = relu) Builds a stem to be used in a ResNet model. See the `stem` argument of [`resnet`](#) for details diff --git a/src/convnets/resnets/resnet.jl b/src/convnets/resnets/resnet.jl index fac7e7415..9bf9cd82c 100644 --- a/src/convnets/resnets/resnet.jl +++ b/src/convnets/resnets/resnet.jl @@ -1,5 +1,5 @@ """ - ResNet(depth::Integer; pretrain = false, inchannels = 3, nclasses = 1000) + ResNet(depth::Integer; pretrain::Bool = false, inchannels::Integer = 3, nclasses::Integer = 1000) Creates a ResNet model with the specified depth. ((reference)[https://arxiv.org/abs/1512.03385]) @@ -22,7 +22,8 @@ struct ResNet end @functor ResNet -function ResNet(depth::Integer; pretrain = false, inchannels = 3, nclasses = 1000) +function ResNet(depth::Integer; pretrain::Bool = false, inchannels::Integer = 3, + nclasses::Integer = 1000) _checkconfig(depth, keys(RESNET_CONFIGS)) layers = resnet(RESNET_CONFIGS[depth]...; inchannels, nclasses) if pretrain @@ -37,7 +38,7 @@ backbone(m::ResNet) = m.layers[1] classifier(m::ResNet) = m.layers[2] """ - WideResNet(depth::Integer; pretrain = false, inchannels = 3, nclasses = 1000) + WideResNet(depth::Integer; pretrain::Bool = false, inchannels::Integer = 3, nclasses::Integer = 1000) Creates a Wide ResNet model with the specified depth. The model is the same as ResNet except for the bottleneck number of channels which is twice larger in every block. @@ -62,7 +63,8 @@ struct WideResNet end @functor WideResNet -function WideResNet(depth::Integer; pretrain = false, inchannels = 3, nclasses = 1000) +function WideResNet(depth::Integer; pretrain::Bool = false, inchannels::Integer = 3, + nclasses::Integer = 1000) _checkconfig(depth, [50, 101]) layers = resnet(RESNET_CONFIGS[depth]...; base_width = 128, inchannels, nclasses) if pretrain diff --git a/src/convnets/resnets/resnext.jl b/src/convnets/resnets/resnext.jl index 8032df5ab..29d89e3f1 100644 --- a/src/convnets/resnets/resnext.jl +++ b/src/convnets/resnets/resnext.jl @@ -1,6 +1,6 @@ """ - ResNeXt(depth::Integer; pretrain = false, cardinality = 32, - base_width = 4, inchannels = 3, nclasses = 1000) + ResNeXt(depth::Integer; pretrain::Bool = false, cardinality = 32, + base_width = 4, inchannels::Integer = 3, nclasses::Integer = 1000) Creates a ResNeXt model with the specified depth, cardinality, and base width. 
((reference)[https://arxiv.org/abs/1611.05431]) @@ -27,8 +27,8 @@ end (m::ResNeXt)(x) = m.layers(x) -function ResNeXt(depth::Integer; pretrain = false, cardinality = 32, - base_width = 4, inchannels = 3, nclasses = 1000) +function ResNeXt(depth::Integer; pretrain::Bool = false, cardinality = 32, + base_width = 4, inchannels::Integer = 3, nclasses::Integer = 1000) _checkconfig(depth, sort(collect(keys(RESNET_CONFIGS)))[3:end]) layers = resnet(RESNET_CONFIGS[depth]...; inchannels, nclasses, cardinality, base_width) if pretrain diff --git a/src/convnets/resnets/seresnet.jl b/src/convnets/resnets/seresnet.jl index 05d842173..61eee3aad 100644 --- a/src/convnets/resnets/seresnet.jl +++ b/src/convnets/resnets/seresnet.jl @@ -1,5 +1,5 @@ """ - SEResNet(depth::Integer; pretrain = false, inchannels = 3, nclasses = 1000) + SEResNet(depth::Integer; pretrain::Bool = false, inchannels::Integer = 3, nclasses::Integer = 1000) Creates a SEResNet model with the specified depth. ((reference)[https://arxiv.org/pdf/1709.01507.pdf]) @@ -24,7 +24,8 @@ end (m::SEResNet)(x) = m.layers(x) -function SEResNet(depth::Integer; pretrain = false, inchannels = 3, nclasses = 1000) +function SEResNet(depth::Integer; pretrain::Bool = false, inchannels::Integer = 3, + nclasses::Integer = 1000) _checkconfig(depth, keys(RESNET_CONFIGS)) layers = resnet(RESNET_CONFIGS[depth]...; inchannels, nclasses, attn_fn = squeeze_excite) @@ -38,8 +39,8 @@ backbone(m::SEResNet) = m.layers[1] classifier(m::SEResNet) = m.layers[2] """ - SEResNeXt(depth::Integer; pretrain = false, cardinality = 32, base_width = 4, - inchannels = 3, nclasses = 1000) + SEResNeXt(depth::Integer; pretrain::Bool = false, cardinality = 32, base_width = 4, + inchannels::Integer = 3, nclasses::Integer = 1000) Creates a SEResNeXt model with the specified depth, cardinality, and base width. ((reference)[https://arxiv.org/pdf/1709.01507.pdf]) @@ -66,8 +67,8 @@ end (m::SEResNeXt)(x) = m.layers(x) -function SEResNeXt(depth::Integer; pretrain = false, cardinality = 32, base_width = 4, - inchannels = 3, nclasses = 1000) +function SEResNeXt(depth::Integer; pretrain::Bool = false, cardinality = 32, base_width = 4, + inchannels::Integer = 3, nclasses::Integer = 1000) _checkconfig(depth, sort(collect(keys(RESNET_CONFIGS)))[3:end]) layers = resnet(RESNET_CONFIGS[depth]...; inchannels, nclasses, cardinality, base_width, attn_fn = squeeze_excite) diff --git a/src/convnets/squeezenet.jl b/src/convnets/squeezenet.jl index abcdd63f8..3ee6653bc 100644 --- a/src/convnets/squeezenet.jl +++ b/src/convnets/squeezenet.jl @@ -1,5 +1,6 @@ """ - fire(inplanes, squeeze_planes, expand1x1_planes, expand3x3_planes) + fire(inplanes::Integer, squeeze_planes::Integer, expand1x1_planes::Integer, + expand3x3_planes::Integer) Create a fire module ([reference](https://arxiv.org/abs/1602.07360v4)). 
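A quick sanity check of the channel bookkeeping (illustrative values only): the two expand
branches are concatenated along the channel dimension, so their widths add up while the
spatial size is preserved; `fire` is a non-exported helper.

    m = Metalhead.fire(64, 16, 64, 64)   # squeeze 64 -> 16, then expand to 64 + 64
    x = rand(Float32, 56, 56, 64, 1)
    size(m(x))                           # expected (56, 56, 128, 1)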
@@ -11,7 +12,8 @@ Create a fire module - `expand1x1_planes`: number of output feature maps for the 1x1 expansion convolution - `expand3x3_planes`: number of output feature maps for the 3x3 expansion convolution """ -function fire(inplanes, squeeze_planes, expand1x1_planes, expand3x3_planes) +function fire(inplanes::Integer, squeeze_planes::Integer, expand1x1_planes::Integer, + expand3x3_planes::Integer) branch_1 = Conv((1, 1), inplanes => squeeze_planes, relu) branch_2 = Conv((1, 1), squeeze_planes => expand1x1_planes, relu) branch_3 = Conv((3, 3), squeeze_planes => expand3x3_planes, relu; pad = 1) @@ -19,13 +21,18 @@ function fire(inplanes, squeeze_planes, expand1x1_planes, expand3x3_planes) end """ - squeezenet() + squeezenet(; inchannels::Integer = 3, nclasses::Integer = 1000) Create a SqueezeNet ([reference](https://arxiv.org/abs/1602.07360v4)). + +# Arguments + + - `inchannels`: number of input channels. + - `nclasses`: the number of output classes. """ -function squeezenet() - return Chain(Chain(Conv((3, 3), 3 => 64, relu; stride = 2), +function squeezenet(; inchannels::Integer = 3, nclasses::Integer = 1000) + return Chain(Chain(Conv((3, 3), inchannels => 64, relu; stride = 2), MaxPool((3, 3); stride = 2), fire(64, 16, 64, 64), fire(128, 16, 64, 64), @@ -38,17 +45,23 @@ function squeezenet() fire(384, 64, 256, 256), fire(512, 64, 256, 256), Dropout(0.5), - Conv((1, 1), 512 => 1000, relu)), + Conv((1, 1), 512 => nclasses, relu)), AdaptiveMeanPool((1, 1)), MLUtils.flatten) end """ - SqueezeNet(; pretrain = false) + SqueezeNet(; pretrain::Bool = false, inchannels::Integer = 3, + nclasses::Integer = 1000) Create a SqueezeNet ([reference](https://arxiv.org/abs/1602.07360v4)). -Set `pretrain=true` to load the model with pre-trained weights for ImageNet. + +# Arguments + + - `pretrain`: set to `true` to load the pre-trained weights for ImageNet + - `inchannels`: number of input channels. + - `nclasses`: the number of output classes. !!! 
warning @@ -61,8 +74,9 @@ struct SqueezeNet end @functor SqueezeNet -function SqueezeNet(; pretrain = false) - layers = squeezenet() +function SqueezeNet(; pretrain::Bool = false, inchannels::Integer = 3, + nclasses::Integer = 1000) + layers = squeezenet(; inchannels, nclasses) if pretrain loadpretrain!(layers, "SqueezeNet") end diff --git a/src/convnets/vgg.jl b/src/convnets/vgg.jl index ccfdd2cff..0b6026eb8 100644 --- a/src/convnets/vgg.jl +++ b/src/convnets/vgg.jl @@ -11,7 +11,7 @@ A VGG block of convolution layers - `depth`: number of convolution/convolution + batch norm layers - `batchnorm`: set to `true` to include batch normalization after each convolution """ -function vgg_block(ifilters, ofilters, depth, batchnorm) +function vgg_block(ifilters::Integer, ofilters::Integer, depth::Integer, batchnorm::Bool) k = (3, 3) p = (1, 1) layers = [] @@ -40,7 +40,8 @@ Create VGG convolution layers - `batchnorm`: set to `true` to include batch normalization after each convolution - `inchannels`: number of input channels """ -function vgg_convolutional_layers(config, batchnorm, inchannels) +function vgg_convolutional_layers(config::Vector{<:Tuple}, batchnorm::Bool, + inchannels::Integer) layers = [] ifilters = inchannels for c in config @@ -65,7 +66,8 @@ Create VGG classifier (fully connected) layers - `fcsize`: input and output size of the intermediate fully connected layer - `dropout_rate`: the dropout level between each fully connected layer """ -function vgg_classifier_layers(imsize, nclasses, fcsize, dropout_rate) +function vgg_classifier_layers(imsize::NTuple{3, <:Integer}, nclasses::Integer, + fcsize::Integer, dropout_rate) return Chain(MLUtils.flatten, Dense(Int(prod(imsize)), fcsize, relu), Dropout(dropout_rate), @@ -92,7 +94,8 @@ Create a VGG model (see [`Metalhead.vgg_classifier_layers`](#)) - `dropout_rate`: dropout level between fully connected layers """ -function vgg(imsize; config, inchannels, batchnorm = false, nclasses, fcsize, dropout_rate) +function vgg(imsize::Dims{2}; config, batchnorm::Bool = false, fcsize::Integer = 4096, + dropout_rate = 0.0, inchannels::Integer = 3, nclasses::Integer = 1000) conv = vgg_convolutional_layers(config, batchnorm, inchannels) imsize = outputsize(conv, (imsize..., inchannels); padbatch = true)[1:3] class = vgg_classifier_layers(imsize, nclasses, fcsize, dropout_rate) @@ -109,10 +112,6 @@ const VGG_CONFIGS = Dict(11 => :A, 16 => :D, 19 => :E) -struct VGG - layers::Any -end - """ VGG(imsize::Dims{2}; config, inchannels, batchnorm = false, nclasses, fcsize, dropout_rate) @@ -120,46 +119,53 @@ Construct a VGG model with the specified input image size. Typically, the image ## Keyword Arguments: - - `config` : VGG convolutional block configuration. It is defined as a vector of tuples `(output_channels, num_convolutions)` for each block - - `inchannels`::Integer : number of input channels - - `batchnorm`::Bool : set to `true` to use batch normalization after each convolution - - `nclasses`::Integer : number of output classes + - `config` : VGG convolutional block configuration. 
It is defined as a vector of tuples + `(output_channels, num_convolutions)` for each block + - `inchannels`: number of input channels + - `batchnorm`: set to `true` to use batch normalization after each convolution + - `nclasses`: number of output classes - `fcsize`: intermediate fully connected layer size (see [`Metalhead.vgg_classifier_layers`](#)) - `dropout_rate`: dropout level between fully connected layers """ -function VGG(imsize::Dims{2}; config, inchannels, batchnorm = false, nclasses, fcsize, - dropout_rate) - layers = vgg(imsize; config, inchannels, batchnorm, nclasses, fcsize, dropout_rate) - return VGG(layers) +struct VGG + layers::Any end - @functor VGG +function VGG(imsize::Dims{2}; config, batchnorm::Bool = false, dropout_rate = 0.5, + inchannels::Integer = 3, nclasses::Integer = 1000) + layers = vgg(imsize; config, inchannels, batchnorm, nclasses, dropout_rate) + return VGG(layers) +end + (m::VGG)(x) = m.layers(x) backbone(m::VGG) = m.layers[1] classifier(m::VGG) = m.layers[2] """ - VGG(depth::Integer = 16; pretrain = false, batchnorm = false) + VGG(depth::Integer; pretrain::Bool = false, batchnorm::Bool = false, + inchannels::Integer = 3, nclasses::Integer = 1000) -Create a VGG style model with specified `depth`. Available values include (11, 13, 16, 19). +Create a VGG style model with specified `depth`. ([reference](https://arxiv.org/abs/1409.1556v6)). -See also [`VGG`](#). # Arguments + - `depth`: the depth of the VGG model. Must be one of [11, 13, 16, 19]. - `pretrain`: set to `true` to load pre-trained model weights for ImageNet + - `batchnorm`: set to `true` to use batch normalization after each convolution + - `inchannels`: number of input channels + - `nclasses`: number of output classes + +See also [`vgg`](#). """ -function VGG(depth::Integer = 16; pretrain = false, batchnorm = false, nclasses = 1000) +function VGG(depth::Integer; pretrain::Bool = false, batchnorm::Bool = false, + inchannels::Integer = 3, nclasses::Integer = 1000) _checkconfig(depth, keys(VGG_CONFIGS)) - model = VGG((224, 224); config = VGG_CONV_CONFIGS[VGG_CONFIGS[depth]], - inchannels = 3, - batchnorm = batchnorm, - nclasses = nclasses, - fcsize = 4096, - dropout_rate = 0.5) + model = VGG((224, 224); config = VGG_CONV_CONFIGS[VGG_CONFIGS[depth]], batchnorm, + inchannels, nclasses) if pretrain && !batchnorm loadpretrain!(model, string("vgg", depth)) elseif pretrain diff --git a/src/layers/conv.jl b/src/layers/conv.jl index 557db23a7..75b40708c 100644 --- a/src/layers/conv.jl +++ b/src/layers/conv.jl @@ -23,9 +23,9 @@ Create a convolution + batch normalization pair with activation. - `groups`: groups for the convolution kernel - `bias`, `weight`, `init`: initialization for the convolution kernel (see [`Flux.Conv`](#)) """ -function conv_norm(kernel_size, inplanes::Int, outplanes::Int, activation = relu; - norm_layer = BatchNorm, revnorm = false, preact = false, use_norm = true, - kwargs...) +function conv_norm(kernel_size, inplanes::Integer, outplanes::Integer, activation = relu; + norm_layer = BatchNorm, revnorm::Bool = false, preact::Bool = false, + use_norm::Bool = true, kwargs...) 
if !use_norm if (preact || revnorm) throw(ArgumentError("`preact` only supported with `use_norm = true`")) @@ -60,8 +60,8 @@ end """ depthwise_sep_conv_norm(kernel_size, inplanes, outplanes, activation = relu; - revnorm = false, use_norm = (true, true), - stride = 1, pad = 0, dilation = 1, [bias, weight, init]) + revnorm = false, use_norm = (true, true), + stride = 1, pad = 0, dilation = 1, [bias, weight, init]) Create a depthwise separable convolution chain as used in MobileNetv1. This is sequence of layers: @@ -86,9 +86,11 @@ See Fig. 3 in [reference](https://arxiv.org/abs/1704.04861v1). - `dilation`: dilation of the first convolution kernel - `bias`, `weight`, `init`: initialization for the convolution kernel (see [`Flux.Conv`](#)) """ -function depthwise_sep_conv_norm(kernel_size, inplanes, outplanes, activation = relu; - norm_layer = BatchNorm, revnorm = false, - use_norm = (true, true), stride = 1, kwargs...) +function depthwise_sep_conv_norm(kernel_size, inplanes::Integer, outplanes::Integer, + activation = relu; norm_layer = BatchNorm, + revnorm::Bool = false, + use_norm::NTuple{2, Bool} = (true, true), + stride::Integer = 1, kwargs...) return vcat(conv_norm(kernel_size, inplanes, inplanes, activation; norm_layer, revnorm, use_norm = use_norm[1], stride, groups = inplanes, kwargs...), diff --git a/src/layers/drop.jl b/src/layers/drop.jl index b4a882cff..f823d5c22 100644 --- a/src/layers/drop.jl +++ b/src/layers/drop.jl @@ -1,5 +1,6 @@ # Generates the mask to be used for `DropBlock` -@inline function _dropblock_mask(rng, x, gamma, clipped_block_size) +@inline function _dropblock_mask(rng, x::AbstractArray{T, 4}, gamma, + clipped_block_size::Integer) where {T} block_mask = rand_like(rng, x) block_mask .= block_mask .< gamma return 1 .- maxpool(block_mask, (clipped_block_size, clipped_block_size); @@ -28,8 +29,8 @@ If you are an end-user, you do not want this function. Use [`DropBlock`](#) inst """ # TODO add experimental `DropBlock` options from timm such as gaussian noise and # more precise `DropBlock` to deal with edges (#188) -function dropblock(rng::AbstractRNG, x::AbstractArray{T, 4}, drop_block_prob, block_size, - gamma_scale) where {T} +function dropblock(rng::AbstractRNG, x::AbstractArray{T, 4}, drop_block_prob, + block_size::Integer, gamma_scale) where {T} H, W, _, _ = size(x) total_size = H * W clipped_block_size = min(block_size, min(H, W)) @@ -100,7 +101,7 @@ size `block_size` in the input. During inference, it simply returns the input `x - `rng`: can be used to pass in a custom RNG instead of the default. Custom RNGs are only supported on the CPU. """ -function DropBlock(drop_block_prob = 0.1, block_size = 7, gamma_scale = 1.0, +function DropBlock(drop_block_prob = 0.1, block_size::Integer = 7, gamma_scale = 1.0, rng = rng_from_array()) if drop_block_prob == 0.0 return identity diff --git a/src/layers/embeddings.jl b/src/layers/embeddings.jl index 3e85f18d9..560ac074d 100644 --- a/src/layers/embeddings.jl +++ b/src/layers/embeddings.jl @@ -1,7 +1,7 @@ _flatten_spatial(x) = permutedims(reshape(x, (:, size(x, 3), size(x, 4))), (2, 1, 3)) """ - PatchEmbedding(imsize::Dims{2} = (224, 224); inchannels = 3, + PatchEmbedding(imsize::Dims{2} = (224, 224); inchannels::Integer = 3, patch_size::Dims{2} = (16, 16), embedplanes = 768, norm_layer = planes -> identity, flatten = true) @@ -19,8 +19,8 @@ patches. 
- `flatten`: set true to flatten the input spatial dimensions after the embedding """ function PatchEmbedding(imsize::Dims{2} = (224, 224); inchannels::Integer = 3, - patch_size::Dims{2} = (16, 16), embedplanes = 768, - norm_layer = planes -> identity, flatten = true) + patch_size::Dims{2} = (16, 16), embedplanes::Integer = 768, + norm_layer = planes -> identity, flatten::Bool = true) im_height, im_width = imsize patch_height, patch_width = patch_size @@ -33,13 +33,15 @@ function PatchEmbedding(imsize::Dims{2} = (224, 224); inchannels::Integer = 3, end """ - ViPosEmbedding(embedsize::Integer, npatches::Integer; init = (dims::Dims{2}) -> rand(Float32, dims)) + ViPosEmbedding(embedsize::Integer, npatches::Integer; + init = (dims::Dims{2}) -> rand(Float32, dims)) Positional embedding layer used by many vision transformer-like models. """ struct ViPosEmbedding{T} vectors::T end +@functor ViPosEmbedding function ViPosEmbedding(embedsize::Integer, npatches::Integer; init = (dims::Dims{2}) -> rand(Float32, dims)) @@ -48,22 +50,20 @@ end (p::ViPosEmbedding)(x) = x .+ p.vectors -@functor ViPosEmbedding - """ - ClassTokens(dim; init = Flux.zeros32) + ClassTokens(planes::Integer; init = Flux.zeros32) -Appends class tokens to an input with embedding dimension `dim` for use in many vision transformer models. +Appends class tokens to an input with embedding dimension `planes` for use in many +vision transformer models. """ struct ClassTokens{T} token::T end +@functor ClassTokens -ClassTokens(dim::Integer; init = Flux.zeros32) = ClassTokens(init(dim, 1, 1)) +ClassTokens(planes::Integer; init = Flux.zeros32) = ClassTokens(init(planes, 1, 1)) function (m::ClassTokens)(x::AbstractArray{T, 3}) where {T} tokens = m.token .* MLUtils.ones_like(x, T, (1, 1, size(x, 3))) return hcat(tokens, x) end - -@functor ClassTokens diff --git a/src/layers/mlp.jl b/src/layers/mlp.jl index a3bdb0fb5..3a1c27413 100644 --- a/src/layers/mlp.jl +++ b/src/layers/mlp.jl @@ -47,8 +47,9 @@ end gated_mlp_block(::typeof(identity), args...; kwargs...) = mlp_block(args...; kwargs...) """ - create_classifier(inplanes, nclasses; pool_layer = AdaptiveMeanPool((1, 1)), - dropout_rate = 0.0, use_conv = false) + create_classifier(inplanes::Integer, nclasses::Integer; + pool_layer = AdaptiveMeanPool((1, 1)), + dropout_rate = 0.0, use_conv::Bool = false) Creates a classifier head to be used for models. @@ -61,8 +62,9 @@ Creates a classifier head to be used for models. - `dropout_rate`: dropout rate used in the classifier head. - `use_conv`: whether to use a 1x1 convolutional layer instead of a `Dense` layer. """ -function create_classifier(inplanes, nclasses; pool_layer = AdaptiveMeanPool((1, 1)), - dropout_rate = 0.0, use_conv = false) +function create_classifier(inplanes::Integer, nclasses::Integer; + pool_layer = AdaptiveMeanPool((1, 1)), + dropout_rate = 0.0, use_conv::Bool = false) # Pooling if pool_layer === identity @assert use_conv diff --git a/src/layers/pool.jl b/src/layers/pool.jl index 1962ab0fb..049c06451 100644 --- a/src/layers/pool.jl +++ b/src/layers/pool.jl @@ -1,5 +1,6 @@ """ - AdaptiveMeanMaxPool(output_size = (1, 1); connection = +) + AdaptiveMeanMaxPool(connection = +, output_size::Tuple = (1, 1)) + AdaptiveMeanMaxPool(output_size::Tuple = (1, 1)) A type of adaptive pooling layer which uses both mean and max pooling and combines them to produce a single output. Note that this is equivalent to @@ -10,7 +11,7 @@ produce a single output. Note that this is equivalent to - `output_size`: The size of the output after pooling. 
- `connection`: The connection type to use. """ -function AdaptiveMeanMaxPool(connection, output_size = (1, 1)) +function AdaptiveMeanMaxPool(connection, output_size::Tuple = (1, 1)) return Parallel(connection, AdaptiveMeanPool(output_size), AdaptiveMaxPool(output_size)) end AdaptiveMeanMaxPool(output_size::Tuple = (1, 1)) = AdaptiveMeanMaxPool(+, output_size) diff --git a/src/layers/scale.jl b/src/layers/scale.jl index 965b50f38..f3a555b76 100644 --- a/src/layers/scale.jl +++ b/src/layers/scale.jl @@ -9,7 +9,7 @@ _input_scale(λ, activation, x) = activation.(λ .* x) _input_scale(λ, ::typeof(identity), x) = λ .* x """ - LayerScale(λ, planes::Integer) + LayerScale(planes::Integer, λ) Creates a `Flux.Scale` layer that performs "`LayerScale`" ([reference](https://arxiv.org/abs/2103.17239)). diff --git a/src/layers/selayers.jl b/src/layers/selayers.jl index db0f3715d..0756225ba 100644 --- a/src/layers/selayers.jl +++ b/src/layers/selayers.jl @@ -15,9 +15,9 @@ Creates a squeeze-and-excitation layer used in MobileNets and SE-Nets. - `norm_layer`: The normalization layer to be used after the convolution layers - `rd_planes`: The number of hidden feature maps in a squeeze and excite layer """ -function squeeze_excite(inplanes; reduction = 16, rd_divisor = 8, - activation = relu, gate_activation = sigmoid, - norm_layer = planes -> identity, +function squeeze_excite(inplanes::Integer; reduction::Integer = 16, + rd_divisor::Integer = 8, activation = relu, + gate_activation = sigmoid, norm_layer = planes -> identity, rd_planes = _round_channels(inplanes ÷ reduction, rd_divisor, 0)) layers = [AdaptiveMeanPool((1, 1)), Conv((1, 1), inplanes => rd_planes), @@ -40,7 +40,7 @@ Effective squeeze-and-excitation layer. - `inplanes`: The number of input feature maps - `gate_activation`: The activation function for the gate layer """ -function effective_squeeze_excite(inplanes; gate_activation = sigmoid, kwargs...) +function effective_squeeze_excite(inplanes::Integer; gate_activation = sigmoid) return SkipConnection(Chain(AdaptiveMeanPool((1, 1)), Conv((1, 1), inplanes, inplanes), gate_activation), .*) diff --git a/src/mixers/core.jl b/src/mixers/core.jl index 9f9d3b305..18f66aaa8 100644 --- a/src/mixers/core.jl +++ b/src/mixers/core.jl @@ -1,7 +1,7 @@ """ - mlpmixer(block, imsize::Dims{2} = (224, 224); inchannels = 3, norm_layer = LayerNorm, + mlpmixer(block, imsize::Dims{2} = (224, 224); inchannels::Integer = 3, norm_layer = LayerNorm, patch_size::Dims{2} = (16, 16), embedplanes = 512, drop_path_rate = 0., - depth = 12, nclasses = 1000, kwargs...) + depth = 12, nclasses::Integer = 1000, kwargs...) Creates a model with the MLPMixer architecture. ([reference](https://arxiv.org/pdf/2105.01601)). @@ -21,10 +21,9 @@ Creates a model with the MLPMixer architecture. - `kwargs`: additional arguments (if any) to pass to the mixer block. Will use the defaults if not specified. """ -function mlpmixer(block, imsize::Dims{2} = (224, 224); inchannels = 3, - norm_layer = LayerNorm, patch_size::Dims{2} = (16, 16), - embedplanes = 512, drop_path_rate = 0.0, - depth = 12, nclasses = 1000, kwargs...) +function mlpmixer(block, imsize::Dims{2} = (224, 224); norm_layer = LayerNorm, + patch_size::Dims{2} = (16, 16), embedplanes = 512, drop_path_rate = 0.0, + depth = 12, inchannels::Integer = 3, nclasses::Integer = 1000, kwargs...) 
npatches = prod(imsize .÷ patch_size) dp_rates = linear_scheduler(drop_path_rate; depth) layers = Chain(PatchEmbedding(imsize; inchannels, patch_size, embedplanes), diff --git a/src/mixers/gmlp.jl b/src/mixers/gmlp.jl index 9ebd2dce3..df4a52b70 100644 --- a/src/mixers/gmlp.jl +++ b/src/mixers/gmlp.jl @@ -42,9 +42,9 @@ function (m::SpatialGatingUnit)(x) end """ - spatial_gating_block(planes, npatches; mlp_ratio = 4.0, mlp_layer = gated_mlp_block, - norm_layer = LayerNorm, dropout_rate = 0.0, drop_path_rate = 0.0, - activation = gelu) + spatial_gating_block(planes::Integer, npatches::Integer; mlp_ratio = 4.0, + norm_layer = LayerNorm, mlp_layer = gated_mlp_block, + dropout_rate = 0.0, drop_path_rate = 0.0, activation = gelu) Creates a feedforward block based on the gMLP model architecture described in the paper. ([reference](https://arxiv.org/abs/2105.08050)) @@ -60,10 +60,9 @@ Creates a feedforward block based on the gMLP model architecture described in th - `drop_path_rate`: Stochastic depth rate - `activation`: the activation function to use in the MLP blocks """ -function spatial_gating_block(planes, npatches; mlp_ratio = 4.0, norm_layer = LayerNorm, - mlp_layer = gated_mlp_block, dropout_rate = 0.0, - drop_path_rate = 0.0, - activation = gelu) +function spatial_gating_block(planes::Integer, npatches::Integer; mlp_ratio = 4.0, + norm_layer = LayerNorm, mlp_layer = gated_mlp_block, + dropout_rate = 0.0, drop_path_rate = 0.0, activation = gelu) channelplanes = Int(mlp_ratio * planes) sgu = inplanes -> SpatialGatingUnit(inplanes, npatches; norm_layer) return SkipConnection(Chain(norm_layer(planes), @@ -72,14 +71,9 @@ function spatial_gating_block(planes, npatches; mlp_ratio = 4.0, norm_layer = La DropPath(drop_path_rate)), +) end -struct gMLP - layers::Any -end -@functor gMLP - """ - gMLP(size::Symbol = :base; patch_size::Dims{2} = (16, 16), - imsize::Dims{2} = (224, 224), drop_path_rate = 0., nclasses = 1000) + gMLP(size::Symbol; patch_size::Dims{2} = (16, 16), imsize::Dims{2} = (224, 224), + inchannels::Integer = 3, nclasses::Integer = 1000) Creates a model with the gMLP architecture. ([reference](https://arxiv.org/abs/2105.08050)). @@ -89,18 +83,23 @@ Creates a model with the gMLP architecture. - `size`: the size of the model - one of `small`, `base`, `large` or `huge` - `patch_size`: the size of the patches - `imsize`: the size of the input image - - `drop_path_rate`: Stochastic depth rate + - `inchannels`: the number of input channels - `nclasses`: number of output classes See also [`Metalhead.mlpmixer`](#). 
""" -function gMLP(size::Symbol = :base; patch_size::Dims{2} = (16, 16), - imsize::Dims{2} = (224, 224), drop_path_rate = 0.0, nclasses = 1000) +struct gMLP + layers::Any +end +@functor gMLP + +function gMLP(size::Symbol; imsize::Dims{2} = (224, 224), patch_size::Dims{2} = (16, 16), + inchannels::Integer = 3, nclasses::Integer = 1000) _checkconfig(size, keys(MIXER_CONFIGS)) depth = MIXER_CONFIGS[size][:depth] embedplanes = MIXER_CONFIGS[size][:planes] - layers = mlpmixer(spatial_gating_block, imsize; mlp_layer = gated_mlp_block, - patch_size, embedplanes, drop_path_rate, depth, nclasses) + layers = mlpmixer(spatial_gating_block, imsize; mlp_layer = gated_mlp_block, patch_size, + embedplanes, depth, inchannels, nclasses) return gMLP(layers) end diff --git a/src/mixers/mlpmixer.jl b/src/mixers/mlpmixer.jl index 7b6d4aa09..06aefbd48 100644 --- a/src/mixers/mlpmixer.jl +++ b/src/mixers/mlpmixer.jl @@ -1,6 +1,7 @@ """ - mixerblock(planes, npatches; mlp_ratio = (0.5, 4.0), mlp_layer = mlp_block, - dropout_rate = 0., drop_path_rate = 0., activation = gelu) + mixerblock(planes::Integer, npatches::Integer; mlp_layer = mlp_block, + mlp_ratio = (0.5, 4.0), dropout_rate = 0.0, drop_path_rate = 0.0, + activation = gelu) Creates a feedforward block for the MLPMixer architecture. ([reference](https://arxiv.org/pdf/2105.01601)) @@ -16,9 +17,10 @@ Creates a feedforward block for the MLPMixer architecture. - `drop_path_rate`: Stochastic depth rate - `activation`: the activation function to use in the MLP blocks """ -function mixerblock(planes, npatches; mlp_ratio = (0.5, 4.0), mlp_layer = mlp_block, - dropout_rate = 0.0, drop_path_rate = 0.0, activation = gelu) - tokenplanes, channelplanes = [Int(r * planes) for r in mlp_ratio] +function mixerblock(planes::Integer, npatches::Integer; mlp_layer = mlp_block, + mlp_ratio::NTuple{2, Number} = (0.5, 4.0), dropout_rate = 0.0, + drop_path_rate = 0.0, activation = gelu) + tokenplanes, channelplanes = Int.(planes .* mlp_ratio) return Chain(SkipConnection(Chain(LayerNorm(planes), swapdims((2, 1, 3)), mlp_layer(npatches, tokenplanes; activation, @@ -31,14 +33,9 @@ function mixerblock(planes, npatches; mlp_ratio = (0.5, 4.0), mlp_layer = mlp_bl DropPath(drop_path_rate)), +)) end -struct MLPMixer - layers::Any -end -@functor MLPMixer - """ - MLPMixer(size::Symbol = :base; patch_size::Dims{2} = (16, 16), - imsize::Dims{2} = (224, 224), drop_path_rate = 0., nclasses = 1000) +MLPMixer(size::Symbol; patch_size::Dims{2} = (16, 16), imsize::Dims{2} = (224, 224), + inchannels::Integer = 3, nclasses::Integer = 1000) Creates a model with the MLPMixer architecture. ([reference](https://arxiv.org/pdf/2105.01601)). @@ -49,17 +46,22 @@ Creates a model with the MLPMixer architecture. - `patch_size`: the size of the patches - `imsize`: the size of the input image - `drop_path_rate`: Stochastic depth rate + - `inchannels`: the number of input channels - `nclasses`: number of output classes See also [`Metalhead.mlpmixer`](#). 
""" -function MLPMixer(size::Symbol = :base; patch_size::Dims{2} = (16, 16), - imsize::Dims{2} = (224, 224), drop_path_rate = 0.0, nclasses = 1000) +struct MLPMixer + layers::Any +end +@functor MLPMixer + +function MLPMixer(size::Symbol; imsize::Dims{2} = (224, 224), patch_size::Dims{2} = (16, 16), + inchannels::Integer = 3, nclasses::Integer = 1000) _checkconfig(size, keys(MIXER_CONFIGS)) depth = MIXER_CONFIGS[size][:depth] embedplanes = MIXER_CONFIGS[size][:planes] - layers = mlpmixer(mixerblock, imsize; patch_size, embedplanes, depth, drop_path_rate, - nclasses) + layers = mlpmixer(mixerblock, imsize; patch_size, embedplanes, depth, inchannels,nclasses) return MLPMixer(layers) end diff --git a/src/mixers/resmlp.jl b/src/mixers/resmlp.jl index 17e340310..f2c9ece15 100644 --- a/src/mixers/resmlp.jl +++ b/src/mixers/resmlp.jl @@ -1,6 +1,6 @@ """ resmixerblock(planes, npatches; dropout_rate = 0., drop_path_rate = 0., mlp_ratio = 4.0, - activation = gelu, λ = 1e-4) + activation = gelu, layerscale_init = 1e-4) Creates a block for the ResMixer architecture. ([reference](https://arxiv.org/abs/2105.03404)). @@ -15,33 +15,28 @@ Creates a block for the ResMixer architecture. - `dropout_rate`: the dropout rate to use in the MLP blocks - `drop_path_rate`: Stochastic depth rate - `activation`: the activation function to use in the MLP blocks - - `λ`: initialisation constant for the LayerScale + - `layerscale_init`: initialisation constant for the LayerScale """ -function resmixerblock(planes, npatches; mlp_ratio = 4.0, mlp_layer = mlp_block, - dropout_rate = 0.0, drop_path_rate = 0.0, activation = gelu, - λ = 1e-4) +function resmixerblock(planes::Integer, npatches::Integer; mlp_layer = mlp_block, + mlp_ratio = 4.0, layerscale_init = 1e-4, dropout_rate = 0.0, + drop_path_rate = 0.0, activation = gelu) return Chain(SkipConnection(Chain(Flux.Scale(planes), swapdims((2, 1, 3)), Dense(npatches, npatches), swapdims((2, 1, 3)), - LayerScale(planes, λ), + LayerScale(planes, layerscale_init), DropPath(drop_path_rate)), +), SkipConnection(Chain(Flux.Scale(planes), mlp_layer(planes, Int(mlp_ratio * planes); dropout_rate, activation), - LayerScale(planes, λ), + LayerScale(planes, layerscale_init), DropPath(drop_path_rate)), +)) end -struct ResMLP - layers::Any -end -@functor ResMLP - """ - ResMLP(size::Symbol = :base; patch_size::Dims{2} = (16, 16), imsize::Dims{2} = (224, 224), - drop_path_rate = 0., nclasses = 1000) + ResMLP(size::Symbol; patch_size::Dims{2} = (16, 16), imsize::Dims{2} = (224, 224), + inchannels::Integer = 3, nclasses::Integer = 1000) Creates a model with the ResMLP architecture. ([reference](https://arxiv.org/abs/2105.03404)). @@ -51,18 +46,23 @@ Creates a model with the ResMLP architecture. - `size`: the size of the model - one of `small`, `base`, `large` or `huge` - `patch_size`: the size of the patches - `imsize`: the size of the input image - - `drop_path_rate`: Stochastic depth rate + - `inchannels`: the number of input channels - `nclasses`: number of output classes See also [`Metalhead.mlpmixer`](#). 
""" -function ResMLP(size::Symbol = :base; patch_size::Dims{2} = (16, 16), - imsize::Dims{2} = (224, 224), drop_path_rate = 0.0, nclasses = 1000) +struct ResMLP + layers::Any +end +@functor ResMLP + +function ResMLP(size::Symbol; imsize::Dims{2} = (224, 224), patch_size::Dims{2} = (16, 16), + inchannels::Integer = 3, nclasses::Integer = 1000) _checkconfig(size, keys(MIXER_CONFIGS)) depth = MIXER_CONFIGS[size][:depth] embedplanes = MIXER_CONFIGS[size][:planes] layers = mlpmixer(resmixerblock, imsize; mlp_ratio = 4.0, patch_size, embedplanes, - drop_path_rate, depth, nclasses) + depth, inchannels, nclasses) return ResMLP(layers) end diff --git a/src/vit-based/vit.jl b/src/vit-based/vit.jl index 1fece2191..1c049e46e 100644 --- a/src/vit-based/vit.jl +++ b/src/vit-based/vit.jl @@ -12,7 +12,8 @@ Transformer as used in the base ViT architecture. - `mlp_ratio`: ratio of MLP layers to the number of input channels - `dropout_rate`: dropout rate """ -function transformer_encoder(planes, depth, nheads; mlp_ratio = 4.0, dropout_rate = 0.0) +function transformer_encoder(planes::Integer, depth::Integer, nheads::Integer; + mlp_ratio = 4.0, dropout_rate = 0.0) layers = [Chain(SkipConnection(prenorm(planes, MHAttention(planes, nheads; attn_dropout_rate = dropout_rate, @@ -26,9 +27,9 @@ function transformer_encoder(planes, depth, nheads; mlp_ratio = 4.0, dropout_rat end """ - vit(imsize::Dims{2} = (256, 256); inchannels = 3, patch_size::Dims{2} = (16, 16), + vit(imsize::Dims{2} = (256, 256); inchannels::Integer = 3, patch_size::Dims{2} = (16, 16), embedplanes = 768, depth = 6, nheads = 16, mlp_ratio = 4.0, dropout_rate = 0.1, - emb_dropout_rate = 0.1, pool = :class, nclasses = 1000) + emb_dropout_rate = 0.1, pool = :class, nclasses::Integer = 1000) Creates a Vision Transformer (ViT) model. ([reference](https://arxiv.org/abs/2010.11929)). @@ -47,9 +48,10 @@ Creates a Vision Transformer (ViT) model. - `pool`: pooling type, either :class or :mean - `nclasses`: number of classes in the output """ -function vit(imsize::Dims{2} = (256, 256); inchannels = 3, patch_size::Dims{2} = (16, 16), - embedplanes = 768, depth = 6, nheads = 16, mlp_ratio = 4.0, dropout_rate = 0.1, - emb_dropout_rate = 0.1, pool = :class, nclasses = 1000) +function vit(imsize::Dims{2} = (256, 256); inchannels::Integer = 3, + patch_size::Dims{2} = (16, 16), embedplanes::Integer = 768, + depth::Integer = 6, nheads::Integer = 16, mlp_ratio = 4.0, dropout_rate = 0.1, + emb_dropout_rate = 0.1, pool::Symbol = :class, nclasses::Integer = 1000) @assert pool in [:class, :mean] "Pool type must be either `:class` (class token) or `:mean` (mean pooling)" npatches = prod(imsize .÷ patch_size) @@ -74,8 +76,8 @@ const VIT_CONFIGS = Dict(:tiny => (depth = 12, embedplanes = 192, nheads = 3), mlp_ratio = 64 // 13)) """ - ViT(mode::Symbol = base; imsize::Dims{2} = (256, 256), inchannels = 3, - patch_size::Dims{2} = (16, 16), pool = :class, nclasses = 1000) + ViT(mode::Symbol = base; imsize::Dims{2} = (256, 256), inchannels::Integer = 3, + patch_size::Dims{2} = (16, 16), pool = :class, nclasses::Integer = 1000) Creates a Vision Transformer (ViT) model. ([reference](https://arxiv.org/abs/2010.11929)). 
@@ -97,11 +99,11 @@ struct ViT end @functor ViT -function ViT(mode::Symbol = :base; imsize::Dims{2} = (256, 256), inchannels = 3, - patch_size::Dims{2} = (16, 16), pool = :class, nclasses = 1000) +function ViT(mode::Symbol = :base; imsize::Dims{2} = (256, 256), patch_size::Dims{2} = (16, 16), + inchannels::Integer = 3, nclasses::Integer = 1000) _checkconfig(mode, keys(VIT_CONFIGS)) kwargs = VIT_CONFIGS[mode] - layers = vit(imsize; inchannels, patch_size, nclasses, pool, kwargs...) + layers = vit(imsize; inchannels, patch_size, nclasses, kwargs...) return ViT(layers) end diff --git a/test/convnets.jl b/test/convnets.jl index e62b14299..35a745b87 100644 --- a/test/convnets.jl +++ b/test/convnets.jl @@ -263,15 +263,12 @@ end end end - @testset "ConvNeXt" verbose = true begin @testset for mode in [:small, :base, :large, :tiny, :xlarge] - @testset for drop_path_rate in [0.0, 0.5] - m = ConvNeXt(mode; drop_path_rate) - @test size(m(x_224)) == (1000, 1) - @test gradtest(m, x_224) - _gc() - end + m = ConvNeXt(mode) + @test size(m(x_224)) == (1000, 1) + @test gradtest(m, x_224) + _gc() end end diff --git a/test/mixers.jl b/test/mixers.jl index 885ff5838..51cdd736e 100644 --- a/test/mixers.jl +++ b/test/mixers.jl @@ -1,32 +1,8 @@ -@testset "MLPMixer" begin - @testset for mode in [:small, :base, :large] #:huge] - @testset for drop_path_rate in [0.0, 0.5] - m = MLPMixer(mode; drop_path_rate) - @test size(m(x_224)) == (1000, 1) - @test gradtest(m, x_224) - _gc() - end - end -end - -@testset "ResMLP" begin - @testset for mode in [:small, :base, :large] #:huge] - @testset for drop_path_rate in [0.0, 0.5] - m = ResMLP(mode; drop_path_rate) - @test size(m(x_224)) == (1000, 1) - @test gradtest(m, x_224) - _gc() - end - end -end - -@testset "gMLP" begin - @testset for mode in [:small, :base, :large] #:huge] - @testset for drop_path_rate in [0.0, 0.5] - m = gMLP(mode; drop_path_rate) - @test size(m(x_224)) == (1000, 1) - @test gradtest(m, x_224) - _gc() - end +@testset for model in [MLPMixer, ResMLP, gMLP] + @testset for mode in [:small, :base, :large] + m = model(mode) + @test size(m(x_224)) == (1000, 1) + @test gradtest(m, x_224) + _gc() end end From 5aece44e27505d4cc8ab14efd76ad223b6a1c6ad Mon Sep 17 00:00:00 2001 From: Abhirath Anand <74202102+theabhirath@users.noreply.github.com> Date: Tue, 2 Aug 2022 10:07:51 +0530 Subject: [PATCH 5/8] Use `create_classifier` more --- src/convnets/alexnet.jl | 34 +++++++------- src/convnets/convmixer.jl | 3 +- src/convnets/convnext.jl | 9 ++-- src/convnets/densenet.jl | 5 +-- src/convnets/efficientnet.jl | 14 +++--- src/convnets/inception/googlenet.jl | 48 ++++++++++---------- src/convnets/inception/inceptionresnetv2.jl | 35 +++++++-------- src/convnets/inception/inceptionv3.jl | 43 +++++++++--------- src/convnets/inception/inceptionv4.jl | 49 ++++++++++----------- src/convnets/inception/xception.jl | 23 +++++----- src/convnets/mobilenet/mobilenetv1.jl | 6 +-- src/convnets/mobilenet/mobilenetv2.jl | 16 +++---- src/convnets/mobilenet/mobilenetv3.jl | 14 +++--- src/convnets/squeezenet.jl | 31 +++++++------ src/convnets/vgg.jl | 2 +- src/layers/mlp.jl | 21 +++++---- src/mixers/core.jl | 7 ++- src/mixers/mlpmixer.jl | 8 ++-- src/vit-based/vit.jl | 3 +- 19 files changed, 175 insertions(+), 196 deletions(-) diff --git a/src/convnets/alexnet.jl b/src/convnets/alexnet.jl index 75ba5ad48..6b384f80c 100644 --- a/src/convnets/alexnet.jl +++ b/src/convnets/alexnet.jl @@ -9,22 +9,21 @@ Create an AlexNet model - `nclasses`: the number of output classes """ function alexnet(; 
inchannels::Integer = 3, nclasses::Integer = 1000) - layers = Chain(Chain(Conv((11, 11), inchannels => 64, relu; stride = (4, 4), pad = (2, 2)), - MaxPool((3, 3); stride = (2, 2)), - Conv((5, 5), 64 => 192, relu; pad = (2, 2)), - MaxPool((3, 3); stride = (2, 2)), - Conv((3, 3), 192 => 384, relu; pad = (1, 1)), - Conv((3, 3), 384 => 256, relu; pad = (1, 1)), - Conv((3, 3), 256 => 256, relu; pad = (1, 1)), - MaxPool((3, 3); stride = (2, 2)), - AdaptiveMeanPool((6, 6))), - Chain(MLUtils.flatten, - Dropout(0.5), - Dense(256 * 6 * 6, 4096, relu), - Dropout(0.5), - Dense(4096, 4096, relu), - Dense(4096, nclasses))) - return layers + backbone = Chain(Conv((11, 11), inchannels => 64, relu; stride = 4, pad = 2), + MaxPool((3, 3); stride = 2), + Conv((5, 5), 64 => 192, relu; pad = 2), + MaxPool((3, 3); stride = 2), + Conv((3, 3), 192 => 384, relu; pad = 1), + Conv((3, 3), 384 => 256, relu; pad = 1), + Conv((3, 3), 256 => 256, relu; pad = 1), + MaxPool((3, 3); stride = 2)) + classifier = Chain(AdaptiveMeanPool((6, 6)), MLUtils.flatten, + Dropout(0.5), + Dense(256 * 6 * 6, 4096, relu), + Dropout(0.5), + Dense(4096, 4096, relu), + Dense(4096, nclasses)) + return Chain(backbone, classifier) end """ @@ -47,7 +46,8 @@ struct AlexNet end @functor AlexNet -function AlexNet(; pretrain::Bool = false, inchannels::Integer = 3, nclasses::Integer = 1000) +function AlexNet(; pretrain::Bool = false, inchannels::Integer = 3, + nclasses::Integer = 1000) layers = alexnet(; inchannels, nclasses) if pretrain loadpretrain!(layers, "AlexNet") diff --git a/src/convnets/convmixer.jl b/src/convnets/convmixer.jl index c75303184..efde886cb 100644 --- a/src/convnets/convmixer.jl +++ b/src/convnets/convmixer.jl @@ -26,8 +26,7 @@ function convmixer(planes::Integer, depth::Integer; kernel_size = (9, 9), pad = SamePad())), +), conv_norm((1, 1), planes, planes, activation; preact = true)...) 
for _ in 1:depth] - head = Chain(AdaptiveMeanPool((1, 1)), MLUtils.flatten, Dense(planes, nclasses)) - return Chain(Chain(stem..., Chain(blocks)), head) + return Chain(Chain(stem..., Chain(blocks)), create_classifier(planes, nclasses)) end const CONVMIXER_CONFIGS = Dict(:base => Dict(:planes => 1536, :depth => 20, diff --git a/src/convnets/convnext.jl b/src/convnets/convnext.jl index d7c39cc04..7bb265c24 100644 --- a/src/convnets/convnext.jl +++ b/src/convnets/convnext.jl @@ -63,11 +63,10 @@ function convnext(depths::Vector{<:Integer}, planes::Vector{<:Integer}; cur += depths[i] end backbone = collect(Iterators.flatten(Iterators.flatten(zip(downsample_layers, stages)))) - head = Chain(GlobalMeanPool(), - MLUtils.flatten, - LayerNorm(planes[end]), - Dense(planes[end], nclasses)) - return Chain(Chain(backbone), head) + classifier = Chain(GlobalMeanPool(), MLUtils.flatten, + LayerNorm(planes[end]), + Dense(planes[end], nclasses)) + return Chain(Chain(backbone...), classifier) end # Configurations for ConvNeXt models diff --git a/src/convnets/densenet.jl b/src/convnets/densenet.jl index 0b164e2ab..9720a0212 100644 --- a/src/convnets/densenet.jl +++ b/src/convnets/densenet.jl @@ -83,10 +83,7 @@ function densenet(inplanes::Integer, growth_rates; reduction = 0.5, inchannels:: inplanes = floor(Int, outplanes * reduction) end push!(layers, BatchNorm(outplanes, relu)) - return Chain(Chain(layers), - Chain(AdaptiveMeanPool((1, 1)), - MLUtils.flatten, - Dense(outplanes, nclasses))) + return Chain(Chain(layers...), create_classifier(outplanes, nclasses)) end """ diff --git a/src/convnets/efficientnet.jl b/src/convnets/efficientnet.jl index 730840fa4..86ba9373f 100644 --- a/src/convnets/efficientnet.jl +++ b/src/convnets/efficientnet.jl @@ -28,8 +28,8 @@ function efficientnet(scalings, block_configs; max_width::Integer = 1280, scalew(w) = wscale ≈ 1 ? w : ceil(Int64, wscale * w) scaled(d) = dscale ≈ 1 ? d : ceil(Int64, dscale * d) out_channels = _round_channels(scalew(32), 8) - stem = conv_norm((3, 3), inchannels, out_channels, swish; - bias = false, stride = 2, pad = SamePad()) + stem = conv_norm((3, 3), inchannels, out_channels, swish; bias = false, stride = 2, + pad = SamePad()) blocks = [] for (n, k, s, e, i, o) in block_configs in_channels = _round_channels(scalew(i), 8) @@ -44,13 +44,11 @@ function efficientnet(scalings, block_configs; max_width::Integer = 1280, stride = 1, reduction = 4)) end end - blocks = Chain(blocks...) 
head_out_channels = _round_channels(max_width, 8) - head = conv_norm((1, 1), out_channels, head_out_channels, swish; - bias = false, pad = SamePad()) - top = Dense(head_out_channels, nclasses) - return Chain(Chain([stem..., blocks, head...]), - Chain(AdaptiveMeanPool((1, 1)), MLUtils.flatten, top)) + append!(blocks, + conv_norm((1, 1), out_channels, head_out_channels, swish; + bias = false, pad = SamePad())) + return Chain(Chain(stem..., blocks...), create_classifier(head_out_channels, nclasses)) end # n: # of block repetitions diff --git a/src/convnets/inception/googlenet.jl b/src/convnets/inception/googlenet.jl index 90f92ddfc..a72ba5e6c 100644 --- a/src/convnets/inception/googlenet.jl +++ b/src/convnets/inception/googlenet.jl @@ -36,32 +36,29 @@ Create an Inception-v1 model (commonly referred to as GoogLeNet) - `nclasses`: the number of output classes """ -function googlenet(; nclasses::Integer = 1000) - layers = Chain(Chain(Conv((7, 7), 3 => 64; stride = 2, pad = 3), - MaxPool((3, 3); stride = 2, pad = 1), - Conv((1, 1), 64 => 64), - Conv((3, 3), 64 => 192; pad = 1), - MaxPool((3, 3); stride = 2, pad = 1), - _inceptionblock(192, 64, 96, 128, 16, 32, 32), - _inceptionblock(256, 128, 128, 192, 32, 96, 64), - MaxPool((3, 3); stride = 2, pad = 1), - _inceptionblock(480, 192, 96, 208, 16, 48, 64), - _inceptionblock(512, 160, 112, 224, 24, 64, 64), - _inceptionblock(512, 128, 128, 256, 24, 64, 64), - _inceptionblock(512, 112, 144, 288, 32, 64, 64), - _inceptionblock(528, 256, 160, 320, 32, 128, 128), - MaxPool((3, 3); stride = 2, pad = 1), - _inceptionblock(832, 256, 160, 320, 32, 128, 128), - _inceptionblock(832, 384, 192, 384, 48, 128, 128)), - Chain(AdaptiveMeanPool((1, 1)), - MLUtils.flatten, - Dropout(0.4), - Dense(1024, nclasses))) - return layers +function googlenet(; inchannels::Integer = 3, nclasses::Integer = 1000) + backbone = Chain(Conv((7, 7), inchannels => 64; stride = 2, pad = 3), + MaxPool((3, 3); stride = 2, pad = 1), + Conv((1, 1), 64 => 64), + Conv((3, 3), 64 => 192; pad = 1), + MaxPool((3, 3); stride = 2, pad = 1), + _inceptionblock(192, 64, 96, 128, 16, 32, 32), + _inceptionblock(256, 128, 128, 192, 32, 96, 64), + MaxPool((3, 3); stride = 2, pad = 1), + _inceptionblock(480, 192, 96, 208, 16, 48, 64), + _inceptionblock(512, 160, 112, 224, 24, 64, 64), + _inceptionblock(512, 128, 128, 256, 24, 64, 64), + _inceptionblock(512, 112, 144, 288, 32, 64, 64), + _inceptionblock(528, 256, 160, 320, 32, 128, 128), + MaxPool((3, 3); stride = 2, pad = 1), + _inceptionblock(832, 256, 160, 320, 32, 128, 128), + _inceptionblock(832, 384, 192, 384, 48, 128, 128)) + classifier = create_classifier(1024, nclasses; dropout_rate = 0.4) + return Chain(backbone, classifier) end """ - GoogLeNet(; pretrain::Bool = false, nclasses::Integer = 1000) + GoogLeNet(; pretrain::Bool = false, inchannels::Integer = 3, nclasses::Integer = 1000) Create an Inception-v1 model (commonly referred to as `GoogLeNet`) ([reference](https://arxiv.org/abs/1409.4842v1)). 
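A usage sketch of the newly exposed keywords (values are illustrative); a single-channel
input shows why `inchannels` is now part of the public API:

    model = GoogLeNet(; inchannels = 1, nclasses = 10)
    size(model(rand(Float32, 224, 224, 1, 1)))   # expected (10, 1)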
@@ -82,8 +79,9 @@ struct GoogLeNet end @functor GoogLeNet -function GoogLeNet(; pretrain::Bool = false, nclasses::Integer = 1000) - layers = googlenet(; nclasses = nclasses) +function GoogLeNet(; pretrain::Bool = false, inchannels::Integer = 3, + nclasses::Integer = 1000) + layers = googlenet(; inchannels, nclasses) if pretrain loadpretrain!(layers, "GoogLeNet") end diff --git a/src/convnets/inception/inceptionresnetv2.jl b/src/convnets/inception/inceptionresnetv2.jl index 747da2fb2..96b391b65 100644 --- a/src/convnets/inception/inceptionresnetv2.jl +++ b/src/convnets/inception/inceptionresnetv2.jl @@ -77,24 +77,23 @@ Creates an InceptionResNetv2 model. """ function inceptionresnetv2(; inchannels::Integer = 3, dropout_rate = 0.0, nclasses::Integer = 1000) - body = Chain(conv_norm((3, 3), inchannels, 32; stride = 2)..., - conv_norm((3, 3), 32, 32)..., - conv_norm((3, 3), 32, 64; pad = 1)..., - MaxPool((3, 3); stride = 2), - conv_norm((3, 3), 64, 80)..., - conv_norm((3, 3), 80, 192)..., - MaxPool((3, 3); stride = 2), - mixed_5b(), - [block35(0.17f0) for _ in 1:10]..., - mixed_6a(), - [block17(0.10f0) for _ in 1:20]..., - mixed_7a(), - [block8(0.20f0) for _ in 1:9]..., - block8(; activation = relu), - conv_norm((1, 1), 2080, 1536)...) - head = Chain(GlobalMeanPool(), MLUtils.flatten, Dropout(dropout_rate), - Dense(1536, nclasses)) - return Chain(body, head) + backbone = Chain(conv_norm((3, 3), inchannels, 32; stride = 2)..., + conv_norm((3, 3), 32, 32)..., + conv_norm((3, 3), 32, 64; pad = 1)..., + MaxPool((3, 3); stride = 2), + conv_norm((3, 3), 64, 80)..., + conv_norm((3, 3), 80, 192)..., + MaxPool((3, 3); stride = 2), + mixed_5b(), + [block35(0.17f0) for _ in 1:10]..., + mixed_6a(), + [block17(0.10f0) for _ in 1:20]..., + mixed_7a(), + [block8(0.20f0) for _ in 1:9]..., + block8(; activation = relu), + conv_norm((1, 1), 2080, 1536)...) + classifier = create_classifier(1536, nclasses; dropout_rate) + return Chain(backbone, classifier) end """ diff --git a/src/convnets/inception/inceptionv3.jl b/src/convnets/inception/inceptionv3.jl index 8d9977d80..8a5e19849 100644 --- a/src/convnets/inception/inceptionv3.jl +++ b/src/convnets/inception/inceptionv3.jl @@ -136,29 +136,26 @@ Create an Inception-v3 model ([reference](https://arxiv.org/abs/1512.00567v3)). 
- `nclasses`: the number of output classes """ function inceptionv3(; inchannels::Integer = 3, nclasses::Integer = 1000) - layer = Chain(Chain(conv_norm((3, 3), inchannels, 32; stride = 2)..., - conv_norm((3, 3), 32, 32)..., - conv_norm((3, 3), 32, 64; pad = 1)..., - MaxPool((3, 3); stride = 2), - conv_norm((1, 1), 64, 80)..., - conv_norm((3, 3), 80, 192)..., - MaxPool((3, 3); stride = 2), - inceptionv3_a(192, 32), - inceptionv3_a(256, 64), - inceptionv3_a(288, 64), - inceptionv3_b(288), - inceptionv3_c(768, 128), - inceptionv3_c(768, 160), - inceptionv3_c(768, 160), - inceptionv3_c(768, 192), - inceptionv3_d(768), - inceptionv3_e(1280), - inceptionv3_e(2048)), - Chain(AdaptiveMeanPool((1, 1)), - Dropout(0.2), - MLUtils.flatten, - Dense(2048, nclasses))) - return layer + backbone = Chain(conv_norm((3, 3), inchannels, 32; stride = 2)..., + conv_norm((3, 3), 32, 32)..., + conv_norm((3, 3), 32, 64; pad = 1)..., + MaxPool((3, 3); stride = 2), + conv_norm((1, 1), 64, 80)..., + conv_norm((3, 3), 80, 192)..., + MaxPool((3, 3); stride = 2), + inceptionv3_a(192, 32), + inceptionv3_a(256, 64), + inceptionv3_a(288, 64), + inceptionv3_b(288), + inceptionv3_c(768, 128), + inceptionv3_c(768, 160), + inceptionv3_c(768, 160), + inceptionv3_c(768, 192), + inceptionv3_d(768), + inceptionv3_e(1280), + inceptionv3_e(2048)) + classifier = create_classifier(2048, nclasses; dropout_rate = 0.2) + return Chain(backbone, classifier) end """ diff --git a/src/convnets/inception/inceptionv4.jl b/src/convnets/inception/inceptionv4.jl index b84232fb8..8d4f00eb2 100644 --- a/src/convnets/inception/inceptionv4.jl +++ b/src/convnets/inception/inceptionv4.jl @@ -95,31 +95,30 @@ Create an Inceptionv4 model. """ function inceptionv4(; dropout_rate = 0.0, inchannels::Integer = 3, nclasses::Integer = 1000) - body = Chain(conv_norm((3, 3), inchannels, 32; stride = 2)..., - conv_norm((3, 3), 32, 32)..., - conv_norm((3, 3), 32, 64; pad = 1)..., - mixed_3a(), - mixed_4a(), - mixed_5a(), - inceptionv4_a(), - inceptionv4_a(), - inceptionv4_a(), - inceptionv4_a(), - reduction_a(), # mixed_6a - inceptionv4_b(), - inceptionv4_b(), - inceptionv4_b(), - inceptionv4_b(), - inceptionv4_b(), - inceptionv4_b(), - inceptionv4_b(), - reduction_b(), # mixed_7a - inceptionv4_c(), - inceptionv4_c(), - inceptionv4_c()) - head = Chain(GlobalMeanPool(), MLUtils.flatten, Dropout(dropout_rate), - Dense(1536, nclasses)) - return Chain(body, head) + backbone = Chain(conv_norm((3, 3), inchannels, 32; stride = 2)..., + conv_norm((3, 3), 32, 32)..., + conv_norm((3, 3), 32, 64; pad = 1)..., + mixed_3a(), + mixed_4a(), + mixed_5a(), + inceptionv4_a(), + inceptionv4_a(), + inceptionv4_a(), + inceptionv4_a(), + reduction_a(), # mixed_6a + inceptionv4_b(), + inceptionv4_b(), + inceptionv4_b(), + inceptionv4_b(), + inceptionv4_b(), + inceptionv4_b(), + inceptionv4_b(), + reduction_b(), # mixed_7a + inceptionv4_c(), + inceptionv4_c(), + inceptionv4_c()) + classifier = create_classifier(1536, nclasses; dropout_rate) + return Chain(backbone, classifier) end """ diff --git a/src/convnets/inception/xception.jl b/src/convnets/inception/xception.jl index 8d2ad13d8..71a4efc15 100644 --- a/src/convnets/inception/xception.jl +++ b/src/convnets/inception/xception.jl @@ -57,18 +57,17 @@ Creates an Xception model. - `nclasses`: the number of output classes. 
""" function xception(; dropout_rate = 0.0, inchannels::Integer = 3, nclasses::Integer = 1000) - body = Chain(conv_norm((3, 3), inchannels, 32; stride = 2, bias = false)..., - conv_norm((3, 3), 32, 64; bias = false)..., - xception_block(64, 128, 2; stride = 2, start_with_relu = false), - xception_block(128, 256, 2; stride = 2), - xception_block(256, 728, 2; stride = 2), - [xception_block(728, 728, 3) for _ in 1:8]..., - xception_block(728, 1024, 2; stride = 2, grow_at_start = false), - depthwise_sep_conv_norm((3, 3), 1024, 1536; pad = 1)..., - depthwise_sep_conv_norm((3, 3), 1536, 2048; pad = 1)...) - head = Chain(GlobalMeanPool(), MLUtils.flatten, Dropout(dropout_rate), - Dense(2048, nclasses)) - return Chain(body, head) + backbone = Chain(conv_norm((3, 3), inchannels, 32; stride = 2, bias = false)..., + conv_norm((3, 3), 32, 64; bias = false)..., + xception_block(64, 128, 2; stride = 2, start_with_relu = false), + xception_block(128, 256, 2; stride = 2), + xception_block(256, 728, 2; stride = 2), + [xception_block(728, 728, 3) for _ in 1:8]..., + xception_block(728, 1024, 2; stride = 2, grow_at_start = false), + depthwise_sep_conv_norm((3, 3), 1024, 1536; pad = 1)..., + depthwise_sep_conv_norm((3, 3), 1536, 2048; pad = 1)...) + classifier = create_classifier(2048, nclasses; dropout_rate) + return Chain(backbone, classifier) end """ diff --git a/src/convnets/mobilenet/mobilenetv1.jl b/src/convnets/mobilenet/mobilenetv1.jl index e31f8835b..ca20b4a64 100644 --- a/src/convnets/mobilenet/mobilenetv1.jl +++ b/src/convnets/mobilenet/mobilenetv1.jl @@ -34,11 +34,7 @@ function mobilenetv1(width_mult::Number, config::Vector{<:Tuple}; activation = r inchannels = outch end end - - return Chain(Chain(layers), - Chain(GlobalMeanPool(), - MLUtils.flatten, - Dense(inchannels, nclasses))) + return Chain(Chain(layers...), create_classifier(inchannels, nclasses)) end # Layer configurations for MobileNetv1 diff --git a/src/convnets/mobilenet/mobilenetv2.jl b/src/convnets/mobilenet/mobilenetv2.jl index 9dd35e9f9..59e147829 100644 --- a/src/convnets/mobilenet/mobilenetv2.jl +++ b/src/convnets/mobilenet/mobilenetv2.jl @@ -25,13 +25,14 @@ Create a MobileNetv2 model. function mobilenetv2(width_mult::Number, configs::Vector{<:Tuple}; max_width::Integer = 1280, inchannels::Integer = 3, nclasses::Integer = 1000) + divisor = width_mult == 0.1 ? 4 : 8 # building first layer - inplanes = _round_channels(32 * width_mult, width_mult == 0.1 ? 4 : 8) + inplanes = _round_channels(32 * width_mult, divisor) layers = [] append!(layers, conv_norm((3, 3), inchannels, inplanes; pad = 1, stride = 2)) # building inverted residual blocks for (t, c, n, s, a) in configs - outplanes = _round_channels(c * width_mult, width_mult == 0.1 ? 4 : 8) + outplanes = _round_channels(c * width_mult, divisor) for i in 1:n push!(layers, invertedresidual((3, 3), inplanes, outplanes, a; expansion = t, @@ -39,14 +40,11 @@ function mobilenetv2(width_mult::Number, configs::Vector{<:Tuple}; inplanes = outplanes end end - # building last several layers - outplanes = (width_mult > 1) ? - _round_channels(max_width * width_mult, width_mult == 0.1 ? 4 : 8) : + # building last layers + outplanes = width_mult > 1 ? 
_round_channels(max_width * width_mult, divisor) : max_width - return Chain(Chain(Chain(layers), - conv_norm((1, 1), inplanes, outplanes, relu6; bias = false)...), - Chain(AdaptiveMeanPool((1, 1)), MLUtils.flatten, - Dense(outplanes, nclasses))) + append!(layers, conv_norm((1, 1), inplanes, outplanes, relu6; bias = false)) + return Chain(Chain(layers...), create_classifier(outplanes, nclasses)) end # Layer configurations for MobileNetv2 diff --git a/src/convnets/mobilenet/mobilenetv3.jl b/src/convnets/mobilenet/mobilenetv3.jl index 00c0e0139..1c5e5825b 100644 --- a/src/convnets/mobilenet/mobilenetv3.jl +++ b/src/convnets/mobilenet/mobilenetv3.jl @@ -44,16 +44,16 @@ function mobilenetv3(width_mult::Number, configs::Vector{<:Tuple}; stride = s, reduction = r)) inplanes = outplanes end - # building last several layers + # building last layers output_channel = max_width output_channel = width_mult > 1.0 ? _round_channels(output_channel * width_mult, 8) : output_channel - classifier = Chain(Dense(explanes, output_channel, hardswish), + append!(layers, conv_norm((1, 1), inplanes, explanes, hardswish; bias = false)) + classifier = Chain(AdaptiveMeanPool((1, 1)), MLUtils.flatten, + Dense(explanes, output_channel, hardswish), Dropout(0.2), Dense(output_channel, nclasses)) - return Chain(Chain(Chain(layers), - conv_norm((1, 1), inplanes, explanes, hardswish; bias = false)...), - Chain(AdaptiveMeanPool((1, 1)), MLUtils.flatten, classifier)) + return Chain(Chain(layers...), classifier) end # Layer configurations for small and large models for MobileNetv3 @@ -91,7 +91,7 @@ const MOBILENETV3_CONFIGS = Dict(:small => [ ]) """ - MobileNetv3(mode::Symbol = :small, width_mult::Number = 1; pretrain::Bool = false, + MobileNetv3(mode::Symbol; width_mult::Number = 1, pretrain::Bool = false, inchannels::Integer = 3, nclasses::Integer = 1000) Create a MobileNetv3 model with the specified configuration. @@ -115,7 +115,7 @@ struct MobileNetv3 end @functor MobileNetv3 -function MobileNetv3(mode::Symbol = :small, width_mult::Number = 1; pretrain::Bool = false, +function MobileNetv3(mode::Symbol; width_mult::Number = 1, pretrain::Bool = false, inchannels::Integer = 3, nclasses::Integer = 1000) _checkconfig(mode, [:small, :large]) max_width = (mode == :large) ? 1280 : 1024 diff --git a/src/convnets/squeezenet.jl b/src/convnets/squeezenet.jl index 3ee6653bc..b3cfb0293 100644 --- a/src/convnets/squeezenet.jl +++ b/src/convnets/squeezenet.jl @@ -32,22 +32,21 @@ Create a SqueezeNet - `nclasses`: the number of output classes. 
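
A usage sketch (an editor's illustration, not part of the original patch; it assumes the
keyword defaults listed above):

    using Metalhead
    model = Metalhead.squeezenet(; nclasses = 10)
    x = rand(Float32, 224, 224, 3, 1)  # one 224×224 RGB image in WHCN order
    size(model(x))                     # (10, 1): the 1×1 conv head is mean-pooled and flattened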
""" function squeezenet(; inchannels::Integer = 3, nclasses::Integer = 1000) - return Chain(Chain(Conv((3, 3), inchannels => 64, relu; stride = 2), - MaxPool((3, 3); stride = 2), - fire(64, 16, 64, 64), - fire(128, 16, 64, 64), - MaxPool((3, 3); stride = 2), - fire(128, 32, 128, 128), - fire(256, 32, 128, 128), - MaxPool((3, 3); stride = 2), - fire(256, 48, 192, 192), - fire(384, 48, 192, 192), - fire(384, 64, 256, 256), - fire(512, 64, 256, 256), - Dropout(0.5), - Conv((1, 1), 512 => nclasses, relu)), - AdaptiveMeanPool((1, 1)), - MLUtils.flatten) + backbone = Chain(Conv((3, 3), inchannels => 64, relu; stride = 2), + MaxPool((3, 3); stride = 2), + fire(64, 16, 64, 64), + fire(128, 16, 64, 64), + MaxPool((3, 3); stride = 2), + fire(128, 32, 128, 128), + fire(256, 32, 128, 128), + MaxPool((3, 3); stride = 2), + fire(256, 48, 192, 192), + fire(384, 48, 192, 192), + fire(384, 64, 256, 256), + fire(512, 64, 256, 256)) + classifier = Chain(Dropout(0.5), Conv((1, 1), 512 => nclasses, relu), + AdaptiveMeanPool((1, 1)), MLUtils.flatten) + return Chain(backbone, classifier) end """ diff --git a/src/convnets/vgg.jl b/src/convnets/vgg.jl index 0b6026eb8..e685620a3 100644 --- a/src/convnets/vgg.jl +++ b/src/convnets/vgg.jl @@ -99,7 +99,7 @@ function vgg(imsize::Dims{2}; config, batchnorm::Bool = false, fcsize::Integer = conv = vgg_convolutional_layers(config, batchnorm, inchannels) imsize = outputsize(conv, (imsize..., inchannels); padbatch = true)[1:3] class = vgg_classifier_layers(imsize, nclasses, fcsize, dropout_rate) - return Chain(Chain(conv), class) + return Chain(Chain(conv...), class) end const VGG_CONV_CONFIGS = Dict(:A => [(64, 1), (128, 1), (256, 2), (512, 2), (512, 2)], diff --git a/src/layers/mlp.jl b/src/layers/mlp.jl index 3a1c27413..9b8c48de2 100644 --- a/src/layers/mlp.jl +++ b/src/layers/mlp.jl @@ -47,7 +47,7 @@ end gated_mlp_block(::typeof(identity), args...; kwargs...) = mlp_block(args...; kwargs...) """ - create_classifier(inplanes::Integer, nclasses::Integer; + create_classifier(inplanes::Integer, nclasses::Integer, activation = relu; pool_layer = AdaptiveMeanPool((1, 1)), dropout_rate = 0.0, use_conv::Bool = false) @@ -57,26 +57,25 @@ Creates a classifier head to be used for models. - `inplanes`: number of input feature maps - `nclasses`: number of output classes + - `activation`: activation function to use - `pool_layer`: pooling layer to use. This is passed in with the layer instantiated with any arguments that are needed i.e. as `AdaptiveMeanPool((1, 1))`, for example. - `dropout_rate`: dropout rate used in the classifier head. - `use_conv`: whether to use a 1x1 convolutional layer instead of a `Dense` layer. """ -function create_classifier(inplanes::Integer, nclasses::Integer; - pool_layer = AdaptiveMeanPool((1, 1)), - dropout_rate = 0.0, use_conv::Bool = false) +function create_classifier(inplanes::Integer, nclasses::Integer, activation = identity; + use_conv::Bool = falsepool_layer = AdaptiveMeanPool((1, 1)), + dropout_rate = nothing) # Pooling - if pool_layer === identity - @assert use_conv - "Pooling can only be disabled if classifier is also removed or a convolution-based classifier is used" - end flatten_in_pool = !use_conv && pool_layer !== identity if use_conv @assert pool_layer === identity "`pool_layer` must be identity if `use_conv` is true" end - global_pool = flatten_in_pool ? Chain(pool_layer, MLUtils.flatten) : pool_layer + global_pool = flatten_in_pool ? [pool_layer, MLUtils.flatten] : [pool_layer] # Fully-connected layer - fc = use_conv ? 
Conv((1, 1), inplanes => nclasses) : Dense(inplanes => nclasses) - return Chain(global_pool, Dropout(dropout_rate), fc) + fc = use_conv ? Conv((1, 1), inplanes => nclasses, activation) : + Dense(inplanes => nclasses, activation) + drop = isnothing(dropout_rate) ? [] : [Dropout(dropout_rate)] + return Chain(global_pool..., drop..., fc) end diff --git a/src/mixers/core.jl b/src/mixers/core.jl index 18f66aaa8..f08a5f5d5 100644 --- a/src/mixers/core.jl +++ b/src/mixers/core.jl @@ -29,10 +29,9 @@ function mlpmixer(block, imsize::Dims{2} = (224, 224); norm_layer = LayerNorm, layers = Chain(PatchEmbedding(imsize; inchannels, patch_size, embedplanes), Chain([block(embedplanes, npatches; drop_path_rate = dp_rates[i], kwargs...) - for i in 1:depth])) - classification_head = Chain(norm_layer(embedplanes), seconddimmean, - Dense(embedplanes, nclasses)) - return Chain(layers, classification_head) + for i in 1:depth]...)) + classifier = Chain(norm_layer(embedplanes), seconddimmean, Dense(embedplanes, nclasses)) + return Chain(layers, classifier) end # Configurations for MLPMixer models diff --git a/src/mixers/mlpmixer.jl b/src/mixers/mlpmixer.jl index 06aefbd48..336a29a33 100644 --- a/src/mixers/mlpmixer.jl +++ b/src/mixers/mlpmixer.jl @@ -35,7 +35,7 @@ end """ MLPMixer(size::Symbol; patch_size::Dims{2} = (16, 16), imsize::Dims{2} = (224, 224), - inchannels::Integer = 3, nclasses::Integer = 1000) +inchannels::Integer = 3, nclasses::Integer = 1000) Creates a model with the MLPMixer architecture. ([reference](https://arxiv.org/pdf/2105.01601)). @@ -56,12 +56,14 @@ struct MLPMixer end @functor MLPMixer -function MLPMixer(size::Symbol; imsize::Dims{2} = (224, 224), patch_size::Dims{2} = (16, 16), +function MLPMixer(size::Symbol; imsize::Dims{2} = (224, 224), + patch_size::Dims{2} = (16, 16), inchannels::Integer = 3, nclasses::Integer = 1000) _checkconfig(size, keys(MIXER_CONFIGS)) depth = MIXER_CONFIGS[size][:depth] embedplanes = MIXER_CONFIGS[size][:planes] - layers = mlpmixer(mixerblock, imsize; patch_size, embedplanes, depth, inchannels,nclasses) + layers = mlpmixer(mixerblock, imsize; patch_size, embedplanes, depth, inchannels, + nclasses) return MLPMixer(layers) end diff --git a/src/vit-based/vit.jl b/src/vit-based/vit.jl index 1c049e46e..03e520076 100644 --- a/src/vit-based/vit.jl +++ b/src/vit-based/vit.jl @@ -99,7 +99,8 @@ struct ViT end @functor ViT -function ViT(mode::Symbol = :base; imsize::Dims{2} = (256, 256), patch_size::Dims{2} = (16, 16), +function ViT(mode::Symbol = :base; imsize::Dims{2} = (256, 256), + patch_size::Dims{2} = (16, 16), inchannels::Integer = 3, nclasses::Integer = 1000) _checkconfig(mode, keys(VIT_CONFIGS)) kwargs = VIT_CONFIGS[mode] From d90a6ae7b94dcc255fd64c7af1aa0b99bd0c7827 Mon Sep 17 00:00:00 2001 From: Abhirath Anand <74202102+theabhirath@users.noreply.github.com> Date: Tue, 2 Aug 2022 10:43:13 +0530 Subject: [PATCH 6/8] Unify higher level DenseNet API --- .github/workflows/CI.yml | 3 ++- src/convnets/densenet.jl | 48 ++++++++++++---------------------------- test/vits.jl | 2 +- 3 files changed, 17 insertions(+), 36 deletions(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index c13f1c2d6..a1bf822b9 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -34,7 +34,8 @@ jobs: - '"Inception"' - '"DenseNet"' - '["ConvNeXt", "ConvMixer"]' - - '[r"ViTs", r"Mixers"]' + - 'r"Mixers"' + - 'r"ViTs"' steps: - uses: actions/checkout@v2 - uses: julia-actions/setup-julia@v1 diff --git a/src/convnets/densenet.jl b/src/convnets/densenet.jl index 
9720a0212..b82f138fb 100644 --- a/src/convnets/densenet.jl +++ b/src/convnets/densenet.jl @@ -105,44 +105,13 @@ function densenet(nblocks::Vector{<:Integer}; growth_rate::Integer = 32, reducti reduction, inchannels, nclasses) end -""" - DenseNet(nblocks::Vector{<:Integer}; growth_rate::Integer = 32, reduction = 0.5, - inchannels = 3, nclasses::Integer = 1000) - -Create a DenseNet model -([reference](https://arxiv.org/abs/1608.06993)). -See also [`densenet`](#). - -# Arguments - - - `nblocks`: number of dense blocks between transitions - - `growth_rate`: the output feature map growth rate of dense blocks (i.e. `k` in the paper) - - `reduction`: the factor by which the number of feature maps is scaled across each transition - - `nclasses`: the number of output classes -""" -struct DenseNet - layers::Any -end -@functor DenseNet - -function DenseNet(nblocks::Vector{<:Integer}; growth_rate::Integer = 32, reduction = 0.5, - inchannels = 3, nclasses::Integer = 1000) - layers = densenet(nblocks; growth_rate, reduction, inchannels, nclasses) - return DenseNet(layers) -end - -(m::DenseNet)(x) = m.layers(x) - -backbone(m::DenseNet) = m.layers[1] -classifier(m::DenseNet) = m.layers[2] - const DENSENET_CONFIGS = Dict(121 => [6, 12, 24, 16], 161 => [6, 12, 36, 24], 169 => [6, 12, 32, 32], 201 => [6, 12, 48, 32]) """ - DenseNet(config::Integer = 121; pretrain::Bool = false, nclasses::Integer = 1000) + DenseNet(config::Integer; pretrain::Bool = false, nclasses::Integer = 1000) DenseNet(transition_configs::NTuple{N,Integer}) Create a DenseNet model with specified configuration. Currently supported values are (121, 161, 169, 201) @@ -155,11 +124,22 @@ Set `pretrain = true` to load the model with pre-trained weights for ImageNet. See also [`Metalhead.densenet`](#). """ -function DenseNet(config::Integer = 121; pretrain::Bool = false, nclasses::Integer = 1000) +struct DenseNet + layers::Any +end +@functor DenseNet + +function DenseNet(config::Integer; pretrain::Bool = false, growth_rate::Integer = 32, + reduction = 0.5, inchannels::Integer = 3, nclasses::Integer = 1000) _checkconfig(config, keys(DENSENET_CONFIGS)) - model = DenseNet(DENSENET_CONFIGS[config]; nclasses = nclasses) + model = densenet(DENSENET_CONFIGS[config]; growth_rate, reduction, inchannels, nclasses) if pretrain loadpretrain!(model, string("DenseNet", config)) end return model end + +(m::DenseNet)(x) = m.layers(x) + +backbone(m::DenseNet) = m.layers[1] +classifier(m::DenseNet) = m.layers[2] diff --git a/test/vits.jl b/test/vits.jl index 13733ddec..fb9fd6b02 100644 --- a/test/vits.jl +++ b/test/vits.jl @@ -1,5 +1,5 @@ @testset "ViT" begin - for mode in [:tiny, :small, :base, :large, :huge] #:giant, #:gigantic + for mode in [:tiny, :small, :base, :large, :huge] # :giant, :gigantic] m = ViT(mode) @test size(m(x_256)) == (1000, 1) @test gradtest(m, x_256) From 8ce0dce82e301112f82af4759c46a7975dd5fd57 Mon Sep 17 00:00:00 2001 From: Abhirath Anand <74202102+theabhirath@users.noreply.github.com> Date: Tue, 2 Aug 2022 19:41:31 +0530 Subject: [PATCH 7/8] Add a bunch of Compat entries --- Project.toml | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/Project.toml b/Project.toml index 6230fdbee..691003944 100644 --- a/Project.toml +++ b/Project.toml @@ -20,9 +20,13 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" [compat] BSON = "0.3.2" Flux = "0.13" -Functors = "0.2" -MLUtils = "0.2.6" -NNlib = "0.7.34, 0.8" +Functors = "0.2, 0.3" +CUDA = "3" +ChainRulesCore = "1" +PartialFunctions = "1" +MLUtils = "0.2.10" +NNlib = 
"0.8" +NNlibCUDA = "0.2" julia = "1.6" [publish] From 59e1ef40a6e30c78ee044f5f505bff603318d697 Mon Sep 17 00:00:00 2001 From: Abhirath Anand <74202102+theabhirath@users.noreply.github.com> Date: Wed, 3 Aug 2022 09:13:07 +0530 Subject: [PATCH 8/8] More uniformity + cleanup --- src/Metalhead.jl | 8 ++- src/convnets/alexnet.jl | 19 ++++--- src/convnets/convmixer.jl | 35 ++++++------- src/convnets/convnext.jl | 30 +++++------ src/convnets/densenet.jl | 3 +- src/convnets/efficientnet.jl | 57 ++++++--------------- src/convnets/inception/googlenet.jl | 3 +- src/convnets/inception/inceptionresnetv2.jl | 3 +- src/convnets/inception/inceptionv3.jl | 3 +- src/convnets/inception/inceptionv4.jl | 3 +- src/convnets/inception/xception.jl | 4 +- src/convnets/mobilenet/mobilenetv1.jl | 8 +-- src/convnets/mobilenet/mobilenetv2.jl | 7 ++- src/convnets/mobilenet/mobilenetv3.jl | 18 +++---- src/convnets/resnets/core.jl | 31 ++++++----- src/convnets/vgg.jl | 9 ++-- src/layers/attention.jl | 3 +- src/layers/conv.jl | 36 ++++++++----- src/layers/drop.jl | 34 ++++++------ src/layers/embeddings.jl | 2 - src/layers/pool.jl | 5 +- src/mixers/core.jl | 13 ++--- src/mixers/gmlp.jl | 14 +++-- src/mixers/mlpmixer.jl | 15 +++--- src/mixers/resmlp.jl | 20 ++++---- src/utilities.jl | 2 +- src/vit-based/vit.jl | 10 ++-- test/convnets.jl | 30 +++++------ test/mixers.jl | 4 +- test/vits.jl | 4 +- 30 files changed, 202 insertions(+), 231 deletions(-) diff --git a/src/Metalhead.jl b/src/Metalhead.jl index 374f28615..78073c154 100644 --- a/src/Metalhead.jl +++ b/src/Metalhead.jl @@ -56,14 +56,12 @@ include("vit-based/vit.jl") include("pretrain.jl") export AlexNet, VGG, VGG11, VGG13, VGG16, VGG19, - ResNet, ResNet18, ResNet34, ResNet50, ResNet101, ResNet152, ResNeXt, + ResNet, ResNet18, ResNet34, ResNet50, ResNet101, ResNet152, + WideResNet, ResNeXt, SEResNet, SEResNeXt, DenseNet, DenseNet121, DenseNet161, DenseNet169, DenseNet201, GoogLeNet, Inception3, Inceptionv3, Inceptionv4, InceptionResNetv2, Xception, SqueezeNet, MobileNetv1, MobileNetv2, MobileNetv3, EfficientNet, - WideResNet, SEResNet, SEResNeXt, - MLPMixer, ResMLP, gMLP, - ViT, - ConvMixer, ConvNeXt + MLPMixer, ResMLP, gMLP, ViT, ConvMixer, ConvNeXt # use Flux._big_show to pretty print large models for T in (:AlexNet, :VGG, :ResNet, :ResNeXt, :DenseNet, :SEResNet, :SEResNeXt, diff --git a/src/convnets/alexnet.jl b/src/convnets/alexnet.jl index 6b384f80c..3c713839e 100644 --- a/src/convnets/alexnet.jl +++ b/src/convnets/alexnet.jl @@ -1,11 +1,12 @@ """ - alexnet(; nclasses::Integer = 1000) + alexnet(; inchannels::Integer = 3, nclasses::Integer = 1000) Create an AlexNet model ([reference](https://papers.nips.cc/paper/2012/file/c399862d3b9d6b76c8436e924a68c45b-Paper.pdf)). # Arguments + - `inchannels`: The number of input channels. - `nclasses`: the number of output classes """ function alexnet(; inchannels::Integer = 3, nclasses::Integer = 1000) @@ -27,19 +28,23 @@ function alexnet(; inchannels::Integer = 3, nclasses::Integer = 1000) end """ - AlexNet(; pretrain::Bool = false, nclasses::Integer = 1000) + AlexNet(; pretrain::Bool = false, inchannels::Integer = 3, + nclasses::Integer = 1000) Create a `AlexNet`. -See also [`alexnet`](#). - -!!! warning - - `AlexNet` does not currently support pretrained weights. +([reference](https://papers.nips.cc/paper/2012/file/c399862d3b9d6b76c8436e924a68c45b-Paper.pdf)). # Arguments - `pretrain`: set to `true` to load pre-trained weights for ImageNet + - `inchannels`: The number of input channels. 
- `nclasses`: the number of output classes + +!!! warning + + `AlexNet` does not currently support pretrained weights. + +See also [`alexnet`](#). """ struct AlexNet layers::Any diff --git a/src/convnets/convmixer.jl b/src/convnets/convmixer.jl index efde886cb..309989d2d 100644 --- a/src/convnets/convmixer.jl +++ b/src/convnets/convmixer.jl @@ -26,28 +26,28 @@ function convmixer(planes::Integer, depth::Integer; kernel_size = (9, 9), pad = SamePad())), +), conv_norm((1, 1), planes, planes, activation; preact = true)...) for _ in 1:depth] - return Chain(Chain(stem..., Chain(blocks)), create_classifier(planes, nclasses)) + return Chain(Chain(stem..., Chain(blocks...)), create_classifier(planes, nclasses)) end -const CONVMIXER_CONFIGS = Dict(:base => Dict(:planes => 1536, :depth => 20, - :kernel_size => (9, 9), - :patch_size => (7, 7)), - :small => Dict(:planes => 768, :depth => 32, - :kernel_size => (7, 7), - :patch_size => (7, 7)), - :large => Dict(:planes => 1024, :depth => 20, - :kernel_size => (9, 9), - :patch_size => (7, 7))) +const CONVMIXER_CONFIGS = Dict(:base => ((1536, 20), + (kernel_size = (9, 9), + patch_size = (7, 7))), + :small => ((768, 32), + (kernel_size = (7, 7), + patch_size = (7, 7))), + :large => ((1024, 20), + (kernel_size = (9, 9), + patch_size = (7, 7)))) """ - ConvMixer(mode::Symbol; inchannels::Integer = 3, nclasses::Integer = 1000) + ConvMixer(config::Symbol; inchannels::Integer = 3, nclasses::Integer = 1000) Creates a ConvMixer model. ([reference](https://arxiv.org/abs/2201.09792)) # Arguments - - `mode`: the mode of the model, either `:base`, `:small` or `:large` + - `config`: the size of the model, either `:base`, `:small` or `:large` - `inchannels`: The number of channels in the input. - `nclasses`: number of classes in the output """ @@ -56,13 +56,10 @@ struct ConvMixer end @functor ConvMixer -function ConvMixer(mode::Symbol; inchannels::Integer = 3, nclasses::Integer = 1000) - _checkconfig(mode, keys(CONVMIXER_CONFIGS)) - planes = CONVMIXER_CONFIGS[mode][:planes] - depth = CONVMIXER_CONFIGS[mode][:depth] - kernel_size = CONVMIXER_CONFIGS[mode][:kernel_size] - patch_size = CONVMIXER_CONFIGS[mode][:patch_size] - layers = convmixer(planes, depth; inchannels, kernel_size, patch_size, nclasses) +function ConvMixer(config::Symbol; inchannels::Integer = 3, nclasses::Integer = 1000) + _checkconfig(config, keys(CONVMIXER_CONFIGS)) + layers = convmixer(CONVMIXER_CONFIGS[config][1]...; CONVMIXER_CONFIGS[config][2]..., + inchannels, nclasses) return ConvMixer(layers) end diff --git a/src/convnets/convnext.jl b/src/convnets/convnext.jl index 7bb265c24..040a409ab 100644 --- a/src/convnets/convnext.jl +++ b/src/convnets/convnext.jl @@ -22,7 +22,7 @@ function convnextblock(planes::Integer, drop_path_rate = 0.0, layerscale_init = end """ - convnext(depths::Vector{<:Integer}, planes::Vector{<:Integer}; + convnext(depths::AbstractVector{<:Integer}, planes::AbstractVector{<:Integer}; drop_path_rate = 0.0, layerscale_init = 1.0f-6, inchannels::Integer = 3, nclasses::Integer = 1000) @@ -31,27 +31,27 @@ Creates the layers for a ConvNeXt model. # Arguments - - `inchannels`: number of input channels. - `depths`: list with configuration for depth of each block - `planes`: list with configuration for number of output channels in each block - `drop_path_rate`: Stochastic depth rate. - `layerscale_init`: Initial value for [`LayerScale`](#) ([reference](https://arxiv.org/abs/2103.17239)) + - `inchannels`: number of input channels. 
- `nclasses`: number of output classes """ -function convnext(depths::Vector{<:Integer}, planes::Vector{<:Integer}; +function convnext(depths::AbstractVector{<:Integer}, planes::AbstractVector{<:Integer}; drop_path_rate = 0.0, layerscale_init = 1.0f-6, inchannels::Integer = 3, nclasses::Integer = 1000) @assert length(depths) == length(planes) "`planes` should have exactly one value for each block" downsample_layers = [] - stem = Chain(Conv((4, 4), inchannels => planes[1]; stride = 4), - ChannelLayerNorm(planes[1])) - push!(downsample_layers, stem) + push!(downsample_layers, + Chain(conv_norm((4, 4), inchannels => planes[1]; stride = 4, + norm_layer = ChannelLayerNorm)...)) for m in 1:(length(depths) - 1) - downsample_layer = Chain(ChannelLayerNorm(planes[m]), - Conv((2, 2), planes[m] => planes[m + 1]; stride = 2)) - push!(downsample_layers, downsample_layer) + push!(downsample_layers, + Chain(conv_norm((2, 2), planes[m] => planes[m + 1]; stride = 2, + norm_layer = ChannelLayerNorm, revnorm = true)...)) end stages = [] dp_rates = linear_scheduler(drop_path_rate; depth = sum(depths)) @@ -64,8 +64,7 @@ function convnext(depths::Vector{<:Integer}, planes::Vector{<:Integer}; end backbone = collect(Iterators.flatten(Iterators.flatten(zip(downsample_layers, stages)))) classifier = Chain(GlobalMeanPool(), MLUtils.flatten, - LayerNorm(planes[end]), - Dense(planes[end], nclasses)) + LayerNorm(planes[end]), Dense(planes[end], nclasses)) return Chain(Chain(backbone...), classifier) end @@ -77,13 +76,14 @@ const CONVNEXT_CONFIGS = Dict(:tiny => ([3, 3, 9, 3], [96, 192, 384, 768]), :xlarge => ([3, 3, 27, 3], [256, 512, 1024, 2048])) """ - ConvNeXt(mode::Symbol; inchannels::Integer = 3, nclasses::Integer = 1000) + ConvNeXt(config::Symbol; inchannels::Integer = 3, nclasses::Integer = 1000) Creates a ConvNeXt model. ([reference](https://arxiv.org/abs/2201.03545)) # Arguments + - `config`: The size of the model, one of `tiny`, `small`, `base`, `large` or `xlarge`. - `inchannels`: The number of channels in the input. - `nclasses`: number of output classes @@ -94,9 +94,9 @@ struct ConvNeXt end @functor ConvNeXt -function ConvNeXt(mode::Symbol; inchannels::Integer = 3, nclasses::Integer = 1000) - _checkconfig(mode, keys(CONVNEXT_CONFIGS)) - layers = convnext(CONVNEXT_CONFIGS[mode]...; inchannels, nclasses) +function ConvNeXt(config::Symbol; inchannels::Integer = 3, nclasses::Integer = 1000) + _checkconfig(config, keys(CONVNEXT_CONFIGS)) + layers = convnext(CONVNEXT_CONFIGS[config]...; inchannels, nclasses) return ConvNeXt(layers) end diff --git a/src/convnets/densenet.jl b/src/convnets/densenet.jl index b82f138fb..ab833bd41 100644 --- a/src/convnets/densenet.jl +++ b/src/convnets/densenet.jl @@ -99,7 +99,8 @@ Create a DenseNet model - `reduction`: the factor by which the number of feature maps is scaled across each transition - `nclasses`: the number of output classes """ -function densenet(nblocks::Vector{<:Integer}; growth_rate::Integer = 32, reduction = 0.5, +function densenet(nblocks::AbstractVector{<:Integer}; growth_rate::Integer = 32, + reduction = 0.5, inchannels::Integer = 3, nclasses::Integer = 1000) return densenet(2 * growth_rate, [fill(growth_rate, n) for n in nblocks]; reduction, inchannels, nclasses) diff --git a/src/convnets/efficientnet.jl b/src/convnets/efficientnet.jl index 86ba9373f..91986fb92 100644 --- a/src/convnets/efficientnet.jl +++ b/src/convnets/efficientnet.jl @@ -22,8 +22,10 @@ Create an EfficientNet model ([reference](https://arxiv.org/abs/1905.11946v5)). 
- `max_width`: maximum number of output channels before the fully connected classification blocks """ -function efficientnet(scalings, block_configs; max_width::Integer = 1280, - inchannels::Integer = 3, nclasses::Integer = 1000) +function efficientnet(scalings::NTuple{2, Real}, + block_configs::AbstractVector{NTuple{6, Int}}; + max_width::Integer = 1280, inchannels::Integer = 3, + nclasses::Integer = 1000) wscale, dscale = scalings scalew(w) = wscale ≈ 1 ? w : ceil(Int64, wscale * w) scaled(d) = dscale ≈ 1 ? d : ceil(Int64, dscale * d) @@ -83,61 +85,32 @@ const EFFICIENTNET_GLOBAL_CONFIGS = Dict(:b0 => (224, (1.0, 1.0)), :b8 => (672, (2.2, 3.6))) """ - EfficientNet(scalings, block_configs; max_width::Integer = 1280, - inchannels::Integer = 3, nclasses::Integer = 1000) + EfficientNet(config::Symbol; pretrain::Bool = false) Create an EfficientNet model ([reference](https://arxiv.org/abs/1905.11946v5)). See also [`efficientnet`](#). # Arguments - - `scalings`: global width and depth scaling (given as a tuple) - - - `block_configs`: configuration for each inverted residual block, - given as a vector of tuples with elements: - - + `n`: number of block repetitions (will be scaled by global depth scaling) - + `k`: kernel size - + `s`: kernel stride - + `e`: expansion ratio - + `i`: block input channels (will be scaled by global width scaling) - + `o`: block output channels (will be scaled by global width scaling) - - `inchannels`: number of input channels - - `nclasses`: number of output classes - - `max_width`: maximum number of output channels before the fully connected - classification blocks + - `config`: name of default configuration + (can be `:b0`, `:b1`, `:b2`, `:b3`, `:b4`, `:b5`, `:b6`, `:b7`, `:b8`) + - `pretrain`: set to `true` to load the pre-trained weights for ImageNet """ struct EfficientNet layers::Any end @functor EfficientNet -function EfficientNet(scalings, block_configs; max_width::Integer = 1280, - inchannels::Integer = 3, nclasses::Integer = 1000) - layers = efficientnet(scalings, block_configs; inchannels, nclasses, max_width) - return EfficientNet(layers) +function EfficientNet(config::Symbol; pretrain::Bool = false) + _checkconfig(config, keys(EFFICIENTNET_GLOBAL_CONFIGS)) + model = efficientnet(EFFICIENTNET_GLOBAL_CONFIGS[config][2], EFFICIENTNET_BLOCK_CONFIGS) + if pretrain + loadpretrain!(model, string("efficientnet-", config)) + end + return model end (m::EfficientNet)(x) = m.layers(x) backbone(m::EfficientNet) = m.layers[1] classifier(m::EfficientNet) = m.layers[2] - -""" - EfficientNet(name::Symbol; pretrain::Bool = false) - -Create an EfficientNet model ([reference](https://arxiv.org/abs/1905.11946v5)). -See also [`efficientnet`](#). 
- -# Arguments - - - `name`: name of default configuration - (can be `:b0`, `:b1`, `:b2`, `:b3`, `:b4`, `:b5`, `:b6`, `:b7`, `:b8`) - - `pretrain`: set to `true` to load the pre-trained weights for ImageNet -""" -function EfficientNet(name::Symbol; pretrain::Bool = false) - _checkconfig(name, keys(EFFICIENTNET_GLOBAL_CONFIGS)) - model = EfficientNet(EFFICIENTNET_GLOBAL_CONFIGS[name][2], EFFICIENTNET_BLOCK_CONFIGS) - pretrain && loadpretrain!(model, string("efficientnet-", name)) - return model -end diff --git a/src/convnets/inception/googlenet.jl b/src/convnets/inception/googlenet.jl index a72ba5e6c..11d4dd7d3 100644 --- a/src/convnets/inception/googlenet.jl +++ b/src/convnets/inception/googlenet.jl @@ -53,8 +53,7 @@ function googlenet(; inchannels::Integer = 3, nclasses::Integer = 1000) MaxPool((3, 3); stride = 2, pad = 1), _inceptionblock(832, 256, 160, 320, 32, 128, 128), _inceptionblock(832, 384, 192, 384, 48, 128, 128)) - classifier = create_classifier(1024, nclasses; dropout_rate = 0.4) - return Chain(backbone, classifier) + return Chain(backbone, create_classifier(1024, nclasses; dropout_rate = 0.4)) end """ diff --git a/src/convnets/inception/inceptionresnetv2.jl b/src/convnets/inception/inceptionresnetv2.jl index 96b391b65..c2855191b 100644 --- a/src/convnets/inception/inceptionresnetv2.jl +++ b/src/convnets/inception/inceptionresnetv2.jl @@ -92,8 +92,7 @@ function inceptionresnetv2(; inchannels::Integer = 3, dropout_rate = 0.0, [block8(0.20f0) for _ in 1:9]..., block8(; activation = relu), conv_norm((1, 1), 2080, 1536)...) - classifier = create_classifier(1536, nclasses; dropout_rate) - return Chain(backbone, classifier) + return Chain(backbone, create_classifier(1536, nclasses; dropout_rate)) end """ diff --git a/src/convnets/inception/inceptionv3.jl b/src/convnets/inception/inceptionv3.jl index 8a5e19849..e5083feb5 100644 --- a/src/convnets/inception/inceptionv3.jl +++ b/src/convnets/inception/inceptionv3.jl @@ -154,8 +154,7 @@ function inceptionv3(; inchannels::Integer = 3, nclasses::Integer = 1000) inceptionv3_d(768), inceptionv3_e(1280), inceptionv3_e(2048)) - classifier = create_classifier(2048, nclasses; dropout_rate = 0.2) - return Chain(backbone, classifier) + return Chain(backbone, create_classifier(2048, nclasses; dropout_rate = 0.2)) end """ diff --git a/src/convnets/inception/inceptionv4.jl b/src/convnets/inception/inceptionv4.jl index 8d4f00eb2..cd4971742 100644 --- a/src/convnets/inception/inceptionv4.jl +++ b/src/convnets/inception/inceptionv4.jl @@ -117,8 +117,7 @@ function inceptionv4(; dropout_rate = 0.0, inchannels::Integer = 3, inceptionv4_c(), inceptionv4_c(), inceptionv4_c()) - classifier = create_classifier(1536, nclasses; dropout_rate) - return Chain(backbone, classifier) + return Chain(backbone, create_classifier(1536, nclasses; dropout_rate)) end """ diff --git a/src/convnets/inception/xception.jl b/src/convnets/inception/xception.jl index 71a4efc15..1c97daddc 100644 --- a/src/convnets/inception/xception.jl +++ b/src/convnets/inception/xception.jl @@ -45,15 +45,15 @@ function xception_block(inchannels::Integer, outchannels::Integer, nrepeats::Int end """ - xception(; inchannels::Integer = 3, dropout_rate = 0.0, nclasses::Integer = 1000) + xception(; dropout_rate = 0.0, inchannels::Integer = 3, nclasses::Integer = 1000) Creates an Xception model. ([reference](https://arxiv.org/abs/1610.02357)) # Arguments - - `inchannels`: number of input channels. - `dropout_rate`: rate of dropout in classifier head. + - `inchannels`: number of input channels. 
- `nclasses`: the number of output classes. """ function xception(; dropout_rate = 0.0, inchannels::Integer = 3, nclasses::Integer = 1000) diff --git a/src/convnets/mobilenet/mobilenetv1.jl b/src/convnets/mobilenet/mobilenetv1.jl index ca20b4a64..b6d9fe8ee 100644 --- a/src/convnets/mobilenet/mobilenetv1.jl +++ b/src/convnets/mobilenet/mobilenetv1.jl @@ -1,5 +1,5 @@ """ - mobilenetv1(width_mult::Number, config::Vector{<:Tuple}; activation = relu, + mobilenetv1(width_mult::Real, config::AbstractVector{<:Tuple}; activation = relu, inchannels::Integer = 3, nclasses::Integer = 1000) Create a MobileNetv1 model ([reference](https://arxiv.org/abs/1704.04861v1)). @@ -19,11 +19,11 @@ Create a MobileNetv1 model ([reference](https://arxiv.org/abs/1704.04861v1)). - `inchannels`: The number of input channels. The default value is 3. - `nclasses`: The number of output classes """ -function mobilenetv1(width_mult::Number, config::Vector{<:Tuple}; activation = relu, +function mobilenetv1(width_mult::Real, config::AbstractVector{<:Tuple}; activation = relu, inchannels::Integer = 3, nclasses::Integer = 1000) layers = [] for (dw, outch, stride, nrepeats) in config - outch = Int(outch * width_mult) + outch = floor(Int, outch * width_mult) for _ in 1:nrepeats layer = dw ? depthwise_sep_conv_norm((3, 3), inchannels, outch, activation; @@ -76,7 +76,7 @@ struct MobileNetv1 end @functor MobileNetv1 -function MobileNetv1(width_mult::Number = 1; pretrain::Bool = false, +function MobileNetv1(width_mult::Real = 1; pretrain::Bool = false, inchannels::Integer = 3, nclasses::Integer = 1000) layers = mobilenetv1(width_mult, MOBILENETV1_CONFIGS; inchannels, nclasses) if pretrain diff --git a/src/convnets/mobilenet/mobilenetv2.jl b/src/convnets/mobilenet/mobilenetv2.jl index 59e147829..84162e985 100644 --- a/src/convnets/mobilenet/mobilenetv2.jl +++ b/src/convnets/mobilenet/mobilenetv2.jl @@ -1,5 +1,5 @@ """ - mobilenetv2(width_mult::Number, configs::Vector{<:Tuple}; + mobilenetv2(width_mult::Real, configs::AbstractVector{<:Tuple}; max_width::Integer = 1280, inchannels::Integer = 3, nclasses::Integer = 1000) @@ -22,7 +22,7 @@ Create a MobileNetv2 model. - `max_width`: The maximum number of feature maps in any layer of the network - `nclasses`: The number of output classes """ -function mobilenetv2(width_mult::Number, configs::Vector{<:Tuple}; +function mobilenetv2(width_mult::Real, configs::AbstractVector{<:Tuple}; max_width::Integer = 1280, inchannels::Integer = 3, nclasses::Integer = 1000) divisor = width_mult == 0.1 ? 4 : 8 @@ -83,10 +83,9 @@ struct MobileNetv2 end @functor MobileNetv2 -function MobileNetv2(width_mult::Number = 1; pretrain::Bool = false, +function MobileNetv2(width_mult::Real = 1; pretrain::Bool = false, inchannels::Integer = 3, nclasses::Integer = 1000) layers = mobilenetv2(width_mult, MOBILENETV2_CONFIGS; inchannels, nclasses) - pretrain && loadpretrain!(layers, string("MobileNetv2")) if pretrain loadpretrain!(layers, string("MobileNetv2")) end diff --git a/src/convnets/mobilenet/mobilenetv3.jl b/src/convnets/mobilenet/mobilenetv3.jl index 1c5e5825b..7d06ab14d 100644 --- a/src/convnets/mobilenet/mobilenetv3.jl +++ b/src/convnets/mobilenet/mobilenetv3.jl @@ -1,5 +1,5 @@ """ - mobilenetv3(width_mult::Number, configs::Vector{<:Tuple}; + mobilenetv3(width_mult::Real, configs::AbstractVector{<:Tuple}; max_width::Integer = 1024, inchannels::Integer = 3, nclasses::Integer = 1000) @@ -24,7 +24,7 @@ Create a MobileNetv3 model. 
- `max_width`: The maximum number of feature maps in any layer of the network - `nclasses`: the number of output classes """ -function mobilenetv3(width_mult::Number, configs::Vector{<:Tuple}; +function mobilenetv3(width_mult::Real, configs::AbstractVector{<:Tuple}; max_width::Integer = 1024, inchannels::Integer = 3, nclasses::Integer = 1000) # building first layer @@ -91,7 +91,7 @@ const MOBILENETV3_CONFIGS = Dict(:small => [ ]) """ - MobileNetv3(mode::Symbol; width_mult::Number = 1, pretrain::Bool = false, + MobileNetv3(config::Symbol; width_mult::Real = 1, pretrain::Bool = false, inchannels::Integer = 3, nclasses::Integer = 1000) Create a MobileNetv3 model with the specified configuration. @@ -100,7 +100,7 @@ Set `pretrain = true` to load the model with pre-trained weights for ImageNet. # Arguments - - `mode`: :small or :large for the size of the model (see paper). + - `config`: :small or :large for the size of the model (see paper). - `width_mult`: Controls the number of output feature maps in each block (with 1.0 being the default in the paper; this is usually a value between 0.1 and 1.4) @@ -115,14 +115,14 @@ struct MobileNetv3 end @functor MobileNetv3 -function MobileNetv3(mode::Symbol; width_mult::Number = 1, pretrain::Bool = false, +function MobileNetv3(config::Symbol; width_mult::Real = 1, pretrain::Bool = false, inchannels::Integer = 3, nclasses::Integer = 1000) - _checkconfig(mode, [:small, :large]) - max_width = (mode == :large) ? 1280 : 1024 - layers = mobilenetv3(width_mult, MOBILENETV3_CONFIGS[mode]; inchannels, max_width, + _checkconfig(config, [:small, :large]) + max_width = (config == :large) ? 1280 : 1024 + layers = mobilenetv3(width_mult, MOBILENETV3_CONFIGS[config]; max_width, inchannels, nclasses) if pretrain - loadpretrain!(layers, string("MobileNetv3", mode)) + loadpretrain!(layers, string("MobileNetv3", config)) end return MobileNetv3(layers) end diff --git a/src/convnets/resnets/core.jl b/src/convnets/resnets/core.jl index 940565f3a..79deadfb2 100644 --- a/src/convnets/resnets/core.jl +++ b/src/convnets/resnets/core.jl @@ -65,7 +65,7 @@ function bottleneck(inplanes::Integer, planes::Integer; stride::Integer, norm_layer = BatchNorm, revnorm::Bool = false, drop_block = identity, drop_path = identity, attn_fn = planes -> identity) - width = floor(Int, planes * (base_width / 64)) * cardinality + width = fld(planes * base_width, 64) * cardinality first_planes = width ÷ reduction_factor outplanes = planes * 4 conv_bn1 = conv_norm((1, 1), inplanes => first_planes, activation; norm_layer, revnorm, @@ -190,15 +190,16 @@ function resnet_stem(stem_type::Symbol = :default; inchannels::Integer = 3, return Chain(conv1, bn1, stempool) end -function resnet_planes(block_repeats::Vector{<:Integer}) +function resnet_planes(block_repeats::AbstractVector{<:Integer}) return Iterators.flatten((64 * 2^(stage_idx - 1) for _ in 1:stages) for (stage_idx, stages) in enumerate(block_repeats)) end -function basicblock_builder(block_repeats::Vector{<:Integer}; inplanes::Integer = 64, - reduction_factor::Integer = 1, expansion::Integer = 1, - norm_layer = BatchNorm, revnorm::Bool = false, - activation = relu, attn_fn = planes -> identity, +function basicblock_builder(block_repeats::AbstractVector{<:Integer}; + inplanes::Integer = 64, reduction_factor::Integer = 1, + expansion::Integer = 1, norm_layer = BatchNorm, + revnorm::Bool = false, activation = relu, + attn_fn = planes -> identity, drop_block_rate = 0.0, drop_path_rate = 0.0, stride_fn = resnet_stride, planes_fn = resnet_planes, 
downsample_tuple = (downsample_conv, downsample_identity)) @@ -228,11 +229,12 @@ function basicblock_builder(block_repeats::Vector{<:Integer}; inplanes::Integer return get_layers end -function bottleneck_builder(block_repeats::Vector{<:Integer}; inplanes::Integer = 64, - cardinality::Integer = 1, base_width::Integer = 64, - reduction_factor::Integer = 1, expansion::Integer = 4, - norm_layer = BatchNorm, revnorm::Bool = false, - activation = relu, attn_fn = planes -> identity, +function bottleneck_builder(block_repeats::AbstractVector{<:Integer}; + inplanes::Integer = 64, cardinality::Integer = 1, + base_width::Integer = 64, reduction_factor::Integer = 1, + expansion::Integer = 4, norm_layer = BatchNorm, + revnorm::Bool = false, activation = relu, + attn_fn = planes -> identity, drop_block_rate = 0.0, drop_path_rate = 0.0, stride_fn = resnet_stride, planes_fn = resnet_planes, downsample_tuple = (downsample_conv, downsample_identity)) @@ -265,7 +267,7 @@ function bottleneck_builder(block_repeats::Vector{<:Integer}; inplanes::Integer return get_layers end -function resnet_stages(get_layers, block_repeats::Vector{<:Integer}, connection) +function resnet_stages(get_layers, block_repeats::AbstractVector{<:Integer}, connection) # Construct each stage stages = [] for (stage_idx, num_blocks) in enumerate(block_repeats) @@ -277,7 +279,8 @@ function resnet_stages(get_layers, block_repeats::Vector{<:Integer}, connection) return Chain(stages...) end -function resnet(img_dims, stem, get_layers, block_repeats::Vector{<:Integer}, connection, +function resnet(img_dims, stem, get_layers, block_repeats::AbstractVector{<:Integer}, + connection, classifier_fn) # Build stages of the ResNet stage_blocks = resnet_stages(get_layers, block_repeats, connection) @@ -288,7 +291,7 @@ function resnet(img_dims, stem, get_layers, block_repeats::Vector{<:Integer}, co return Chain(backbone, classifier) end -function resnet(block_type::Symbol, block_repeats::Vector{<:Integer}; +function resnet(block_type::Symbol, block_repeats::AbstractVector{<:Integer}; downsample_opt::NTuple{2, Any} = (downsample_conv, downsample_identity), cardinality::Integer = 1, base_width::Integer = 64, inplanes::Integer = 64, reduction_factor::Integer = 1, imsize::Dims{2} = (256, 256), diff --git a/src/convnets/vgg.jl b/src/convnets/vgg.jl index e685620a3..de232d9a3 100644 --- a/src/convnets/vgg.jl +++ b/src/convnets/vgg.jl @@ -40,7 +40,7 @@ Create VGG convolution layers - `batchnorm`: set to `true` to include batch normalization after each convolution - `inchannels`: number of input channels """ -function vgg_convolutional_layers(config::Vector{<:Tuple}, batchnorm::Bool, +function vgg_convolutional_layers(config::AbstractVector{<:Tuple}, batchnorm::Bool, inchannels::Integer) layers = [] ifilters = inchannels @@ -69,7 +69,7 @@ Create VGG classifier (fully connected) layers function vgg_classifier_layers(imsize::NTuple{3, <:Integer}, nclasses::Integer, fcsize::Integer, dropout_rate) return Chain(MLUtils.flatten, - Dense(Int(prod(imsize)), fcsize, relu), + Dense(prod(imsize), fcsize, relu), Dropout(dropout_rate), Dense(fcsize, fcsize, relu), Dropout(dropout_rate), @@ -107,10 +107,7 @@ const VGG_CONV_CONFIGS = Dict(:A => [(64, 1), (128, 1), (256, 2), (512, 2), (512 :D => [(64, 2), (128, 2), (256, 3), (512, 3), (512, 3)], :E => [(64, 2), (128, 2), (256, 4), (512, 4), (512, 4)]) -const VGG_CONFIGS = Dict(11 => :A, - 13 => :B, - 16 => :D, - 19 => :E) +const VGG_CONFIGS = Dict(11 => :A, 13 => :B, 16 => :D, 19 => :E) """ VGG(imsize::Dims{2}; config, 
inchannels, batchnorm = false, nclasses, fcsize, dropout_rate) diff --git a/src/layers/attention.jl b/src/layers/attention.jl index e2276aa01..b8fd38165 100644 --- a/src/layers/attention.jl +++ b/src/layers/attention.jl @@ -1,5 +1,6 @@ """ - MHAttention(planes::Integer, nheads::Integer = 8; qkv_bias::Bool = false, attn_dropout_rate = 0., proj_dropout_rate = 0.) + MHAttention(planes::Integer, nheads::Integer = 8; qkv_bias::Bool = false, + attn_dropout_rate = 0., proj_dropout_rate = 0.) Multi-head self-attention layer. diff --git a/src/layers/conv.jl b/src/layers/conv.jl index 75b40708c..c355eac2f 100644 --- a/src/layers/conv.jl +++ b/src/layers/conv.jl @@ -1,7 +1,11 @@ """ - conv_norm(kernel_size, inplanes::Int, outplanes::Int, activation = relu; - norm_layer = BatchNorm, revnorm = false, preact = false, use_norm = true, - stride = 1, pad = 0, dilation = 1, groups = 1, [bias, weight, init]) + conv_norm(kernel_size, inplanes::Integer, outplanes::Integer, activation = relu; + norm_layer = BatchNorm, revnorm::Bool = false, preact::Bool = false, + use_norm::Bool = true, stride::Integer = 1, pad::Integer = 0, + dilation::Integer = 1, groups::Integer = 1, [bias, weight, init]) + + conv_norm(kernel_size, inplanes => outplanes, activation = identity; + kwargs...) Create a convolution + batch normalization pair with activation. @@ -59,17 +63,21 @@ function conv_norm(kernel_size, ch::Pair{<:Integer, <:Integer}, activation = ide end """ - depthwise_sep_conv_norm(kernel_size, inplanes, outplanes, activation = relu; - revnorm = false, use_norm = (true, true), - stride = 1, pad = 0, dilation = 1, [bias, weight, init]) + depthwise_sep_conv_norm(kernel_size, inplanes::Integer, outplanes::Integer, + activation = relu; norm_layer = BatchNorm, + revnorm::Bool = false, stride::Integer = 1, + use_norm::NTuple{2, Bool} = (true, true), + pad::Integer = 0, dilation::Integer = 1, [bias, weight, init]) Create a depthwise separable convolution chain as used in MobileNetv1. This is sequence of layers: - a `kernel_size` depthwise convolution from `inplanes => inplanes` - - a batch norm layer + `activation` (if `use_norm[1] == true`; otherwise `activation` is applied to the convolution output) + - a (batch) normalisation layer + `activation` (if `use_norm[1] == true`; otherwise + `activation` is applied to the convolution output) - a `kernel_size` convolution from `inplanes => outplanes` - - a batch norm layer + `activation` (if `use_norm[2] == true`; otherwise `activation` is applied to the convolution output) + - a (batch) normalisation layer + `activation` (if `use_norm[2] == true`; otherwise + `activation` is applied to the convolution output) See Fig. 3 in [reference](https://arxiv.org/abs/1704.04861v1). @@ -80,7 +88,8 @@ See Fig. 3 in [reference](https://arxiv.org/abs/1704.04861v1). - `outplanes`: number of output feature maps - `activation`: the activation function for the final layer - `revnorm`: set to `true` to place the batch norm before the convolution - - `use_norm`: a tuple of two booleans to specify whether to use normalization for the first and second convolution + - `use_norm`: a tuple of two booleans to specify whether to use normalization for the first and + second convolution - `stride`: stride of the first convolution kernel - `pad`: padding of the first convolution kernel - `dilation`: dilation of the first convolution kernel @@ -88,9 +97,8 @@ See Fig. 3 in [reference](https://arxiv.org/abs/1704.04861v1). 
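
For intuition, the factorisation described above corresponds roughly to the usual
MobileNet-style pair of layers written out by hand in Flux (an editor's sketch with
made-up sizes, not the exact layers this helper returns):

    using Flux
    inplanes, outplanes = 32, 64
    depthwise = Chain(Conv((3, 3), inplanes => inplanes; pad = 1, groups = inplanes, bias = false),
                      BatchNorm(inplanes, relu))
    pointwise = Chain(Conv((1, 1), inplanes => outplanes; bias = false),
                      BatchNorm(outplanes, relu))
    block = Chain(depthwise, pointwise)
    size(block(rand(Float32, 56, 56, inplanes, 1)))  # (56, 56, 64, 1)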
""" function depthwise_sep_conv_norm(kernel_size, inplanes::Integer, outplanes::Integer, activation = relu; norm_layer = BatchNorm, - revnorm::Bool = false, - use_norm::NTuple{2, Bool} = (true, true), - stride::Integer = 1, kwargs...) + revnorm::Bool = false, stride::Integer = 1, + use_norm::NTuple{2, Bool} = (true, true), kwargs...) return vcat(conv_norm(kernel_size, inplanes, inplanes, activation; norm_layer, revnorm, use_norm = use_norm[1], stride, groups = inplanes, kwargs...), @@ -135,9 +143,9 @@ function invertedresidual(kernel_size, inplanes::Integer, hidden_planes::Integer end function invertedresidual(kernel_size, inplanes::Integer, outplanes::Integer, - activation = relu; stride::Integer, expansion, + activation = relu; stride::Integer, expansion::Real, reduction::Union{Nothing, Integer} = nothing) - hidden_planes = Int(inplanes * expansion) + hidden_planes = floor(Int, inplanes * expansion) return invertedresidual(kernel_size, inplanes, hidden_planes, outplanes, activation; stride, reduction) end diff --git a/src/layers/drop.jl b/src/layers/drop.jl index f823d5c22..31c06c07a 100644 --- a/src/layers/drop.jl +++ b/src/layers/drop.jl @@ -50,6 +50,23 @@ end # Dispatch for CPU dropblock_mask(rng, x, gamma, bs) = _dropblock_mask(rng, x, gamma, bs) +""" + DropBlock(drop_block_prob = 0.1, block_size = 7, gamma_scale = 1.0, + rng = rng_from_array()) + +The `DropBlock` layer. While training, it zeroes out continguous regions of +size `block_size` in the input. During inference, it simply returns the input `x`. +((reference)[https://arxiv.org/abs/1810.12890]) + +# Arguments + + - `drop_block_prob`: probability of dropping a block + - `block_size`: size of the block to drop + - `gamma_scale`: multiplicative factor for `gamma` used. For the calculation of gamma, + refer to [the paper](https://arxiv.org/abs/1810.12890). + - `rng`: can be used to pass in a custom RNG instead of the default. Custom RNGs are only + supported on the CPU. +""" mutable struct DropBlock{F, R <: AbstractRNG} drop_block_prob::F block_size::Integer @@ -84,23 +101,6 @@ function Flux.testmode!(m::DropBlock, mode = true) return (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode; m) end -""" - DropBlock(drop_block_prob = 0.1, block_size = 7, gamma_scale = 1.0, - rng = rng_from_array()) - -The `DropBlock` layer. While training, it zeroes out continguous regions of -size `block_size` in the input. During inference, it simply returns the input `x`. -((reference)[https://arxiv.org/abs/1810.12890]) - -# Arguments - - - `drop_block_prob`: probability of dropping a block - - `block_size`: size of the block to drop - - `gamma_scale`: multiplicative factor for `gamma` used. For the calculation of gamma, - refer to [the paper](https://arxiv.org/abs/1810.12890). - - `rng`: can be used to pass in a custom RNG instead of the default. Custom RNGs are only - supported on the CPU. -""" function DropBlock(drop_block_prob = 0.1, block_size::Integer = 7, gamma_scale = 1.0, rng = rng_from_array()) if drop_block_prob == 0.0 diff --git a/src/layers/embeddings.jl b/src/layers/embeddings.jl index 560ac074d..cb9b8378c 100644 --- a/src/layers/embeddings.jl +++ b/src/layers/embeddings.jl @@ -23,10 +23,8 @@ function PatchEmbedding(imsize::Dims{2} = (224, 224); inchannels::Integer = 3, norm_layer = planes -> identity, flatten::Bool = true) im_height, im_width = imsize patch_height, patch_width = patch_size - @assert (im_height % patch_height == 0) && (im_width % patch_width == 0) "Image dimensions must be divisible by the patch size." 
- return Chain(Conv(patch_size, inchannels => embedplanes; stride = patch_size), flatten ? _flatten_spatial : identity, norm_layer(embedplanes)) diff --git a/src/layers/pool.jl b/src/layers/pool.jl index 049c06451..60447ddea 100644 --- a/src/layers/pool.jl +++ b/src/layers/pool.jl @@ -4,12 +4,13 @@ A type of adaptive pooling layer which uses both mean and max pooling and combines them to produce a single output. Note that this is equivalent to -`Parallel(connection, AdaptiveMeanPool(output_size), AdaptiveMaxPool(output_size))` +`Parallel(connection, AdaptiveMeanPool(output_size), AdaptiveMaxPool(output_size))`. +When `connection` is not specified, it defaults to `+`. # Arguments - - `output_size`: The size of the output after pooling. - `connection`: The connection type to use. + - `output_size`: The size of the output after pooling. """ function AdaptiveMeanMaxPool(connection, output_size::Tuple = (1, 1)) return Parallel(connection, AdaptiveMeanPool(output_size), AdaptiveMaxPool(output_size)) diff --git a/src/mixers/core.jl b/src/mixers/core.jl index f08a5f5d5..875136b2e 100644 --- a/src/mixers/core.jl +++ b/src/mixers/core.jl @@ -1,7 +1,7 @@ """ mlpmixer(block, imsize::Dims{2} = (224, 224); inchannels::Integer = 3, norm_layer = LayerNorm, patch_size::Dims{2} = (16, 16), embedplanes = 512, drop_path_rate = 0., - depth = 12, nclasses::Integer = 1000, kwargs...) + depth::Integer = 12, nclasses::Integer = 1000, kwargs...) Creates a model with the MLPMixer architecture. ([reference](https://arxiv.org/pdf/2105.01601)). @@ -23,7 +23,8 @@ Creates a model with the MLPMixer architecture. """ function mlpmixer(block, imsize::Dims{2} = (224, 224); norm_layer = LayerNorm, patch_size::Dims{2} = (16, 16), embedplanes = 512, drop_path_rate = 0.0, - depth = 12, inchannels::Integer = 3, nclasses::Integer = 1000, kwargs...) + depth::Integer = 12, inchannels::Integer = 3, nclasses::Integer = 1000, + kwargs...) npatches = prod(imsize .÷ patch_size) dp_rates = linear_scheduler(drop_path_rate; depth) layers = Chain(PatchEmbedding(imsize; inchannels, patch_size, embedplanes), @@ -35,7 +36,7 @@ function mlpmixer(block, imsize::Dims{2} = (224, 224); norm_layer = LayerNorm, end # Configurations for MLPMixer models -const MIXER_CONFIGS = Dict(:small => Dict(:depth => 8, :planes => 512), - :base => Dict(:depth => 12, :planes => 768), - :large => Dict(:depth => 24, :planes => 1024), - :huge => Dict(:depth => 32, :planes => 1280)) +const MIXER_CONFIGS = Dict(:small => (depth = 8, embedplanes = 512), + :base => (depth = 12, embedplanes = 768), + :large => (depth = 24, embedplanes = 1024), + :huge => (depth = 32, embedplanes = 1280)) diff --git a/src/mixers/gmlp.jl b/src/mixers/gmlp.jl index df4a52b70..ab89baadc 100644 --- a/src/mixers/gmlp.jl +++ b/src/mixers/gmlp.jl @@ -63,7 +63,7 @@ Creates a feedforward block based on the gMLP model architecture described in th function spatial_gating_block(planes::Integer, npatches::Integer; mlp_ratio = 4.0, norm_layer = LayerNorm, mlp_layer = gated_mlp_block, dropout_rate = 0.0, drop_path_rate = 0.0, activation = gelu) - channelplanes = Int(mlp_ratio * planes) + channelplanes = floor(Int, mlp_ratio * planes) sgu = inplanes -> SpatialGatingUnit(inplanes, npatches; norm_layer) return SkipConnection(Chain(norm_layer(planes), mlp_layer(sgu, planes, channelplanes; activation, @@ -72,7 +72,7 @@ function spatial_gating_block(planes::Integer, npatches::Integer; mlp_ratio = 4. 
end """ - gMLP(size::Symbol; patch_size::Dims{2} = (16, 16), imsize::Dims{2} = (224, 224), + gMLP(config::Symbol; patch_size::Dims{2} = (16, 16), imsize::Dims{2} = (224, 224), inchannels::Integer = 3, nclasses::Integer = 1000) Creates a model with the gMLP architecture. @@ -80,7 +80,7 @@ Creates a model with the gMLP architecture. # Arguments - - `size`: the size of the model - one of `small`, `base`, `large` or `huge` + - `config`: the size of the model - one of `small`, `base`, `large` or `huge` - `patch_size`: the size of the patches - `imsize`: the size of the input image - `inchannels`: the number of input channels @@ -93,13 +93,11 @@ struct gMLP end @functor gMLP -function gMLP(size::Symbol; imsize::Dims{2} = (224, 224), patch_size::Dims{2} = (16, 16), +function gMLP(config::Symbol; imsize::Dims{2} = (224, 224), patch_size::Dims{2} = (16, 16), inchannels::Integer = 3, nclasses::Integer = 1000) - _checkconfig(size, keys(MIXER_CONFIGS)) - depth = MIXER_CONFIGS[size][:depth] - embedplanes = MIXER_CONFIGS[size][:planes] + _checkconfig(config, keys(MIXER_CONFIGS)) layers = mlpmixer(spatial_gating_block, imsize; mlp_layer = gated_mlp_block, patch_size, - embedplanes, depth, inchannels, nclasses) + MIXER_CONFIGS[config]..., inchannels, nclasses) return gMLP(layers) end diff --git a/src/mixers/mlpmixer.jl b/src/mixers/mlpmixer.jl index 90a6aaebb..b784a8f8e 100644 --- a/src/mixers/mlpmixer.jl +++ b/src/mixers/mlpmixer.jl @@ -20,7 +20,7 @@ Creates a feedforward block for the MLPMixer architecture. function mixerblock(planes::Integer, npatches::Integer; mlp_layer = mlp_block, mlp_ratio::NTuple{2, Number} = (0.5, 4.0), dropout_rate = 0.0, drop_path_rate = 0.0, activation = gelu) - tokenplanes, channelplanes = Int.(planes .* mlp_ratio) + tokenplanes, channelplanes = floor.(Int, planes .* mlp_ratio) return Chain(SkipConnection(Chain(LayerNorm(planes), swapdims((2, 1, 3)), mlp_layer(npatches, tokenplanes; activation, @@ -34,7 +34,7 @@ function mixerblock(planes::Integer, npatches::Integer; mlp_layer = mlp_block, end """ - MLPMixer(size::Symbol; patch_size::Dims{2} = (16, 16), imsize::Dims{2} = (224, 224), + MLPMixer(config::Symbol; patch_size::Dims{2} = (16, 16), imsize::Dims{2} = (224, 224), inchannels::Integer = 3, nclasses::Integer = 1000) Creates a model with the MLPMixer architecture. @@ -42,7 +42,7 @@ Creates a model with the MLPMixer architecture. 
# Arguments - - `size`: the size of the model - one of `small`, `base`, `large` or `huge` + - `config`: the size of the model - one of `small`, `base`, `large` or `huge` - `patch_size`: the size of the patches - `imsize`: the size of the input image - `drop_path_rate`: Stochastic depth rate @@ -56,13 +56,10 @@ struct MLPMixer end @functor MLPMixer -function MLPMixer(size::Symbol; imsize::Dims{2} = (224, 224), - patch_size::Dims{2} = (16, 16), +function MLPMixer(config::Symbol; imsize::Dims{2} = (224, 224), patch_size::Dims{2} = (16, 16), inchannels::Integer = 3, nclasses::Integer = 1000) - _checkconfig(size, keys(MIXER_CONFIGS)) - depth = MIXER_CONFIGS[size][:depth] - embedplanes = MIXER_CONFIGS[size][:planes] - layers = mlpmixer(mixerblock, imsize; patch_size, embedplanes, depth, inchannels, + _checkconfig(config, keys(MIXER_CONFIGS)) + layers = mlpmixer(mixerblock, imsize; patch_size, MIXER_CONFIGS[config]..., inchannels, nclasses) return MLPMixer(layers) end diff --git a/src/mixers/resmlp.jl b/src/mixers/resmlp.jl index f2c9ece15..21ad89d65 100644 --- a/src/mixers/resmlp.jl +++ b/src/mixers/resmlp.jl @@ -27,15 +27,14 @@ function resmixerblock(planes::Integer, npatches::Integer; mlp_layer = mlp_block LayerScale(planes, layerscale_init), DropPath(drop_path_rate)), +), SkipConnection(Chain(Flux.Scale(planes), - mlp_layer(planes, Int(mlp_ratio * planes); - dropout_rate, - activation), + mlp_layer(planes, floor(Int, mlp_ratio * planes); + dropout_rate, activation), LayerScale(planes, layerscale_init), DropPath(drop_path_rate)), +)) end """ - ResMLP(size::Symbol; patch_size::Dims{2} = (16, 16), imsize::Dims{2} = (224, 224), + ResMLP(config::Symbol; patch_size::Dims{2} = (16, 16), imsize::Dims{2} = (224, 224), inchannels::Integer = 3, nclasses::Integer = 1000) Creates a model with the ResMLP architecture. @@ -43,7 +42,7 @@ Creates a model with the ResMLP architecture. # Arguments - - `size`: the size of the model - one of `small`, `base`, `large` or `huge` + - `config`: the size of the model - one of `small`, `base`, `large` or `huge` - `patch_size`: the size of the patches - `imsize`: the size of the input image - `inchannels`: the number of input channels @@ -56,13 +55,12 @@ struct ResMLP end @functor ResMLP -function ResMLP(size::Symbol; imsize::Dims{2} = (224, 224), patch_size::Dims{2} = (16, 16), +function ResMLP(config::Symbol; imsize::Dims{2} = (224, 224), + patch_size::Dims{2} = (16, 16), inchannels::Integer = 3, nclasses::Integer = 1000) - _checkconfig(size, keys(MIXER_CONFIGS)) - depth = MIXER_CONFIGS[size][:depth] - embedplanes = MIXER_CONFIGS[size][:planes] - layers = mlpmixer(resmixerblock, imsize; mlp_ratio = 4.0, patch_size, embedplanes, - depth, inchannels, nclasses) + _checkconfig(config, keys(MIXER_CONFIGS)) + layers = mlpmixer(resmixerblock, imsize; mlp_ratio = 4.0, patch_size, + MIXER_CONFIGS[config]..., inchannels, nclasses) return ResMLP(layers) end diff --git a/src/utilities.jl b/src/utilities.jl index 981777228..359010cfe 100644 --- a/src/utilities.jl +++ b/src/utilities.jl @@ -67,7 +67,7 @@ end Returns the dropout rates for a given depth using the linear scaling rule. 
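
For example (an editor's illustration of the rule, matching the `LinRange` call in the
function body below):

    linear_scheduler(0.2; depth = 5)
    # ranges linearly from `start_value = 0.0` to 0.2 over 5 steps:
    # 0.0, 0.05, 0.1, 0.15, 0.2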
""" -function linear_scheduler(drop_rate = 0.0; depth, start_value = 0.0) +function linear_scheduler(drop_rate = 0.0; depth::Integer, start_value = 0.0) return LinRange(start_value, drop_rate, depth) end diff --git a/src/vit-based/vit.jl b/src/vit-based/vit.jl index 6f145a4bb..099d00639 100644 --- a/src/vit-based/vit.jl +++ b/src/vit-based/vit.jl @@ -76,7 +76,7 @@ const VIT_CONFIGS = Dict(:tiny => (depth = 12, embedplanes = 192, nheads = 3), mlp_ratio = 64 // 13)) """ - ViT(mode::Symbol = base; imsize::Dims{2} = (256, 256), inchannels::Integer = 3, + ViT(config::Symbol = base; imsize::Dims{2} = (256, 256), inchannels::Integer = 3, patch_size::Dims{2} = (16, 16), pool = :class, nclasses::Integer = 1000) Creates a Vision Transformer (ViT) model. @@ -84,7 +84,7 @@ Creates a Vision Transformer (ViT) model. # Arguments - - `mode`: the model configuration, one of + - `config`: the model configuration, one of `[:tiny, :small, :base, :large, :huge, :giant, :gigantic]` - `imsize`: image size - `inchannels`: number of input channels @@ -99,10 +99,10 @@ struct ViT end @functor ViT -function ViT(mode::Symbol; imsize::Dims{2} = (256, 256), patch_size::Dims{2} = (16, 16), +function ViT(config::Symbol; imsize::Dims{2} = (256, 256), patch_size::Dims{2} = (16, 16), inchannels::Integer = 3, nclasses::Integer = 1000) - _checkconfig(mode, keys(VIT_CONFIGS)) - layers = vit(imsize; inchannels, patch_size, nclasses, VIT_CONFIGS[mode]...) + _checkconfig(config, keys(VIT_CONFIGS)) + layers = vit(imsize; inchannels, patch_size, nclasses, VIT_CONFIGS[config]...) return ViT(layers) end diff --git a/test/convnets.jl b/test/convnets.jl index 35a745b87..c58cad7be 100644 --- a/test/convnets.jl +++ b/test/convnets.jl @@ -123,16 +123,16 @@ end end @testset "EfficientNet" begin - @testset "EfficientNet($name)" for name in [:b0, :b1, :b2, :b3, :b4, :b5] #:b6, :b7, :b8] + @testset "EfficientNet($config)" for config in [:b0, :b1, :b2, :b3, :b4, :b5] #:b6, :b7, :b8] # preferred image resolution scaling - r = Metalhead.EFFICIENTNET_GLOBAL_CONFIGS[name][1] + r = Metalhead.EFFICIENTNET_GLOBAL_CONFIGS[config][1] x = rand(Float32, r, r, 3, 1) - m = EfficientNet(name) + m = EfficientNet(config) @test size(m(x)) == (1000, 1) - if (EfficientNet, name) in PRETRAINED_MODELS - @test acctest(EfficientNet(name, pretrain = true)) + if (EfficientNet, config) in PRETRAINED_MODELS + @test acctest(EfficientNet(config, pretrain = true)) else - @test_throws ArgumentError EfficientNet(name, pretrain = true) + @test_throws ArgumentError EfficientNet(config, pretrain = true) end @test gradtest(m, x) _gc() @@ -249,13 +249,13 @@ end end _gc() @testset "MobileNetv3" verbose = true begin - @testset for mode in [:small, :large] - m = MobileNetv3(mode) + @testset for config in [:small, :large] + m = MobileNetv3(config) @test size(m(x_224)) == (1000, 1) - if (MobileNetv3, mode) in PRETRAINED_MODELS - @test acctest(MobileNetv3(mode; pretrain = true)) + if (MobileNetv3, config) in PRETRAINED_MODELS + @test acctest(MobileNetv3(config; pretrain = true)) else - @test_throws ArgumentError MobileNetv3(mode; pretrain = true) + @test_throws ArgumentError MobileNetv3(config; pretrain = true) end @test gradtest(m, x_224) _gc() @@ -264,8 +264,8 @@ end end @testset "ConvNeXt" verbose = true begin - @testset for mode in [:small, :base, :large, :tiny, :xlarge] - m = ConvNeXt(mode) + @testset for config in [:small, :base, :large, :tiny, :xlarge] + m = ConvNeXt(config) @test size(m(x_224)) == (1000, 1) @test gradtest(m, x_224) _gc() @@ -273,8 +273,8 @@ end end @testset 
"ConvMixer" verbose = true begin - @testset for mode in [:small, :base, :large] - m = ConvMixer(mode) + @testset for config in [:small, :base, :large] + m = ConvMixer(config) @test size(m(x_224)) == (1000, 1) @test gradtest(m, x_224) _gc() diff --git a/test/mixers.jl b/test/mixers.jl index 51cdd736e..2a5d9af70 100644 --- a/test/mixers.jl +++ b/test/mixers.jl @@ -1,6 +1,6 @@ @testset for model in [MLPMixer, ResMLP, gMLP] - @testset for mode in [:small, :base, :large] - m = model(mode) + @testset for config in [:small, :base, :large] + m = model(config) @test size(m(x_224)) == (1000, 1) @test gradtest(m, x_224) _gc() diff --git a/test/vits.jl b/test/vits.jl index fb9fd6b02..7561cfdb5 100644 --- a/test/vits.jl +++ b/test/vits.jl @@ -1,6 +1,6 @@ @testset "ViT" begin - for mode in [:tiny, :small, :base, :large, :huge] # :giant, :gigantic] - m = ViT(mode) + for config in [:tiny, :small, :base, :large, :huge] # :giant, :gigantic] + m = ViT(config) @test size(m(x_256)) == (1000, 1) @test gradtest(m, x_256) _gc()