diff --git a/benchmarking/Project.toml b/benchmarking/Project.toml
new file mode 100644
index 00000000..9607bdd9
--- /dev/null
+++ b/benchmarking/Project.toml
@@ -0,0 +1,25 @@
+[deps]
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+Colors = "5ae59095-9a9b-59fe-a467-6f913c188581"
+DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
+Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
+GLMakie = "e9467ef8-e4e7-5192-8a1a-b1aee30e663a"
+MLDatasets = "eb30cadb-4394-5ae3-aed4-317e484a6458"
+Metalhead = "dbeba491-748d-5e0e-a39e-b530a07fa0cc"
+Optimisers = "3bd65402-5787-11e9-1adc-39752487f4e2"
+ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca"
+TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
+cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd"
+
+[compat]
+Colors = "0.12"
+CUDA = "5"
+DataFrames = "1"
+Flux = "0.14"
+GLMakie = "0.20"
+MLDatasets = "0.7"
+Metalhead = "0.9"
+Optimisers = "0.3"
+ProgressMeter = "1.9"
+TimerOutputs = "0.5"
+cuDNN = "1.2"
diff --git a/benchmarking/benchmark.jl b/benchmarking/benchmark.jl
new file mode 100644
index 00000000..a090a6a5
--- /dev/null
+++ b/benchmarking/benchmark.jl
@@ -0,0 +1,105 @@
+
+using Colors
+using CUDA, cuDNN
+using DataFrames
+using Flux
+using Flux: logitcrossentropy, onecold, onehotbatch
+using GLMakie
+using Metalhead
+using MLDatasets
+using Optimisers
+using ProgressMeter
+using TimerOutputs
+
+include("tooling.jl")
+
+function run()
+
+epochs = 45
+batchsize = 1000
+device = gpu
+CUDA.allowscalar(false)
+allow_skips = true
+
+train_loader, test_loader, labels = load_cifar10(; batchsize)
+nlabels = length(labels)
+firstbatch = first(first(train_loader))
+imsize = size(firstbatch)[1:2]
+
+@info "Benchmarking" epochs batchsize device imsize
+
+to = TimerOutput()
+
+common = "pretrain=false, inchannels=3, nclasses=$(length(labels))"
+
+# these should all be the smallest variant of each model that is tested in `/test`
+modelstrings = (
+    "AlexNet(; $common)",
+    "VGG(11, batchnorm=true; $common)",
+    "SqueezeNet(; $common)",
+    "ResNet(18; $common)",
+    "WideResNet(50; $common)",
+    "ResNeXt(50, cardinality=32, base_width=4; $common)",
+    "SEResNet(18; $common)",
+    "SEResNeXt(50, cardinality=32, base_width=4; $common)",
+    "Res2Net(50, base_width=26, scale=4; $common)",
+    "Res2NeXt(50; $common)",
+    "GoogLeNet(batchnorm=true; $common)",
+    "DenseNet(121; $common)",
+    "Inceptionv3(; $common)",
+    "Inceptionv4(; $common)",
+    "InceptionResNetv2(; $common)",
+    "Xception(; $common)",
+    "MobileNetv1(0.5; $common)",
+    "MobileNetv2(0.5; $common)",
+    "MobileNetv3(:small, width_mult=0.5; $common)",
+    "MNASNet(:A1, width_mult=0.5; $common)",
+    "EfficientNet(:b0; $common)",
+    "EfficientNetv2(:small; $common)",
+    "ConvMixer(:small; $common)",
+    "ConvNeXt(:small; $common)",
+    # "MLPMixer(; $common)", # no tests found
+    # "ResMLP(; $common)", # no tests found
+    # "gMLP(; $common)", # no tests found
+    "ViT(:tiny; $common)",
+    # "UNet(; $common)" # doesn't support kwargs "inchannels", "nclasses"
+    )
+df = DataFrame(; model=String[], train_loss=Float64[], train_acc=Float64[], test_loss=Float64[], test_acc=Float64[])
+cols = distinguishable_colors(length(modelstrings), [RGB(1,1,1), RGB(0,0,0)], dropseed=true)
+f = Figure()
+ax = Axis(f[1, 1], title="CIFAR-10 Training on an Nvidia 3090, batch 1000\nTest accuracy vs. time over 45 epochs", xlabel="Time (s)", ylabel="Testset Accuracy")
+display(f)
+max_x = 0
+for (i, modstring) in enumerate(modelstrings)
+    @timeit to "$modstring" begin
+        @info "Evaluating $i/$(length(modelstrings)): $modstring"
+        # Initial precompilation time varies based on what ran before, so don't time the first load
+        eval(Meta.parse(modstring))
+        # the second load simulates what might be possible with a properly set-up pkgimage workload
+        @timeit to "Load" model = eval(Meta.parse(modstring))
+        @timeit to "Training" ret = train(model,
+            train_loader,
+            test_loader;
+            limit = 1,
+            to,
+            device
+        )
+        elapsed, train_loss, train_acc, test_loss, test_acc = if isnothing(ret)
+            allow_skips || break
+            missing, missing, missing, missing, missing
+        else
+            elapsed, train_loss_hist, train_acc_hist, test_loss_hist, test_acc_hist = ret
+            max_x = max(maximum(elapsed), max_x)
+            lines!(ax, elapsed, test_acc_hist, label=modstring, color=cols[i])
+            elapsed, train_loss_hist[end], train_acc_hist[end], test_loss_hist[end], test_acc_hist[end]
+        end
+        push!(df, (modstring, train_loss, train_acc, test_loss, test_acc), promote=true)
+    end
+    GC.gc(true)
+end
+f[1, 2] = Legend(f, ax, "Models", framevisible = false)
+display(f)
+display(df)
+print_timer(to; sortby = :firstexec)
+end
+run()
diff --git a/benchmarking/tooling.jl b/benchmarking/tooling.jl
new file mode 100644
index 00000000..06beab2a
--- /dev/null
+++ b/benchmarking/tooling.jl
@@ -0,0 +1,104 @@
+function loss_and_accuracy(data_loader, model, device; limit = nothing)
+    acc = 0
+    ls = 0.0f0
+    num = 0
+    i = 0
+    for (x, y) in data_loader
+        x, y = x |> device, y |> device
+        ŷ = model(x)
+        ls += logitcrossentropy(ŷ, y, agg=sum)
+        acc += sum(onecold(ŷ) .== onecold(y))
+        num += size(x)[end]
+        if limit !== nothing
+            i == limit && break
+            i += 1
+        end
+    end
+    return ls / num, acc / num
+end
+
+function load_cifar10(; batchsize=1000)
+    @info "loading CIFAR-10 dataset"
+    train_dataset, test_dataset = CIFAR10(split=:train), CIFAR10(split=:test)
+    train_x, train_y = train_dataset[:]
+    test_x, test_y = test_dataset[:]
+    @assert train_dataset.metadata["class_names"] == test_dataset.metadata["class_names"]
+    labels = train_dataset.metadata["class_names"]
+
+    # CIFAR10 label indices seem to be zero-indexed
+    train_y .+= 1
+    test_y .+= 1
+
+    train_y_ohb = Flux.onehotbatch(train_y, eachindex(labels))
+    test_y_ohb = Flux.onehotbatch(test_y, eachindex(labels))
+
+    train_loader = Flux.DataLoader((data=train_x, labels=train_y_ohb); batchsize, shuffle=true)
+    test_loader = Flux.DataLoader((data=test_x, labels=test_y_ohb); batchsize)
+
+    return train_loader, test_loader, labels
+end
+
+function _train(model, train_loader, test_loader; epochs = 45,
+        device = gpu, limit=nothing, gpu_gc=true, gpu_stats=false, show_plots=false, to=TimerOutput())
+
+    model = model |> device
+
+    opt = Optimisers.Adam()
+    state = Optimisers.setup(opt, model)
+
+    train_loss_hist, train_acc_hist = Float64[], Float64[]
+    test_loss_hist, test_acc_hist = Float64[], Float64[]
+    elapsed = Float64[]
+    @showprogress "training" for epoch in 1:epochs
+        i = 0
+        for (x, y) in train_loader
+            x, y = x |> device, y |> device
+            @timeit to "batch step" begin
+                gs, _ = gradient(model, x) do m, _x
+                    logitcrossentropy(m(_x), y)
+                end
+                state, model = Optimisers.update!(state, model, gs)
+            end
+
+            device === gpu && gpu_stats && CUDA.memory_status()
+            if limit !== nothing
+                i == limit && break
+                i += 1
+            end
+        end
+
+        train_loss, train_acc = loss_and_accuracy(train_loader, model, device; limit)
+        @timeit to "testing" test_loss, test_acc = loss_and_accuracy(test_loader, model, device; limit)
+        push!(train_loss_hist, train_loss); push!(train_acc_hist, train_acc);
+        push!(test_loss_hist, test_loss); push!(test_acc_hist, test_acc);
+        push!(elapsed, time())
+        if show_plots
+            plt2 = lineplot(1:epoch, train_loss_hist, name = "train_loss", xlabel="epoch", ylabel="loss")
+            lineplot!(plt2, 1:epoch, test_loss_hist, name = "test_loss")
+            display(plt2)
+            plt2 = lineplot(1:epoch, train_acc_hist, name = "train_acc", xlabel="epoch", ylabel="acc")
+            lineplot!(plt2, 1:epoch, test_acc_hist, name = "test_acc")
+            display(plt2)
+        end
+        if device === gpu && gpu_gc
+            GC.gc() # GPU will OOM without this
+        end
+    end
+    elapsed = elapsed .- elapsed[1]
+    train_loss, train_acc, test_loss, test_acc = train_loss_hist[end], train_acc_hist[end], test_loss_hist[end], test_acc_hist[end]
+    @info "results after $epochs epochs $(repr(map(x->round(x, digits=3), (; train_loss, train_acc, test_loss, test_acc))))"
+    return elapsed, train_loss_hist, train_acc_hist, test_loss_hist, test_acc_hist
+end
+
+# Flux stacktraces are ludicrously big on Julia <1.10, so don't show them
+function train(args...; kwargs...)
+    try
+        _train(args...; kwargs...)
+    catch ex
+        # rethrow()
+        println()
+        println(sprint(showerror, ex))
+        GC.gc()
+        return nothing
+    end
+end
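
For reference, one plausible way to drive the suite locally (not part of the patch above; the Pkg.develop step is an assumption so that the local Metalhead checkout, rather than the registered release pulled in by the Project.toml alone, gets benchmarked):

    # sketch of a local invocation from the repository root; paths and the develop step are assumptions
    using Pkg
    Pkg.activate("benchmarking")                        # use benchmarking/Project.toml
    Pkg.develop(path=".")                               # assumption: benchmark the local Metalhead checkout
    Pkg.instantiate()
    include(joinpath("benchmarking", "benchmark.jl"))   # defines run() and calls it at the end of the file

display(f) opens a GLMakie window plotting test accuracy against wall-clock time per model, and the results DataFrame plus the TimerOutputs summary are printed once run() returns.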