#!/usr/bin/env python3
# mypy: no-disallow-untyped-defs
"""
Sometimes one wants to replace the gradient of a model with a different gradient
from another model to make the attack more reliable. That is, the forward pass
should go through model 1, but the backward pass should go through model 2.

This example shows how that can be done in Foolbox.
"""
import torchvision.models as models
import eagerpy as ep
from foolbox import PyTorchModel, accuracy, samples
from foolbox.attacks import LinfPGD
from foolbox.attacks.base import get_criterion
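# get_criterion wraps plain labels into a Misclassification criterion; the custom
# run() defined below uses it so that the labels are available to value_and_grad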


def main() -> None:
    # instantiate a model (could also be a TensorFlow or JAX model)
    model = models.resnet18(pretrained=True).eval()
    preprocessing = dict(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225], axis=-3)
    fmodel = PyTorchModel(model, bounds=(0, 1), preprocessing=preprocessing)
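    # the preprocessing dict applies the standard ImageNet normalization inside the
    # model wrapper (axis=-3 selects the channel axis), so the attack itself works
    # on raw images in the [0, 1] range given by bounds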

    # get data and test the model
    # wrapping the tensors with ep.astensors is optional, but it allows
    # us to work with EagerPy tensors in the following
    images, labels = ep.astensors(*samples(fmodel, dataset="imagenet", batchsize=16))
    clean_acc = accuracy(fmodel, images, labels)
    print(f"clean accuracy: {clean_acc * 100:.1f} %")
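    # samples() returns a small batch of example images that ship with Foolbox;
    # they are convenient for quick demos like this one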

    # replace the gradient with the gradient from another model
    model2 = fmodel  # demo, we just use the same model
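    # in a real substitute-model setting, model2 would wrap a different network,
    # for example (sketch only, resnet34 chosen arbitrarily):
    #   model2 = PyTorchModel(
    #       models.resnet34(pretrained=True).eval(), bounds=(0, 1), preprocessing=preprocessing
    #   )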

    # TODO: this is still a bit annoying because we need
    # to overwrite run to get the labels
    class Attack(LinfPGD):
        def value_and_grad(self, loss_fn, x):
            # forward pass: evaluate the loss with the attacked model (via loss_fn)
            val1 = loss_fn(x)
            # backward pass: take the gradient from model2 instead
            loss_fn2 = self.get_loss_fn(model2, self.labels)
            _, grad2 = ep.value_and_grad(loss_fn2, x)
            return val1, grad2

        def run(self, model, inputs, criterion, *, epsilon, **kwargs):
            # store the labels so that value_and_grad can build model2's loss function
            criterion_ = get_criterion(criterion)
            self.labels = criterion_.labels
            return super().run(model, inputs, criterion_, epsilon=epsilon, **kwargs)

    # apply the attack
    attack = Attack()
    epsilons = [
        0.0,
        0.0002,
        0.0005,
        0.0008,
        0.001,
        0.0015,
        0.002,
        0.003,
        0.01,
        0.1,
        0.3,
        0.5,
        1.0,
    ]
    raw_advs, clipped_advs, success = attack(fmodel, images, labels, epsilons=epsilons)
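    # raw_advs are the unclipped adversarial examples, clipped_advs are clipped to
    # the respective epsilon ball, and success is a boolean tensor of shape
    # (len(epsilons), len(images)) marking which attacks succeeded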

    # calculate and report the robust accuracy (the accuracy of the model when
    # it is attacked)
    robust_accuracy = 1 - success.float32().mean(axis=-1)
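    # averaging the success indicator over the batch axis gives the attack success
    # rate per epsilon; one minus that is the per-epsilon robust accuracy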
    print("robust accuracy for perturbations with")
    for eps, acc in zip(epsilons, robust_accuracy):
        print(f" Linf norm ≤ {eps:<6}: {acc.item() * 100:4.1f} %")

    # we can also manually check this
    # we will use the clipped advs instead of the raw advs, otherwise
    # we would need to check if the perturbation sizes are actually
    # within the specified epsilon bound
    print()
    print("we can also manually check this:")
    print()
    print("robust accuracy for perturbations with")
    for eps, advs_ in zip(epsilons, clipped_advs):
        acc2 = accuracy(fmodel, advs_, labels)
        print(f" Linf norm ≤ {eps:<6}: {acc2 * 100:4.1f} %")
        print(" perturbation sizes:")
        perturbation_sizes = (advs_ - images).norms.linf(axis=(1, 2, 3)).numpy()
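        # the L-infinity norm is the largest absolute per-pixel change of each image;
        # for the clipped advs it should not exceed the corresponding eps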
        print(" ", str(perturbation_sizes).replace("\n", "\n" + " "))
        if acc2 == 0:
            break


if __name__ == "__main__":
    main()