"""
Full-Batch L-BFGS Implementation with Wolfe Line Search
Demonstrates how to implement a simple full-batch L-BFGS with weak Wolfe line search
without Powell damping to train a simple convolutional neural network using the LBFGS
optimizer.
This implementation is CUDA-compatible.
Implemented by: Hao-Jun Michael Shi and Dheevatsa Mudigere
Last edited 10/20/20.
Requirements:
- Keras (for CIFAR-10 dataset)
- NumPy
- PyTorch
Run Command:
python full_batch_lbfgs_example.py
"""
import numpy as np
import torch
import torch.nn.functional as F
from torchvision.models.resnet import resnet18 as _resnet18
from einops import rearrange, repeat
from tensorflow.keras.datasets import cifar10  # to load the dataset
# FullBatchLBFGS and the helpers below ship with this repository's functions/ directory
from functions.utils import compute_stats, get_grad
from functions.LBFGS import FullBatchLBFGS
# Parameters for L-BFGS training
max_iter = 200      # note: each iteration is a full-batch step, NOT an epoch
ghost_batch = 512   # micro-batch size for accumulating the full-batch loss
lr = 1              # initial step size for the line search
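# A unit initial step is the standard choice for quasi-Newton methods; the
# Wolfe line search then shrinks or grows it as needed. "Ghost batches" split
# the full dataset into chunks so the full-batch loss and gradient can be
# accumulated without exhausting (GPU) memory; they do not change the computed
# values, only the peak memory use.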
# Load data
(X_train, y_train), (X_test, y_test) = cifar10.load_data()
X_train = X_train.astype('float32') / 255
X_test = X_test.astype('float32') / 255

# Convert images to channels-first (NCHW) layout; grayscale datasets such as
# MNIST (b h w) are tiled to 3 channels so they match the ResNet-18 stem
if len(X_train.shape) == 3:
    X_train = repeat(X_train, 'b h w -> b c h w', c=3)
    X_test = repeat(X_test, 'b h w -> b c h w', c=3)
else:
    X_train = rearrange(X_train, 'b h w c -> b c h w')
    X_test = rearrange(X_test, 'b h w c -> b c h w')
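# Sanity check (illustrative): after the conversion above, CIFAR-10 arrays
# should be channels-first float32 images in [0, 1]
assert X_train.shape[1:] == (3, 32, 32) and X_train.dtype == np.float32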
# Define network
def resnet18(pretrained=False, **kwargs):
    """ResNet-18 model.

    pretrained (bool): if True, load weights pretrained on ImageNet
    """
    model = _resnet18(pretrained=pretrained, **kwargs)
    return model
# Check CUDA availability
cuda = torch.cuda.is_available()

# Create neural network model (seeded for reproducibility); CIFAR-10 has 10
# classes, so size the classification head accordingly rather than keeping
# torchvision's 1000-way ImageNet default
if cuda:
    torch.cuda.manual_seed(2018)
    model = resnet18(num_classes=10).cuda()
else:
    torch.manual_seed(2018)
    model = resnet18(num_classes=10)
# Define helper functions

# Forward pass through the network given a NumPy input batch
if cuda:
    opfun = lambda X: model.forward(torch.from_numpy(X).cuda())
else:
    opfun = lambda X: model.forward(torch.from_numpy(X))

# Map network outputs to predicted class labels
if cuda:
    predsfun = lambda op: np.argmax(op.cpu().data.numpy(), 1)
else:
    predsfun = lambda op: np.argmax(op.data.numpy(), 1)

# Compute classification accuracy (%) from network outputs and true labels
accfun = lambda op, y: np.mean(np.equal(predsfun(op), y.squeeze())) * 100
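# Example usage of the helpers above (illustrative; not executed as part of
# training). Evaluating a small slice of the test set before training should
# give roughly chance-level accuracy (~10% for 10 classes):
#
#   with torch.no_grad():
#       print('Initial accuracy: %.2f%%'
#             % accfun(opfun(X_test[:ghost_batch]), y_test[:ghost_batch]))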
# Define optimizer
optimizer = FullBatchLBFGS(model.parameters(), lr=lr, history_size=10, line_search='Wolfe', debug=True)
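# history_size is the number of curvature pairs (s, y) kept for L-BFGS's
# two-loop recursion; 10 is a common default, trading a small amount of memory
# for a better Hessian approximation.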
# Main training loop
no_samples = X_train.shape[0]

# compute initial gradient and objective
grad, obj = get_grad(optimizer, X_train, y_train, opfun)

# main loop
for n_iter in range(max_iter):

    # training mode
    model.train()
    # define closure for line search
    def closure():

        optimizer.zero_grad()

        if cuda:
            loss_fn = torch.tensor(0, dtype=torch.float).cuda()
        else:
            loss_fn = torch.tensor(0, dtype=torch.float)

        # accumulate the full-batch loss over ghost batches, weighting each
        # chunk by its share of the dataset
        for subsmpl in np.array_split(np.arange(no_samples), max(int(no_samples / ghost_batch), 1)):

            ops = opfun(X_train[subsmpl])

            if cuda:
                tgts = torch.from_numpy(y_train[subsmpl]).cuda().long().squeeze()
            else:
                tgts = torch.from_numpy(y_train[subsmpl]).long().squeeze()

            loss_fn += F.cross_entropy(ops, tgts) * (len(subsmpl) / no_samples)

        return loss_fn
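    # Note: FullBatchLBFGS.step() may re-evaluate this closure at trial points
    # during the line search, so the closure must recompute the loss from
    # scratch on every call (which it does: it zeroes the gradients and
    # rebuilds loss_fn each time).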
    # perform line search step; step() returns the new loss, new gradient, and
    # the accepted step size, among other line-search diagnostics
    options = {'closure': closure, 'current_loss': obj}
    obj, grad, lr, _, _, _, _, _ = optimizer.step(options)
    # compute statistics
    model.eval()
    train_loss, test_loss, test_acc = compute_stats(X_train, y_train, X_test, y_test,
                                                    opfun, accfun, ghost_batch=128)

    # print data
    print('Iter:', n_iter + 1, 'lr:', lr, 'Training Loss:', train_loss,
          'Test Loss:', test_loss, 'Test Accuracy:', test_acc)