forked from kelvinxu/arctic-captions
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathoptimizers.py
120 lines (97 loc) · 5.28 KB
/
optimizers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import theano
import theano.tensor as tensor
import numpy
def itemlist(tparams):
    """Return the values of ``tparams`` as a list, in the mapping's own order.

    Parameters
    ----------
    tparams : mapping
        Mapping of parameter name -> value (the optimizers below pass a
        mapping of theano shared variables).

    Returns
    -------
    list
        The mapping's values, in iteration order.
    """
    # values() exists on both Python 2 and Python 3 mappings, whereas the
    # previous iteritems() call raised AttributeError on Python 3.
    return list(tparams.values())
"""
General Optimizer Structure: (adadelta, adam, rmsprop, sgd)
Parameters
----------
lr : theano shared variable
learning rate, currently only necessary for sgd
tparams : OrderedDict()
dictionary of shared variables {name: variable}
grads :
list of gradient expressions, in the same order as the values of tparams
inp :
inputs required to compute gradients
cost :
objective of optimization
hard_attn_up :
additional updates required for hard attention mechanism learning
Returns
-------
f_grad_shared : compute cost, update optimizer shared variables
f_update : update parameters
"""
# See "ADADELTA: An adaptive learning rate method", Matt Zeiler (2012) arXiv
# preprint http://arxiv.org/abs/1212.5701
def adadelta(lr, tparams, grads, inp, cost, hard_attn_up):
    """Build the two compiled theano functions that implement ADADELTA.

    Parameters
    ----------
    lr : theano variable
        Declared as the only input of ``f_update`` so that all optimizers
        share one calling convention; it does not appear in any update
        expression here (hence ``on_unused_input='ignore'``).
    tparams : mapping
        Parameter name -> theano shared variable.
    grads : sequence
        Gradient expressions, paired positionally with ``tparams`` values.
    inp : list
        Inputs of ``f_grad_shared``.
    cost :
        Expression returned by ``f_grad_shared``.
    hard_attn_up : list
        Extra updates concatenated onto the updates of ``f_grad_shared``.

    Returns
    -------
    f_grad_shared :
        Returns ``cost``; stores the gradients and refreshes the running
        average of squared gradients (decay 0.95).
    f_update :
        Applies the ADADELTA step to the parameters and refreshes the
        running average of squared steps (decay 0.95).
    """
    def _zeros_like_params(suffix):
        # One zero-valued shared buffer per parameter, named '<param>_<suffix>'.
        # items() (not the Python-2-only iteritems()) keeps this portable.
        return [theano.shared(p.get_value() * numpy.float32(0.),
                              name='%s_%s' % (k, suffix))
                for k, p in tparams.items()]

    zipped_grads = _zeros_like_params('grad')
    running_up2 = _zeros_like_params('rup2')
    running_grads2 = _zeros_like_params('rgrad2')

    # Stage 1: cache the gradients and update the running mean of g**2.
    zgup = list(zip(zipped_grads, grads))
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
             for rg2, g in zip(running_grads2, grads)]
    f_grad_shared = theano.function(inp, cost,
                                    updates=zgup + rg2up + hard_attn_up,
                                    profile=False)

    # Stage 2: step = -RMS(previous steps) / RMS(gradients) * gradient.
    updir = [-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg
             for zg, ru2, rg2 in zip(zipped_grads, running_up2, running_grads2)]
    ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2))
             for ru2, ud in zip(running_up2, updir)]
    param_up = [(p, p + ud) for p, ud in zip(tparams.values(), updir)]
    f_update = theano.function([lr], [], updates=ru2up + param_up,
                               on_unused_input='ignore', profile=False)

    return f_grad_shared, f_update
# See Lecture 6.5, Coursera: Neural Networks for Machine Learning (2012),
# Tieleman, T. and Hinton. G. for original methods
#
# This implementation (with Nesterov Momentum) is described well in:
# "Generating Sequences with Recurrent Neural Networks", Alex Graves, arxiv preprint
# http://arxiv.org/abs/1308.0850
def rmsprop(lr, tparams, grads, inp, cost, hard_attn_up):
    """Build the two compiled theano functions that implement RMSProp with momentum.

    Parameters
    ----------
    lr : theano variable
        Declared as the only input of ``f_update`` so that all optimizers
        share one calling convention; the step size used here is the
        hard-coded constant ``1e-4`` (hence ``on_unused_input='ignore'``).
    tparams : mapping
        Parameter name -> theano shared variable.
    grads : sequence
        Gradient expressions, paired positionally with ``tparams`` values.
    inp : list
        Inputs of ``f_grad_shared``.
    cost :
        Expression returned by ``f_grad_shared``.
    hard_attn_up : list
        Extra updates concatenated onto the updates of ``f_grad_shared``.

    Returns
    -------
    f_grad_shared :
        Returns ``cost``; stores the gradients and refreshes the running
        averages of the gradient and of the squared gradient (decay 0.95).
    f_update :
        Updates the momentum buffers and applies them to the parameters.
    """
    def _zeros_like_params(suffix):
        # One zero-valued shared buffer per parameter, named '<param>_<suffix>'.
        # items() (not the Python-2-only iteritems()) keeps this portable.
        return [theano.shared(p.get_value() * numpy.float32(0.),
                              name='%s_%s' % (k, suffix))
                for k, p in tparams.items()]

    zipped_grads = _zeros_like_params('grad')
    running_grads = _zeros_like_params('rgrad')
    running_grads2 = _zeros_like_params('rgrad2')

    # Stage 1: cache the gradients and update the running first/second moments.
    zgup = list(zip(zipped_grads, grads))
    rgup = [(rg, 0.95 * rg + 0.05 * g)
            for rg, g in zip(running_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
             for rg2, g in zip(running_grads2, grads)]
    f_grad_shared = theano.function(inp, cost,
                                    updates=zgup + rgup + rg2up + hard_attn_up,
                                    profile=False)

    # Stage 2: momentum (0.9) on a gradient scaled by the running standard
    # deviation estimate sqrt(E[g^2] - E[g]^2 + 1e-4).
    updir = _zeros_like_params('updir')
    updir_new = [(ud, 0.9 * ud - 1e-4 * zg / tensor.sqrt(rg2 - rg ** 2 + 1e-4))
                 for ud, zg, rg, rg2 in zip(updir, zipped_grads,
                                            running_grads, running_grads2)]
    # The parameters move by the *new* update direction, i.e. the expression
    # half of each (buffer, expression) pair.
    param_up = [(p, p + new_dir)
                for p, (_, new_dir) in zip(tparams.values(), updir_new)]
    f_update = theano.function([lr], [], updates=updir_new + param_up,
                               on_unused_input='ignore', profile=False)

    return f_grad_shared, f_update
# See "Adam: A Method for Stochastic Optimization" Kingma et al. (ICLR 2015)
# Theano implementation adapted from Soren Kaae Sonderby (https://github.com/skaae)
# preprint: http://arxiv.org/abs/1412.6980
def adam(lr, tparams, grads, inp, cost, hard_attn_up):
    """Build the two compiled theano functions that implement Adam.

    The moment estimates use the parameterisation of the first arXiv version
    of the Adam paper, where ``b1`` and ``b2`` weight the *new* gradient:
    ``m_t = b1 * g + (1 - b1) * m`` and ``v_t = b2 * g**2 + (1 - b2) * v``.

    Parameters
    ----------
    lr : theano variable
        Declared as the only input of ``f_update`` so that all optimizers
        share one calling convention; the step size used here is the
        hard-coded constant ``lr0`` (hence ``on_unused_input='ignore'``).
    tparams : mapping
        Parameter name -> theano shared variable.
    grads : sequence
        Gradient expressions, paired positionally with ``tparams`` values.
    inp : list
        Inputs of ``f_grad_shared``.
    cost :
        Expression returned by ``f_grad_shared``.
    hard_attn_up : list
        Extra updates concatenated onto the updates of ``f_grad_shared``.

    Returns
    -------
    f_grad_shared :
        Returns ``cost`` and stores the gradients in shared buffers.
    f_update :
        Updates the moment estimates, the parameters and the step counter.
    """
    # items() (not the Python-2-only iteritems()) keeps this portable.
    gshared = [theano.shared(p.get_value() * numpy.float32(0.),
                             name='%s_grad' % k)
               for k, p in tparams.items()]
    gsup = list(zip(gshared, grads))
    f_grad_shared = theano.function(inp, cost, updates=gsup + hard_attn_up)

    lr0 = 0.0002
    b1 = 0.1
    b2 = 0.001
    e = 1e-8

    updates = []

    # Step counter, incremented once per f_update call.
    i = theano.shared(numpy.float32(0.))
    i_t = i + 1.
    # Bias correction. Because b1/b2 weight the new gradient, the moment
    # estimates decay with (1 - b1) and (1 - b2), so those are the factors
    # raised to the step count. The earlier ``1 - b1**t`` form was ~1 from
    # the first step on, which effectively disabled the correction.
    fix1 = 1. - (1. - b1) ** i_t
    fix2 = 1. - (1. - b2) ** i_t
    lr_t = lr0 * (tensor.sqrt(fix2) / fix1)

    for p, g in zip(tparams.values(), gshared):
        m = theano.shared(p.get_value() * numpy.float32(0.))
        v = theano.shared(p.get_value() * numpy.float32(0.))
        m_t = (b1 * g) + ((1. - b1) * m)
        v_t = (b2 * tensor.sqr(g)) + ((1. - b2) * v)
        g_t = m_t / (tensor.sqrt(v_t) + e)
        p_t = p - (lr_t * g_t)
        updates.append((m, m_t))
        updates.append((v, v_t))
        updates.append((p, p_t))
    updates.append((i, i_t))

    f_update = theano.function([lr], [], updates=updates,
                               on_unused_input='ignore')

    return f_grad_shared, f_update
# Vanilla SGD
def sgd(lr, tparams, grads, inp, cost, hard_attn_up):
    """Build the two compiled theano functions that implement plain SGD.

    Parameters
    ----------
    lr : theano variable
        Learning rate; the only input of ``f_update``.
    tparams : mapping
        Parameter name -> theano shared variable.
    grads : sequence
        Gradient expressions, paired positionally with ``tparams`` values.
    inp : list
        Inputs of ``f_grad_shared``.
    cost :
        Expression returned by ``f_grad_shared``.
    hard_attn_up : list
        Extra updates concatenated onto the updates of ``f_grad_shared``.

    Returns
    -------
    f_grad_shared :
        Returns ``cost`` and stores the gradients in shared buffers.
    f_update :
        Applies ``p <- p - lr * g`` to every parameter using the stored
        gradients.
    """
    # items() (not the Python-2-only iteritems()) keeps this portable.
    gshared = [theano.shared(p.get_value() * numpy.float32(0.),
                             name='%s_grad' % k)
               for k, p in tparams.items()]
    gsup = list(zip(gshared, grads))
    f_grad_shared = theano.function(inp, cost, updates=gsup + hard_attn_up,
                                    profile=False)

    pup = [(p, p - lr * g) for p, g in zip(tparams.values(), gshared)]
    f_update = theano.function([lr], [], updates=pup, profile=False)

    return f_grad_shared, f_update