-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathimproved version.R
170 lines (128 loc) · 4.74 KB
/
improved version.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
# Test objective: a Rosenbrock-style function of theta = (z, x),
#   f(theta) = k*(z - x^2)^2 + (1 - x)^2 + 1,
# optionally carrying its analytic gradient as the "gradient" attribute.
c2 <- 0.9 # Wolfe curvature-condition constant used by the optimiser
rb <- function(theta, getg = FALSE, k = 10) {
  z <- theta[1]
  x <- theta[2]
  r <- z - x^2 # shared residual term
  val <- k * r^2 + (1 - x)^2 + 1
  if (getg) {
    # analytic gradient, ordered (d f / d z, d f / d x)
    attr(val, "gradient") <- c(2 * k * r, -4 * k * x * r - 2 * (1 - x))
  }
  val
}
# Gradient of objective `f` at `theta`.
# If f(theta, ...) carries a "gradient" attribute, that analytic gradient is
# returned; otherwise a forward finite-difference approximation is used.
# Fix vs original: f is evaluated only once (the original called it up to
# three times per invocation), the gradient vector is preallocated instead of
# grown with c(), and seq_along() replaces 1:length().
g <- function(theta, f, ...) {
  f0 <- f(theta, ...) # single evaluation reused by both branches
  grad <- attr(f0, "gradient")
  if (is.null(grad)) {
    eps <- 1e-7 # finite-difference interval
    grad <- numeric(length(theta)) # preallocate result vector
    for (i in seq_along(theta)) { # loop over parameters
      th <- theta
      th[i] <- th[i] + eps # perturb one coordinate by eps
      grad[i] <- (f(th, ...) - f0) / eps # forward-difference slope
    }
  }
  grad
}
# Line-search step adjustment (Wolfe-style).
# Given the objective value f0 at theta, the value f1 at theta + delta, and a
# proposed step delta, halve the step while it fails the decrease test (or f1
# is non-finite) and lengthen it while it fails the curvature condition
# grad_delta2 >= c2 * grad_delta1; `c2` is the module-level Wolfe constant.
# Returns the adjusted step delta.
# Fixes vs original: `maxit` was an undefined global (runtime error once the
# loop ran long) — it is now a parameter with a default; the directional
# slopes grad_delta1/grad_delta2 were never recomputed after delta changed,
# so every adjustment decision after the first used stale values; the
# invariant f0 <- f(theta, ...) was recomputed each iteration; and the
# function printed the iteration count instead of returning the step.
delta_compute <- function(f0, f1, f, delta, theta, ..., maxit = 100) {
  count <- 0
  grad_delta1 <- drop(crossprod(g(theta, f, ...), delta))
  grad_delta2 <- drop(crossprod(g(theta + delta, f, ...), delta))
  while (!is.finite(f1) || f1 > f0 || grad_delta2 < c2 * grad_delta1) {
    if (count > maxit) {
      warning("step adjustment failed to satisfy the Wolfe conditions within 'maxit' iterations")
      break
    } else if (!is.finite(f1) || f1 > f0) {
      delta <- delta / 2 # step too long: backtrack
    } else {
      delta <- 1.5 * delta # curvature condition failed: lengthen
    }
    # theta is fixed here, so f0 is invariant; only f1 and the slopes change
    f1 <- f(theta + delta, ...)
    grad_delta1 <- drop(crossprod(g(theta, f, ...), delta))
    grad_delta2 <- drop(crossprod(g(theta + delta, f, ...), delta))
    count <- count + 1
  }
  delta
}
# Optimiser: BFGS quasi-Newton minimisation of f starting from theta.
# Arguments:
#   theta  - initial parameter vector.
#   f      - objective; extra args are forwarded via `...`. Its gradient is
#            obtained through g() (analytic attribute or finite differences).
#   tol    - convergence tolerance on the gradient, scaled by |f| + fscale.
#   fscale - rough scale of f near the optimum, guards the test when f ~ 0.
#   maxit  - maximum number of outer iterations (also caps the line search).
# Returns a list: f (minimum value), theta (minimiser), iter (iterations),
# g (gradient at the minimum), H (finite-difference Hessian, symmetrised).
# NOTE(review): depends on the module-level Wolfe constant `c2`; scalar
# conditions use `&`/`|` where `&&`/`||` would be idiomatic; `print(count)`
# is debug output left in; the `grad`/`delta` updates at the loop tail are
# recomputed at the top of the next iteration, so that work is redundant.
bfgs <- function(theta, f, ..., tol, fscale, maxit){
param_num <- length(theta) ##length of vector theta
B <- I <- diag(param_num) ##initialize identity matrix as the initial value of the inverse of Hessian matrix
iter<-0
grad <- g(theta,f,...)
## outer loop: run until gradient is small relative to |f| + fscale, or maxit
while(iter <= maxit & max(abs(grad)) >= (abs(f(theta,...)) + fscale) * tol){
iter <- iter +1
count<-0
f0<-f(theta,...)
grad <- g(theta,f,...)
grad_index <- which(is.finite(grad)==FALSE)
##check if objective function is finite or not
if (is.finite(f0)==FALSE){
warning("Objective function is not finite at given initial theta value!")
break
}
##check if derivative is finite or not
#else if (g(theta,f,...) %in% c(-Inf,Inf) | is.na(g(theta,f,...)))
else if (length(grad_index)!=0){
warning("Derivative is not finite at given initial theta value!")
break
}
## quasi-Newton search direction: delta = -B %*% gradient
delta <- drop(- B %*% g(theta,f,...))
f1<-f(theta+delta,...)
## directional slopes at theta and theta + delta, for the Wolfe conditions
grad_delta1 <- drop(crossprod(g(theta,f,...),delta))
grad_delta2 <- drop(crossprod(g(theta + delta,f,...),delta))
## line search: halve the step while it fails the decrease test, lengthen
## it while it fails the curvature condition grad_delta2 >= c2*grad_delta1
## NOTE(review): if f1 is NaN the `f1 > f0` while-condition yields NA and
## errors — the inner `!is.finite(f1)` guard never gets a chance to run.
while( f1 > f0 | grad_delta2 < c2 * grad_delta1) {
count<- count +1
if(count > maxit){
warning("spend too much time")
break
}else if(!is.finite(f1) | f1 > f0){
delta <- delta / 2}else if(grad_delta2 < c2 * grad_delta1){
delta<-(1 + 0.2) * delta}else{
break}
f1 <- f(theta + delta,...)
grad_delta1 <- drop(crossprod(g(theta,f,...),delta))
grad_delta2 <- drop(crossprod(g(theta + delta,f,...),delta))
}
print(count)
s <- delta
theta_new <- theta + s
if (f(theta_new,...) > f(theta,...)){
warning('Steps failed to reduced objective values but convergence not occured!')
break
}
## BFGS inverse-Hessian update:
## B <- (I - rho*s*y^T) B (I - rho*y*s^T) + rho*s*s^T, rho = 1/(s^T y)
y <- g(theta_new,f,...) - g(theta,f,...)
inv_rho <- drop(crossprod(delta,y)) ##rho^(-1)=s^T*y
rho <- 1/inv_rho ##1/rho^(-1)
s_yt <- tcrossprod(s,y) ##s*y^T
y_st <- tcrossprod(y,s) ##y*s^T
B <- B-rho*B%*%y_st -rho*s_yt%*%B+rho^2*s_yt%*%B%*%y_st+ rho * tcrossprod(s) ##update B
theta <- theta_new
grad <- g(theta,f,...)
delta <- drop(- B %*% g(theta,f,...))
}
if (max(abs(grad)) >= (abs(f(theta,...)) + fscale) * tol){
warning('Convergence not reached until maximum iteration!')
}
## finite-difference Hessian of f at the final theta (row i = d grad / d theta_i)
H <- matrix(0, nrow = param_num, ncol = param_num) ##initialize Hessian matrix
g0 <- g(theta,f,...) ##gradient value at the minimum theta value
eps <- 1e-7 ## finite difference interval
for (i in 1:param_num) { ## loop over parameters
th <- theta; th[i] <- th[i] + eps ##increase theta by eps
g1 <- g(th,f,...) ##gradient value at theta+eps
H[i,] <- (g1 - g0)/eps ## approximate second derivatives in Hessian matrix
}
##convert asymmetric Hessian matrix into symmetric Hessian matrix
if (isSymmetric(H)==FALSE){
H <- 0.5 * (t(H) + H)
}
f_optimum <- f(theta,...) ##scalar value of objective function at the minimum
list(f = f_optimum, theta = theta, iter = iter, g = g0, H = H)
}
# Demo run: minimise the test objective rb starting from theta = (-1, 2);
# getg = FALSE and k = 10 are forwarded to rb via `...`. The returned list
# (f, theta, iter, g, H) auto-prints at top level.
bfgs(theta=c(-1,2),f=rb,getg=FALSE,k=10,tol=1e-5,fscale=1,maxit=100)