diff --git a/src/inference_engine/trainable_engine_s.f90 b/src/inference_engine/trainable_engine_s.f90
index 78888f82c..00c2b3d71 100644
--- a/src/inference_engine/trainable_engine_s.f90
+++ b/src/inference_engine/trainable_engine_s.f90
@@ -133,14 +133,12 @@ pure function outer_product(u, v) result(u_v_T)
     allocate(delta, mold=b)
     allocate(dcdb, mold=b) ! Gradient of cost function with respect with biases
 
-    w = 0.e0 ! Initialize weights
-    b = 0.e0 ! Initialize biases
-
+    w = 0.; b = 0.e0 ! Initialize weights and biases
+
+    iterate_across_batches: &
     do iter = 1, size(mini_batches)
-      cost = 0.e0
-      dcdw = 0.e0
-      dcdb = 0.e0
+      cost = 0.; dcdw = 0.; dcdb = 0.
 
       associate(input_output_pairs => mini_batches(iter)%input_output_pairs())
         inputs = input_output_pairs%inputs()
@@ -148,49 +146,47 @@ pure function outer_product(u, v) result(u_v_T)
         mini_batch_size = size(input_output_pairs)
       end associate
 
+      iterate_through_batch: &
       do pair = 1, mini_batch_size
 
-        ! Create an AND gate
         a(1:num_inputs,0) = inputs(pair)%values()
         y = expected_outputs(pair)%outputs()
 
-        ! Feedforward
+        feed_forward: &
        do l = 1,output_layer
          z(1:n(l),l) = matmul(w(1:n(l),1:n(l-1),l), a(1:n(l-1),l-1)) + b(1:n(l),l)
          a(1:n(l),l) = self%differentiable_activation_strategy_%activation(z(1:n(l),l))
-        end do
+        end do feed_forward
 
-        cost = cost + sum((y(1:n(output_layer))-a(1:n(output_layer),output_layer))**2)
+        cost = cost + sum((y(1:n(output_layer))-a(1:n(output_layer),output_layer))**2)/(2.e0*mini_batch_size)
 
         delta(1:n(output_layer),output_layer) = &
          (a(1:n(output_layer),output_layer) - y(1:n(output_layer))) &
          * self%differentiable_activation_strategy_%activation_derivative(z(1:n(output_layer),output_layer))
-
-        ! Backpropagate the error
+
+        back_propagate_error: &
         do l = n_hidden,1,-1
           delta(1:n(l),l) = matmul(transpose(w(1:n(l+1),1:n(l),l+1)), delta(1:n(l+1),l+1))
           delta(1:n(l),l) = delta(1:n(l),l) * self%differentiable_activation_strategy_%activation_derivative(z(1:n(l),l))
-        end do
+        end do back_propagate_error
 
-        ! Sum up gradients in the inner iteration
+        sum_gradients: &
         do l = 1,output_layer
           dcdb(1:n(l),l) = dcdb(1:n(l),l) + delta(1:n(l),l)
           do concurrent(j = 1:n(l))
             dcdw(j,1:n(l-1),l) = dcdw(j,1:n(l-1),l) + a(1:n(l-1),l-1)*delta(j,l)
           end do
-        end do
-      end do
+        end do sum_gradients
+      end do iterate_through_batch
 
-      cost = cost/(2.e0*mini_batch_size)
-
+      adjust_weights_and_biases: &
       do l = 1,output_layer
        dcdb(1:n(l),l) = dcdb(1:n(l),l)/mini_batch_size
        b(1:n(l),l) = b(1:n(l),l) - eta*dcdb(1:n(l),l) ! Adjust biases
        dcdw(1:n(l),1:n(l-1),l) = dcdw(1:n(l),1:n(l-1),l)/mini_batch_size
        w(1:n(l),1:n(l-1),l) = w(1:n(l),1:n(l-1),l) - eta*dcdw(1:n(l),1:n(l-1),l) ! Adjust weights
-      end do
-
-    end do
+      end do adjust_weights_and_biases
+    end do iterate_across_batches
 
     end associate
   end associate
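
Note on the change (not part of the patch): the normalization previously applied after the pair loop, `cost = cost/(2.e0*mini_batch_size)`, is now folded into the per-pair accumulation; because the divisor is constant over the batch, the two forms are algebraically equivalent. The named constructs (feed_forward, back_propagate_error, sum_gradients, adjust_weights_and_biases) replace the deleted comments. The sketch below is illustrative only: it assumes a single dense layer with made-up sizes, gradients, and learning rate, and shows the averaged-gradient descent step that adjust_weights_and_biases performs. The names w, b, dcdw, dcdb, eta, and mini_batch_size mirror the diff; everything else is hypothetical.

program gradient_step_sketch
  ! Minimal sketch of the mini-batch gradient-descent update p := p - eta * dC/dp
  ! for one hypothetical dense layer; not taken from trainable_engine_s.f90.
  implicit none
  integer, parameter :: neurons = 3, num_inputs = 2   ! made-up layer shape
  integer, parameter :: mini_batch_size = 4           ! made-up batch size
  real, parameter :: eta = 1.5e0                      ! made-up learning rate
  real :: w(neurons, num_inputs), b(neurons)          ! weights and biases
  real :: dcdw(neurons, num_inputs), dcdb(neurons)    ! gradients summed over the batch

  w = 0.; b = 0.        ! initialize parameters, as the patched code does
  dcdw = 1.; dcdb = 1.  ! stand-in values for the accumulated gradients

  ! Average the summed gradients over the mini-batch, then step downhill,
  ! mirroring the adjust_weights_and_biases construct in the diff.
  dcdb = dcdb/mini_batch_size
  b = b - eta*dcdb
  dcdw = dcdw/mini_batch_size
  w = w - eta*dcdw

  print *, "updated biases: ", b
  print *, "updated weights:", w
end program gradient_step_sketch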