Merge pull request #83 from BerkeleyLab/converge

Feature: 1st converging cloud microphysics model
BerkeleyLab · Sep 12, 2023 · c51e218
2 parents d163d33 + d80416e
commit c51e218
Show file tree

Hide file tree

Showing 3 changed files with 23 additions and 16 deletions.
diff --git a/app/train-cloud-microphysics.f90 b/app/train-cloud-microphysics.f90
@@ -48,7 +48,7 @@ program train_cloud_microphysics
   type(string_t), allocatable :: lines(:)
   character(len=*), parameter :: plot_file_name = "cost.plt"
   character(len=:), allocatable :: base_name, stride_string, epochs_string, last_line
-  integer plot_unit, stride, starting_epoch, ending_epoch, num_epochs, last_epoch_in_file
+  integer plot_unit, stride, num_epochs, previous_epoch
   logical preexisting_plot_file
 
   call system_clock(t_start, clock_rate)
@@ -68,19 +68,17 @@ program train_cloud_microphysics
 
   if (.not. preexisting_plot_file) then
     write(plot_unit,*) "      Epoch   Cost (min)       Cost (max)       Cost (avg)"
-    starting_epoch = 1
+    previous_epoch = 0
   else
     plot_file = file_t(string_t(plot_file_name))
     lines = plot_file%lines()
     last_line = lines(size(lines))%string()
-    read(last_line,*) last_epoch_in_file
-    starting_epoch = last_epoch_in_file + 1
+    read(last_line,*) previous_epoch
   end if
 
-  ending_epoch = starting_epoch + num_epochs - 1
-
   call read_train_write
 
+  close(plot_unit)
   call system_clock(t_finish)
   print *,"System clock time: ", real(t_finish - t_start, real64)/real(clock_rate, real64)
   print *,new_line('a') // "______training_cloud_microhpysics done _______"
@@ -100,6 +98,7 @@ subroutine read_train_write
     integer, allocatable :: lbounds(:)
     integer t, b, t_end
     character(len=:), allocatable :: network_input, network_output, network_file
+    logical stop_requested
 
     network_input = base_name // "_input.nc"
     network_output = base_name // "_output.nc"
@@ -182,10 +181,9 @@ subroutine read_train_write
       type(bin_t), allocatable :: bins(:)
       type(input_output_pair_t), allocatable :: input_output_pairs(:)
       type(tensor_t), allocatable, dimension(:) :: inputs, outputs
-      real(rkind), parameter :: keep = 0.3
+      real(rkind), parameter :: keep = 0.01
       real(rkind), allocatable :: cost(:)
       real(rkind), allocatable :: harvest(:)
-      integer, parameter :: mini_batch_size=1
       integer i, batch, lon, lat, level, time, network_unit, io_status, final_step, epoch
 
       open(newunit=network_unit, file=network_file, form='formatted', status='old', iostat=io_status, action='read')
@@ -197,7 +195,7 @@ subroutine read_train_write
       else
         close(network_unit)
         print *,"Initializing a new network"
-        trainable_engine = new_engine(num_hidden_layers=12, nodes_per_hidden_layer=16, num_inputs=8, num_outputs=6, random=.true.)
+        trainable_engine = new_engine(num_hidden_layers=6, nodes_per_hidden_layer=16, num_inputs=8, num_outputs=6, random=.false.)
       end if
 
       print *,"Defining tensors from time steps 1 through", t_end, "with strides of", stride
@@ -232,13 +230,13 @@ subroutine read_train_write
       end associate
 
 
-      associate(num_pairs => size(input_output_pairs), n_bins => size(input_output_pairs)/10000)
+      associate(num_pairs => size(input_output_pairs), n_bins => 1) ! also tried n_bins => size(input_output_pairs)/10000
         bins = [(bin_t(num_items=num_pairs, num_bins=n_bins, bin_number=b), b = 1, n_bins)]
 
         print *,"Training network"
         print *, "       Epoch   Cost (min)       Cost (max)       Cost (avg)"
 
-        do epoch = starting_epoch, ending_epoch
+        do epoch = previous_epoch + 1, previous_epoch + num_epochs
 
           call shuffle(input_output_pairs) ! set up for stochastic gradient descent
           mini_batches = [(mini_batch_t(input_output_pairs(bins(b)%first():bins(b)%last())), b = 1, size(bins))]
@@ -249,17 +247,23 @@ subroutine read_train_write
           open(newunit=network_unit, file=network_file, form='formatted', status='unknown', iostat=io_status, action='write')
           associate(inference_engine => trainable_engine%to_inference_engine())
             associate(json_file => inference_engine%to_json())
-              print *,"Writing network to " // network_file
               call json_file%write_lines(string_t(network_file))
             end associate
           end associate
+
           close(network_unit)
 
+          inquire(file="stop-training", exist=stop_requested)
+
+          graceful_exit: &
+          if (stop_requested) then
+            print *,'Shutting down because a file named "stop-training" was found.'
+            return
+          end if graceful_exit
+
         end do
       end associate
 
-      close(plot_unit)
-
     end block train_network
 
   end subroutine read_train_write

diff --git a/example/read.f90 → example/print-network-properties.f90 b/example/read.f90 → example/print-network-properties.f90
@@ -25,7 +25,10 @@ program read_json
   inference_engine = inference_engine_t(file_t(input_file_name))
   print *, "number of inputs: ", inference_engine%num_inputs()
   print *, "number of outputs: ", inference_engine%num_outputs()
-  print *, "number of nodes per layer: ", inference_engine%nodes_per_layer()
+  associate(nodes => inference_engine%nodes_per_layer())
+    print *, "number of layers: ", size(nodes) 
+    print *, "number of nodes per layer: ", nodes 
+  end associate
   activation_name = inference_engine%activation_function_name()
   print *, "activation function: ", activation_name%string()
   print *, "using skip connections: ", merge("true ", "false", inference_engine%skip())

diff --git a/src/inference_engine/trainable_engine_s.f90 b/src/inference_engine/trainable_engine_s.f90
@@ -82,7 +82,7 @@
 
   module procedure train
     integer l, batch, mini_batch_size, pair
-    real(rkind), parameter :: eta = 1.5e0 ! Learning parameter
+    real(rkind), parameter :: eta = 3.e0 ! Learning parameter
     real(rkind), allocatable :: &
       z(:,:), a(:,:), delta(:,:), dcdw(:,:,:), dcdb(:,:), vdw(:,:,:), sdw(:,:,:), vdb(:,:), sdb(:,:), vdwc(:,:,:), sdwc(:,:,:), &
       vdbc(:,:), sdbc(:,:)