53 save model state customization #100

Merged
merged 20 commits on Aug 5, 2024

Commits (20)
b1d2e25
Fixed some imports, corrected some typos, and added some comments.
Jul 25, 2024
5345109
Moved savestate parameter into config and renamed to checkpoint_period.
Jul 30, 2024
a1cc3ba
Ignoring temporary test folders.
Jul 30, 2024
d3da61c
Storing config to both modelId.yaml and modelId_step.yaml
Jul 30, 2024
867c615
Added script to generate a version ID file.
Jul 30, 2024
2147dfa
Added code version file when saving model.
Jul 31, 2024
61e5ef2
Moved restart parameter from model initialization to run method.
Jul 31, 2024
4b74d01
Storing step count with code version on save.
Jul 31, 2024
43504f0
Added option to save model at predefined milestone steps.
Jul 31, 2024
4a2e338
Fixed test_model_init_savestate test to also check the existence of t…
Jul 31, 2024
370bf8f
Added test for saving milestone.
Jul 31, 2024
9bbaf3d
Added option to restart from milestone.
Jul 31, 2024
3add841
Added test for restarting from milestone.
Jul 31, 2024
994192c
Merge branch 'development' into 53_save_model_state_customization
vmgaribay Aug 1, 2024
60a84a6
Merge branch '53_save_model_state_customization' into merging_data_co…
Aug 1, 2024
458b24b
Moved test_data_collection_period and test_data_collection_list into …
Aug 1, 2024
c212708
Merge branch 'development' into 53_save_model_state_customization
Aug 5, 2024
11355ff
Merge branch 'development' into 53_save_model_state_customization
vanlankveldthijs Aug 5, 2024
a2fbd61
Merge branch 'development' into merging_data_collection_and_save_mode…
Aug 5, 2024
d7a7df3
Merge branch '53_save_model_state_customization' into merging_data_co…
Aug 5, 2024
4 changes: 4 additions & 0 deletions dgl_ptm/.gitignore
@@ -29,3 +29,7 @@ env
env3
venv
venv3

# Testing
my_model/
test_model/
2 changes: 2 additions & 0 deletions dgl_ptm/dgl_ptm/config.py
@@ -213,6 +213,8 @@ class Config(BaseModel):
model_graph: object = None # TODO: might be possible to move it from config to model
step_count: int = 0
step_target: PositiveInt = 5
checkpoint_period: int = 10
milestones: Optional[List[PositiveInt]] = None
Comment on lines +216 to +217
Member: Can we add documentation/definitions for these two parameters somewhere?
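One possible shape for that documentation, using pydantic Field descriptions (a sketch only; the wording is inferred from the run() logic later in this diff, not from existing docs):

from typing import List, Optional
from pydantic import BaseModel, Field, PositiveInt

class Config(BaseModel):
    checkpoint_period: int = Field(
        default=10,
        description='Save a restartable checkpoint every N steps; '
                    'values <= 0 disable checkpointing.')
    milestones: Optional[List[PositiveInt]] = Field(
        default=None,
        description='Steps at which the model state is saved permanently '
                    'to its own milestone_<step> folder.')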

steering_parameters: SteeringParams = SteeringParams()
alpha_dist: AlphaDist = AlphaDist()
capital_dist: CapitalDist = CapitalDist()
1 change: 1 addition & 0 deletions dgl_ptm/dgl_ptm/config.yaml
@@ -8,6 +8,7 @@ initial_graph_args:
new_node_edges: 1
step_count: 0
step_target: 5
checkpoint_period: 10
cost_vals:
- 0.0
- 0.45
115 changes: 85 additions & 30 deletions dgl_ptm/dgl_ptm/model/initialize_model.py
@@ -1,16 +1,15 @@
import copy
from pathlib import Path
import dgl
import torch
import pickle
import logging

from pathlib import Path
from dgl.data.utils import save_graphs, load_graphs

from dgl_ptm.network.network_creation import network_creation
from dgl_ptm.model.step import ptm_step
from dgl_ptm.agentInteraction.weight_update import weight_update
from dgl_ptm.model.data_collection import data_collection
from dgl.data.utils import save_graphs, load_graphs
from dgl_ptm.config import Config, CONFIG
from dgl_ptm.util.network_metrics import average_degree

@@ -57,7 +56,7 @@ def sample_distribution_tensor(type, distParameters, nSamples, round=False, deci
uniform_samples = torch.rand(size)
sample_ppf = torch.sqrt(torch.tensor(2.0)) * torch.erfinv(2 *(cdf_min + (cdf_max - cdf_min) * uniform_samples) - 1)

dist = destParameters[0] + destParameters[1] * sample_ppf
dist = distParameters[0] + distParameters[1] * sample_ppf

else:
raise NotImplementedError('Currently only uniform, normal, multinomial, and bernoulli distributions are supported')
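The corrected line is the location-scale step of inverse-CDF sampling from a truncated normal. A standalone illustration (assumed scalar parameters and bounds; cdf_min and cdf_max computed from the standard normal CDF, matching the formula above):

import torch

mean, std = 2.0, 0.5
low, high = 1.0, 3.0  # hypothetical truncation bounds

def norm_cdf(x):  # standard normal CDF
    return 0.5 * (1 + torch.erf(torch.tensor(x) / torch.sqrt(torch.tensor(2.0))))

cdf_min = norm_cdf((low - mean) / std)
cdf_max = norm_cdf((high - mean) / std)

uniform_samples = torch.rand(10_000)
sample_ppf = torch.sqrt(torch.tensor(2.0)) * torch.erfinv(
    2 * (cdf_min + (cdf_max - cdf_min) * uniform_samples) - 1)

# The fixed line: distParameters[0] + distParameters[1] * sample_ppf
dist = mean + std * sample_ppf
assert low <= dist.min() and dist.max() <= high  # samples stay within bounds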
@@ -135,21 +134,16 @@ class PovertyTrapModel(Model):

"""

def __init__(self,*, model_identifier, restart=False, savestate=10):
def __init__(self,*, model_identifier):
"""
restore from a savestate or create a PVT model instance.
restore from a checkpoint or create a PVT model instance.
Checks whether a model identifier has been specified.

param: model_identifier: str, required. Identifier for the model. Used to save and load model states.
param: restart: boolean, optional. If True, the model is run from last
saved step. Default False.
param: savestate: int, optional. If provided, the model state is saved
on this frequency. Default is 10 i.e. every 10th time step.

"""

super().__init__(model_identifier = model_identifier)
self.restart = restart
self.savestate = savestate

# default values
self.device = CONFIG.device
@@ -169,8 +163,14 @@ def __init__(self,*, model_identifier, restart=False, savestate=10):
self.model_graph = CONFIG.model_graph
self.step_count = CONFIG.step_count
self.step_target = CONFIG.step_target
self.checkpoint_period = CONFIG.checkpoint_period
self.milestones = CONFIG.milestones
self.steering_parameters = CONFIG.steering_parameters

# Code version.
self.version = Path('version.md').read_text().splitlines()[0]
Comment on lines +170 to +171
Member: I like the idea of saving the software version along with the model output, but how practical is it to use the git commit hash? It is not clear how this works in practice. For example, who runs the regen_version.sh file to store the commit hash, and how often is it run? Do the users use git to run experiments? Instead, we could use the software version specified in pyproject.toml as version = "0.1.0".

Author: Based on our discussion, I will add some guidance for this in the CONTRIBUTING.md file.

In short, the version is stored in a tracked file. This makes it independent of the repository: the version file can be shipped with the built code (outside the repo) and the code would still work. However, tracking the file means it cannot use the 'current commit hash', because committing this version file (with the current hash) would create a newer hash.
Instead, we should run the regen_version.sh script before any release (or other 'important code version'), for example as part of the last commit before tagging a release. It may be possible to do this with GitHub Actions, or we may have to do it manually, in which case the procedure should be made very clear in CONTRIBUTING.md.

I understand this may not be the ideal approach, but I could not find a better one.

Member: I also found that the relative path of Path('version.md') causes problems. Can you please make it absolute?

Author: I changed "code version" to "process version" to reflect that this captures other aspects of the process as well.

Tbh, I don't really know how best to get the path of the package root (as opposed to the model save directory). Will discuss.

Member (quoting the above): parent_path = Path(__file__).resolve().parents; parents is a sequence, see the pathlib docs.
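Following that suggestion, a minimal sketch of resolving version.md relative to the package instead of the working directory (assuming the layout in this PR, where version.md sits two levels above initialize_model.py):

from pathlib import Path

def _read_code_version():
    # parents[0] is model/, parents[1] is the inner dgl_ptm package,
    # parents[2] is the directory holding version.md (assumed layout).
    version_file = Path(__file__).resolve().parents[2] / 'version.md'
    return version_file.read_text().splitlines()[0]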



def set_model_parameters(self, *, parameterFilePath=None, **kwargs):
"""
Load or set model parameters
@@ -220,10 +220,12 @@ def set_model_parameters(self, *, parameterFilePath=None, **kwargs):
self.steering_parameters['npath'] = str(parent_dir / Path(cfg.steering_parameters.npath))
self.steering_parameters['epath'] = str(parent_dir / Path(cfg.steering_parameters.epath))

# save updated config to yaml file
# save updated config to yaml files
cfg_filename = parent_dir / f'{self._model_identifier}.yaml'
cfg.to_yaml(cfg_filename)
logger.warning(f'The model parameters are saved to {cfg_filename}.')
cfg_filename_step = parent_dir / f'{self._model_identifier}_{self.step_count}.yaml'
cfg.to_yaml(cfg_filename_step)
logger.warning(f'The model parameters are saved to {cfg_filename} and {cfg_filename_step}.')
Comment on lines +223 to +228
Member: It is not clear why the configuration (cfg) is stored twice, i.e. to both cfg_filename and cfg_filename_step. The function set_model_parameters is usually run once at the beginning, when step_count is 0. Also, no tests are added in test_model.py::test_set_model_parameters to verify these changes.

Author: Based on our discussion, I will

  • remove the part where it stores to cfg_filename, only storing to cfg_filename_step.
  • when continuing from a stored model, find the config file with the highest step count that does not exceed the step where the process will continue (sketched below).
  • add tests for all of this functionality.

Author: I will also see if I can migrate the config parameters into a member of the PovertyTrapModel, as opposed to copying the values of the config.
Copying the values means we cannot just save the model parameters at will, because we might have lost some (non-copied) parameters. Migrating to a Config member of PovertyTrapModel would mean we can save the complete config at will (and I can move saving the config into a separate method).

Author: As of commit bfbb990 these features have been implemented, together with some tests.

Author: TODO: add 'make unique' functionality for restarting multiple runs from the same step.
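A possible shape for the config lookup from the second bullet above (a hypothetical helper, not merged code; the modelId_step.yaml naming follows this PR's save logic):

import re
from pathlib import Path

# Hypothetical helper: return the saved config with the highest step count
# that does not exceed the step from which the run will continue.
def _find_restart_config(parent_dir, model_id, restart_step):
    best, best_step = None, -1
    for cfg in Path(parent_dir).glob(f'{model_id}_*.yaml'):
        match = re.fullmatch(rf'{re.escape(model_id)}_(\d+)\.yaml', cfg.name)
        if match:
            step = int(match.group(1))
            if best_step < step <= restart_step:
                best, best_step = cfg, step
    return best  # None when no suitable config exists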


def initialize_model(self):
"""
@@ -392,11 +394,27 @@ def step(self):
#TODO add model dump here. Also check against previous save to avoid overwriting
raise RuntimeError(f'execution of step failed for step {self.step_count}')

def run(self):
""" run the model for each step until the step_target is reached."""
def run(self, restart=False):
"""
run the model for each step until the step_target is reached.

param: restart: boolean or int or a pair of ints, optional.
If True, the model is run from the last checkpoint,
Member: We should be able to identify the "last checkpoint" by a "milestone" and "step" as (milestone, step), correct? This can be merged with the third case to simplify the code.

Author: Based on our discussion, there was some confusion here. The tuple case would be (step, instance), not (milestone, step). I'll try to clarify this in the docstring.

if an int, the model is run from the first milestone at that step,
if a pair of ints, the model is run from that milestone at that step.
Comment on lines +403 to +404
Member: It seems that the first case is a special instance of the second case, where the first item is 1, i.e. restart = (1, int). Can these two cases be merged to simplify the code?

Author: Based on our discussion, I will merge the int and tuple cases. The argument can then be a boolean (for running from a checkpoint) or a tuple (for running from a milestone).

Author (vanlankveldthijs, Aug 8, 2024): Thinking about it again, it is slightly unclear what to do with the second value of the tuple. We could allow setting it to None, but then the desired milestone to load might be either

  • milestone 0 at the specified step count, or
  • the most recent milestone at the specified step count. I'm leaning towards this option.

Author: To add some context: the use case where the milestone with step and increment matters is comparing different policy choices that run until the same step.
Imagine we have some starting state (say at step 500) and we want to compare policy choices A, B, and C after applying each for the same amount of time (say 500 steps). We would restart the model 3 times from a milestone at step 500 (each with different config parameters representing the different policies), run until step 1000 each time, and store the final states. In this case, we must be able to store multiple milestones at the same time step (1000).

Author: TODO: add a logging message stating the milestone that will be used.
TODO: only keep the tuple case; remove the int case.

Default False.
"""

self.inputs = None
if isinstance(restart, bool):
if restart:
self.inputs = _load_model(f'./{self._model_identifier}')
elif isinstance(restart, int):
self.inputs = _load_model(f'./{self._model_identifier}/milestone_{restart}')
elif isinstance(restart, tuple):
self.inputs = _load_model(f'./{self._model_identifier}/milestone_{restart[0]}_{restart[1]}')
Member: I couldn't find where in the code the model state is saved in the directory f'./{self._model_identifier}/milestone_{restart[0]}_{restart[1]}'. This is also not tested.

Author: Based on our discussion, I will

  • add an mkdir command to _save_model so we have control over where any (milestone) state save creates a new directory.
  • add a test for the milestone_X_Y case.

Author: As of commit e98f547 this has been implemented.
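For illustration, the three forms of restart dispatched above (the paths follow from the code; the step and instance values are hypothetical):

model.run()                  # fresh run, nothing loaded
model.run(restart=True)      # resume from the checkpoint in ./<model_id>/
model.run(restart=500)       # resume from ./<model_id>/milestone_500
model.run(restart=(500, 1))  # resume from ./<model_id>/milestone_500_1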


if self.restart:
self.inputs = _load_model(f'./{self._model_identifier}')
if self.inputs:
self.model_graph = copy.deepcopy(self.inputs["model_graph"])
#self.model_data = self.inputs["model_data"]
self.generator_state = self.inputs["generator_state"]
@@ -407,23 +425,42 @@ def run(self):
while self.step_count < self.step_target:
self.step()

# save the model state every step reported by savestate
if self.savestate and self.step_count % self.savestate == 0:
# save the model state every checkpoint_period steps and at specific milestones.
# checkpoint saves overwrite the previous checkpoint; milestones get unique folders.
save_checkpoint = 0 < self.checkpoint_period and self.step_count % self.checkpoint_period == 0
save_milestone = self.milestones and self.step_count in self.milestones
if save_checkpoint or save_milestone:
self.inputs = {
'model_graph': copy.deepcopy(self.model_graph),
#'model_data': copy.deepcopy(self.model_data),
'generator_state': generator.get_state(),
'step_count': self.step_count
'step_count': self.step_count,
'code_version': self.version
}
_save_model(f'./{self._model_identifier}', self.inputs)

# Note that a single step could be both a checkpoint and a milestone.
# The checkpoint could be necessary to restore a crashed process while
# the milestone is required output.
if save_checkpoint:
_save_model(f'./{self._model_identifier}', self.inputs)
if save_milestone:
milestone_path = _make_path_unique(f'./{self._model_identifier}/milestone_{self.step_count}')
_save_model(milestone_path, self.inputs)
Member: I couldn't find in the code where this directory milestone_path is actually created, nor how permission issues are handled.

Author: As stated above, I will add an mkdir command to _save_model so we have control over where any (milestone) state save creates a new directory.
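A minimal sketch of that proposed change (an assumption based on this thread; not code in this diff):

from pathlib import Path

def _save_model(path, inputs):
    # Proposed addition: create the target directory so milestone saves
    # control their own folder creation.
    Path(path).mkdir(parents=True, exist_ok=True)
    ...  # then save the graph, generator state, and version as below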


def _make_path_unique(path):
    if Path(path).exists():
        incr = 1
        while Path(f'{path}_{incr}').exists():
            incr += 1
        path = f'{path}_{incr}'
    return path
Comment on lines +450 to +456
Member: Adding a number to the path is not informative. I suggest generating unique directories using a timestamp:

from datetime import datetime
now = datetime.now()
timestamp = now.strftime("%Y%m%d_%H%M%S")
unique_path = f"{path}_{timestamp}"

Author (vanlankveldthijs, Aug 8, 2024): While this would make it more straightforward to make a unique path, I think it would also make it harder to identify the milestone when trying to load it. For example, if you're trying to continue from 'the second milestone', you would have to know (or look up) which timestamp that milestone has.

This might still be the best approach, or we could combine the concepts by storing milestones as {path}_{step}_{timestamp}.

Note that with any method for creating unique paths, we can always get the time at which a path was created from the OS.

Author: TODO: keep instance numbering instead of the timestamp.
TODO: add a docstring to _make_path_unique.
TODO: rename incr to instance.

Author: TODO: add a (str) description field to the config file. Not to be used by the program, but for the researcher to describe the run or policy being tested. Add a comment describing this value.
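To illustrate the instance numbering under discussion (hypothetical paths):

# Assuming ./my_model/milestone_500 already exists from an earlier save:
_make_path_unique('./my_model/milestone_500')  # -> './my_model/milestone_500_1'
# A further save at the same step would yield './my_model/milestone_500_2'.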


def _save_model(path, inputs):
""" save the model_graph, generator_state and model_data in files."""
""" save the model_graph, generator_state and code_version in files."""

# save the model_graph with a label
graph_label = {'step_count': torch.tensor([inputs["step_count"]])}
save_graphs(str(Path(path) / "model_graphs.bin"), inputs["model_graph"], graph_label)
graph_labels = {'step_count': torch.tensor([inputs["step_count"]])}
save_graphs(str(Path(path) / "model_graph.bin"), inputs["model_graph"], graph_labels)

# save the generator_state
with open(Path(path) / "generator_state.bin", 'wb') as file:
@@ -433,16 +470,20 @@ def _save_model(path, inputs):
#with open(Path(path) / "model_data.bin", 'wb') as file:
# pickle.dump([inputs["model_data"], inputs["step_count"]], file)

# save the code version
with open(Path(path) / "version.md", 'w') as file:
file.writelines([inputs["code_version"] + '\n', f'step={inputs["step_count"]}\n'])


def _load_model(path):
# Load model graphs
path_model_graph = Path(path) / "model_graphs.bin"
# Load model graph
path_model_graph = Path(path) / "model_graph.bin"
if not path_model_graph.is_file():
raise ValueError(f'The path {path_model_graph} is not a file.')

graph, graph_lebel = load_graphs(str(path_model_graph))
graph, graph_labels = load_graphs(str(path_model_graph))
graph = graph[0]
graph_step = graph_lebel['step_count'].tolist()[0]
graph_step = graph_labels['step_count'].tolist()[0]

# Load generator_state
path_generator_state = Path(path) / "generator_state.bin"
@@ -460,10 +501,23 @@ def _load_model(path):
#with open(path_model_data, 'rb') as file:
# data, data_step = pickle.load(file)

# Load code version
path_code_version = Path(path) / "version.md"
if not path_code_version.is_file():
raise ValueError(f'The path {path_code_version} is not a file.')

with open(path_code_version, 'r') as file:
code_version = file.read().splitlines()[0] # splitlines drops the trailing newline written by _save_model, so the version check below compares cleanly

# Check if graph_step, generator_step and data_step are the same
if graph_step != generator_step: #or graph_step != data_step:
msg = 'The step count in the model_graph and generator_state are not the same.'# and model_data are not the same.'
raise ValueError(msg)

# Check if the saved version and current code version are the same
version = Path('version.md').read_text().splitlines()[0]
if code_version != version:
logger.warning(f'Warning: loading model generated using earlier code version: {code_version}.')

# Show which step is loaded
logger.warning(f'Loading model state from step {generator_step}.')
@@ -472,6 +526,7 @@ def _load_model(path):
'model_graph': graph,
#'model_data': data,
'generator_state': generator,
'step_count': generator_step
'step_count': generator_step,
'code_version': code_version
}
return inputs
3 changes: 3 additions & 0 deletions dgl_ptm/regen_version.sh
@@ -0,0 +1,3 @@
#!/bin/bash
Member: Can we add some documentation in this file, or in an md file in a docs folder?

Author: The current md file in the docs folder is about running on Snellius. I think it's better to add some documentation to the CONTRIBUTING.md file.

git rev-parse HEAD > version.md

100 changes: 63 additions & 37 deletions dgl_ptm/tests/test_model.py
@@ -47,36 +47,6 @@ def test_ptm_step_timestep1(self, model):
model.step() # timestep 1
assert Path('my_model/edge_data/1.zarr').exists()


class TestDataCollection:
def test_data_collection(self, model):
data_collection(model.model_graph, timestep=0, npath = model.steering_parameters['npath'],
epath = model.steering_parameters['epath'], ndata = model.steering_parameters['ndata'],
edata = model.steering_parameters['edata'], format = model.steering_parameters['format'],
mode = model.steering_parameters['mode'])

assert Path('my_model/agent_data.zarr').exists()
assert Path('my_model/edge_data/0.zarr').exists()

def test_data_collection_timestep1(self, model):
model.step() # timestep 0
data_collection(model.model_graph, timestep=1, npath = model.steering_parameters['npath'],
epath = model.steering_parameters['epath'], ndata = model.steering_parameters['ndata'],
edata = model.steering_parameters['edata'], format = model.steering_parameters['format'],
mode = model.steering_parameters['mode'])

assert Path('my_model/agent_data.zarr').exists()
assert Path('my_model/edge_data/0.zarr').exists()
assert Path('my_model/edge_data/1.zarr').exists()

# check if dimension 'n_time' exists in agent_data.zarr
agent_data = xr.open_zarr('my_model/agent_data.zarr')
assert 'n_time' in agent_data.dims

# check variable names in edge_data/1.zarr
edge_data = xr.open_zarr('my_model/edge_data/1.zarr')
assert 'weight' in edge_data.variables

def test_data_collection_period(self, model):
if Path('my_model/edge_data/').exists():
shutil.rmtree('my_model/edge_data/')
@@ -135,6 +105,36 @@ def test_data_collection_period_and_list(self, model):
assert Path('my_model/edge_data/9.zarr').exists()


class TestDataCollection:
def test_data_collection(self, model):
data_collection(model.model_graph, timestep=0, npath = model.steering_parameters['npath'],
epath = model.steering_parameters['epath'], ndata = model.steering_parameters['ndata'],
edata = model.steering_parameters['edata'], format = model.steering_parameters['format'],
mode = model.steering_parameters['mode'])

assert Path('my_model/agent_data.zarr').exists()
assert Path('my_model/edge_data/0.zarr').exists()

def test_data_collection_timestep1(self, model):
model.step() # timestep 0
data_collection(model.model_graph, timestep=1, npath = model.steering_parameters['npath'],
epath = model.steering_parameters['epath'], ndata = model.steering_parameters['ndata'],
edata = model.steering_parameters['edata'], format = model.steering_parameters['format'],
mode = model.steering_parameters['mode'])

assert Path('my_model/agent_data.zarr').exists()
assert Path('my_model/edge_data/0.zarr').exists()
assert Path('my_model/edge_data/1.zarr').exists()

# check if dimension 'n_time' exists in agent_data.zarr
agent_data = xr.open_zarr('my_model/agent_data.zarr')
assert 'n_time' in agent_data.dims

# check variable names in edge_data/1.zarr
edge_data = xr.open_zarr('my_model/edge_data/1.zarr')
assert 'weight' in edge_data.variables


class TestInitializeModel:
def test_set_model_parameters(self):
model = dgl_ptm.PovertyTrapModel(model_identifier='test_model')
@@ -226,31 +226,57 @@ def test_run(self, model):
assert model.model_graph.number_of_nodes() == 100

def test_model_init_savestate(self, model):
model.savestate = 1
model.checkpoint_period = 1
model.run()

assert model.inputs is not None
assert Path('my_model/model_graphs.bin').exists()
assert Path('my_model/model_graph.bin').exists()
assert Path('my_model/generator_state.bin').exists()
assert Path('my_model/version.md').exists()
assert model.inputs["step_count"] == 5

def test_model_init_savestate_not_default(self, model):
model.savestate = 2
model.checkpoint_period = 2
model.run()

assert model.inputs["step_count"] == 4

def test_model_init_restart(self, model):
model.savestate = 1
model.checkpoint_period = 1
model.step_target = 3 # only run the model till step 3
model.run()
expected_generator_state = set(model.inputs["generator_state"].tolist())

model.restart = True
model.step_target = 5 # contiune the model till step 5
model.run()
model.step_target = 5 # restart the model and run till step 5
model.run(restart=True)
stored_generator_state = set(model.inputs["generator_state"].tolist())

assert model.inputs is not None
assert model.inputs["step_count"] == 5
assert stored_generator_state == expected_generator_state

def test_model_milestone(self, model):
model.milestones = [2]
model.run()

assert model.inputs is not None
assert Path('my_model/milestone_2/model_graph.bin').exists()
assert Path('my_model/milestone_2/generator_state.bin').exists()
assert Path('my_model/milestone_2/version.md').exists()
assert model.inputs["step_count"] == 2

def test_model_milestone_restart(self, model):
model.milestones = [1]
model.step_target = 3 # only run the model till step 3
model.run()
expected_generator_state = set(model.inputs["generator_state"].tolist())

Member: Can we add some checks here before restarting the model?

Author: Which checks do you think are needed?

Author: TODO: before the second run, check that the correct config file is present and that the model has been saved.

model.step_target = 5 # restart the model and run till step 5
model.run(restart=1)
stored_generator_state = set(model.inputs["generator_state"].tolist())

assert model.inputs is not None
assert model.inputs["step_count"] == 1
assert model.step_count == 5
assert stored_generator_state == expected_generator_state

Member: Tests are missing for the case isinstance(restart, tuple), e.g. model.run(restart=(2, 3)).
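A sketch of such a test, mirroring test_model_milestone_restart above (hypothetical; it assumes a second pass through step 2 produces my_model/milestone_2_1 via _make_path_unique):

def test_model_milestone_restart_tuple(self, model):
    model.milestones = [2]
    model.step_target = 3
    model.run()                # first pass saves my_model/milestone_2

    model.step_count = 0       # illustrative reset to trigger a second save
    model.run()                # second pass saves my_model/milestone_2_1

    model.step_target = 5
    model.run(restart=(2, 1))  # resume from my_model/milestone_2_1
    assert model.step_count == 5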

1 change: 1 addition & 0 deletions dgl_ptm/version.md
@@ -0,0 +1 @@
d3da61c8094ea16199e12f767b3ec8cfd1f4dae8