From d27c8b13f7c61af587cd5ae608a6b8a95b9a5117 Mon Sep 17 00:00:00 2001 From: dachengx Date: Thu, 9 Nov 2023 19:27:26 -0600 Subject: [PATCH 1/7] Read poi directly from output_filename to accelerate NeymanConstructor --- alea/submitters/local.py | 83 +++++++++++++++++++++++++++++++--------- 1 file changed, 64 insertions(+), 19 deletions(-) diff --git a/alea/submitters/local.py b/alea/submitters/local.py index 1f65eaef..828b1085 100644 --- a/alea/submitters/local.py +++ b/alea/submitters/local.py @@ -5,10 +5,12 @@ import subprocess import itertools import operator +from glob import glob from functools import reduce from copy import deepcopy from typing import List, Dict, Any, Optional, Callable, cast +import h5py import numpy as np from scipy.interpolate import interp1d, RegularGridInterpolator from inference_interface import toyfiles_to_numpy @@ -68,13 +70,14 @@ class NeymanConstructor(SubmitterLocal): """ - allowed_special_args = ["free_name", "true_name", "confidence_levels"] + allowed_special_args = ["free_name", "true_name", "confidence_levels", "check_poi_expectation"] def submit( self, free_name: str = "free", true_name: str = "true", confidence_levels: List[float] = [0.8, 0.9, 0.95], + check_poi_expectation: bool = False, ): """Read the likelihood ratio from the output files and calculate the Neyman threshold. The threshold will be saved into a json file. The threshold will be sorted based on the elements @@ -84,6 +87,8 @@ def submit( free_name: the name of the free hypothesis true_name: the name of the true hypothesis confidence_levels: the confidence levels to calculate the threshold + check_poi_expectation: whether to check if the poi_expectation is consistent in + runner args and model's expectation values Example: >>> data = json.load(open("limit_threshold.json")); print(json.dumps(data, indent=4)) @@ -164,11 +169,45 @@ def submit( } # read the likelihood ratio - output_filename = runner_args["output_filename"] - # add a * to the output_filename to read all the files + output_filename = runner_args["output_filename"].format(**needed_kwargs) + # try to add a * to the output_filename to read all the files fpat_split = os.path.splitext(output_filename) - output_filename = fpat_split[0] + "*" + fpat_split[1] - results = toyfiles_to_numpy(output_filename.format(**needed_kwargs)) + _output_filename = fpat_split[0] + "_*" + fpat_split[1] + if len(sorted(glob(_output_filename))) != 0: + output_filename = _output_filename + output_filename_list = sorted(glob(output_filename)) + if len(output_filename_list) == 0: + raise ValueError(f"Can not find any output file {output_filename}!") + # read metadata including generate_values + metadata_list = [] + for _output_filename in output_filename_list: + with h5py.File(_output_filename, "r", libver="latest", swmr=True) as ipt: + metadata = dict( + zip( + ipt.attrs.keys(), + [json.loads(ipt.attrs[key]) for key in ipt.attrs.keys()], + ) + ) + metadata.pop("date", None) + metadata_list.append(metadata) + if len(set([deterministic_hash(m) for m in metadata_list])) != 1: + raise ValueError( + "The metadata are not the same for all " + f"the {len(output_filename_list)} output!" + ) + metadata = metadata_list[0] + if metadata["poi"] != self.poi: + raise ValueError( + f"The poi in the metadata {metadata['poi']} is not " + f"the same as the poi {self.poi}!" 
+ ) + needed_kwargs = {**metadata["generate_values"], **needed_kwargs} + poi_expectation = needed_kwargs.get("poi_expectation", None) + poi_value = needed_kwargs.get(self.poi, None) + if poi_value is None: + raise ValueError("Can not find the poi value in the generate_values in metadata!") + # read the likelihood ratio + results = toyfiles_to_numpy(output_filename) llfree = results[free_name]["ll"] lltrue = results[true_name]["ll"] llrs = 2.0 * (llfree - lltrue) @@ -179,24 +218,30 @@ def submit( f"{(llrs < 0.0).sum() / len(llrs):.2f}, " "the median if negative log likelihood ratios " f"is {np.median(llrs[llrs < 0.0]):.2e}, " - f"there might be a problem in your fitting.", + f"there might be a problem in your fitting." ) if len(llrs) < 1000: self.logging.warning( - "The number of toys is less than 1000, the threshold might not be accurate!", + "The number of toys is less than 1000, the threshold might not be accurate!" ) - # update poi according to poi_expectation - runner_args["statistical_model_args"].pop("limit_threshold", None) - runner = Runner(**runner_args) - expectation_values = runner.model.get_expectation_values( - **{**nominal_values, **generate_values} - ) - # in some rare cases the poi is not a rate multiplier - # then the poi_expectation is not in the nominal_expectation_values - component = self.poi.replace("_rate_multiplier", "") - poi_expectation = expectation_values.get(component, None) - poi_value = generate_values.pop(self.poi) + if check_poi_expectation: + # check if the poi_expectation is consistent + runner_args["statistical_model_args"].pop("limit_threshold", None) + runner = Runner(**runner_args) + expectation_values = runner.model.get_expectation_values( + **{**nominal_values, **generate_values} + ) + component = self.poi.replace("_rate_multiplier", "") + # in some rare cases the poi is not a rate multiplier + # then the poi_expectation is not in the nominal_expectation_values + _poi_expectation = expectation_values.get(component, None) + if None not in [poi_expectation, _poi_expectation]: + if poi_expectation != _poi_expectation: + raise ValueError( + f"The poi_expectation from model {poi_expectation} is not " + f"the same as the poi_expectation from toymc {_poi_expectation}!" 
+ ) # make sure no poi and poi_expectation in the hashed_keys generate_values.pop(self.poi, None) @@ -226,7 +271,7 @@ def submit( "threshold": [], "poi_expectation": [], } - threshold[threshold_key] = threshold_value + threshold[threshold_key] = deepcopy(threshold_value) threshold[threshold_key][self.poi].append(poi_value) threshold[threshold_key]["threshold"].append(q_llr) threshold[threshold_key]["poi_expectation"].append(poi_expectation) From efb7a216e788bb2547584392c9a0cc5b80f3cbbf Mon Sep 17 00:00:00 2001 From: dachengx Date: Thu, 9 Nov 2023 22:47:57 -0600 Subject: [PATCH 2/7] Add option to add poi_expectation to limit_threshold --- alea/models/blueice_extended_model.py | 25 ++++++++++- alea/submitters/local.py | 65 ++++++++++++++++----------- 2 files changed, 63 insertions(+), 27 deletions(-) diff --git a/alea/models/blueice_extended_model.py b/alea/models/blueice_extended_model.py index da107c52..231201da 100644 --- a/alea/models/blueice_extended_model.py +++ b/alea/models/blueice_extended_model.py @@ -2,6 +2,7 @@ from typing import List, Dict, Callable, Optional, Union, cast from copy import deepcopy from pydoc import locate +import itertools import numpy as np import scipy.stats as stats @@ -138,11 +139,31 @@ def get_source_name_list(self, likelihood_name: str) -> list: ll_index = self.likelihood_names.index(likelihood_name) return self.likelihood_list[ll_index].source_name_list + def get_all_source_name(self) -> set: + """Return a set of possible source names from all likelihood terms. + + Args: + likelihood_name (str): Name of the likelihood. + + Returns: + set: set of source names. + + """ + source_name = set( + itertools.chain.from_iterable([ll.source_name_list for ll in self.likelihood_list[:-1]]) + ) + return source_name + @property def likelihood_list(self) -> List: """Return a list of likelihood terms.""" return self._likelihood.likelihood_list + @property + def likelihood_parameters(self) -> List: + """Return a list of likelihood parameters.""" + return self._likelihood.likelihood_parameters + def get_expectation_values(self, per_likelihood_term=False, **kwargs) -> dict: """Return total expectation values (summed over all likelihood terms with the same name) given a number of named parameters (kwargs) @@ -179,9 +200,9 @@ def get_expectation_values(self, per_likelihood_term=False, **kwargs) -> dict: # ancillary likelihood does not contribute for ll_term, ll_name, parameter_names, livetime_parameter in zip( - self_copy._likelihood.likelihood_list[:-1], + self_copy.likelihood_list[:-1], self_copy.likelihood_names[:-1], - self_copy._likelihood.likelihood_parameters, + self_copy.likelihood_parameters, self_copy.livetime_parameter_names, ): ret[ll_name] = {} diff --git a/alea/submitters/local.py b/alea/submitters/local.py index 828b1085..89df17bc 100644 --- a/alea/submitters/local.py +++ b/alea/submitters/local.py @@ -70,14 +70,14 @@ class NeymanConstructor(SubmitterLocal): """ - allowed_special_args = ["free_name", "true_name", "confidence_levels", "check_poi_expectation"] + allowed_special_args = ["free_name", "true_name", "confidence_levels", "add_poi_expectation"] def submit( self, free_name: str = "free", true_name: str = "true", confidence_levels: List[float] = [0.8, 0.9, 0.95], - check_poi_expectation: bool = False, + add_poi_expectation: bool = False, ): """Read the likelihood ratio from the output files and calculate the Neyman threshold. The threshold will be saved into a json file. 
The threshold will be sorted based on the elements @@ -87,8 +87,9 @@ def submit( free_name: the name of the free hypothesis true_name: the name of the true hypothesis confidence_levels: the confidence levels to calculate the threshold - check_poi_expectation: whether to check if the poi_expectation is consistent in - runner args and model's expectation values + add_poi_expectation: whether to calculate and add poi_expectation to limit_threshold, + if so, will also check whether the poi_expectation is consistent in runner args + and model's expectation values Example: >>> data = json.load(open("limit_threshold.json")); print(json.dumps(data, indent=4)) @@ -134,6 +135,9 @@ def submit( if "confidence_levels" in runner_args: confidence_levels = runner_args["confidence_levels"] self.logging.info(f"Overwrite confidence_levels to {confidence_levels}.") + if "add_poi_expectation" in runner_args: + add_poi_expectation = runner_args["add_poi_expectation"] + self.logging.info(f"Overwrite add_poi_expectation to {add_poi_expectation}.") # extract limit_threshold from the statistical_model_args limit_threshold = runner_args["statistical_model_args"].get("limit_threshold", None) @@ -161,8 +165,8 @@ def submit( raise ValueError("confidence_levels" + message) # prepare the needed nominal_values and generate_values - nominal_values = runner_args["nominal_values"] - generate_values = runner_args["generate_values"] + nominal_values = deepcopy(runner_args["nominal_values"]) + generate_values = deepcopy(runner_args["generate_values"]) needed_kwargs = { **nominal_values, **generate_values, @@ -202,10 +206,39 @@ def submit( f"the same as the poi {self.poi}!" ) needed_kwargs = {**metadata["generate_values"], **needed_kwargs} - poi_expectation = needed_kwargs.get("poi_expectation", None) + poi_expectation = needed_kwargs.pop("poi_expectation", None) poi_value = needed_kwargs.get(self.poi, None) if poi_value is None: raise ValueError("Can not find the poi value in the generate_values in metadata!") + if add_poi_expectation: + # check if the poi_expectation is consistent + for args in self.allowed_special_args: + runner_args.pop(args, None) + runner_args["statistical_model_args"].pop("limit_threshold", None) + runner = Runner(**runner_args) + source = self.poi.replace("_rate_multiplier", "") + all_source_name = runner.model.get_all_source_name() + if source not in all_source_name: + raise ValueError( + f"poi {self.poi} does not corresponds to any source in the model!" + " so can not calculate the poi_expectation!" + ) + expectation_values = runner.model.get_expectation_values(**needed_kwargs) + _poi_expectation = expectation_values.get(source, None) + if poi_expectation is not None: + if not np.isclose(poi_expectation, _poi_expectation): + raise ValueError( + f"The poi_expectation from model {poi_expectation} is not " + f"the same as the poi_expectation from toymc {_poi_expectation}!" + ) + else: + warnings.warn( + "Can not find the poi_expectation in the generate_values " + f"{generate_values}, so will not check if the poi_expectation " + "are consistent!" + ) + poi_expectation = _poi_expectation + # read the likelihood ratio results = toyfiles_to_numpy(output_filename) llfree = results[free_name]["ll"] @@ -225,24 +258,6 @@ def submit( "The number of toys is less than 1000, the threshold might not be accurate!" 
) - if check_poi_expectation: - # check if the poi_expectation is consistent - runner_args["statistical_model_args"].pop("limit_threshold", None) - runner = Runner(**runner_args) - expectation_values = runner.model.get_expectation_values( - **{**nominal_values, **generate_values} - ) - component = self.poi.replace("_rate_multiplier", "") - # in some rare cases the poi is not a rate multiplier - # then the poi_expectation is not in the nominal_expectation_values - _poi_expectation = expectation_values.get(component, None) - if None not in [poi_expectation, _poi_expectation]: - if poi_expectation != _poi_expectation: - raise ValueError( - f"The poi_expectation from model {poi_expectation} is not " - f"the same as the poi_expectation from toymc {_poi_expectation}!" - ) - # make sure no poi and poi_expectation in the hashed_keys generate_values.pop(self.poi, None) generate_values.pop("poi_expectation", None) From 8f0d11e4585b2960273cdc05e28f8c64f66bb7cf Mon Sep 17 00:00:00 2001 From: dachengx Date: Fri, 10 Nov 2023 00:22:16 -0600 Subject: [PATCH 3/7] Save expectation_values to metadata, so NeymanConstructor can refer to --- alea/runner.py | 3 +++ alea/submitters/local.py | 39 ++++++++++++++++----------------------- 2 files changed, 19 insertions(+), 23 deletions(-) diff --git a/alea/runner.py b/alea/runner.py index 1b4d896a..722d18e9 100644 --- a/alea/runner.py +++ b/alea/runner.py @@ -119,6 +119,7 @@ def __init__( statistical_model_args = {} # nominal_values is keyword argument self.nominal_values = nominal_values if nominal_values else {} + # initialize nominal_values only once statistical_model_args["nominal_values"] = self.nominal_values # likelihood_config is keyword argument, because not all statistical model needs it statistical_model_args["likelihood_config"] = likelihood_config @@ -338,6 +339,8 @@ def write_output(self, results): metadata["poi"] = self.poi metadata["common_hypothesis"] = self.common_hypothesis metadata["generate_values"] = self.generate_values + metadata["nominal_values"] = self.nominal_values + metadata["expectation_values"] = self.model.get_expectation_values(**self.generate_values) array_metadatas = [{"hypotheses_values": ea} for ea in self._hypotheses_values] numpy_arrays_and_names = [(r, rn) for r, rn in zip(results, result_names)] diff --git a/alea/submitters/local.py b/alea/submitters/local.py index 89df17bc..80490821 100644 --- a/alea/submitters/local.py +++ b/alea/submitters/local.py @@ -70,14 +70,13 @@ class NeymanConstructor(SubmitterLocal): """ - allowed_special_args = ["free_name", "true_name", "confidence_levels", "add_poi_expectation"] + allowed_special_args = ["free_name", "true_name", "confidence_levels"] def submit( self, free_name: str = "free", true_name: str = "true", confidence_levels: List[float] = [0.8, 0.9, 0.95], - add_poi_expectation: bool = False, ): """Read the likelihood ratio from the output files and calculate the Neyman threshold. The threshold will be saved into a json file. 
The threshold will be sorted based on the elements @@ -87,9 +86,6 @@ def submit( free_name: the name of the free hypothesis true_name: the name of the true hypothesis confidence_levels: the confidence levels to calculate the threshold - add_poi_expectation: whether to calculate and add poi_expectation to limit_threshold, - if so, will also check whether the poi_expectation is consistent in runner args - and model's expectation values Example: >>> data = json.load(open("limit_threshold.json")); print(json.dumps(data, indent=4)) @@ -135,9 +131,6 @@ def submit( if "confidence_levels" in runner_args: confidence_levels = runner_args["confidence_levels"] self.logging.info(f"Overwrite confidence_levels to {confidence_levels}.") - if "add_poi_expectation" in runner_args: - add_poi_expectation = runner_args["add_poi_expectation"] - self.logging.info(f"Overwrite add_poi_expectation to {add_poi_expectation}.") # extract limit_threshold from the statistical_model_args limit_threshold = runner_args["statistical_model_args"].get("limit_threshold", None) @@ -182,6 +175,7 @@ def submit( output_filename_list = sorted(glob(output_filename)) if len(output_filename_list) == 0: raise ValueError(f"Can not find any output file {output_filename}!") + # read metadata including generate_values metadata_list = [] for _output_filename in output_filename_list: @@ -205,27 +199,26 @@ def submit( f"The poi in the metadata {metadata['poi']} is not " f"the same as the poi {self.poi}!" ) + + # read poi and poi_expectation needed_kwargs = {**metadata["generate_values"], **needed_kwargs} poi_expectation = needed_kwargs.pop("poi_expectation", None) poi_value = needed_kwargs.get(self.poi, None) if poi_value is None: raise ValueError("Can not find the poi value in the generate_values in metadata!") - if add_poi_expectation: - # check if the poi_expectation is consistent - for args in self.allowed_special_args: - runner_args.pop(args, None) - runner_args["statistical_model_args"].pop("limit_threshold", None) - runner = Runner(**runner_args) - source = self.poi.replace("_rate_multiplier", "") - all_source_name = runner.model.get_all_source_name() - if source not in all_source_name: - raise ValueError( - f"poi {self.poi} does not corresponds to any source in the model!" - " so can not calculate the poi_expectation!" - ) - expectation_values = runner.model.get_expectation_values(**needed_kwargs) - _poi_expectation = expectation_values.get(source, None) + # read expectation_values from metadata + expectation_values = metadata["expectation_values"] + # check if the poi_expectation is in expectation_values + source = self.poi.replace("_rate_multiplier", "") + if source not in expectation_values: + warnings.warn( + f"poi {self.poi} does not corresponds to any source in the model!" + " so can not calculate the poi_expectation!" 
+ ) + else: + _poi_expectation = expectation_values[source] if poi_expectation is not None: + # check if the poi_expectation is consistent if not np.isclose(poi_expectation, _poi_expectation): raise ValueError( f"The poi_expectation from model {poi_expectation} is not " From e39d0a416ccce658a07c7265e9c44343bb334515 Mon Sep 17 00:00:00 2001 From: dachengx Date: Fri, 10 Nov 2023 00:33:26 -0600 Subject: [PATCH 4/7] Be compatible to the simplest model --- alea/model.py | 2 +- alea/runner.py | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/alea/model.py b/alea/model.py index f7bdf748..cfb85378 100644 --- a/alea/model.py +++ b/alea/model.py @@ -255,7 +255,7 @@ def get_expectation_values(self, **parameter_values): parameter_values: values of the parameters """ - return NotImplementedError("get_expectation_values is optional to implement") + raise NotImplementedError("get_expectation_values is optional to implement") @property def nominal_expectation_values(self): diff --git a/alea/runner.py b/alea/runner.py index 722d18e9..68ec059f 100644 --- a/alea/runner.py +++ b/alea/runner.py @@ -340,7 +340,12 @@ def write_output(self, results): metadata["common_hypothesis"] = self.common_hypothesis metadata["generate_values"] = self.generate_values metadata["nominal_values"] = self.nominal_values - metadata["expectation_values"] = self.model.get_expectation_values(**self.generate_values) + try: + metadata["expectation_values"] = self.model.get_expectation_values( + **self.generate_values + ) + except NotImplementedError: + metadata["expectation_values"] = {} array_metadatas = [{"hypotheses_values": ea} for ea in self._hypotheses_values] numpy_arrays_and_names = [(r, rn) for r, rn in zip(results, result_names)] From cf02ab02779663d3d82fd8f114504c122feb82e9 Mon Sep 17 00:00:00 2001 From: dachengx Date: Mon, 13 Nov 2023 08:12:07 -0600 Subject: [PATCH 5/7] Remove unnecessary functions --- alea/models/blueice_extended_model.py | 25 ++----------------------- 1 file changed, 2 insertions(+), 23 deletions(-) diff --git a/alea/models/blueice_extended_model.py b/alea/models/blueice_extended_model.py index 231201da..da107c52 100644 --- a/alea/models/blueice_extended_model.py +++ b/alea/models/blueice_extended_model.py @@ -2,7 +2,6 @@ from typing import List, Dict, Callable, Optional, Union, cast from copy import deepcopy from pydoc import locate -import itertools import numpy as np import scipy.stats as stats @@ -139,31 +138,11 @@ def get_source_name_list(self, likelihood_name: str) -> list: ll_index = self.likelihood_names.index(likelihood_name) return self.likelihood_list[ll_index].source_name_list - def get_all_source_name(self) -> set: - """Return a set of possible source names from all likelihood terms. - - Args: - likelihood_name (str): Name of the likelihood. - - Returns: - set: set of source names. 
- - """ - source_name = set( - itertools.chain.from_iterable([ll.source_name_list for ll in self.likelihood_list[:-1]]) - ) - return source_name - @property def likelihood_list(self) -> List: """Return a list of likelihood terms.""" return self._likelihood.likelihood_list - @property - def likelihood_parameters(self) -> List: - """Return a list of likelihood parameters.""" - return self._likelihood.likelihood_parameters - def get_expectation_values(self, per_likelihood_term=False, **kwargs) -> dict: """Return total expectation values (summed over all likelihood terms with the same name) given a number of named parameters (kwargs) @@ -200,9 +179,9 @@ def get_expectation_values(self, per_likelihood_term=False, **kwargs) -> dict: # ancillary likelihood does not contribute for ll_term, ll_name, parameter_names, livetime_parameter in zip( - self_copy.likelihood_list[:-1], + self_copy._likelihood.likelihood_list[:-1], self_copy.likelihood_names[:-1], - self_copy.likelihood_parameters, + self_copy._likelihood.likelihood_parameters, self_copy.livetime_parameter_names, ): ret[ll_name] = {} From 2b38a2f9ca0106ab6f68cabc8b783ac054b3c94a Mon Sep 17 00:00:00 2001 From: dachengx Date: Mon, 13 Nov 2023 08:58:25 -0600 Subject: [PATCH 6/7] Move out search_filename_pattern to utils --- alea/submitters/local.py | 16 ++++++---------- alea/utils.py | 22 ++++++++++++++++++++++ 2 files changed, 28 insertions(+), 10 deletions(-) diff --git a/alea/submitters/local.py b/alea/submitters/local.py index 80490821..0b0ee836 100644 --- a/alea/submitters/local.py +++ b/alea/submitters/local.py @@ -21,6 +21,7 @@ load_json, asymptotic_critical_value, deterministic_hash, + search_filename_pattern, ) @@ -166,15 +167,10 @@ def submit( } # read the likelihood ratio - output_filename = runner_args["output_filename"].format(**needed_kwargs) - # try to add a * to the output_filename to read all the files - fpat_split = os.path.splitext(output_filename) - _output_filename = fpat_split[0] + "_*" + fpat_split[1] - if len(sorted(glob(_output_filename))) != 0: - output_filename = _output_filename - output_filename_list = sorted(glob(output_filename)) - if len(output_filename_list) == 0: - raise ValueError(f"Can not find any output file {output_filename}!") + output_filename_pattern = search_filename_pattern( + runner_args["output_filename"].format(**needed_kwargs) + ) + output_filename_list = sorted(glob(output_filename_pattern)) # read metadata including generate_values metadata_list = [] @@ -233,7 +229,7 @@ def submit( poi_expectation = _poi_expectation # read the likelihood ratio - results = toyfiles_to_numpy(output_filename) + results = toyfiles_to_numpy(output_filename_pattern) llfree = results[free_name]["ll"] lltrue = results[true_name]["ll"] llrs = 2.0 * (llfree - lltrue) diff --git a/alea/utils.py b/alea/utils.py index 30b1e896..b04f159e 100644 --- a/alea/utils.py +++ b/alea/utils.py @@ -374,6 +374,28 @@ def add_i_batch(filename: str) -> str: return fpat_split[0] + "_{i_batch:d}" + fpat_split[1] +def search_filename_pattern(filename: str) -> str: + """Return pattern for a given existing filename. This is needed because sometimes the filename + is not appended by "_{i_batch:d}". We need to distinguish between the two cases and return the + correct pattern. 
+ + Returns: + str: existing pattern for filename, either filename or filename w/ inserted "_*" + + """ + # try to add a * to the filename to read all the files + fpat_split = os.path.splitext(filename) + _filename = fpat_split[0] + "_*" + fpat_split[1] + if len(sorted(glob(_filename))) != 0: + pattern = _filename + else: + pattern = filename + filename_list = sorted(glob(pattern)) + if len(filename_list) == 0: + raise ValueError(f"Can not find any output file {filename}!") + return pattern + + def can_expand_grid(variations: dict) -> bool: """Check if variations can be expanded into a grid. From 7ef203f22f0d80c5f5be6ef54aad44a4fd141164 Mon Sep 17 00:00:00 2001 From: dachengx Date: Mon, 13 Nov 2023 09:12:36 -0600 Subject: [PATCH 7/7] Split functions to utils and methods of NeymanConstructor --- alea/submitters/local.py | 115 ++++++++++++++++++++------------------- alea/utils.py | 18 ++++++ 2 files changed, 78 insertions(+), 55 deletions(-) diff --git a/alea/submitters/local.py b/alea/submitters/local.py index 0b0ee836..ec31daaa 100644 --- a/alea/submitters/local.py +++ b/alea/submitters/local.py @@ -10,7 +10,6 @@ from copy import deepcopy from typing import List, Dict, Any, Optional, Callable, cast -import h5py import numpy as np from scipy.interpolate import interp1d, RegularGridInterpolator from inference_interface import toyfiles_to_numpy @@ -22,6 +21,7 @@ asymptotic_critical_value, deterministic_hash, search_filename_pattern, + get_metadata, ) @@ -166,67 +166,20 @@ def submit( **generate_values, } - # read the likelihood ratio + # get the output filename pattern output_filename_pattern = search_filename_pattern( runner_args["output_filename"].format(**needed_kwargs) ) - output_filename_list = sorted(glob(output_filename_pattern)) # read metadata including generate_values - metadata_list = [] - for _output_filename in output_filename_list: - with h5py.File(_output_filename, "r", libver="latest", swmr=True) as ipt: - metadata = dict( - zip( - ipt.attrs.keys(), - [json.loads(ipt.attrs[key]) for key in ipt.attrs.keys()], - ) - ) - metadata.pop("date", None) - metadata_list.append(metadata) - if len(set([deterministic_hash(m) for m in metadata_list])) != 1: - raise ValueError( - "The metadata are not the same for all " - f"the {len(output_filename_list)} output!" - ) - metadata = metadata_list[0] - if metadata["poi"] != self.poi: - raise ValueError( - f"The poi in the metadata {metadata['poi']} is not " - f"the same as the poi {self.poi}!" - ) + metadata = self._read_metadata(output_filename_pattern) - # read poi and poi_expectation + # combine the generate_values from + # the metadata(where the _rate_multiplier is already calculated) needed_kwargs = {**metadata["generate_values"], **needed_kwargs} - poi_expectation = needed_kwargs.pop("poi_expectation", None) - poi_value = needed_kwargs.get(self.poi, None) - if poi_value is None: - raise ValueError("Can not find the poi value in the generate_values in metadata!") - # read expectation_values from metadata - expectation_values = metadata["expectation_values"] - # check if the poi_expectation is in expectation_values - source = self.poi.replace("_rate_multiplier", "") - if source not in expectation_values: - warnings.warn( - f"poi {self.poi} does not corresponds to any source in the model!" - " so can not calculate the poi_expectation!" 
- ) - else: - _poi_expectation = expectation_values[source] - if poi_expectation is not None: - # check if the poi_expectation is consistent - if not np.isclose(poi_expectation, _poi_expectation): - raise ValueError( - f"The poi_expectation from model {poi_expectation} is not " - f"the same as the poi_expectation from toymc {_poi_expectation}!" - ) - else: - warnings.warn( - "Can not find the poi_expectation in the generate_values " - f"{generate_values}, so will not check if the poi_expectation " - "are consistent!" - ) - poi_expectation = _poi_expectation + + # read poi and poi_expectation + poi_value, poi_expectation = self._read_poi(needed_kwargs) # read the likelihood ratio results = toyfiles_to_numpy(output_filename_pattern) @@ -294,6 +247,58 @@ def submit( json.dump(threshold, f, indent=4) print(f"Saving {limit_threshold}") + def _read_metadata(self, output_filename_pattern): + """Read metadata from the output files.""" + output_filename_list = sorted(glob(output_filename_pattern)) + metadata_list = get_metadata(output_filename_pattern) + for m in metadata_list: + m.pop("date", None) + if len(set([deterministic_hash(m) for m in metadata_list])) != 1: + raise ValueError( + f"The metadata are not the same for all the {len(output_filename_list)} output!" + ) + metadata = metadata_list[0] + if metadata["poi"] != self.poi: + raise ValueError( + f"The poi in the metadata {metadata['poi']} is not " + f"the same as the poi {self.poi}!" + ) + return metadata + + def _read_poi(self, metadata, **kwargs): + """Read poi and poi_expectation from the metadata, and check if the poi_expectation is + consistent with the poi_expectation from the model.""" + poi_expectation = kwargs.pop("poi_expectation", None) + poi_value = kwargs.get(self.poi, None) + if poi_value is None: + raise ValueError("Can not find the poi value in the generate_values in metadata!") + # read expectation_values from metadata + expectation_values = metadata["expectation_values"] + # check if the poi_expectation is in expectation_values + source = self.poi.replace("_rate_multiplier", "") + if source not in expectation_values: + warnings.warn( + f"poi {self.poi} does not corresponds to any source in the model!" + " so can not calculate the poi_expectation!" + ) + else: + _poi_expectation = expectation_values[source] + if poi_expectation is not None: + # check if the poi_expectation is consistent + if not np.isclose(poi_expectation, _poi_expectation): + raise ValueError( + f"The poi_expectation from model {poi_expectation} is not " + f"the same as the poi_expectation from toymc {_poi_expectation}!" + ) + else: + warnings.warn( + "Can not find the poi_expectation in the generate_values, " + "so will not check if the poi_expectation " + "are consistent!" 
+ ) + poi_expectation = _poi_expectation + return poi_value, poi_expectation + @staticmethod def build_interpolator( poi, diff --git a/alea/utils.py b/alea/utils.py index b04f159e..818286a5 100644 --- a/alea/utils.py +++ b/alea/utils.py @@ -13,6 +13,8 @@ from collections.abc import Mapping from typing import Any, List, Dict, Tuple, Optional, Union, cast, get_args, get_origin +import h5py + # These imports are needed to evaluate strings import numpy # noqa: F401 import numpy as np # noqa: F401 @@ -396,6 +398,22 @@ def search_filename_pattern(filename: str) -> str: return pattern +def get_metadata(output_filename_pattern: str) -> list: + """Get metadata from output files.""" + output_filename_list = sorted(glob(output_filename_pattern)) + metadata_list = [] + for _output_filename in output_filename_list: + with h5py.File(_output_filename, "r", libver="latest", swmr=True) as ipt: + metadata = dict( + zip( + ipt.attrs.keys(), + [json.loads(ipt.attrs[key]) for key in ipt.attrs.keys()], + ) + ) + metadata_list.append(metadata) + return metadata_list + + def can_expand_grid(variations: dict) -> bool: """Check if variations can be expanded into a grid.
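
Usage sketch (illustrative only, not part of the patch series): patch 6 moves
search_filename_pattern into alea/utils.py and patch 7 adds get_metadata there, so the
filename-pattern resolution and the metadata read-back (the attributes written by
Runner.write_output, extended in patch 3) can also be combined outside the
NeymanConstructor. The filename "toymc.h5" below is a hypothetical example.

    from glob import glob
    from alea.utils import search_filename_pattern, get_metadata

    # Resolve the pattern: "toymc.h5" becomes "toymc_*.h5" if batched output files
    # exist, otherwise the filename itself is kept; a ValueError is raised if
    # nothing matches at all.
    pattern = search_filename_pattern("toymc.h5")
    # One metadata dict per matched file, read from the HDF5 attributes stored by
    # the Runner, e.g. "poi" and "generate_values".
    metadata_list = get_metadata(pattern)
    for filename, metadata in zip(sorted(glob(pattern)), metadata_list):
        print(filename, metadata["poi"], metadata["generate_values"])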