From d27c8b13f7c61af587cd5ae608a6b8a95b9a5117 Mon Sep 17 00:00:00 2001 From: dachengx Date: Thu, 9 Nov 2023 19:27:26 -0600 Subject: [PATCH 1/7] Read poi directly from output_filename to accelerate NeymanConstructor --- alea/submitters/local.py | 83 +++++++++++++++++++++++++++++++--------- 1 file changed, 64 insertions(+), 19 deletions(-) diff --git a/alea/submitters/local.py b/alea/submitters/local.py index 1f65eaef..828b1085 100644 --- a/alea/submitters/local.py +++ b/alea/submitters/local.py @@ -5,10 +5,12 @@ import subprocess import itertools import operator +from glob import glob from functools import reduce from copy import deepcopy from typing import List, Dict, Any, Optional, Callable, cast +import h5py import numpy as np from scipy.interpolate import interp1d, RegularGridInterpolator from inference_interface import toyfiles_to_numpy @@ -68,13 +70,14 @@ class NeymanConstructor(SubmitterLocal): """ - allowed_special_args = ["free_name", "true_name", "confidence_levels"] + allowed_special_args = ["free_name", "true_name", "confidence_levels", "check_poi_expectation"] def submit( self, free_name: str = "free", true_name: str = "true", confidence_levels: List[float] = [0.8, 0.9, 0.95], + check_poi_expectation: bool = False, ): """Read the likelihood ratio from the output files and calculate the Neyman threshold. The threshold will be saved into a json file. The threshold will be sorted based on the elements @@ -84,6 +87,8 @@ def submit( free_name: the name of the free hypothesis true_name: the name of the true hypothesis confidence_levels: the confidence levels to calculate the threshold + check_poi_expectation: whether to check if the poi_expectation is consistent in + runner args and model's expectation values Example: >>> data = json.load(open("limit_threshold.json")); print(json.dumps(data, indent=4)) @@ -164,11 +169,45 @@ def submit( } # read the likelihood ratio - output_filename = runner_args["output_filename"] - # add a * to the output_filename to read all the files + output_filename = runner_args["output_filename"].format(**needed_kwargs) + # try to add a * to the output_filename to read all the files fpat_split = os.path.splitext(output_filename) - output_filename = fpat_split[0] + "*" + fpat_split[1] - results = toyfiles_to_numpy(output_filename.format(**needed_kwargs)) + _output_filename = fpat_split[0] + "_*" + fpat_split[1] + if len(sorted(glob(_output_filename))) != 0: + output_filename = _output_filename + output_filename_list = sorted(glob(output_filename)) + if len(output_filename_list) == 0: + raise ValueError(f"Can not find any output file {output_filename}!") + # read metadata including generate_values + metadata_list = [] + for _output_filename in output_filename_list: + with h5py.File(_output_filename, "r", libver="latest", swmr=True) as ipt: + metadata = dict( + zip( + ipt.attrs.keys(), + [json.loads(ipt.attrs[key]) for key in ipt.attrs.keys()], + ) + ) + metadata.pop("date", None) + metadata_list.append(metadata) + if len(set([deterministic_hash(m) for m in metadata_list])) != 1: + raise ValueError( + "The metadata are not the same for all " + f"the {len(output_filename_list)} output!" + ) + metadata = metadata_list[0] + if metadata["poi"] != self.poi: + raise ValueError( + f"The poi in the metadata {metadata['poi']} is not " + f"the same as the poi {self.poi}!" 
+ ) + needed_kwargs = {**metadata["generate_values"], **needed_kwargs} + poi_expectation = needed_kwargs.get("poi_expectation", None) + poi_value = needed_kwargs.get(self.poi, None) + if poi_value is None: + raise ValueError("Can not find the poi value in the generate_values in metadata!") + # read the likelihood ratio + results = toyfiles_to_numpy(output_filename) llfree = results[free_name]["ll"] lltrue = results[true_name]["ll"] llrs = 2.0 * (llfree - lltrue) @@ -179,24 +218,30 @@ def submit( f"{(llrs < 0.0).sum() / len(llrs):.2f}, " "the median if negative log likelihood ratios " f"is {np.median(llrs[llrs < 0.0]):.2e}, " - f"there might be a problem in your fitting.", + f"there might be a problem in your fitting." ) if len(llrs) < 1000: self.logging.warning( - "The number of toys is less than 1000, the threshold might not be accurate!", + "The number of toys is less than 1000, the threshold might not be accurate!" ) - # update poi according to poi_expectation - runner_args["statistical_model_args"].pop("limit_threshold", None) - runner = Runner(**runner_args) - expectation_values = runner.model.get_expectation_values( - **{**nominal_values, **generate_values} - ) - # in some rare cases the poi is not a rate multiplier - # then the poi_expectation is not in the nominal_expectation_values - component = self.poi.replace("_rate_multiplier", "") - poi_expectation = expectation_values.get(component, None) - poi_value = generate_values.pop(self.poi) + if check_poi_expectation: + # check if the poi_expectation is consistent + runner_args["statistical_model_args"].pop("limit_threshold", None) + runner = Runner(**runner_args) + expectation_values = runner.model.get_expectation_values( + **{**nominal_values, **generate_values} + ) + component = self.poi.replace("_rate_multiplier", "") + # in some rare cases the poi is not a rate multiplier + # then the poi_expectation is not in the nominal_expectation_values + _poi_expectation = expectation_values.get(component, None) + if None not in [poi_expectation, _poi_expectation]: + if poi_expectation != _poi_expectation: + raise ValueError( + f"The poi_expectation from model {poi_expectation} is not " + f"the same as the poi_expectation from toymc {_poi_expectation}!" 
+ ) # make sure no poi and poi_expectation in the hashed_keys generate_values.pop(self.poi, None) @@ -226,7 +271,7 @@ def submit( "threshold": [], "poi_expectation": [], } - threshold[threshold_key] = threshold_value + threshold[threshold_key] = deepcopy(threshold_value) threshold[threshold_key][self.poi].append(poi_value) threshold[threshold_key]["threshold"].append(q_llr) threshold[threshold_key]["poi_expectation"].append(poi_expectation) From efb7a216e788bb2547584392c9a0cc5b80f3cbbf Mon Sep 17 00:00:00 2001 From: dachengx Date: Thu, 9 Nov 2023 22:47:57 -0600 Subject: [PATCH 2/7] Add option to add poi_expectation to limit_threshold --- alea/models/blueice_extended_model.py | 25 ++++++++++- alea/submitters/local.py | 65 ++++++++++++++++----------- 2 files changed, 63 insertions(+), 27 deletions(-) diff --git a/alea/models/blueice_extended_model.py b/alea/models/blueice_extended_model.py index da107c52..231201da 100644 --- a/alea/models/blueice_extended_model.py +++ b/alea/models/blueice_extended_model.py @@ -2,6 +2,7 @@ from typing import List, Dict, Callable, Optional, Union, cast from copy import deepcopy from pydoc import locate +import itertools import numpy as np import scipy.stats as stats @@ -138,11 +139,31 @@ def get_source_name_list(self, likelihood_name: str) -> list: ll_index = self.likelihood_names.index(likelihood_name) return self.likelihood_list[ll_index].source_name_list + def get_all_source_name(self) -> set: + """Return a set of possible source names from all likelihood terms. + + Args: + likelihood_name (str): Name of the likelihood. + + Returns: + set: set of source names. + + """ + source_name = set( + itertools.chain.from_iterable([ll.source_name_list for ll in self.likelihood_list[:-1]]) + ) + return source_name + @property def likelihood_list(self) -> List: """Return a list of likelihood terms.""" return self._likelihood.likelihood_list + @property + def likelihood_parameters(self) -> List: + """Return a list of likelihood parameters.""" + return self._likelihood.likelihood_parameters + def get_expectation_values(self, per_likelihood_term=False, **kwargs) -> dict: """Return total expectation values (summed over all likelihood terms with the same name) given a number of named parameters (kwargs) @@ -179,9 +200,9 @@ def get_expectation_values(self, per_likelihood_term=False, **kwargs) -> dict: # ancillary likelihood does not contribute for ll_term, ll_name, parameter_names, livetime_parameter in zip( - self_copy._likelihood.likelihood_list[:-1], + self_copy.likelihood_list[:-1], self_copy.likelihood_names[:-1], - self_copy._likelihood.likelihood_parameters, + self_copy.likelihood_parameters, self_copy.livetime_parameter_names, ): ret[ll_name] = {} diff --git a/alea/submitters/local.py b/alea/submitters/local.py index 828b1085..89df17bc 100644 --- a/alea/submitters/local.py +++ b/alea/submitters/local.py @@ -70,14 +70,14 @@ class NeymanConstructor(SubmitterLocal): """ - allowed_special_args = ["free_name", "true_name", "confidence_levels", "check_poi_expectation"] + allowed_special_args = ["free_name", "true_name", "confidence_levels", "add_poi_expectation"] def submit( self, free_name: str = "free", true_name: str = "true", confidence_levels: List[float] = [0.8, 0.9, 0.95], - check_poi_expectation: bool = False, + add_poi_expectation: bool = False, ): """Read the likelihood ratio from the output files and calculate the Neyman threshold. The threshold will be saved into a json file. 
The threshold will be sorted based on the elements @@ -87,8 +87,9 @@ def submit( free_name: the name of the free hypothesis true_name: the name of the true hypothesis confidence_levels: the confidence levels to calculate the threshold - check_poi_expectation: whether to check if the poi_expectation is consistent in - runner args and model's expectation values + add_poi_expectation: whether to calculate and add poi_expectation to limit_threshold, + if so, will also check whether the poi_expectation is consistent in runner args + and model's expectation values Example: >>> data = json.load(open("limit_threshold.json")); print(json.dumps(data, indent=4)) @@ -134,6 +135,9 @@ def submit( if "confidence_levels" in runner_args: confidence_levels = runner_args["confidence_levels"] self.logging.info(f"Overwrite confidence_levels to {confidence_levels}.") + if "add_poi_expectation" in runner_args: + add_poi_expectation = runner_args["add_poi_expectation"] + self.logging.info(f"Overwrite add_poi_expectation to {add_poi_expectation}.") # extract limit_threshold from the statistical_model_args limit_threshold = runner_args["statistical_model_args"].get("limit_threshold", None) @@ -161,8 +165,8 @@ def submit( raise ValueError("confidence_levels" + message) # prepare the needed nominal_values and generate_values - nominal_values = runner_args["nominal_values"] - generate_values = runner_args["generate_values"] + nominal_values = deepcopy(runner_args["nominal_values"]) + generate_values = deepcopy(runner_args["generate_values"]) needed_kwargs = { **nominal_values, **generate_values, @@ -202,10 +206,39 @@ def submit( f"the same as the poi {self.poi}!" ) needed_kwargs = {**metadata["generate_values"], **needed_kwargs} - poi_expectation = needed_kwargs.get("poi_expectation", None) + poi_expectation = needed_kwargs.pop("poi_expectation", None) poi_value = needed_kwargs.get(self.poi, None) if poi_value is None: raise ValueError("Can not find the poi value in the generate_values in metadata!") + if add_poi_expectation: + # check if the poi_expectation is consistent + for args in self.allowed_special_args: + runner_args.pop(args, None) + runner_args["statistical_model_args"].pop("limit_threshold", None) + runner = Runner(**runner_args) + source = self.poi.replace("_rate_multiplier", "") + all_source_name = runner.model.get_all_source_name() + if source not in all_source_name: + raise ValueError( + f"poi {self.poi} does not corresponds to any source in the model!" + " so can not calculate the poi_expectation!" + ) + expectation_values = runner.model.get_expectation_values(**needed_kwargs) + _poi_expectation = expectation_values.get(source, None) + if poi_expectation is not None: + if not np.isclose(poi_expectation, _poi_expectation): + raise ValueError( + f"The poi_expectation from model {poi_expectation} is not " + f"the same as the poi_expectation from toymc {_poi_expectation}!" + ) + else: + warnings.warn( + "Can not find the poi_expectation in the generate_values " + f"{generate_values}, so will not check if the poi_expectation " + "are consistent!" + ) + poi_expectation = _poi_expectation + # read the likelihood ratio results = toyfiles_to_numpy(output_filename) llfree = results[free_name]["ll"] @@ -225,24 +258,6 @@ def submit( "The number of toys is less than 1000, the threshold might not be accurate!" 
) - if check_poi_expectation: - # check if the poi_expectation is consistent - runner_args["statistical_model_args"].pop("limit_threshold", None) - runner = Runner(**runner_args) - expectation_values = runner.model.get_expectation_values( - **{**nominal_values, **generate_values} - ) - component = self.poi.replace("_rate_multiplier", "") - # in some rare cases the poi is not a rate multiplier - # then the poi_expectation is not in the nominal_expectation_values - _poi_expectation = expectation_values.get(component, None) - if None not in [poi_expectation, _poi_expectation]: - if poi_expectation != _poi_expectation: - raise ValueError( - f"The poi_expectation from model {poi_expectation} is not " - f"the same as the poi_expectation from toymc {_poi_expectation}!" - ) - # make sure no poi and poi_expectation in the hashed_keys generate_values.pop(self.poi, None) generate_values.pop("poi_expectation", None) From 8f0d11e4585b2960273cdc05e28f8c64f66bb7cf Mon Sep 17 00:00:00 2001 From: dachengx Date: Fri, 10 Nov 2023 00:22:16 -0600 Subject: [PATCH 3/7] Save expectation_values to metadata, so NeymanConstructor can refer to --- alea/runner.py | 3 +++ alea/submitters/local.py | 39 ++++++++++++++++----------------------- 2 files changed, 19 insertions(+), 23 deletions(-) diff --git a/alea/runner.py b/alea/runner.py index 1b4d896a..722d18e9 100644 --- a/alea/runner.py +++ b/alea/runner.py @@ -119,6 +119,7 @@ def __init__( statistical_model_args = {} # nominal_values is keyword argument self.nominal_values = nominal_values if nominal_values else {} + # initialize nominal_values only once statistical_model_args["nominal_values"] = self.nominal_values # likelihood_config is keyword argument, because not all statistical model needs it statistical_model_args["likelihood_config"] = likelihood_config @@ -338,6 +339,8 @@ def write_output(self, results): metadata["poi"] = self.poi metadata["common_hypothesis"] = self.common_hypothesis metadata["generate_values"] = self.generate_values + metadata["nominal_values"] = self.nominal_values + metadata["expectation_values"] = self.model.get_expectation_values(**self.generate_values) array_metadatas = [{"hypotheses_values": ea} for ea in self._hypotheses_values] numpy_arrays_and_names = [(r, rn) for r, rn in zip(results, result_names)] diff --git a/alea/submitters/local.py b/alea/submitters/local.py index 89df17bc..80490821 100644 --- a/alea/submitters/local.py +++ b/alea/submitters/local.py @@ -70,14 +70,13 @@ class NeymanConstructor(SubmitterLocal): """ - allowed_special_args = ["free_name", "true_name", "confidence_levels", "add_poi_expectation"] + allowed_special_args = ["free_name", "true_name", "confidence_levels"] def submit( self, free_name: str = "free", true_name: str = "true", confidence_levels: List[float] = [0.8, 0.9, 0.95], - add_poi_expectation: bool = False, ): """Read the likelihood ratio from the output files and calculate the Neyman threshold. The threshold will be saved into a json file. 
The threshold will be sorted based on the elements @@ -87,9 +86,6 @@ def submit( free_name: the name of the free hypothesis true_name: the name of the true hypothesis confidence_levels: the confidence levels to calculate the threshold - add_poi_expectation: whether to calculate and add poi_expectation to limit_threshold, - if so, will also check whether the poi_expectation is consistent in runner args - and model's expectation values Example: >>> data = json.load(open("limit_threshold.json")); print(json.dumps(data, indent=4)) @@ -135,9 +131,6 @@ def submit( if "confidence_levels" in runner_args: confidence_levels = runner_args["confidence_levels"] self.logging.info(f"Overwrite confidence_levels to {confidence_levels}.") - if "add_poi_expectation" in runner_args: - add_poi_expectation = runner_args["add_poi_expectation"] - self.logging.info(f"Overwrite add_poi_expectation to {add_poi_expectation}.") # extract limit_threshold from the statistical_model_args limit_threshold = runner_args["statistical_model_args"].get("limit_threshold", None) @@ -182,6 +175,7 @@ def submit( output_filename_list = sorted(glob(output_filename)) if len(output_filename_list) == 0: raise ValueError(f"Can not find any output file {output_filename}!") + # read metadata including generate_values metadata_list = [] for _output_filename in output_filename_list: @@ -205,27 +199,26 @@ def submit( f"The poi in the metadata {metadata['poi']} is not " f"the same as the poi {self.poi}!" ) + + # read poi and poi_expectation needed_kwargs = {**metadata["generate_values"], **needed_kwargs} poi_expectation = needed_kwargs.pop("poi_expectation", None) poi_value = needed_kwargs.get(self.poi, None) if poi_value is None: raise ValueError("Can not find the poi value in the generate_values in metadata!") - if add_poi_expectation: - # check if the poi_expectation is consistent - for args in self.allowed_special_args: - runner_args.pop(args, None) - runner_args["statistical_model_args"].pop("limit_threshold", None) - runner = Runner(**runner_args) - source = self.poi.replace("_rate_multiplier", "") - all_source_name = runner.model.get_all_source_name() - if source not in all_source_name: - raise ValueError( - f"poi {self.poi} does not corresponds to any source in the model!" - " so can not calculate the poi_expectation!" - ) - expectation_values = runner.model.get_expectation_values(**needed_kwargs) - _poi_expectation = expectation_values.get(source, None) + # read expectation_values from metadata + expectation_values = metadata["expectation_values"] + # check if the poi_expectation is in expectation_values + source = self.poi.replace("_rate_multiplier", "") + if source not in expectation_values: + warnings.warn( + f"poi {self.poi} does not corresponds to any source in the model!" + " so can not calculate the poi_expectation!" 
+ ) + else: + _poi_expectation = expectation_values[source] if poi_expectation is not None: + # check if the poi_expectation is consistent if not np.isclose(poi_expectation, _poi_expectation): raise ValueError( f"The poi_expectation from model {poi_expectation} is not " From e39d0a416ccce658a07c7265e9c44343bb334515 Mon Sep 17 00:00:00 2001 From: dachengx Date: Fri, 10 Nov 2023 00:33:26 -0600 Subject: [PATCH 4/7] Be compatible to the simplest model --- alea/model.py | 2 +- alea/runner.py | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/alea/model.py b/alea/model.py index f7bdf748..cfb85378 100644 --- a/alea/model.py +++ b/alea/model.py @@ -255,7 +255,7 @@ def get_expectation_values(self, **parameter_values): parameter_values: values of the parameters """ - return NotImplementedError("get_expectation_values is optional to implement") + raise NotImplementedError("get_expectation_values is optional to implement") @property def nominal_expectation_values(self): diff --git a/alea/runner.py b/alea/runner.py index 722d18e9..68ec059f 100644 --- a/alea/runner.py +++ b/alea/runner.py @@ -340,7 +340,12 @@ def write_output(self, results): metadata["common_hypothesis"] = self.common_hypothesis metadata["generate_values"] = self.generate_values metadata["nominal_values"] = self.nominal_values - metadata["expectation_values"] = self.model.get_expectation_values(**self.generate_values) + try: + metadata["expectation_values"] = self.model.get_expectation_values( + **self.generate_values + ) + except NotImplementedError: + metadata["expectation_values"] = {} array_metadatas = [{"hypotheses_values": ea} for ea in self._hypotheses_values] numpy_arrays_and_names = [(r, rn) for r, rn in zip(results, result_names)] From cf02ab02779663d3d82fd8f114504c122feb82e9 Mon Sep 17 00:00:00 2001 From: dachengx Date: Mon, 13 Nov 2023 08:12:07 -0600 Subject: [PATCH 5/7] Remove unnecessary functions --- alea/models/blueice_extended_model.py | 25 ++----------------------- 1 file changed, 2 insertions(+), 23 deletions(-) diff --git a/alea/models/blueice_extended_model.py b/alea/models/blueice_extended_model.py index 231201da..da107c52 100644 --- a/alea/models/blueice_extended_model.py +++ b/alea/models/blueice_extended_model.py @@ -2,7 +2,6 @@ from typing import List, Dict, Callable, Optional, Union, cast from copy import deepcopy from pydoc import locate -import itertools import numpy as np import scipy.stats as stats @@ -139,31 +138,11 @@ def get_source_name_list(self, likelihood_name: str) -> list: ll_index = self.likelihood_names.index(likelihood_name) return self.likelihood_list[ll_index].source_name_list - def get_all_source_name(self) -> set: - """Return a set of possible source names from all likelihood terms. - - Args: - likelihood_name (str): Name of the likelihood. - - Returns: - set: set of source names. 
- - """ - source_name = set( - itertools.chain.from_iterable([ll.source_name_list for ll in self.likelihood_list[:-1]]) - ) - return source_name - @property def likelihood_list(self) -> List: """Return a list of likelihood terms.""" return self._likelihood.likelihood_list - @property - def likelihood_parameters(self) -> List: - """Return a list of likelihood parameters.""" - return self._likelihood.likelihood_parameters - def get_expectation_values(self, per_likelihood_term=False, **kwargs) -> dict: """Return total expectation values (summed over all likelihood terms with the same name) given a number of named parameters (kwargs) @@ -200,9 +179,9 @@ def get_expectation_values(self, per_likelihood_term=False, **kwargs) -> dict: # ancillary likelihood does not contribute for ll_term, ll_name, parameter_names, livetime_parameter in zip( - self_copy.likelihood_list[:-1], + self_copy._likelihood.likelihood_list[:-1], self_copy.likelihood_names[:-1], - self_copy.likelihood_parameters, + self_copy._likelihood.likelihood_parameters, self_copy.livetime_parameter_names, ): ret[ll_name] = {} From 2b38a2f9ca0106ab6f68cabc8b783ac054b3c94a Mon Sep 17 00:00:00 2001 From: dachengx Date: Mon, 13 Nov 2023 08:58:25 -0600 Subject: [PATCH 6/7] Move out search_filename_pattern to utils --- alea/submitters/local.py | 16 ++++++---------- alea/utils.py | 22 ++++++++++++++++++++++ 2 files changed, 28 insertions(+), 10 deletions(-) diff --git a/alea/submitters/local.py b/alea/submitters/local.py index 80490821..0b0ee836 100644 --- a/alea/submitters/local.py +++ b/alea/submitters/local.py @@ -21,6 +21,7 @@ load_json, asymptotic_critical_value, deterministic_hash, + search_filename_pattern, ) @@ -166,15 +167,10 @@ def submit( } # read the likelihood ratio - output_filename = runner_args["output_filename"].format(**needed_kwargs) - # try to add a * to the output_filename to read all the files - fpat_split = os.path.splitext(output_filename) - _output_filename = fpat_split[0] + "_*" + fpat_split[1] - if len(sorted(glob(_output_filename))) != 0: - output_filename = _output_filename - output_filename_list = sorted(glob(output_filename)) - if len(output_filename_list) == 0: - raise ValueError(f"Can not find any output file {output_filename}!") + output_filename_pattern = search_filename_pattern( + runner_args["output_filename"].format(**needed_kwargs) + ) + output_filename_list = sorted(glob(output_filename_pattern)) # read metadata including generate_values metadata_list = [] @@ -233,7 +229,7 @@ def submit( poi_expectation = _poi_expectation # read the likelihood ratio - results = toyfiles_to_numpy(output_filename) + results = toyfiles_to_numpy(output_filename_pattern) llfree = results[free_name]["ll"] lltrue = results[true_name]["ll"] llrs = 2.0 * (llfree - lltrue) diff --git a/alea/utils.py b/alea/utils.py index 30b1e896..b04f159e 100644 --- a/alea/utils.py +++ b/alea/utils.py @@ -374,6 +374,28 @@ def add_i_batch(filename: str) -> str: return fpat_split[0] + "_{i_batch:d}" + fpat_split[1] +def search_filename_pattern(filename: str) -> str: + """Return pattern for a given existing filename. This is needed because sometimes the filename + is not appended by "_{i_batch:d}". We need to distinguish between the two cases and return the + correct pattern. 
+ + Returns: + str: existing pattern for filename, either filename or filename w/ inserted "_*" + + """ + # try to add a * to the filename to read all the files + fpat_split = os.path.splitext(filename) + _filename = fpat_split[0] + "_*" + fpat_split[1] + if len(sorted(glob(_filename))) != 0: + pattern = _filename + else: + pattern = filename + filename_list = sorted(glob(pattern)) + if len(filename_list) == 0: + raise ValueError(f"Can not find any output file {filename}!") + return pattern + + def can_expand_grid(variations: dict) -> bool: """Check if variations can be expanded into a grid. From 7ef203f22f0d80c5f5be6ef54aad44a4fd141164 Mon Sep 17 00:00:00 2001 From: dachengx Date: Mon, 13 Nov 2023 09:12:36 -0600 Subject: [PATCH 7/7] Split functions to utils and methods of NeymanConstructor --- alea/submitters/local.py | 115 ++++++++++++++++++++------------------- alea/utils.py | 18 ++++++ 2 files changed, 78 insertions(+), 55 deletions(-) diff --git a/alea/submitters/local.py b/alea/submitters/local.py index 0b0ee836..ec31daaa 100644 --- a/alea/submitters/local.py +++ b/alea/submitters/local.py @@ -10,7 +10,6 @@ from copy import deepcopy from typing import List, Dict, Any, Optional, Callable, cast -import h5py import numpy as np from scipy.interpolate import interp1d, RegularGridInterpolator from inference_interface import toyfiles_to_numpy @@ -22,6 +21,7 @@ asymptotic_critical_value, deterministic_hash, search_filename_pattern, + get_metadata, ) @@ -166,67 +166,20 @@ def submit( **generate_values, } - # read the likelihood ratio + # get the output filename pattern output_filename_pattern = search_filename_pattern( runner_args["output_filename"].format(**needed_kwargs) ) - output_filename_list = sorted(glob(output_filename_pattern)) # read metadata including generate_values - metadata_list = [] - for _output_filename in output_filename_list: - with h5py.File(_output_filename, "r", libver="latest", swmr=True) as ipt: - metadata = dict( - zip( - ipt.attrs.keys(), - [json.loads(ipt.attrs[key]) for key in ipt.attrs.keys()], - ) - ) - metadata.pop("date", None) - metadata_list.append(metadata) - if len(set([deterministic_hash(m) for m in metadata_list])) != 1: - raise ValueError( - "The metadata are not the same for all " - f"the {len(output_filename_list)} output!" - ) - metadata = metadata_list[0] - if metadata["poi"] != self.poi: - raise ValueError( - f"The poi in the metadata {metadata['poi']} is not " - f"the same as the poi {self.poi}!" - ) + metadata = self._read_metadata(output_filename_pattern) - # read poi and poi_expectation + # combine the generate_values from + # the metadata(where the _rate_multiplier is already calculated) needed_kwargs = {**metadata["generate_values"], **needed_kwargs} - poi_expectation = needed_kwargs.pop("poi_expectation", None) - poi_value = needed_kwargs.get(self.poi, None) - if poi_value is None: - raise ValueError("Can not find the poi value in the generate_values in metadata!") - # read expectation_values from metadata - expectation_values = metadata["expectation_values"] - # check if the poi_expectation is in expectation_values - source = self.poi.replace("_rate_multiplier", "") - if source not in expectation_values: - warnings.warn( - f"poi {self.poi} does not corresponds to any source in the model!" - " so can not calculate the poi_expectation!" 
- ) - else: - _poi_expectation = expectation_values[source] - if poi_expectation is not None: - # check if the poi_expectation is consistent - if not np.isclose(poi_expectation, _poi_expectation): - raise ValueError( - f"The poi_expectation from model {poi_expectation} is not " - f"the same as the poi_expectation from toymc {_poi_expectation}!" - ) - else: - warnings.warn( - "Can not find the poi_expectation in the generate_values " - f"{generate_values}, so will not check if the poi_expectation " - "are consistent!" - ) - poi_expectation = _poi_expectation + + # read poi and poi_expectation + poi_value, poi_expectation = self._read_poi(needed_kwargs) # read the likelihood ratio results = toyfiles_to_numpy(output_filename_pattern) @@ -294,6 +247,58 @@ def submit( json.dump(threshold, f, indent=4) print(f"Saving {limit_threshold}") + def _read_metadata(self, output_filename_pattern): + """Read metadata from the output files.""" + output_filename_list = sorted(glob(output_filename_pattern)) + metadata_list = get_metadata(output_filename_pattern) + for m in metadata_list: + m.pop("date", None) + if len(set([deterministic_hash(m) for m in metadata_list])) != 1: + raise ValueError( + f"The metadata are not the same for all the {len(output_filename_list)} output!" + ) + metadata = metadata_list[0] + if metadata["poi"] != self.poi: + raise ValueError( + f"The poi in the metadata {metadata['poi']} is not " + f"the same as the poi {self.poi}!" + ) + return metadata + + def _read_poi(self, metadata, **kwargs): + """Read poi and poi_expectation from the metadata, and check if the poi_expectation is + consistent with the poi_expectation from the model.""" + poi_expectation = kwargs.pop("poi_expectation", None) + poi_value = kwargs.get(self.poi, None) + if poi_value is None: + raise ValueError("Can not find the poi value in the generate_values in metadata!") + # read expectation_values from metadata + expectation_values = metadata["expectation_values"] + # check if the poi_expectation is in expectation_values + source = self.poi.replace("_rate_multiplier", "") + if source not in expectation_values: + warnings.warn( + f"poi {self.poi} does not corresponds to any source in the model!" + " so can not calculate the poi_expectation!" + ) + else: + _poi_expectation = expectation_values[source] + if poi_expectation is not None: + # check if the poi_expectation is consistent + if not np.isclose(poi_expectation, _poi_expectation): + raise ValueError( + f"The poi_expectation from model {poi_expectation} is not " + f"the same as the poi_expectation from toymc {_poi_expectation}!" + ) + else: + warnings.warn( + "Can not find the poi_expectation in the generate_values, " + "so will not check if the poi_expectation " + "are consistent!" 
+ ) + poi_expectation = _poi_expectation + return poi_value, poi_expectation + @staticmethod def build_interpolator( poi, diff --git a/alea/utils.py b/alea/utils.py index b04f159e..818286a5 100644 --- a/alea/utils.py +++ b/alea/utils.py @@ -13,6 +13,8 @@ from collections.abc import Mapping from typing import Any, List, Dict, Tuple, Optional, Union, cast, get_args, get_origin +import h5py + # These imports are needed to evaluate strings import numpy # noqa: F401 import numpy as np # noqa: F401 @@ -396,6 +398,22 @@ def search_filename_pattern(filename: str) -> str: return pattern +def get_metadata(output_filename_pattern: str) -> list: + """Get metadata from output files.""" + output_filename_list = sorted(glob(output_filename_pattern)) + metadata_list = [] + for _output_filename in output_filename_list: + with h5py.File(_output_filename, "r", libver="latest", swmr=True) as ipt: + metadata = dict( + zip( + ipt.attrs.keys(), + [json.loads(ipt.attrs[key]) for key in ipt.attrs.keys()], + ) + ) + metadata_list.append(metadata) + return metadata_list + + def can_expand_grid(variations: dict) -> bool: """Check if variations can be expanded into a grid.
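
Usage sketch (illustrative only, not part of the patch series): patch 6 moves
search_filename_pattern into alea/utils.py and patch 7 adds get_metadata there, so the
filename-pattern resolution and the metadata read-back (the attributes written by
Runner.write_output, extended in patch 3) can also be combined outside the
NeymanConstructor. The filename "toymc.h5" below is a hypothetical example.

    from glob import glob
    from alea.utils import search_filename_pattern, get_metadata

    # Resolve the pattern: "toymc.h5" becomes "toymc_*.h5" if batched output files
    # exist, otherwise the filename itself is kept; a ValueError is raised if
    # nothing matches at all.
    pattern = search_filename_pattern("toymc.h5")
    # One metadata dict per matched file, read from the HDF5 attributes stored by
    # the Runner, e.g. "poi" and "generate_values".
    metadata_list = get_metadata(pattern)
    for filename, metadata in zip(sorted(glob(pattern)), metadata_list):
        print(filename, metadata["poi"], metadata["generate_values"])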