Read poi and expectation directly from output_filename to accelerate NeymanConstructor #108

Merged: 8 commits, Nov 13, 2023
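The gist of the change, as a hedged sketch rather than the exact implementation: instead of re-instantiating a Runner (and with it the full statistical model) just to recompute expectations, NeymanConstructor now reads the poi and its expectation straight from the metadata that `write_output` stores in the toy-MC output file. The file name below is hypothetical; the attribute keys mirror the ones this diff writes.

```python
import json
import h5py

# Sketch: read poi and expectation values straight from toy-MC output metadata,
# avoiding a slow model re-instantiation. "toymc_output.h5" is illustrative.
with h5py.File("toymc_output.h5", "r") as ipt:
    metadata = {key: json.loads(ipt.attrs[key]) for key in ipt.attrs.keys()}

poi = metadata["poi"]                              # e.g. "wimp_rate_multiplier"
expectation_values = metadata["expectation_values"]
source = poi.replace("_rate_multiplier", "")       # same convention as in the diff
poi_expectation = expectation_values.get(source)
```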
alea/model.py (1 addition, 1 deletion)
@@ -255,7 +255,7 @@ def get_expectation_values(self, **parameter_values):
             parameter_values: values of the parameters

         """
-        return NotImplementedError("get_expectation_values is optional to implement")
+        raise NotImplementedError("get_expectation_values is optional to implement")

     @property
     def nominal_expectation_values(self):
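For context on why this one-word change matters: the runner's new `write_output` (below) relies on catching `NotImplementedError`, and a returned exception instance is never caught. A minimal illustration:

```python
def broken():
    # Returns an exception *instance*; nothing is raised, so `except` never fires.
    return NotImplementedError("get_expectation_values is optional to implement")

def fixed():
    raise NotImplementedError("get_expectation_values is optional to implement")

try:
    result = broken()          # no exception is raised here
    print(type(result))        # <class 'NotImplementedError'>
    fixed()                    # this call actually raises
except NotImplementedError:
    print("caught")            # reached only via fixed()
```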
alea/models/blueice_extended_model.py (23 additions, 2 deletions)
@@ -2,6 +2,7 @@
 from typing import List, Dict, Callable, Optional, Union, cast
 from copy import deepcopy
 from pydoc import locate
+import itertools

 import numpy as np
 import scipy.stats as stats
@@ -138,11 +139,31 @@ def get_source_name_list(self, likelihood_name: str) -> list:
         ll_index = self.likelihood_names.index(likelihood_name)
         return self.likelihood_list[ll_index].source_name_list

+    def get_all_source_name(self) -> set:
+        """Return a set of possible source names from all likelihood terms.
+
+        Returns:
+            set: set of source names.
+
+        """
+        source_name = set(
+            itertools.chain.from_iterable([ll.source_name_list for ll in self.likelihood_list[:-1]])
+        )
+        return source_name
+
     @property
     def likelihood_list(self) -> List:
         """Return a list of likelihood terms."""
         return self._likelihood.likelihood_list

+    @property
+    def likelihood_parameters(self) -> List:
+        """Return a list of likelihood parameters."""
+        return self._likelihood.likelihood_parameters
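A quick illustration of what `get_all_source_name` collects, using hypothetical stand-ins for the blueice likelihood terms (the trailing ancillary term is excluded by the `[:-1]` slice):

```python
import itertools

class FakeTerm:
    """Hypothetical stand-in exposing only source_name_list."""
    def __init__(self, names):
        self.source_name_list = names

likelihood_list = [
    FakeTerm(["wimp", "er"]),    # first science-data term
    FakeTerm(["wimp", "cnns"]),  # second term, overlapping sources
    FakeTerm([]),                # ancillary term, skipped by [:-1]
]
source_name = set(
    itertools.chain.from_iterable([ll.source_name_list for ll in likelihood_list[:-1]])
)
print(source_name)  # {'wimp', 'er', 'cnns'} in some order; duplicates collapse
```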

def get_expectation_values(self, per_likelihood_term=False, **kwargs) -> dict:
"""Return total expectation values (summed over all likelihood terms with the same name)
given a number of named parameters (kwargs)
@@ -179,9 +200,9 @@ def get_expectation_values(self, per_likelihood_term=False, **kwargs) -> dict:

         # ancillary likelihood does not contribute
         for ll_term, ll_name, parameter_names, livetime_parameter in zip(
-            self_copy._likelihood.likelihood_list[:-1],
+            self_copy.likelihood_list[:-1],
             self_copy.likelihood_names[:-1],
-            self_copy._likelihood.likelihood_parameters,
+            self_copy.likelihood_parameters,
             self_copy.livetime_parameter_names,
         ):
             ret[ll_name] = {}
alea/runner.py (8 additions, 0 deletions)
@@ -119,6 +119,7 @@ def __init__(
             statistical_model_args = {}
         # nominal_values is a keyword argument
         self.nominal_values = nominal_values if nominal_values else {}
+        # initialize nominal_values only once
         statistical_model_args["nominal_values"] = self.nominal_values
         # likelihood_config is a keyword argument, because not all statistical models need it
         statistical_model_args["likelihood_config"] = likelihood_config
@@ -338,6 +339,13 @@ def write_output(self, results):
metadata["poi"] = self.poi
metadata["common_hypothesis"] = self.common_hypothesis
metadata["generate_values"] = self.generate_values
metadata["nominal_values"] = self.nominal_values
try:
metadata["expectation_values"] = self.model.get_expectation_values(
**self.generate_values
)
except NotImplementedError:
metadata["expectation_values"] = {}

array_metadatas = [{"hypotheses_values": ea} for ea in self._hypotheses_values]
numpy_arrays_and_names = [(r, rn) for r, rn in zip(results, result_names)]
alea/submitters/local.py (74 additions, 21 deletions)
@@ -5,10 +5,12 @@
 import subprocess
 import itertools
 import operator
+from glob import glob
 from functools import reduce
 from copy import deepcopy
 from typing import List, Dict, Any, Optional, Callable, cast

+import h5py
 import numpy as np
 from scipy.interpolate import interp1d, RegularGridInterpolator
 from inference_interface import toyfiles_to_numpy
@@ -156,19 +158,82 @@ def submit(
raise ValueError("confidence_levels" + message)

         # prepare the needed nominal_values and generate_values
-        nominal_values = runner_args["nominal_values"]
-        generate_values = runner_args["generate_values"]
+        nominal_values = deepcopy(runner_args["nominal_values"])
+        generate_values = deepcopy(runner_args["generate_values"])
         needed_kwargs = {
             **nominal_values,
             **generate_values,
         }
-        # read the likelihood ratio
-        output_filename = runner_args["output_filename"]
-        # add a * to the output_filename to read all the files
+        output_filename = runner_args["output_filename"].format(**needed_kwargs)
+        # try to add a * to the output_filename to read all the files
         fpat_split = os.path.splitext(output_filename)
-        output_filename = fpat_split[0] + "*" + fpat_split[1]
-        results = toyfiles_to_numpy(output_filename.format(**needed_kwargs))
+        _output_filename = fpat_split[0] + "_*" + fpat_split[1]
+        if len(sorted(glob(_output_filename))) != 0:
+            output_filename = _output_filename
+        output_filename_list = sorted(glob(output_filename))
+        if len(output_filename_list) == 0:
+            raise ValueError(f"Can not find any output file {output_filename}!")

+        # read metadata including generate_values
+        metadata_list = []
+        for _output_filename in output_filename_list:
+            with h5py.File(_output_filename, "r", libver="latest", swmr=True) as ipt:
+                metadata = dict(
+                    zip(
+                        ipt.attrs.keys(),
+                        [json.loads(ipt.attrs[key]) for key in ipt.attrs.keys()],
+                    )
+                )
+            metadata.pop("date", None)
+            metadata_list.append(metadata)
+        if len(set([deterministic_hash(m) for m in metadata_list])) != 1:
+            raise ValueError(
+                "The metadata are not the same for all "
+                f"the {len(output_filename_list)} outputs!"
+            )
+        metadata = metadata_list[0]
+        if metadata["poi"] != self.poi:
+            raise ValueError(
+                f"The poi in the metadata {metadata['poi']} is not "
+                f"the same as the poi {self.poi}!"
+            )
+
+        # read poi and poi_expectation
+        needed_kwargs = {**metadata["generate_values"], **needed_kwargs}
+        poi_expectation = needed_kwargs.pop("poi_expectation", None)
+        poi_value = needed_kwargs.get(self.poi, None)
+        if poi_value is None:
+            raise ValueError("Can not find the poi value in the generate_values in metadata!")
+        # read expectation_values from metadata
+        expectation_values = metadata["expectation_values"]
+        # check if the poi_expectation is in expectation_values
+        source = self.poi.replace("_rate_multiplier", "")
+        if source not in expectation_values:
+            warnings.warn(
+                f"poi {self.poi} does not correspond to any source in the model, "
+                "so the poi_expectation can not be calculated!"
+            )
+        else:
+            _poi_expectation = expectation_values[source]
+            if poi_expectation is not None:
+                # check if the poi_expectation is consistent
+                if not np.isclose(poi_expectation, _poi_expectation):
+                    raise ValueError(
+                        f"The poi_expectation from model {poi_expectation} is not "
+                        f"the same as the poi_expectation from toymc {_poi_expectation}!"
+                    )
+            else:
+                warnings.warn(
+                    "Can not find the poi_expectation in the generate_values "
+                    f"{generate_values}, so will not check if the poi_expectation "
+                    "is consistent!"
+                )
+            poi_expectation = _poi_expectation
+
+        # read the likelihood ratio
+        results = toyfiles_to_numpy(output_filename)
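The all-files consistency check above hashes each file's metadata; a stand-in built on hashlib shows the idea behind such a check (keys and values are hypothetical, and this is not the actual deterministic_hash implementation):

```python
import json
import hashlib

def metadata_hash(metadata: dict) -> str:
    # Hash a canonical JSON encoding so dicts with identical content compare equal.
    canonical = json.dumps(metadata, sort_keys=True)
    return hashlib.sha256(canonical.encode()).hexdigest()

metadata_list = [
    {"poi": "wimp_rate_multiplier", "generate_values": {"wimp_rate_multiplier": 1.0}},
    {"poi": "wimp_rate_multiplier", "generate_values": {"wimp_rate_multiplier": 1.0}},
]
if len({metadata_hash(m) for m in metadata_list}) != 1:
    raise ValueError("The metadata are not the same for all the outputs!")
```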
Collaborator:

Can we test some of this?

Collaborator:

In general, this method has become pretty lengthy. Do you think it could make sense to break out some functionality into smaller functions that you can then call in submit? This makes it easier to test and to read 😊

dachengx (Collaborator, Author), Nov 13, 2023:

The functions are split. I currently have no idea how to test the NeymanConstructor code, because we would need to save the output files or run the configs when testing. That can be time-consuming.

Collaborator:

You're right. Maybe we can write and load some trivial example or something? But it's not that straightforward, I agree.
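One way to realize the trivial example the reviewers discuss, sketched with h5py only (file name and metadata are hypothetical): write JSON-encoded attributes the way `write_output` does, then read them back the way `submit` now does.

```python
import json
import h5py

metadata_in = {
    "poi": "wimp_rate_multiplier",
    "generate_values": {"wimp_rate_multiplier": 1.0},
}

# Write: store each metadata entry as a JSON string in the HDF5 attributes.
with h5py.File("trivial_toymc.h5", "w") as opt:
    for key, value in metadata_in.items():
        opt.attrs[key] = json.dumps(value)

# Read: recover the dict the same way NeymanConstructor.submit does above.
with h5py.File("trivial_toymc.h5", "r") as ipt:
    metadata_out = {key: json.loads(ipt.attrs[key]) for key in ipt.attrs.keys()}

assert metadata_out == metadata_in
```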

llfree = results[free_name]["ll"]
lltrue = results[true_name]["ll"]
llrs = 2.0 * (llfree - lltrue)
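For reference, llrs here is the usual test statistic q = 2(ln L_free - ln L_true), which is non-negative whenever the free fit truly maximizes the likelihood, hence the warnings below about negative values. A sketch of how a threshold could then be extracted (the percentile form is an assumption, not this code's exact call):

```python
import numpy as np

rng = np.random.default_rng(0)
# Hypothetical toy outcomes: free-fit log-likelihood always >= fixed-poi one.
lltrue = rng.normal(size=2000)
llfree = lltrue + rng.chisquare(1, size=2000) / 2.0
llrs = 2.0 * (llfree - lltrue)

assert (llrs >= 0.0).all()  # negative entries would signal fit failures

# Threshold at a hypothetical 90% confidence level, Neyman-construction style.
q_llr = np.percentile(llrs, 100 * 0.9)
```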
@@ -179,25 +244,13 @@
f"{(llrs < 0.0).sum() / len(llrs):.2f}, "
"the median if negative log likelihood ratios "
f"is {np.median(llrs[llrs < 0.0]):.2e}, "
f"there might be a problem in your fitting.",
f"there might be a problem in your fitting."
)
         if len(llrs) < 1000:
             self.logging.warning(
-                "The number of toys is less than 1000, the threshold might not be accurate!",
+                "The number of toys is less than 1000, the threshold might not be accurate!"
             )

-        # update poi according to poi_expectation
-        runner_args["statistical_model_args"].pop("limit_threshold", None)
-        runner = Runner(**runner_args)
-        expectation_values = runner.model.get_expectation_values(
-            **{**nominal_values, **generate_values}
-        )
-        # in some rare cases the poi is not a rate multiplier
-        # then the poi_expectation is not in the nominal_expectation_values
-        component = self.poi.replace("_rate_multiplier", "")
-        poi_expectation = expectation_values.get(component, None)
-        poi_value = generate_values.pop(self.poi)

+        # make sure poi and poi_expectation are not in the hashed_keys
+        generate_values.pop(self.poi, None)
+        generate_values.pop("poi_expectation", None)
@@ -226,7 +279,7 @@
"threshold": [],
"poi_expectation": [],
}
threshold[threshold_key] = threshold_value
threshold[threshold_key] = deepcopy(threshold_value)
threshold[threshold_key][self.poi].append(poi_value)
threshold[threshold_key]["threshold"].append(q_llr)
threshold[threshold_key]["poi_expectation"].append(poi_expectation)
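Downstream, these per-hypothesis lists feed the limit-threshold interpolation; a hedged sketch of that consumption (interp1d is imported above, but the exact call site lies outside this diff, and the numbers are hypothetical):

```python
import numpy as np
from scipy.interpolate import interp1d

# Hypothetical accumulated threshold entries for one hypothesis key.
threshold_entry = {
    "wimp_rate_multiplier": [0.5, 1.0, 2.0],   # poi values scanned
    "threshold": [2.6, 2.7, 2.9],              # q_llr threshold per poi value
    "poi_expectation": [5.0, 10.0, 20.0],      # expected counts per poi value
}

# Interpolate the threshold as a function of the poi, as a Neyman construction needs.
threshold_func = interp1d(
    threshold_entry["wimp_rate_multiplier"],
    threshold_entry["threshold"],
    bounds_error=False,
    fill_value="extrapolate",
)
print(float(threshold_func(1.5)))  # threshold at an unscanned poi value
```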