First runner manipulating statistical model #50

Merged
merged 27 commits into main from first_runner
Aug 4, 2023
Changes from 7 commits
Commits
27 commits
232c6de
Recover runner
dachengx Jul 21, 2023
628d452
Improve code style
dachengx Jul 21, 2023
cc0d06e
Minor change
dachengx Jul 21, 2023
f869822
Merge remote-tracking branch 'origin/main' into first_runner
dachengx Jul 22, 2023
0e334dd
Remove duplicated codes
dachengx Jul 22, 2023
f4c063e
Merge remote-tracking branch 'origin/main' into first_runner
dachengx Jul 28, 2023
4fb9113
Simplify toydata mode
dachengx Jul 31, 2023
d31d4bc
Add test_runner.py
dachengx Jul 31, 2023
80e0dd6
Happier code style
dachengx Jul 31, 2023
2c624dc
Minor change
dachengx Jul 31, 2023
06ad8f3
Help model to save dict data and list data in a unified function
dachengx Jul 31, 2023
5104568
Happier code style
dachengx Jul 31, 2023
dec5ca0
Minor change at docstring
dachengx Aug 1, 2023
3823458
Minor change
dachengx Aug 1, 2023
c0e69ff
Confidential interval calculation
dachengx Aug 1, 2023
8f22c0c
Happier code style
dachengx Aug 1, 2023
a1b82fe
Merge remote-tracking branch 'origin/main' into first_runner
dachengx Aug 1, 2023
345f976
Merge branch 'main' into first_runner
dachengx Aug 1, 2023
9e5c79f
Set sigma as not fittable otherwise horrible things when CL calculati…
dachengx Aug 1, 2023
89c0bd3
Remove todo of runner because it is done
dachengx Aug 1, 2023
b710149
Make the placeholder is already an OK gaussian model example
dachengx Aug 1, 2023
9215c9c
Few change, add warnings and improve performance
dachengx Aug 2, 2023
9a577e3
Convert the data generation of runner into a generator
dachengx Aug 2, 2023
7958d71
More compact data generator
dachengx Aug 2, 2023
6bab21b
Also specify what if toydata_mode is no_toydata
dachengx Aug 3, 2023
3dbaa98
Warning when parameter_interval_bounds contains None
dachengx Aug 4, 2023
f35e0c7
Update docstring a bit
dachengx Aug 4, 2023
2 changes: 1 addition & 1 deletion alea/models/blueice_extended_model.py
@@ -138,7 +138,7 @@ def _build_ll_from_config(self, likelihood_config: dict) -> "LogLikelihoodSum":
for i, source in enumerate(config["sources"]):
parameters_to_ignore: List[str] = [
p.name for p in self.parameters if (
p.ptype == "shape") & (p.name not in source["parameters"])]
p.ptype == "shape") and (p.name not in source["parameters"])]
# no efficiency affects PDF:
parameters_to_ignore += [p.name for p in self.parameters if (p.ptype == "efficiency")]
parameters_to_ignore += source.get("extra_dont_hash_settings", [])
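The only change in this file swaps the bitwise "&" for the logical "and" in the shape-parameter filter. A minimal standalone sketch (not part of the PR) of why "and" is preferable for boolean conditions: it short-circuits and does not depend on operator precedence, whereas "&" only works here because both comparisons are already parenthesized.

# Standalone sketch, not part of the PR: both forms agree when the operands
# are parenthesized booleans, but "and" short-circuits and does not depend on
# operator precedence.
ptype = "shape"
name = "signal_rate"          # hypothetical parameter name
parameters = ["signal_rate"]  # hypothetical source parameters

bitwise = (ptype == "shape") & (name not in parameters)
logical = (ptype == "shape") and (name not in parameters)
assert bitwise == logical  # both are False here

# Without the parentheses, "&" would bind tighter than the comparisons
# ("shape" & name would be evaluated first), which is why "and" is the safer choice.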
227 changes: 227 additions & 0 deletions alea/runner.py
@@ -0,0 +1,227 @@
from copy import deepcopy
from typing import Callable, Optional, Union
from datetime import datetime
from pydoc import locate
import inspect

from tqdm import tqdm
import numpy as np

from inference_interface import toydata_from_file, numpy_to_toyfile
from alea.statistical_model import StatisticalModel


class Runner:
"""
Runner manipulates statistical model and toydata.
- initialize statistical model
- generate or read toy data
- save toy data if needed
- fit parameters
- write output file

One toyfile can contain multiple toydata, but all of them are generated from the same generate_values.

Attributes:
statistical_model: statistical model
poi: parameter of interest
hypotheses (list): list of hypotheses
common_hypothesis (dict): common hypothesis; its values are copied into each hypothesis
generate_values (dict): generate values for toydata
n_mc (int): number of Monte Carlo toys
toydata_file (str): toydata filename
toydata_mode (str): toydata mode, 'read', 'generate', 'generate_and_write', 'no_toydata'
metadata (dict): metadata, if None, it is set to {}
output_file (str): output filename
result_names (list): list of result names
result_dtype (list): list of result dtypes
hypotheses_values (list): list of values for hypotheses

Args:
statistical_model (str): statistical model class name
poi (str): parameter of interest
hypotheses (list): list of hypotheses
n_mc (int): number of Monte Carlo toys
statistical_model_args (dict, optional (default=None)): arguments for statistical model
parameter_definition (dict or list, optional (default=None)): parameter definition
confidence_level (float, optional (default=0.9)): confidence level
confidence_interval_kind (str, optional (default='central')):
kind of confidence interval, choice from 'central', 'upper' or 'lower'
confidence_interval_threshold (Callable[[float], float], optional (default=None)):
confidence interval threshold of likelihood ratio
common_hypothesis (dict, optional (default=None)): common hypothesis; its values are copied into each hypothesis
generate_values (dict, optional (default=None)):
values used to generate toydata; if None, the toydata depend on the statistical model
toydata_mode (str, optional (default='generate_and_write')):
toydata mode, choice from 'read', 'generate', 'generate_and_write', 'no_toydata'
toydata_file (str, optional (default=None)): toydata filename
metadata (dict, optional (default=None)): metadata
output_file (str, optional (default='test_toymc.hdf5')): output filename

Todo:
Implement confidence interval calculation
"""

def __init__(
self,
statistical_model: str,
poi: str,
hypotheses: list,
n_mc: int,
generate_values: dict = None,
statistical_model_args: dict = None,
parameter_definition: Optional[Union[dict, list]] = None,
confidence_level: float = 0.9,
confidence_interval_kind: str = 'central',
confidence_interval_threshold: Callable[[float], float] = None,
common_hypothesis: dict = None,
toydata_mode: str = 'generate_and_write',
toydata_file: str = None,
metadata: dict = None,
output_file: str = 'test_toymc.hdf5',
):
"""
Initialize the statistical model, the parameter list, and the generate values list
"""
statistical_model_class = locate(statistical_model)
if statistical_model_class is None:
raise ValueError(f'Could not find {statistical_model}!')
if not inspect.isclass(statistical_model_class):
raise ValueError(f'{statistical_model_class} is not a class!')
if not issubclass(statistical_model_class, StatisticalModel):
raise ValueError(f'{statistical_model_class} is not a subclass of StatisticalModel!')
self.statistical_model = statistical_model_class(
parameter_definition=parameter_definition,
confidence_level=confidence_level,
confidence_interval_kind=confidence_interval_kind,
confidence_interval_threshold=confidence_interval_threshold,
**(statistical_model_args if statistical_model_args else {}),
)

self.poi = poi
self.hypotheses = hypotheses if hypotheses else []
self.common_hypothesis = common_hypothesis
self.generate_values = generate_values
self.n_mc = n_mc
self.toydata_file = toydata_file
self.toydata_mode = toydata_mode
self.metadata = metadata if metadata else {}
self.output_file = output_file

self.result_names, self.result_dtype = self._get_parameter_list()

self.hypotheses_values = self._get_hypotheses()

def _get_parameter_list(self):
"""Get parameter list and result list from statistical model"""
parameter_list = sorted(self.statistical_model.get_parameter_list())
# add likelihood, lower limit, and upper limit
result_names = parameter_list + ['ll', 'dl', 'ul']
result_dtype = [(n, float) for n in parameter_list]
result_dtype += [(n, float) for n in ['ll', 'dl', 'ul']]
return result_names, result_dtype

def _get_hypotheses(self):
"""Get generate values list from hypotheses"""
hypotheses_values = []
hypotheses = deepcopy(self.hypotheses)

for hypothesis in hypotheses:
if hypothesis == 'null':
# there is no signal component
hypothesis = {self.poi: 0.}
elif hypothesis == 'true':
# the true signal component is used
if self.poi not in self.generate_values:
raise ValueError(
f'{self.poi} should be provided in generate_values',
)
hypothesis = {
self.poi: self.generate_values.get(self.poi),
}
elif hypothesis == 'free':
hypothesis = {}

array = deepcopy(self.common_hypothesis)
array.update(hypothesis)
hypotheses_values.append(array)
return hypotheses_values

def write_output(self, results):
"""Write output file with metadata"""
metadata = deepcopy(self.metadata)

result_names = [f'{i:d}' for i in range(len(self.hypotheses_values))]
for i, ea in enumerate(self.hypotheses):
if ea in ['null', 'free', 'true']:
result_names[i] = ea

metadata['date'] = datetime.now().strftime('%Y%m%d_%H:%M:%S')
metadata['poi'] = self.poi
metadata['common_hypothesis'] = self.common_hypothesis
metadata['generate_values'] = self.generate_values

array_metadatas = [{'hypotheses_values': ea} for ea in self.hypotheses_values]

numpy_arrays_and_names = [(r, rn) for r, rn in zip(results, result_names)]

print(f'Saving {self.output_file}')
numpy_to_toyfile(
self.output_file,
numpy_arrays_and_names=numpy_arrays_and_names,
metadata=metadata,
array_metadatas=array_metadatas)

def read_toydata(self):
"""Read toydata from file"""
toydata, toydata_names = toydata_from_file(self.toydata_file)
return toydata, toydata_names

def toy_simulation(self):
"""
Run the toy simulation with the specified toydata mode
and loop over the hypotheses values
"""
if self.toydata_mode not in {
'read', 'generate', 'generate_and_write', 'no_toydata',
}:
raise ValueError(f'Unknown toydata mode: {self.toydata_mode}')
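# 'generate' and 'generate_and_write' draw fresh toydata from the model for
# every toy below; 'read' takes the toydata from toydata_file; 'no_toydata'
# leaves the model's data untouched, so the fits run on whatever data the
# model already holds.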

toydata = []
toydata_names = None
if self.toydata_mode == 'read':
toydata, toydata_names = self.read_toydata()

results = [np.zeros(self.n_mc, dtype=self.result_dtype) for _ in self.hypotheses_values]
for i_mc in tqdm(range(self.n_mc)):
if self.toydata_mode == 'generate' or self.toydata_mode == 'generate_and_write':
self.statistical_model.data = self.statistical_model.generate_data(
**self.generate_values)
fit_results = []
for hypothesis_values in self.hypotheses_values:
if self.toydata_mode == 'read':
self.statistical_model.data = toydata[i_mc]

fit_result, max_llh = self.statistical_model.fit(**hypothesis_values)
fit_result['ll'] = max_llh
fit_result['dl'] = -1.
fit_result['ul'] = -1.

fit_results.append(fit_result)
# append toydata
toydata.append(self.statistical_model.data)
# assign fitting results
for fit_result, result_array in zip(fit_results, results):
result_array[i_mc] = tuple(fit_result[pn] for pn in self.result_names)

if self.toydata_mode == 'generate_and_write':
self.statistical_model.store_data(self.toydata_file, toydata, toydata_names)

return results

def run(self):
"""Run toy simulation"""
results = self.toy_simulation()

self.write_output(results)
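
For reference, a minimal usage sketch of the Runner as documented above. The statistical model path, parameter names, and values are placeholders (the commit history mentions a Gaussian model example, but the exact import path is an assumption), not a prescribed configuration:

from alea.runner import Runner

# Hypothetical model path and parameters; replace them with a real
# StatisticalModel subclass and the values it expects.
runner = Runner(
    statistical_model='alea.examples.gaussian_model.GaussianModel',
    poi='mu',
    hypotheses=['free', 'null', 'true'],
    n_mc=100,
    generate_values={'mu': 1., 'sigma': 1.},
    toydata_mode='generate_and_write',
    toydata_file='toydata.hdf5',
    output_file='test_toymc.hdf5',
)
runner.run()  # fits every toy under each hypothesis and writes the results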