Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

First runner manipulating statistical model #50

Merged
merged 27 commits into from
Aug 4, 2023
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
232c6de
Recover runner
dachengx Jul 21, 2023
628d452
Improve code style
dachengx Jul 21, 2023
cc0d06e
Minor change
dachengx Jul 21, 2023
f869822
Merge remote-tracking branch 'origin/main' into first_runner
dachengx Jul 22, 2023
0e334dd
Remove duplicated codes
dachengx Jul 22, 2023
f4c063e
Merge remote-tracking branch 'origin/main' into first_runner
dachengx Jul 28, 2023
4fb9113
Simplify toydata mode
dachengx Jul 31, 2023
d31d4bc
Add test_runner.py
dachengx Jul 31, 2023
80e0dd6
Happier code style
dachengx Jul 31, 2023
2c624dc
Minor change
dachengx Jul 31, 2023
06ad8f3
Help model to save dict data and list data in a unified function
dachengx Jul 31, 2023
5104568
Happier code style
dachengx Jul 31, 2023
dec5ca0
Minor change at docstring
dachengx Aug 1, 2023
3823458
Minor change
dachengx Aug 1, 2023
c0e69ff
Confidence interval calculation
dachengx Aug 1, 2023
8f22c0c
Happier code style
dachengx Aug 1, 2023
a1b82fe
Merge remote-tracking branch 'origin/main' into first_runner
dachengx Aug 1, 2023
345f976
Merge branch 'main' into first_runner
dachengx Aug 1, 2023
9e5c79f
Set sigma as not fittable otherwise horrible things when CL calculati…
dachengx Aug 1, 2023
89c0bd3
Remove todo of runner because it is done
dachengx Aug 1, 2023
b710149
Make the placeholder already an OK gaussian model example
dachengx Aug 1, 2023
9215c9c
A few changes: add warnings and improve performance
dachengx Aug 2, 2023
9a577e3
Convert the data generation of runner into a generator
dachengx Aug 2, 2023
7958d71
More compact data generator
dachengx Aug 2, 2023
6bab21b
Also specify what if toydata_mode is no_toydata
dachengx Aug 3, 2023
3dbaa98
Warning when parameter_interval_bounds contains None
dachengx Aug 4, 2023
f35e0c7
Update docstring a bit
dachengx Aug 4, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions alea/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

from .models import *

from .runner import *

from .utils import *

from .parameters import *
Expand Down
11 changes: 6 additions & 5 deletions alea/examples/configs/unbinned_wimp_running.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
statistical_model: alea.models.BlueiceExtendedModel
statistical_model_config: unbinned_wimp_statistical_model.yaml

poi: wimp_rate_multiplier
Expand All @@ -13,7 +14,7 @@ computation:
parameters_in_common:
{
hypotheses: ["true", "null", "free"],
output_filename: "toymc_power_wimp_mass_{wimp_mass:d}_poi_expectation_{poi_expectation:.2f}.hdf5",
output_filename: "toymc_power_wimp_mass_{wimp_mass:d}_poi_expectation_{poi_expectation:.2f}.h5",
n_mc: 5000,
n_batch: 40,
}
Expand All @@ -25,11 +26,11 @@ computation:
parameters_in_common:
{
hypotheses: ["true", "null", "free"],
output_filename: "toymc_power_wimp_mass_{wimp_mass:d}_poi_expectation_{poi_expectation:.2f}.hdf5",
output_filename: "toymc_power_wimp_mass_{wimp_mass:d}_poi_expectation_{poi_expectation:.2f}.h5",
n_mc: 5000,
n_batch: 40,
}
limit_threshold: "thresholds.hdf5"
limit_threshold: "thresholds.h5"
toydata_mode: "generate"
parameters_as_wildcards: ["poi_expectation", "n_mc", "n_batch"]

Expand All @@ -39,11 +40,11 @@ computation:
parameters_in_common:
{
hypotheses: ["true", "null", "free"],
output_filename: "toymc_power_wimp_mass_{wimp_mass:d}_poi_expectation_{poi_expectation:.2f}.hdf5",
output_filename: "toymc_power_wimp_mass_{wimp_mass:d}_poi_expectation_{poi_expectation:.2f}.h5",
n_mc: 5000,
n_batch: 40,
compute_confidence_interval: True,
limit_threshold: "thresholds.hdf5",
limit_threshold: "thresholds.h5",
}
toydata_mode: "generate"

Expand Down
4 changes: 2 additions & 2 deletions alea/examples/gaussian_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ class GaussianModel(StatisticalModel):
def __init__(
self,
parameter_definition: Optional[dict or list] = None,
**kwargs,
*args, **kwargs,
):
"""
Initialise a model of a gaussian measurement (hatmu),
Expand All @@ -20,7 +20,7 @@ def __init__(
"""
if parameter_definition is None:
parameter_definition = ["mu", "sigma"]
super().__init__(parameter_definition=parameter_definition, **kwargs)
super().__init__(parameter_definition=parameter_definition, *args, **kwargs)

def _ll(self, mu=None, sigma=None):
hat_mu = self.data[0]['hat_mu'][0]
Expand Down
24 changes: 19 additions & 5 deletions alea/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ def __init__(
confidence_level: float = 0.9,
confidence_interval_kind: str = "central", # one of central, upper, lower
confidence_interval_threshold: Callable[[float], float] = None,
**kwargs,
):
"""Initialize a statistical model"""
if type(self) == StatisticalModel:
Expand Down Expand Up @@ -159,23 +160,36 @@ def data(self, data):
self.is_data_set = True

def store_data(
self, file_name, data_list, data_name_list=None, metadata = None):
self, file_name, data_list, data_name_list=None, metadata=None):
"""
Store a list of datasets (each on the form of a list of one or more structured arrays)
Store a list of datasets.
(each on the form of a list of one or more structured arrays or dicts)
Using inference_interface, but included here to allow over-writing.
structure would be: [[datasets1], [datasets2], ..., [datasetsn]]
where each of datasets is a list of structured arrays
if you specify, it is set, if not it will read from self.get_likelihood_term_names
if not defined, it will be ["0", "1", ..., "n-1"]
"""
if all([isinstance(d, dict) for d in data_list]):
_data_list = [list(d.values()) for d in data_list]
elif all([isinstance(d, list) for d in data_list]):
_data_list = data_list
else:
raise ValueError(
'Unsupported mixed toydata format! '
'toydata should be a list of dict or a list of list',)

if data_name_list is None:
if hasattr(self, "likelihood_names"):
data_name_list = self.likelihood_names
else:
data_name_list = ["{:d}".format(i) for i in range(len(data_list[0]))]
data_name_list = ["{:d}".format(i) for i in range(len(_data_list[0]))]

kw = {'metadata': metadata} if metadata is not None else dict()
toydata_to_file(file_name, data_list, data_name_list, **kw)
if len(_data_list[0]) != len(data_name_list):
raise ValueError(
"The number of data sets and data names must be the same")
toydata_to_file(file_name, _data_list, data_name_list, **kw)

def get_expectation_values(self, **parameter_values):
return NotImplementedError("get_expectation_values is optional to implement")
Expand All @@ -195,7 +209,7 @@ def get_likelihood_term_from_name(self, likelihood_name):
"""
if hasattr(self, "likelihood_names"):
likelihood_names = self.likelihood_names
return {n:i for i,n in enumerate(likelihood_names)}[likelihood_name]
return {n: i for i, n in enumerate(likelihood_names)}[likelihood_name]
else:
raise NotImplementedError("The attribute likelihood_names is not defined.")

Expand Down
39 changes: 32 additions & 7 deletions alea/models/blueice_extended_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,14 +26,17 @@ class BlueiceExtendedModel(StatisticalModel):
likelihood_config (dict): A dictionary defining the likelihood.
"""

def __init__(self, parameter_definition: dict, likelihood_config: dict):
def __init__(
self,
parameter_definition: dict, likelihood_config: dict,
*args, **kwargs):
"""Initializes the statistical model.

Args:
parameter_definition (dict): A dictionary defining the model parameters.
likelihood_config (dict): A dictionary defining the likelihood.
"""
super().__init__(parameter_definition=parameter_definition)
super().__init__(parameter_definition=parameter_definition, *args, **kwargs)
self._likelihood = self._build_ll_from_config(likelihood_config)
self.likelihood_names = [t["name"] for t in likelihood_config["likelihood_terms"]]
self.likelihood_names.append("ancillary_likelihood")
Expand Down Expand Up @@ -62,14 +65,23 @@ def data(self) -> dict:
return super().data

@data.setter
def data(self, data: dict):
def data(self, data: dict or list):
"""
Overrides default setter. Will also set the data of the blueice ll.
Data-sets are expected to be in the form of a list of one
or more structured arrays-- representing the data-sets of one or more likelihood terms.

Args:
data (dict or list): Data of the statistical model.
If data is a list, it must be a list of length len(self.likelihood_names) + 1.
"""
# iterate through all likelihood terms and set the science data in the blueice ll
# last entry in data are the generate_values
if isinstance(data, list):
if len(data) != len(self.likelihood_names) + 1:
raise ValueError(
f"Data must be a list of length {len(self.likelihood_names) + 1}")
data = dict(zip(self.likelihood_names + ["generate_values"], data))
for i, (dataset_name, d) in enumerate(data.items()):
if dataset_name != "generate_values":
ll_term = self._likelihood.likelihood_list[i]
Expand Down Expand Up @@ -138,7 +150,7 @@ def _build_ll_from_config(self, likelihood_config: dict) -> "LogLikelihoodSum":
for i, source in enumerate(config["sources"]):
parameters_to_ignore: List[str] = [
p.name for p in self.parameters if (
p.ptype == "shape") & (p.name not in source["parameters"])]
p.ptype == "shape") and (p.name not in source["parameters"])]
# no efficiency affects PDF:
parameters_to_ignore += [p.name for p in self.parameters if (p.ptype == "efficiency")]
parameters_to_ignore += source.get("extra_dont_hash_settings", [])
Expand All @@ -158,6 +170,7 @@ def _build_ll_from_config(self, likelihood_config: dict) -> "LogLikelihoodSum":
rate_parameter = rate_parameters[0]
if rate_parameter.endswith("_rate_multiplier"):
rate_parameter = rate_parameter.replace("_rate_multiplier", "")
# The ancillary term is handled in CustomAncillaryLikelihood
ll.add_rate_parameter(rate_parameter, log_prior=None)
else:
raise NotImplementedError(
Expand All @@ -175,6 +188,7 @@ def _build_ll_from_config(self, likelihood_config: dict) -> "LogLikelihoodSum":
anchors = self.parameters[p].blueice_anchors
if anchors is None:
raise ValueError(f"Shape parameter {p} does not have any anchors.")
# The ancillary term is handled in CustomAncillaryLikelihood
ll.add_shape_parameter(p, anchors=anchors, log_prior=None)

ll.prepare()
Expand Down Expand Up @@ -207,12 +221,23 @@ def _generate_data(self, **generate_values) -> dict:
data["generate_values"] = dict_to_structured_array(generate_values)
return data

def store_data(self, file_name, data_list, data_name_list=None, metadata=None):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I appreciate this here, but I feel like it is a bit troublesome-- we want to say you only need ll, generate_data, but here we're redefining other functionality also, when we could just set the data_name_list at init, I guess?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you want to change likelihood_names here to data_name_list:

alea/alea/model.py

Lines 214 to 218 in b710149

if data_name_list is None:
if hasattr(self, "likelihood_names"):
data_name_list = self.likelihood_names
else:
data_name_list = ["{:d}".format(i) for i in range(len(_data_list[0]))]
?

I think the logic is:

  1. ll and generate_data are mandatory for a new model, this is already demonstrated at GaussianModel
  2. BlueiceExtendedModel needs an overwritten store_data function because it has an advantage that generate_values is also in the self.data, so that the StatisticalModel.store_data can not be directly applied on BlueiceExtendedModel.
  3. Someone overwriting store_data, does not violate the truth that only ll and generate_data are mandatory. In principle, one can overwrite everything.

"""
Store data in a file.
Append the generate_values to the data_name_list.
"""
if data_name_list is None:
data_name_list = self.likelihood_names + ["generate_values"]
super().store_data(file_name, data_list, data_name_list, metadata)

def _generate_science_data(self, **generate_values) -> dict:
science_data = [gen.simulate(**generate_values)
for gen in self.data_generators]
return dict(zip(self.likelihood_names, science_data))
"""Generate the science data for all likelihood terms except the ancillary likelihood."""
science_data = [
gen.simulate(**generate_values) for gen in self.data_generators]
return dict(zip(self.likelihood_names[:-1], science_data))

def _generate_ancillary_measurements(self, **generate_values) -> dict:
"""Generate the ancillary measurements."""
ancillary_measurements = {}
anc_ll = self._likelihood.likelihood_list[-1]
ancillary_generators = anc_ll._get_constraint_functions(**generate_values)
Expand Down
14 changes: 9 additions & 5 deletions alea/parameters.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
from typing import Any, Dict, List, Optional, Tuple

# These imports are needed to evaluate the uncertainty string
import numpy
import scipy


class Parameter:
"""
Expand Down Expand Up @@ -40,12 +44,12 @@ def __init__(
self.nominal_value = nominal_value
self.fittable = fittable
self.ptype = ptype
self._uncertainty = uncertainty
self.uncertainty = uncertainty
self.relative_uncertainty = relative_uncertainty
self.blueice_anchors = blueice_anchors
self.fit_limits = fit_limits
self.parameter_interval_bounds = parameter_interval_bounds
self._fit_guess = fit_guess
self.fit_guess = fit_guess
self.description = description

def __repr__(self) -> str:
Expand All @@ -63,7 +67,7 @@ def uncertainty(self) -> float or Any:
If the uncertainty is a string, it can be evaluated as a numpy or scipy function.
"""
if isinstance(self._uncertainty, str):
# Evaluate the uncertainty if it's a string
# Evaluate the uncertainty if it's a string starting with "scipy." or "numpy."
if self._uncertainty.startswith("scipy.") or self._uncertainty.startswith("numpy."):
return eval(self._uncertainty)
else:
Expand Down Expand Up @@ -275,8 +279,8 @@ def __call__(
if any(i is None for k, i in values.items()):
emptypars = ", ".join([k for k, i in values.items() if i is None])
raise AssertionError(
"All parameters must be set explicitly, or have a nominal value,"
" encountered for: " + emptypars)
"All parameters must be set explicitly, or have a nominal value, "
"not satisfied for: " + emptypars)
return values

def __getattr__(self, name: str) -> Parameter:
Expand Down
Loading