diff --git a/docs/api.md b/docs/api.md index 219be15b0..9900cec54 100644 --- a/docs/api.md +++ b/docs/api.md @@ -51,6 +51,7 @@ .. autofunction:: scores.probability.murphy_thetas .. autofunction:: scores.probability.roc_curve_data .. autofunction:: scores.probability.brier_score +.. autofunction:: scores.probability.brier_score_for_ensemble .. autofunction:: scores.probability.isotonic_fit ``` diff --git a/docs/included.md b/docs/included.md index 9cdeabf0e..8c571e92d 100644 --- a/docs/included.md +++ b/docs/included.md @@ -168,6 +168,10 @@ - [API](api.md#scores.probability.brier_score) - [Tutorial](project:./tutorials/Brier_Score.md) - [Brier (1950)](https://doi.org/10.1175/1520-0493(1950)078<0001:VOFEIT>2.0.CO;2) +* - Brier Score for Ensembles + - [API](api.md#scores.probability.brier_score_for_ensemble) + - [Tutorial](project:./tutorials/Brier_Score.md) + - [Ferro (2013)](https://doi.org/10.1002/qj.2270) * - Continuous Ranked Probability Score (CRPS) for Cumulative Distribution Functions (CDFs) - - diff --git a/src/scores/probability/__init__.py b/src/scores/probability/__init__.py index c851b74f8..8e102e69b 100644 --- a/src/scores/probability/__init__.py +++ b/src/scores/probability/__init__.py @@ -3,7 +3,7 @@ """ from scores.continuous.murphy_impl import murphy_score, murphy_thetas -from scores.probability.brier_impl import brier_score +from scores.probability.brier_impl import brier_score, brier_score_for_ensemble from scores.probability.crps_impl import ( adjust_fcst_for_crps, crps_cdf, @@ -21,6 +21,7 @@ "murphy_score", "murphy_thetas", "brier_score", + "brier_score_for_ensemble", "adjust_fcst_for_crps", "crps_cdf", "crps_cdf_brier_decomposition", diff --git a/src/scores/probability/brier_impl.py b/src/scores/probability/brier_impl.py index dbc069c88..a0afc3ea5 100644 --- a/src/scores/probability/brier_impl.py +++ b/src/scores/probability/brier_impl.py @@ -2,13 +2,18 @@ This module contains methods related to the Brier score """ -from typing import Optional +import operator +from collections.abc import Sequence +from numbers import Real +from typing import Callable, Optional import xarray as xr from scores.continuous import mse +from scores.functions import apply_weights +from scores.processing import binary_discretise from scores.typing import FlexibleDimensionTypes, XarrayLike -from scores.utils import check_binary +from scores.utils import check_binary, gather_dimensions def brier_score( @@ -42,8 +47,8 @@ def brier_score( point (i.e. single-value comparison against observed), and the forecast and observed dimensions must match precisely. weights: Optionally provide an array for weighted averaging (e.g. by area, by latitude, - by population, custom) - check_args: will perform some tests on the data if set to True + by population, custom). + check_args: will perform some tests on the data if set to True. Raises: ValueError: if ``fcst`` contains non-nan values outside of the range [0, 1] ValueError: if ``obs`` contains non-nan values not in the set {0, 1} @@ -52,6 +57,9 @@ def brier_score( - Brier, G. W. (1950). Verification of forecasts expressed in terms of probability. Monthly Weather Review, 78(1), 1-3. https://doi.org/fp62r6 - https://en.wikipedia.org/wiki/Brier_score + + See Also: + - :py:func:`scores.probability.brier_score_for_ensemble` """ if check_args: error_msg = ValueError("`fcst` contains values outside of the range [0, 1]") @@ -65,3 +73,168 @@ def brier_score( check_binary(obs, "obs") return mse(fcst, obs, reduce_dims=reduce_dims, preserve_dims=preserve_dims, weights=weights) + + +def brier_score_for_ensemble( + fcst: XarrayLike, # type: ignore + obs: XarrayLike, # type: ignore + ensemble_member_dim: str, + event_thresholds: Real | Sequence[Real], + *, # Force keywords arguments to be keyword-only + reduce_dims: Optional[FlexibleDimensionTypes] = None, + preserve_dims: Optional[FlexibleDimensionTypes] = None, + weights: Optional[xr.DataArray] = None, + fair_correction: bool = True, + event_threshold_operator: Callable = operator.ge, + threshold_dim: str = "threshold", +) -> XarrayLike: # type: ignore + """ + Calculates the Brier score for an ensemble forecast for the provided thresholds. By default, + the fair Brier score is calculated, which is a modified version of the Brier score that + applies a correction related to the number of ensemble members and the number of + ensemble members that predict the event. + + Given an event defined by ``event_threshold``, the fair Brier score for ensembles is defined as + + + .. math:: + s_{i,y} = \\left(\\frac{i}{m} - y\\right)^2 - \\frac{i(m-i)}{m^2(m-1)} + + Where: + + - :math:`i` is the number of ensemble members that predict the event + - :math:`m` is the total number of ensemble members + - :math:`y` is the binary observation (0 if the event was not observed or 1 otherwise) + + When the fair correction is not applied (i.e., ``fair_correction=False``), + the Brier score is calculated as: + + + .. math:: + s_{i,y} = \\left(\\frac{i}{m} - y\\right)^2 + + The fair correction will only be applied at coordinates where the number of ensemble members + is greater than 1 and ``fair_correction=True``. + When the ``fair_correction`` is set to ``True`` and the number of ensemble members is 1, the + fair correction will not be applied to avoid division by zero. If you want to + set the minimum number of ensemble members required to calculate the Brier score, + we recommend preprocessing your data to remove data points with fewer than the + minimum number of ensemble members that you want. + + Args: + fcst: Real valued ensemble forecasts in xarray. The proportion of ensemble members + above a threshold is calculated within this function. + obs: Real valued observations in xarray. These values are converted to binary + values in this function based on the supplied event thresholds and operators. + ensemble_member_dim: The dimension name of the ensemble member. + event_thresholds: The threshold(s) that define what is considered an event. If + multiple thresholds are provided, they should be monotonically increasing with no NaNs. + reduce_dims: Optionally specify which dimensions to reduce when + calculating the Brier score. All other dimensions will be preserved. + preserve_dims: Optionally specify which dimensions to preserve when + calculating the score. All other dimensions will be reduced. As a + special case, 'all' will allow all dimensions to be preserved. In + this case, the result will be in the same shape/dimensionality + as the forecast, and the errors will be the absolute error at each + point (i.e. single-value comparison against observed), and the + forecast and observed dimensions must match precisely. + weights: Optionally provide an array for weighted averaging (e.g. by area, by latitude) + when calculating the mean score across specified dimensions via ``reduce_dims`` or ``preserve_dims``. + fair_correction: Whether or not to apply the fair Brier score correction. Default is True. + event_threshold_operator: The mode to use to define an event relative to a supplied threshold. + Default is ``operator.ge``, which means that an event occurs + if the observation is greater than or equal to the threshold. + Similarly, an event will have been considered to be forecast by an ensemble member + if the forecast is greater than or equal to the threshold. + Passing in ``operator.lt`` will give the same results as it is mathematically + equivalent in this case. Alternatively, you can provide + ``operator.gt`` which means that an event occurs if the observation + is strictly greater than the threshold. Again, an event for an ensemble member + will have been considered to be forecast if the forecast is greater than the threshold. + Using``operator.le`` will give the same results as ``operator.gt`` as it is mathematically + equivalent in this case. + threshold_dim: The name of the threshold dimension in the output. Default is 'threshold'. + It must not exist as a dimension in the forecast or observation data. + + + Returns: + The Brier score for the ensemble forecast. + + Raises: + ValueError: if values in ``event_thresholds`` are not monotonically increasing. + ValueError: if ``fcst`` contains the dimension provided in ``threshold_dim``. + ValueError: if ``obs`` contains the dimension provided in ``threshold_dim``. + ValueError: if ``weights`` contains the dimension provided in ``threshold_dim``. + ValueError: if ``ensemble_member_dim`` is not in ``fcst`` dimensions. + + References: + - Ferro, C. A. T. (2013). Fair scores for ensemble forecasts. Quarterly + Journal of the Royal Meteorological Society, 140(683), 1917–1923. + https://doi.org/10.1002/qj.2270 + + See Also: + - :py:func:`scores.probability.brier_score` + + Examples: + Calculate the Brier score for an ensemble forecast for a single threshold: + + >>> import numpy as np + >>> import xarray as xr + >>> from scores.probability import brier_score_for_ensemble + >>> fcst = 10 * xr.DataArray(np.random.rand(10, 10), dims=['time', 'ensemble']) + >>> obs = 10 * xr.DataArray(np.random.rand(10), dims=['time']) + >>> brier_score_for_ensemble(fcst, obs, ensemble_member_dim='ensemble', thresholds=0.5) + + Calculate the Brier score for an ensemble forecast for multiple thresholds: + >>> thresholds = [0.1, 5, 9] + >>> brier_score_for_ensemble(fcst, obs, ensemble_member_dim='ensemble', thresholds=thresholds) + + """ + if event_threshold_operator not in [operator.ge, operator.gt, operator.le, operator.lt]: + raise ValueError("event_threshold_operator must be one of operator.ge, operator.gt, operator.le, operator.lt") + if threshold_dim in fcst.dims: + raise ValueError("threshold_dim is not allowed to be the same as a dim in fcst") + if threshold_dim in obs.dims: + raise ValueError("threshold_dim is not allowed to be the same as a dim in obs") + + weights_dims = None + if weights is not None: + if threshold_dim in weights.dims: + raise ValueError("threshold_dim is not allowed to be the same as a dim in weights") + weights_dims = weights.dims + + dims_for_mean = gather_dimensions( + fcst.dims, + obs.dims, + weights_dims=weights_dims, + reduce_dims=reduce_dims, + preserve_dims=preserve_dims, + score_specific_fcst_dims=ensemble_member_dim, + ) + if isinstance(event_thresholds, (float, int)): + event_thresholds = [event_thresholds] + thresholds_xr = xr.DataArray(event_thresholds, dims=[threshold_dim], coords={threshold_dim: event_thresholds}) + + # calculate i term in equation + member_event_count = event_threshold_operator(fcst, thresholds_xr).sum(dim=ensemble_member_dim) + # calculate m term in equation + total_member_count = fcst.notnull().sum(dim=ensemble_member_dim) + + binary_obs = binary_discretise(obs, event_thresholds, event_threshold_operator) + + result = (member_event_count / total_member_count - binary_obs) ** 2 + if fair_correction: + fair_corr = (member_event_count * (total_member_count - member_event_count)) / ( + total_member_count**2 * (total_member_count - 1) + ) + # For coordinates where the ensemble member count is 1, the fair correction will + # have a NaN value. In this situation, we set the fair correction to 0 at these coordinates. + # This may be important if you have a lagged ensemble where at least one time step + # has only a single ensemble member + fair_correction = fair_corr.fillna(0) + result -= fair_correction + + # apply weights and take means across specified dims + result = apply_weights(result, weights=weights).mean(dim=dims_for_mean) + + return result diff --git a/tests/probabilty/brier_test_data.py b/tests/probabilty/brier_test_data.py new file mode 100644 index 000000000..ae1042bc0 --- /dev/null +++ b/tests/probabilty/brier_test_data.py @@ -0,0 +1,183 @@ +import numpy as np +import xarray as xr + +FCST1 = xr.DataArray( + [[[0.5, 0], [0, 0.5], [1, 0]], [[0.5, 0], [0.5, 0], [0, np.nan]]], + dims=["a", "b", "c"], + coords={"a": [0, 1], "b": [0, 1, 2], "c": [0, 1]}, +) +OBS1 = xr.DataArray( + [[[1, 0], [0, 0], [np.nan, 0]], [[0, 0], [1, 0], [0, 1]]], + dims=["a", "b", "c"], + coords={"a": [0, 1], "b": [0, 1, 2], "c": [0, 1]}, +) + +DA_FCST_ENS = xr.DataArray( + data=[[0.0, 4, 3, 7], [0, -1, 2, 4], [0, 1, 4, np.nan], [2, 3, 4, 1], [0, np.nan, np.nan, np.nan]], + dims=["stn", "ens_member"], + coords={"stn": [101, 102, 103, 104, 105], "ens_member": [1, 2, 3, 4]}, +) +DA_FCST_ENS_LT = xr.DataArray( + data=[ + [[0.0, 4, 3, 7], [0, -1, 2, 4], [0, 1, 4, np.nan], [2, 3, 4, 1], [0, np.nan, np.nan, np.nan]], + [[0.0, 0, 0, 0], [0, -1, 2, 4], [np.nan, np.nan, 4, np.nan], [2, 3, 4, 1], [0, np.nan, np.nan, np.nan]], + ], + dims=[ + "lead_time", + "stn", + "ens_member", + ], + coords={"stn": [101, 102, 103, 104, 105], "ens_member": [1, 2, 3, 4], "lead_time": [1, 2]}, +) +DA_OBS_ENS = xr.DataArray(data=[0, 3, 1, np.nan, 4, 5], dims=["stn"], coords={"stn": [101, 102, 103, 104, 105, 106]}) + + +EXP_BRIER_ENS_ALL = xr.DataArray( + data=[[(3 / 4) ** 2, (2 / 4 - 1) ** 2, (2 / 3 - 1) ** 2, np.nan, 1]], + dims=["threshold", "stn"], + coords={"stn": [101, 102, 103, 104, 105], "threshold": [1]}, +).T + +EXP_BRIER_ENS_ALL_GREATER = xr.DataArray( + data=[[(3 / 4) ** 2, (2 / 4 - 1) ** 2, (1 / 3) ** 2, np.nan, 1]], + dims=["threshold", "stn"], + coords={"stn": [101, 102, 103, 104, 105], "threshold": [1]}, +).T + +EXP_BRIER_ENS_ALL_LT = xr.DataArray( + data=[ + [[(3 / 4) ** 2, (2 / 4 - 1) ** 2, (2 / 3 - 1) ** 2, np.nan, 1]], + [[0, (2 / 4 - 1) ** 2, 0, np.nan, 1]], + ], + dims=["lead_time", "threshold", "stn"], + coords={"stn": [101, 102, 103, 104, 105], "threshold": [1], "lead_time": [1, 2]}, +) +I1 = 3 +M1 = 4 +I2 = 2 +M2 = 4 +I3 = 2 +M3 = 3 +FAIR_CORR_ALL = xr.DataArray( + data=[ + [ + I1 * (M1 - I1) / (M1**2 * (M1 - 1)), + I2 * (M2 - I2) / (M2**2 * (M2 - 1)), + I3 * (M3 - I3) / (M3**2 * (M3 - 1)), + np.nan, + 0, + ] + ], + dims=["threshold", "stn"], + coords={"stn": [101, 102, 103, 104, 105], "threshold": [1]}, +) + +EXP_BRIER_ENS_FAIR_ALL = EXP_BRIER_ENS_ALL - FAIR_CORR_ALL +EXP_BRIER_ENS_FAIR_ALL_MEAN = EXP_BRIER_ENS_FAIR_ALL.mean("stn") + +EXP_BRIER_ENS_ALL_MULTI = xr.DataArray( + data=[ + [0, 0, 0, np.nan, 0], + [(3 / 4) ** 2, (2 / 4 - 1) ** 2, (2 / 3 - 1) ** 2, np.nan, 1], + [0, 0, 0, np.nan, 0], + ], + dims=["threshold", "stn"], + coords={"stn": [101, 102, 103, 104, 105], "threshold": [-100, 1, 100]}, +).T + +I12 = 0 +FAIR_CORR_ALL_LT = xr.DataArray( + data=[ + [ + [ + I1 * (M1 - I1) / (M1**2 * (M1 - 1)), + I2 * (M2 - I2) / (M2**2 * (M2 - 1)), + I3 * (M3 - I3) / (M3**2 * (M3 - 1)), + np.nan, + 0, + ], + [ + I12 * (M1 - I12) / (M1**2 * (M1 - 1)), + I2 * (M2 - I2) / (M2**2 * (M2 - 1)), + 0, + np.nan, + 0, + ], + ] + ], + dims=["threshold", "lead_time", "stn"], + coords={"stn": [101, 102, 103, 104, 105], "threshold": [1], "lead_time": [1, 2]}, +) +EXP_BRIER_ENS_FAIR_ALL_LT = EXP_BRIER_ENS_ALL_LT - FAIR_CORR_ALL_LT +EXP_BRIER_ENS_FAIR_ALL_LT = EXP_BRIER_ENS_FAIR_ALL_LT.transpose("lead_time", "stn", "threshold") +ENS_BRIER_WEIGHTS = xr.DataArray( + [2, 1, np.nan, np.nan, np.nan], dims=["stn"], coords={"stn": [101, 102, 103, 104, 105]} +) +EXP_BRIER_ENS_WITH_WEIGHTS = xr.DataArray( + data=[(2 * (3 / 4) ** 2 + (2 / 4 - 1) ** 2) / 2], + dims=["threshold"], + coords={"threshold": [1]}, +) + +FCST_ENS_DS = xr.Dataset( + { + "a": xr.DataArray( + [[0.0, 4, 3, 7], [0, -1, 2, 4], [0, 1, 4, np.nan], [2, 3, 4, 1], [0, np.nan, np.nan, np.nan]], + dims=["stn", "ens_member"], + coords={"stn": [101, 102, 103, 104, 105], "ens_member": [1, 2, 3, 4]}, + ), + "b": xr.DataArray( + [[0.0, 0, 0, 0], [0, -1, 2, 4], [np.nan, np.nan, 4, np.nan], [2, 3, 4, 1], [0, np.nan, np.nan, np.nan]], + dims=["stn", "ens_member"], + coords={"stn": [101, 102, 103, 104, 105], "ens_member": [1, 2, 3, 4]}, + ), + }, +) +OBS_ENS_DS = xr.Dataset({"a": DA_OBS_ENS, "b": DA_OBS_ENS}) + + +EXP_BRIER_ENS_ALL_DS = xr.Dataset( + { + "a": xr.DataArray( + [[(3 / 4) ** 2, (2 / 4 - 1) ** 2, (2 / 3 - 1) ** 2, np.nan, 1]], + dims=["threshold", "stn"], + coords={"stn": [101, 102, 103, 104, 105], "threshold": [1]}, + ).T, + "b": xr.DataArray( + [[0, (2 / 4 - 1) ** 2, 0, np.nan, 1]], + dims=["threshold", "stn"], + coords={"stn": [101, 102, 103, 104, 105], "threshold": [1]}, + ).T, + }, +) +FAIR_CORR_ALL_DS = xr.Dataset( + { + "a": xr.DataArray( + [ + [ + I1 * (M1 - I1) / (M1**2 * (M1 - 1)), + I2 * (M2 - I2) / (M2**2 * (M2 - 1)), + I3 * (M3 - I3) / (M3**2 * (M3 - 1)), + np.nan, + 0, + ] + ], + dims=["threshold", "stn"], + coords={"stn": [101, 102, 103, 104, 105], "threshold": [1]}, + ).T, + "b": xr.DataArray( + [ + [ + I12 * (M1 - I12) / (M1**2 * (M1 - 1)), + I2 * (M2 - I2) / (M2**2 * (M2 - 1)), + 0, + np.nan, + 0, + ], + ], + dims=["threshold", "stn"], + coords={"stn": [101, 102, 103, 104, 105], "threshold": [1]}, + ).T, + }, +) +EXP_BRIER_ENS_FAIR_ALL_DS = EXP_BRIER_ENS_ALL_DS - FAIR_CORR_ALL_DS diff --git a/tests/probabilty/test_brier.py b/tests/probabilty/test_brier.py index 21a5e36b6..5db64bda7 100644 --- a/tests/probabilty/test_brier.py +++ b/tests/probabilty/test_brier.py @@ -8,22 +8,14 @@ except: # noqa: E722 allow bare except here # pylint: disable=bare-except # pragma: no cover dask = "Unavailable" # type: ignore # pylint: disable=invalid-name # pragma: no cover +import operator + import numpy as np import pytest import xarray as xr -from scores.probability import brier_score - -FCST1 = xr.DataArray( - [[[0.5, 0], [0, 0.5], [1, 0]], [[0.5, 0], [0.5, 0], [0, np.nan]]], - dims=["a", "b", "c"], - coords={"a": [0, 1], "b": [0, 1, 2], "c": [0, 1]}, -) -OBS1 = xr.DataArray( - [[[1, 0], [0, 0], [np.nan, 0]], [[0, 0], [1, 0], [0, 1]]], - dims=["a", "b", "c"], - coords={"a": [0, 1], "b": [0, 1, 2], "c": [0, 1]}, -) +from scores.probability import brier_score, brier_score_for_ensemble +from tests.probabilty import brier_test_data as btd @pytest.mark.parametrize( @@ -31,32 +23,32 @@ [ # Reduce all dims ( - FCST1, - OBS1, + btd.FCST1, + btd.OBS1, None, None, xr.DataArray(0.1), ), # preserve dim "a" ( - FCST1, - OBS1, + btd.FCST1, + btd.OBS1, ["a"], None, xr.DataArray([0.1, 0.1], dims=["a"], coords={"a": [0, 1]}), ), # doesn't break with all NaNs ( - FCST1, - OBS1 * np.nan, + btd.FCST1, + btd.OBS1 * np.nan, ["a"], None, xr.DataArray([np.nan, np.nan], dims=["a"], coords={"a": [0, 1]}), ), # Check it works with DataSets ( - xr.Dataset({"1": FCST1, "2": 0.5 * FCST1}), - xr.Dataset({"1": OBS1, "2": OBS1}), + xr.Dataset({"1": btd.FCST1, "2": 0.5 * btd.FCST1}), + xr.Dataset({"1": btd.OBS1, "2": btd.OBS1}), ["a"], None, xr.Dataset( @@ -86,7 +78,7 @@ def test_brier_score_dask(): if dask == "Unavailable": # pragma: no cover pytest.skip("Dask unavailable, could not run test") # pragma: no cover - result = brier_score(FCST1.chunk(), OBS1.chunk()) + result = brier_score(btd.FCST1.chunk(), btd.OBS1.chunk()) assert isinstance(result.data, dask.array.Array) result = result.compute() assert isinstance(result.data, (np.ndarray, np.generic)) @@ -97,11 +89,11 @@ def test_brier_score_dask(): ("fcst", "obs", "error_msg_snippet"), [ # Fcst > 1 - (FCST1 + 0.0000001, OBS1, r"`fcst` contains values outside of the range \[0, 1\]"), + (btd.FCST1 + 0.0000001, btd.OBS1, r"`fcst` contains values outside of the range \[0, 1\]"), # Fcst < 0 - (FCST1 - 0.0000001, OBS1, r"`fcst` contains values outside of the range \[0, 1\]"), + (btd.FCST1 - 0.0000001, btd.OBS1, r"`fcst` contains values outside of the range \[0, 1\]"), # Obs = 1/2 - (FCST1, OBS1 / 2, "`obs` contains values that are not in the set {0, 1, np.nan}"), + (btd.FCST1, btd.OBS1 / 2, "`obs` contains values that are not in the set {0, 1, np.nan}"), ], ) def test_brier_score_raises(fcst, obs, error_msg_snippet): @@ -119,9 +111,9 @@ def test_brier_score_raises(fcst, obs, error_msg_snippet): ("fcst", "obs", "expected"), [ # FCST doubled - (FCST1 * 2, OBS1, xr.DataArray(0.2)), + (btd.FCST1 * 2, btd.OBS1, xr.DataArray(0.2)), # OBS halved - (FCST1, OBS1 / 2, xr.DataArray(0.05)), + (btd.FCST1, btd.OBS1 / 2, xr.DataArray(0.05)), ], ) def test_brier_doesnt_raise(fcst, obs, expected): @@ -134,3 +126,235 @@ def test_brier_doesnt_raise(fcst, obs, expected): # Check again but with input data as a DataSet result = brier_score(xr.Dataset({"x": fcst}), xr.Dataset({"x": obs}), check_args=False) xr.testing.assert_equal(result, xr.Dataset({"x": expected})) + + +@pytest.mark.parametrize( + ( + "fcst", + "obs", + "ensemble_member_dim", + "thresholds", + "preserve_dims", + "reduce_dims", + "weights", + "fair_correction", + "threshold_operator", + "expected", + ), + [ + # Fair=False, single threshold, preserve all, threshold is an int + ( + btd.DA_FCST_ENS, + btd.DA_OBS_ENS, + "ens_member", + 1, + "all", + None, + None, + False, + operator.ge, + btd.EXP_BRIER_ENS_ALL, + ), + # Fair=True, single threshold, preserve all, threshold is a float + ( + btd.DA_FCST_ENS, + btd.DA_OBS_ENS, + "ens_member", + 1.0, + "all", + None, + None, + True, + operator.ge, + btd.EXP_BRIER_ENS_FAIR_ALL, + ), + # Test reduce_dim arg + ( + btd.DA_FCST_ENS, + btd.DA_OBS_ENS, + "ens_member", + 1, + None, + "stn", + None, + True, + operator.ge, + btd.EXP_BRIER_ENS_FAIR_ALL_MEAN, + ), + # Fair=False, multiple_thresholds, preserve all + ( + btd.DA_FCST_ENS, + btd.DA_OBS_ENS, + "ens_member", + np.array([-100, 1, 100]), + "all", + None, + None, + False, + operator.ge, + btd.EXP_BRIER_ENS_ALL_MULTI, + ), + # Test with broadcast with a lead day dimension with Fair=True + ( + btd.DA_FCST_ENS_LT, + btd.DA_OBS_ENS, + "ens_member", + 1, + "all", + None, + None, + True, + operator.ge, + btd.EXP_BRIER_ENS_FAIR_ALL_LT, + ), + # Test with weights + ( + btd.DA_FCST_ENS, + btd.DA_OBS_ENS, + "ens_member", + 1, + None, + "stn", + btd.ENS_BRIER_WEIGHTS, + False, + operator.ge, + btd.EXP_BRIER_ENS_WITH_WEIGHTS, + ), + # Test with Datasets + ( + btd.FCST_ENS_DS, + btd.OBS_ENS_DS, + "ens_member", + 1, + "all", + None, + None, + True, + operator.ge, + btd.EXP_BRIER_ENS_FAIR_ALL_DS, + ), + # Check threshold_operator=operator.gt + ( + btd.DA_FCST_ENS, + btd.DA_OBS_ENS, + "ens_member", + 1, + "all", + None, + None, + False, + operator.gt, + btd.EXP_BRIER_ENS_ALL_GREATER, + ), + # Check threshold_operator=operator.le + ( + btd.DA_FCST_ENS, + btd.DA_OBS_ENS, + "ens_member", + 1, + "all", + None, + None, + False, + operator.le, + btd.EXP_BRIER_ENS_ALL_GREATER, + ), + # Check threshold_operator=operator.lt + ( + btd.DA_FCST_ENS, + btd.DA_OBS_ENS, + "ens_member", + 1.0, + "all", + None, + None, + True, + operator.lt, + btd.EXP_BRIER_ENS_FAIR_ALL, + ), + ], +) +def test_brier_score_for_ensemble( + fcst, + obs, + ensemble_member_dim, + thresholds, + preserve_dims, + reduce_dims, + weights, + fair_correction, + threshold_operator, + expected, +): + """Tests brier_score_for_ensemble.""" + result = brier_score_for_ensemble( + fcst, + obs, + ensemble_member_dim, + thresholds, + preserve_dims=preserve_dims, + reduce_dims=reduce_dims, + weights=weights, + fair_correction=fair_correction, + event_threshold_operator=threshold_operator, + ) + xr.testing.assert_allclose(result, expected) + + +def test_brier_score_for_ensemble_raises(): + """ + Tests that the brier_score_for_ensemble function raises the correct errors. + """ + fcst = xr.DataArray(np.random.rand(10, 10), dims=["time", "ensemble"]) + fcst_threshold = xr.DataArray(np.random.rand(10, 10), dims=["threshold", "ensemble"]) + obs = xr.DataArray(np.random.rand(10), dims=["time"]) + obs_threshold = xr.DataArray(np.random.rand(10), dims=["threshold"]) + thresholds = [0.1, 0.5, 0.9] + weights = xr.DataArray(np.random.rand(10), dims=["threshold"]) + + # Test if event_threshold_operator is not [operator.ge, operator.gt, operator.le, operator.lt"] + with pytest.raises( + ValueError, match="event_threshold_operator must be one of operator.ge, operator.gt, operator.le, operator.lt" + ): + brier_score_for_ensemble(fcst, obs, "ensemble", thresholds, event_threshold_operator="=") + + # Test if ensemble_member_dim is not in fcst.dims + with pytest.raises(ValueError, match="`score_specific_fcst_dims` must be a subset of `fcst` dimensions"): + brier_score_for_ensemble(fcst, obs, "number", thresholds) + + # Test if fcst contains the dimension 'threshold' + with pytest.raises(ValueError, match="threshold_dim is not allowed to be the same as a dim in fcst"): + brier_score_for_ensemble(fcst_threshold, obs, "time", thresholds) + + # Test if fcst contains the dimension specified by the threshold_dim argument + with pytest.raises(ValueError, match="threshold_dim is not allowed to be the same as a dim in fcst"): + brier_score_for_ensemble(fcst, obs, "time", thresholds, threshold_dim="time") + + # Test if obs contains the dimension 'threshold' + with pytest.raises(ValueError, match="threshold_dim is not allowed to be the same as a dim in obs"): + brier_score_for_ensemble(fcst, obs_threshold, "ensemble", thresholds) + + # Test if weights contains the dimension 'threshold' + with pytest.raises(ValueError, match="threshold_dim is not allowed to be the same as a dim in weights"): + brier_score_for_ensemble(fcst, obs, "ensemble", thresholds, weights=weights) + + +def test_brier_score_for_ensemble_dask(): + """Tests that the brier_score_for_ensemble works with dask""" + if dask == "Unavailable": # pragma: no cover + pytest.skip("Dask unavailable, could not run test") # pragma: no cover + + result = brier_score_for_ensemble( + btd.DA_FCST_ENS.chunk(), + btd.DA_OBS_ENS.chunk(), + "ens_member", + 1, + preserve_dims="all", + reduce_dims=None, + weights=None, + fair_correction=False, + ) + assert isinstance(result.data, dask.array.Array) + result = result.compute() + assert isinstance(result.data, (np.ndarray, np.generic)) + xr.testing.assert_equal(result, btd.EXP_BRIER_ENS_ALL) diff --git a/tutorials/Brier_Score.ipynb b/tutorials/Brier_Score.ipynb index 6dc26ac38..36dc3ec37 100644 --- a/tutorials/Brier_Score.ipynb +++ b/tutorials/Brier_Score.ipynb @@ -1,131 +1,2205 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Brier score\n", - "\n", - "The Brier score is the most commonly used verification metric for evaluating a probability of a binary outcome forecast, such as a \"chance of rainfall\" forecast.\n", - "\n", - "Probabilistic forecasts of binary events are expressed as values between 0 and 1, and observations are exactly 0 (event did not occur), or 1 (event occured).\n", - "\n", - "The metric is then calculated the same way as MSE. The Brier score is a [strictly proper scoring rule](https://sites.stat.washington.edu/people/raftery/Research/PDF/Gneiting2007jasa.pdf) where lower values are better (it is negatively oriented) where a perfect score is 0 and the worst score is 1.\n" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "from scores.probability import brier_score\n", - "from scipy.stats import beta, binom\n", - "\n", - "import numpy as np\n", - "import xarray as xr" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "# To learn more about the implemenation of the Brier score, uncomment the following\n", - "# help(brier_score)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We generate two synthetic forecasts. By design, `fcst1` is a good forecast, while `fcst2` is a poor forecast. We measure the difference in skill by calculating and comparing their Brier Scores." - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "fcst1 = beta.rvs(2, 1, size=1000)\n", - "obs = binom.rvs(1, fcst1)\n", - "fcst2 = beta.rvs(0.5, 1, size=1000)\n", - "fcst1 = xr.DataArray(data=fcst1, dims=\"time\", coords={\"time\": np.arange(0, 1000)})\n", - "fcst2 = xr.DataArray(data=fcst2, dims=\"time\", coords={\"time\": np.arange(0, 1000)})\n", - "obs = xr.DataArray(data=obs, dims=\"time\", coords={\"time\": np.arange(0, 1000)})" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Brier score for fcst1 = 0.16\n", - "Brier score for fcst2 = 0.43\n" - ] - } - ], - "source": [ - "brier_fcst1 = brier_score(fcst1, obs)\n", - "brier_fcst2 = brier_score(fcst2, obs)\n", - "\n", - "print(f\"Brier score for fcst1 = {brier_fcst1.item():.2f}\")\n", - "print(f\"Brier score for fcst2 = {brier_fcst2.item():.2f}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As expected, fcst1 has the lower Brier Score quantifying the degree to which it is better than fcst2." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Notes\n", - "- If you are using the Brier score on large data with Dask, consider setting `check_args` arg to `False` in `brier_score`. \n", - "- In the future, the Brier score components calculation will be added.\n", - "- You may be interested in working through the Murphy Diagram tutorial which allows you to break down the performance of the Brier score based on each threshold probability.\n", - "\n", - "**Reference**: Brier, G. W. (1950). Verification of forecasts expressed in terms of probability. Monthly weather review, 78(1), 1-3. [https://doi.org/fp62r6](https://doi.org/fp62r6)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.13.0" - } - }, - "nbformat": 4, - "nbformat_minor": 4 + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Brier Score\n", + "\n", + "The Brier score is the most commonly used verification metric for evaluating a probability of a binary outcome forecast, such as a \"chance of rainfall\" forecast.\n", + "\n", + "Probabilistic forecasts can be expressed as values between 0 and 1, or they could be expressed as an ensemble. We can use `scores` to evaluate both kinds of forecasts.\n", + "\n", + "## Using the Brier score to evaluate probabilistic forecasts (expressed as values between 0 and 1) of binary events\n", + "\n", + "Probabilistic forecasts of binary events can be expressed with values between 0 and 1, and observations are exactly 0 (event did not occur), or 1 (event occurred).\n", + "\n", + "\n", + "The metric is then calculated the same way as MSE and is defined as\n", + "\n", + "$$ s(x,y) = (x - y)^2$$\n", + "\n", + "Where\n", + "\n", + "- $x$ is the forecast between 0 and 1, and \n", + "- $y$ is the observation which is either 0 or 1.\n", + "\n", + "The Brier score is a [strictly proper scoring rule](https://sites.stat.washington.edu/people/raftery/Research/PDF/Gneiting2007jasa.pdf) where lower values are better (it is negatively oriented), where a perfect score is 0 and the worst score is 1.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "from scores.probability import brier_score\n", + "from scipy.stats import beta, binom\n", + "\n", + "import numpy as np\n", + "import xarray as xr\n", + "\n", + "np.random.seed(100)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "# To learn more about the implementation of the Brier score, uncomment the following\n", + "# help(brier_score)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We generate two synthetic forecasts. By design, `fcst1` is a good forecast, while `fcst2` is a poor forecast. We measure the difference in skill by calculating and comparing their Brier scores." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "fcst1 = beta.rvs(2, 1, size=1000)\n", + "obs = binom.rvs(1, fcst1)\n", + "fcst2 = beta.rvs(0.5, 1, size=1000)\n", + "fcst1 = xr.DataArray(data=fcst1, dims=\"time\", coords={\"time\": np.arange(0, 1000)})\n", + "fcst2 = xr.DataArray(data=fcst2, dims=\"time\", coords={\"time\": np.arange(0, 1000)})\n", + "obs = xr.DataArray(data=obs, dims=\"time\", coords={\"time\": np.arange(0, 1000)})" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Brier score for fcst1 = 0.17\n", + "Brier score for fcst2 = 0.43\n" + ] + } + ], + "source": [ + "brier_fcst1 = brier_score(fcst1, obs)\n", + "brier_fcst2 = brier_score(fcst2, obs)\n", + "\n", + "print(f\"Brier score for fcst1 = {brier_fcst1.item():.2f}\")\n", + "print(f\"Brier score for fcst2 = {brier_fcst2.item():.2f}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As expected, `fcst1` has the lower Brier score quantifying the degree to which it is better than `fcst2`." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Notes\n", + "- If you are using the Brier score on large data with Dask, consider setting `check_args` arg to `False` in `brier_score`. \n", + "- In the future, the Brier score components calculation will be added.\n", + "- You may be interested in working through the Murphy Diagram tutorial which allows you to break down the performance of the Brier score based on each threshold probability.\n", + "\n", + "## Using the Brier score to evaluate ensemble forecasts\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`scores` can also be used to evaluate ensembles using the Brier score. By default, it calculates the fair Brier score which is defined as\n", + "\n", + "$$s_{i,y} = \\left(\\frac{i}{m} - y\\right)^2 - \\frac{i(m-i)}{m^2(m-1)}$$\n", + "\n", + "Where:\n", + "\n", + "- $i$ is the number of ensemble members that predict the event\n", + "- $m$ is the total number of ensemble members\n", + "- $y$ is the observed event\n", + "\n", + "It is possible to calculate the Brier score without the fair correction (by setting ``fair_correction=False``). In this case, the Brier score is calculated as\n", + "\n", + "\n", + "$$s_{i,y} = \\left(\\frac{i}{m} - y\\right)^2$$\n", + "\n", + "The use of the Brier score without the fair adjustment may favour ensembles that have members that have the behaviour of being sampled from a different distribution than the observations. For more information, see [Ferro (2014)](https://doi.org/10.1002/qj.2270).\n", + "\n", + "Let's first do a synthetic experiment with 10,000 timesteps to demonstrate the impact of the fair correction using the `scores.probability.brier_score_for_ensemble` function. Suppose the observations are randomly drawn from the distribution $\\mathrm{Pr}(y=1) = 0.5$. Let's now create two ensemble forecasts. The first (we will call `fcst_a`) has 2 ensemble members where the value for each member is randomly drawn from the same distribution that the observations are drawn from and is an ideal forecast. The second ensemble (called `fcst_b`) has 20 ensemble members but is biased. The value for each member is randomly drawn from the distribution $\\mathrm{Pr}(y=1) = 0.4$.\n", + "\n", + "Let's now calculate the Brier score with and without the fair adjustment." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "from scores.probability import brier_score_for_ensemble\n", + "\n", + "# Uncomment the following line to learn more about the ensemble Brier score\n", + "# help(brier_score_for_ensemble)\n", + "\n", + "N = 10000\n", + "M1 = 2\n", + "M2 = 20\n", + "obs = np.random.choice([0, 1], size=N, p=[0.5, 0.5])\n", + "time = np.arange(N)\n", + "obs = xr.DataArray(obs, dims=[\"time\"], coords={\"time\": time})\n", + "\n", + "fcst_a = np.random.choice([0, 1], size=(M1, N), p=[0.5, 0.5])\n", + "ensemble = np.arange(M1)\n", + "fcst_a = xr.DataArray(fcst_a, dims=[\"ensemble\", \"time\"], coords={\"ensemble\": ensemble, \"time\": time})\n", + "\n", + "fcst_b = np.random.choice([0, 1], size=(M2, N), p=[0.6, 0.4])\n", + "ensemble = np.arange(M2)\n", + "fcst_b = xr.DataArray(fcst_b, dims=[\"ensemble\", \"time\"], coords={\"ensemble\": ensemble, \"time\": time})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First we calculate the fair Brier score for both ensembles. The `thresholds` arg defines the threshold that an event occurs. By default, events are inclusive of the exact value of the threshold.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
<xarray.DataArray (threshold: 1)>\n",
+                            "array([0.2532])\n",
+                            "Coordinates:\n",
+                            "  * threshold  (threshold) float64 0.5
" + ], + "text/plain": [ + "\n", + "array([0.2532])\n", + "Coordinates:\n", + " * threshold (threshold) float64 0.5" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "brier_score_for_ensemble(fcst_a, obs, event_thresholds=0.5, ensemble_member_dim=\"ensemble\")" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
<xarray.DataArray (threshold: 1)>\n",
+                            "array([0.26047579])\n",
+                            "Coordinates:\n",
+                            "  * threshold  (threshold) float64 0.5
" + ], + "text/plain": [ + "\n", + "array([0.26047579])\n", + "Coordinates:\n", + " * threshold (threshold) float64 0.5" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "brier_score_for_ensemble(fcst_b, obs, event_thresholds=0.5, ensemble_member_dim=\"ensemble\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As expected, `fcst_a` had a lower (better) Brier score than `fcst_b`. Now, let's calculate the Brier score without the fair adjustment." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
<xarray.DataArray (threshold: 1)>\n",
+                            "array([0.37765])\n",
+                            "Coordinates:\n",
+                            "  * threshold  (threshold) float64 0.5
" + ], + "text/plain": [ + "\n", + "array([0.37765])\n", + "Coordinates:\n", + " * threshold (threshold) float64 0.5" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "brier_score_for_ensemble(fcst_a, obs, event_thresholds=0.5, ensemble_member_dim=\"ensemble\", fair_correction=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
<xarray.DataArray (threshold: 1)>\n",
+                            "array([0.27247375])\n",
+                            "Coordinates:\n",
+                            "  * threshold  (threshold) float64 0.5
" + ], + "text/plain": [ + "\n", + "array([0.27247375])\n", + "Coordinates:\n", + " * threshold (threshold) float64 0.5" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "brier_score_for_ensemble(fcst_b, obs, event_thresholds=0.5, ensemble_member_dim=\"ensemble\", fair_correction=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can see that not using the fair correction with `brier_score_for_ensemble` ranked the biased `fcst_b` much better than `fcst_a`. This shows the importance of using fair scores with ensembles to avoid getting misleading results!\n", + "\n", + "**Note**, if an ensemble is passed into the function with only one ensemble member, the un-adjusted Brier score is returned in order to avoid a divide by zero error." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Calculating the Brier score for multiple thresholds with ensembles \n", + "You can also calculate the Brier score for multiple thresholds" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
<xarray.DataArray (threshold: 4)>\n",
+                            "array([0.    , 0.2532, 0.2532, 0.    ])\n",
+                            "Coordinates:\n",
+                            "  * threshold  (threshold) float64 -1.0 0.5 1.0 3.0
" + ], + "text/plain": [ + "\n", + "array([0. , 0.2532, 0.2532, 0. ])\n", + "Coordinates:\n", + " * threshold (threshold) float64 -1.0 0.5 1.0 3.0" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "brier_score_for_ensemble(fcst_a, obs, event_thresholds=[-1, 0.5, 1, 3], ensemble_member_dim=\"ensemble\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Things to try next\n", + "- You can pass in a list of thresholds into the `threshold` arg if you want to calculate the Brier score for multiple thresholds.\n", + "- You can set `event_threshold_mode=operator.gt` if you don't want the exact threshold to be included as an event.\n", + "\n", + "### References\n", + "\n", + "- Brier, G. W. (1950). Verification of forecasts expressed in terms of probability. Monthly Weather Review, 78(1), 1-3. [https://doi.org/fp62r6](https://doi.org/fp62r6)\n", + "- Ferro, C. A. T. (2013). Fair scores for ensemble forecasts. Quarterly Journal of the Royal Meteorological Society, 140(683), 1917–1923. https://doi.org/10.1002/qj.2270" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.10" + } + }, + "nbformat": 4, + "nbformat_minor": 4 }