Skip to content

Commit

Permalink
feat: add proportion exceeding wrapper for flip flop index
Browse files Browse the repository at this point in the history
  • Loading branch information
aidanjgriffiths committed Dec 5, 2023
1 parent ee69685 commit 3166f65
Show file tree
Hide file tree
Showing 5 changed files with 369 additions and 10 deletions.
5 changes: 4 additions & 1 deletion src/scores/continuous/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
"""
Import the functions from the implementations into the public API
"""
from scores.continuous.flip_flop_impl import flip_flop_index
from scores.continuous.flip_flop_impl import (
flip_flop_index,
flip_flop_index_proportion_exceeding,
)
from scores.continuous.isoreg import isotonic_fit
from scores.continuous.murphy_impl import murphy_score, murphy_thetas
from scores.continuous.quantile_loss_impl import quantile_score
Expand Down
220 changes: 213 additions & 7 deletions src/scores/continuous/flip_flop_impl.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import xarray as xr

from scores.functions import angular_difference
from scores.processing import binary_discretise
from scores.typing import XarrayLike
from scores.utils import DimensionError, check_dims, dims_complement

Expand All @@ -35,7 +36,7 @@ def _flip_flop_index(data: xr.DataArray, sampling_dim: str, is_angular: bool = F
# check that `sampling_dim` is in `data`.
check_dims(data, [sampling_dim], mode="superset")

# the maximum possible number of discrete flipflops
# the maximum possible number of discrete flip_flops
sequence_length = len(data[sampling_dim])
max_possible_flip_flop_count = sequence_length - 2

Expand All @@ -50,21 +51,21 @@ def _flip_flop_index(data: xr.DataArray, sampling_dim: str, is_angular: bool = F
# maximum possible angular difference between two forecasts
enc_size = encompassing_sector_size(data=data, dims=dims_to_preserve)
range_val = np.clip(enc_size, a_min=None, a_max=180.0)
flipflop = angular_difference(data.shift({sampling_dim: 1}), data)
flip_flop = angular_difference(data.shift({sampling_dim: 1}), data)
else:
max_val = data.max(dim=sampling_dim, skipna=False)
min_val = data.min(dim=sampling_dim, skipna=False)
range_val = max_val - min_val
# subtract each consecutive 'row' from eachother
flipflop = data.shift({sampling_dim: 1}) - data
flip_flop = data.shift({sampling_dim: 1}) - data

# take the absolute value and sum.
# I don't do skipna=False here because .shift makes a row of nan
flipflop = abs(flipflop).sum(dim=sampling_dim)
flip_flop = abs(flip_flop).sum(dim=sampling_dim)
# adjust based on the range. This is where nan will be introduced.
flipflop = flipflop - range_val
flip_flop = flip_flop - range_val
# normalise
return flipflop / max_possible_flip_flop_count
return flip_flop / max_possible_flip_flop_count


# If there are selections, a DataSet is always returned
Expand Down Expand Up @@ -98,7 +99,7 @@ def flip_flop_index(
direction).
**selections: Additional keyword arguments specify
subsets to draw from the dimension `sampling_dim` of the supplied `data`
before calculation of the flipflop index. e.g. days123=[1, 2, 3]
before calculation of the flip_flop index. e.g. days123=[1, 2, 3]
Returns:
If `selections` are not supplied: An xarray.DataArray, the Flip-flop
Expand Down Expand Up @@ -348,3 +349,208 @@ def _encompassing_sector_size_np(
n_unique_angles = (angular_diffs != 0).sum(axis=0)
result = np.where(n_unique_angles <= 2, np.max(angular_diffs, axis=0), result)
return result


def flip_flop_index_proportion_exceeding(
    data: xr.DataArray,
    sampling_dim: str,
    thresholds: Iterable,
    is_angular: bool = False,
    dims: Optional[Sequence[str]] = None,
    **selections: Iterable[int],
):
    """
    Calculates the Flip-Flop Index and returns the proportion exceeding
    (or equal to) each of the supplied `thresholds`.

    Args:
        data: Data from which to draw subsets.
        sampling_dim: The name of the dimension along which to calculate
            the Flip-Flop Index.
        thresholds: The proportion of Flip-Flop Index results
            equal to or exceeding these thresholds will be calculated.
        is_angular: specifies whether `data` is directional data (e.g. wind
            direction).
        dims: Strings corresponding to the dimensions in the input
            xarray data objects that we wish to preserve in the output. All other
            dimensions in the input data objects are collapsed.
        **selections: Additional keyword arguments specify
            subsets to draw from the dimension `sampling_dim` of the supplied `data`
            before calculation of the Flip-Flop Index. e.g. days123=[1, 2, 3]

    Returns:
        If `selections` are not supplied - An xarray.DataArray with dimensions
        `dims` + 'threshold'. The DataArray is the proportion of the Flip-Flop
        Index calculated by collapsing dimension `sampling_dim` exceeding or
        equal to `thresholds`.

        If `selections` are supplied - An xarray.Dataset with dimensions `dims`
        + 'threshold'. There is a data variable for each keyword in
        `selections`, and corresponds to the Flip-Flop Index proportion
        exceeding for the subset of data specified by the keyword values.

    Raises:
        DimensionError: if `sampling_dim` appears in `dims` (the dimension
            being collapsed cannot also be preserved).

    Examples:
        >>> data = xr.DataArray(
        ...     [[50, 20, 40, 80], [10, 50, 10, 100], [0, 30, 20, 50]],
        ...     dims=['station_number', 'lead_day'],
        ...     coords=[[10001, 10002, 10003], [1, 2, 3, 4]]
        ... )
        >>> flip_flop_index_proportion_exceeding(data, 'lead_day', [20])
        <xarray.DataArray (threshold: 1)>
        array([ 0.33333333])
        Coordinates:
          * threshold  (threshold) int64 20
        Attributes:
            sampling_dim: lead_day
        >>> flip_flop_index_proportion_exceeding(
        ...     data, 'lead_day', [20], days123=[1, 2, 3], all_days=[1, 2, 3, 4]
        ... )
        <xarray.Dataset>
        Dimensions:   (threshold: 1)
        Coordinates:
          * threshold  (threshold) int64 20
        Data variables:
            days123   (threshold) float64 0.6667
            all_days  (threshold) float64 0.3333
        Attributes:
            selections: {{'days123': [1, 2, 3], 'all_days': [1, 2, 3, 4]}}
            sampling_dim: lead_day

    See also:
        `scores.continuous.flip_flop_index`
    """
    if (dims is not None) and (sampling_dim in list(dims)):
        raise DimensionError(
            f"`sampling_dim`: '{sampling_dim}' must not be in dimensions to preserve " f"`dims`: {list(dims)}"
        )
    # calculate the flip-flop index
    flip_flop_data = flip_flop_index(data, sampling_dim, is_angular=is_angular, **selections)
    # calculate the proportion exceeding each threshold
    flip_flop_exceeding = proportion_exceeding(flip_flop_data, thresholds, dims=dims)
    # overwrite the attributes: restore the flip-flop metadata ('sampling_dim'
    # and, when supplied, 'selections') that proportion_exceeding discards
    flip_flop_exceeding.attrs = flip_flop_data.attrs

    return flip_flop_exceeding


def proportion_exceeding(data: XarrayLike, thresholds: Iterable, dims: Optional[Iterable] = None):
    """
    Calculates the proportion of `data` equal to or exceeding `thresholds`.

    Args:
        data (xarray.Dataset or xarray.DataArray): The data from which
            to calculate the proportion exceeding `thresholds`.
        thresholds (iterable): The proportion of values in `data` equal to
            or exceeding these thresholds will be calculated.
        dims (Optional[iterable]): Strings corresponding to the dimensions in the input
            xarray data objects that we wish to preserve in the output. All other
            dimensions in the input data objects are collapsed.

    Returns:
        An xarray data object with the type of `data` and dimensions
        `dims` + 'threshold'. The values are the proportion of `data`
        that are greater than or equal to the corresponding threshold.
    """
    # '>=' marks each value meeting a threshold as 1; the mean over the
    # collapsed dimensions is then exactly the required proportion.
    return _binary_discretise_proportion(data, thresholds, ">=", dims=dims)


def _binary_discretise_proportion(
    data: XarrayLike,
    thresholds: Iterable,
    mode: str,
    dims: Optional[Iterable] = None,
    abs_tolerance: Optional[float] = None,
    autosqueeze: Optional[bool] = False,
):
    """
    Returns the proportion of `data` in each category. The categories are
    defined by the relationship of data to threshold as specified by
    the operation `mode`.

    Args:
        data (xarray.Dataset or xarray.DataArray): The data to convert
            into 0 and 1 according the thresholds before calculating the
            proportion.
        thresholds (iterable): The proportion of values in `data` satisfying
            `mode` relative to these thresholds will be calculated.
        mode (str): Specifies the required relation of `data` to `thresholds`
            for a value to fall in the 'event' category (i.e. assigned to 1).
            Allowed modes are:

            - '>=' values in `data` greater than or equal to the
              corresponding threshold are assigned as 1.
            - '>' values in `data` greater than the corresponding threshold
              are assigned as 1.
            - '<=' values in `data` less than or equal to the corresponding
              threshold are assigned as 1.
            - '<' values in `data` less than the corresponding threshold
              are assigned as 1.
            - '==' values in `data` equal to the corresponding threshold
              are assigned as 1
            - '!=' values in `data` not equal to the corresponding threshold
              are assigned as 1.
        dims (Optional[iterable]): Strings corresponding to the dimensions in the input
            xarray data objects that we wish to preserve in the output. All other
            dimensions in the input data objects are collapsed.
            The dimension 'threshold' should not be supplied, it will automatically
            be preserved.
        abs_tolerance (Optional[float]): If supplied, values in data that are
            within abs_tolerance of a threshold are considered to be equal to
            that threshold. This is generally used to correct for floating
            point rounding, e.g. we may want to consider 1.0000000000000002 as
            equal to 1.
        autosqueeze (Optional[bool]): If True and only one threshold is
            supplied, then the dimension 'threshold' is squeezed out of the
            output. If `thresholds` is float-like, then this is forced to
            True, otherwise defaults to False.

    Returns:
        An xarray data object with the type of `data`, dimension `dims` +
        'threshold'. The values of the output are the proportion of `data` that
        satisfy the relationship to `thresholds` as specified by `mode`.

    Examples:
        >>> data = xr.DataArray([0, 0.5, 0.5, 1])
        >>> _binary_discretise_proportion(data, [0, 0.5, 1], '==')
        <xarray.DataArray (threshold: 3)>
        array([ 0.25,  0.5 ,  0.25])
        Coordinates:
          * threshold  (threshold) float64 0.0 0.5 1.0
        Attributes:
            discretisation_tolerance: 0
            discretisation_mode: ==
        >>> _binary_discretise_proportion(data, [0, 0.5, 1], '>=')
        <xarray.DataArray (threshold: 3)>
        array([ 1.  ,  0.75,  0.25])
        Coordinates:
          * threshold  (threshold) float64 0.0 0.5 1.0
        Attributes:
            discretisation_tolerance: 0
            discretisation_mode: >=

    See also:
        `scores.processing.binary_discretise`
    """
    # values are 1 when (data {mode} threshold), and 0 when ~(data {mode} threshold).
    discrete_data = binary_discretise(data, thresholds, mode, abs_tolerance=abs_tolerance, autosqueeze=autosqueeze)

    # the proportion in each category: collapse everything except `dims`
    # (and 'threshold', which binary_discretise introduces and preserves)
    dims_to_collapse = dims_complement(data, dims=dims)
    proportion = discrete_data.mean(dim=dims_to_collapse)

    # attach attributes recording the discretisation settings
    proportion.attrs = discrete_data.attrs

    return proportion
4 changes: 2 additions & 2 deletions src/scores/continuous/standard_impl.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,12 @@
This module contains standard methods which may be used for continuous scoring
"""

import xarray as xr

import scores.functions
import scores.utils
from scores.typing import FlexibleArrayType, FlexibleDimensionTypes

import xarray as xr


def mse(
fcst: FlexibleArrayType,
Expand Down
70 changes: 70 additions & 0 deletions tests/continuous/flip_flop_test_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -248,3 +248,73 @@
ENC_SIZE_3D_TEST_AXIS0 = ENC_SIZE_3D_TEST_AXIS2.swapaxes(0, 2)
ENC_SIZE_3D_ANSW_AXIS0_SKIPNA = ENC_SIZE_3D_ANSW_AXIS2_SKIPNA.swapaxes(0, 1)
ENC_SIZE_3D_ANSW_AXIS0_NOSKIPNA = ENC_SIZE_3D_ANSW_AXIS2_NOSKIPNA.swapaxes(0, 1)


"""
Test data for flipflipindex_proportion_exceeding
"""
EXP_FFI_PE_NONE = xr.Dataset(
{
"one": xr.DataArray([1, 2 / 3, 0], coords=[("threshold", [0, 1, 5])]),
"two": xr.DataArray([1, 0.5, 0.25], coords=[("threshold", [0, 1, 5])]),
},
attrs={"sampling_dim": "int", "selections": {"one": [1, 2, 3], "two": [2, 3, 4]}},
)
EXP_FFI_PE_CHAR = xr.Dataset(
{
"one": xr.DataArray(
[[1.0, 0.0, 0.0], [1.0, 1.0, 0.0]],
coords=[("char", ["a", "b"]), ("threshold", [0, 1, 5])],
),
"two": xr.DataArray(
[[1, 0.5, 0.5], [1, 0.5, 0]],
coords=[("char", ["a", "b"]), ("threshold", [0, 1, 5])],
),
},
attrs={"sampling_dim": "int", "selections": {"one": [1, 2, 3], "two": [2, 3, 4]}},
)
EXP_FFI_PE_CHARBOOL = xr.Dataset(
{
"one": xr.DataArray(
[[[1, 0, 0], [np.nan, np.nan, np.nan]], [[1, 1, 0], [1, 1, 0]]],
coords=[
("char", ["a", "b"]),
("bool", [True, False]),
("threshold", [0, 1, 5]),
],
),
"two": xr.DataArray(
[[[1.0, 0.0, 0.0], [1.0, 1.0, 1.0]], [[1.0, 0.0, 0.0], [1.0, 1.0, 0.0]]],
coords=[
("char", ["a", "b"]),
("bool", [True, False]),
("threshold", [0, 1, 5]),
],
),
},
attrs={"sampling_dim": "int", "selections": {"one": [1, 2, 3], "two": [2, 3, 4]}},
)
EXP_FFI_PE_CHARBOOL_DIR = xr.Dataset(
{
"one": xr.DataArray(
[
[[1.0, 0.0, 0.0], [np.nan, np.nan, np.nan]],
[[1.0, 0.0, 0.0], [1.0, 1.0, 0.0]],
],
coords=[
("char", ["a", "b"]),
("bool", [True, False]),
("threshold", [0, 50, 100]),
],
),
"two": xr.DataArray(
[[[1.0, 0.0, 0.0], [1.0, 1.0, 0.0]], [[1.0, 0.0, 0.0], [1.0, 0.0, 0.0]]],
coords=[
("char", ["a", "b"]),
("bool", [True, False]),
("threshold", [0, 50, 100]),
],
),
},
attrs={"sampling_dim": "int", "selections": {"one": [1, 2, 3], "two": [2, 3, 4]}},
)
Loading

0 comments on commit 3166f65

Please sign in to comment.