Skip to content

Commit

Permalink
feat: add proportion exceeding wrapper for flip flop index
Browse files Browse the repository at this point in the history
  • Loading branch information
aidanjgriffiths committed Dec 5, 2023
1 parent ee69685 commit 3166f65
Show file tree
Hide file tree
Showing 5 changed files with 369 additions and 10 deletions.
5 changes: 4 additions & 1 deletion src/scores/continuous/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
"""
Import the functions from the implementations into the public API
"""
from scores.continuous.flip_flop_impl import flip_flop_index
from scores.continuous.flip_flop_impl import (
flip_flop_index,
flip_flop_index_proportion_exceeding,
)
from scores.continuous.isoreg import isotonic_fit
from scores.continuous.murphy_impl import murphy_score, murphy_thetas
from scores.continuous.quantile_loss_impl import quantile_score
Expand Down
220 changes: 213 additions & 7 deletions src/scores/continuous/flip_flop_impl.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import xarray as xr

from scores.functions import angular_difference
from scores.processing import binary_discretise
from scores.typing import XarrayLike
from scores.utils import DimensionError, check_dims, dims_complement

Expand All @@ -35,7 +36,7 @@ def _flip_flop_index(data: xr.DataArray, sampling_dim: str, is_angular: bool = F
# check that `sampling_dim` is in `data`.
check_dims(data, [sampling_dim], mode="superset")

# the maximum possible number of discrete flipflops
# the maximum possible number of discrete flip_flops
sequence_length = len(data[sampling_dim])
max_possible_flip_flop_count = sequence_length - 2

Expand All @@ -50,21 +51,21 @@ def _flip_flop_index(data: xr.DataArray, sampling_dim: str, is_angular: bool = F
# maximum possible angular difference between two forecasts
enc_size = encompassing_sector_size(data=data, dims=dims_to_preserve)
range_val = np.clip(enc_size, a_min=None, a_max=180.0)
flipflop = angular_difference(data.shift({sampling_dim: 1}), data)
flip_flop = angular_difference(data.shift({sampling_dim: 1}), data)
else:
max_val = data.max(dim=sampling_dim, skipna=False)
min_val = data.min(dim=sampling_dim, skipna=False)
range_val = max_val - min_val
# subtract each consecutive 'row' from eachother
flipflop = data.shift({sampling_dim: 1}) - data
flip_flop = data.shift({sampling_dim: 1}) - data

# take the absolute value and sum.
# I don't do skipna=False here because .shift makes a row of nan
flipflop = abs(flipflop).sum(dim=sampling_dim)
flip_flop = abs(flip_flop).sum(dim=sampling_dim)
# adjust based on the range. This is where nan will be introduced.
flipflop = flipflop - range_val
flip_flop = flip_flop - range_val
# normalise
return flipflop / max_possible_flip_flop_count
return flip_flop / max_possible_flip_flop_count


# If there are selections, a DataSet is always returned
Expand Down Expand Up @@ -98,7 +99,7 @@ def flip_flop_index(
direction).
**selections: Additional keyword arguments specify
subsets to draw from the dimension `sampling_dim` of the supplied `data`
before calculation of the flipflop index. e.g. days123=[1, 2, 3]
before calculation of the flip_flop index. e.g. days123=[1, 2, 3]
Returns:
If `selections` are not supplied: An xarray.DataArray, the Flip-flop
Expand Down Expand Up @@ -348,3 +349,208 @@ def _encompassing_sector_size_np(
n_unique_angles = (angular_diffs != 0).sum(axis=0)
result = np.where(n_unique_angles <= 2, np.max(angular_diffs, axis=0), result)
return result


def flip_flop_index_proportion_exceeding(
    data: xr.DataArray,
    sampling_dim: str,
    thresholds: Iterable,
    is_angular: bool = False,
    dims: Optional[Sequence[str]] = None,
    **selections: Iterable[int],
):
    """
    Calculates the Flip-Flop Index and returns the proportion exceeding
    (or equal to) each of the supplied `thresholds`.

    Args:
        data: Data from which to draw subsets.
        sampling_dim: The name of the dimension along which to calculate
            the Flip-Flop Index.
        thresholds: The proportion of Flip-Flop Index results
            equal to or exceeding these thresholds will be calculated.
        is_angular: specifies whether `data` is directional data (e.g. wind
            direction).
        dims: Strings corresponding to the dimensions in the input
            xarray data objects that we wish to preserve in the output. All other
            dimensions in the input data objects are collapsed.
        **selections: Additional keyword arguments specify
            subsets to draw from the dimension `sampling_dim` of the supplied `data`
            before calculation of the Flip-Flop Index. e.g. days123=[1, 2, 3]

    Returns:
        If `selections` are not supplied - An xarray.DataArray with dimensions
        `dims` + 'threshold'. The DataArray is the proportion of the Flip-Flop
        Index calculated by collapsing dimension `sampling_dim` exceeding or
        equal to `thresholds`.

        If `selections` are supplied - An xarray.Dataset with dimensions `dims`
        + 'threshold'. There is a data variable for each keyword in
        `selections`, and corresponds to the Flip-Flop Index proportion
        exceeding for the subset of data specified by the keyword values.

    Raises:
        DimensionError: if `sampling_dim` appears in `dims` (the dimension
            being collapsed cannot also be preserved).

    Examples:
        >>> data = xr.DataArray(
        ...     [[50, 20, 40, 80], [10, 50, 10, 100], [0, 30, 20, 50]],
        ...     dims=['station_number', 'lead_day'],
        ...     coords=[[10001, 10002, 10003], [1, 2, 3, 4]]
        ... )
        >>> flip_flop_index_proportion_exceeding(data, 'lead_day', [20])
        <xarray.DataArray (threshold: 1)>
        array([ 0.33333333])
        Coordinates:
          * threshold  (threshold) int64 20
        Attributes:
            sampling_dim: lead_day
        >>> flip_flop_index_proportion_exceeding(
        ...     data, 'lead_day', [20], days123=[1, 2, 3], all_days=[1, 2, 3, 4]
        ... )
        <xarray.Dataset>
        Dimensions:   (threshold: 1)
        Coordinates:
          * threshold  (threshold) int64 20
        Data variables:
            days123   (threshold) float64 0.6667
            all_days  (threshold) float64 0.3333
        Attributes:
            selections: {{'days123': [1, 2, 3], 'all_days': [1, 2, 3, 4]}}
            sampling_dim: lead_day

    See also:
        `scores.continuous.flip_flop_index`
    """
    if (dims is not None) and (sampling_dim in list(dims)):
        raise DimensionError(
            f"`sampling_dim`: '{sampling_dim}' must not be in dimensions to preserve " f"`dims`: {list(dims)}"
        )
    # calculate the flip-flop index
    flip_flop_data = flip_flop_index(data, sampling_dim, is_angular=is_angular, **selections)
    # calculate the proportion exceeding each threshold
    flip_flop_exceeding = proportion_exceeding(flip_flop_data, thresholds, dims=dims)
    # overwrite the attributes: restore the flip-flop metadata ('sampling_dim'
    # and, when supplied, 'selections') that proportion_exceeding discards
    flip_flop_exceeding.attrs = flip_flop_data.attrs

    return flip_flop_exceeding


def proportion_exceeding(data: XarrayLike, thresholds: Iterable, dims: Optional[Iterable] = None):
    """
    Calculates the proportion of `data` equal to or exceeding `thresholds`.

    Args:
        data (xarray.Dataset or xarray.DataArray): The data from which
            to calculate the proportion exceeding `thresholds`.
        thresholds (iterable): The proportion of values in `data` equal to
            or exceeding these thresholds will be calculated.
        dims (Optional[iterable]): Strings corresponding to the dimensions in the input
            xarray data objects that we wish to preserve in the output. All other
            dimensions in the input data objects are collapsed.

    Returns:
        An xarray data object with the type of `data` and dimensions
        `dims` + 'threshold'. The values are the proportion of `data`
        that are greater than or equal to the corresponding threshold.
    """
    # '>=' marks each value meeting a threshold as 1; the mean over the
    # collapsed dimensions is then exactly the required proportion.
    return _binary_discretise_proportion(data, thresholds, ">=", dims=dims)


def _binary_discretise_proportion(
    data: XarrayLike,
    thresholds: Iterable,
    mode: str,
    dims: Optional[Iterable] = None,
    abs_tolerance: Optional[float] = None,
    autosqueeze: Optional[bool] = False,
):
    """
    Returns the proportion of `data` in each category. The categories are
    defined by the relationship of data to threshold as specified by
    the operation `mode`.

    Args:
        data (xarray.Dataset or xarray.DataArray): The data to convert
            into 0 and 1 according the thresholds before calculating the
            proportion.
        thresholds (iterable): The proportion of values in `data` satisfying
            `mode` relative to these thresholds will be calculated.
        mode (str): Specifies the required relation of `data` to `thresholds`
            for a value to fall in the 'event' category (i.e. assigned to 1).
            Allowed modes are:

            - '>=' values in `data` greater than or equal to the
              corresponding threshold are assigned as 1.
            - '>' values in `data` greater than the corresponding threshold
              are assigned as 1.
            - '<=' values in `data` less than or equal to the corresponding
              threshold are assigned as 1.
            - '<' values in `data` less than the corresponding threshold
              are assigned as 1.
            - '==' values in `data` equal to the corresponding threshold
              are assigned as 1
            - '!=' values in `data` not equal to the corresponding threshold
              are assigned as 1.
        dims (Optional[iterable]): Strings corresponding to the dimensions in the input
            xarray data objects that we wish to preserve in the output. All other
            dimensions in the input data objects are collapsed.
            The dimension 'threshold' should not be supplied, it will automatically
            be preserved.
        abs_tolerance (Optional[float]): If supplied, values in data that are
            within abs_tolerance of a threshold are considered to be equal to
            that threshold. This is generally used to correct for floating
            point rounding, e.g. we may want to consider 1.0000000000000002 as
            equal to 1.
        autosqueeze (Optional[bool]): If True and only one threshold is
            supplied, then the dimension 'threshold' is squeezed out of the
            output. If `thresholds` is float-like, then this is forced to
            True, otherwise defaults to False.

    Returns:
        An xarray data object with the type of `data`, dimension `dims` +
        'threshold'. The values of the output are the proportion of `data` that
        satisfy the relationship to `thresholds` as specified by `mode`.

    Examples:
        >>> data = xr.DataArray([0, 0.5, 0.5, 1])
        >>> _binary_discretise_proportion(data, [0, 0.5, 1], '==')
        <xarray.DataArray (threshold: 3)>
        array([ 0.25,  0.5 ,  0.25])
        Coordinates:
          * threshold  (threshold) float64 0.0 0.5 1.0
        Attributes:
            discretisation_tolerance: 0
            discretisation_mode: ==
        >>> _binary_discretise_proportion(data, [0, 0.5, 1], '>=')
        <xarray.DataArray (threshold: 3)>
        array([ 1.  ,  0.75,  0.25])
        Coordinates:
          * threshold  (threshold) float64 0.0 0.5 1.0
        Attributes:
            discretisation_tolerance: 0
            discretisation_mode: >=

    See also:
        `scores.processing.binary_discretise`
    """
    # values are 1 when (data {mode} threshold), and 0 when ~(data {mode} threshold).
    discrete_data = binary_discretise(data, thresholds, mode, abs_tolerance=abs_tolerance, autosqueeze=autosqueeze)

    # the proportion in each category: collapse everything except `dims`
    # (and 'threshold', which binary_discretise introduces and preserves)
    dims_to_collapse = dims_complement(data, dims=dims)
    proportion = discrete_data.mean(dim=dims_to_collapse)

    # attach attributes recording the discretisation settings
    proportion.attrs = discrete_data.attrs

    return proportion
4 changes: 2 additions & 2 deletions src/scores/continuous/standard_impl.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,12 @@
This module contains standard methods which may be used for continuous scoring
"""

import xarray as xr

import scores.functions
import scores.utils
from scores.typing import FlexibleArrayType, FlexibleDimensionTypes

import xarray as xr


def mse(
fcst: FlexibleArrayType,
Expand Down
70 changes: 70 additions & 0 deletions tests/continuous/flip_flop_test_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -248,3 +248,73 @@
ENC_SIZE_3D_TEST_AXIS0 = ENC_SIZE_3D_TEST_AXIS2.swapaxes(0, 2)
ENC_SIZE_3D_ANSW_AXIS0_SKIPNA = ENC_SIZE_3D_ANSW_AXIS2_SKIPNA.swapaxes(0, 1)
ENC_SIZE_3D_ANSW_AXIS0_NOSKIPNA = ENC_SIZE_3D_ANSW_AXIS2_NOSKIPNA.swapaxes(0, 1)


"""
Test data for flipflipindex_proportion_exceeding
"""
EXP_FFI_PE_NONE = xr.Dataset(
{
"one": xr.DataArray([1, 2 / 3, 0], coords=[("threshold", [0, 1, 5])]),
"two": xr.DataArray([1, 0.5, 0.25], coords=[("threshold", [0, 1, 5])]),
},
attrs={"sampling_dim": "int", "selections": {"one": [1, 2, 3], "two": [2, 3, 4]}},
)
EXP_FFI_PE_CHAR = xr.Dataset(
{
"one": xr.DataArray(
[[1.0, 0.0, 0.0], [1.0, 1.0, 0.0]],
coords=[("char", ["a", "b"]), ("threshold", [0, 1, 5])],
),
"two": xr.DataArray(
[[1, 0.5, 0.5], [1, 0.5, 0]],
coords=[("char", ["a", "b"]), ("threshold", [0, 1, 5])],
),
},
attrs={"sampling_dim": "int", "selections": {"one": [1, 2, 3], "two": [2, 3, 4]}},
)
EXP_FFI_PE_CHARBOOL = xr.Dataset(
{
"one": xr.DataArray(
[[[1, 0, 0], [np.nan, np.nan, np.nan]], [[1, 1, 0], [1, 1, 0]]],
coords=[
("char", ["a", "b"]),
("bool", [True, False]),
("threshold", [0, 1, 5]),
],
),
"two": xr.DataArray(
[[[1.0, 0.0, 0.0], [1.0, 1.0, 1.0]], [[1.0, 0.0, 0.0], [1.0, 1.0, 0.0]]],
coords=[
("char", ["a", "b"]),
("bool", [True, False]),
("threshold", [0, 1, 5]),
],
),
},
attrs={"sampling_dim": "int", "selections": {"one": [1, 2, 3], "two": [2, 3, 4]}},
)
EXP_FFI_PE_CHARBOOL_DIR = xr.Dataset(
{
"one": xr.DataArray(
[
[[1.0, 0.0, 0.0], [np.nan, np.nan, np.nan]],
[[1.0, 0.0, 0.0], [1.0, 1.0, 0.0]],
],
coords=[
("char", ["a", "b"]),
("bool", [True, False]),
("threshold", [0, 50, 100]),
],
),
"two": xr.DataArray(
[[[1.0, 0.0, 0.0], [1.0, 1.0, 0.0]], [[1.0, 0.0, 0.0], [1.0, 0.0, 0.0]]],
coords=[
("char", ["a", "b"]),
("bool", [True, False]),
("threshold", [0, 50, 100]),
],
),
},
attrs={"sampling_dim": "int", "selections": {"one": [1, 2, 3], "two": [2, 3, 4]}},
)
Loading

0 comments on commit 3166f65

Please sign in to comment.