From 112f4b594e5fc78f5a01f9043e477f2a95af1268 Mon Sep 17 00:00:00 2001 From: john Date: Tue, 7 Dec 2021 14:06:51 +0000 Subject: [PATCH 01/20] Adding labelling disaggregation to spatial_aggregate + initial test --- .../features/location/spatial_aggregate.py | 22 +++++++++++++++---- flowmachine/tests/test_spatial_aggregate.py | 20 +++++++++++++++++ 2 files changed, 38 insertions(+), 4 deletions(-) diff --git a/flowmachine/flowmachine/features/location/spatial_aggregate.py b/flowmachine/flowmachine/features/location/spatial_aggregate.py index 0383beca5b..49cefa5f13 100644 --- a/flowmachine/flowmachine/features/location/spatial_aggregate.py +++ b/flowmachine/flowmachine/features/location/spatial_aggregate.py @@ -18,31 +18,45 @@ class SpatialAggregate(GeoDataMixin, Query): Parameters ---------- locations : subscriber location query + subscriber_label: A query with a column to further disaggregate by + label_column: Name of the column containing the label: defaults to 'value' """ - def __init__(self, *, locations): + def __init__(self, *, locations, subscriber_labels=None, label_column="value"): self.locations = locations # self.spatial_unit is used in self._geo_augmented_query self.spatial_unit = locations.spatial_unit + self.subscriber_labels = subscriber_labels + self.label_column = label_column super().__init__() @property def column_names(self) -> List[str]: + # Do we want to return the labels column as well? return self.spatial_unit.location_id_columns + ["value"] def _make_query(self): aggregate_cols = ",".join(self.spatial_unit.location_id_columns) + if self.subscriber_labels: + label_clause = f"JOIN ({self.subscriber_labels.get_query()}) AS labels USING (subscriber)" + label_select_clause = f", labels.{self.label_column} AS label" + label_agg_clause = f", label" + else: + label_clause = "" + label_select_clause = "" + label_agg_clause = "" + sql = f""" SELECT - {aggregate_cols}, + {aggregate_cols}{label_select_clause}, count(*) AS value FROM - ({self.locations.get_query()}) AS to_agg + ({self.locations.get_query()}) AS to_agg {label_clause} GROUP BY - {aggregate_cols} + {aggregate_cols}{label_agg_clause} """ return sql diff --git a/flowmachine/tests/test_spatial_aggregate.py b/flowmachine/tests/test_spatial_aggregate.py index 657d0237f7..8468336821 100644 --- a/flowmachine/tests/test_spatial_aggregate.py +++ b/flowmachine/tests/test_spatial_aggregate.py @@ -8,6 +8,7 @@ daily_location, SubscriberHandsetCharacteristic, ) +from flowmachine.features.location.spatial_aggregate import SpatialAggregate from flowmachine.features.location.joined_spatial_aggregate import ( JoinedSpatialAggregate, ) @@ -62,3 +63,22 @@ def test_can_be_aggregated_admin3_distribution(get_dataframe): df = get_dataframe(agg) assert ["pcod", "metric", "key", "value"] == list(df.columns) assert all(df[df.metric == "value"].groupby("pcod").sum() == 1.0) + + +def test_labelled_spatial_aggregate(get_dataframe): + """ + Tests disaggregation by a label query + """ + locations = locate_subscribers( + "2016-01-01", + "2016-01-02", + spatial_unit=make_spatial_unit("admin", level=3), + method="most-common", + ) + metric = SubscriberHandsetCharacteristic( + "2016-01-01", "2016-01-02", characteristic="hnd_type" + ) + labelled = SpatialAggregate(locations=locations, subscriber_labels=metric) + df = get_dataframe(labelled) + assert len(df) == 50 + assert len(df.pcod.unique()) == 25 From ffdca9347d16b27e47de4fd8fac22eda103ac438 Mon Sep 17 00:00:00 2001 From: john Date: Tue, 7 Dec 2021 15:12:11 +0000 Subject: [PATCH 02/20] Spatial_aggregate returns labels if used --- .../flowmachine/features/location/spatial_aggregate.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/flowmachine/flowmachine/features/location/spatial_aggregate.py b/flowmachine/flowmachine/features/location/spatial_aggregate.py index 49cefa5f13..61dba60df2 100644 --- a/flowmachine/flowmachine/features/location/spatial_aggregate.py +++ b/flowmachine/flowmachine/features/location/spatial_aggregate.py @@ -34,7 +34,10 @@ def __init__(self, *, locations, subscriber_labels=None, label_column="value"): @property def column_names(self) -> List[str]: # Do we want to return the labels column as well? - return self.spatial_unit.location_id_columns + ["value"] + if self.subscriber_labels: + return self.spatial_unit.location_id_columns + ["label", "value"] + else: + return self.spatial_unit.location_id_columns + ["value"] def _make_query(self): From 7c6b56c82a53fb33d88330c2f6ac299faf5ba23a Mon Sep 17 00:00:00 2001 From: john Date: Tue, 7 Dec 2021 15:13:10 +0000 Subject: [PATCH 03/20] Spatial_aggregate returns labels if used --- flowmachine/tests/test_spatial_aggregate.py | 1 + 1 file changed, 1 insertion(+) diff --git a/flowmachine/tests/test_spatial_aggregate.py b/flowmachine/tests/test_spatial_aggregate.py index 8468336821..d7a62a77b3 100644 --- a/flowmachine/tests/test_spatial_aggregate.py +++ b/flowmachine/tests/test_spatial_aggregate.py @@ -82,3 +82,4 @@ def test_labelled_spatial_aggregate(get_dataframe): df = get_dataframe(labelled) assert len(df) == 50 assert len(df.pcod.unique()) == 25 + assert len(df.label.unique()) == 2 From 96fa35f562c053526b716fcc81bf433315dcda69 Mon Sep 17 00:00:00 2001 From: john Date: Wed, 8 Dec 2021 10:32:08 +0000 Subject: [PATCH 04/20] Breaking labelled spatial aggregate into its own class + permitting multiple labels --- .../location/labelled_spatial_aggregate.py | 56 +++++++++++++++++++ .../features/location/spatial_aggregate.py | 27 ++------- flowmachine/tests/test_spatial_aggregate.py | 9 ++- 3 files changed, 68 insertions(+), 24 deletions(-) create mode 100644 flowmachine/flowmachine/features/location/labelled_spatial_aggregate.py diff --git a/flowmachine/flowmachine/features/location/labelled_spatial_aggregate.py b/flowmachine/flowmachine/features/location/labelled_spatial_aggregate.py new file mode 100644 index 0000000000..cf94c7a483 --- /dev/null +++ b/flowmachine/flowmachine/features/location/labelled_spatial_aggregate.py @@ -0,0 +1,56 @@ +from typing import List + +from flowmachine.core import Query +from flowmachine.core.mixins import GeoDataMixin +from flowmachine.features.location.spatial_aggregate import SpatialAggregate + + +class LabelledSpatialAggregate(GeoDataMixin, Query): + """ + Class represeneting a disaggregation of a SpatialAggregate by some label + """ + + def __init__( + self, + locations: Query, + subscriber_labels: Query, + label_columns: List[str] = ("value",), + ): + self.locations = locations + self.subscriber_labels = subscriber_labels + self.label_columns = list(label_columns) + super().__init__() + + @property + def column_names(self) -> List[str]: + return ( + self.locations.spatial_unit.location_id_columns + + [f"label_{label}" for label in self.label_columns] + + ["value"] + ) + + def _make_query(self): + + aggregate_cols = ",".join( + f"agg.{agg_col}" + for agg_col in self.locations.spatial_unit.location_id_columns + ) + label_select = ",".join( + f"labels.{label_col} AS label_{label_col}" + for label_col in self.label_columns + ) + label_group = ",".join( + f"labels.{label_col}" for label_col in self.label_columns + ) + + sql = f""" + SELECT + {aggregate_cols}, {label_select}, count(*) AS value + FROM + ({self.locations.get_query()}) AS agg + JOIN + ({self.subscriber_labels.get_query()}) AS labels USING (subscriber) + GROUP BY + {aggregate_cols}, {label_group} + """ + return sql diff --git a/flowmachine/flowmachine/features/location/spatial_aggregate.py b/flowmachine/flowmachine/features/location/spatial_aggregate.py index 61dba60df2..0383beca5b 100644 --- a/flowmachine/flowmachine/features/location/spatial_aggregate.py +++ b/flowmachine/flowmachine/features/location/spatial_aggregate.py @@ -18,48 +18,31 @@ class SpatialAggregate(GeoDataMixin, Query): Parameters ---------- locations : subscriber location query - subscriber_label: A query with a column to further disaggregate by - label_column: Name of the column containing the label: defaults to 'value' """ - def __init__(self, *, locations, subscriber_labels=None, label_column="value"): + def __init__(self, *, locations): self.locations = locations # self.spatial_unit is used in self._geo_augmented_query self.spatial_unit = locations.spatial_unit - self.subscriber_labels = subscriber_labels - self.label_column = label_column super().__init__() @property def column_names(self) -> List[str]: - # Do we want to return the labels column as well? - if self.subscriber_labels: - return self.spatial_unit.location_id_columns + ["label", "value"] - else: - return self.spatial_unit.location_id_columns + ["value"] + return self.spatial_unit.location_id_columns + ["value"] def _make_query(self): aggregate_cols = ",".join(self.spatial_unit.location_id_columns) - if self.subscriber_labels: - label_clause = f"JOIN ({self.subscriber_labels.get_query()}) AS labels USING (subscriber)" - label_select_clause = f", labels.{self.label_column} AS label" - label_agg_clause = f", label" - else: - label_clause = "" - label_select_clause = "" - label_agg_clause = "" - sql = f""" SELECT - {aggregate_cols}{label_select_clause}, + {aggregate_cols}, count(*) AS value FROM - ({self.locations.get_query()}) AS to_agg {label_clause} + ({self.locations.get_query()}) AS to_agg GROUP BY - {aggregate_cols}{label_agg_clause} + {aggregate_cols} """ return sql diff --git a/flowmachine/tests/test_spatial_aggregate.py b/flowmachine/tests/test_spatial_aggregate.py index d7a62a77b3..3d7923b67d 100644 --- a/flowmachine/tests/test_spatial_aggregate.py +++ b/flowmachine/tests/test_spatial_aggregate.py @@ -15,6 +15,10 @@ from flowmachine.features.subscriber.daily_location import locate_subscribers from flowmachine.utils import list_of_dates +from flowmachine.features.location.labelled_spatial_aggregate import ( + LabelledSpatialAggregate, +) + def test_can_be_aggregated_admin3(get_dataframe): """ @@ -78,8 +82,9 @@ def test_labelled_spatial_aggregate(get_dataframe): metric = SubscriberHandsetCharacteristic( "2016-01-01", "2016-01-02", characteristic="hnd_type" ) - labelled = SpatialAggregate(locations=locations, subscriber_labels=metric) + labelled = LabelledSpatialAggregate(locations=locations, subscriber_labels=metric) + foo = labelled.get_query() df = get_dataframe(labelled) assert len(df) == 50 assert len(df.pcod.unique()) == 25 - assert len(df.label.unique()) == 2 + assert len(df.label_value.unique()) == 2 From 0d95848bbdfa9ce86649869ef610baab1aa009c0 Mon Sep 17 00:00:00 2001 From: john Date: Wed, 8 Dec 2021 12:17:26 +0000 Subject: [PATCH 05/20] Implementing specialised redaction + test --- .../location/labelled_spatial_aggregate.py | 23 ++++++-- .../redacted_labelled_spatial_aggregate.py | 55 +++++++++++++++++++ .../tests/test_labelled_spatial_aggregate.py | 25 +++++++++ ...test_redacted_labelled_spatial_location.py | 43 +++++++++++++++ flowmachine/tests/test_spatial_aggregate.py | 26 --------- 5 files changed, 140 insertions(+), 32 deletions(-) create mode 100644 flowmachine/flowmachine/features/location/redacted_labelled_spatial_aggregate.py create mode 100644 flowmachine/tests/test_labelled_spatial_aggregate.py create mode 100644 flowmachine/tests/test_redacted_labelled_spatial_location.py diff --git a/flowmachine/flowmachine/features/location/labelled_spatial_aggregate.py b/flowmachine/flowmachine/features/location/labelled_spatial_aggregate.py index cf94c7a483..d9f7e63772 100644 --- a/flowmachine/flowmachine/features/location/labelled_spatial_aggregate.py +++ b/flowmachine/flowmachine/features/location/labelled_spatial_aggregate.py @@ -7,7 +7,8 @@ class LabelledSpatialAggregate(GeoDataMixin, Query): """ - Class represeneting a disaggregation of a SpatialAggregate by some label + Class representing a disaggregation of a SpatialAggregate by some label or set of labels + Returns a query with """ def __init__( @@ -19,25 +20,35 @@ def __init__( self.locations = locations self.subscriber_labels = subscriber_labels self.label_columns = list(label_columns) + self.spatial_unit = locations.spatial_unit + self.label_columns = label_columns + self.out_label_columns = [ + f"label_{label_col}" for label_col in self.label_columns + ] super().__init__() @property def column_names(self) -> List[str]: return ( - self.locations.spatial_unit.location_id_columns + list(self.spatial_unit.location_id_columns) + [f"label_{label}" for label in self.label_columns] + ["value"] ) + @property + def out_label_columns_as_string_list(self): + return ",".join(self.out_label_columns) + def _make_query(self): aggregate_cols = ",".join( - f"agg.{agg_col}" - for agg_col in self.locations.spatial_unit.location_id_columns + f"agg.{agg_col}" for agg_col in self.spatial_unit.location_id_columns ) label_select = ",".join( - f"labels.{label_col} AS label_{label_col}" - for label_col in self.label_columns + f"labels.{label_col} AS {out_label_col}" + for label_col, out_label_col in zip( + self.label_columns, self.out_label_columns + ) ) label_group = ",".join( f"labels.{label_col}" for label_col in self.label_columns diff --git a/flowmachine/flowmachine/features/location/redacted_labelled_spatial_aggregate.py b/flowmachine/flowmachine/features/location/redacted_labelled_spatial_aggregate.py new file mode 100644 index 0000000000..c3a6ca73d3 --- /dev/null +++ b/flowmachine/flowmachine/features/location/redacted_labelled_spatial_aggregate.py @@ -0,0 +1,55 @@ +from typing import List + +from flowmachine.features.location.labelled_spatial_aggregate import ( + LabelledSpatialAggregate, +) +from flowmachine.core import Query +from flowmachine.core.mixins import GeoDataMixin + + +class RedactedLabelledSpatialAggregate(GeoDataMixin, Query): + """ + Query that drops and reaggregates data if any 'label' column falls below redcation_threshold + Parameters + ---------- + labelled_spatial_aggregate + redaction_threshold + """ + + def __init__( + self, + *, + labelled_spatial_aggregate: LabelledSpatialAggregate, + redaction_threshold=15, + ): + + self.labelled_spatial_aggregate = labelled_spatial_aggregate + self.redaction_threshold = redaction_threshold + super().__init__() + + @property + def column_names(self) -> List[str]: + return self.labelled_spatial_aggregate.column_names + + def _make_query(self): + + aggs = ",".join( + self.labelled_spatial_aggregate.spatial_unit.location_id_columns + ) + labels = self.labelled_spatial_aggregate.out_label_columns_as_string_list + all_cols = self.labelled_spatial_aggregate.column_names_as_string_list + + sql = f""" + WITH lsa_query AS( + SELECT * + FROM({self.labelled_spatial_aggregate.get_query()}) AS t + ), aggs_to_keep AS( + SELECT {aggs} + FROM lsa_query + GROUP BY {aggs} + HAVING min(value) > {self.redaction_threshold} + ) + SELECT {all_cols} + FROM lsa_query INNER JOIN aggs_to_keep USING ({aggs}) + """ + return sql diff --git a/flowmachine/tests/test_labelled_spatial_aggregate.py b/flowmachine/tests/test_labelled_spatial_aggregate.py new file mode 100644 index 0000000000..7c11f472c4 --- /dev/null +++ b/flowmachine/tests/test_labelled_spatial_aggregate.py @@ -0,0 +1,25 @@ +from core import make_spatial_unit +from features import SubscriberHandsetCharacteristic +from features.location.labelled_spatial_aggregate import LabelledSpatialAggregate +from features.subscriber.daily_location import locate_subscribers + + +def test_labelled_spatial_aggregate(get_dataframe): + """ + Tests disaggregation by a label query + """ + locations = locate_subscribers( + "2016-01-01", + "2016-01-02", + spatial_unit=make_spatial_unit("admin", level=3), + method="most-common", + ) + metric = SubscriberHandsetCharacteristic( + "2016-01-01", "2016-01-02", characteristic="hnd_type" + ) + labelled = LabelledSpatialAggregate(locations=locations, subscriber_labels=metric) + foo = labelled.get_query() + df = get_dataframe(labelled) + assert len(df) == 50 + assert len(df.pcod.unique()) == 25 + assert len(df.label_value.unique()) == 2 diff --git a/flowmachine/tests/test_redacted_labelled_spatial_location.py b/flowmachine/tests/test_redacted_labelled_spatial_location.py new file mode 100644 index 0000000000..87fe6f67c1 --- /dev/null +++ b/flowmachine/tests/test_redacted_labelled_spatial_location.py @@ -0,0 +1,43 @@ +import pytest + +import pandas as pd +from pandas.testing import assert_frame_equal + +from flowmachine.core import make_spatial_unit +from flowmachine.features import SubscriberHandsetCharacteristic +from flowmachine.features.location.labelled_spatial_aggregate import ( + LabelledSpatialAggregate, +) +from flowmachine.features.location.redacted_labelled_spatial_aggregate import ( + RedactedLabelledSpatialAggregate, +) +from flowmachine.features.subscriber.daily_location import locate_subscribers + + +def test_no_redaction(get_dataframe): + + locations = locate_subscribers( + "2016-01-01", + "2016-01-02", + spatial_unit=make_spatial_unit("admin", level=3), + method="most-common", + ) + metric = SubscriberHandsetCharacteristic( + "2016-01-01", "2016-01-02", characteristic="hnd_type" + ) + labelled = LabelledSpatialAggregate(locations=locations, subscriber_labels=metric) + + redacted = RedactedLabelledSpatialAggregate(labelled_spatial_aggregate=labelled) + redacted_df = get_dataframe(redacted) + + target = pd.DataFrame( + [ + ["524 3 08 44", "Feature", 36], + ["524 3 08 44", "Smart", 28], + ["524 4 12 62", "Feature", 44], + ["524 4 12 62", "Smart", 19], + ], + columns=["pcod", "label_value", "value"], + ) + + assert_frame_equal(redacted_df, target) diff --git a/flowmachine/tests/test_spatial_aggregate.py b/flowmachine/tests/test_spatial_aggregate.py index 3d7923b67d..657d0237f7 100644 --- a/flowmachine/tests/test_spatial_aggregate.py +++ b/flowmachine/tests/test_spatial_aggregate.py @@ -8,17 +8,12 @@ daily_location, SubscriberHandsetCharacteristic, ) -from flowmachine.features.location.spatial_aggregate import SpatialAggregate from flowmachine.features.location.joined_spatial_aggregate import ( JoinedSpatialAggregate, ) from flowmachine.features.subscriber.daily_location import locate_subscribers from flowmachine.utils import list_of_dates -from flowmachine.features.location.labelled_spatial_aggregate import ( - LabelledSpatialAggregate, -) - def test_can_be_aggregated_admin3(get_dataframe): """ @@ -67,24 +62,3 @@ def test_can_be_aggregated_admin3_distribution(get_dataframe): df = get_dataframe(agg) assert ["pcod", "metric", "key", "value"] == list(df.columns) assert all(df[df.metric == "value"].groupby("pcod").sum() == 1.0) - - -def test_labelled_spatial_aggregate(get_dataframe): - """ - Tests disaggregation by a label query - """ - locations = locate_subscribers( - "2016-01-01", - "2016-01-02", - spatial_unit=make_spatial_unit("admin", level=3), - method="most-common", - ) - metric = SubscriberHandsetCharacteristic( - "2016-01-01", "2016-01-02", characteristic="hnd_type" - ) - labelled = LabelledSpatialAggregate(locations=locations, subscriber_labels=metric) - foo = labelled.get_query() - df = get_dataframe(labelled) - assert len(df) == 50 - assert len(df.pcod.unique()) == 25 - assert len(df.label_value.unique()) == 2 From 66be2f0ddc1127d98557748ad0cec406dd7d5dbc Mon Sep 17 00:00:00 2001 From: john Date: Wed, 8 Dec 2021 13:33:28 +0000 Subject: [PATCH 06/20] Fixing imports --- flowmachine/flowmachine/core/spatial_unit.py | 6 +++--- .../features/location/labelled_spatial_aggregate.py | 1 - .../location/redacted_labelled_spatial_aggregate.py | 2 +- flowmachine/tests/test_labelled_spatial_aggregate.py | 11 ++++++----- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/flowmachine/flowmachine/core/spatial_unit.py b/flowmachine/flowmachine/core/spatial_unit.py index 3ab85fc1d8..c4a2e11b24 100644 --- a/flowmachine/flowmachine/core/spatial_unit.py +++ b/flowmachine/flowmachine/core/spatial_unit.py @@ -12,9 +12,9 @@ from flowmachine.utils import get_name_and_alias from flowmachine.core.errors import InvalidSpatialUnitError -from . import Query, Table -from .context import get_db -from .grid import Grid +from flowmachine.core import Query, Table +from flowmachine.core.context import get_db +from flowmachine.core.grid import Grid # TODO: Currently most spatial units require a FlowDB connection at init time. # It would be useful to remove this requirement wherever possible, and instead diff --git a/flowmachine/flowmachine/features/location/labelled_spatial_aggregate.py b/flowmachine/flowmachine/features/location/labelled_spatial_aggregate.py index d9f7e63772..84a61e6877 100644 --- a/flowmachine/flowmachine/features/location/labelled_spatial_aggregate.py +++ b/flowmachine/flowmachine/features/location/labelled_spatial_aggregate.py @@ -8,7 +8,6 @@ class LabelledSpatialAggregate(GeoDataMixin, Query): """ Class representing a disaggregation of a SpatialAggregate by some label or set of labels - Returns a query with """ def __init__( diff --git a/flowmachine/flowmachine/features/location/redacted_labelled_spatial_aggregate.py b/flowmachine/flowmachine/features/location/redacted_labelled_spatial_aggregate.py index c3a6ca73d3..23e08b0092 100644 --- a/flowmachine/flowmachine/features/location/redacted_labelled_spatial_aggregate.py +++ b/flowmachine/flowmachine/features/location/redacted_labelled_spatial_aggregate.py @@ -20,7 +20,7 @@ def __init__( self, *, labelled_spatial_aggregate: LabelledSpatialAggregate, - redaction_threshold=15, + redaction_threshold: int = 15, ): self.labelled_spatial_aggregate = labelled_spatial_aggregate diff --git a/flowmachine/tests/test_labelled_spatial_aggregate.py b/flowmachine/tests/test_labelled_spatial_aggregate.py index 7c11f472c4..bbcf977d03 100644 --- a/flowmachine/tests/test_labelled_spatial_aggregate.py +++ b/flowmachine/tests/test_labelled_spatial_aggregate.py @@ -1,7 +1,9 @@ -from core import make_spatial_unit -from features import SubscriberHandsetCharacteristic -from features.location.labelled_spatial_aggregate import LabelledSpatialAggregate -from features.subscriber.daily_location import locate_subscribers +from flowmachine.core import make_spatial_unit +from flowmachine.features import SubscriberHandsetCharacteristic +from flowmachine.features.location.labelled_spatial_aggregate import ( + LabelledSpatialAggregate, +) +from flowmachine.features.subscriber.daily_location import locate_subscribers def test_labelled_spatial_aggregate(get_dataframe): @@ -18,7 +20,6 @@ def test_labelled_spatial_aggregate(get_dataframe): "2016-01-01", "2016-01-02", characteristic="hnd_type" ) labelled = LabelledSpatialAggregate(locations=locations, subscriber_labels=metric) - foo = labelled.get_query() df = get_dataframe(labelled) assert len(df) == 50 assert len(df.pcod.unique()) == 25 From f289b92c55669958dceeb81141688cfc6c816343 Mon Sep 17 00:00:00 2001 From: john Date: Wed, 8 Dec 2021 13:51:47 +0000 Subject: [PATCH 07/20] Validation + test --- .../location/labelled_spatial_aggregate.py | 7 +++++++ .../tests/test_labelled_spatial_aggregate.py | 18 ++++++++++++++++++ 2 files changed, 25 insertions(+) diff --git a/flowmachine/flowmachine/features/location/labelled_spatial_aggregate.py b/flowmachine/flowmachine/features/location/labelled_spatial_aggregate.py index 84a61e6877..ab388eec73 100644 --- a/flowmachine/flowmachine/features/location/labelled_spatial_aggregate.py +++ b/flowmachine/flowmachine/features/location/labelled_spatial_aggregate.py @@ -16,6 +16,13 @@ def __init__( subscriber_labels: Query, label_columns: List[str] = ("value",), ): + if any( + label_column not in subscriber_labels.column_names + for label_column in label_columns + ): + raise ValueError( + f"One of {label_columns} not a column of {subscriber_labels}" + ) self.locations = locations self.subscriber_labels = subscriber_labels self.label_columns = list(label_columns) diff --git a/flowmachine/tests/test_labelled_spatial_aggregate.py b/flowmachine/tests/test_labelled_spatial_aggregate.py index bbcf977d03..94c874d483 100644 --- a/flowmachine/tests/test_labelled_spatial_aggregate.py +++ b/flowmachine/tests/test_labelled_spatial_aggregate.py @@ -1,3 +1,5 @@ +import pytest + from flowmachine.core import make_spatial_unit from flowmachine.features import SubscriberHandsetCharacteristic from flowmachine.features.location.labelled_spatial_aggregate import ( @@ -24,3 +26,19 @@ def test_labelled_spatial_aggregate(get_dataframe): assert len(df) == 50 assert len(df.pcod.unique()) == 25 assert len(df.label_value.unique()) == 2 + + +def test_validation(): + with pytest.raises(ValueError, match="not a column of"): + locations = locate_subscribers( + "2016-01-01", + "2016-01-02", + spatial_unit=make_spatial_unit("admin", level=3), + method="most-common", + ) + metric = SubscriberHandsetCharacteristic( + "2016-01-01", "2016-01-02", characteristic="hnd_type" + ) + labelled = LabelledSpatialAggregate( + locations=locations, subscriber_labels=metric, label_columns=["foo", "bar"] + ) From cddb3ee76093d2458799bacd8d8e6ba74a5ce6d4 Mon Sep 17 00:00:00 2001 From: john Date: Wed, 8 Dec 2021 14:50:01 +0000 Subject: [PATCH 08/20] Docs + tests + some renaming --- .../location/labelled_spatial_aggregate.py | 40 ++++++++++++++++ .../tests/test_labelled_spatial_aggregate.py | 47 ++++++++++++++++++- ...st_redacted_labelled_spatial_aggregate.py} | 2 +- 3 files changed, 87 insertions(+), 2 deletions(-) rename flowmachine/tests/{test_redacted_labelled_spatial_location.py => test_redacted_labelled_spatial_aggregate.py} (97%) diff --git a/flowmachine/flowmachine/features/location/labelled_spatial_aggregate.py b/flowmachine/flowmachine/features/location/labelled_spatial_aggregate.py index ab388eec73..090e0b3df2 100644 --- a/flowmachine/flowmachine/features/location/labelled_spatial_aggregate.py +++ b/flowmachine/flowmachine/features/location/labelled_spatial_aggregate.py @@ -8,6 +8,35 @@ class LabelledSpatialAggregate(GeoDataMixin, Query): """ Class representing a disaggregation of a SpatialAggregate by some label or set of labels + + Parameters + ---------- + locations: Query + Any query with a subscriber and location columns + subscriber_labels: Query + Any query with a subscriber column + label_columns: List[str] + A list of columns in subscriber_labels to aggregate on + + Example: + + >>> locations = locate_subscribers( + ... "2016-01-01", + ... "2016-01-02", + ... spatial_unit=make_spatial_unit("admin", level=3), + ... method="most-common", + ... ) + >>> metric = SubscriberHandsetCharacteristic( + ... "2016-01-01", "2016-01-02", characteristic="hnd_type" + ... ) + >>> labelled = LabelledSpatialAggregate(locations=locations, subscriber_labels=metric) + >>> labelled.get_dataframe() + pcod label_value value + 0 524 3 08 44 Feature 36 + 1 524 3 08 44 Smart 28 + 2 524 4 12 62 Feature 44 + 3 524 4 12 62 Smart 19' + """ def __init__( @@ -16,6 +45,7 @@ def __init__( subscriber_labels: Query, label_columns: List[str] = ("value",), ): + if any( label_column not in subscriber_labels.column_names for label_column in label_columns @@ -23,6 +53,13 @@ def __init__( raise ValueError( f"One of {label_columns} not a column of {subscriber_labels}" ) + if "subscriber" not in locations.column_names: + raise ValueError(f"Locations query must have a subscriber column") + if "spatial_unit" not in dir(locations): + raise ValueError(f"Locations must have a spatial_unit attribute") + if "subscriber" in label_columns: + raise ValueError(f"'subscriber' cannot be a label") + self.locations = locations self.subscriber_labels = subscriber_labels self.label_columns = list(label_columns) @@ -43,6 +80,9 @@ def column_names(self) -> List[str]: @property def out_label_columns_as_string_list(self): + """ + Returns the label column heading as a single string + """ return ",".join(self.out_label_columns) def _make_query(self): diff --git a/flowmachine/tests/test_labelled_spatial_aggregate.py b/flowmachine/tests/test_labelled_spatial_aggregate.py index 94c874d483..60dc66d7b5 100644 --- a/flowmachine/tests/test_labelled_spatial_aggregate.py +++ b/flowmachine/tests/test_labelled_spatial_aggregate.py @@ -1,5 +1,7 @@ import pytest +from flowmachine.core.dummy_query import DummyQuery +from flowmachine.features import TotalLocationEvents from flowmachine.core import make_spatial_unit from flowmachine.features import SubscriberHandsetCharacteristic from flowmachine.features.location.labelled_spatial_aggregate import ( @@ -28,7 +30,7 @@ def test_labelled_spatial_aggregate(get_dataframe): assert len(df.label_value.unique()) == 2 -def test_validation(): +def test_label_validation(): with pytest.raises(ValueError, match="not a column of"): locations = locate_subscribers( "2016-01-01", @@ -42,3 +44,46 @@ def test_validation(): labelled = LabelledSpatialAggregate( locations=locations, subscriber_labels=metric, label_columns=["foo", "bar"] ) + + +def test_loc_sub_validation(): + with pytest.raises(ValueError, match="Locations query must have a subscriber"): + _ = LabelledSpatialAggregate( + locations=TotalLocationEvents("2016-01-01", "2016-01-02"), + subscriber_labels=SubscriberHandsetCharacteristic( + "2016-01-01", "2016-01-02", characteristic="hnd_type" + ), + ) + + +def test_col_sub_validation(): + with pytest.raises(ValueError, match="cannot be a label"): + _ = LabelledSpatialAggregate( + locations=locate_subscribers( + "2016-01-01", + "2016-01-02", + spatial_unit=make_spatial_unit("admin", level=3), + method="most-common", + ), + subscriber_labels=SubscriberHandsetCharacteristic( + "2016-01-01", "2016-01-02", characteristic="hnd_type" + ), + label_columns=["subscriber"], + ) + + +class SubHavingQuery(DummyQuery): + @property + def column_names(self): + return ["subscriber"] + + +def test_spatial_unit_validation(): + dq = SubHavingQuery("foo") + with pytest.raises(ValueError, match="spatial_unit"): + _ = LabelledSpatialAggregate( + locations=dq, + subscriber_labels=SubscriberHandsetCharacteristic( + "2016-01-01", "2016-01-02", characteristic="hnd_type" + ), + ) diff --git a/flowmachine/tests/test_redacted_labelled_spatial_location.py b/flowmachine/tests/test_redacted_labelled_spatial_aggregate.py similarity index 97% rename from flowmachine/tests/test_redacted_labelled_spatial_location.py rename to flowmachine/tests/test_redacted_labelled_spatial_aggregate.py index 87fe6f67c1..b941f8c960 100644 --- a/flowmachine/tests/test_redacted_labelled_spatial_location.py +++ b/flowmachine/tests/test_redacted_labelled_spatial_aggregate.py @@ -14,7 +14,7 @@ from flowmachine.features.subscriber.daily_location import locate_subscribers -def test_no_redaction(get_dataframe): +def test_redaction(get_dataframe): locations = locate_subscribers( "2016-01-01", From 1e718bbff3d20a76d4dd38ec9aca6f366cbe4189 Mon Sep 17 00:00:00 2001 From: john Date: Wed, 8 Dec 2021 15:02:26 +0000 Subject: [PATCH 09/20] Headers + more docs --- .../features/location/labelled_spatial_aggregate.py | 3 +++ .../location/redacted_labelled_spatial_aggregate.py | 13 ++++++++++--- .../test_redacted_labelled_spatial_aggregate.py | 4 ++++ 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/flowmachine/flowmachine/features/location/labelled_spatial_aggregate.py b/flowmachine/flowmachine/features/location/labelled_spatial_aggregate.py index 090e0b3df2..693c39f8db 100644 --- a/flowmachine/flowmachine/features/location/labelled_spatial_aggregate.py +++ b/flowmachine/flowmachine/features/location/labelled_spatial_aggregate.py @@ -1,3 +1,6 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. from typing import List from flowmachine.core import Query diff --git a/flowmachine/flowmachine/features/location/redacted_labelled_spatial_aggregate.py b/flowmachine/flowmachine/features/location/redacted_labelled_spatial_aggregate.py index 23e08b0092..e159133545 100644 --- a/flowmachine/flowmachine/features/location/redacted_labelled_spatial_aggregate.py +++ b/flowmachine/flowmachine/features/location/redacted_labelled_spatial_aggregate.py @@ -1,3 +1,7 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + from typing import List from flowmachine.features.location.labelled_spatial_aggregate import ( @@ -9,11 +13,14 @@ class RedactedLabelledSpatialAggregate(GeoDataMixin, Query): """ - Query that drops and reaggregates data if any 'label' column falls below redcation_threshold + Query that drops any locations that, when disaggregated by label, reveal a number of subscribers + less than redaction_threshold Parameters ---------- - labelled_spatial_aggregate - redaction_threshold + labelled_spatial_aggregate: LabelledSpatialAggregate + The LabelledSpatialAggregate query to redact + redaction_threshold: int default 15 + If any labels within a location reveal less than this number of subscribers, that location is dropeed """ def __init__( diff --git a/flowmachine/tests/test_redacted_labelled_spatial_aggregate.py b/flowmachine/tests/test_redacted_labelled_spatial_aggregate.py index b941f8c960..fb10b40d06 100644 --- a/flowmachine/tests/test_redacted_labelled_spatial_aggregate.py +++ b/flowmachine/tests/test_redacted_labelled_spatial_aggregate.py @@ -1,3 +1,7 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + import pytest import pandas as pd From dc9007a30a5759c44c30011c910483bca28edf2c Mon Sep 17 00:00:00 2001 From: john Date: Wed, 8 Dec 2021 15:05:19 +0000 Subject: [PATCH 10/20] Changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 27dc355a5d..8f17bc9e83 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,6 +19,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). - Added `PerSubscriberAggregate` query. [#4559](https://github.com/Flowminder/FlowKit/issues/4559) - Added FlowETL QA checks 'count_imeis', 'count_imsis', 'count_locatable_location_ids', 'count_null_imeis', 'count_null_imsis', 'count_null_location_ids', 'max_msisdns_per_imei', 'max_msisdns_per_imsi', 'count_added_rows_outgoing', 'count_null_counterparts', 'count_null_durations', 'count_onnet_msisdns_incoming', 'count_onnet_msisdns_outgoing', 'count_onnet_msisdns', 'max_duration' and 'median_duration'. [#4552](https://github.com/Flowminder/FlowKit/issues/4552) - Added `FilteredReferenceLocation` query, which returns only rows where a subscriber visited a reference location the required number of times. [#4584](https://github.com/Flowminder/FlowKit/issues/4584) +- Added `LabelledSpatialAggregate` query and redaction, which sub-aggregates by subscriber labels. [#4668](https://github.com/Flowminder/FlowKit/issues/4668) ### Changed - Harmonised FlowAPI parameter names for start and end dates. They are now all `start_date` and `end_date` From 06e809d145fa455d0be74dae3942dcc065d25905 Mon Sep 17 00:00:00 2001 From: Thingus Date: Wed, 8 Dec 2021 15:32:37 +0000 Subject: [PATCH 11/20] Update flowmachine/flowmachine/features/location/labelled_spatial_aggregate.py Co-authored-by: James Harrison --- .../features/location/labelled_spatial_aggregate.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/flowmachine/flowmachine/features/location/labelled_spatial_aggregate.py b/flowmachine/flowmachine/features/location/labelled_spatial_aggregate.py index 693c39f8db..8e8080f9ba 100644 --- a/flowmachine/flowmachine/features/location/labelled_spatial_aggregate.py +++ b/flowmachine/flowmachine/features/location/labelled_spatial_aggregate.py @@ -21,7 +21,8 @@ class LabelledSpatialAggregate(GeoDataMixin, Query): label_columns: List[str] A list of columns in subscriber_labels to aggregate on - Example: + Examples + -------- >>> locations = locate_subscribers( ... "2016-01-01", From dad684f1b48ea5d91dd71bb94f53f834d0f8102f Mon Sep 17 00:00:00 2001 From: Thingus Date: Wed, 8 Dec 2021 16:24:28 +0000 Subject: [PATCH 12/20] Update flowmachine/flowmachine/features/location/labelled_spatial_aggregate.py Co-authored-by: James Harrison --- .../flowmachine/features/location/labelled_spatial_aggregate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flowmachine/flowmachine/features/location/labelled_spatial_aggregate.py b/flowmachine/flowmachine/features/location/labelled_spatial_aggregate.py index 8e8080f9ba..6e36c6ff9b 100644 --- a/flowmachine/flowmachine/features/location/labelled_spatial_aggregate.py +++ b/flowmachine/flowmachine/features/location/labelled_spatial_aggregate.py @@ -78,7 +78,7 @@ def __init__( def column_names(self) -> List[str]: return ( list(self.spatial_unit.location_id_columns) - + [f"label_{label}" for label in self.label_columns] + + self.out_label_columns + ["value"] ) From fc159d4c82cdd1f889403e1ac842adc90d658b92 Mon Sep 17 00:00:00 2001 From: Thingus Date: Wed, 8 Dec 2021 16:24:52 +0000 Subject: [PATCH 13/20] Update flowmachine/flowmachine/features/location/labelled_spatial_aggregate.py Co-authored-by: James Harrison --- .../flowmachine/features/location/labelled_spatial_aggregate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flowmachine/flowmachine/features/location/labelled_spatial_aggregate.py b/flowmachine/flowmachine/features/location/labelled_spatial_aggregate.py index 6e36c6ff9b..34b50d6e76 100644 --- a/flowmachine/flowmachine/features/location/labelled_spatial_aggregate.py +++ b/flowmachine/flowmachine/features/location/labelled_spatial_aggregate.py @@ -59,7 +59,7 @@ def __init__( ) if "subscriber" not in locations.column_names: raise ValueError(f"Locations query must have a subscriber column") - if "spatial_unit" not in dir(locations): + if not hasattr(locations, "spatial_unit"): raise ValueError(f"Locations must have a spatial_unit attribute") if "subscriber" in label_columns: raise ValueError(f"'subscriber' cannot be a label") From 33a59fca7070e0e459e95be3b993141ec08d5722 Mon Sep 17 00:00:00 2001 From: john Date: Wed, 8 Dec 2021 16:25:50 +0000 Subject: [PATCH 14/20] Changing to left join --- .../flowmachine/features/location/labelled_spatial_aggregate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flowmachine/flowmachine/features/location/labelled_spatial_aggregate.py b/flowmachine/flowmachine/features/location/labelled_spatial_aggregate.py index 34b50d6e76..9eaffe757f 100644 --- a/flowmachine/flowmachine/features/location/labelled_spatial_aggregate.py +++ b/flowmachine/flowmachine/features/location/labelled_spatial_aggregate.py @@ -109,7 +109,7 @@ def _make_query(self): {aggregate_cols}, {label_select}, count(*) AS value FROM ({self.locations.get_query()}) AS agg - JOIN + LEFT JOIN ({self.subscriber_labels.get_query()}) AS labels USING (subscriber) GROUP BY {aggregate_cols}, {label_group} From 8bc88352c1b28742bbea2e646551b9ec53685015 Mon Sep 17 00:00:00 2001 From: john Date: Wed, 8 Dec 2021 16:29:44 +0000 Subject: [PATCH 15/20] More changes from review --- .../features/location/labelled_spatial_aggregate.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/flowmachine/flowmachine/features/location/labelled_spatial_aggregate.py b/flowmachine/flowmachine/features/location/labelled_spatial_aggregate.py index 9eaffe757f..2c3fa7c858 100644 --- a/flowmachine/flowmachine/features/location/labelled_spatial_aggregate.py +++ b/flowmachine/flowmachine/features/location/labelled_spatial_aggregate.py @@ -50,13 +50,9 @@ def __init__( label_columns: List[str] = ("value",), ): - if any( - label_column not in subscriber_labels.column_names - for label_column in label_columns - ): - raise ValueError( - f"One of {label_columns} not a column of {subscriber_labels}" - ) + for label_column in label_columns: + if label_column not in subscriber_labels.column_names: + raise ValueError(f"{label_column} not a column of {subscriber_labels}") if "subscriber" not in locations.column_names: raise ValueError(f"Locations query must have a subscriber column") if not hasattr(locations, "spatial_unit"): From c758cf8ccc6a29689518868a4bf43e655229d705 Mon Sep 17 00:00:00 2001 From: john Date: Wed, 8 Dec 2021 16:37:15 +0000 Subject: [PATCH 16/20] Adding spatial_unit to redacted_labelled_spatial_aggregate.py --- .../features/location/redacted_labelled_spatial_aggregate.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/flowmachine/flowmachine/features/location/redacted_labelled_spatial_aggregate.py b/flowmachine/flowmachine/features/location/redacted_labelled_spatial_aggregate.py index e159133545..b0ddd058ff 100644 --- a/flowmachine/flowmachine/features/location/redacted_labelled_spatial_aggregate.py +++ b/flowmachine/flowmachine/features/location/redacted_labelled_spatial_aggregate.py @@ -20,7 +20,7 @@ class RedactedLabelledSpatialAggregate(GeoDataMixin, Query): labelled_spatial_aggregate: LabelledSpatialAggregate The LabelledSpatialAggregate query to redact redaction_threshold: int default 15 - If any labels within a location reveal less than this number of subscribers, that location is dropeed + If any labels within a location reveal less than this number of subscribers, that location is dropped """ def __init__( @@ -32,6 +32,7 @@ def __init__( self.labelled_spatial_aggregate = labelled_spatial_aggregate self.redaction_threshold = redaction_threshold + self.spatial_unit = labelled_spatial_aggregate.spatial_unit super().__init__() @property @@ -43,7 +44,6 @@ def _make_query(self): aggs = ",".join( self.labelled_spatial_aggregate.spatial_unit.location_id_columns ) - labels = self.labelled_spatial_aggregate.out_label_columns_as_string_list all_cols = self.labelled_spatial_aggregate.column_names_as_string_list sql = f""" From 2d48f1403da4052c9f61074cd126cb23cac3f5b5 Mon Sep 17 00:00:00 2001 From: john Date: Wed, 8 Dec 2021 17:18:38 +0000 Subject: [PATCH 17/20] Extra tests --- .../tests/test_labelled_spatial_aggregate.py | 37 ++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/flowmachine/tests/test_labelled_spatial_aggregate.py b/flowmachine/tests/test_labelled_spatial_aggregate.py index 60dc66d7b5..7e9dfb6240 100644 --- a/flowmachine/tests/test_labelled_spatial_aggregate.py +++ b/flowmachine/tests/test_labelled_spatial_aggregate.py @@ -10,7 +10,7 @@ from flowmachine.features.subscriber.daily_location import locate_subscribers -def test_labelled_spatial_aggregate(get_dataframe): +def test_one_label(get_dataframe): """ Tests disaggregation by a label query """ @@ -28,6 +28,41 @@ def test_labelled_spatial_aggregate(get_dataframe): assert len(df) == 50 assert len(df.pcod.unique()) == 25 assert len(df.label_value.unique()) == 2 + # Total number of values should equal the initial number of subscribers + assert sum(map(int, df.value.tolist())) == len(get_dataframe(metric)) + + +def test_multiple_labels(get_dataframe): + """ + Tests disaggregation by multiple labels + """ + locations = locate_subscribers( + "2016-01-01", + "2016-01-02", + spatial_unit=make_spatial_unit("admin", level=3), + method="most-common", + ) + metric = SubscriberHandsetCharacteristic( + "2016-01-01", "2016-01-02", characteristic="hnd_type" + ).join( + SubscriberHandsetCharacteristic( + "2016-01-01", "2016-01-02", characteristic="model" + ), + on_left="subscriber", + left_append="_hnd_type", + right_append="_model", + ) + labelled = LabelledSpatialAggregate( + locations=locations, + subscriber_labels=metric, + label_columns=["value_hnd_type", "value_model"], + ) + df = get_dataframe(labelled) + foo = labelled.get_query() + assert all( + df.columns == ["pcod", "label_value_hnd_type", "label_value_model", "value"] + ) + assert len(df) > 300 def test_label_validation(): From 7cad6446c6c8848babf4254401484d969d2d39de Mon Sep 17 00:00:00 2001 From: john Date: Wed, 8 Dec 2021 20:32:36 +0000 Subject: [PATCH 18/20] More extra tests --- flowmachine/tests/test_labelled_spatial_aggregate.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/flowmachine/tests/test_labelled_spatial_aggregate.py b/flowmachine/tests/test_labelled_spatial_aggregate.py index 7e9dfb6240..b7d47c61d0 100644 --- a/flowmachine/tests/test_labelled_spatial_aggregate.py +++ b/flowmachine/tests/test_labelled_spatial_aggregate.py @@ -1,5 +1,8 @@ import pytest +from pandas.testing import assert_series_equal + +from flowmachine.features.location.spatial_aggregate import SpatialAggregate from flowmachine.core.dummy_query import DummyQuery from flowmachine.features import TotalLocationEvents from flowmachine.core import make_spatial_unit @@ -29,7 +32,11 @@ def test_one_label(get_dataframe): assert len(df.pcod.unique()) == 25 assert len(df.label_value.unique()) == 2 # Total number of values should equal the initial number of subscribers - assert sum(map(int, df.value.tolist())) == len(get_dataframe(metric)) + assert df.value.sum() == len(get_dataframe(locations)) + assert ( + df.groupby("pcod").value.sum().tolist() + == get_dataframe(SpatialAggregate(locations=locations)).value.tolist() + ) def test_multiple_labels(get_dataframe): @@ -58,7 +65,6 @@ def test_multiple_labels(get_dataframe): label_columns=["value_hnd_type", "value_model"], ) df = get_dataframe(labelled) - foo = labelled.get_query() assert all( df.columns == ["pcod", "label_value_hnd_type", "label_value_model", "value"] ) From 0958ea1595d8fc3e318eca3bdfb3c97bbbb960ef Mon Sep 17 00:00:00 2001 From: john Date: Wed, 8 Dec 2021 23:21:54 +0000 Subject: [PATCH 19/20] ADR --- ...action-strategy-for-labelled-aggregates.md | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 docs/source/developer/adr/0011-redaction-strategy-for-labelled-aggregates.md diff --git a/docs/source/developer/adr/0011-redaction-strategy-for-labelled-aggregates.md b/docs/source/developer/adr/0011-redaction-strategy-for-labelled-aggregates.md new file mode 100644 index 0000000000..4807f27b40 --- /dev/null +++ b/docs/source/developer/adr/0011-redaction-strategy-for-labelled-aggregates.md @@ -0,0 +1,25 @@ +# Redaction strategy for labelled aggregation + +Date: 08-12-2021 + +# Status + +Proposed + +## Context + +At present, any rows in a spatial query are dropped if they return an aggregate of 15 subscribers or less. With new +labelling disaggregation queries being added to Flowkit, there is an increased risk of deanonymization attacks +using the increased resolution of information - we need to consider further redaction strategies to mitigate this. + +## Decision + +For any labelled spatial aggregate queries, we drop any aggregation zone that contains any disaggregation less than 15 +(for consistency with the rest of the dissagregation strategy). + +## Consequences + +This disaggregation strategy could reveal that a given zone contains less than 15 of _something_, by comparing a +disaggregated query to it's non-disaggregated counterpart. +This could lead to zones being dropped prematurely in disaggregations if a label choice leads to a small number of +subscribers appearing in a corner-case disaggregation From 6cff7cccaa1bd1e2c9be14b9450372a2740d61de Mon Sep 17 00:00:00 2001 From: john Date: Wed, 8 Dec 2021 23:23:38 +0000 Subject: [PATCH 20/20] More test --- flowmachine/tests/test_labelled_spatial_aggregate.py | 1 + 1 file changed, 1 insertion(+) diff --git a/flowmachine/tests/test_labelled_spatial_aggregate.py b/flowmachine/tests/test_labelled_spatial_aggregate.py index b7d47c61d0..1a5a062b31 100644 --- a/flowmachine/tests/test_labelled_spatial_aggregate.py +++ b/flowmachine/tests/test_labelled_spatial_aggregate.py @@ -68,6 +68,7 @@ def test_multiple_labels(get_dataframe): assert all( df.columns == ["pcod", "label_value_hnd_type", "label_value_model", "value"] ) + assert all(df.columns == labelled.column_names) assert len(df) > 300