From e361a0bf1222d65fbada9cf0d9dc0ade7d78afd8 Mon Sep 17 00:00:00 2001 From: Adrian Galvan Date: Mon, 16 Dec 2024 10:28:48 -0800 Subject: [PATCH] Improving datamap performance (#5601) --- CHANGELOG.md | 1 + .../reporting/DatamapReportTableColumns.tsx | 64 ++++--- src/fides/api/models/sql_models.py | 57 ++++--- src/fides/api/util/data_category.py | 32 +++- tests/conftest.py | 157 +++++++++++++++++- tests/ctl/core/test_privacy_declaration.py | 52 ++++-- tests/ctl/core/test_system.py | 25 ++- 7 files changed, 317 insertions(+), 71 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d49da9bca2..e782bafdeb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,6 +24,7 @@ The types of changes are: ### Fixed - Fixing quickstart.py script [#5585](https://github.com/ethyca/fides/pull/5585) +- Fixed miscellaneous performance issues with Systems and PrivacyDeclarations [#5601](https://github.com/ethyca/fides/pull/5601) ### Changed - Adjusted Ant's Select component colors and icon [#5594](https://github.com/ethyca/fides/pull/5594) diff --git a/clients/admin-ui/src/features/datamap/reporting/DatamapReportTableColumns.tsx b/clients/admin-ui/src/features/datamap/reporting/DatamapReportTableColumns.tsx index a4dd638b51..e433f89723 100644 --- a/clients/admin-ui/src/features/datamap/reporting/DatamapReportTableColumns.tsx +++ b/clients/admin-ui/src/features/datamap/reporting/DatamapReportTableColumns.tsx @@ -426,49 +426,65 @@ export const getDatamapReportColumns = ({ columnHelper.accessor((row) => row.system_undeclared_data_categories, { id: COLUMN_IDS.SYSTEM_UNDECLARED_DATA_CATEGORIES, cell: (props) => { - const value = props.getValue(); - + const cellValues = props.getValue(); + if (!cellValues || cellValues.length === 0) { + return null; + } + const values = isArray(cellValues) + ? cellValues.map((value) => { + return { label: getDataCategoryDisplayName(value), key: value }; + }) + : [ + { + label: getDataCategoryDisplayName(cellValues), + key: cellValues, + }, + ]; return ( - ); }, meta: { showHeaderMenu: !isRenaming, + showHeaderMenuWrapOption: true, width: "auto", + overflow: "hidden", }, }), columnHelper.accessor((row) => row.data_use_undeclared_data_categories, { id: COLUMN_IDS.DATA_USE_UNDECLARED_DATA_CATEGORIES, cell: (props) => { - const value = props.getValue(); - + const cellValues = props.getValue(); + if (!cellValues || cellValues.length === 0) { + return null; + } + const values = isArray(cellValues) + ? cellValues.map((value) => { + return { label: getDataCategoryDisplayName(value), key: value }; + }) + : [ + { + label: getDataCategoryDisplayName(cellValues), + key: cellValues, + }, + ]; return ( - ); }, meta: { showHeaderMenu: !isRenaming, + showHeaderMenuWrapOption: true, width: "auto", + overflow: "hidden", }, }), columnHelper.accessor((row) => row.cookies, { diff --git a/src/fides/api/models/sql_models.py b/src/fides/api/models/sql_models.py index f80abae5f9..2132ffe597 100644 --- a/src/fides/api/models/sql_models.py +++ b/src/fides/api/models/sql_models.py @@ -27,6 +27,7 @@ UniqueConstraint, case, cast, + func, select, text, type_coerce, @@ -36,7 +37,7 @@ from sqlalchemy.ext.asyncio import AsyncSession, async_object_session from sqlalchemy.ext.hybrid import hybrid_property from sqlalchemy.orm import Session, relationship -from sqlalchemy.sql import Select, func +from sqlalchemy.sql import Select from sqlalchemy.sql.elements import Case from sqlalchemy.sql.sqltypes import DateTime from typing_extensions import Protocol, runtime_checkable @@ -404,15 +405,6 @@ class System(Base, FidesBase): "Cookies", back_populates="system", lazy="selectin", uselist=True, viewonly=True ) - # index scan using ix_ctl_datasets_fides_key on ctl_datasets - datasets = relationship( - "Dataset", - primaryjoin="foreign(Dataset.fides_key)==any_(System.dataset_references)", - lazy="selectin", - uselist=True, - viewonly=True, - ) - @classmethod def get_data_uses( cls: Type[System], systems: List[System], include_parents: bool = True @@ -430,11 +422,21 @@ def get_data_uses( data_uses.add(data_use) return data_uses - @property - def undeclared_data_categories(self) -> Set[str]: + def dataset_data_categories(self, data_categories: Dict[str, Set[str]]) -> Set[str]: + aggregate = set() + for dataset_key in self.dataset_references or []: + aggregate.update(data_categories.get(dataset_key, set())) + return aggregate + + def undeclared_data_categories( + self, data_categories: Dict[str, Set[str]] + ) -> Set[str]: """ Returns a set of data categories defined on the system's datasets that are not associated with any data use (privacy declaration). + + Looks up the unique set of data categories for a given dataset from the pre-computed data_categories map. + This is done to improve performance. """ privacy_declaration_data_categories = set() @@ -443,9 +445,9 @@ def undeclared_data_categories(self) -> Set[str]: privacy_declaration.data_categories ) - system_dataset_data_categories = set() - for dataset in self.datasets: - system_dataset_data_categories.update(dataset.field_data_categories) + system_dataset_data_categories = set( + self.dataset_data_categories(data_categories) + ) return find_undeclared_categories( system_dataset_data_categories, privacy_declaration_data_categories @@ -501,13 +503,6 @@ class PrivacyDeclaration(Base): cookies = relationship( "Cookies", back_populates="privacy_declaration", lazy="joined", uselist=True ) - datasets = relationship( - "Dataset", - primaryjoin="foreign(Dataset.fides_key)==any_(PrivacyDeclaration.dataset_references)", - lazy="selectin", - uselist=True, - viewonly=True, - ) @classmethod def create( @@ -547,11 +542,21 @@ def purpose(cls) -> Case: else_=None, ) - @property - def undeclared_data_categories(self) -> Set[str]: + def dataset_data_categories(self, data_categories: Dict[str, Set[str]]) -> Set[str]: + aggregate = set() + for dataset_key in self.dataset_references or []: + aggregate.update(data_categories.get(dataset_key, set())) + return aggregate + + def undeclared_data_categories( + self, data_categories: Dict[str, Set[str]] + ) -> Set[str]: """ Aggregates a unique set of data categories across the collections in the associated datasets and returns the data categories that are not defined directly on this or any sibling privacy declarations. + + Looks up the unique set of data categories for a given dataset from the pre-computed data_categories map. + This is done to improve performance. """ # Note: This property evaluates the data categories attached to the datasets associated with this specific @@ -559,9 +564,7 @@ def undeclared_data_categories(self) -> Set[str]: # data categories across this privacy declaration and its sibling privacy declarations. # all data categories from the datasets - dataset_data_categories = set() - for dataset in self.datasets: - dataset_data_categories.update(dataset.field_data_categories) + dataset_data_categories = set(self.dataset_data_categories(data_categories)) # all data categories specified directly on this and sibling privacy declarations declared_data_categories = set() diff --git a/src/fides/api/util/data_category.py b/src/fides/api/util/data_category.py index 71c24cf366..5c533bd44f 100644 --- a/src/fides/api/util/data_category.py +++ b/src/fides/api/util/data_category.py @@ -1,14 +1,16 @@ from enum import Enum as EnumType -from typing import List, Type +from typing import Dict, List, Set, Type from fideslang.default_taxonomy import DEFAULT_TAXONOMY from fideslang.validation import FidesKey +from sqlalchemy import func, select, text from sqlalchemy.orm import Session from fides.api import common_exceptions from fides.api.models.sql_models import ( # type: ignore[attr-defined] # isort: skip DataCategory as DataCategoryDbModel, + Dataset, ) @@ -89,3 +91,31 @@ def filter_data_categories( } return sorted(list(default_categories)) return sorted(user_categories) + + +def get_data_categories_map(db: Session) -> Dict[str, Set[str]]: + """ + Returns a map of all datasets, where the keys are the fides keys + of each dataset and the value is a set of data categories associated with each dataset + """ + + subquery = ( + select( + Dataset.fides_key, + func.jsonb_array_elements_text( + text( + "jsonb_path_query(collections::jsonb, '$.** ? (@.data_categories != null).data_categories')" + ) + ).label("category"), + ).select_from(Dataset) + ).cte() + + query = ( + select( + [subquery.c.fides_key, func.array_agg(func.distinct(subquery.c.category))] + ) + .select_from(subquery) + .group_by(subquery.c.fides_key) + ) + result = db.execute(query) + return {key: set(value) if value else set() for key, value in result.all()} diff --git a/tests/conftest.py b/tests/conftest.py index d513472ee3..98e8f07f97 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1400,7 +1400,78 @@ def system_with_undeclared_data_categories(db: Session) -> System: @pytest.fixture(scope="function") -def privacy_declaration_with_dataset_references(db: Session) -> System: +def system_with_a_single_dataset_reference(db: Session) -> System: + first_dataset = CtlDataset.create_from_dataset_dict( + db, + { + "fides_key": f"dataset_key-f{uuid4()}", + "collections": [ + { + "name": "loyalty", + "fields": [ + { + "name": "id", + "data_categories": ["user.unique_id"], + }, + ], + } + ], + }, + ) + second_dataset = CtlDataset.create_from_dataset_dict( + db, + { + "fides_key": f"dataset_key-f{uuid4()}", + "collections": [ + { + "name": "customer", + "fields": [ + { + "name": "shipping_info", + "fields": [ + { + "name": "street", + "data_categories": ["user.contact.address.street"], + } + ], + }, + { + "name": "first_name", + "data_categories": ["user.name.first"], + }, + ], + }, + { + "name": "activity", + "fields": [ + { + "name": "last_login", + "data_categories": ["user.behavior"], + }, + ], + }, + ], + }, + ) + system = System.create( + db=db, + data={ + "fides_key": f"system_key-f{uuid4()}", + "name": f"system-{uuid4()}", + "description": "fixture-made-system", + "organization_fides_key": "default_organization", + "system_type": "Service", + "dataset_references": [first_dataset.fides_key, second_dataset.fides_key], + }, + ) + + return system + + +@pytest.fixture(scope="function") +def privacy_declaration_with_single_dataset_reference( + db: Session, +) -> PrivacyDeclaration: ctl_dataset = CtlDataset.create_from_dataset_dict( db, { @@ -1447,6 +1518,90 @@ def privacy_declaration_with_dataset_references(db: Session) -> System: return privacy_declaration +@pytest.fixture(scope="function") +def privacy_declaration_with_multiple_dataset_references( + db: Session, +) -> PrivacyDeclaration: + first_dataset = CtlDataset.create_from_dataset_dict( + db, + { + "fides_key": f"dataset_key-f{uuid4()}", + "collections": [ + { + "name": "loyalty", + "fields": [ + { + "name": "id", + "data_categories": ["user.unique_id"], + }, + ], + } + ], + }, + ) + second_dataset = CtlDataset.create_from_dataset_dict( + db, + { + "fides_key": f"dataset_key-f{uuid4()}", + "collections": [ + { + "name": "customer", + "fields": [ + { + "name": "shipping_info", + "fields": [ + { + "name": "street", + "data_categories": ["user.contact.address.street"], + } + ], + }, + { + "name": "first_name", + "data_categories": ["user.name.first"], + }, + ], + }, + { + "name": "activity", + "fields": [ + { + "name": "last_login", + "data_categories": ["user.behavior"], + }, + ], + }, + ], + }, + ) + system = System.create( + db=db, + data={ + "fides_key": f"system_key-f{uuid4()}", + "name": f"system-{uuid4()}", + "description": "fixture-made-system", + "organization_fides_key": "default_organization", + "system_type": "Service", + }, + ) + + privacy_declaration = PrivacyDeclaration.create( + db=db, + data={ + "name": "Collect data for third party sharing", + "system_id": system.id, + "data_categories": ["user.device.cookie_id"], + "data_use": "third_party_sharing", + "data_subjects": ["customer"], + "dataset_references": [first_dataset.fides_key, second_dataset.fides_key], + "egress": None, + "ingress": None, + }, + ) + + return privacy_declaration + + @pytest.fixture(scope="function") def system_multiple_decs(db: Session, system: System) -> Generator[System, None, None]: """ diff --git a/tests/ctl/core/test_privacy_declaration.py b/tests/ctl/core/test_privacy_declaration.py index ba2e11cdb2..0adc58949a 100644 --- a/tests/ctl/core/test_privacy_declaration.py +++ b/tests/ctl/core/test_privacy_declaration.py @@ -1,25 +1,51 @@ +from typing import Dict, Set + +import pytest + from fides.api.models.sql_models import PrivacyDeclaration +from fides.api.util.data_category import get_data_categories_map class TestPrivacyDeclaration: - def test_privacy_declaration_datasets( + + @pytest.fixture(scope="function") + def data_categories_map(self, db) -> Dict[str, Set[str]]: + return get_data_categories_map(db) + + def test_privacy_declaration_dataset_data_categories( self, - privacy_declaration_with_dataset_references: PrivacyDeclaration, - ) -> None: - assert len(privacy_declaration_with_dataset_references.datasets) == 1 + privacy_declaration_with_multiple_dataset_references: PrivacyDeclaration, + data_categories_map, + ): + assert set( + privacy_declaration_with_multiple_dataset_references.dataset_data_categories( + data_categories_map + ) + ) == { + "user.behavior", + "user.name.first", + "user.unique_id", + "user.contact.address.street", + } def test_privacy_declaration_undeclared_data_categories( - self, privacy_declaration_with_dataset_references + self, + privacy_declaration_with_single_dataset_reference: PrivacyDeclaration, + data_categories_map, ): - assert ( - privacy_declaration_with_dataset_references.undeclared_data_categories - == {"user.contact.email"} - ) + assert privacy_declaration_with_single_dataset_reference.undeclared_data_categories( + data_categories_map + ) == { + "user.contact.email" + } def test_privacy_declaration_data_category_defined_on_sibling( - self, db, privacy_declaration_with_dataset_references + self, + db, + privacy_declaration_with_single_dataset_reference: PrivacyDeclaration, + data_categories_map, ): - system = privacy_declaration_with_dataset_references.system + system = privacy_declaration_with_single_dataset_reference.system # Create a new privacy declaration with the data category we're searching for PrivacyDeclaration.create( @@ -41,6 +67,8 @@ def test_privacy_declaration_data_category_defined_on_sibling( # Check that the original privacy declaration doesn't have any undeclared data categories # because we also search sibling privacy declarations for the data category assert ( - privacy_declaration_with_dataset_references.undeclared_data_categories + privacy_declaration_with_single_dataset_reference.undeclared_data_categories( + data_categories_map + ) == set() ) diff --git a/tests/ctl/core/test_system.py b/tests/ctl/core/test_system.py index 8761c96915..e93363aef0 100644 --- a/tests/ctl/core/test_system.py +++ b/tests/ctl/core/test_system.py @@ -13,6 +13,7 @@ from fides.api.db.system import create_system, upsert_cookies from fides.api.models.sql_models import Cookies, PrivacyDeclaration from fides.api.models.sql_models import System as sql_System +from fides.api.util.data_category import get_data_categories_map from fides.config import FidesConfig from fides.connectors.models import OktaConfig from fides.core import api @@ -60,16 +61,28 @@ def test_get_system_data_uses(db, system) -> None: assert sql_System.get_data_uses([system]) == set() -def test_system_datasets(system_with_dataset_references: System) -> None: - assert len(system_with_dataset_references.datasets) == 1 +def test_system_dataset_data_categories( + db, + system_with_a_single_dataset_reference: System, +) -> None: + assert set( + system_with_a_single_dataset_reference.dataset_data_categories( + get_data_categories_map(db) + ) + ) == { + "user.behavior", + "user.contact.address.street", + "user.name.first", + "user.unique_id", + } def test_system_undeclared_data_categories( - system_with_undeclared_data_categories: System, + db, system_with_undeclared_data_categories: System ) -> None: - assert system_with_undeclared_data_categories.undeclared_data_categories == { - "user.contact.email" - } + assert system_with_undeclared_data_categories.undeclared_data_categories( + get_data_categories_map(db) + ) == {"user.contact.email"} @pytest.fixture(scope="function")