diff --git a/DEV_ENV.md b/DEV_ENV.md index e5e7fbc6966ad..e0d7c2cfbc47a 100644 --- a/DEV_ENV.md +++ b/DEV_ENV.md @@ -51,11 +51,14 @@ The dev environment is initialized with AWS Secrets/S3 data in the [scripts/setu ### Make targets for running tests in dev -| Command | Description | Notes | -| ---------------------------- | --------------------------------------------- | ----- | -| `make local-unit-test` | Run backend tests in the dev environment | | -| `make local-functional-test` | Run functional tests in the dev environment | | -| `make local-smoke-test` | Run frontend/e2e tests in the dev environment | | +| Command | Description | Notes | +|-------------------------------------|-------------------------------------------------------------------------------------------| ----- | +| `make local-unit-test` | Run backend tests in the local dockerized environment, against a mock of the persistence layer | | +| `make local-integration-test-backend` | Run backend tests in the local dockerized environment, against a dockerized database instance | | +| `make local-unit-test-processing` | Run dataset processing upload job unit tests in the 'processing' docker environment | | +| `make local-unit-test-wmg-backend` | Run wmg backend unit tests in the local dockerized environment | | +| `make local-functional-test` | Run backend functional tests in the local dockerized environment | | +| `make local-smoke-test` | Run frontend/e2e tests in the local dockerized environment | | ### External dependencies diff --git a/backend/Makefile b/backend/Makefile index 89b3f75beba96..cc77fecf368f3 100644 --- a/backend/Makefile +++ b/backend/Makefile @@ -77,9 +77,9 @@ db/local/load-schema: # Imports the corpora_dev.sqlc schema (schema ONLY) into the corpora_test database # Usage: make db/local/load-schema INFILE= $(eval DB_PW = $(shell aws secretsmanager get-secret-value --secret-id corpora/backend/test/database --region us-west-2 | jq -r '.SecretString | match(":([^:]*)@").captures[0].string')) - PGPASSWORD=${DB_PW} pg_restore --schema-only --clean --no-owner --host 0.0.0.0 --dbname corpora $(INFILE) + PGPASSWORD=${DB_PW} pg_restore --schema-only --schema=persistence_schema --clean --no-owner --host 0.0.0.0 --username corpora --dbname corpora $(INFILE) # Also import alembic schema version - PGPASSWORD=${DB_PW} pg_restore --data-only --table=alembic_version --no-owner --host 0.0.0.0 --dbname corpora $(INFILE) + PGPASSWORD=${DB_PW} pg_restore --data-only --table=alembic_version --no-owner --host 0.0.0.0 --username corpora --dbname corpora $(INFILE) db/dump_schema: ifeq ($(DEPLOYMENT_STAGE),test) diff --git a/backend/database/README.md b/backend/database/README.md index 78ec150a0c23f..ff1fc551a59bb 100644 --- a/backend/database/README.md +++ b/backend/database/README.md @@ -16,7 +16,7 @@ 5. In the generated file, update the `Revision ID` and the `revision` (used by Alembic) to include the migration count. For example `Revision ID: a8cd0dc08805` becomes `Revision ID: 18_a8cd0dc08805` and `revision = "a8cd0dc08805"` becomes `revision = "18_a8cd0dc08805"` 6. [Test your migration](#test-a-migration) -7. Check that [corpora_orm.py](../common/corpora_orm.py) matches up with your changes. +7. Check that [orm.py](../layers/persistence/orm.py) matches up with your changes. 8. Once you've completed the changes, create a PR to get the functions reviewed. 9. Once the PR is merged, migrations will be run as part of the deployment process to each env. 10. 
[Connect to Remote RDS](#connect-to-remote-rds) to single-cell-dev @@ -29,7 +29,7 @@ DEPLOYMENT_STAGE=test make db/migrate ## How to autogenerate migration script -1. Make changes to the ORM class(es) in [corpora_orm.py](../common/corpora_orm.py) +1. Make changes to the ORM class(es) in [orm.py](../layers/persistence/orm.py) 2. [Connect to Remote RDS](#connect-to-remote-rds). Note, generally, you would be connecting to prod (`AWS_PROFILE=single-cell-prod DEPLOYMENT_STAGE=prod`) since we want to generate a migration from the database schema currently deployed in prod. However, if there are migrations haven't been @@ -50,8 +50,7 @@ AWS_PROFILE=single-cell-{dev,prod} DEPLOYMENT_STAGE={dev,staging,prod} CORPORA_L The following steps will test that a migration script works on a local database using data downloaded from a deployed database. -1. [Connect to Remote RDS](#connect-to-remote-rds) -2. Open a new terminal and using the same values for `AWS_PROFILE` and `DEPLOYMENT_STAGE`, download the remote dev database schema: +1. Open a new terminal and, using the same values for `AWS_PROFILE` and `DEPLOYMENT_STAGE`, download the remote dev database schema: ```shell cd $REPO_ROOT/backend @@ -60,15 +59,15 @@ AWS_PROFILE=single-cell-{dev,prod} DEPLOYMENT_STAGE={dev,staging,prod} make db/d This will download the database to `$REPO_ROOT/backend/corpora_dev.sqlc`. -3. The tunnel to dev should close automatically (but worth verifying `ps ax | grep ssh`) -4. Start the local database environment: +2. The tunnel to dev should close automatically (but it's worth verifying with `ps ax | grep ssh`) +3. Start the local database environment: ```shell cd $REPO_ROOT make local-start ``` -5. Import the remote database schema into your local database: +4. Import the remote database schema into your local database: ```shell cd $REPO_ROOT/backend @@ -85,7 +84,7 @@ You may need to run this a few times, until there are no significant errors. - Note: `pg_restore: error: could not execute query: ERROR: role "rdsadmin" does not exist` is not a significant error -6. Run the migration test: +5. Run the migration test: ```shell AWS_PROFILE=single-cell-{dev,prod} DEPLOYMENT_STAGE=test make db/test_migration @@ -99,7 +98,7 @@ This test will: 1. Dump the schema (after) 1. Compare the before vs after schemas. These should be identical if the database migration's `upgrade()` and `downgrade()` functions were implemented correctly. -If there are no differences then the test passed. If the test didn't pass, make adjustments to your migration script and restart from step 5. Repeat until there are no errors. +If there are no differences, the test passed. If the test didn't pass, make adjustments to your migration script and restart from step 4. Repeat until there are no errors. ## Connect to Remote RDS diff --git a/backend/database/versions/34_2be441104b48_standardize_db_fields.py b/backend/database/versions/34_2be441104b48_standardize_db_fields.py new file mode 100644 index 0000000000000..8a4255f204827 --- /dev/null +++ b/backend/database/versions/34_2be441104b48_standardize_db_fields.py @@ -0,0 +1,60 @@ +"""standardize db fields + +Revision ID: 34_2be441104b48 +Revises: 33_c5aaf6e2ca9e +Create Date: 2023-01-09 16:40:13.184800 + +""" +from alembic import op + +# revision identifiers, used by Alembic. +revision = "34_2be441104b48" +down_revision = "33_c5aaf6e2ca9e" +branch_labels = None +depends_on = None + + +def upgrade(): + # ### commands auto generated by Alembic - please adjust! 
### + op.alter_column("Collection", "tombstoned", new_column_name="tombstone", schema="persistence_schema") + op.alter_column("CollectionVersion", "version_id", new_column_name="id", schema="persistence_schema") + op.alter_column("CollectionVersion", "metadata", new_column_name="collection_metadata", schema="persistence_schema") + op.alter_column("Dataset", "dataset_id", new_column_name="id", schema="persistence_schema") + op.alter_column("Dataset", "dataset_version_id", new_column_name="version_id", schema="persistence_schema") + op.alter_column("DatasetVersion", "version_id", new_column_name="id", schema="persistence_schema") + op.alter_column("DatasetVersion", "metadata", new_column_name="dataset_metadata", schema="persistence_schema") + op.drop_constraint( + "DatasetVersion_dataset_id_fkey", "DatasetVersion", schema="persistence_schema", type_="foreignkey" + ) + op.create_foreign_key( + "DatasetVersion_id_fkey", + "DatasetVersion", + "Dataset", + ["dataset_id"], + ["id"], + source_schema="persistence_schema", + referent_schema="persistence_schema", + ) + # ### end Alembic commands ### + + +def downgrade(): + # ### commands auto generated by Alembic - please adjust! ### + op.alter_column("Collection", "tombstone", new_column_name="tombstoned", schema="persistence_schema") + op.alter_column("CollectionVersion", "id", new_column_name="version_id", schema="persistence_schema") + op.alter_column("CollectionVersion", "collection_metadata", new_column_name="metadata", schema="persistence_schema") + op.alter_column("Dataset", "id", new_column_name="dataset_id", schema="persistence_schema") + op.alter_column("Dataset", "version_id", new_column_name="dataset_version_id", schema="persistence_schema") + op.alter_column("DatasetVersion", "id", new_column_name="version_id", schema="persistence_schema") + op.alter_column("DatasetVersion", "dataset_metadata", new_column_name="metadata", schema="persistence_schema") + op.drop_constraint("DatasetVersion_id_fkey", "DatasetVersion", schema="persistence_schema", type_="foreignkey") + op.create_foreign_key( + "DatasetVersion_dataset_id_fkey", + "DatasetVersion", + "Dataset", + ["dataset_id"], + ["dataset_id"], + source_schema="persistence_schema", + referent_schema="persistence_schema", + ) + # ### end Alembic commands ### diff --git a/backend/layers/common/entities.py b/backend/layers/common/entities.py index eec8ca5d64e0a..e3d2da39c221d 100644 --- a/backend/layers/common/entities.py +++ b/backend/layers/common/entities.py @@ -4,6 +4,8 @@ from typing import List, Optional from urllib.parse import urlparse +import uuid + from dataclasses_json import dataclass_json @@ -99,43 +101,34 @@ def empty(): @dataclass -class CollectionId: +class EntityId: id: str - def __repr__(self) -> str: - return self.id - - -@dataclass -class CollectionVersionId: - id: str + def __init__(self, entity_id: str = None): + self.id = str(entity_id) if entity_id is not None else str(uuid.uuid4()) def __repr__(self) -> str: return self.id -@dataclass -class DatasetId: - id: str +class CollectionId(EntityId): + pass - def __repr__(self) -> str: - return self.id +class CollectionVersionId(EntityId): + pass -@dataclass -class DatasetVersionId: - id: str - def __repr__(self) -> str: - return self.id +class DatasetId(EntityId): + pass -@dataclass -class DatasetArtifactId: - id: str +class DatasetVersionId(EntityId): + pass - def __repr__(self) -> str: - return self.id + +class DatasetArtifactId(EntityId): + pass @dataclass diff --git a/backend/layers/persistence/orm.py 
b/backend/layers/persistence/orm.py index f41edb6273c1b..9c24dcf02a9f1 100644 --- a/backend/layers/persistence/orm.py +++ b/backend/layers/persistence/orm.py @@ -1,4 +1,4 @@ -from sqlalchemy import Column, DateTime, Enum, ForeignKey, String, Table +from sqlalchemy import Column, DateTime, Enum, ForeignKey, String from sqlalchemy.dialects.postgresql import ARRAY, BOOLEAN, JSON, UUID from sqlalchemy.orm import registry from sqlalchemy.schema import MetaData @@ -11,71 +11,61 @@ @mapper_registry.mapped -class Collection: +class CollectionTable: - __table__ = Table( - "Collection", - mapper_registry.metadata, - Column("id", UUID(as_uuid=True), primary_key=True), - Column("version_id", UUID(as_uuid=True)), - Column("originally_published_at", DateTime), - Column("tombstoned", BOOLEAN), - ) + __tablename__ = "Collection" + + id = Column(UUID(as_uuid=True), primary_key=True) + version_id = Column(UUID(as_uuid=True)) + originally_published_at = Column(DateTime) + tombstone = Column(BOOLEAN) @mapper_registry.mapped -class CollectionVersion: - - __table__ = Table( - "CollectionVersion", - mapper_registry.metadata, - Column("version_id", UUID(as_uuid=True), primary_key=True), - Column("collection_id", UUID(as_uuid=True)), - Column("metadata", JSON), - Column("owner", String), - Column("curator_name", String), - Column("publisher_metadata", JSON), - Column("published_at", DateTime), - Column("created_at", DateTime), - Column("datasets", ARRAY(UUID(as_uuid=True))), - ) +class CollectionVersionTable: + + __tablename__ = "CollectionVersion" + + id = Column(UUID(as_uuid=True), primary_key=True) + collection_id = Column(UUID(as_uuid=True)) + collection_metadata = Column(JSON) + owner = Column(String) + curator_name = Column(String) + publisher_metadata = Column(JSON) + published_at = Column(DateTime) + created_at = Column(DateTime) + datasets = Column(ARRAY(UUID(as_uuid=True))) @mapper_registry.mapped -class Dataset: +class DatasetTable: + + __tablename__ = "Dataset" - __table__ = Table( - "Dataset", - mapper_registry.metadata, - Column("dataset_id", UUID(as_uuid=True), primary_key=True), - Column("dataset_version_id", UUID(as_uuid=True)), - Column("published_at", DateTime), - ) + id = Column(UUID(as_uuid=True), primary_key=True) + version_id = Column(UUID(as_uuid=True)) + published_at = Column(DateTime) @mapper_registry.mapped -class DatasetVersion: +class DatasetVersionTable: - __table__ = Table( - "DatasetVersion", - mapper_registry.metadata, - Column("version_id", UUID(as_uuid=True), primary_key=True), - Column("dataset_id", UUID(as_uuid=True), ForeignKey("Dataset.dataset_id")), - Column("collection_id", UUID(as_uuid=True)), - Column("created_at", DateTime), - Column("metadata", JSON), - Column("artifacts", ARRAY(UUID(as_uuid=True))), - Column("status", JSON), - ) + __tablename__ = "DatasetVersion" + + id = Column(UUID(as_uuid=True), primary_key=True) + dataset_id = Column(UUID(as_uuid=True), ForeignKey("Dataset.id")) + collection_id = Column(UUID(as_uuid=True)) + created_at = Column(DateTime) + dataset_metadata = Column(JSON) + artifacts = Column(ARRAY(UUID(as_uuid=True))) + status = Column(JSON) @mapper_registry.mapped -class DatasetArtifact: - - __table__ = Table( - "DatasetArtifact", - mapper_registry.metadata, - Column("id", UUID(as_uuid=True), primary_key=True), - Column("type", Enum(DatasetArtifactType)), - Column("uri", String), - ) +class DatasetArtifactTable: + + __tablename__ = "DatasetArtifact" + + id = Column(UUID(as_uuid=True), primary_key=True) + type = Column(Enum(DatasetArtifactType)) + 
uri = Column(String) diff --git a/backend/layers/persistence/persistence.py b/backend/layers/persistence/persistence.py index 9b5cbb95b8843..93fa51564afdc 100644 --- a/backend/layers/persistence/persistence.py +++ b/backend/layers/persistence/persistence.py @@ -34,11 +34,11 @@ from backend.layers.business.exceptions import CollectionIsPublishedException from backend.layers.persistence.constants import SCHEMA_NAME from backend.layers.persistence.orm import ( - Collection as CollectionTable, - CollectionVersion as CollectionVersionTable, - Dataset as DatasetTable, - DatasetArtifact as DatasetArtifactTable, - DatasetVersion as DatasetVersionTable, + CollectionTable, + CollectionVersionTable, + DatasetTable, + DatasetArtifactTable, + DatasetVersionTable, ) from backend.layers.persistence.persistence_interface import DatabaseProviderInterface, PersistenceException @@ -93,17 +93,13 @@ def _manage_session(self, **kwargs): if session is not None: session.close() - @staticmethod - def _generate_id(): - return str(uuid.uuid4()) - def _row_to_collection_version(self, row: Any, canonical_collection: CanonicalCollection) -> CollectionVersion: return CollectionVersion( collection_id=CollectionId(str(row.collection_id)), - version_id=CollectionVersionId(str(row.version_id)), + version_id=CollectionVersionId(str(row.id)), owner=row.owner, curator_name=row.curator_name, - metadata=CollectionMetadata.from_json(row.metadata), + metadata=CollectionMetadata.from_json(row.collection_metadata), publisher_metadata=None if row.publisher_metadata is None else json.loads(row.publisher_metadata), datasets=[DatasetVersionId(str(id)) for id in row.datasets], published_at=row.published_at, @@ -116,10 +112,10 @@ def _row_to_collection_version_with_datasets( ) -> CollectionVersionWithDatasets: return CollectionVersionWithDatasets( collection_id=CollectionId(str(row.collection_id)), - version_id=CollectionVersionId(str(row.version_id)), + version_id=CollectionVersionId(str(row.id)), owner=row.owner, curator_name=row.curator_name, - metadata=CollectionMetadata.from_json(row.metadata), + metadata=CollectionMetadata.from_json(row.collection_metadata), publisher_metadata=None if row.publisher_metadata is None else json.loads(row.publisher_metadata), datasets=datasets, published_at=row.published_at, @@ -139,13 +135,13 @@ def _row_to_dataset_version(self, row: Any, canonical_dataset: CanonicalDataset, status = row.status else: status = DatasetStatus.from_json(row.status) - if type(row.metadata) is DatasetMetadata or row.metadata is None: - metadata = row.metadata + if type(row.dataset_metadata) is DatasetMetadata or row.dataset_metadata is None: + metadata = row.dataset_metadata else: - metadata = DatasetMetadata.from_json(row.metadata) + metadata = DatasetMetadata.from_json(row.dataset_metadata) return DatasetVersion( DatasetId(str(row.dataset_id)), - DatasetVersionId(str(row.version_id)), + DatasetVersionId(str(row.id)), CollectionId(str(row.collection_id)), status, metadata, @@ -171,15 +167,15 @@ def get_canonical_collection(self, collection_id: CollectionId) -> CanonicalColl CollectionId(str(collection.id)), None if collection.version_id is None else CollectionVersionId(str(collection.version_id)), collection.originally_published_at, - collection.tombstoned, + collection.tombstone, ) def get_canonical_dataset(self, dataset_id: DatasetId) -> CanonicalDataset: with self._manage_session() as session: - dataset = session.query(DatasetTable).filter_by(dataset_id=dataset_id.id).one_or_none() + dataset = 
session.query(DatasetTable).filter_by(id=dataset_id.id).one_or_none() if dataset is None: return None - return CanonicalDataset(dataset_id, DatasetVersionId(str(dataset.dataset_version_id)), dataset.published_at) + return CanonicalDataset(dataset_id, DatasetVersionId(str(dataset.version_id)), dataset.published_at) def create_canonical_collection( self, owner: str, curator_name: str, collection_metadata: CollectionMetadata @@ -188,22 +184,22 @@ def create_canonical_collection( Creates a new canonical collection, generating a canonical collection_id and a new version_id. Returns the newly created CollectionVersion """ - collection_id = CollectionId((self._generate_id())) - version_id = CollectionVersionId((self._generate_id())) + collection_id = CollectionId() + version_id = CollectionVersionId() now = datetime.utcnow() canonical_collection = CollectionTable( id=collection_id.id, version_id=None, - tombstoned=False, + tombstone=False, originally_published_at=None, ) collection_version_row = CollectionVersionTable( + id=version_id.id, collection_id=collection_id.id, - version_id=version_id.id, owner=owner, curator_name=curator_name, - metadata=collection_metadata.to_json(), + collection_metadata=collection_metadata.to_json(), publisher_metadata=None, published_at=None, created_at=now, @@ -223,7 +219,7 @@ def get_collection_version(self, version_id: CollectionVersionId) -> CollectionV Retrieves a specific collection version by id """ with self._manage_session() as session: - collection_version = session.query(CollectionVersionTable).filter_by(version_id=version_id.id).one_or_none() + collection_version = session.query(CollectionVersionTable).filter_by(id=version_id.id).one_or_none() if collection_version is None: return None collection_id = CollectionId(str(collection_version.collection_id)) @@ -239,7 +235,7 @@ def get_collection_version_with_datasets(self, version_id: CollectionVersionId) Retrieves a specific collection version by id, with datasets """ with self._manage_session() as session: - collection_version = session.query(CollectionVersionTable).filter_by(version_id=version_id.id).one_or_none() + collection_version = session.query(CollectionVersionTable).filter_by(id=version_id.id).one_or_none() if collection_version is None: return None collection_id = CollectionId(str(collection_version.collection_id)) @@ -257,7 +253,7 @@ def get_collection_mapped_version(self, collection_id: CollectionId) -> Optional if version_id is None or version_id[0] is None: return None version_id = version_id[0] - collection_version = session.query(CollectionVersionTable).filter_by(version_id=version_id).one() + collection_version = session.query(CollectionVersionTable).filter_by(id=version_id).one() canonical_collection = self.get_canonical_collection(collection_id) datasets = self._get_datasets([DatasetVersionId(str(id)) for id in collection_version.datasets]) return self._row_to_collection_version_with_datasets(collection_version, canonical_collection, datasets) @@ -292,7 +288,7 @@ def get_all_collections_versions(self) -> Iterable[CollectionVersion]: CollectionId(str(collection_row.id)), None if collection_row.version_id is None else CollectionVersionId(str(collection_row.version_id)), collection_row.originally_published_at, - collection_row.tombstoned, + collection_row.tombstone, ) result = [] @@ -314,25 +310,23 @@ def get_all_mapped_collection_versions(self, get_tombstoned: bool = False) -> It canonical_collections = ( session.query(CollectionTable) .filter(CollectionTable.version_id.isnot(None)) - 
.filter_by(tombstoned=False) + .filter_by(tombstone=False) .all() ) mapped_version_ids = [i.version_id for i in canonical_collections] versions = ( - session.query(CollectionVersionTable) - .filter(CollectionVersionTable.version_id.in_(mapped_version_ids)) - .all() + session.query(CollectionVersionTable).filter(CollectionVersionTable.id.in_(mapped_version_ids)).all() ) # noqa for version in versions: # TODO: should be optimized using a map - canonical_row = next(cc for cc in canonical_collections if cc.version_id == version.version_id) + canonical_row = next(cc for cc in canonical_collections if cc.version_id == version.id) canonical = CanonicalCollection( CollectionId(str(canonical_row.id)), CollectionVersionId(str(canonical_row.version_id)), canonical_row.originally_published_at, - canonical_row.tombstoned, + canonical_row.tombstone, ) yield self._row_to_collection_version(version, canonical) @@ -343,7 +337,7 @@ def delete_canonical_collection(self, collection_id: CollectionId) -> None: with self._manage_session() as session: canonical_collection = session.query(CollectionTable).filter_by(id=collection_id.id).one_or_none() if canonical_collection: - canonical_collection.tombstoned = True + canonical_collection.tombstone = True def save_collection_metadata( self, version_id: CollectionVersionId, collection_metadata: CollectionMetadata @@ -352,8 +346,8 @@ def save_collection_metadata( Saves collection metadata for a collection version """ with self._manage_session() as session: - version = session.query(CollectionVersionTable).filter_by(version_id=version_id.id).one() - version.metadata = collection_metadata.to_json() + version = session.query(CollectionVersionTable).filter_by(id=version_id.id).one() + version.collection_metadata = collection_metadata.to_json() def save_collection_publisher_metadata( self, version_id: CollectionVersionId, publisher_metadata: Optional[dict] @@ -362,7 +356,7 @@ def save_collection_publisher_metadata( Saves publisher metadata for a collection version. Specify None to remove it """ with self._manage_session() as session: - version = session.query(CollectionVersionTable).filter_by(version_id=version_id.id).one() + version = session.query(CollectionVersionTable).filter_by(id=version_id.id).one() version.publisher_metadata = json.dumps(publisher_metadata) def add_collection_version(self, collection_id: CollectionId) -> CollectionVersionId: @@ -373,12 +367,12 @@ def add_collection_version(self, collection_id: CollectionId) -> CollectionVersi """ with self._manage_session() as session: current_version_id = session.query(CollectionTable.version_id).filter_by(id=collection_id.id).one()[0] - current_version = session.query(CollectionVersionTable).filter_by(version_id=current_version_id).one() - new_version_id = self._generate_id() + current_version = session.query(CollectionVersionTable).filter_by(id=current_version_id).one() + new_version_id = CollectionVersionId() new_version = CollectionVersionTable( - version_id=new_version_id, + id=new_version_id.id, collection_id=collection_id.id, - metadata=current_version.metadata, + collection_metadata=current_version.collection_metadata, owner=current_version.owner, curator_name=current_version.curator_name, publisher_metadata=current_version.publisher_metadata, @@ -394,7 +388,7 @@ def delete_collection_version(self, version_id: CollectionVersionId) -> None: Deletes a collection version, if it is unpublished. 
""" with self._manage_session() as session: - version = session.query(CollectionVersionTable).filter_by(version_id=version_id.id).one_or_none() + version = session.query(CollectionVersionTable).filter_by(id=version_id.id).one_or_none() if version: if version.published_at: raise CollectionIsPublishedException(f"Published Collection Version {version_id} cannot be deleted") @@ -407,53 +401,31 @@ def finalize_collection_version( published_at: Optional[datetime] = None, ) -> None: """ - Finalizes a collection version. This is equivalent to calling: - 1. update_collection_version_mapping - 2. set_collection_version_published_at - 3. finalize_dataset_versions + Finalizes a collection version """ published_at = published_at if published_at else datetime.utcnow() - self.update_collection_version_mapping(collection_id, version_id, published_at) - self.set_collection_version_published_at(version_id, published_at) - self.finalize_dataset_versions(version_id, published_at) - - def update_collection_version_mapping( - self, collection_id: CollectionId, version_id: CollectionVersionId, published_at: datetime - ) -> None: - """ - Updates the mapping between the canonical collection `collection_id` and its `version_id` - """ with self._manage_session() as session: + # update canonical collection -> collection version mapping collection = session.query(CollectionTable).filter_by(id=collection_id.id).one() collection.version_id = version_id.id + + # update canonical collection if this is its first publish if collection.originally_published_at is None: collection.originally_published_at = published_at - def set_collection_version_published_at(self, version_id: CollectionVersionId, published_at: datetime) -> None: - """ - Sets the `published_at` datetime for a collection version - """ - with self._manage_session() as session: - collection_version = session.query(CollectionVersionTable).filter_by(version_id=version_id.id).one() + # update collection version + collection_version = session.query(CollectionVersionTable).filter_by(id=version_id.id).one() collection_version.published_at = published_at - def finalize_dataset_versions(self, version_id: CollectionVersionId, published_at: datetime) -> None: - """ - 1. Updates the mapping between the canonical dataset and the latest published dataset version for each dataset - in a CollectionVersion - 2. Sets the each canonical dataset's 'published_at' if not previously set - """ - with self._manage_session() as session: - dataset_version_ids = ( - session.query(CollectionVersionTable.datasets).filter_by(version_id=version_id.id).one()[0] - ) + # finalize collection version's dataset versions + dataset_version_ids = session.query(CollectionVersionTable.datasets).filter_by(id=version_id.id).one()[0] for dataset_version, dataset in ( session.query(DatasetVersionTable, DatasetTable) - .filter(DatasetVersionTable.dataset_id == DatasetTable.dataset_id) - .filter(DatasetVersionTable.version_id.in_(dataset_version_ids)) + .filter(DatasetVersionTable.dataset_id == DatasetTable.id) + .filter(DatasetVersionTable.id.in_(dataset_version_ids)) .all() ): - dataset.version_id = dataset_version.version_id + dataset.version_id = dataset_version.id if dataset.published_at is None: dataset.published_at = published_at @@ -462,9 +434,7 @@ def get_dataset_version(self, dataset_version_id: DatasetVersionId) -> DatasetVe Returns a dataset version by id. 
""" with self._manage_session() as session: - dataset_version = ( - session.query(DatasetVersionTable).filter_by(version_id=dataset_version_id.id).one_or_none() - ) + dataset_version = session.query(DatasetVersionTable).filter_by(id=dataset_version_id.id).one_or_none() if dataset_version is None: return None return self._hydrate_dataset_version(dataset_version) @@ -492,7 +462,7 @@ def get_all_datasets(self) -> Iterable[DatasetVersion]: acc = [] with self._manage_session() as session: for version in session.query(DatasetVersionTable).all(): # noqa - if str(version.version_id) in active_datasets: + if str(version.id) in active_datasets: acc.append(self._hydrate_dataset_version(version)) return acc @@ -525,20 +495,16 @@ def create_canonical_dataset(self, collection_version_id: CollectionVersionId) - """ with self._manage_session() as session: collection_id = ( - session.query(CollectionVersionTable.collection_id) - .filter_by(version_id=collection_version_id.id) - .one()[0] + session.query(CollectionVersionTable.collection_id).filter_by(id=collection_version_id.id).one()[0] ) - dataset_id = DatasetId(self._generate_id()) - dataset_version_id = DatasetVersionId(self._generate_id()) - canonical_dataset = DatasetTable( - dataset_id=dataset_id.id, dataset_version_id=dataset_version_id.id, published_at=None - ) + dataset_id = DatasetId() + dataset_version_id = DatasetVersionId() + canonical_dataset = DatasetTable(id=dataset_id.id, version_id=dataset_version_id.id, published_at=None) dataset_version = DatasetVersionTable( - version_id=dataset_version_id.id, + id=dataset_version_id.id, dataset_id=dataset_id.id, collection_id=collection_id, - metadata=None, + dataset_metadata=None, artifacts=list(), status=DatasetStatus.empty().to_json(), created_at=datetime.utcnow(), @@ -557,11 +523,11 @@ def add_dataset_artifact( """ Adds a dataset artifact to an existing dataset version. """ - artifact_id = DatasetArtifactId(self._generate_id()) + artifact_id = DatasetArtifactId() artifact = DatasetArtifactTable(id=artifact_id.id, type=artifact_type, uri=artifact_uri) with self._manage_session() as session: session.add(artifact) - dataset_version = session.query(DatasetVersionTable).filter_by(version_id=version_id.id).one() + dataset_version = session.query(DatasetVersionTable).filter_by(id=version_id.id).one() artifacts = list(dataset_version.artifacts) artifacts.append(uuid.UUID(artifact_id.id)) dataset_version.artifacts = artifacts @@ -572,7 +538,7 @@ def update_dataset_processing_status(self, version_id: DatasetVersionId, status: Updates the processing status for a dataset version. """ with self._manage_session() as session: - dataset_version = session.query(DatasetVersionTable).filter_by(version_id=version_id.id).one() + dataset_version = session.query(DatasetVersionTable).filter_by(id=version_id.id).one() dataset_version_status = json.loads(dataset_version.status) dataset_version_status["processing_status"] = status.value dataset_version.status = json.dumps(dataset_version_status) @@ -582,7 +548,7 @@ def update_dataset_validation_status(self, version_id: DatasetVersionId, status: Updates the validation status for a dataset version. 
""" with self._manage_session() as session: - dataset_version = session.query(DatasetVersionTable).filter_by(version_id=version_id.id).one() + dataset_version = session.query(DatasetVersionTable).filter_by(id=version_id.id).one() dataset_version_status = json.loads(dataset_version.status) dataset_version_status["validation_status"] = status.value dataset_version.status = json.dumps(dataset_version_status) @@ -592,7 +558,7 @@ def update_dataset_upload_status(self, version_id: DatasetVersionId, status: Dat Updates the upload status for a dataset version. """ with self._manage_session() as session: - dataset_version = session.query(DatasetVersionTable).filter_by(version_id=version_id.id).one() + dataset_version = session.query(DatasetVersionTable).filter_by(id=version_id.id).one() dataset_version_status = json.loads(dataset_version.status) dataset_version_status["upload_status"] = status.value dataset_version.status = json.dumps(dataset_version_status) @@ -604,14 +570,14 @@ def update_dataset_conversion_status( Updates the conversion status for a dataset version and for `status_type` """ with self._manage_session() as session: - dataset_version = session.query(DatasetVersionTable).filter_by(version_id=version_id.id).one() + dataset_version = session.query(DatasetVersionTable).filter_by(id=version_id.id).one() dataset_version_status = json.loads(dataset_version.status) dataset_version_status[status_type] = status.value dataset_version.status = json.dumps(dataset_version_status) def update_dataset_validation_message(self, version_id: DatasetVersionId, validation_message: str) -> None: with self._manage_session() as session: - dataset_version = session.query(DatasetVersionTable).filter_by(version_id=version_id.id).one() + dataset_version = session.query(DatasetVersionTable).filter_by(id=version_id.id).one() dataset_version_status = json.loads(dataset_version.status) dataset_version_status["validation_message"] = validation_message dataset_version.status = json.dumps(dataset_version_status) @@ -621,7 +587,7 @@ def get_dataset_version_status(self, version_id: DatasetVersionId) -> DatasetSta Returns the status for a dataset version """ with self._manage_session() as session: - status = session.query(DatasetVersionTable.status).filter_by(version_id=version_id.id).one() + status = session.query(DatasetVersionTable.status).filter_by(id=version_id.id).one() return DatasetStatus.from_json(status[0]) def set_dataset_metadata(self, version_id: DatasetVersionId, metadata: DatasetMetadata) -> None: @@ -629,8 +595,8 @@ def set_dataset_metadata(self, version_id: DatasetVersionId, metadata: DatasetMe Sets the metadata for a dataset version """ with self._manage_session() as session: - dataset_version = session.query(DatasetVersionTable).filter_by(version_id=version_id.id).one() - dataset_version.metadata = metadata.to_json() + dataset_version = session.query(DatasetVersionTable).filter_by(id=version_id.id).one() + dataset_version.dataset_metadata = metadata.to_json() def add_dataset_to_collection_version_mapping( self, collection_version_id: CollectionVersionId, dataset_version_id: DatasetVersionId @@ -639,9 +605,7 @@ def add_dataset_to_collection_version_mapping( Adds a mapping between an existing collection version and a dataset version """ with self._manage_session() as session: - collection_version = ( - session.query(CollectionVersionTable).filter_by(version_id=collection_version_id.id).one() - ) + collection_version = session.query(CollectionVersionTable).filter_by(id=collection_version_id.id).one() # 
TODO: alternatively use postgres `array_append` # TODO: make sure that the UUID conversion works updated_datasets = list(collection_version.datasets) @@ -655,9 +619,7 @@ def delete_dataset_from_collection_version( Removes a mapping between a collection version and a dataset version """ with self._manage_session() as session: - collection_version = ( - session.query(CollectionVersionTable).filter_by(version_id=collection_version_id.id).one() - ) + collection_version = session.query(CollectionVersionTable).filter_by(id=collection_version_id.id).one() # TODO: alternatively use postgres `array_remove` updated_datasets = list(collection_version.datasets) updated_datasets.remove(uuid.UUID(dataset_version_id.id)) @@ -672,19 +634,15 @@ def replace_dataset_in_collection_version( # TODO: this method should probably be split into multiple - it contains too much logic with self._manage_session() as session: collection_id = ( - session.query(CollectionVersionTable.collection_id) - .filter_by(version_id=collection_version_id.id) - .one()[0] + session.query(CollectionVersionTable.collection_id).filter_by(id=collection_version_id.id).one()[0] ) # noqa - dataset_id = ( - session.query(DatasetVersionTable.dataset_id).filter_by(version_id=old_dataset_version_id.id).one()[0] - ) - new_dataset_version_id = DatasetVersionId(self._generate_id()) + dataset_id = session.query(DatasetVersionTable.dataset_id).filter_by(id=old_dataset_version_id.id).one()[0] + new_dataset_version_id = DatasetVersionId() new_dataset_version = DatasetVersionTable( - version_id=new_dataset_version_id.id, + id=new_dataset_version_id.id, dataset_id=dataset_id, collection_id=collection_id, - metadata=None, + dataset_metadata=None, artifacts=list(), status=DatasetStatus.empty().to_json(), created_at=datetime.utcnow(), @@ -692,7 +650,7 @@ def replace_dataset_in_collection_version( session.add(new_dataset_version) collection_version = ( - session.query(CollectionVersionTable).filter_by(version_id=collection_version_id.id).one() + session.query(CollectionVersionTable).filter_by(id=collection_version_id.id).one() ) # noqa # This replaces the dataset while preserving the order of datasets datasets = list(collection_version.datasets) @@ -707,13 +665,11 @@ def get_dataset_mapped_version(self, dataset_id: DatasetId) -> Optional[DatasetV Returns the dataset version mapped to a canonical dataset_id, or None if not existing """ with self._manage_session() as session: - canonical_dataset = session.query(DatasetTable).filter_by(dataset_id=dataset_id.id).one_or_none() + canonical_dataset = session.query(DatasetTable).filter_by(id=dataset_id.id).one_or_none() if canonical_dataset is None: return None - if canonical_dataset.dataset_version_id is None: + if canonical_dataset.version_id is None: return None - dataset_version = ( - session.query(DatasetVersionTable).filter_by(version_id=canonical_dataset.dataset_version_id).one() - ) + dataset_version = session.query(DatasetVersionTable).filter_by(id=canonical_dataset.version_id).one() dataset_version.canonical_dataset = canonical_dataset return self._hydrate_dataset_version(dataset_version) diff --git a/backend/layers/persistence/persistence_mock.py b/backend/layers/persistence/persistence_mock.py index a4993d09f1c36..ab7a843491ac2 100644 --- a/backend/layers/persistence/persistence_mock.py +++ b/backend/layers/persistence/persistence_mock.py @@ -1,5 +1,4 @@ import copy -import uuid from datetime import datetime from typing import Dict, Iterable, List, Optional, Union @@ -58,16 +57,12 @@ def __init__(self) 
-> None: self.datasets = {} # rename to: active_datasets self.datasets_versions = {} - @staticmethod - def _generate_id(): - return str(uuid.uuid4()) - # TODO: add publisher_metadata here? def create_canonical_collection( self, owner: str, curator_name: str, collection_metadata: CollectionMetadata ) -> CollectionVersion: - collection_id = CollectionId(self._generate_id()) - version_id = CollectionVersionId(self._generate_id()) + collection_id = CollectionId() + version_id = CollectionVersionId() canonical = CanonicalCollection(collection_id, None, None, False) version = CollectionVersion( collection_id=collection_id, @@ -146,10 +141,9 @@ def add_collection_version(self, collection_id: CollectionId) -> CollectionVersi cc = self.collections[collection_id.id] current_version_id = cc.version_id current_version = self.collections_versions[current_version_id.id] - new_version_id = CollectionVersionId(self._generate_id()) + new_version_id = CollectionVersionId() # Note: since datasets are immutable, there is no need to clone datasets here, # but the list that contains datasets needs to be copied, since it's a pointer. - self.datasets_versions new_dataset_list = copy.deepcopy(current_version.datasets) collection_version = CollectionVersion( @@ -262,8 +256,8 @@ def get_dataset_artifacts_by_version_id(self, dataset_version_id: DatasetId) -> def create_canonical_dataset(self, collection_version_id: CollectionVersionId) -> DatasetVersion: # Creates a dataset and initializes it with one version - dataset_id = DatasetId(self._generate_id()) - version_id = DatasetVersionId(self._generate_id()) + dataset_id = DatasetId() + version_id = DatasetVersionId() collection_version = self.collections_versions[collection_version_id.id] version = DatasetVersion( dataset_id=dataset_id, @@ -290,7 +284,7 @@ def add_dataset_artifact( self, version_id: DatasetVersionId, artifact_type: str, artifact_uri: str ) -> DatasetArtifactId: version = self.datasets_versions[version_id.id] - artifact_id = DatasetArtifactId(self._generate_id()) + artifact_id = DatasetArtifactId() version.artifacts.append(DatasetArtifact(artifact_id, artifact_type, artifact_uri)) return artifact_id @@ -335,7 +329,7 @@ def delete_dataset_from_collection_version( def replace_dataset_in_collection_version( self, collection_version_id: CollectionVersionId, old_dataset_version_id: DatasetVersionId ) -> DatasetVersion: - new_version_id = DatasetVersionId(self._generate_id()) + new_version_id = DatasetVersionId() old_version = self.get_dataset_version(old_dataset_version_id) collection_version = self.collections_versions[collection_version_id.id] new_version = DatasetVersion( diff --git a/scripts/cxg_admin_scripts/migrate.py b/scripts/cxg_admin_scripts/migrate.py index 7f39c7e7365b1..2813c17fafa90 100644 --- a/scripts/cxg_admin_scripts/migrate.py +++ b/scripts/cxg_admin_scripts/migrate.py @@ -414,11 +414,11 @@ def migrate_redesign_write(ctx): from sqlalchemy.orm import Session from backend.layers.persistence.orm import ( - Collection as CollectionRow, - CollectionVersion as CollectionVersionRow, - Dataset as DatasetRow, - DatasetVersion as DatasetVersionRow, - DatasetArtifact as DatasetArtifactRow, + CollectionTable as CollectionRow, + CollectionVersionTable as CollectionVersionRow, + DatasetTable as DatasetRow, + DatasetVersionTable as DatasetVersionRow, + DatasetArtifactTable as DatasetArtifactRow, ) database_pass = os.getenv("PGPASSWORD") @@ -437,7 +437,7 @@ def migrate_redesign_write(ctx): id=collection["id"], version_id=collection["version_id"], 
originally_published_at=collection.get("originally_published_at"), - tombstoned=collection["tombstoned"], + tombstone=collection["tombstoned"], ) session.add(canonical_collection_row) @@ -451,10 +451,10 @@ def migrate_redesign_write(ctx): del link["url"] collection_version_row = CollectionVersionRow( + id=version["version_id"], collection_id=version["collection_id"], - version_id=version["version_id"], owner=version["owner"], - metadata=json.dumps(coll_metadata), + collection_metadata=json.dumps(coll_metadata), publisher_metadata=json.dumps(version["publisher_metadata"]), published_at=version["published_at"], datasets=version["datasets"], @@ -466,8 +466,8 @@ def migrate_redesign_write(ctx): for dataset in datasets: dataset_row = DatasetRow( - dataset_id=dataset["dataset_id"], - dataset_version_id=dataset["dataset_version_id"], + id=dataset["dataset_id"], + version_id=dataset["dataset_version_id"], published_at=dataset.get("published_at"), ) @@ -480,10 +480,10 @@ def migrate_redesign_write(ctx): continue dataset_version_row = DatasetVersionRow( - version_id=dataset_version["version_id"], + id=dataset_version["version_id"], dataset_id=dataset_version["dataset_id"], collection_id=dataset_version["collection_id"], - metadata=json.dumps(dataset_version["metadata"]), + dataset_metadata=json.dumps(dataset_version["metadata"]), artifacts=dataset_version["artifacts"], status=json.dumps(dataset_version["status"]), created_at=dataset_version.get("created_at"), diff --git a/tests/unit/backend/layers/api/test_portal_api.py b/tests/unit/backend/layers/api/test_portal_api.py index 71543adc732cb..8afeaa7edf5d9 100644 --- a/tests/unit/backend/layers/api/test_portal_api.py +++ b/tests/unit/backend/layers/api/test_portal_api.py @@ -263,7 +263,7 @@ def test__get_collection__ok(self): # ✅ def test__get_collection_id__403_not_found(self): """Verify the test collection exists and the expected fields exist.""" - fake_id = self._generate_id() + fake_id = CollectionId() test_url = furl(path=f"/dp/v1/collections/{fake_id}", query_params=dict(visibility="PUBLIC")) response = self.app.get(test_url.url, headers=dict(host="localhost")) self.assertEqual(403, response.status_code) @@ -827,7 +827,7 @@ def test_delete_collection__not_owner(self): self.assertEqual(response.status_code, 403) def test_delete_collection__does_not_exist(self): - fake_id = self._generate_id() + fake_id = CollectionId() test_url = furl(path=f"/dp/v1/collections/{fake_id}", query_params=dict(visibility="PRIVATE")) headers = {"host": "localhost", "Content-Type": "application/json", "Cookie": self.get_cxguser_token()} response = self.app.delete(test_url.url, headers=headers) @@ -1241,7 +1241,7 @@ def test__post_dataset_asset__file_SERVER_ERROR(self): # ✅ def test__post_dataset_asset__dataset_NOT_FOUND(self): - fake_id = self._generate_id() + fake_id = DatasetVersionId() test_url = furl(path=f"/dp/v1/datasets/{fake_id}/asset/{fake_id}") response = self.app.post(test_url.url, headers=dict(host="localhost")) self.assertEqual(404, response.status_code) @@ -1470,7 +1470,7 @@ def test__cancel_dataset_download__ok(self): # ✅ def test__cancel_dataset_download__dataset_does_not_exist(self): - fake_id = self._generate_id() + fake_id = DatasetVersionId() test_url = f"/dp/v1/datasets/{fake_id}" headers = {"host": "localhost", "Content-Type": "application/json", "Cookie": self.get_cxguser_token()} response = self.app.delete(test_url, headers=headers) @@ -1613,7 +1613,7 @@ def test__dataset_meta__ok(self): # ✅ def test__dataset_meta__404(self): - fake_id = 
self._generate_id() + fake_id = DatasetVersionId() headers = {"host": "localhost", "Content-Type": "application/json"} fake_url = f"http://base.url/{fake_id}.cxg/" test_url_404 = f"/dp/v1/datasets/meta?url={fake_url}" @@ -1745,7 +1745,7 @@ def test__revision_nonexistent__403(self): """ Start a revision on a non-existing collection. """ - fake_collection_id = self._generate_id() + fake_collection_id = CollectionId() headers = {"host": "localhost", "Content-Type": "application/json", "Cookie": self.get_cxguser_token()} response = self.app.post(f"/dp/v1/collections/{fake_collection_id}", headers=headers) self.assertEqual(403, response.status_code) @@ -1947,7 +1947,7 @@ def test__publish_revision_bad_id__403(self): """ Publish a collection with a bad uuid (non existant) returns 403 """ - collection_id = self._generate_id() + collection_id = CollectionId() body = {"data_submission_policy_version": "1.0"} path = f"{self.base_path}/{collection_id}/publish" response = self.app.post(path, headers=self.headers, data=json.dumps(body)) @@ -2218,7 +2218,7 @@ def test__oversized__413(self): # ✅ def test__link_fake_collection__403(self): - fake_id = self._generate_id() + fake_id = CollectionId() path = f"/dp/v1/collections/{fake_id}/upload-links" headers = {"host": "localhost", "Content-Type": "application/json", "Cookie": self.get_cxguser_token()} body = {"url": self.good_link} diff --git a/tests/unit/backend/layers/business/test_business.py b/tests/unit/backend/layers/business/test_business.py index a8f8d0ac8e15e..27914de2c9866 100644 --- a/tests/unit/backend/layers/business/test_business.py +++ b/tests/unit/backend/layers/business/test_business.py @@ -921,7 +921,7 @@ def test_create_collection_version_fails_if_collection_not_exists(self): """ A collection version can only be created on an existing collection """ - non_existing_collection_id = self.database_provider._generate_id() + non_existing_collection_id = CollectionId() with self.assertRaises(CollectionVersionException): self.business_logic.create_collection_version(CollectionId(non_existing_collection_id)) diff --git a/tests/unit/backend/layers/common/base_test.py b/tests/unit/backend/layers/common/base_test.py index 4b0dfa2922d09..202a4d1563ca4 100644 --- a/tests/unit/backend/layers/common/base_test.py +++ b/tests/unit/backend/layers/common/base_test.py @@ -258,9 +258,6 @@ def generate_dataset( [a.id for a in artifact_ids], ) - def _generate_id(self): - return DatabaseProviderMock._generate_id() - def remove_timestamps(self, body: dict, remove: typing.List[str] = None) -> dict: # TODO: implement as needed return body
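Reviewer note: the heart of this refactor is the new `EntityId` base class added to `backend/layers/common/entities.py`. Because constructing an id type without an argument now mints a fresh UUID, the `_generate_id()` helpers can be dropped from `DatabaseProvider`, `DatabaseProviderMock`, and the test base class. Below is a minimal, self-contained sketch of that behavior; the class body mirrors the patch, while the demo variable names and the literal id value are illustrative only.

```python
import uuid
from dataclasses import dataclass


@dataclass
class EntityId:
    id: str

    def __init__(self, entity_id: str = None):
        # Wrap an existing id when one is given, otherwise mint a new UUID.
        self.id = str(entity_id) if entity_id is not None else str(uuid.uuid4())

    def __repr__(self) -> str:
        return self.id


class CollectionId(EntityId):
    pass


class DatasetVersionId(EntityId):
    pass


# Call sites that previously wrote CollectionId(self._generate_id()) become:
new_collection_id = CollectionId()          # fresh random UUID
wrapped_id = DatasetVersionId("1234-abcd")  # wraps a known id (illustrative value)
print(new_collection_id, wrapped_id)        # __repr__ returns the bare id string
```

Because the explicit `__init__` and `__repr__` take precedence over the ones `@dataclass` would generate, equality still compares the `id` field, and interpolating an id object into a URL (as the updated tests do with `f"/dp/v1/collections/{fake_id}"`) yields the plain UUID string.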