feat(deposition)!: Allow groups to specify which BioProject (and BioS…

…ample) we upload their sequences into (#3650) * use project_id as project_table primary key * allow overwrite of bioproject accession with given value * update values.yaml - changes will also need to be made in pathoplexus * allow addition of already existing biosample * Add RUN_REF if insdcRawReadsAccession is in metadata * make code clearer * Require user-supplied project/sample accession actually exists --------- Co-authored-by: Cornelius Roemer <[email protected]>
loculus-project · Feb 14, 2025 · d6402c9 · d6402c9
1 parent 317aae4
commit d6402c9
Show file tree

Hide file tree

Showing 13 changed files with 308 additions and 98 deletions.
diff --git a/ena-submission/README.md b/ena-submission/README.md
@@ -31,8 +31,8 @@ This script runs once daily as a kubernetes cronjob. It calls the Loculus backen
 - data must be state "OPEN" for use
 - data must not already exist in ENA or be in the submission process, this means:
   - data was not submitted by the `config.ingest_pipeline_submitter`
-  - data is not in the `ena-submission.submission_table`
-  - as an extra check we discard all sequences with `ena-specific-metadata` fields
+  - data is not in the `ena-submission.submission_table` (and data with a later version is also not in the table)
+  - as an extra check we upload data with `ena-specific-metadata` fields in a separate file with a warning
 
 ## Threads
 
@@ -47,10 +47,14 @@ Download file in `github_url` every 30s. If data is not in submission table alre
 In a loop:
 
 - Get sequences in `submission_table` in state READY_TO_SUBMIT
+  - if (bioprojectAccession exists in the metadata):
+    - if (a project entry with this accession already exists for (group_id, organism)): use that project_id
+      and update submission_table to SUBMITTED_PROJECT (add center_name and project_id)
+    - else: create an entry in `project_table` and then update `submission_table` with results (center_name and project_id)
   - if (there exists an entry in the project_table for the corresponding (group_id, organism)):
     - if (entry is in status SUBMITTED): update `submission_table` to SUBMITTED_PROJECT.
     - else: update submission_table to SUBMITTING_PROJECT.
-  - else: create project entry in `project_table` for (group_id, organism).
+  - else: create project entry in `project_table` for (group_id, organism) -> creates a unique `project_id`.
 - Get sequences in `submission_table` in state SUBMITTING_PROJECT
   - if (corresponding `project_table` entry is in state SUBMITTED): update entries to state SUBMITTED_PROJECT.
 - Get sequences in `project_table` in state READY, prepare submission object, set status to SUBMITTING
@@ -69,6 +73,10 @@ Maps loculus metadata to ena metadata using template: https://www.ebi.ac.uk/ena/
 In a loop
 
 - Get sequences in `submission_table` in state SUBMITTED_PROJECT
+  - if (biosampleAccession exists in the metadata):
+    - if (an entry with this biosample accession already exists for (accession, version)): use that sample_id
+      and update submission_table to SUBMITTED_SAMPLE
+    - else: create an entry in `sample_table` and then update `submission_table` with results
   - if (there exists an entry in the `sample_table` for the corresponding (accession, version)):
     - if (entry is in status SUBMITTED): update `submission_table` to SUBMITTED_SAMPLE.
     - else: update submission_table to SUBMITTING_SAMPLE.
@@ -205,7 +213,7 @@ pip install -e .
 flyway -user=postgres -password=unsecure -url=jdbc:postgresql://127.0.0.1:5432/loculus -schemas=ena_deposition_schema -locations=filesystem:./flyway/sql migrate
 ```
 
-2. Submit data to the backend as test user (create group, submit and approve), e.g. using [example data](https://github.com/pathoplexus/example_data). (To test the full submission cycle with insdc accessions submit cchf example data with only 2 segments.)
+2. Submit data to the backend as test user (create group, submit and approve - create 2 groups if insdc_ingest_group has not been created), e.g. using [example data](https://github.com/pathoplexus/example_data). (To test the full submission cycle with insdc accessions submit cchf example data with only 2 segments.)
 
 ```sh
 KEYCLOAK_TOKEN_URL="http://localhost:8083/realms/loculus/protocol/openid-connect/token"
@@ -230,7 +238,7 @@ curl -X 'POST' 'http://localhost:8079/groups' \
   },
   "contactEmail": "[email protected]"}'
 LOCULUS_ACCESSION=$(curl -X 'POST' \
-  'http://localhost:8079/cchf/submit?groupId=1&dataUseTermsType=OPEN' \
+  'http://localhost:8079/cchf/submit?groupId=2&dataUseTermsType=OPEN' \
   -H 'accept: application/json' \
   -H "Authorization: Bearer ${JWT}" \
   -H 'Content-Type: multipart/form-data' \
@@ -243,9 +251,10 @@ curl -X 'POST' 'http://localhost:8079/cchf/approve-processed-data' \
   -d '{"scope": "ALL"}'
 ```
 
-3. Get list of sequences ready to submit to ENA, locally this will write `results/ena_submission_list.json`.
+3. Get list of sequences ready to submit to ENA; locally this will write `results/ena_submission_list.json`. You first need to copy the config produced by `../generate_local_test_config.sh`.
 
 ```sh
+cp ../website/tests/config/ena-submission-config.yaml config/config.yaml
 python scripts/get_ena_submission_list.py --config-file=config/config.yaml --output-file=results/ena_submission_list.json
 ```
 

diff --git a/ena-submission/flyway/sql/V1.2__alter_project_table_id.sql b/ena-submission/flyway/sql/V1.2__alter_project_table_id.sql
@@ -0,0 +1,8 @@
+
+ALTER TABLE project_table DROP CONSTRAINT project_table_pkey;
+ALTER TABLE project_table ADD COLUMN project_id BIGSERIAL PRIMARY KEY;
+
+ALTER TABLE submission_table ADD project_id text;
+
+CREATE INDEX idx_project_table_group_id ON project_table(group_id);
+CREATE INDEX idx_project_table_organism ON project_table(organism);
diff --git a/ena-submission/scripts/deposition_dry_run.py b/ena-submission/scripts/deposition_dry_run.py
@@ -139,7 +139,7 @@ def local_ena_submission_generator(
         logger.info(f"Writing results to {directory}")
 
         manifest_object = create_manifest_object(
-            config, dummy_sample_dict, dummy_project_dict, entry, entry, entry, dir=directory
+            config, dummy_sample_dict, dummy_project_dict, entry, entry, dir=directory
         )
         create_manifest(manifest_object, is_broker=config.is_broker, dir=directory)
         logger.info(

diff --git a/ena-submission/scripts/get_ena_submission_list.py b/ena-submission/scripts/get_ena_submission_list.py
@@ -146,10 +146,10 @@ def get_ena_submission_list(config_file):
             message = (
                 f"{config.backend_url}: {organism} - ENA Submission pipeline found "
                 f"{len(entries_with_external_metadata)} sequences with ena-specific-metadata fields"
-                " and not submitted by us or ingested from the INSDC, this might be a user error or"
-                " require manual submission to ENA (e.g. manually setting the bioproject in the "
-                "PROJECT and the biosample in the SAMPLE table - see details in "
-                "https://loculus.slack.com/archives/C07HW5NAL03/p1724960217646709)"
+                " and not submitted by us or ingested from the INSDC, this might be a user error."
+                " If you think this is accurate ensure bioproject and biosample are set correctly."
+                " Bioprojects should be public and SRA accessions should also include bioprojects"
+                " and biosamples."
             )
             output_file = f"{organism}_with_ena_fields_{output_file_suffix}"
             send_slack_notification_with_file(

diff --git a/ena-submission/scripts/test_ena_submission.py b/ena-submission/scripts/test_ena_submission.py
@@ -247,21 +247,21 @@ def test_create_fasta(self):
 
     def test_create_manifest(self):
         config = mock_config()
-        group_key = {"group_id": 1, "organism": "Test organism"}
         study_accession = "Test Study Accession"
         sample_accession = "Test Sample Accession"
         results_in_sample_table = {"result": {"ena_sample_accession": sample_accession}}
         results_in_project_table = {
             "result": {"bioproject_accession": study_accession},
             "center_name": "generic_center_name",
+            "group_id": 1,
+            "organism": "Test organism",
         }
         manifest = create_manifest_object(
             config,
             results_in_sample_table,
             results_in_project_table,
             sample_data_in_submission_table,
             self.seq_key,
-            group_key,
         )
         manifest_file_name = create_manifest(manifest)
         data = {}

diff --git a/ena-submission/src/ena_deposition/create_assembly.py b/ena-submission/src/ena_deposition/create_assembly.py
@@ -89,7 +89,6 @@ def create_manifest_object(
     project_table_entry: dict[str, str],
     submission_table_entry: dict[str, str],
     seq_key: dict[str, str],
-    group_key: dict[str, str],
     test=False,
     dir: str | None = None,
 ) -> AssemblyManifest:
@@ -123,13 +122,11 @@ def create_manifest_object(
             address_string = ", ".join([x for x in address_list if x is not None])
             logger.debug("Created address from group_info")
         except Exception as e:
-            logger.error(
-                f"Was unable to create address, setting address to center_name due to {e}"
-            )
+            logger.error(f"Was unable to create address, setting address to center_name due to {e}")
 
     metadata = submission_table_entry["metadata"]
     unaligned_nucleotide_sequences = submission_table_entry["unaligned_nucleotide_sequences"]
-    organism_metadata = config.organisms[group_key["organism"]]["enaDeposition"]
+    organism_metadata = config.organisms[project_table_entry["organism"]]["enaDeposition"]
     chromosome_list_object = create_chromosome_list_object(unaligned_nucleotide_sequences, seq_key)
     logger.debug("Created chromosome list object")
     chromosome_list_file = create_chromosome_list(list_object=chromosome_list_object, dir=dir)
@@ -195,6 +192,11 @@ def create_manifest_object(
         else seq_key["accession"]
     )
 
+    if metadata.get("insdcRawReadsAccession") and metadata["insdcRawReadsAccession"]:
+        run_ref = [metadata["insdcRawReadsAccession"]]
+    else:
+        run_ref = None
+
     return AssemblyManifest(
         study=study_accession,
         sample=sample_accession,
@@ -207,6 +209,7 @@ def create_manifest_object(
         chromosome_list=chromosome_list_file,
         description=description,
         moleculetype=moleculetype,
+        run_ref=run_ref,
         authors=authors,
         address=address_string,
     )
@@ -341,9 +344,8 @@ def assembly_table_create(
         if len(sample_data_in_submission_table) == 0:
             error_msg = f"Entry {row['accession']} not found in submitting_table"
             raise RuntimeError(error_msg)
-        group_key = {
-            "group_id": sample_data_in_submission_table[0]["group_id"],
-            "organism": sample_data_in_submission_table[0]["organism"],
+        project_id = {
+            "project_id": sample_data_in_submission_table[0]["project_id"],
         }
         center_name = sample_data_in_submission_table[0]["center_name"]
 
@@ -355,7 +357,7 @@ def assembly_table_create(
             raise RuntimeError(error_msg)
 
         results_in_project_table = find_conditions_in_db(
-            db_config, table_name="project_table", conditions=group_key
+            db_config, table_name="project_table", conditions=project_id
         )
         if len(results_in_project_table) == 0:
             error_msg = f"Entry {row['accession']} not found in project_table"
@@ -368,7 +370,6 @@ def assembly_table_create(
                 results_in_project_table[0],
                 sample_data_in_submission_table[0],
                 seq_key,
-                group_key,
                 test,
             )
             manifest_file = create_manifest(manifest_object, is_broker=config.is_broker)