Merge pull request #155 from ejseqera/geoids

Add support back in for GEO IDs
nf-core · May 16, 2023 · 7a5417d · 7a5417d
2 parents 1dbc8bf + dc72345
commit 7a5417d
Show file tree

Hide file tree

Showing 11 changed files with 92 additions and 74 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,7 +3,7 @@
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
-## [[1.10.0](https://github.com/nf-core/fetchngs/releases/tag/1.10.0)] - 2023-05-15
+## [[1.10.0](https://github.com/nf-core/fetchngs/releases/tag/1.10.0)] - 2023-05-16
 
 ### Credits
 
@@ -21,6 +21,7 @@ Thank you to everyone else that has contributed by reporting bugs, enhancements
 ### Enhancements & fixes
 
 - [#85](https://github.com/nf-core/fetchngs/issues/85) - Not able to fetch metadata for ERR ids associated with ArrayExpress
+- [#104](https://github.com/nf-core/fetchngs/issues/104) - Add support back in for [GEO IDs](https://www.ncbi.nlm.nih.gov/geo) (removed in v1.7)
 - [#129](https://github.com/nf-core/fetchngs/issues/129) - Pipeline is working with SRA run ids but failing with corresponding Biosample ids
 - [#138](https://github.com/nf-core/fetchngs/issues/138) - Add support for downloading protected dbGAP data using a JWT file
 - [#144](https://github.com/nf-core/fetchngs/issues/144) - Add support to download 10X Genomics data

diff --git a/README.md b/README.md
@@ -12,7 +12,7 @@
 
 ## Introduction
 
-**nf-core/fetchngs** is a bioinformatics pipeline to fetch metadata and raw FastQ files from both public and private databases. At present, the pipeline supports SRA / ENA / DDBJ / Synapse ids (see [usage docs](https://nf-co.re/fetchngs/usage#introduction)).
+**nf-core/fetchngs** is a bioinformatics pipeline to fetch metadata and raw FastQ files from both public and private databases. At present, the pipeline supports SRA / ENA / DDBJ / GEO / Synapse ids (see [usage docs](https://nf-co.re/fetchngs/usage#introduction)).
 
 ## Usage
 
@@ -56,7 +56,7 @@ For more details, please refer to the [usage documentation](https://nf-co.re/fet
 
 Via a single file of ids, provided one-per-line (see [example input file](https://raw.githubusercontent.com/nf-core/test-datasets/fetchngs/sra_ids_test.csv)) the pipeline performs the following steps:
 
-### SRA / ENA / DDBJ ids
+### SRA / ENA / DDBJ / GEO ids
 
 1. Resolve database ids back to the appropriate experiment-level ids so that they are compatible with the [ENA API](https://ena-docs.readthedocs.io/en/latest/retrieval/programmatic-access.html)
 2. Fetch extensive id metadata via ENA API
@@ -65,18 +65,6 @@ Via a single file of ids, provided one-per-line (see [example input file](https:
    - Otherwise use [`sra-tools`](https://github.com/ncbi/sra-tools) to download `.sra` files and convert them to FastQ
 4. Collate id metadata and paths to FastQ files in a single samplesheet
 
-### GEO ids
-
-Support for GEO ids was dropped in [[v1.7](https://github.com/nf-core/fetchngs/releases/tag/1.7)] due to breaking changes introduced in the NCBI API. For more detailed information please see [this PR](https://github.com/nf-core/fetchngs/pull/102).
-
-As a workaround, if you have a GEO accession you can directly download a text file containing the appropriate SRA ids to pass to the pipeline instead:
-
-- Search for your GEO accession on [GEO](https://www.ncbi.nlm.nih.gov/geo)
-- Click `SRA Run Selector` at the bottom of the GEO accession page
-- Select the desired samples in the `SRA Run Selector` and then download the `Accession List`
-
-This downloads a text file called `SRR_Acc_List.txt` that can be directly provided to the pipeline once renamed with a .csv extension e.g. `--input SRR_Acc_List.csv`.
-
 ### Synapse ids
 
 1. Resolve Synapse directory ids to their corresponding FastQ files ids via the `synapse list` command.

diff --git a/assets/schema_input.json b/assets/schema_input.json
@@ -8,8 +8,8 @@
         "type": "array",
         "items": {
             "type": "string",
-            "pattern": "^(((SR|ER|DR)[APRSX])|(SAM(N|EA|D))|(PRJ(NA|EB|DB))|(syn))(\\d+)$",
-            "errorMessage": "Please provide a valid SRA, ENA, DDBJ identifier"
+            "pattern": "^(((SR|ER|DR)[APRSX])|(SAM(N|EA|D))|(PRJ(NA|EB|DB))|(GS[EM])|(syn))(\\d+)$",
+            "errorMessage": "Please provide a valid SRA, ENA, DDBJ or GEO identifier"
         }
     }
 }
diff --git a/bin/sra_ids_to_runinfo.py b/bin/sra_ids_to_runinfo.py
@@ -14,7 +14,8 @@
 from urllib.error import HTTPError, URLError
 from urllib.parse import urlencode
 from urllib.request import urlopen
-
+import json
+import time
 
 logger = logging.getLogger()
 
@@ -188,9 +189,9 @@ def is_valid(cls, identifier):
 class DatabaseResolver:
     """Define a service class for resolving various identifiers to experiments."""
 
-    _GEO_PREFIXES = {"GSE", "GSM"}
+    _GEO_GSM_PREFIXES = {"GSM"}
+    _GEO_GSE_PREFIXES = {"GDS", "GSE"}
     _SRA_PREFIXES = {
-        "PRJNA",
         "DRA",
         "DRP",
         "DRS",
@@ -214,7 +215,9 @@ def expand_identifier(cls, identifier):
 
         """
         prefix = ID_REGEX.match(identifier).group(1)
-        if prefix in cls._GEO_PREFIXES:
+        if prefix in cls._GEO_GSM_PREFIXES:
+            return cls._gsm_to_srx(identifier)
+        elif prefix in cls._GEO_GSE_PREFIXES:
             return cls._gse_to_srx(identifier)
         elif prefix in cls._SRA_PREFIXES:
             return cls._id_to_srx(identifier)
@@ -239,21 +242,44 @@ def _id_to_srx(cls, identifier):
         return [row["Experiment"] for row in open_table(response, delimiter=",")]
 
     @classmethod
-    def _gse_to_srx(cls, identifier):
-        """Resolve the identifier to SRA experiments."""
+    def _gsm_to_srx(cls, identifier):
+        """Resolve a GSM identifier to SRA experiments via an esearch query against the "sra" database."""
         ids = []
-        params = {"id": identifier, "db": "gds", "rettype": "runinfo", "retmode": "text"}
-        response = fetch_url(f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?{urlencode(params)}")
+        params = {"term": identifier, "db": "sra", "retmode": "json"}
+        response = fetch_url(f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?{urlencode(params)}")
         cls._content_check(response, identifier)
-        gsm_ids = [
-            line.split("=")[1].strip()
-            for line in response.text().splitlines()
-            if line.split("=")[1].strip().startswith("GSM")
-        ]
+        r_json = json.loads(response.text())
+        gsm_ids = r_json["esearchresult"]["idlist"]  # NOTE(review): "sra" esearch idlist; presumably not GSM accessions despite the name - confirm
         for gsm_id in gsm_ids:
             ids += cls._id_to_srx(gsm_id)
         return ids
 
+    @classmethod
+    def _gds_to_gsm(cls, identifier):
+        """Resolve a GEO UID to the accessions of its samples, then resolve each of those to SRA experiments."""
+        ids = []
+        params = {"id": identifier, "db": "gds", "retmode": "json", "retmax": 10}
+        response = fetch_url(f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?{urlencode(params)}")
+        cls._content_check(response, identifier)
+        r_json = json.loads(response.text())
+
+        for each in r_json["result"][identifier]["samples"][0:]:  # `[0:]` is a full slice: every sample entry is visited
+            ids += cls._gsm_to_srx(each["accession"])  # one lookup per sample accession
+        return ids
+
+    @classmethod
+    def _gse_to_srx(cls, identifier):
+        """Resolve the GSE (or GDS) identifier to GEO UIDs and expand every UID to SRA experiments."""
+        ids = []
+        params = {"term": identifier, "db": "gds", "retmode": "json"}
+        response = fetch_url(f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?{urlencode(params)}")
+        cls._content_check(response, identifier)
+        r_json = json.loads(response.text())
+        gds_uids = r_json["esearchresult"]["idlist"]  # idlist returned by the "gds" esearch for this term
+        for gds_uid in gds_uids:
+            ids += cls._gds_to_gsm(gds_uid)  # yields SRA experiments, despite the `_to_gsm` name
+        return ids
+
     @classmethod
     def _id_to_erx(cls, identifier):
         """Resolve the identifier to ENA experiments."""
@@ -374,13 +400,40 @@ def validate_fields_parameter(param, valid_vals, param_desc):
 
 def fetch_url(url):
     """Return a response object for the given URL and handle errors appropriately."""
+    sleep_time = 5  # Hardcode sleep duration in seconds
+    max_num_attempts = 3  # Hardcode max number of request attempts
+    attempt = 0
+
     try:
         with urlopen(url) as response:
             return Response(response=response)
+
     except HTTPError as e:
-        logger.error("The server couldn't fulfill the request.")
-        logger.error(f"Status: {e.code} {e.reason}")
-        sys.exit(1)
+        if e.status == 429:
+            # If the response is 429, sleep and retry
+            if "Retry-After" in e.headers:
+                retry_after = int(e.headers["Retry-After"])
+                logging.warning(f"Received 429 response from server. Retrying after {retry_after} seconds...")
+                time.sleep(retry_after)
+            else:
+                logging.warning(f"Received 429 response from server. Retrying in {sleep_time} seconds...")
+                time.sleep(sleep_time)
+                sleep_time *= 2  # Increment sleep time
+            attempt += 1
+            return fetch_url(url)  # Recursive call to retry request
+
+        elif e.status == 500:
+            # If the response is 500, sleep and retry max 3 times
+            if attempt <= max_num_attempts:
+                logging.warning(f"Received 500 response from server. Retrying in {sleep_time} seconds...")
+                time.sleep(sleep_time)
+                sleep_time *= 2
+                attempt += 1
+                return fetch_url(url)
+            else:
+                logging.error("Exceeded max request attempts. Exiting.")
+                sys.exit(1)
+
     except URLError as e:
         logger.error("We failed to reach a server.")
         logger.error(f"Reason: {e.reason}")

diff --git a/docs/output.md b/docs/output.md
@@ -9,19 +9,19 @@ This document describes the output produced by the pipeline. The directories lis
 The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data depending on the type of ids provided:
 
 - Download FastQ files and create samplesheet from:
-  1. [SRA / ENA / DDBJ ids](#sra--ena--ddbj-ids)
+  1. [SRA / ENA / DDBJ / GEO ids](#sra--ena--ddbj--geo-ids)
   2. [Synapse ids](#synapse-ids)
 - [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution
 
 Please see the [usage documentation](https://nf-co.re/fetchngs/usage#introduction) for a list of supported public repository identifiers and how to provide them to the pipeline.
 
-### SRA / ENA / DDBJ ids
+### SRA / ENA / DDBJ / GEO ids
 
 <details markdown="1">
 <summary>Output files</summary>
 
 - `fastq/`
-  - `*.fastq.gz`: Paired-end/single-end reads downloaded from the SRA / ENA / DDBJ.
+  - `*.fastq.gz`: Paired-end/single-end reads downloaded from the SRA / ENA / DDBJ / GEO.
 - `fastq/md5/`
   - `*.md5`: Files containing `md5` sum for FastQ files downloaded from the ENA.
 - `samplesheet/`

diff --git a/docs/usage.md b/docs/usage.md
@@ -8,15 +8,15 @@
 
 The pipeline has been set up to automatically download and process the raw FastQ files from both public and private repositories. Identifiers can be provided in a file, one per line, via the `--input` parameter. Currently, the following types of identifiers are supported (an example of each is shown):
 
-| `SRA`        | `ENA`        | `DDBJ`       | `Synapse`   |
-| ------------ | ------------ | ------------ | ----------- |
-| SRR11605097  | ERR4007730   | DRR171822    | syn26240435 |
-| SRX8171613   | ERX4009132   | DRX162434    |             |
-| SRS6531847   | ERS4399630   | DRS090921    |             |
-| SAMN14689442 | SAMEA6638373 | SAMD00114846 |             |
-| SRP256957    | ERP120836    | DRP004793    |             |
-| SRA1068758   | ERA2420837   | DRA008156    |             |
-| PRJNA625551  | PRJEB37513   | PRJDB4176    |             |
+| `SRA`        | `ENA`        | `DDBJ`       | `GEO`      | `Synapse`   |
+| ------------ | ------------ | ------------ | ---------- | ----------- |
+| SRR11605097  | ERR4007730   | DRR171822    | GSM4432381 | syn26240435 |
+| SRX8171613   | ERX4009132   | DRX162434    | GSE147507  |             |
+| SRS6531847   | ERS4399630   | DRS090921    |            |             |
+| SAMN14689442 | SAMEA6638373 | SAMD00114846 |            |             |
+| SRP256957    | ERP120836    | DRP004793    |            |             |
+| SRA1068758   | ERA2420837   | DRA008156    |            |             |
+| PRJNA625551  | PRJEB37513   | PRJDB4176    |            |             |
 
 ### SRR / ERR / DRR ids
 

diff --git a/lib/WorkflowMain.groovy b/lib/WorkflowMain.groovy
@@ -111,7 +111,7 @@ class WorkflowMain {
             if (num_match == total_ids) {
                 is_sra = true
             } else {
-                Nextflow.error("Mixture of ids provided via --input: ${no_match_ids.join(', ')}\nPlease provide either SRA / ENA / DDBJ or Synapse ids!")
+                Nextflow.error("Mixture of ids provided via --input: ${no_match_ids.join(', ')}\nPlease provide either SRA / ENA / GEO / DDBJ or Synapse ids!")
             }
         }
         return is_sra
@@ -135,7 +135,7 @@ class WorkflowMain {
             if (num_match == total_ids) {
                 is_synapse = true
             } else {
-                Nextflow.error("Mixture of ids provided via --input: ${no_match_ids.join(', ')}\nPlease provide either SRA / ENA / DDBJ or Synapse ids!")
+                Nextflow.error("Mixture of ids provided via --input: ${no_match_ids.join(', ')}\nPlease provide either SRA / ENA / GEO / DDBJ or Synapse ids!")
             }
         }
         return is_synapse

diff --git a/lib/WorkflowSra.groovy b/lib/WorkflowSra.groovy
@@ -30,21 +30,4 @@ class WorkflowSra {
             "  running nf-core/other pipelines.\n" +
             "==================================================================================="
     }
-
-    // Fail pipeline if input ids are from the GEO
-    public static void isGeoFail(ids) {
-        def pattern = /^(GS[EM])(\d+)$/
-        for (id in ids) {
-            if (id =~ pattern) {
-                def error_string = "===================================================================================\n" +
-                    "  GEO id detected: ${id}\n" +
-                    "  Support for GEO ids was dropped in v1.7 due to breaking changes in the NCBI API.\n" +
-                    "  Please remove any GEO ids from the input samplesheet.\n\n" +
-                    "  Please see:\n" +
-                    "  https://github.com/nf-core/fetchngs/pull/102\n" +
-                    "==================================================================================="
-                Nextflow.error(error_string)
-            }
-        }
-    }
 }
diff --git a/main.nf b/main.nf
@@ -44,7 +44,7 @@ if (WorkflowMain.isSraId(ch_input)) {
 } else if (WorkflowMain.isSynapseId(ch_input)) {
     input_type = 'synapse'
 } else {
-    exit 1, 'Ids provided via --input not recognised please make sure they are either SRA / ENA / DDBJ or Synapse ids!'
+    exit 1, 'Ids provided via --input not recognised please make sure they are either SRA / ENA / GEO / DDBJ or Synapse ids!'
 }
 
 if (params.input_type == input_type) {
@@ -63,7 +63,7 @@ if (params.input_type == input_type) {
 workflow NFCORE_FETCHNGS {
 
     //
-    // WORKFLOW: Download FastQ files for SRA / ENA / DDBJ ids
+    // WORKFLOW: Download FastQ files for SRA / ENA / GEO / DDBJ ids
     //
     if (params.input_type == 'sra') {
         SRA ( ch_ids )

diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -19,7 +19,7 @@
                     "pattern": "^\\S+\\.(csv|tsv|txt)$",
                     "schema": "assets/schema_input.json",
                     "fa_icon": "fas fa-file-excel",
-                    "description": "File containing SRA/ENA/DDBJ identifiers one per line to download their associated metadata and FastQ files."
+                    "description": "File containing SRA/ENA/GEO/DDBJ identifiers one per line to download their associated metadata and FastQ files."
                 },
                 "input_type": {
                     "type": "string",

diff --git a/workflows/sra.nf b/workflows/sra.nf
@@ -50,13 +50,6 @@ workflow SRA {
     main:
     ch_versions = Channel.empty()
 
-    //
-    // Fail the pipeline if GEO ids detected
-    //
-    ids
-        .collect()
-        .map { WorkflowSra.isGeoFail(it) }
-
     //
     // MODULE: Get SRA run information for public database ids
     //