Skip to content

Commit

Permalink
Merge pull request #127 from tkchafin/1.2.1
Browse files Browse the repository at this point in the history
Patch 1.2.1: Fix issue where pipeline fails on missing fields from NCBI datasets JSON
  • Loading branch information
muffato authored Aug 1, 2024
2 parents 91194d2 + e12d883 commit 028ddae
Show file tree
Hide file tree
Showing 4 changed files with 112 additions and 58 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,12 @@
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [[1.2.1](https://github.com/sanger-tol/genomenote/releases/tag/1.2.1)] [2024-07-12]

### Enhancements & fixes

- Bugfix: Now handles missing fields in `ncbi datasets` genome report

## [[1.2.0](https://github.com/sanger-tol/genomenote/releases/tag/1.2.0)] - Pyrenean Mountain Dog - [2024-05-01]

### Enhancements & fixes
Expand Down
4 changes: 2 additions & 2 deletions CITATION.cff
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# Visit https://bit.ly/cffinit to generate yours today!

cff-version: 1.2.0
title: sanger-tol/genomenote v1.2.0
title: sanger-tol/genomenote v1.2.1
message: >-
If you use this software, please cite it using the
metadata from this file.
Expand Down Expand Up @@ -34,5 +34,5 @@ identifiers:
repository-code: "https://github.com/sanger-tol/genomenote"
license: MIT
commit: TODO
version: 1.2.0
version: 1.2.1
date-released: "2022-10-07"
156 changes: 102 additions & 54 deletions bin/create_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,120 +6,159 @@
import sys
import csv
import re
import math


def parse_args(args=None):
Description = "Create a table by parsing json output to extract N50, BUSCO, QV and COMPLETENESS stats."

parser = argparse.ArgumentParser(description=Description)
parser.add_argument("--genome", help="Input NCBI genome summary JSON file.", required=True)
parser.add_argument("--sequence", help="Input NCBI sequence summary JSON file.", required=True)
parser.add_argument("--genome", required=True, help="Input NCBI genome summary JSON file.")
parser.add_argument("--sequence", required=True, help="Input NCBI sequence summary JSON file.")
parser.add_argument("--busco", help="Input BUSCO short summary JSON file.")
parser.add_argument("--qv", nargs="*", help="Input QV TSV file from MERQURYFK.")
parser.add_argument("--completeness", nargs="*", help="Input COMPLETENESS stats TSV file from MERQURYFK.")
parser.add_argument("--hic", action="append", help="HiC sample ID used for contact maps.")
parser.add_argument("--flagstat", action="append", help="HiC flagstat file created by Samtools.")
parser.add_argument("--outcsv", help="Output CSV file.", required=True)
parser.add_argument("--outcsv", required=True, help="Output CSV file.")
parser.add_argument("--version", action="version", version="%(prog)s 3.1")
return parser.parse_args(args)


def make_dir(path):
"""
Creates a directory if it doesn't exist.
Parameters:
path (str): Path of the directory to be created.
"""
if len(path) > 0:
os.makedirs(path, exist_ok=True)


# check_samplesheet.py adds a suffix like "_T1", "_T2", etc, to the sample names
# check_samplesheet.py adds a suffix like "_T1", "_T2", etc, to sample names
# We usually don't want it in the final output
def remove_sample_T_suffix(name):
"""
Removes the suffix like "_T1", "_T2", etc., from sample names.
Parameters:
name (str): Sample name to be processed.
Returns:
str: Sample name with the suffix removed.
"""
return re.sub(r"_T\d+", "", name)


def ncbi_stats(genome_in, seq_in, writer):
"""
Extracts and writes assembly information and statistics from genome and
sequence JSON files to a CSV file.
Parameters:
genome_in (str): Path to the NCBI genome summary JSON file.
seq_in (str): Path to the NCBI sequence summary JSON file.
writer (csv.writer): CSV writer object to write the extracted data.
"""
with open(genome_in, "r") as fin1:
data = json.load(fin1)
data = data.get("reports", [{}])[0]

with open(seq_in, "r") as fin2:
seq = json.load(fin2)
seq = json.load(fin2).get("reports", [])

data = data["reports"][0]
info = data["assembly_info"]
attr = info["biosample"]["attributes"]
stats = data["assembly_stats"]
seq = seq["reports"]
info = data.get("assembly_info", {})
attr = info.get("biosample", {}).get("attributes", [])
stats = data.get("assembly_stats", {})
organism = data.get("organism", {})

# Write assembly information
writer.writerow(["##Assembly_Information"])
writer.writerow(["Accession", data["accession"]])
if "common_name" in data["organism"]:
writer.writerow(["Common_Name", data["organism"]["common_name"]])
writer.writerow(["Organism_Name", data["organism"]["organism_name"]])
writer.writerow(
[
"ToL_ID",
"".join(pairs["value"] for pairs in attr if pairs["name"] == "tolid"),
]
)
writer.writerow(["Taxon_ID", data["organism"]["tax_id"]])
writer.writerow(["Assembly_Name", info["assembly_name"]])
writer.writerow(["Assembly_Level", info["assembly_level"]])
writer.writerow(
[
"Life_Stage",
"".join(pairs["value"] for pairs in attr if pairs["name"] == "life_stage"),
]
)
writer.writerow(
[
"Tissue",
"".join(pairs["value"] for pairs in attr if pairs["name"] == "tissue"),
]
)
writer.writerow(["Sex", "".join(pairs["value"] for pairs in attr if pairs["name"] == "sex")])
writer.writerow(["Accession", data.get("accession", math.nan)])
writer.writerow(["Common_Name", organism.get("common_name", math.nan)])
writer.writerow(["Organism_Name", organism.get("organism_name", math.nan)])
tol_id = "".join(pairs.get("value", "") for pairs in attr if pairs.get("name") == "tolid")
writer.writerow(["ToL_ID", tol_id if tol_id else math.nan])
writer.writerow(["Taxon_ID", organism.get("tax_id", math.nan)])
writer.writerow(["Assembly_Name", info.get("assembly_name", math.nan)])
writer.writerow(["Assembly_Level", info.get("assembly_level", math.nan)])
life_stage = "".join(pairs.get("value", "") for pairs in attr if pairs.get("name") == "life_stage")
writer.writerow(["Life_Stage", life_stage if life_stage else math.nan])
tissue = "".join(pairs.get("value", "") for pairs in attr if pairs.get("name") == "tissue")
writer.writerow(["Tissue", tissue if tissue else math.nan])
sex = "".join(pairs.get("value", "") for pairs in attr if pairs.get("name") == "sex")
writer.writerow(["Sex", sex if sex else math.nan])

# Write assembly statistics
writer.writerow(["##Assembly_Statistics"])
writer.writerow(["Total_Sequence", stats["total_sequence_length"]])
if "total_number_of_chromosomes" in stats:
writer.writerow(["Chromosomes", stats["total_number_of_chromosomes"]])
writer.writerow(["Scaffolds", stats["number_of_scaffolds"]])
writer.writerow(["Scaffold_N50", stats["scaffold_n50"]])
writer.writerow(["Contigs", stats["number_of_contigs"]])
writer.writerow(["Contig_N50", stats["contig_n50"]])
writer.writerow(["GC_Percent", stats["gc_percent"]])
writer.writerow(["Total_Sequence", stats.get("total_sequence_length", math.nan)])
writer.writerow(["Chromosomes", stats.get("total_number_of_chromosomes", math.nan)])
writer.writerow(["Scaffolds", stats.get("number_of_scaffolds", math.nan)])
writer.writerow(["Scaffold_N50", stats.get("scaffold_n50", math.nan)])
writer.writerow(["Contigs", stats.get("number_of_contigs", math.nan)])
writer.writerow(["Contig_N50", stats.get("contig_n50", math.nan)])
writer.writerow(["GC_Percent", stats.get("gc_percent", math.nan)])

chromosome_header = False
for mol in seq:
if "gc_percent" in mol and mol["assembly_unit"] != "non-nuclear":
if mol.get("gc_percent") is not None and mol.get("assembly_unit") != "non-nuclear":
if not chromosome_header:
writer.writerow(["##Chromosome", "Length", "GC_Percent"])
chromosome_header = True
writer.writerow(
[
mol["chr_name"],
round(mol["length"] / 1000000, 2),
mol["gc_percent"],
mol.get("chr_name", math.nan),
round(mol.get("length", 0) / 1000000, 2) if mol.get("length") is not None else math.nan,
mol.get("gc_percent", math.nan),
]
)

organelle_header = False
for mol in seq:
if "gc_percent" in mol and mol["assembly_unit"] == "non-nuclear":
if mol.get("gc_percent") is not None and mol.get("assembly_unit") == "non-nuclear":
if not organelle_header:
writer.writerow(["##Organelle", "Length", "GC_Percent"])
organelle_header = True
writer.writerow(
[
mol["assigned_molecule_location_type"],
round(mol["length"] / 1000000, 2),
mol["gc_percent"],
mol.get("assigned_molecule_location_type", math.nan),
round(mol.get("length", 0) / 1000000, 2) if mol.get("length") is not None else math.nan,
mol.get("gc_percent", math.nan),
]
)


def extract_busco(file_in, writer):
"""
Extracts BUSCO information from a JSON file and writes it to a CSV file.
Parameters:
file_in (str): Path to the BUSCO summary JSON file.
writer (csv.writer): CSV writer object to write the extracted data.
"""
with open(file_in, "r") as fin:
data = json.load(fin)

writer.writerow(["##BUSCO", data["lineage_dataset"]["name"]])
writer.writerow(["Summary", data["results"]["one_line_summary"]])
lineage_dataset_name = data.get("lineage_dataset", {}).get("name", math.nan)
results_summary = data.get("results", {}).get("one_line_summary", math.nan)

writer.writerow(["##BUSCO", lineage_dataset_name])
writer.writerow(["Summary", results_summary])


def extract_pacbio(qv, completeness, writer):
"""
Extracts QV and completeness information from TSV files and writes it to a
CSV file.
NOTE: completeness and qv files have to be from matching specimen names
Parameters:
qv (list): List of paths to one or more QV TSV files.
completeness (list): List of paths to completeness stats TSV files.
writer (csv.writer): CSV writer object to write the extracted data.
"""
qval = 0
qv_name = None
for f in qv:
Expand Down Expand Up @@ -153,6 +192,15 @@ def extract_pacbio(qv, completeness, writer):


def extract_mapped(sample, file_in, writer):
"""
Extracts mapping information from a flagstat file and writes it to a CSV
file.
Parameters:
sample (str): Sample ID used for the HiC contact maps.
file_in (str): Path to the HiC flagstat file created by Samtools.
writer (csv.writer): CSV writer object to write the extracted data.
"""
writer.writerow(["##HiC", remove_sample_T_suffix(sample)])
with open(file_in, "r") as fin:
for line in fin:
Expand Down
4 changes: 2 additions & 2 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ profiles {
arm {
docker.runOptions = '-u $(id -u):$(id -g) --platform=linux/amd64'
}
singularity {
singularity {
singularity.enabled = true
singularity.autoMounts = true
conda.enabled = false
Expand Down Expand Up @@ -222,7 +222,7 @@ manifest {
description = """Creating standarised genome assembly publications"""
mainScript = 'main.nf'
nextflowVersion = '!>=22.10.1'
version = '1.2.0'
version = '1.2.1'
doi = '10.5281/zenodo.7949384'
}

Expand Down

0 comments on commit 028ddae

Please sign in to comment.