Skip to content

Commit

Permalink
#5 - benchmarks - some random HGVSs from ClinVar
Browse files Browse the repository at this point in the history
  • Loading branch information
davmlaw committed Jan 28, 2022
1 parent ddb3ca6 commit c406327
Show file tree
Hide file tree
Showing 5 changed files with 793 additions and 0 deletions.
133 changes: 133 additions & 0 deletions tests/benchmark_hgvs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
#!/usr/bin/env python3
import time
from argparse import ArgumentParser

import hgvs
import hgvs.dataproviders.uta
from hgvs.assemblymapper import AssemblyMapper
from hgvs.exceptions import HGVSDataNotAvailableError, HGVSInvalidVariantError

from cdot.hgvs.dataproviders import JSONDataProvider, RESTDataProvider


def handle_args(argv=None):
    """Parse the benchmark's command-line arguments.

    Args:
        argv: Optional list of argument strings to parse instead of
            ``sys.argv[1:]``. Defaults to ``None`` (use the process arguments).

    Returns:
        The parsed ``argparse.Namespace`` with attributes ``hgvs_file``,
        ``uta``, ``rest`` and ``json``.

    Exits (via ``parser.error``) if no data source is selected, or if more
    than one of ``--uta`` / ``--rest`` / ``--json`` is given.
    """
    parser = ArgumentParser(description='Benchmark cdot')
    parser.add_argument("hgvs_file")
    # Only one data provider is ever used per run, so all three selectors are
    # mutually exclusive. Previously --json sat outside the group and was
    # silently ignored when combined with --uta or --rest.
    group = parser.add_mutually_exclusive_group()
    group.add_argument('--uta', action='store_true')
    group.add_argument('--rest', action='store_true')
    group.add_argument('--json', help='JSON file')
    args = parser.parse_args(argv)
    # Explicit check (rather than required=True on the group) so an empty
    # --json value is still rejected here.
    if not any([args.uta, args.rest, args.json]):
        parser.error("You need to specify at least one of 'uta', 'rest', 'json'")
    return args


def _load_hgvs_pairs(path):
    """Read whitespace-separated (genomic HGVS, transcript HGVS) pairs from *path*.

    Blank lines are skipped. A non-blank line that does not contain exactly
    two fields raises ``ValueError`` naming the file and line number, instead
    of failing later with an opaque unpacking error in the benchmark loop.
    """
    pairs = []
    with open(path) as f:
        for line_number, line in enumerate(f, start=1):
            fields = line.split()
            if not fields:
                continue  # tolerate blank / trailing empty lines
            if len(fields) != 2:
                raise ValueError(
                    f"{path}:{line_number}: expected 2 columns, got {len(fields)}"
                )
            pairs.append(fields)
    return pairs


def main():
    """Benchmark transcript -> genomic HGVS conversion for one data provider.

    Loads the expected (g.HGVS, c./n.HGVS) pairs from the file given on the
    command line, converts each transcript HGVS to genomic coordinates using
    the selected data provider, compares the result with the expected string,
    and prints a summary followed by the list of per-record run times.

    Only records whose conversion matched the expected value contribute a
    timing sample; failures and mismatches are counted but not timed.
    """
    args = handle_args()

    hgvs_g_c_list = _load_hgvs_pairs(args.hgvs_file)
    total = len(hgvs_g_c_list)
    print(f"Using {total} test records")

    # handle_args() guarantees at least one of these is set.
    if args.uta:
        hdp = hgvs.dataproviders.uta.connect()
    elif args.rest:
        hdp = RESTDataProvider()  # Uses API server at cdot.cc
    elif args.json:
        hdp = JSONDataProvider({"GRCh38": args.json})

    am = AssemblyMapper(hdp,
                        assembly_name='GRCh38',
                        alt_aln_method='splign', replace_reference=True)

    hp = hgvs.parser.Parser()

    run_times = []
    correct = 0
    incorrect = 0
    no_data = 0
    for hgvs_g, hgvs_c in hgvs_g_c_list:
        # perf_counter is monotonic and high resolution, unlike wall-clock
        # time.time(), so short intervals are measured reliably.
        start = time.perf_counter()
        try:
            var_c = hp.parse_hgvs_variant(hgvs_c)
            # Coding transcripts use c. coordinates; everything else in the
            # input is treated as n. (non-coding) coordinates.
            if ":c." in hgvs_c:
                converted_hgvs_g = str(am.c_to_g(var_c))
            else:
                converted_hgvs_g = str(am.n_to_g(var_c))
        except HGVSDataNotAvailableError:
            no_data += 1
            continue
        except HGVSInvalidVariantError as ive:
            print(f"{hgvs_c}: {ive}")
            incorrect += 1
            continue

        if converted_hgvs_g != hgvs_g:
            incorrect += 1
            print(f"{hgvs_c}: '{hgvs_g}' != '{converted_hgvs_g}' (actual)")
            continue
        correct += 1

        run_times.append(time.perf_counter() - start)

    print(f"Total: {total}, correct: {correct}, incorrect: {incorrect}, no data: {no_data}")
    print(run_times)


# Run the benchmark only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

"""
* Get a subset of rows from ClinVar VCF
* zgrep "^#" clinvar.vcf.gz > header.txt
* zgrep -v "^#" clinvar.vcf.gz | shuf -n 1000 > clinvar_1k_rows.vcf
* cat header.txt clinvar_1k_rows.vcf | gzip > clinvar_1k.vcf.gz
* Annotate the VCF to get MANE transcript (via --pick)
vep -i clinvar_1k.vcf.gz -o clinvar_1k.vep_annotated.vcf.gz --cache --dir /data/annotation/VEP/vep_cache --fasta /data/annotation/fasta/GCF_000001405.39_GRCh38.p13_genomic.fna.gz --assembly GRCh38 --offline --use_given_ref --vcf --compress_output gzip --force_overwrite --pick --no_escape --hgvs --refseq --buffer_size 1000
* Extract out the g.HGVS and c.HGVS
def cyvcf2_header_types(cyvcf2_reader):
header_types = defaultdict(dict)
for h in cyvcf2_reader.header_iter():
info = h.info()
h_id = info.get("ID")
if h_id: # Not much use w/o this
header_types[h.type][h_id] = info
return header_types
reader = Reader("./clinvar_1k.vcf.gz")
header_types = cyvcf2_header_types(reader)
description = header_types["INFO"]["CSQ"]["description"]
description = description.replace('"', '') # Strip double quotes
match = "Format: "
columns_str = description[description.rfind(match) + len(match):]
vep_columns = columns_str.split("|")
hgvs = []
for v in reader:
csq = v.INFO.get("CSQ")
td = dict(zip(vep_columns, csq.split("|")))
g_hgvs = v.INFO.get("CLNHGVS")
c_hgvs = td.get("HGVSc")
if g_hgvs and c_hgvs:
hgvs.append((g_hgvs, c_hgvs))
"""
10 changes: 10 additions & 0 deletions tests/test_data/clinvar_hgvs_010.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
NC_000002.12:g.189003410G>A NM_000090.4:c.2554-1G>A
NC_000002.12:g.73572910G>A NM_015120.4:c.11036G>A
NC_000003.12:g.36996633_36996634delinsTT NM_001354619.1:c.-593_-592delinsTT
NC_000003.12:g.58149866C>T NM_001164317.2:c.6201C>T
NC_000006.12:g.52082460T>C NM_138694.4:c.213A>G
NC_000007.14:g.98906228C>T NM_001244580.1:c.1088C>T
NC_000010.11:g.120905027T>C NM_018117.12:c.3193+216T>C
NC_000012.12:g.8854218G>A NM_144670.6:c.2681G>A
NC_000016.10:g.81869317C>T NM_002661.5:c.564+19C>T
NC_000017.11:g.58734198A>G NM_058216.3:c.1107A>G
50 changes: 50 additions & 0 deletions tests/test_data/clinvar_hgvs_050.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
NC_000001.11:g.237617406G>A NM_001035.3:c.5836G>A
NC_000001.11:g.33021456_33021458del NM_001625.4:c.336_338del
NC_000001.11:g.52383874T>C NM_004153.4:c.1819A>G
NC_000001.11:g.52397773C>T NM_004153.4:c.314G>A
NC_000001.11:g.53213501A>C NM_000098.3:c.1883A>C
NC_000001.11:g.94047046C>T NM_000350.3:c.2791G>A
NC_000002.12:g.169275142G>A NM_004525.3:c.1869C>T
NC_000002.12:g.178538451C>G NM_001267550.2:c.99289+89G>C
NC_000002.12:g.178608202G>T NM_001267550.2:c.52681C>A
NC_000002.12:g.178741896G>A NM_001267550.2:c.11337C>T
NC_000002.12:g.214781051A>T NM_000465.4:c.823T>A
NC_000002.12:g.219490515_219490517del NM_005876.5:c.9028_9030del
NC_000002.12:g.46909034C>T NM_001171511.2:c.93-1065G>A
NC_000002.12:g.47806494A>C NM_001281493.1:c.2938A>C
NC_000002.12:g.47813341T>C NM_001190274.2:c.2120A>G
NC_000002.12:g.85888897C>A NM_003896.4:c.9G>T
NC_000003.12:g.123752448G>C NM_001321309.2:c.-155-12447C>G
NC_000005.10:g.10255709T>C NM_012073.5:c.332-246T>C
NC_000005.10:g.112841537T>A NM_000038.6:c.5943T>A
NC_000005.10:g.139026767A>G NM_022464.5:c.645+34T>C
NC_000005.10:g.149027638G>A NM_024577.4:c.2094C>T
NC_000005.10:g.35873480C>T NM_002185.5:c.538C>T
NC_000007.14:g.16089456T>C NM_001101426.4:c.*2239A>G
NC_000007.14:g.97852358C>T NM_001673.5:c.1587G>A
NC_000008.11:g.18062326C>A NM_177924.5:c.601G>T
NC_000008.11:g.89980722C>T NM_002485.5:c.480+12G>A
NC_000009.12:g.37784879T>G NM_016042.4:c.166A>C
NC_000009.12:g.69035905C>T NM_000144.5:c.123C>T
NC_000010.11:g.87925523T>C NM_000314.8:c.175T>C
NC_000011.10:g.112045279del NR_164072.1:n.1167+49del
NC_000011.10:g.118312837C>T NM_000733.4:c.323C>T
NC_000011.10:g.17427125C>T NR_147094.2:n.2212G>A
NC_000011.10:g.64809875del NM_130804.2:c.237del
NC_000012.12:g.120737861C>T NM_000017.4:c.497C>T
NC_000013.11:g.27920121C>T NM_000209.4:c.-18C>T
NC_000014.9:g.30879438C>T NM_004086.3:c.389C>T
NC_000016.10:g.89770196G>A NM_000135.4:c.2286C>T
NC_000017.11:g.43045725C>T NR_027676.2:n.5722G>A
NC_000017.11:g.43094198del NR_027676.2:n.1510del
NC_000017.11:g.59031925C>T NM_015294.6:c.1919G>A
NC_000017.11:g.80058782T>C NM_017950.4:c.1318-76T>C
NC_000018.10:g.58343058C>G NM_001144967.3:c.1530C>G
NC_000018.10:g.70176380_70176383del NM_173630.4:c.1476+295_1476+298del
NC_000019.10:g.12897438G>A NM_000159.4:c.1082+10G>A
NC_000021.9:g.42477365G>A NM_080860.4:c.653C>T
NC_000022.11:g.17110140A>G NM_001289905.1:c.*320A>G
NC_000023.11:g.154767342C>T NM_001363.5:c.600C>T
NC_000023.11:g.19355691T>C NM_000284.4:c.765T>C
NC_000023.11:g.45059450G>A NR_111960.1:n.1555G>A
NC_000023.11:g.74742052G>A NM_001008537.3:c.2505C>T
100 changes: 100 additions & 0 deletions tests/test_data/clinvar_hgvs_100.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
NC_000001.11:g.12007126G>A NM_001127660.1:c.1946G>A
NC_000001.11:g.21860220G>A NM_005529.7:c.4971C>T
NC_000001.11:g.237643408G>C NM_001035.3:c.7303G>C
NC_000001.11:g.23808165T>C NM_000191.3:c.720A>G
NC_000001.11:g.241517292C>A NM_000143.4:c.157G>T
NC_000001.11:g.94111501G>A NM_000350.3:c.239C>T
NC_000002.12:g.113220093C>T NM_003466.4:c.1275G>A
NC_000002.12:g.144399194T>C NM_014795.4:c.1993A>G
NC_000002.12:g.15286830T>C NM_015909.4:c.5138+243A>G
NC_000002.12:g.178592272G>A NM_001267550.2:c.59632C>T
NC_000002.12:g.178715710C>T NM_001267550.2:c.25704G>A
NC_000002.12:g.214745103C>G NM_000465.4:c.1867G>C
NC_000002.12:g.47803493G>C NM_001281493.1:c.2340G>C
NC_000002.12:g.73432303A>G NM_015120.4:c.1432+12A>G
NC_000002.12:g.73453022T>C NM_015120.4:c.6498T>C
NC_000002.12:g.85343265G>T NM_017750.4:c.1810C>A
NC_000002.12:g.85888897C>A NM_003896.4:c.9G>T
NC_000003.12:g.15644611A>G NM_001281723.3:c.695A>G
NC_000003.12:g.158691443T>C NR_164500.1:n.2195T>C
NC_000003.12:g.43598588T>A NM_018075.5:c.416A>T
NC_000003.12:g.49099390C>T NM_005051.3:c.1568G>A
NC_000003.12:g.49099430C>T NM_005051.3:c.1528G>A
NC_000003.12:g.52402867A>G NM_004656.4:c.1895T>C
NC_000003.12:g.69118837G>T NM_001304418.3:c.1518C>A
NC_000004.12:g.112646986A>G NM_016648.4:c.552+31A>G
NC_000004.12:g.113355165A>G NM_001148.6:c.6547A>G
NC_000004.12:g.43030609T>A NM_001080476.2:c.*69T>A
NC_000004.12:g.52038248C>A NM_000232.5:c.12G>T
NC_000004.12:g.83273598C>T NM_015697.8:c.590G>A
NC_000004.12:g.88268819C>T NM_152542.5:c.629G>A
NC_000005.10:g.113064054C>G NM_001085377.2:c.2143G>C
NC_000005.10:g.126550263C>A NM_001182.5:c.1348G>T
NC_000005.10:g.179126029C>T NM_014244.5:c.2719G>A
NC_000005.10:g.79051354T>C NM_013391.3:c.678A>G
NC_000005.10:g.83539495C>T NM_004385.5:c.6492C>T
NC_000006.12:g.129280072C>T NM_001079823.2:c.2462C>T
NC_000006.12:g.129353296C>A NM_001079823.2:c.4656C>A
NC_000006.12:g.52079909A>G NM_138694.4:c.381T>C
NC_000006.12:g.52082460T>C NM_138694.4:c.213A>G
NC_000007.14:g.22945636C>G NM_032581.4:c.1519G>C
NC_000007.14:g.5999116G>C NM_000535.7:c.697C>G
NC_000007.14:g.93146872C>A NM_152703.5:c.-779+11G>T
NC_000008.11:g.89980722C>T NM_002485.5:c.480+12G>A
NC_000009.12:g.2717979G>A NM_133497.4:c.240G>A
NC_000009.12:g.34648800G>A NM_001258332.1:c.399G>A
NC_000009.12:g.37745681G>A NM_014907.3:c.3649G>A
NC_000009.12:g.69035905C>T NM_000144.5:c.123C>T
NC_000009.12:g.92045993A>G NM_006415.4:c.1136+6T>C
NC_000010.11:g.110784352G>A NM_001134363.3:c.1349G>A
NC_000010.11:g.110799819A>G NM_001134363.3:c.1701A>G
NC_000010.11:g.111013593T>C NR_136749.1:n.2936T>C
NC_000011.10:g.112045279del NR_164072.1:n.1167+49del
NC_000011.10:g.118503818del NM_001197104.2:c.7926del
NC_000011.10:g.119026031A>G NM_001164279.2:c.701T>C
NC_000011.10:g.1752849G>A NM_001909.5:c.*654C>T
NC_000011.10:g.47342611del NM_000256.3:c.1595del
NC_000011.10:g.5226575del NM_000518.5:c.315+2del
NC_000011.10:g.61445957A>G NM_017841.4:c.387A>G
NC_000013.11:g.32332371_32332377delinsTACTTCAG NM_000059.3:c.893_899delinsTACTTCAG
NC_000013.11:g.32337627T>C NM_000059.3:c.3272T>C
NC_000013.11:g.32339462G>C NM_000059.3:c.5107G>C
NC_000014.9:g.23432514G>C NM_000257.4:c.503-8C>G
NC_000014.9:g.45176022C>G NM_020937.4:c.3268C>G
NC_000014.9:g.67766373A>G NM_015346.4:c.5865T>C
NC_000014.9:g.89980550C>G NM_018319.4:c.802C>G
NC_000014.9:g.92006041T>C NM_004239.4:c.1935A>G
NC_000015.10:g.90754814G>A NM_000057.4:c.963G>A
NC_000015.10:g.92985680G>T NM_001271.4:c.3413+7G>T
NC_000016.10:g.1352026C>T NM_032520.5:c.52+9C>T
NC_000016.10:g.173193A>G NM_000517.6:c.164A>G
NC_000016.10:g.2046301C>G NM_002528.7:c.181G>C
NC_000016.10:g.2048728T>C NM_000548.5:c.113T>C
NC_000016.10:g.2081776T>C NM_000548.5:c.3792T>C
NC_000016.10:g.2109256C>T NM_000296.4:c.5911G>A
NC_000016.10:g.30993204G>T NM_052874.5:c.712C>A
NC_000017.11:g.41769530G>A NM_002230.4:c.356C>T
NC_000017.11:g.43094198del NR_027676.2:n.1510del
NC_000017.11:g.50356597T>C NM_022167.4:c.1569T>C
NC_000017.11:g.59031925C>T NM_015294.6:c.1919G>A
NC_000017.11:g.65557863T>A NM_004655.4:c.758A>T
NC_000017.11:g.65557982_65557984del NM_004655.4:c.639_641del
NC_000018.10:g.46639718G>A NM_144612.6:c.409C>T
NC_000018.10:g.70176380_70176383del NM_173630.4:c.1476+295_1476+298del
NC_000019.10:g.13298566C>T NM_001127222.2:c.3067G>A
NC_000019.10:g.48969229T>A NM_002103.5:c.*59A>T
NC_000019.10:g.51353233dup NM_001014763.1:c.551dup
NC_000019.10:g.53904356T>C NM_002739.5:c.1657-279T>C
NC_000019.10:g.54191771C>T NM_001077446.4:c.294C>T
NC_000019.10:g.54193391G>A NM_001077446.4:c.*29G>A
NC_000020.11:g.10658702_10658703insA NM_000214.3:c.459_460insT
NC_000020.11:g.10673008T>C NM_000214.3:c.82-2A>G
NC_000020.11:g.31831073G>A NM_033118.4:c.1356G>A
NC_000020.11:g.35504820_35504831del NM_007186.6:c.6451_6462del
NC_000020.11:g.63494913G>A NM_001958.5:c.513C>T
NC_000021.9:g.32678630del NM_003895.3:c.1627+15del
NC_000021.9:g.34370822G>T NM_172201.1:c.344G>T
NC_000021.9:g.45468322G>A NM_130445.4:c.187G>A
NC_000021.9:g.45511078A>C NM_130445.4:c.3694-33A>C
NC_000023.11:g.22221720T>C NM_000444.6:c.1876T>C
NC_000023.11:g.41343808_41343809dup NR_126093.1:n.1696_1697dup
Loading

0 comments on commit c406327

Please sign in to comment.