-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
#5 - benchmarks - some random HGVSs from ClinVar
- Loading branch information
Showing
5 changed files
with
793 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,133 @@ | ||
#!/usr/bin/env python3 | ||
import time | ||
from argparse import ArgumentParser | ||
|
||
import hgvs | ||
import hgvs.dataproviders.uta | ||
from hgvs.assemblymapper import AssemblyMapper | ||
from hgvs.exceptions import HGVSDataNotAvailableError, HGVSInvalidVariantError | ||
|
||
from cdot.hgvs.dataproviders import JSONDataProvider, RESTDataProvider | ||
|
||
|
||
def handle_args():
    """Parse the benchmark's command-line arguments.

    Expects a positional ``hgvs_file`` plus at least one data-provider
    option: ``--uta`` or ``--rest`` (flags, mutually exclusive with each
    other) or ``--json <file>``.

    Returns:
        The parsed ``argparse.Namespace``.

    Exits via ``parser.error`` when no data-provider option was given.
    """
    arg_parser = ArgumentParser(description='Benchmark cdot')
    arg_parser.add_argument("hgvs_file")

    # Only --uta and --rest exclude each other; --json is a separate option.
    provider_flags = arg_parser.add_mutually_exclusive_group()
    for flag in ('--uta', '--rest'):
        provider_flags.add_argument(flag, action='store_true')
    arg_parser.add_argument('--json', help='JSON file')

    parsed = arg_parser.parse_args()
    if not (parsed.uta or parsed.rest or parsed.json):
        arg_parser.error("You need to specify at least one of 'uta', 'rest', 'json'")
    return parsed
|
||
|
||
def main():
    """Benchmark transcript-to-genomic HGVS conversion against expected values.

    Reads whitespace-separated ``<g.HGVS> <c./n.HGVS>`` pairs from the file
    given on the command line, converts each transcript variant to genomic
    coordinates (GRCh38) using the selected data provider (UTA, the cdot
    REST API, or a cdot JSON file), and compares the result with the
    expected g.HGVS string.

    Mismatches and invalid variants are printed as they occur. At the end a
    summary line and the list of per-record run times (in seconds) are
    printed. Only records that converted to the expected value contribute a
    run time.
    """
    args = handle_args()

    with open(args.hgvs_file) as f:
        # Skip blank lines: they would split to [] and break the two-value
        # unpacking in the conversion loop below.
        hgvs_g_c_list = [line.split() for line in f if line.strip()]

    total = len(hgvs_g_c_list)
    print(f"Using {total} test records")

    # handle_args() guarantees that at least one of these options is set.
    if args.uta:
        hdp = hgvs.dataproviders.uta.connect()
    elif args.rest:
        hdp = RESTDataProvider()  # Uses API server at cdot.cc
    elif args.json:
        hdp = JSONDataProvider({"GRCh38": args.json})

    am = AssemblyMapper(hdp,
                        assembly_name='GRCh38',
                        alt_aln_method='splign', replace_reference=True)

    hp = hgvs.parser.Parser()

    run_times = []
    correct = 0
    incorrect = 0
    no_data = 0
    for hgvs_g, hgvs_c in hgvs_g_c_list:
        # perf_counter() is monotonic and high-resolution, unlike time.time().
        start = time.perf_counter()
        try:
            var_c = hp.parse_hgvs_variant(hgvs_c)
            # c. variants go through c_to_g; anything else is treated as n.
            if ":c." in hgvs_c:
                converted_hgvs_g = str(am.c_to_g(var_c))
            else:
                converted_hgvs_g = str(am.n_to_g(var_c))
        except HGVSDataNotAvailableError:
            no_data += 1
            continue
        except HGVSInvalidVariantError as ive:
            print(f"{hgvs_c}: {ive}")
            incorrect += 1
            continue
        time_taken = time.perf_counter() - start

        if converted_hgvs_g != hgvs_g:
            incorrect += 1
            print(f"{hgvs_c}: '{hgvs_g}' != '{converted_hgvs_g}' (actual)")
            continue

        # Only successful, correct conversions are included in the timings.
        correct += 1
        run_times.append(time_taken)

    print(f"Total: {total}, correct: {correct}, incorrect: {incorrect}, no data: {no_data}")
    print(run_times)
|
||
|
||
# Run the benchmark only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
|
||
""" | ||
* Get a subset of rows from ClinVar VCF | ||
* zgrep "^#" clinvar.vcf.gz > header.txt | ||
* zgrep -v "^#" clinvar.vcf.gz | shuf -n 1000 > clinvar_1k_rows.vcf | ||
* cat header.txt clinvar_1k_rows.vcf | gzip > clinvar_1k.vcf.gz | ||
* Annotate the VCF to get MANE transcript (via --pick) | ||
vep -i clinvar_1k.vcf.gz -o clinvar_1k.vep_annotated.vcf.gz --cache --dir /data/annotation/VEP/vep_cache --fasta /data/annotation/fasta/GCF_000001405.39_GRCh38.p13_genomic.fna.gz --assembly GRCh38 --offline --use_given_ref --vcf --compress_output gzip --force_overwrite --pick --no_escape --hgvs --refseq --buffer_size 1000 | ||
* Extract out the g.HGVS and c.HGVS | ||
def cyvcf2_header_types(cyvcf2_reader): | ||
header_types = defaultdict(dict) | ||
for h in cyvcf2_reader.header_iter(): | ||
info = h.info() | ||
h_id = info.get("ID") | ||
if h_id: # Not much use w/o this | ||
header_types[h.type][h_id] = info | ||
return header_types | ||
reader = Reader("./clinvar_1k.vep_annotated.vcf.gz") | ||
header_types = cyvcf2_header_types(reader) | ||
description = header_types["INFO"]["CSQ"]["description"] | ||
description = description.replace('"', '') # Strip double quotes | ||
match = "Format: " | ||
columns_str = description[description.rfind(match) + len(match):] | ||
vep_columns = columns_str.split("|") | ||
hgvs = [] | ||
for v in reader: | ||
csq = v.INFO.get("CSQ") | ||
td = dict(zip(vep_columns, csq.split("|"))) | ||
g_hgvs = v.INFO.get("CLNHGVS") | ||
c_hgvs = td.get("HGVSc") | ||
if g_hgvs and c_hgvs: | ||
hgvs.append((g_hgvs, c_hgvs)) | ||
""" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
NC_000002.12:g.189003410G>A NM_000090.4:c.2554-1G>A | ||
NC_000002.12:g.73572910G>A NM_015120.4:c.11036G>A | ||
NC_000003.12:g.36996633_36996634delinsTT NM_001354619.1:c.-593_-592delinsTT | ||
NC_000003.12:g.58149866C>T NM_001164317.2:c.6201C>T | ||
NC_000006.12:g.52082460T>C NM_138694.4:c.213A>G | ||
NC_000007.14:g.98906228C>T NM_001244580.1:c.1088C>T | ||
NC_000010.11:g.120905027T>C NM_018117.12:c.3193+216T>C | ||
NC_000012.12:g.8854218G>A NM_144670.6:c.2681G>A | ||
NC_000016.10:g.81869317C>T NM_002661.5:c.564+19C>T | ||
NC_000017.11:g.58734198A>G NM_058216.3:c.1107A>G |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
NC_000001.11:g.237617406G>A NM_001035.3:c.5836G>A | ||
NC_000001.11:g.33021456_33021458del NM_001625.4:c.336_338del | ||
NC_000001.11:g.52383874T>C NM_004153.4:c.1819A>G | ||
NC_000001.11:g.52397773C>T NM_004153.4:c.314G>A | ||
NC_000001.11:g.53213501A>C NM_000098.3:c.1883A>C | ||
NC_000001.11:g.94047046C>T NM_000350.3:c.2791G>A | ||
NC_000002.12:g.169275142G>A NM_004525.3:c.1869C>T | ||
NC_000002.12:g.178538451C>G NM_001267550.2:c.99289+89G>C | ||
NC_000002.12:g.178608202G>T NM_001267550.2:c.52681C>A | ||
NC_000002.12:g.178741896G>A NM_001267550.2:c.11337C>T | ||
NC_000002.12:g.214781051A>T NM_000465.4:c.823T>A | ||
NC_000002.12:g.219490515_219490517del NM_005876.5:c.9028_9030del | ||
NC_000002.12:g.46909034C>T NM_001171511.2:c.93-1065G>A | ||
NC_000002.12:g.47806494A>C NM_001281493.1:c.2938A>C | ||
NC_000002.12:g.47813341T>C NM_001190274.2:c.2120A>G | ||
NC_000002.12:g.85888897C>A NM_003896.4:c.9G>T | ||
NC_000003.12:g.123752448G>C NM_001321309.2:c.-155-12447C>G | ||
NC_000005.10:g.10255709T>C NM_012073.5:c.332-246T>C | ||
NC_000005.10:g.112841537T>A NM_000038.6:c.5943T>A | ||
NC_000005.10:g.139026767A>G NM_022464.5:c.645+34T>C | ||
NC_000005.10:g.149027638G>A NM_024577.4:c.2094C>T | ||
NC_000005.10:g.35873480C>T NM_002185.5:c.538C>T | ||
NC_000007.14:g.16089456T>C NM_001101426.4:c.*2239A>G | ||
NC_000007.14:g.97852358C>T NM_001673.5:c.1587G>A | ||
NC_000008.11:g.18062326C>A NM_177924.5:c.601G>T | ||
NC_000008.11:g.89980722C>T NM_002485.5:c.480+12G>A | ||
NC_000009.12:g.37784879T>G NM_016042.4:c.166A>C | ||
NC_000009.12:g.69035905C>T NM_000144.5:c.123C>T | ||
NC_000010.11:g.87925523T>C NM_000314.8:c.175T>C | ||
NC_000011.10:g.112045279del NR_164072.1:n.1167+49del | ||
NC_000011.10:g.118312837C>T NM_000733.4:c.323C>T | ||
NC_000011.10:g.17427125C>T NR_147094.2:n.2212G>A | ||
NC_000011.10:g.64809875del NM_130804.2:c.237del | ||
NC_000012.12:g.120737861C>T NM_000017.4:c.497C>T | ||
NC_000013.11:g.27920121C>T NM_000209.4:c.-18C>T | ||
NC_000014.9:g.30879438C>T NM_004086.3:c.389C>T | ||
NC_000016.10:g.89770196G>A NM_000135.4:c.2286C>T | ||
NC_000017.11:g.43045725C>T NR_027676.2:n.5722G>A | ||
NC_000017.11:g.43094198del NR_027676.2:n.1510del | ||
NC_000017.11:g.59031925C>T NM_015294.6:c.1919G>A | ||
NC_000017.11:g.80058782T>C NM_017950.4:c.1318-76T>C | ||
NC_000018.10:g.58343058C>G NM_001144967.3:c.1530C>G | ||
NC_000018.10:g.70176380_70176383del NM_173630.4:c.1476+295_1476+298del | ||
NC_000019.10:g.12897438G>A NM_000159.4:c.1082+10G>A | ||
NC_000021.9:g.42477365G>A NM_080860.4:c.653C>T | ||
NC_000022.11:g.17110140A>G NM_001289905.1:c.*320A>G | ||
NC_000023.11:g.154767342C>T NM_001363.5:c.600C>T | ||
NC_000023.11:g.19355691T>C NM_000284.4:c.765T>C | ||
NC_000023.11:g.45059450G>A NR_111960.1:n.1555G>A | ||
NC_000023.11:g.74742052G>A NM_001008537.3:c.2505C>T |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,100 @@ | ||
NC_000001.11:g.12007126G>A NM_001127660.1:c.1946G>A | ||
NC_000001.11:g.21860220G>A NM_005529.7:c.4971C>T | ||
NC_000001.11:g.237643408G>C NM_001035.3:c.7303G>C | ||
NC_000001.11:g.23808165T>C NM_000191.3:c.720A>G | ||
NC_000001.11:g.241517292C>A NM_000143.4:c.157G>T | ||
NC_000001.11:g.94111501G>A NM_000350.3:c.239C>T | ||
NC_000002.12:g.113220093C>T NM_003466.4:c.1275G>A | ||
NC_000002.12:g.144399194T>C NM_014795.4:c.1993A>G | ||
NC_000002.12:g.15286830T>C NM_015909.4:c.5138+243A>G | ||
NC_000002.12:g.178592272G>A NM_001267550.2:c.59632C>T | ||
NC_000002.12:g.178715710C>T NM_001267550.2:c.25704G>A | ||
NC_000002.12:g.214745103C>G NM_000465.4:c.1867G>C | ||
NC_000002.12:g.47803493G>C NM_001281493.1:c.2340G>C | ||
NC_000002.12:g.73432303A>G NM_015120.4:c.1432+12A>G | ||
NC_000002.12:g.73453022T>C NM_015120.4:c.6498T>C | ||
NC_000002.12:g.85343265G>T NM_017750.4:c.1810C>A | ||
NC_000002.12:g.85888897C>A NM_003896.4:c.9G>T | ||
NC_000003.12:g.15644611A>G NM_001281723.3:c.695A>G | ||
NC_000003.12:g.158691443T>C NR_164500.1:n.2195T>C | ||
NC_000003.12:g.43598588T>A NM_018075.5:c.416A>T | ||
NC_000003.12:g.49099390C>T NM_005051.3:c.1568G>A | ||
NC_000003.12:g.49099430C>T NM_005051.3:c.1528G>A | ||
NC_000003.12:g.52402867A>G NM_004656.4:c.1895T>C | ||
NC_000003.12:g.69118837G>T NM_001304418.3:c.1518C>A | ||
NC_000004.12:g.112646986A>G NM_016648.4:c.552+31A>G | ||
NC_000004.12:g.113355165A>G NM_001148.6:c.6547A>G | ||
NC_000004.12:g.43030609T>A NM_001080476.2:c.*69T>A | ||
NC_000004.12:g.52038248C>A NM_000232.5:c.12G>T | ||
NC_000004.12:g.83273598C>T NM_015697.8:c.590G>A | ||
NC_000004.12:g.88268819C>T NM_152542.5:c.629G>A | ||
NC_000005.10:g.113064054C>G NM_001085377.2:c.2143G>C | ||
NC_000005.10:g.126550263C>A NM_001182.5:c.1348G>T | ||
NC_000005.10:g.179126029C>T NM_014244.5:c.2719G>A | ||
NC_000005.10:g.79051354T>C NM_013391.3:c.678A>G | ||
NC_000005.10:g.83539495C>T NM_004385.5:c.6492C>T | ||
NC_000006.12:g.129280072C>T NM_001079823.2:c.2462C>T | ||
NC_000006.12:g.129353296C>A NM_001079823.2:c.4656C>A | ||
NC_000006.12:g.52079909A>G NM_138694.4:c.381T>C | ||
NC_000006.12:g.52082460T>C NM_138694.4:c.213A>G | ||
NC_000007.14:g.22945636C>G NM_032581.4:c.1519G>C | ||
NC_000007.14:g.5999116G>C NM_000535.7:c.697C>G | ||
NC_000007.14:g.93146872C>A NM_152703.5:c.-779+11G>T | ||
NC_000008.11:g.89980722C>T NM_002485.5:c.480+12G>A | ||
NC_000009.12:g.2717979G>A NM_133497.4:c.240G>A | ||
NC_000009.12:g.34648800G>A NM_001258332.1:c.399G>A | ||
NC_000009.12:g.37745681G>A NM_014907.3:c.3649G>A | ||
NC_000009.12:g.69035905C>T NM_000144.5:c.123C>T | ||
NC_000009.12:g.92045993A>G NM_006415.4:c.1136+6T>C | ||
NC_000010.11:g.110784352G>A NM_001134363.3:c.1349G>A | ||
NC_000010.11:g.110799819A>G NM_001134363.3:c.1701A>G | ||
NC_000010.11:g.111013593T>C NR_136749.1:n.2936T>C | ||
NC_000011.10:g.112045279del NR_164072.1:n.1167+49del | ||
NC_000011.10:g.118503818del NM_001197104.2:c.7926del | ||
NC_000011.10:g.119026031A>G NM_001164279.2:c.701T>C | ||
NC_000011.10:g.1752849G>A NM_001909.5:c.*654C>T | ||
NC_000011.10:g.47342611del NM_000256.3:c.1595del | ||
NC_000011.10:g.5226575del NM_000518.5:c.315+2del | ||
NC_000011.10:g.61445957A>G NM_017841.4:c.387A>G | ||
NC_000013.11:g.32332371_32332377delinsTACTTCAG NM_000059.3:c.893_899delinsTACTTCAG | ||
NC_000013.11:g.32337627T>C NM_000059.3:c.3272T>C | ||
NC_000013.11:g.32339462G>C NM_000059.3:c.5107G>C | ||
NC_000014.9:g.23432514G>C NM_000257.4:c.503-8C>G | ||
NC_000014.9:g.45176022C>G NM_020937.4:c.3268C>G | ||
NC_000014.9:g.67766373A>G NM_015346.4:c.5865T>C | ||
NC_000014.9:g.89980550C>G NM_018319.4:c.802C>G | ||
NC_000014.9:g.92006041T>C NM_004239.4:c.1935A>G | ||
NC_000015.10:g.90754814G>A NM_000057.4:c.963G>A | ||
NC_000015.10:g.92985680G>T NM_001271.4:c.3413+7G>T | ||
NC_000016.10:g.1352026C>T NM_032520.5:c.52+9C>T | ||
NC_000016.10:g.173193A>G NM_000517.6:c.164A>G | ||
NC_000016.10:g.2046301C>G NM_002528.7:c.181G>C | ||
NC_000016.10:g.2048728T>C NM_000548.5:c.113T>C | ||
NC_000016.10:g.2081776T>C NM_000548.5:c.3792T>C | ||
NC_000016.10:g.2109256C>T NM_000296.4:c.5911G>A | ||
NC_000016.10:g.30993204G>T NM_052874.5:c.712C>A | ||
NC_000017.11:g.41769530G>A NM_002230.4:c.356C>T | ||
NC_000017.11:g.43094198del NR_027676.2:n.1510del | ||
NC_000017.11:g.50356597T>C NM_022167.4:c.1569T>C | ||
NC_000017.11:g.59031925C>T NM_015294.6:c.1919G>A | ||
NC_000017.11:g.65557863T>A NM_004655.4:c.758A>T | ||
NC_000017.11:g.65557982_65557984del NM_004655.4:c.639_641del | ||
NC_000018.10:g.46639718G>A NM_144612.6:c.409C>T | ||
NC_000018.10:g.70176380_70176383del NM_173630.4:c.1476+295_1476+298del | ||
NC_000019.10:g.13298566C>T NM_001127222.2:c.3067G>A | ||
NC_000019.10:g.48969229T>A NM_002103.5:c.*59A>T | ||
NC_000019.10:g.51353233dup NM_001014763.1:c.551dup | ||
NC_000019.10:g.53904356T>C NM_002739.5:c.1657-279T>C | ||
NC_000019.10:g.54191771C>T NM_001077446.4:c.294C>T | ||
NC_000019.10:g.54193391G>A NM_001077446.4:c.*29G>A | ||
NC_000020.11:g.10658702_10658703insA NM_000214.3:c.459_460insT | ||
NC_000020.11:g.10673008T>C NM_000214.3:c.82-2A>G | ||
NC_000020.11:g.31831073G>A NM_033118.4:c.1356G>A | ||
NC_000020.11:g.35504820_35504831del NM_007186.6:c.6451_6462del | ||
NC_000020.11:g.63494913G>A NM_001958.5:c.513C>T | ||
NC_000021.9:g.32678630del NM_003895.3:c.1627+15del | ||
NC_000021.9:g.34370822G>T NM_172201.1:c.344G>T | ||
NC_000021.9:g.45468322G>A NM_130445.4:c.187G>A | ||
NC_000021.9:g.45511078A>C NM_130445.4:c.3694-33A>C | ||
NC_000023.11:g.22221720T>C NM_000444.6:c.1876T>C | ||
NC_000023.11:g.41343808_41343809dup NR_126093.1:n.1696_1697dup |
Oops, something went wrong.