Skip to content

Commit

Permalink
Genebee results (#140)
Browse files Browse the repository at this point in the history
* genebe

* comparison v3

* cleanup

* clean
  • Loading branch information
gromdimon authored Jul 9, 2024
1 parent 25ab6ac commit 8defd0e
Show file tree
Hide file tree
Showing 11 changed files with 4,247 additions and 1,960 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ jupyternotebook:

.PHONY: jupyterlab
jupyterlab:
cp src/bench/comparison.ipynb tmp.ipynb && \
cp src/bench/results_analysis.ipynb tmp.ipynb && \
PYTHON=. pipenv run \
jupyter lab \
--ip=0.0.0.0 --allow-root --NotebookApp.custom_display_url=http://127.0.0.1:8888 \
Expand Down
11 changes: 7 additions & 4 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,6 @@ typing-extensions = "*"
msgpack = "*"
seqrepo = "*"
pytabix = "*"
jupyterlab = "*"
pandas = "*"
pandas-stubs = "*"
matplotlib = "*"

[dev-packages]
mypy = "*"
Expand All @@ -36,6 +32,13 @@ sphinxcontrib-bibtex = "*"
sphinxawesome-theme = "*"
types-requests = "*"
pytest-cov = "*"
genebe = "*"
cyvcf2 = "*"
mmh3 = "*"
jupyterlab = "*"
pandas = "*"
pandas-stubs = "*"
matplotlib = "*"

[requires]
python_version = "3.12"
3,282 changes: 1,722 additions & 1,560 deletions Pipfile.lock

Large diffs are not rendered by default.

334 changes: 334 additions & 0 deletions src/bench/comparison_v3.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,334 @@
import csv
import json
import os
import time

import pandas as pd
import requests

from src.auto_acmg import AutoACMG, AutoACMGPrediction
from src.core.config import settings
from src.defs.genome_builds import GenomeRelease
from src.defs.seqvar import SeqVar

#: Path to the root directory
path_to_root = settings.PATH_TO_ROOT

#: List of all criteria
criteria = [
"PVS1",
"PS1",
"PS2",
"PS3",
"PS4",
"PM1",
"PM2",
"PM3",
"PM4",
"PM5",
"PM6",
"PP1",
"PP2",
"PP3",
"PP4",
"PP5",
"BA1",
"BS1",
"BS2",
"BS3",
"BS4",
"BP1",
"BP2",
"BP3",
"BP4",
"BP5",
"BP6",
"BP7",
]

# =========== Prepare the data ===========
# Extract criteria from the csv file
path = os.path.join(path_to_root, "src", "bench", "comparison_criteria_custom.csv")
print(f"Data path: {path}")
variants = []
with open(path, "rt") as inputf:
reader = csv.DictReader(inputf)
for record in reader:
if record["variant"].startswith("#"):
continue
variants.append(
(record["variant"], record["expected_criteria"].split(";"), record["comment"])
)
print(f"Number of variants: {len(variants)}")
# ========================================


# =========== Helper functions ===========
#: Extract criteria from the line
def extract_criteria(line):
ret = []
for c in criteria:
if c in line:
ret.append(c)
return ret


#: Append a row to the DataFrame
def append_row(df, row):
return pd.concat([df, pd.DataFrame([row], columns=row.index)]).reset_index(drop=True)


def intervar_response(variant: str):
"""
Implement searching for ACMG classification for SNVs and indels.
Proxy requests to the `WinterVar <http://wintervar.wglab.org/>`_ backend.
:param variant: request
:return: ACMG classification
:rtype: dict
"""
auto_acmg = AutoACMG(variant, GenomeRelease.GRCh37)
seqvar = auto_acmg.resolve_variant()
assert isinstance(seqvar, SeqVar)
chromosome = seqvar.chrom
position = seqvar.pos
reference = seqvar.delete
alternative = seqvar.insert

if not chromosome or not position or not reference or not alternative:
return

url = (
f"http://wintervar.wglab.org/api_new.php?"
f"queryType=position&chr={chromosome}&pos={position}"
f"&ref={reference}&alt={alternative}&build=hg19"
)
backend_resp = requests.get(url)
backend_resp.raise_for_status()
return backend_resp.json()


def genebe_response(variant: str):
"""
Implement searching for ACMG classification for SNVs and indels using GeneBe.
:param variant: variant string
:return: GeneBe response
:rtype: dict
"""
auto_acmg = AutoACMG(variant, GenomeRelease.GRCh37)
seqvar = auto_acmg.resolve_variant()
assert isinstance(seqvar, SeqVar)
chromosome = seqvar.chrom
position = seqvar.pos
reference = seqvar.delete
alternative = seqvar.insert

if not chromosome or not position or not reference or not alternative:
return

url = (
f"https://api.genebe.net/cloud/api-public/v1/variant?"
f"chr={chromosome}&pos={position}"
f"&ref={reference}&alt={alternative}&genome=hg38"
)
backend_resp = requests.get(url)
backend_resp.raise_for_status()
return backend_resp.json()


def eval_autoacmg(pred, expected):
"""
Evaluate the AutoACMG prediction.
:param pred: AutoACMG prediction
:param expected: expected criteria
:return: criteria met, true positives, false negatives, false positives
:rtype: tuple
"""
crit_met = []
for crit in pred.model_dump().values():
if crit["prediction"] == AutoACMGPrediction.Positive:
crit_met.append(crit["name"])
tp = list(set(expected) & set(crit_met))
fn = list(set(expected) - set(crit_met))
fp = list(set(crit_met) - set(expected))
return crit_met, tp, fn, fp


def eval_intervar(pred, expected):
"""
Evaluate the InterVar prediction.
:param pred: InterVar prediction
:param expected: expected criteria
:return: criteria met, true positives, false negatives, false positives
:rtype: tuple
"""
crit_met = []
for crit in pred:
if crit in [
"Intervar",
"Build",
"Chromosome",
"Position",
"Ref_allele",
"Alt_allele",
"Gene",
]:
continue
if pred[crit] == 1:
crit_met.append(crit)
tp = list(set(expected) & set(crit_met))
fn = list(set(expected) - set(crit_met))
fp = list(set(crit_met) - set(expected))
return crit_met, tp, fn, fp


def eval_genebe(resp, expected):
"""
Evaluate the VarSome prediction.
:param resp: VarSome prediction
:param expected: expected criteria
:return: criteria met, true positives, false negatives, false positives
:rtype: tuple
"""
crit_met = []
try:
acmg_crit = resp["variants"][0]["acmg_criteria"]
crit_met = extract_criteria(acmg_crit)
except Exception as e:
print(e)
tp = list(set(expected) & set(crit_met))
fn = list(set(expected) - set(crit_met))
fp = list(set(crit_met) - set(expected))
return crit_met, tp, fn, fp


# ========================================


# =========== Main comparison run ===========
#: Create a pandas DataFrame
stats = pd.DataFrame(
columns=[
"Variant",
"Expected Criteria",
"AutoACMG Criteria",
"AutoACMG Prediction time",
"AutoACMG True Positives",
"AutoACMG False Negatives",
"AutoACMG False Positives",
"Intervar Criteria",
"Intervar Prediction time",
"Intervar True Positives",
"Intervar False Negatives",
"Intervar False Positives",
"Genebe Criteria",
"Genebe Prediction time",
"Genebe True Positives",
"Genebe False Negatives",
"Genebe False Positives",
"Comment",
"AutoACMG Full Response",
"Intervar Full Response",
"Genebe Full Response",
]
)

for i, var in enumerate(variants):
# Save the stats every 10 variants
if i % 50 == 0:
print(f"Processed {i} variants")
output_path = os.path.join(path_to_root, "src", "bench", "tmp", f"_stats_{i}.csv")
stats.to_csv(output_path, index=False)

record = {
"Variant": var[0],
"Expected Criteria": ";".join(var[1]),
"AutoACMG Criteria": "",
"AutoACMG Prediction time": 0,
"AutoACMG True Positives": "",
"AutoACMG False Negatives": "",
"AutoACMG False Positives": "",
"Intervar Criteria": "",
"Intervar Prediction time": 0,
"Intervar True Positives": "",
"Intervar False Negatives": "",
"Intervar False Positives": "",
"Genebe Criteria": "",
"Genebe Prediction time": 0,
"Genebe True Positives": "",
"Genebe False Negatives": "",
"Genebe False Positives": "",
"Comment": var[2],
"AutoACMG Full Response": "",
"Intervar Full Response": "",
"Genebe Full Response": "",
}

# AutoACMG
try:
start_time = time.time()
auto_acmg = AutoACMG(var[0], GenomeRelease.GRCh37)
pred = auto_acmg.predict()
end_time = time.time()
assert pred
# Evaluate the model
crit_met, tp, fn, fp = eval_autoacmg(pred, var[1])
record["AutoACMG Criteria"] = ";".join(crit_met)
record["AutoACMG Prediction time"] = end_time - start_time
record["AutoACMG True Positives"] = ";".join(tp)
record["AutoACMG False Negatives"] = ";".join(fn)
record["AutoACMG False Positives"] = ";".join(fp)
record["AutoACMG Full Response"] = pred.model_dump()
except Exception as e:
print(f"Exception was raised for {var[0]} in AutoACMG:\n{e}")

# Intervar
try:
start_time = time.time()
resp = intervar_response(var[0])
end_time = time.time()
crit_met, tp, fn, fp = eval_intervar(resp, var[1])
record["Intervar Criteria"] = ";".join(crit_met)
record["Intervar Prediction time"] = end_time - start_time
record["Intervar True Positives"] = ";".join(tp)
record["Intervar False Negatives"] = ";".join(fn)
record["Intervar False Positives"] = ";".join(fp)
record["Intervar Full Response"] = resp
except Exception as e:
print(f"Exception was raised for {var[0]} in InterVar:\n{e}")

# Genebe
try:
start_time = time.time()
resp = genebe_response(var[0])
end_time = time.time()
crit_met, tp, fn, fp = eval_genebe(resp, var[1])
record["Genebe Criteria"] = ";".join(crit_met)
record["Genebe Prediction time"] = end_time - start_time
record["Genebe True Positives"] = ";".join(tp)
record["Genebe False Negatives"] = ";".join(fn)
record["Genebe False Positives"] = ";".join(fp)
record["Genebe Full Response"] = json.dumps(resp)
except Exception as e:
print(f"Exception was raised for {var[0]} in Genebe:\n{e}")

# Skip if both AutoACMG and Intervar did not return any criteria
if (
record["AutoACMG Criteria"] == ""
and record["Intervar Criteria"] == ""
and record["Genebe Criteria"] == ""
):
print(f"Skipping {var[0]}")
continue

# Add the record to the stats
print(f"Record for {var[0]}: {record}")
stats = append_row(stats, pd.Series(record))

# Save the final stats
output_path = os.path.join(path_to_root, "src", "bench", "_stats.csv")
stats.to_csv(output_path, index=False)
Loading

0 comments on commit 8defd0e

Please sign in to comment.