Genebee results (#140)

* genebe * comparison v3 * cleanup * clean
bihealth · Jul 9, 2024 · 8defd0e · 8defd0e
1 parent 25ab6ac
commit 8defd0e
Show file tree

Hide file tree

Showing 11 changed files with 4,247 additions and 1,960 deletions.
diff --git a/Makefile b/Makefile
@@ -128,7 +128,7 @@ jupyternotebook:
 
 .PHONY: jupyterlab
 jupyterlab:
-	cp src/bench/comparison.ipynb tmp.ipynb && \
+	cp src/bench/results_analysis.ipynb tmp.ipynb && \
 	PYTHON=. pipenv run \
 		jupyter lab \
 			--ip=0.0.0.0 --allow-root --NotebookApp.custom_display_url=http://127.0.0.1:8888 \

diff --git a/Pipfile b/Pipfile
@@ -16,10 +16,6 @@ typing-extensions = "*"
 msgpack = "*"
 seqrepo = "*"
 pytabix = "*"
-jupyterlab = "*"
-pandas = "*"
-pandas-stubs = "*"
-matplotlib = "*"
 
 [dev-packages]
 mypy = "*"
@@ -36,6 +32,13 @@ sphinxcontrib-bibtex = "*"
 sphinxawesome-theme = "*"
 types-requests = "*"
 pytest-cov = "*"
+genebe = "*"
+cyvcf2 = "*"
+mmh3 = "*"
+jupyterlab = "*"
+pandas = "*"
+pandas-stubs = "*"
+matplotlib = "*"
 
 [requires]
 python_version = "3.12"
diff --git a/Pipfile.lock b/Pipfile.lock
diff --git a/src/bench/comparison_v3.py b/src/bench/comparison_v3.py
@@ -0,0 +1,334 @@
+import csv
+import json
+import os
+import time
+
+import pandas as pd
+import requests
+
+from src.auto_acmg import AutoACMG, AutoACMGPrediction
+from src.core.config import settings
+from src.defs.genome_builds import GenomeRelease
+from src.defs.seqvar import SeqVar
+
+#: Path to the root directory
+path_to_root = settings.PATH_TO_ROOT
+
+#: List of all criteria
+criteria = [
+    "PVS1",
+    "PS1",
+    "PS2",
+    "PS3",
+    "PS4",
+    "PM1",
+    "PM2",
+    "PM3",
+    "PM4",
+    "PM5",
+    "PM6",
+    "PP1",
+    "PP2",
+    "PP3",
+    "PP4",
+    "PP5",
+    "BA1",
+    "BS1",
+    "BS2",
+    "BS3",
+    "BS4",
+    "BP1",
+    "BP2",
+    "BP3",
+    "BP4",
+    "BP5",
+    "BP6",
+    "BP7",
+]
+
+# =========== Prepare the data ===========
+# Extract criteria from the csv file
+path = os.path.join(path_to_root, "src", "bench", "comparison_criteria_custom.csv")
+print(f"Data path: {path}")
+variants = []
+with open(path, "rt") as inputf:
+    reader = csv.DictReader(inputf)
+    for record in reader:
+        if record["variant"].startswith("#"):
+            continue
+        variants.append(
+            (record["variant"], record["expected_criteria"].split(";"), record["comment"])
+        )
+print(f"Number of variants: {len(variants)}")
+# ========================================
+
+
+# =========== Helper functions ===========
+#: Extract criteria from the line
+def extract_criteria(line):
+    ret = []
+    for c in criteria:
+        if c in line:
+            ret.append(c)
+    return ret
+
+
+#: Append a row to the DataFrame
+def append_row(df, row):
+    return pd.concat([df, pd.DataFrame([row], columns=row.index)]).reset_index(drop=True)
+
+
+def intervar_response(variant: str):
+    """
+    Implement searching for ACMG classification for SNVs and indels.
+    Proxy requests to the `WinterVar <http://wintervar.wglab.org/>`_ backend.
+
+    :param variant: request
+    :return: ACMG classification
+    :rtype: dict
+    """
+    auto_acmg = AutoACMG(variant, GenomeRelease.GRCh37)
+    seqvar = auto_acmg.resolve_variant()
+    assert isinstance(seqvar, SeqVar)
+    chromosome = seqvar.chrom
+    position = seqvar.pos
+    reference = seqvar.delete
+    alternative = seqvar.insert
+
+    if not chromosome or not position or not reference or not alternative:
+        return
+
+    url = (
+        f"http://wintervar.wglab.org/api_new.php?"
+        f"queryType=position&chr={chromosome}&pos={position}"
+        f"&ref={reference}&alt={alternative}&build=hg19"
+    )
+    backend_resp = requests.get(url)
+    backend_resp.raise_for_status()
+    return backend_resp.json()
+
+
+def genebe_response(variant: str):
+    """
+    Implement searching for ACMG classification for SNVs and indels using GeneBe.
+
+    :param variant: variant string
+    :return: GeneBe response
+    :rtype: dict
+    """
+    auto_acmg = AutoACMG(variant, GenomeRelease.GRCh37)
+    seqvar = auto_acmg.resolve_variant()
+    assert isinstance(seqvar, SeqVar)
+    chromosome = seqvar.chrom
+    position = seqvar.pos
+    reference = seqvar.delete
+    alternative = seqvar.insert
+
+    if not chromosome or not position or not reference or not alternative:
+        return
+
+    url = (
+        f"https://api.genebe.net/cloud/api-public/v1/variant?"
+        f"chr={chromosome}&pos={position}"
+        f"&ref={reference}&alt={alternative}&genome=hg38"
+    )
+    backend_resp = requests.get(url)
+    backend_resp.raise_for_status()
+    return backend_resp.json()
+
+
+def eval_autoacmg(pred, expected):
+    """
+    Evaluate the AutoACMG prediction.
+
+    :param pred: AutoACMG prediction
+    :param expected: expected criteria
+    :return: criteria met, true positives, false negatives, false positives
+    :rtype: tuple
+    """
+    crit_met = []
+    for crit in pred.model_dump().values():
+        if crit["prediction"] == AutoACMGPrediction.Positive:
+            crit_met.append(crit["name"])
+    tp = list(set(expected) & set(crit_met))
+    fn = list(set(expected) - set(crit_met))
+    fp = list(set(crit_met) - set(expected))
+    return crit_met, tp, fn, fp
+
+
+def eval_intervar(pred, expected):
+    """
+    Evaluate the InterVar prediction.
+
+    :param pred: InterVar prediction
+    :param expected: expected criteria
+    :return: criteria met, true positives, false negatives, false positives
+    :rtype: tuple
+    """
+    crit_met = []
+    for crit in pred:
+        if crit in [
+            "Intervar",
+            "Build",
+            "Chromosome",
+            "Position",
+            "Ref_allele",
+            "Alt_allele",
+            "Gene",
+        ]:
+            continue
+        if pred[crit] == 1:
+            crit_met.append(crit)
+    tp = list(set(expected) & set(crit_met))
+    fn = list(set(expected) - set(crit_met))
+    fp = list(set(crit_met) - set(expected))
+    return crit_met, tp, fn, fp
+
+
+def eval_genebe(resp, expected):
+    """
+    Evaluate the VarSome prediction.
+
+    :param resp: VarSome prediction
+    :param expected: expected criteria
+    :return: criteria met, true positives, false negatives, false positives
+    :rtype: tuple
+    """
+    crit_met = []
+    try:
+        acmg_crit = resp["variants"][0]["acmg_criteria"]
+        crit_met = extract_criteria(acmg_crit)
+    except Exception as e:
+        print(e)
+    tp = list(set(expected) & set(crit_met))
+    fn = list(set(expected) - set(crit_met))
+    fp = list(set(crit_met) - set(expected))
+    return crit_met, tp, fn, fp
+
+
+# ========================================
+
+
+# =========== Main comparison run ===========
+#: Create a pandas DataFrame
+stats = pd.DataFrame(
+    columns=[
+        "Variant",
+        "Expected Criteria",
+        "AutoACMG Criteria",
+        "AutoACMG Prediction time",
+        "AutoACMG True Positives",
+        "AutoACMG False Negatives",
+        "AutoACMG False Positives",
+        "Intervar Criteria",
+        "Intervar Prediction time",
+        "Intervar True Positives",
+        "Intervar False Negatives",
+        "Intervar False Positives",
+        "Genebe Criteria",
+        "Genebe Prediction time",
+        "Genebe True Positives",
+        "Genebe False Negatives",
+        "Genebe False Positives",
+        "Comment",
+        "AutoACMG Full Response",
+        "Intervar Full Response",
+        "Genebe Full Response",
+    ]
+)
+
+for i, var in enumerate(variants):
+    # Save the stats every 10 variants
+    if i % 50 == 0:
+        print(f"Processed {i} variants")
+        output_path = os.path.join(path_to_root, "src", "bench", "tmp", f"_stats_{i}.csv")
+        stats.to_csv(output_path, index=False)
+
+    record = {
+        "Variant": var[0],
+        "Expected Criteria": ";".join(var[1]),
+        "AutoACMG Criteria": "",
+        "AutoACMG Prediction time": 0,
+        "AutoACMG True Positives": "",
+        "AutoACMG False Negatives": "",
+        "AutoACMG False Positives": "",
+        "Intervar Criteria": "",
+        "Intervar Prediction time": 0,
+        "Intervar True Positives": "",
+        "Intervar False Negatives": "",
+        "Intervar False Positives": "",
+        "Genebe Criteria": "",
+        "Genebe Prediction time": 0,
+        "Genebe True Positives": "",
+        "Genebe False Negatives": "",
+        "Genebe False Positives": "",
+        "Comment": var[2],
+        "AutoACMG Full Response": "",
+        "Intervar Full Response": "",
+        "Genebe Full Response": "",
+    }
+
+    # AutoACMG
+    try:
+        start_time = time.time()
+        auto_acmg = AutoACMG(var[0], GenomeRelease.GRCh37)
+        pred = auto_acmg.predict()
+        end_time = time.time()
+        assert pred
+        # Evaluate the model
+        crit_met, tp, fn, fp = eval_autoacmg(pred, var[1])
+        record["AutoACMG Criteria"] = ";".join(crit_met)
+        record["AutoACMG Prediction time"] = end_time - start_time
+        record["AutoACMG True Positives"] = ";".join(tp)
+        record["AutoACMG False Negatives"] = ";".join(fn)
+        record["AutoACMG False Positives"] = ";".join(fp)
+        record["AutoACMG Full Response"] = pred.model_dump()
+    except Exception as e:
+        print(f"Exception was raised for {var[0]} in AutoACMG:\n{e}")
+
+    # Intervar
+    try:
+        start_time = time.time()
+        resp = intervar_response(var[0])
+        end_time = time.time()
+        crit_met, tp, fn, fp = eval_intervar(resp, var[1])
+        record["Intervar Criteria"] = ";".join(crit_met)
+        record["Intervar Prediction time"] = end_time - start_time
+        record["Intervar True Positives"] = ";".join(tp)
+        record["Intervar False Negatives"] = ";".join(fn)
+        record["Intervar False Positives"] = ";".join(fp)
+        record["Intervar Full Response"] = resp
+    except Exception as e:
+        print(f"Exception was raised for {var[0]} in InterVar:\n{e}")
+
+    # Genebe
+    try:
+        start_time = time.time()
+        resp = genebe_response(var[0])
+        end_time = time.time()
+        crit_met, tp, fn, fp = eval_genebe(resp, var[1])
+        record["Genebe Criteria"] = ";".join(crit_met)
+        record["Genebe Prediction time"] = end_time - start_time
+        record["Genebe True Positives"] = ";".join(tp)
+        record["Genebe False Negatives"] = ";".join(fn)
+        record["Genebe False Positives"] = ";".join(fp)
+        record["Genebe Full Response"] = json.dumps(resp)
+    except Exception as e:
+        print(f"Exception was raised for {var[0]} in Genebe:\n{e}")
+
+    # Skip if both AutoACMG and Intervar did not return any criteria
+    if (
+        record["AutoACMG Criteria"] == ""
+        and record["Intervar Criteria"] == ""
+        and record["Genebe Criteria"] == ""
+    ):
+        print(f"Skipping {var[0]}")
+        continue
+
+    # Add the record to the stats
+    print(f"Record for {var[0]}: {record}")
+    stats = append_row(stats, pd.Series(record))
+
+# Save the final stats
+output_path = os.path.join(path_to_root, "src", "bench", "_stats.csv")
+stats.to_csv(output_path, index=False)