Commit

Merge pull request #1088 from haddocking/1081-wrong-numbers-in-capri_clt
Remove `self.params["debug"]` logic in `caprieval`
rvhonorato authored Oct 10, 2024
2 parents 7ee7cd3 + 9507f8d commit a9f9c13
Showing 24 changed files with 28,193 additions and 321 deletions.
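
This merge addresses the problem referenced in the branch name (#1081, wrong numbers in `capri_clt`): the per-cluster means written to `capri_clt.tsv` did not agree with the per-model values in `capri_ss.tsv`. The new integration test asserts that, for every reported metric, the cluster mean in `capri_clt.tsv` equals the mean of that metric over the top-ranked models in `capri_ss.tsv`. Below is a minimal standalone sketch of that invariant; it mirrors the committed `_check_means_match` helper shown further down but is illustrative only (the rank column index and file layout are taken from the diff, and missing values are not handled here).

```python
# Illustrative sketch (not the committed helper): the cluster mean reported in
# capri_clt.tsv should match the mean of the top-N ranked models in capri_ss.tsv.
from pathlib import Path


def top_n_mean(capri_ss: Path, metric: str, top_n: int) -> float:
    """Mean of `metric` over the models ranked 1..top_n in capri_ss.tsv."""
    rows = [
        line.split("\t")
        for line in capri_ss.read_text().splitlines()
        if line and not line.startswith("#")
    ]
    header, data = rows[0], rows[1:]
    idx = header.index(metric)
    # Column 2 holds the caprieval ranking, as in the committed test helper.
    # Unlike the committed helper, "-" entries are not mapped to NaN here.
    top = [float(r[idx]) for r in data if int(r[2]) <= top_n]
    return sum(top) / top_n


def first_cluster_mean(capri_clt: Path, metric: str) -> float:
    """Mean reported for `metric` in the first data row of capri_clt.tsv."""
    rows = [
        line.split("\t")
        for line in capri_clt.read_text().splitlines()
        if line and "#" not in line
    ]
    header, first = rows[0], rows[1]
    return float(first[header.index(metric)])


# Usage (paths and the top_n value are hypothetical):
# run_dir = Path("run1/4_caprieval")
# assert first_cluster_mean(run_dir / "capri_clt.tsv", "irmsd") == pytest.approx(
#     top_n_mean(run_dir / "capri_ss.tsv", "irmsd", top_n=4), 0.01
# )
```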
1,403 changes: 1,403 additions & 0 deletions integration_tests/golden_data/models_for_clustering/rigidbody_1.pdb
1,403 changes: 1,403 additions & 0 deletions integration_tests/golden_data/models_for_clustering/rigidbody_10.pdb
1,403 changes: 1,403 additions & 0 deletions integration_tests/golden_data/models_for_clustering/rigidbody_11.pdb
1,403 changes: 1,403 additions & 0 deletions integration_tests/golden_data/models_for_clustering/rigidbody_12.pdb
1,403 changes: 1,403 additions & 0 deletions integration_tests/golden_data/models_for_clustering/rigidbody_13.pdb
1,403 changes: 1,403 additions & 0 deletions integration_tests/golden_data/models_for_clustering/rigidbody_14.pdb
1,403 changes: 1,403 additions & 0 deletions integration_tests/golden_data/models_for_clustering/rigidbody_15.pdb
1,403 changes: 1,403 additions & 0 deletions integration_tests/golden_data/models_for_clustering/rigidbody_16.pdb
1,403 changes: 1,403 additions & 0 deletions integration_tests/golden_data/models_for_clustering/rigidbody_17.pdb
1,403 changes: 1,403 additions & 0 deletions integration_tests/golden_data/models_for_clustering/rigidbody_18.pdb
1,403 changes: 1,403 additions & 0 deletions integration_tests/golden_data/models_for_clustering/rigidbody_19.pdb
1,403 changes: 1,403 additions & 0 deletions integration_tests/golden_data/models_for_clustering/rigidbody_2.pdb
1,403 changes: 1,403 additions & 0 deletions integration_tests/golden_data/models_for_clustering/rigidbody_20.pdb
1,403 changes: 1,403 additions & 0 deletions integration_tests/golden_data/models_for_clustering/rigidbody_3.pdb
1,403 changes: 1,403 additions & 0 deletions integration_tests/golden_data/models_for_clustering/rigidbody_4.pdb
1,403 changes: 1,403 additions & 0 deletions integration_tests/golden_data/models_for_clustering/rigidbody_5.pdb
1,403 changes: 1,403 additions & 0 deletions integration_tests/golden_data/models_for_clustering/rigidbody_6.pdb
1,403 changes: 1,403 additions & 0 deletions integration_tests/golden_data/models_for_clustering/rigidbody_7.pdb
1,403 changes: 1,403 additions & 0 deletions integration_tests/golden_data/models_for_clustering/rigidbody_8.pdb
1,403 changes: 1,403 additions & 0 deletions integration_tests/golden_data/models_for_clustering/rigidbody_9.pdb

Large diffs are not rendered by default.

126 changes: 105 additions & 21 deletions integration_tests/test_caprieval.py
@@ -1,6 +1,7 @@
import math
import shutil
import tempfile
import random
from pathlib import Path
from typing import Union

@@ -9,9 +10,11 @@
from haddock.libs.libontology import PDBFile
from haddock.modules.analysis.caprieval import (
DEFAULT_CONFIG as DEFAULT_CAPRIEVAL_CONFIG,
capri,
)
from haddock.modules.analysis.caprieval import HaddockModule as CaprievalModule
from tests import golden_data
from tests import golden_data as UNITTESTS_GOLDEN_DATA
from . import GOLDEN_DATA


@pytest.fixture
@@ -125,11 +128,11 @@ def __init__(self, path):

def retrieve_models(self, individualize: bool = True):
shutil.copy(
Path(golden_data, "protprot_complex_1.pdb"),
Path(UNITTESTS_GOLDEN_DATA, "protprot_complex_1.pdb"),
Path(".", "protprot_complex_1.pdb"),
)
shutil.copy(
Path(golden_data, "protprot_complex_2.pdb"),
Path(UNITTESTS_GOLDEN_DATA, "protprot_complex_2.pdb"),
Path(".", "protprot_complex_2.pdb"),
)
model_list = [
@@ -153,6 +156,24 @@ def output(self):
return None


class MockPreviousIO_with_models_to_be_clustered:
def __init__(self, path):
self.path = path

def retrieve_models(self, individualize: bool = True):
model_list = []
for m in [f"rigidbody_{i}.pdb" for i in range(1, 11)]:
src = Path(GOLDEN_DATA, "models_for_clustering", m)
dst = Path(self.path, m)
shutil.copy(src, dst)
p = PDBFile(file_name=m, path=".", score=random.uniform(-100, 100))
model_list.append(p)
return model_list

def output(self):
return None


def _cast_float_str_int(v: Union[int, str, float]) -> Union[int, str, float]:
"""Helper function to cast a value string to a float, int or str."""
try:
@@ -209,11 +230,7 @@ def _check_capri_ss_tsv(
):
"""Helper function to check the content of the capri_ss.tsv file."""
with open(capri_file) as f:
lines = [
_.strip().split("\t")
for _ in f.readlines()
if not _.startswith("#")
]
lines = [_.strip().split("\t") for _ in f.readlines() if not _.startswith("#")]

# Check the header
expected_header_cols = list(expected_data[0].keys())
@@ -250,11 +267,7 @@ def _check_capri_clt_tsv(
"""Helper function to check the content of the capri_clt.tsv file."""
with open(capri_file) as f:
# There are several `#` lines in the file, these are comments and can be ignored
lines = [
_.strip().split("\t")
for _ in f.readlines()
if not _.startswith("#")
]
lines = [_.strip().split("\t") for _ in f.readlines() if not _.startswith("#")]

# Check header
expected_header_cols = list(expected_data[0].keys())
@@ -284,6 +297,62 @@ def _check_capri_clt_tsv(
_compare_polymorphic_data(expected_data, oberseved_data)


def _check_means_match(
capri_ss_f: Path,
capri_clt_f: Path,
target_metric: str,
top_n: int,
):
"""Helper function to check if the means of `capri_ss` and `capri_clt` match"""

# Read the `capri_ss.tsv` file
assert capri_ss_f.exists()

assert capri_clt_f.exists()

# find the column that contains the target metric
with open(capri_ss_f, "r") as fh:
capri_ss_l = fh.readlines()

capri_ss_header = capri_ss_l[0].strip().split("\t")
capri_ss_data = capri_ss_l[1:]
try:
metric_idx = capri_ss_header.index(target_metric)
except ValueError:
# Metric not found
return

assert metric_idx is not None
values: list[float] = []
for entry in capri_ss_data:
data = entry.strip().split(sep="\t")
v = data[metric_idx]
ranking = int(data[2])
if ranking <= top_n:
values.append(float(v) if v != "-" else float("nan"))

# get the topX and calculate mean
mean_ss_v = sum(values[:top_n]) / float(top_n)

with open(capri_clt_f, "r") as fh:
capri_clt_l = fh.readlines()

# remove lines with comments
capri_clt_l = [line for line in capri_clt_l if "#" not in line]

# find the metric index
capri_clt_header = capri_clt_l[0].strip().split("\t")
capri_clt_data = capri_clt_l[1].strip().split("\t")
for metric_idx, metric in enumerate(capri_clt_header):
if metric == target_metric:
break

assert metric_idx is not None
mean_clt_v = float(capri_clt_data[metric_idx])

assert mean_clt_v == pytest.approx(mean_ss_v, 0.01), f"{target_metric} do not match"


def evaluate_caprieval_execution(
module: CaprievalModule, model_list, ss_data, clt_data
):
@@ -330,14 +399,29 @@ def test_caprieval_default(
)


def test_caprieval_nodebug(
caprieval_module, model_list, expected_ss_data, expected_clt_data
):
caprieval_module.previous_io = MockPreviousIO(path=caprieval_module.path)
caprieval_module.params["debug"] = False
def test_ss_clt_relation(caprieval_module):
"""Check if the values in the ss.tsv match the ones in clt.tsv"""

caprieval_module.previous_io = MockPreviousIO_with_models_to_be_clustered(
path=caprieval_module.path
)

caprieval_module.run()

evaluate_caprieval_execution(
caprieval_module, model_list, expected_ss_data, expected_clt_data
)
metrics_to_be_evaluated = [
"score",
"irmsd",
"fnat",
"lrmsd",
"dockq",
"ilrmsd",
"rmsd",
]

for metric in metrics_to_be_evaluated:
_check_means_match(
capri_ss_f=Path(caprieval_module.path, "capri_ss.tsv"),
capri_clt_f=Path(caprieval_module.path, "capri_clt.tsv"),
target_metric=metric,
top_n=caprieval_module.params["clt_threshold"],
)
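
The new consistency test can also be run on its own. A sketch of a direct invocation via pytest's Python API (assuming pytest and the haddock3 package are installed and the command is issued from the repository root):

```python
# Run only the new test; equivalent to
#   pytest -v integration_tests/test_caprieval.py::test_ss_clt_relation
import sys

import pytest

if __name__ == "__main__":
    sys.exit(
        pytest.main(["-v", "integration_tests/test_caprieval.py::test_ss_clt_relation"])
    )
```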
61 changes: 15 additions & 46 deletions src/haddock/modules/analysis/caprieval/__init__.py
@@ -28,23 +28,16 @@
from pathlib import Path

from haddock.core.defaults import MODULE_DEFAULT_YAML
from haddock.core.typing import Any, FilePath, Union
from haddock.core.typing import FilePath, Union
from haddock.libs.libontology import PDBFile
from haddock.libs.libparallel import Scheduler
from haddock.modules import (
BaseHaddockModule,
get_engine,
get_module_steps_folders,
)
from haddock.modules.analysis import get_analysis_exec_mode
from haddock.modules import BaseHaddockModule
from haddock.modules.analysis.caprieval.capri import (
CAPRI,
capri_cluster_analysis,
dump_weights,
extract_data_from_capri_class,
merge_data,
rearrange_ss_capri_output,
)
)


RECIPE_PATH = Path(__file__).resolve().parent
@@ -60,9 +53,7 @@ def __init__(
self,
order: int,
path: Path,
*ignore: Any,
init_params: FilePath = DEFAULT_CONFIG,
**everything: Any,
) -> None:
super().__init__(order, path, init_params)

@@ -109,11 +100,6 @@ def _run(self) -> None:
)
reference = best_model_fname

exec_mode = get_analysis_exec_mode(self.params["mode"])
Engine = get_engine(exec_mode, self.params)

_less_io = self.params["mode"] == "local" and not self.params["debug"]

# Each model is a job; this is not the most efficient way
# but by assigning each model to an individual job
# we can handle scenarios in which the models are heterogeneous
@@ -128,45 +114,28 @@
)
jobs.append(
CAPRI(
identificator=str(i),
identificator=i,
model=model_to_be_evaluated,
path=Path("."),
reference=reference,
params=self.params,
debug=not _less_io,
)
)

engine = Engine(jobs)
engine = Scheduler(
tasks=jobs, ncores=self.params["ncores"], max_cpus=self.params["max_cpus"]
)
engine.run()

if _less_io and isinstance(engine, Scheduler):
jobs = engine.results
extract_data_from_capri_class(
capri_objects=jobs,
output_fname=Path(".", "capri_ss.tsv"),
sort_key=self.params["sortby"],
sort_ascending=self.params["sort_ascending"],
)
jobs = engine.results
jobs = sorted(jobs, key=lambda capri: capri.identificator)

else:
self.log(
msg=(
"DEPRECATION NOTICE: This execution mode (debug=True) "
"will no longer be supported in the next version."
),
level="warning",
)
jobs = merge_data(jobs)

# Each job created one .tsv, unify them:
rearrange_ss_capri_output(
output_name="capri_ss.tsv",
output_count=len(jobs),
sort_key=self.params["sortby"],
sort_ascending=self.params["sort_ascending"],
path=Path("."),
)
extract_data_from_capri_class(
capri_objects=jobs,
output_fname=Path(".", "capri_ss.tsv"),
sort_key=self.params["sortby"],
sort_ascending=self.params["sort_ascending"],
)

capri_cluster_analysis(
capri_list=jobs,
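
Part of the same fix is how job identifiers are handled: each `CAPRI` job now receives an integer `identificator` (previously `str(i)`), and the scheduler results are sorted on that field before `extract_data_from_capri_class` writes `capri_ss.tsv`. Sorting string identifiers orders them lexicographically, which interleaves models 10-20 with 1-9 and is one plausible way the per-cluster numbers could end up mismatched. A standard-library illustration:

```python
# Lexicographic vs numeric ordering of job identifiers.
ids = list(range(1, 21))

as_strings = sorted(str(i) for i in ids)
as_integers = sorted(ids)

print(as_strings[:6])   # ['1', '10', '11', '12', '13', '14'] -- 10 sorts before 2
print(as_integers[:6])  # [1, 2, 3, 4, 5, 6]
```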