diff --git a/modules/oncoliner_harmonization/src/harmonizator/harmonizator.py b/modules/oncoliner_harmonization/src/harmonizator/harmonizator.py
index 733fcb8..66287a0 100644
--- a/modules/oncoliner_harmonization/src/harmonizator/harmonizator.py
+++ b/modules/oncoliner_harmonization/src/harmonizator/harmonizator.py
@@ -8,6 +8,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
 
 import pandas as pd
+import numpy as np
 
 # Add vcf-ops to the path
 sys.path.insert(0, os.path.join(os.path.abspath(os.path.dirname(__file__)), '..', '..', '..', '..', 'shared', 'vcf_ops', 'src'))
 
@@ -15,6 +16,7 @@ from vcf_ops.metrics import filter_metrics_recommendations  # noqa
 
 from .utils import cleanup_text  # noqa
+from .heterogeneity_metrics import compute_h_score, compute_gdr  # noqa
 
 
 def _read_pipeline_improvements(pipeline_folder: str) -> Dict[str, pd.DataFrame]:
@@ -22,12 +24,8 @@ def _read_pipeline_improvements(pipeline_folder: str) -> Dict[str, pd.DataFrame]
     csv_files = glob.glob(os.path.join(pipeline_folder, 'improvement_list', '*.csv'))
     if len(csv_files) == 0:
         raise Exception(f'No improvement files found in {pipeline_folder}')
-    # Get the first row of the first file to get the column names
-    first_row = pd.read_csv(csv_files[0], nrows=1)
-    # Get the columns and remove all those that end with _genes
-    selected_columns = [column for column in first_row.columns if not column.endswith('_genes')]
     # Read all the improvements
-    pipeline_improvements = [pd.read_csv(csv_file, usecols=selected_columns) for csv_file in csv_files]
+    pipeline_improvements = [pd.read_csv(csv_file) for csv_file in csv_files]
     # Concatenate all the improvements into a single dataframe
     pipeline_improvements = pd.concat(pipeline_improvements, ignore_index=True)
     # Create a dict with the variant_type and variant_size
@@ -43,7 +41,6 @@ def _get_pipelines_combinations(pipelines_folders: List[str], threads: int) -> D
     # Each pipeline folder contains a list of .csv files with its possible improvements
     # Read all of them and concatenate them into a single dataframe
     # Create a dictionary with the pipeline name as key and the improvements dataframe as value
-    pipelines_combinations = OrderedDict()
     pipeline_names = sorted([os.path.basename(pipeline_folder) for pipeline_folder in pipelines_folders])
     # Make sure the pipeline names are unique
     if len(pipeline_names) != len(set(pipeline_names)):
@@ -52,6 +49,8 @@ def _get_pipelines_combinations(pipelines_folders: List[str], threads: int) -> D
         futures = []
         for pipeline_folder in pipelines_folders:
             futures.append(pool.submit(_read_pipeline_improvements, pipeline_folder))
+        # Initialize the dictionary with the pipeline names sorted
+        pipelines_combinations = OrderedDict(zip(pipeline_names, [None] * len(pipeline_names)))
         for future in as_completed(futures):
             pipeline_improvements, pipeline_folder = future.result()
             pipeline_name = os.path.basename(pipeline_folder)
@@ -59,15 +58,36 @@
     return pipelines_combinations
 
 
-def _filter_combinations(pipelines_names: List[str], df: pd.DataFrame, num_callers_column: str, ranking_columns: List[str], loss_margin: float, max_recommendations: int) -> Set[str]:
-
+def _filter_combinations(pipelines_names: List[str], df: pd.DataFrame, num_callers_column: str, ranking_columns: List[str], priority_columns: List[str], loss_margin: float, max_recommendations: int) -> Set[str]:
     # Create a new column with the concatenation of the operations
     df = df.assign(operation=lambda x: x[pipelines_names].apply(lambda y: ';'.join(y), axis=1))
-    filtered_df = filter_metrics_recommendations(df, loss_margin, max_recommendations, num_callers_column, ranking_columns)
+    filtered_df = filter_metrics_recommendations(df, loss_margin, max_recommendations, num_callers_column, ranking_columns, priority_columns)
     result = set(filtered_df['operation'].unique())
     return result
 
 
+def _compute_metrics(operations_combination, improvements_combinations_metrics, variant_type_size, metrics_name):
+    computable_metrics = []
+    for op in operations_combination:
+        pipeline_name, operation = op.split(';')
+        metrics = improvements_combinations_metrics[pipeline_name][variant_type_size]
+        # Get the metrics for the current operation
+        metrics = metrics[metrics['operation'] == operation].iloc[0]
+        # Calculate the metrics for the current operation
+        computable_metrics.append(metrics)
+    # Calculate the metrics for the combination
+    if metrics_name == 'h_score':
+        return compute_h_score([(metrics['recall'], metrics['precision']) for metrics in computable_metrics])
+    elif metrics_name == 'gdr':
+        return compute_gdr([metrics['protein_affected_genes'] for metrics in computable_metrics])
+    elif metrics_name.endswith('_avg'):
+        return np.mean([metrics[metrics_name.replace('_avg', '')] for metrics in computable_metrics])
+    elif metrics_name.endswith('_sum'):
+        return np.sum([metrics[metrics_name.replace('_sum', '')] for metrics in computable_metrics])
+    else:
+        raise Exception(f'Unknown metric {metrics_name}')
+
+
 def _group_combinations(improvements_combinations_metrics: Dict[str, Dict[str, pd.DataFrame]], loss_margin: float, max_recommendations: int) -> Dict[str, pd.DataFrame]:
     first_pipeline_improvements = list(improvements_combinations_metrics.values())[0]
     pipeline_names = list(improvements_combinations_metrics.keys())
@@ -96,29 +116,20 @@ def _group_combinations(improvements_combinations_metrics: Dict[str, Dict[str, p
                 combinations_row[pipeline_name] = operation
             combinations_row['variant_type'] = variant_type_size.split(';')[0]
             combinations_row['variant_size'] = variant_type_size.split(';')[1]
-            combinations_row['recall_avg'] = 0
-            combinations_row['precision_avg'] = 0
-            combinations_row['f1_score_avg'] = 0
-            combinations_row['protein_affected_genes_count_avg'] = 0
-            combinations_row['protein_affected_driver_genes_count_avg'] = 0
-            combinations_row['added_callers_sum'] = 0
-            # Calculate the metrics for each pipeline
-            for op in operations_combination:
-                pipeline_name, operation = op.split(';')
-                metrics = improvements_combinations_metrics[pipeline_name][variant_type_size]
-                # Get the metrics for the current operation
-                metrics = metrics[metrics['operation'] == operation].iloc[0]
-                # Calculate the metrics for the current operation
-                for combinations_row_key in combinations_row.keys():
-                    if combinations_row_key.endswith('_avg'):
-                        combinations_row[combinations_row_key] += \
-                            metrics[combinations_row_key.replace('_avg', '')] / len(operations_combination)
-                    elif combinations_row_key.endswith('_sum'):
-                        combinations_row[combinations_row_key] += metrics[combinations_row_key.replace('_sum', '')]
+            computable_metrics = ['h_score', 'recall_avg', 'precision_avg', 'f1_score_avg', 'gdr',
+                                  'protein_affected_genes_count_avg', 'protein_affected_driver_genes_count_avg', 'protein_affected_actionable_genes_count_avg',
+                                  'added_callers_sum']
+            for metrics_name in computable_metrics:
+                combinations_row[metrics_name] = _compute_metrics(operations_combination, improvements_combinations_metrics, variant_type_size, metrics_name)
             combinations_rows.append(combinations_row)
         # Create a dataframe with the metrics
         combinations_df = pd.DataFrame(combinations_rows)
-        selected_combinations.update(_filter_combinations(pipeline_names, combinations_df, 'added_callers_sum', ['f1_score_avg', 'recall_avg', 'precision_avg'], loss_margin, max_recommendations))
+        # Invert values for h_score and gdr
+        combinations_df['h_score_inv'] = - combinations_df['h_score']
+        combinations_df['gdr_inv'] = - combinations_df['gdr']
+        selected_combinations.update(_filter_combinations(pipeline_names, combinations_df, 'added_callers_sum', ['h_score_inv', 'gdr_inv'], ['f1_score_avg', 'recall_avg', 'precision_avg'], loss_margin, max_recommendations))
+        # Drop the inverted columns
+        combinations_df.drop(columns=['h_score_inv', 'gdr_inv'], inplace=True)
         result[variant_type_size] = combinations_df
     # Filter the combinations
     for variant_type_size, combinations_df in result.items():
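
Note (reviewer annotation, not part of the patch): the harmonization combinations are now ranked by the two new heterogeneity metrics instead of by F1/recall/precision alone. filter_metrics_recommendations keeps rows that are within loss_margin of each ranking column's maximum, i.e. it assumes "higher is better", while h_score and gdr are lower-is-better measures of disagreement between pipelines. That is why the patch negates them into h_score_inv and gdr_inv before filtering and drops the helper columns afterwards; the averaged F1/recall/precision are still passed, but only as tie-breaking priority columns. A minimal pandas sketch of the idea, with made-up numbers and hypothetical operation names:

    # Illustrative sketch only; 'union_A'..'union_C' are hypothetical operations.
    import pandas as pd

    combinations_df = pd.DataFrame({
        'operation': ['union_A', 'union_B', 'union_C'],
        'h_score': [0.02, 0.10, 0.05],  # dispersion of the (recall, precision) points
        'gdr': [0.10, 0.40, 0.20],      # gene discordance ratio
    })
    # Lower heterogeneity is better, so negate to reuse a "higher is better" filter
    combinations_df['h_score_inv'] = -combinations_df['h_score']
    combinations_df['gdr_inv'] = -combinations_df['gdr']
    # A max-based selection on the inverted columns now favours the most consistent combination
    best = combinations_df.sort_values(['h_score_inv', 'gdr_inv'], ascending=False).iloc[0]
    print(best['operation'])  # union_A
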
diff --git a/modules/oncoliner_harmonization/src/harmonizator/heterogeneity_metrics.py b/modules/oncoliner_harmonization/src/harmonizator/heterogeneity_metrics.py
new file mode 100644
index 0000000..2abfc0c
--- /dev/null
+++ b/modules/oncoliner_harmonization/src/harmonizator/heterogeneity_metrics.py
@@ -0,0 +1,39 @@
+from typing import List, Tuple
+import sys
+import os
+
+import pandas as pd
+import numpy as np
+
+# Add vcf-ops to the path
+sys.path.insert(0, os.path.join(os.path.abspath(os.path.dirname(__file__)), '..', '..', '..', '..', 'shared', 'vcf_ops', 'src'))
+from vcf_ops.genes import GENE_SPLIT_SYMBOL  # noqa
+
+def compute_h_score(metrics: List[Tuple[float, float]]) -> float:
+    # Compute the centroid
+    centroid = np.mean(metrics, axis=0)
+    # Compute the distance from the centroid to each point
+    distances = []
+    for point in metrics:
+        distances.append(np.linalg.norm(point - centroid))
+    # Compute the h-score
+    h_score = np.mean(distances)
+    return h_score
+
+
+def compute_gdr(genes_list: List[str]) -> float:
+    intersect_genes = None
+    union_genes = None
+    for genes in genes_list:
+        if type(genes) == float:
+            continue
+        curr_genes = set(genes.split(GENE_SPLIT_SYMBOL))
+        if intersect_genes is None:
+            intersect_genes = curr_genes
+            union_genes = curr_genes
+            continue
+        intersect_genes = intersect_genes.intersection(curr_genes)
+        union_genes = union_genes.union(curr_genes)
+    if union_genes is None or len(union_genes) == 0:
+        return 0
+    return 1 - (len(intersect_genes) / len(union_genes))
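
Note (reviewer annotation, not part of the patch): a self-contained sketch of what the two new metrics measure, using plain numpy and Python sets, and using ',' as a stand-in for GENE_SPLIT_SYMBOL (its real value lives in vcf_ops.genes and is not shown in this diff):

    import numpy as np

    # h-score: mean distance of each pipeline's (recall, precision) point from the
    # centroid. Identical pipelines give 0; the more they disagree, the higher it gets.
    points = np.array([(0.90, 0.80), (0.80, 0.90), (0.85, 0.85)])
    centroid = points.mean(axis=0)  # (0.85, 0.85)
    h_score = np.mean(np.linalg.norm(points - centroid, axis=1))
    print(round(h_score, 4))  # 0.0471

    # gdr (gene discordance ratio): 1 - |intersection| / |union| of the affected
    # gene sets reported by each pipeline. 0 means full agreement, 1 means none.
    gene_sets = [set(s.split(',')) for s in ('TP53,KRAS,EGFR', 'TP53,KRAS', 'TP53,BRAF')]
    gdr = 1 - len(set.intersection(*gene_sets)) / len(set.union(*gene_sets))
    print(round(gdr, 2))  # 0.75

In compute_gdr, float entries (pandas NaN for pipelines that report no affected genes) are skipped, so a missing gene list does not automatically push the ratio to 1.
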
diff --git a/modules/oncoliner_improvement/src/_internal/recommendator.py b/modules/oncoliner_improvement/src/_internal/recommendator.py
index f02fc52..fda2cd5 100644
--- a/modules/oncoliner_improvement/src/_internal/recommendator.py
+++ b/modules/oncoliner_improvement/src/_internal/recommendator.py
@@ -64,7 +64,7 @@ def compute_improvements(callers_folders, user_folder, results_output_folder, re
     return improvement_list
 
 
 def filter_operations(df: pd.DataFrame, loss_margin: float, max_recommendations: int):
-    filtered_df = filter_metrics_recommendations(df, loss_margin, max_recommendations, 'added_callers', ['f1_score', 'recall', 'precision'])
+    filtered_df = filter_metrics_recommendations(df, loss_margin, max_recommendations, 'added_callers')
    result = set(filtered_df['operation'].unique())
     return result
diff --git a/oncoliner_launcher.py b/oncoliner_launcher.py
index 901d4e8..31a9f8c 100644
--- a/oncoliner_launcher.py
+++ b/oncoliner_launcher.py
@@ -91,11 +91,8 @@ def check_samples_folders(pipeline_folder_path: str, sample_names: Set[str]) ->
     """
     pipeline_samples = set(os.listdir(pipeline_folder_path))
     missing_samples = sample_names - pipeline_samples
-    missing_pipeline_samples = pipeline_samples - sample_names
     if missing_samples:
         raise Exception(f'The following samples are missing in {pipeline_folder_path}: {missing_samples}')
-    if missing_pipeline_samples:
-        logging.warning(f'The following samples are missing from the config file: {missing_pipeline_samples}')
 
 
 if __name__ == '__main__':
diff --git a/shared/vcf_ops/setup.cfg b/shared/vcf_ops/setup.cfg
deleted file mode 100644
index 6e7a831..0000000
--- a/shared/vcf_ops/setup.cfg
+++ /dev/null
@@ -1,7 +0,0 @@
-[semantic_release]
-version_variable = src/vcf_ops/__init__.py:__version__
-branch = main
-
-[metadata]
-description-file=README.md
-license_files=LICENSE
diff --git a/shared/vcf_ops/src/vcf_ops/__init__.py b/shared/vcf_ops/src/vcf_ops/__init__.py
index d6004ea..e69de29 100644
--- a/shared/vcf_ops/src/vcf_ops/__init__.py
+++ b/shared/vcf_ops/src/vcf_ops/__init__.py
@@ -1,9 +0,0 @@
-# Copyright 2023 - Barcelona Supercomputing Center
-# Author: Rodrigo Martín
-# BSC Dual License
-
-# Export VariantType from VariantExtractor
-from variant_extractor.variants import VariantType
-
-__version__ = '0.0.1'
-__author__ = 'Rapsssito'
diff --git a/shared/vcf_ops/src/vcf_ops/metrics.py b/shared/vcf_ops/src/vcf_ops/metrics.py
index a64a2c6..59d37c1 100644
--- a/shared/vcf_ops/src/vcf_ops/metrics.py
+++ b/shared/vcf_ops/src/vcf_ops/metrics.py
@@ -82,12 +82,12 @@ def aggregate_metrics(metrics_list):
     agg_metrics = agg_metrics[METRICS_COLUMNS]
     return agg_metrics
 
-def filter_metrics_recommendations(df: pd.DataFrame, loss_margin: float, max_recommendations: int, num_callers_column='num_callers', ranking_columns=['f1_score', 'recall', 'precision']):
+def filter_metrics_recommendations(df: pd.DataFrame, loss_margin: float, max_recommendations: int, num_callers_column='num_callers', ranking_columns=['f1_score', 'recall', 'precision'], priority_columns=['f1_score', 'recall', 'precision']):
     if len(df) == 0:
         return df
     selected_dfs = []
     for i, ranking_column in enumerate(ranking_columns):
-        sort_columns = [ranking_column] + ranking_columns[:i] + ranking_columns[i+1:]
+        sort_columns = [ranking_column] + priority_columns
         # Filter all rows with the max element for each column - loss_margin
         df_temp = df
         for column in sort_columns:
diff --git a/tools/pipeline_designer/src/main.py b/tools/pipeline_designer/src/main.py
index 7a3bf37..fbd1b0e 100644
--- a/tools/pipeline_designer/src/main.py
+++ b/tools/pipeline_designer/src/main.py
@@ -251,7 +251,7 @@ def execute_operations(operations: List[str], combinations_folder: str, evaluati
 
 
 def filter_operations(df: pd.DataFrame, loss_margin: float, max_recommendations: int):
-    return filter_metrics_recommendations(df, loss_margin, max_recommendations, 'num_callers', ['f1_score', 'recall', 'precision'])['operation'].unique()
+    return filter_metrics_recommendations(df, loss_margin, max_recommendations, 'num_callers')['operation'].unique()
 
 
 def write_improvement_lists(evaluation_callers_folders: List[str], loss_margin: float, max_recommendations: int, output_folder: str, processes: int):
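
Note (reviewer annotation, not part of the patch): filter_metrics_recommendations now takes its tie-breaking columns as an explicit priority_columns argument instead of reusing the remaining ranking_columns, which is why the callers in recommendator.py and pipeline_designer simply drop the explicit list and rely on the defaults. A small sketch of how the sort keys change under the default arguments:

    # Defaults taken from the new signature; one line of output per ranking pass.
    ranking_columns = ['f1_score', 'recall', 'precision']
    priority_columns = ['f1_score', 'recall', 'precision']
    for i, ranking_column in enumerate(ranking_columns):
        old_sort = [ranking_column] + ranking_columns[:i] + ranking_columns[i + 1:]
        new_sort = [ranking_column] + priority_columns
        print(old_sort, '->', new_sort)
    # ['f1_score', 'recall', 'precision'] -> ['f1_score', 'f1_score', 'recall', 'precision']
    # ['recall', 'f1_score', 'precision'] -> ['recall', 'f1_score', 'recall', 'precision']
    # ['precision', 'f1_score', 'recall'] -> ['precision', 'f1_score', 'recall', 'precision']

With identical defaults the only difference is a duplicated sort key, but the decoupling is what lets the harmonizator rank by the inverted heterogeneity metrics while keeping the averaged F1/recall/precision as secondary criteria.
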