diff --git a/src/lib/djerba/plugins/fusion/plugin.py b/src/lib/djerba/plugins/fusion/plugin.py index 5d10fc8a3..b642a9942 100644 --- a/src/lib/djerba/plugins/fusion/plugin.py +++ b/src/lib/djerba/plugins/fusion/plugin.py @@ -43,14 +43,18 @@ def sort_by_actionable_level(row): total_fusion_genes = fus_reader.get_total_fusion_genes() gene_pair_fusions = fus_reader.get_fusions() if gene_pair_fusions is not None: + outputs = fus_reader.fusions_to_json(gene_pair_fusions, wrapper.get_my_string(fc.ONCOTREE_CODE)) [rows, gene_info, treatment_opts] = outputs + #sort by OncoKB level rows = sorted(rows, key=sort_by_actionable_level) rows = oncokb_levels.filter_reportable(rows) + unique_rows = set(map(lambda x: x['fusion'], rows)) + results = { fc.TOTAL_VARIANTS: total_fusion_genes, - fc.CLINICALLY_RELEVANT_VARIANTS: fus_reader.get_total_oncokb_fusions(), + fc.CLINICALLY_RELEVANT_VARIANTS: len(unique_rows), fc.NCCN_RELEVANT_VARIANTS: fus_reader.get_total_nccn_fusions(), fc.BODY: rows } diff --git a/src/lib/djerba/plugins/fusion/test/plugin_test.py b/src/lib/djerba/plugins/fusion/test/plugin_test.py index 7ed58ebeb..3cd315304 100755 --- a/src/lib/djerba/plugins/fusion/test/plugin_test.py +++ b/src/lib/djerba/plugins/fusion/test/plugin_test.py @@ -50,7 +50,7 @@ def test(self): params = { self.INI: self.INI_NAME, self.JSON: self.JSON_NAME, - self.MD5: '3e5bb853abd4a76dbd45414ea1a9af52' + self.MD5: '6b80957262c258a0a0641b9ab9652725' } self.run_basic_test(input_dir, params, 'fusion', logging.ERROR, work_dir) diff --git a/src/lib/djerba/plugins/fusion/tools.py b/src/lib/djerba/plugins/fusion/tools.py index 7ad269c83..c30279ebd 100644 --- a/src/lib/djerba/plugins/fusion/tools.py +++ b/src/lib/djerba/plugins/fusion/tools.py @@ -42,51 +42,68 @@ def __init__(self, input_dir, log_level=logging.WARNING, log_path=None): [fusions, self.total_fusion_genes, self.total_oncokb_fusions, self.total_nccn_fusions] = self._collate_row_data(fusion_data, annotations) # sort the fusions by fusion ID self.fusions = sorted(fusions, key=lambda f: f.get_fusion_id_new()) - + def _collate_row_data(self, fusion_data, annotations): - fusions = [] - fusion_genes = set() + fusions = [] # List to store valid fusion entries + fusion_genes = set() # Set to track distinct genes involved in fusions self.logger.debug("Starting to collate fusion table data.") - intragenic = 0 - nccn_fusion_total = 0 - NCCN_fusions = set() + intragenic = 0 # Counter for intragenic fusions + nccn_fusion_total = 0 # Counter for fusions rescued by NCCN annotation + NCCN_fusions = set() # Set to store NCCN-annotated fusions + + # Read NCCN-annotated fusions from a file with open(os.path.join(self.input_dir, fc.DATA_FUSIONS_NCCN_ANNOTATED)) as data_file: for row in csv.DictReader(data_file, delimiter="\t"): - NCCN_fusions.add(row['Fusion']) + NCCN_fusions.add(row['Fusion']) # Add each fusion ID to the set + + # Iterate over all fusion IDs in fusion_data for fusion_id in fusion_data.keys(): - gene2_exists = True - if len(fusion_data[fusion_id])==1: - # skip intragenic fusions, but add to the gene count + gene2_exists = True # Assume a second gene exists initially + # Case: Intragenic fusions (only one gene involved) + if len(fusion_data[fusion_id]) == 1: + # Skip intragenic fusions, but add to the gene count fusion_genes.add(fusion_data[fusion_id][0][fc.HUGO_SYMBOL]) if fusion_id in NCCN_fusions: - self.logger.debug("Fusion {0} rescued by NCCN annotation".format(fusion)) - gene2_exists = False + # If the fusion is in the NCCN-annotated list, it's "rescued" + self.logger.debug("Fusion {0} rescued by NCCN annotation".format(fusion_id)) + gene2_exists = False # No second gene; marked as "Intergenic" gene2 = "Intergenic" - nccn_fusion_total += 1 + nccn_fusion_total += 1 # Increment NCCN-rescued fusion count else: - intragenic += 1 + intragenic += 1 # Increment intragenic count and skip processing continue elif len(fusion_data[fusion_id]) >= 3: + # Error case: More than two genes for a single fusion ID msg = "More than 2 fusions with the same name: {0}".format(fusion_id) self.logger.error(msg) raise RuntimeError(msg) + + # Normal case: Valid fusion data with one or two genes gene1 = fusion_data[fusion_id][0][fc.HUGO_SYMBOL] if gene2_exists: + # If a second gene exists, retrieve it gene2 = fusion_data[fusion_id][1][fc.HUGO_SYMBOL] + # Add both genes to the set fusion_genes.add(gene1) fusion_genes.add(gene2) + + # Case: Two genes exist for the fusion if gene2_exists: for row_input in annotations[fusion_id]: - effect = row_input['MUTATION_EFFECT'] - level = oncokb_levels.parse_oncokb_level(row_input) + effect = row_input['MUTATION_EFFECT'] # Get mutation effect + level = oncokb_levels.parse_oncokb_level(row_input) # Parse oncokb level else: + # Case: No second gene (rescued by NCCN) effect = "Undetermined" level = "P" + + # If the level is valid, add therapies information if level not in ['Unknown', 'NA']: if gene2_exists: therapies = oncokb_levels.parse_actionable_therapies(row_input) else: therapies = {"P": "Prognostic"} + # Append a new fusion object to the list fusions.append( fusion( fusion_id, @@ -101,13 +118,16 @@ def _collate_row_data(self, fusion_data, annotations): ) ) total = len(fusions) - nccn_fusion_total - total_fusion_genes = len(fusion_genes) - msg = "Finished collating fusion table data. "+\ - "Found {0} fusion rows for {1} distinct genes; ".format(total, total_fusion_genes)+\ + total_fusion_genes = len(fusion_genes) # Count distinct genes + + msg = "Finished collating fusion table data. " + \ + "Found {0} fusion rows for {1} distinct genes; ".format(total, total_fusion_genes) + \ "excluded {0} intragenic rows.".format(intragenic) self.logger.info(msg) + for fusion_row in fusions: self.logger.debug("Fusions: {0}".format(fusion_row.get_genes())) + return [fusions, total_fusion_genes, total, nccn_fusion_total] def build_treatment_entries(self, fusion, therapies, oncotree_code):