diff --git a/.vscode/settings.json b/.vscode/settings.json index a49ee13..c594fa6 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,3 +1,8 @@ { - "makefile.configureOnOpen": false + "makefile.configureOnOpen": false, + "python.testing.pytestArgs": [ + "tests" + ], + "python.testing.unittestEnabled": false, + "python.testing.pytestEnabled": true } diff --git a/src/auto_acmg.py b/src/auto_acmg.py index 2d10881..658adf0 100644 --- a/src/auto_acmg.py +++ b/src/auto_acmg.py @@ -432,6 +432,8 @@ def parse_strucvar_data(self, strucvar: StrucVar) -> AutoACMGStrucVarResult: gene_transcript.genomeAlignments[0].strand ) self.strucvar_result.data.exons = gene_transcript.genomeAlignments[0].exons + self.strucvar_result.data.start_cdn = gene_transcript.startCodon + self.strucvar_result.data.stop_cdn = gene_transcript.stopCodon return self.strucvar_result diff --git a/src/defs/auto_acmg.py b/src/defs/auto_acmg.py index 3521ea0..a9cf1bd 100644 --- a/src/defs/auto_acmg.py +++ b/src/defs/auto_acmg.py @@ -672,10 +672,10 @@ class AutoACMGStrucVarData(AutoAcmgBaseModel): hgnc_id: str = "" transcript_id: str = "" transcript_tags: List[str] = [] - # prot_pos: int = -1 - # prot_length: int = -1 strand: GenomicStrand = GenomicStrand.NotSet exons: List[Exon] = [] + start_cdn: int = 0 + stop_cdn: int = 0 class AutoACMGStrucVarPred(AutoAcmgBaseModel): diff --git a/src/defs/mehari.py b/src/defs/mehari.py index be38f08..9517756 100644 --- a/src/defs/mehari.py +++ b/src/defs/mehari.py @@ -18,7 +18,7 @@ class Exon(BaseModel): altEndI: int altCdsStartI: int altCdsEndI: int - cigar: str + cigar: Optional[str] = None ord: Optional[int] = None diff --git a/src/strucvar/auto_pvs1.py b/src/strucvar/auto_pvs1.py index ca76bfa..a23bcbf 100644 --- a/src/strucvar/auto_pvs1.py +++ b/src/strucvar/auto_pvs1.py @@ -119,7 +119,8 @@ def _count_lof_vars(self, strucvar: StrucVar) -> Tuple[int, int]: strucvar: The structural variant being analyzed. Returns: - Tuple[int, int]: The number of frequent LoF variants and the total number of LoF variants. + Tuple[int, int]: The number of frequent LoF variants and the total number of LoF + variants. Raises: AlgorithmError: If the end position is less than the start position. @@ -157,6 +158,87 @@ def _count_lof_vars(self, strucvar: StrucVar) -> Tuple[int, int]: "Failed to get variant from range. No gnomAD genomes data." ) + def _calc_cds( + self, exons: List[Exon], strand: GenomicStrand, start_codon: int, stop_codon: int + ) -> List[Exon]: + """ + Remove UTRs from exons. + + Args: + exons: List of exons for the gene. + strand: The genomic strand of the gene. + start_codon: Position of the start codon. + stop_codon: Position of the stop codon. + + Returns: + List[Exon]: List of exons without UTRs. + + Raises: + MissingDataError: If the genomic strand is not set. + """ + if strand == GenomicStrand.NotSet: + raise MissingDataError("Genomic strand is not set. Cannot remove UTRs.") + + if strand == GenomicStrand.Plus: + # Remove 5' UTR + exons_remove = [] + for i, exon in enumerate(exons): + if exon.altEndI - exon.altStartI + 1 > start_codon: + exon.altStartI += start_codon + break + else: + exons_remove.append(i) + start_codon -= exon.altEndI - exon.altStartI + 1 + + # Remove exons + for i in exons_remove[::-1]: + exons.pop(i) + + exons_remove = [] + # Remove 3' UTR + for i, exon in enumerate(exons[::-1]): + if exon.altEndI - exon.altStartI + 1 > stop_codon: + exon.altEndI -= stop_codon + break + else: + exons_remove.append(i + 1) + stop_codon -= exon.altEndI - exon.altStartI + 1 + + # Remove exons + for i in exons_remove[::-1]: + exons.pop(-i) + + elif strand == GenomicStrand.Minus: + exons_remove = [] + # Remove 3' UTR + for i, exon in enumerate(exons): + if exon.altEndI - exon.altStartI + 1 > stop_codon: + exon.altStartI += stop_codon + break + else: + exons_remove.append(i) + stop_codon -= exon.altEndI - exon.altStartI + 1 + + # Remove exons + for i in exons_remove[::-1]: + exons.pop(i) + + exons_remove = [] + # Remove 5' UTR + for i, exon in enumerate(exons[::-1]): + if exon.altEndI - exon.altStartI + 1 > start_codon: + exon.altEndI -= start_codon + break + else: + exons_remove.append(i + 1) + start_codon -= exon.altEndI - exon.altStartI + 1 + + # Remove exons + for i in exons_remove[::-1]: + exons.pop(-i) + + return exons + def full_gene_del(self, strucvar: StrucVar, exons: List[Exon]) -> bool: """ Check if the variant is a full gene deletion. @@ -445,10 +527,53 @@ def lof_freq_in_pop(self, strucvar: StrucVar) -> bool: except AutoAcmgBaseException as e: raise AlgorithmError("Failed to predict LoF frequency for structural variant.") from e - @staticmethod - def lof_rm_gt_10pct_of_prot() -> bool: - """Check if the loss-of-function removes more than 10% of the protein.""" - return False + def lof_rm_gt_10pct_of_prot( + self, + strucvar: StrucVar, + exons: List[Exon], + strand: GenomicStrand, + start_codon: int, + stop_codon: int, + ) -> bool: + """ + Determine if the deletion removes more than 10% of the protein-coding sequence. + + First remove the UTRs from the exons. Then iterate through the CDS exons and calculate the + total CDS length and the length of the deleted region. Return True if the deletion removes + more than 10% of the protein-coding sequence, False otherwise. + + Args: + strucvar: The structural variant being analyzed. + exons: List of exons for the gene. + strand: The genomic strand of the gene. + start_codon: Position of the start codon. + stop_codon: Position of the stop codon. + + Returns: + bool: True if the deletion removes more than 10% of the protein, False otherwise. + """ + total_cds_length = 0 + deleted_length = 0 + + cds = self._calc_cds(exons, strand, start_codon, stop_codon) + + for exon in cds: + total_cds_length += exon.altEndI - exon.altStartI + 1 + + overlap_start = max(strucvar.start, exon.altStartI) + overlap_end = min(strucvar.stop, exon.altEndI) + + if overlap_start <= overlap_end: # Check if there is an overlap + overlap_length = overlap_end - overlap_start + 1 + deleted_length += overlap_length + + if total_cds_length == 0: + raise AlgorithmError( + "Total CDS length is zero. Cannot determine if deletion removes " + "more than 10% of the protein." + ) + deletion_percentage = (deleted_length // 3) / (total_cds_length // 3) + return deletion_percentage > 0.1 @staticmethod def dup_disrupt_rf() -> bool: @@ -518,7 +643,13 @@ def verify_pvs1( # pragma: no cover self.prediction_path = PVS1PredictionStrucVarPath.DEL5_1 else: self.comment_pvs1 += " =>" - if self.lof_rm_gt_10pct_of_prot(): + if self.lof_rm_gt_10pct_of_prot( + strucvar, + var_data.exons, + var_data.strand, + var_data.start_cdn, + var_data.stop_cdn, + ): self.prediction = PVS1Prediction.PVS1_Strong self.prediction_path = PVS1PredictionStrucVarPath.DEL6_1 else: @@ -538,7 +669,13 @@ def verify_pvs1( # pragma: no cover self.prediction_path = PVS1PredictionStrucVarPath.DEL5_2 else: self.comment_pvs1 += " =>" - if self.lof_rm_gt_10pct_of_prot(): + if self.lof_rm_gt_10pct_of_prot( + strucvar, + var_data.exons, + var_data.strand, + var_data.start_cdn, + var_data.stop_cdn, + ): self.prediction = PVS1Prediction.PVS1_Strong self.prediction_path = PVS1PredictionStrucVarPath.DEL6_2 else: diff --git a/tests/strucvar/test_auto_pvs1.py b/tests/strucvar/test_auto_pvs1.py index a93f3b5..732b94d 100644 --- a/tests/strucvar/test_auto_pvs1.py +++ b/tests/strucvar/test_auto_pvs1.py @@ -209,6 +209,151 @@ def test_count_lof_vars_api_failure(mock_get_variant_from_range, strucvar_helper strucvar_helper._count_lof_vars(strucvar) +# --------- _calc_cds --------- + + +@pytest.mark.parametrize( + "exons, strand, start_codon, stop_codon, expected", + [ + # Basic functionality: proper removal of UTRs + ( + [MagicMock(altStartI=100, altEndI=200), MagicMock(altStartI=300, altEndI=400)], + GenomicStrand.Plus, + 50, + 50, + [MagicMock(altStartI=150, altEndI=200), MagicMock(altStartI=300, altEndI=350)], + ), + # Edge cases: start or stop codon at the boundary + ( + [MagicMock(altStartI=100, altEndI=200)], + GenomicStrand.Plus, + 100, + 100, + [ + MagicMock(altStartI=200, altEndI=100) + ], # Exon is inverted due to complete UTR trimming + ), + # Multiple exons with varying UTR positions + ( + [MagicMock(altStartI=100, altEndI=150), MagicMock(altStartI=200, altEndI=250)], + GenomicStrand.Plus, + 25, + 25, + [MagicMock(altStartI=125, altEndI=150), MagicMock(altStartI=200, altEndI=225)], + ), + # No UTRs + ( + [MagicMock(altStartI=100, altEndI=150)], + GenomicStrand.Plus, + 0, + 0, + [MagicMock(altStartI=100, altEndI=150)], + ), + # Multiple exons deletion from the start + ( + [ + MagicMock(altStartI=100, altEndI=150), + MagicMock(altStartI=200, altEndI=250), + MagicMock(altStartI=300, altEndI=350), + ], + GenomicStrand.Plus, + 75, + 0, + [MagicMock(altStartI=224, altEndI=250), MagicMock(altStartI=300, altEndI=350)], + ), + # Multiple exons deletion from the end + ( + [ + MagicMock(altStartI=100, altEndI=150), + MagicMock(altStartI=200, altEndI=250), + MagicMock(altStartI=300, altEndI=350), + ], + GenomicStrand.Plus, + 0, + 75, + [MagicMock(altStartI=100, altEndI=150), MagicMock(altStartI=200, altEndI=226)], + ), + # Standard functionality: removing UTRs from the end of one exon and start of another + ( + [MagicMock(altStartI=100, altEndI=200), MagicMock(altStartI=300, altEndI=400)], + GenomicStrand.Minus, + 50, + 50, + [MagicMock(altStartI=150, altEndI=200), MagicMock(altStartI=300, altEndI=350)], + ), + # Boundary conditions: start or stop codon at the boundary + ( + [MagicMock(altStartI=100, altEndI=200)], + GenomicStrand.Minus, + 100, + 100, + [MagicMock(altStartI=100, altEndI=100)], # Minimal exon remains + ), + # Multiple exons with varying UTR removal + ( + [MagicMock(altStartI=100, altEndI=150), MagicMock(altStartI=200, altEndI=250)], + GenomicStrand.Minus, + 25, + 25, + [MagicMock(altStartI=125, altEndI=150), MagicMock(altStartI=200, altEndI=225)], + ), + # Complete removal of UTRs results in no exon left + ( + [MagicMock(altStartI=100, altEndI=150)], + GenomicStrand.Minus, + 150, + 150, + [], # Exon completely removed + ), + # No UTRs + ( + [MagicMock(altStartI=100, altEndI=150)], + GenomicStrand.Minus, + 0, + 0, + [MagicMock(altStartI=100, altEndI=150)], + ), + # Multiple exons deletion + ( + [ + MagicMock(altStartI=100, altEndI=150), + MagicMock(altStartI=200, altEndI=250), + MagicMock(altStartI=300, altEndI=350), + ], + GenomicStrand.Minus, + 75, + 0, + [MagicMock(altStartI=100, altEndI=150), MagicMock(altStartI=200, altEndI=226)], + ), + # Multiple exons deletion from the end + ( + [ + MagicMock(altStartI=100, altEndI=150), + MagicMock(altStartI=200, altEndI=250), + MagicMock(altStartI=300, altEndI=350), + ], + GenomicStrand.Minus, + 0, + 75, + [MagicMock(altStartI=224, altEndI=250), MagicMock(altStartI=300, altEndI=350)], + ), + ], +) +def test_calc_cds(exons, strand, start_codon, stop_codon, expected, strucvar_helper): + """Test the _calc_cds method.""" + result = strucvar_helper._calc_cds(exons, strand, start_codon, stop_codon) + print(result) + for i, exon in enumerate(result): + assert exon.altStartI == expected[i].altStartI + assert exon.altEndI == expected[i].altEndI + + +def test_calc_cds_error(strucvar_helper): + """Test error handling in _calc_cds.""" + with pytest.raises(MissingDataError): + strucvar_helper._calc_cds([], GenomicStrand.NotSet, 0, 0) + + # --------- full_gene_del --------- @@ -559,6 +704,42 @@ def test_lof_freq_in_pop_api_error(mock_count_lof_vars, strucvar_helper, strucva strucvar_helper.lof_freq_in_pop(strucvar) +# --------- lof_rm_gt_10pct_of_prot --------- + + +@pytest.mark.parametrize( + "strucvar_start, strucvar_stop, expected", + [ + (100, 103, False), # Partial small overlap with one exon + (100, 150, True), # Partial big overlap with one exon + (50, 210, True), # Full overlap including two exons + (10, 45, False), # No overlap before exons + (220, 230, False), # No overlap after exons + (151, 159, False), # No overlap between exons + (50, 150, True), # Exact boundary overlap + (160, 210, True), # Overlapping second exon fully + (50, 500, True), # Overlapping all exons excessively + ], +) +def test_lof_rm_gt_10pct_of_prot( + strucvar_helper, exons, strucvar_start, strucvar_stop, expected, monkeypatch +): + """Test the lof_rm_gt_10pct_of_prot method.""" + mock_calc_cds = MagicMock(return_value=exons) + monkeypatch.setattr(strucvar_helper, "_calc_cds", mock_calc_cds) + strucvar = StrucVar( + sv_type=StrucVarType.DEL, + genome_release=GenomeRelease.GRCh38, + chrom="1", + start=strucvar_start, + stop=strucvar_stop, + ) + result = strucvar_helper.lof_rm_gt_10pct_of_prot(strucvar, exons, GenomicStrand.Plus, 0, 0) + assert ( + result == expected + ), f"Expected {expected} for SV from {strucvar_start} to {strucvar_stop}" + + # ========== AutoPVS1 ============