Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement lof removes more than 10% of protein for strucvars #203

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
{
"makefile.configureOnOpen": false
"makefile.configureOnOpen": false,
"python.testing.pytestArgs": [
"tests"
],
"python.testing.unittestEnabled": false,
"python.testing.pytestEnabled": true
}
2 changes: 2 additions & 0 deletions src/auto_acmg.py
Original file line number Diff line number Diff line change
Expand Up @@ -432,6 +432,8 @@ def parse_strucvar_data(self, strucvar: StrucVar) -> AutoACMGStrucVarResult:
gene_transcript.genomeAlignments[0].strand
)
self.strucvar_result.data.exons = gene_transcript.genomeAlignments[0].exons
self.strucvar_result.data.start_cdn = gene_transcript.startCodon
self.strucvar_result.data.stop_cdn = gene_transcript.stopCodon

return self.strucvar_result

Expand Down
4 changes: 2 additions & 2 deletions src/defs/auto_acmg.py
Original file line number Diff line number Diff line change
Expand Up @@ -672,10 +672,10 @@ class AutoACMGStrucVarData(AutoAcmgBaseModel):
hgnc_id: str = ""
transcript_id: str = ""
transcript_tags: List[str] = []
# prot_pos: int = -1
# prot_length: int = -1
strand: GenomicStrand = GenomicStrand.NotSet
exons: List[Exon] = []
start_cdn: int = 0
stop_cdn: int = 0


class AutoACMGStrucVarPred(AutoAcmgBaseModel):
Expand Down
2 changes: 1 addition & 1 deletion src/defs/mehari.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ class Exon(BaseModel):
altEndI: int
altCdsStartI: int
altCdsEndI: int
cigar: str
cigar: Optional[str] = None
ord: Optional[int] = None


Expand Down
151 changes: 144 additions & 7 deletions src/strucvar/auto_pvs1.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,8 @@ def _count_lof_vars(self, strucvar: StrucVar) -> Tuple[int, int]:
strucvar: The structural variant being analyzed.

Returns:
Tuple[int, int]: The number of frequent LoF variants and the total number of LoF variants.
Tuple[int, int]: The number of frequent LoF variants and the total number of LoF
variants.

Raises:
AlgorithmError: If the end position is less than the start position.
Expand Down Expand Up @@ -157,6 +158,87 @@ def _count_lof_vars(self, strucvar: StrucVar) -> Tuple[int, int]:
"Failed to get variant from range. No gnomAD genomes data."
)

def _calc_cds(
self, exons: List[Exon], strand: GenomicStrand, start_codon: int, stop_codon: int
) -> List[Exon]:
"""
Remove UTRs from exons.

Args:
exons: List of exons for the gene.
strand: The genomic strand of the gene.
start_codon: Position of the start codon.
stop_codon: Position of the stop codon.

Returns:
List[Exon]: List of exons without UTRs.

Raises:
MissingDataError: If the genomic strand is not set.
"""
if strand == GenomicStrand.NotSet:
raise MissingDataError("Genomic strand is not set. Cannot remove UTRs.")

if strand == GenomicStrand.Plus:
# Remove 5' UTR
exons_remove = []
for i, exon in enumerate(exons):
if exon.altEndI - exon.altStartI + 1 > start_codon:
exon.altStartI += start_codon
break
else:
exons_remove.append(i)
start_codon -= exon.altEndI - exon.altStartI + 1

# Remove exons
for i in exons_remove[::-1]:
exons.pop(i)

exons_remove = []
# Remove 3' UTR
for i, exon in enumerate(exons[::-1]):
if exon.altEndI - exon.altStartI + 1 > stop_codon:
exon.altEndI -= stop_codon
break
else:
exons_remove.append(i + 1)
stop_codon -= exon.altEndI - exon.altStartI + 1

# Remove exons
for i in exons_remove[::-1]:
exons.pop(-i)

elif strand == GenomicStrand.Minus:
exons_remove = []
# Remove 3' UTR
for i, exon in enumerate(exons):
if exon.altEndI - exon.altStartI + 1 > stop_codon:
exon.altStartI += stop_codon
break
else:
exons_remove.append(i)
stop_codon -= exon.altEndI - exon.altStartI + 1

# Remove exons
for i in exons_remove[::-1]:
exons.pop(i)

exons_remove = []
# Remove 5' UTR
for i, exon in enumerate(exons[::-1]):
if exon.altEndI - exon.altStartI + 1 > start_codon:
exon.altEndI -= start_codon
break
else:
exons_remove.append(i + 1)
start_codon -= exon.altEndI - exon.altStartI + 1

# Remove exons
for i in exons_remove[::-1]:
exons.pop(-i)

return exons

def full_gene_del(self, strucvar: StrucVar, exons: List[Exon]) -> bool:
"""
Check if the variant is a full gene deletion.
Expand Down Expand Up @@ -445,10 +527,53 @@ def lof_freq_in_pop(self, strucvar: StrucVar) -> bool:
except AutoAcmgBaseException as e:
raise AlgorithmError("Failed to predict LoF frequency for structural variant.") from e

@staticmethod
def lof_rm_gt_10pct_of_prot() -> bool:
"""Check if the loss-of-function removes more than 10% of the protein."""
return False
def lof_rm_gt_10pct_of_prot(
self,
strucvar: StrucVar,
exons: List[Exon],
strand: GenomicStrand,
start_codon: int,
stop_codon: int,
) -> bool:
"""
Determine if the deletion removes more than 10% of the protein-coding sequence.

First remove the UTRs from the exons. Then iterate through the CDS exons and calculate the
total CDS length and the length of the deleted region. Return True if the deletion removes
more than 10% of the protein-coding sequence, False otherwise.

Args:
strucvar: The structural variant being analyzed.
exons: List of exons for the gene.
strand: The genomic strand of the gene.
start_codon: Position of the start codon.
stop_codon: Position of the stop codon.

Returns:
bool: True if the deletion removes more than 10% of the protein, False otherwise.
"""
total_cds_length = 0
deleted_length = 0

cds = self._calc_cds(exons, strand, start_codon, stop_codon)

for exon in cds:
total_cds_length += exon.altEndI - exon.altStartI + 1

overlap_start = max(strucvar.start, exon.altStartI)
overlap_end = min(strucvar.stop, exon.altEndI)

if overlap_start <= overlap_end: # Check if there is an overlap
overlap_length = overlap_end - overlap_start + 1
deleted_length += overlap_length

if total_cds_length == 0:
raise AlgorithmError(
"Total CDS length is zero. Cannot determine if deletion removes "
"more than 10% of the protein."
)
deletion_percentage = (deleted_length // 3) / (total_cds_length // 3)
return deletion_percentage > 0.1

@staticmethod
def dup_disrupt_rf() -> bool:
Expand Down Expand Up @@ -518,7 +643,13 @@ def verify_pvs1( # pragma: no cover
self.prediction_path = PVS1PredictionStrucVarPath.DEL5_1
else:
self.comment_pvs1 += " =>"
if self.lof_rm_gt_10pct_of_prot():
if self.lof_rm_gt_10pct_of_prot(
strucvar,
var_data.exons,
var_data.strand,
var_data.start_cdn,
var_data.stop_cdn,
):
self.prediction = PVS1Prediction.PVS1_Strong
self.prediction_path = PVS1PredictionStrucVarPath.DEL6_1
else:
Expand All @@ -538,7 +669,13 @@ def verify_pvs1( # pragma: no cover
self.prediction_path = PVS1PredictionStrucVarPath.DEL5_2
else:
self.comment_pvs1 += " =>"
if self.lof_rm_gt_10pct_of_prot():
if self.lof_rm_gt_10pct_of_prot(
strucvar,
var_data.exons,
var_data.strand,
var_data.start_cdn,
var_data.stop_cdn,
):
self.prediction = PVS1Prediction.PVS1_Strong
self.prediction_path = PVS1PredictionStrucVarPath.DEL6_2
else:
Expand Down
Loading
Loading