Skip to content

Commit

Permalink
pre-filter transcript before read assignment
Browse files Browse the repository at this point in the history
  • Loading branch information
andrewprzh committed Apr 3, 2024
1 parent 605afc7 commit 8dd13b5
Showing 1 changed file with 42 additions and 4 deletions.
46 changes: 42 additions & 4 deletions src/graph_based_model_construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,8 @@ def process(self, read_assignment_storage):

self.construct_fl_isoforms()
self.construct_assignment_based_isoforms(read_assignment_storage)

self.pre_filter_transcripts()
self.assign_reads_to_models(read_assignment_storage)
self.filter_transcripts()

Expand Down Expand Up @@ -198,10 +200,43 @@ def compare_models_with_known(self):
model.add_additional_attribute("alternatives", event_string)
self.transcript2transcript.append(assignment)

def pre_filter_transcripts(self):
internal_count_values = []
for model in self.transcript_model_storage:
if len(model.exon_blocks) <= 2:
internal_count_values.append(self.internal_counter[model.transcript_id])

internal_count_values = sorted(internal_count_values, reverse=True)
coverage_cutoff = self.params.min_novel_count
# dirty hack to avoid slow assignment in chrM
if len(internal_count_values) > 50:
coverage_cutoff = internal_count_values[50]

filtered_storage = []
for model in self.transcript_model_storage:
if len(model.exon_blocks) > 2:
filtered_storage.append(model)
continue

if (model.transcript_type != TranscriptModelType.known and
self.internal_counter[model.transcript_id] < coverage_cutoff):
del self.transcript_read_ids[model.transcript_id]
del self.internal_counter[model.transcript_id]
continue

mapq = self.mapping_quality(model.transcript_id)
if mapq < self.params.simple_models_mapq_cutoff:
del self.transcript_read_ids[model.transcript_id]
del self.internal_counter[model.transcript_id]
continue
filtered_storage.append(model)

self.transcript_model_storage = filtered_storage

def filter_transcripts(self):
filtered_storage = []
confirmed_transcipt_ids = set()
to_substitute = self.detect_similar_isoforms()
to_substitute = self.detect_similar_isoforms(self.transcript_model_storage)

for model in self.transcript_model_storage:
if model.transcript_type == TranscriptModelType.known:
Expand All @@ -223,13 +258,15 @@ def filter_transcripts(self):
#logger.debug("Novel model %s has a similar isoform %s" % (model.transcript_id, to_substitute[model.transcript_id]))
self.transcript_read_ids[to_substitute[model.transcript_id]] += self.transcript_read_ids[model.transcript_id]
del self.transcript_read_ids[model.transcript_id]
del self.internal_counter[model.transcript_id]
continue

if self.internal_counter[model.transcript_id] < novel_isoform_cutoff:
#logger.debug("Novel model %s has coverage %d < %.2f, component cov = %d" % (model.transcript_id,
# self.internal_counter[model.transcript_id],
# novel_isoform_cutoff, component_coverage))
del self.transcript_read_ids[model.transcript_id]
del self.internal_counter[model.transcript_id]
continue

if len(model.exon_blocks) <= 2:
Expand All @@ -238,6 +275,7 @@ def filter_transcripts(self):
if mapq < self.params.simple_models_mapq_cutoff:
#logger.debug("Novel model %s has poor quality" % model.transcript_id)
del self.transcript_read_ids[model.transcript_id]
del self.internal_counter[model.transcript_id]
continue

# TODO: correct ends for known
Expand All @@ -254,16 +292,16 @@ def mapping_quality(self, model):
mapq += a.mapping_quality
return mapq / len(self.transcript_read_ids[model.transcript_id])

def detect_similar_isoforms(self):
def detect_similar_isoforms(self, model_storage):
to_substitute = {}
for model in self.transcript_model_storage:
for model in model_storage:
if len(model.exon_blocks) <= 2 or model.transcript_id in to_substitute:
continue
transcript_model_gene_info = GeneInfo.from_models([model], self.params.delta)
assigner = LongReadAssigner(transcript_model_gene_info, self.params)
profile_constructor = CombinedProfileConstructor(transcript_model_gene_info, self.params)

for m in self.transcript_model_storage:
for m in model_storage:
if m.transcript_type == TranscriptModelType.known or m.transcript_id == model.transcript_id or \
m.transcript_id in to_substitute or len(m.exon_blocks) == 1 or not m.intron_path or \
len(m.exon_blocks) > len(model.exon_blocks):
Expand Down

0 comments on commit 8dd13b5

Please sign in to comment.