Skip to content

Commit

Permalink
option to specify grouped counts format
Browse files Browse the repository at this point in the history
  • Loading branch information
andrewprzh committed Aug 26, 2024
1 parent aa62894 commit efde1d4
Show file tree
Hide file tree
Showing 5 changed files with 66 additions and 26 deletions.
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -582,6 +582,11 @@ In this case the sum of all gene/transcript TPMs may not add up to 1 million.
Experiments with simulated data show that this method could give more accurate estimations.
However, the normalization method does not affect correlations or relative proportions.

`--counts_format`
Output format for grouped counts:
* `matrix` - usual format with genes as rows and groups as columns;
* `linear` - linear format, each line contains gene/transcript id, group name and respective count value (no TPM output);
* `both` - output counts in both formats (default).

### Examples
<a name="examples"></a>
Expand Down
6 changes: 6 additions & 0 deletions docs/cmd.md
Original file line number Diff line number Diff line change
Expand Up @@ -294,3 +294,9 @@ scale factor equals to 1 million divided by the number of all assigned reads.
In this case the sum of all gene/transcript TPMs may not add up to 1 million.
Experiments with simulated data show that this method could give more accurate estimations.
However, the normalization method does not affect correlations or relative proportions.

`--counts_format`
Output format for grouped counts:
* `matrix` - usual format with genes as rows and groups as columns;
* `linear` - linear format, each line contains gene/transcript id, group name and respective count value (no TPM output);
* `both` - output counts in both formats (default).
5 changes: 4 additions & 1 deletion isoquant.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
from src.dataset_processor import DatasetProcessor, PolyAUsageStrategies
from src.graph_based_model_construction import StrandnessReportingLevel
from src.long_read_assigner import AmbiguityResolvingMethod
from src.long_read_counter import COUNTING_STRATEGIES, CountingStrategy, NormalizationMethod
from src.long_read_counter import COUNTING_STRATEGIES, CountingStrategy, NormalizationMethod, GroupedOutputFormat
from src.input_data_storage import InputDataStorage
from src.multimap_resolver import MultimapResolvingStrategy
from src.stats import combine_counts
Expand Down Expand Up @@ -228,6 +228,9 @@ def add_hidden_option(*args, **kwargs): # show command only with --full-help
help="TPM normalization method: simple - conventional normalization using all counted reads;"
"usable_reads - includes all assigned reads.",
default=NormalizationMethod.simple.name)
add_additional_option("--counts_format", type=str, choices=[e.name for e in GroupedOutputFormat],
help="output format for grouped counts",
default=GroupedOutputFormat.both.name)

add_additional_option_to_group(pipeline_args_group, "--keep_tmp", help="do not remove temporary files "
"in the end", action='store_true',
Expand Down
8 changes: 6 additions & 2 deletions src/dataset_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
CompositeCounter,
create_gene_counter,
create_transcript_counter,
GroupedOutputFormat,
)
from .multimap_resolver import MultimapResolver
from .read_groups import (
Expand Down Expand Up @@ -320,6 +321,7 @@ def __init__(self, args, sample, read_groups, gffutils_db=None, chr_id=None, gzi
self.read_groups = read_groups
self.common_header = "# Command line: " + args._cmd_line + "\n# IsoQuant version: " + args._version + "\n"
self.io_support = IOSupport(self.args)
self.grouped_format = GroupedOutputFormat[self.args.counts_format]

self.gene_set = set()
self.transcript_set = set()
Expand Down Expand Up @@ -369,11 +371,13 @@ def __init__(self, args, sample, read_groups, gffutils_db=None, chr_id=None, gzi
self.gene_grouped_counter = create_gene_counter(sample.out_gene_grouped_counts_tsv,
self.args.gene_quantification,
complete_feature_list=self.gene_set,
read_groups=self.read_groups)
read_groups=self.read_groups,
grouped_format=self.grouped_format)
self.transcript_grouped_counter = create_transcript_counter(sample.out_transcript_grouped_counts_tsv,
self.args.transcript_quantification,
complete_feature_list=self.transcript_set,
read_groups=self.read_groups)
read_groups=self.read_groups,
grouped_format=self.grouped_format)
self.global_counter.add_counters([self.gene_grouped_counter, self.transcript_grouped_counter])

if self.args.count_exons:
Expand Down
68 changes: 45 additions & 23 deletions src/long_read_counter.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,19 @@ def __init__(self, counting_strategy):
self.use_inconsistent = counting_strategy.inconsistent()


@unique
class GroupedOutputFormat(Enum):
    """Selects which layout(s) grouped count tables are written in."""
    matrix = 1
    linear = 2
    both = 3

    def output_matrix(self):
        # Matrix layout is emitted for every choice except linear-only.
        return self is not GroupedOutputFormat.linear

    def output_linear(self):
        # Linear layout is emitted for every choice except matrix-only.
        return self is not GroupedOutputFormat.matrix


# the difference from defaultdict is that it does not add 0 to the dict when get is called for inexistent key
class IncrementalDict:
def __init__(self, default_type=float):
Expand Down Expand Up @@ -220,8 +233,11 @@ def add_unaligned(self, n_reads=1):
# get_feature_id --- function that returns feature id form IsoformMatch object
class AssignedFeatureCounter(AbstractCounter):
def __init__(self, output_prefix, assignment_extractor, read_groups, read_counter,
all_features=None, output_zeroes=True):
all_features=None, output_zeroes=True, grouped_format=GroupedOutputFormat.both):
AbstractCounter.__init__(self, output_prefix, not read_groups, output_zeroes)
self.output_grouped_matrix = grouped_format.output_matrix()
self.output_grouped_linear = grouped_format.output_linear()

self.assignment_extractor = assignment_extractor
self.all_features = set(all_features) if all_features is not None else set()
if not read_groups:
Expand Down Expand Up @@ -253,7 +269,6 @@ def get_linear_output_file_handler(self):
else:
return None


def add_read_info(self, read_assignment=None):
if not read_assignment:
self.not_aligned_reads += 1
Expand Down Expand Up @@ -363,23 +378,30 @@ def dump_ungrouped(self, all_features):
f.write("__usable\t%d\n" % self.reads_for_tpm)

def dump_grouped(self, all_features, all_groups):
    """Write grouped per-feature counts in the configured format(s).

    Emits a matrix table (features as rows, groups as columns) when
    self.output_grouped_matrix is set, and/or a linear table
    (feature_id, group_id, count per line) when self.output_grouped_linear
    is set, per the GroupedOutputFormat chosen by the user.
    """
    # Only open the files that are actually requested; the linear handler
    # getter can return None (it has a `return None` branch), so guard it.
    output_file = self.get_output_file_handler() if self.output_grouped_matrix else None
    linear_output_file = self.get_linear_output_file_handler() if self.output_grouped_linear else None

    if output_file is not None:
        output_file.write(self.format_header(all_groups))
    if linear_output_file is not None:
        linear_output_file.write("#feature_id\tgroup_id\tcount\n")

    for feature_id in all_features:
        row_count = 0
        # Linear output lists only the groups that actually have counts.
        for group_id in self.feature_counter[feature_id].data.keys():
            count = self.feature_counter[feature_id].data[group_id]
            if linear_output_file is not None:
                linear_output_file.write("%s\t%s\t%.2f\n" % (feature_id, self.ordered_groups[group_id], count))
            row_count += count
        # Skip all-zero rows in the matrix when zero output is disabled.
        if not self.output_zeroes and row_count == 0:
            continue

        if output_file is not None:
            # Matrix output emits a value for every group, zeroes included.
            count_values = [self.feature_counter[feature_id].get(self.group_numeric_ids[group_id])
                            for group_id in all_groups]
            output_file.write("%s\t%s\n" % (feature_id, "\t".join(["%.2f" % c for c in count_values])))

    if output_file is not None:
        output_file.close()
    if linear_output_file is not None:
        linear_output_file.close()

def convert_counts_to_tpm(self, normalization_str=NormalizationMethod.simple.name):
normalization = NormalizationMethod[normalization_str]
Expand Down Expand Up @@ -429,19 +451,19 @@ def convert_counts_to_tpm(self, normalization_str=NormalizationMethod.simple.nam


def create_gene_counter(output_file_name, strategy, complete_feature_list=None,
                        read_groups=None, output_zeroes=True, grouped_format=GroupedOutputFormat.both):
    """Build an AssignedFeatureCounter that counts reads per gene.

    grouped_format controls which grouped-count layouts are written
    (matrix, linear, or both); defaults to both for backward compatibility.
    """
    read_weight_counter = ReadWeightCounter(strategy)
    return AssignedFeatureCounter(output_file_name, GeneAssignmentExtractor,
                                  read_groups, read_weight_counter,
                                  complete_feature_list, output_zeroes, grouped_format)


def create_transcript_counter(output_file_name, strategy, complete_feature_list=None,
                              read_groups=None, output_zeroes=True, grouped_format=GroupedOutputFormat.both):
    """Build an AssignedFeatureCounter that counts reads per transcript.

    grouped_format controls which grouped-count layouts are written
    (matrix, linear, or both); defaults to both for backward compatibility.
    """
    read_weight_counter = ReadWeightCounter(strategy)
    return AssignedFeatureCounter(output_file_name, TranscriptAssignmentExtractor,
                                  read_groups, read_weight_counter,
                                  complete_feature_list, output_zeroes, grouped_format)


# count simple features inclusion/exclusion (exons / introns)
Expand Down

0 comments on commit efde1d4

Please sign in to comment.