Skip to content

Commit

Permalink
Merge pull request #57 from UMCUGenetics/release/v3.4.0
Browse files Browse the repository at this point in the history
Release/v3.4.0
  • Loading branch information
rernst authored Feb 1, 2022
2 parents 45555a4 + 6f5a385 commit d8cf9fb
Show file tree
Hide file tree
Showing 11 changed files with 343 additions and 93 deletions.
2 changes: 2 additions & 0 deletions ExonCov.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,10 @@
db_manager.add_command('migrate', MigrateCommand)
db_manager.add_command('stats', cli.PrintStats())
db_manager.add_command('panel_genes', cli.PrintPanelGenesTable())
db_manager.add_command('coverage_stats', cli.ExportCovStatsSampleSet())
db_manager.add_command('gene_transcripts', cli.PrintTranscripts())
db_manager.add_command('import_alias_table', cli.ImportAliasTable())
db_manager.add_command('export_alias_table', cli.ExportAliasTable())
db_manager.add_command('export_panel_bed', cli.PrintPanelBed())
# db_manager.add_command('load_design', cli.LoadDesign()) # Disabled, can be used to setup a new database from scratch

Expand Down
1 change: 0 additions & 1 deletion ExonCov/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
from flask_security import Security, SQLAlchemyUserDatastore
from flask_debugtoolbar import DebugToolbarExtension


from ExonCov.utils import url_for_other_page, event_logger

# Setup APP
Expand Down
212 changes: 192 additions & 20 deletions ExonCov/cli.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""CLI functions."""
from collections import OrderedDict
import sys
import re
import time
Expand All @@ -11,6 +12,7 @@
from flask_script import Command, Option
from sqlalchemy.orm import joinedload
from sqlalchemy.exc import IntegrityError
from sqlalchemy.orm.exc import NoResultFound
from sqlalchemy.sql.expression import func
import tempfile
import shutil
Expand All @@ -21,7 +23,6 @@
Gene, GeneAlias, Transcript, Exon, SequencingRun, Sample, SampleProject, TranscriptMeasurement, Panel,
PanelVersion, panels_transcripts, CustomPanel, SampleSet
)
from .utils import weighted_average


class PrintStats(Command):
Expand Down Expand Up @@ -293,9 +294,9 @@ def run(self, project_name, sample_type, bam, exon_bed_file, threads, overwrite,
else:
measurement_types = ['measurement_mean_coverage', 'measurement_percentage10', 'measurement_percentage15', 'measurement_percentage20', 'measurement_percentage30', 'measurement_percentage50', 'measurement_percentage100']
for measurement_type in measurement_types:
transcripts_measurements[transcript.id][measurement_type] = weighted_average(
[transcripts_measurements[transcript.id][measurement_type], exon_measurement[measurement_type]],
[transcripts_measurements[transcript.id]['len'], exon.len]
transcripts_measurements[transcript.id][measurement_type] = utils.weighted_average(
values=[transcripts_measurements[transcript.id][measurement_type], exon_measurement[measurement_type]],
weights=[transcripts_measurements[transcript.id]['len'], exon.len]
)
transcripts_measurements[transcript.id]['len'] += exon.len

Expand Down Expand Up @@ -372,9 +373,9 @@ def run(self, samples, panels):

# Calculate average panel 15X coverage and compare with coverage_requirement_15
panel_qc = False
panel_measurement_percentage15_avg = weighted_average(
[tm[1].measurement_percentage15 for tm in transcript_measurements],
[tm[1].len for tm in transcript_measurements]
panel_measurement_percentage15_avg = utils.weighted_average(
values=[tm[1].measurement_percentage15 for tm in transcript_measurements],
weights=[tm[1].len for tm in transcript_measurements]
)
if panel_measurement_percentage15_avg >= panel.coverage_requirement_15:
panel_qc = True
Expand All @@ -385,19 +386,37 @@ def run(self, samples, panels):
if transcript.gene in panel.core_genes and transcript_measurement.measurement_percentage15 != 100:
core_gene_qc = False

print("{sample}\t{panel}\t{panel_min_15x}\t{panel_15x}\t{panel_qc}\t{core_gene_qc}".format(
self.print_result(
sample=sample.name,
panel=panel,
panel_min_15x=panel.coverage_requirement_15,
panel_15x=panel_measurement_percentage15_avg,
panel_qc=panel_qc,
core_gene_qc=core_gene_qc
))
else:
print("# Unknown sample or panel\tsample={sample}\tpanel={panel}".format(
sample=sample_id,
panel=panels[index],
))
)
else: # sample and/or panel not found
sample_msg = 'unknown_sample={0}'.format(sample_id)
panel_msg = 'unknown_panel={0}'.format(panels[index])

if sample:
sample_msg = sample.name
if panel:
panel_msg = panel

self.print_result(
sample=sample_msg,
panel=panel_msg,
)

def print_result(self, sample, panel, panel_min_15x='', panel_15x='', panel_qc='', core_gene_qc=''):
print("{sample}\t{panel}\t{panel_min_15x}\t{panel_15x}\t{panel_qc}\t{core_gene_qc}".format(
sample=sample,
panel=panel,
panel_min_15x=panel_min_15x,
panel_15x=panel_15x,
panel_qc=panel_qc,
core_gene_qc=core_gene_qc
))


class RemoveSample(Command):
Expand Down Expand Up @@ -442,18 +461,27 @@ class CreateSampleSet(Command):

option_list = (
Option('name'),
Option('-m', '--min_days', dest='min_days', type=int, default=0),
Option('-d', '--max_days', dest='max_days', type=int, default=180),
Option('-s', '--sample_filter', dest='sample_filter', default=''),
Option('-t', '--sample_type', dest='sample_type', default='WES'),
Option('-n', '--sample_number', dest='sample_number', type=int, default=100),
)

def run(self, name, max_days, sample_filter, sample_type, sample_number):
description = '{0} random {1} samples. Maximum age: {2} days. Sample name filter: {3}'.format(
sample_number, sample_type, max_days, sample_filter
def run(self, name, min_days, max_days, sample_filter, sample_type, sample_number):
description = '{0} random {1} samples. Minimum age: {2} days. Maximum age: {3} days. Sample name filter: {4}'.format(
sample_number, sample_type, min_days, max_days, sample_filter
)
filter_date = datetime.date.today() - datetime.timedelta(days=max_days)

min_date = datetime.date.today() - datetime.timedelta(days=min_days)
max_date = datetime.date.today() - datetime.timedelta(days=max_days)

if max_date > min_date:
raise ValueError(
(
"Incorect use of max_days ({max_days}) and/or min_days ({min_days}): "
"maximum date {max_date} is greater than min date {min_date}"
).format(max_days=max_days, min_days=min_days, max_date=max_date, min_date=min_date)
)
samples = (
Sample.query
.filter(Sample.name.like('%{0}%'.format(sample_filter)))
Expand All @@ -470,7 +498,8 @@ def run(self, name, max_days, sample_filter, sample_type, sample_number):
for sample in samples:
# Filter sampels: import date, 'special' project type (validation etc), Merge samples,
if (
sample.import_date > filter_date
sample.import_date > max_date
and sample.import_date <= min_date
and not sample.project.type
and len(sample.sequencing_runs) == 1
and sample.sequencing_runs[0].platform_unit in sample.project.name
Expand Down Expand Up @@ -558,6 +587,16 @@ def run(self):
print("ERROR: Can not import alias: {0} for gene: {1}".format(hgnc_gene_id, db_gene_id))


class ExportAliasTable(Command):
"""Print tab delimited HGNC alias / gene ID table"""

def run(self):
print('hgnc_gene_id_alias\tgene_id')
gene_aliases = GeneAlias.query.order_by(GeneAlias.id).all()
for gene in gene_aliases:
print('{alias}\t{gene}'.format(alias=gene.id, gene=gene.gene_id))


class PrintPanelBed(Command):
"""Print bed file containing regions in active and validated panels. FULL_autosomal and FULL_TARGET are filtered from the list."""

Expand Down Expand Up @@ -746,3 +785,136 @@ def run(self):
print("WARNING: Unkown gene: {0}".format(gene))
db.session.add(panel_version)
db.session.commit()


class ExportCovStatsSampleSet(Command):
"""Print tab delimited coverage statistics of panel or transcript as table."""

option_list = (
Option('sample_set_id', type=str),
Option('data_type', type=str, choices=["panel", "transcript"]),
Option(
'-m', '--measurement_type',
choices=[
'measurement_mean_coverage',
'measurement_percentage10',
'measurement_percentage15',
'measurement_percentage20',
'measurement_percentage30',
'measurement_percentage50',
'measurement_percentage100'
],
dest='measurement_type',
default='measurement_percentage15'
),
)


def run(self, sample_set_id, data_type, measurement_type):
# retrieve samples from sampleset
try:
sample_set = SampleSet.query.options(joinedload('samples')).filter_by(id=sample_set_id).one()
except NoResultFound as e:
print("Sample set ID {id} does not exist in database.".format(id=sample_set_id))
sys.exit(e)
sample_ids = [sample.id for sample in sample_set.samples]

# retrieve panels, transcripts measurements per sample
query = (
db.session.query(PanelVersion, TranscriptMeasurement)
.filter_by(active=True, validated=True)
.filter(PanelVersion.panel_name.notlike("%FULL%"))
.join(Transcript, PanelVersion.transcripts)
.join(TranscriptMeasurement)
.filter(TranscriptMeasurement.sample_id.in_(sample_ids))
.order_by(
PanelVersion.panel_name,
PanelVersion.id,
TranscriptMeasurement.transcript_id,
TranscriptMeasurement.sample_id)
.all()
)

if data_type == "panel":
self.retrieve_and_print_panel_measurements(
ss_samples=sample_set.samples, query=query, measurement_type=measurement_type
)
elif data_type == "transcript":
self.retrieve_and_print_transcript_measurements(query=query, measurement_type=measurement_type)


def retrieve_and_print_panel_measurements(self, ss_samples, query, measurement_type):
panels_measurements = OrderedDict()
# retrieve panel measurements
for panel, transcript_measurement in query:
sample = transcript_measurement.sample
# add new panel
if panel not in panels_measurements:
panels_measurements[panel] = OrderedDict()
# add new sample or calculate weighted avg
if sample not in panels_measurements[panel]:
panels_measurements[panel][sample] = {
'len': transcript_measurement.len,
'measurement': transcript_measurement[measurement_type]
}
else:
panels_measurements[panel][sample]['measurement'] = utils.weighted_average(
values=[panels_measurements[panel][sample]['measurement'], transcript_measurement[measurement_type]],
weights=[panels_measurements[panel][sample]['len'], transcript_measurement.len]
)
panels_measurements[panel][sample]['len'] += transcript_measurement.len

# print header
print("panel_version\tmeasurement_type\tmean\tmin\tmax")
for panel in panels_measurements:
panel_cov_stats = utils.get_summary_stats_multi_sample(
measurements=panels_measurements, keys=[panel], samples=ss_samples
)
print(
"{panel_version}\t{measurement_type}\t{mean}\t{min}\t{max}".format(
panel_version=panel,
measurement_type=measurement_type,
mean=panel_cov_stats[panel]['mean'],
min=panel_cov_stats[panel]['min'],
max=panel_cov_stats[panel]['max']
)
)


def retrieve_and_print_transcript_measurements(self, query, measurement_type):
panels_measurements = OrderedDict()

# retrieve transcript measurements
for panel, transcript_measurement in query:
sample = transcript_measurement.sample
transcript = transcript_measurement.transcript

# add new panel
if panel not in panels_measurements:
panels_measurements[panel] = OrderedDict()
# add new transcript
if transcript not in panels_measurements[panel]:
panels_measurements[panel][transcript] = {}
# add new sample
if sample not in panels_measurements[panel][transcript]:
panels_measurements[panel][transcript][sample] = transcript_measurement[measurement_type]

# print header
print("panel_version\ttranscript_id\tgene_id\tmeasurement_type\tmean\tmin\tmax")
for panel in panels_measurements:
for transcript in panels_measurements[panel]:
transcript_cov_stats = utils.get_summary_stats_multi_sample(
measurements=panels_measurements[panel], keys=[transcript]
)
# print summary
print(
"{panel_version}\t{transcript}\t{gene}\t{measurement_type}\t{mean}\t{min}\t{max}".format(
panel_version=panel,
transcript=transcript,
gene=transcript.gene_id,
measurement_type=measurement_type,
mean=transcript_cov_stats[transcript]['mean'],
min=transcript_cov_stats[transcript]['min'],
max=transcript_cov_stats[transcript]['max']
)
)
5 changes: 5 additions & 0 deletions ExonCov/forms.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,11 @@ def parse_core_gene_list(gene_list, genes=[]):
class SampleForm(FlaskForm):
"""Query samples by run or samplename field"""
sample = StringField('Sample')
sample_type = SelectField('Type', default=None, choices=[
('', ''),
('WES', 'WES'),
('WGS', 'WGS'),
])
project = StringField('Project')
run = StringField('Sequencing run')

Expand Down
7 changes: 7 additions & 0 deletions ExonCov/templates/macros/forms.html
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,13 @@
</div>
{% endmacro %}

{% macro render_inline_dropdown(field) %}
<div class="form-group">
{{ field.label(class_='control-label') }}
{{ field(class_='form-control', **kwargs) }}
</div>
{% endmacro %}

{% macro render_sample_datatable_form(search=true, filter=true) %}
<form class="form-inline" onsubmit="return false;">
<div class="form-group col-lg-3">
Expand Down
Loading

0 comments on commit d8cf9fb

Please sign in to comment.