
Commit

Merge pull request #191 from elsasserlab/better-report
Improvements on final MultiQC reports
cnluzon authored Nov 4, 2024
2 parents 15b382b + a4e561a commit ef773b1
Showing 5 changed files with 134 additions and 47 deletions.
11 changes: 11 additions & 0 deletions CHANGES.md
@@ -1,5 +1,16 @@
 # minute Changelog
 
+## v0.9.1
+
+### Features
+* Handle barcode dropouts. Zero values now appear in the final report, and
+  they no longer prevent the final stats report from being produced.
+* Carry the barcode sequence through to stats_summary.txt
+
+### Other
+
+* Minor: styling changes to the final minute barcode barplots
+
 ## v0.9.0
 
 ### Features
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -17,7 +17,7 @@ classifiers = [
     "Topic :: Scientific/Engineering :: Bio-Informatics"
 ]
 requires-python = ">=3.7"
-version = "0.9.0"
+version = "0.9.1"
 dependencies = [
     "ruamel.yaml",
     "xopen",
60 changes: 46 additions & 14 deletions src/minute/Snakefile
@@ -20,6 +20,7 @@ from minute import (
     flatten_scaling_groups,
     get_all_pools,
     get_all_replicates,
+    get_maplib_by_name,
     Pool,
     MultiplexedReplicate,
     estimate_library_size,
@@ -57,7 +58,6 @@ direct_libraries = [lib for lib in libraries if not isinstance(lib, MultiplexedR
 scaling_groups = list(read_scaling_groups("groups.tsv", libraries))
 maplibs = list(flatten_scaling_groups(scaling_groups))
 
-
 if not is_snakemake_calling_itself():
     print(format_metadata_overview(references, libraries, maplibs, scaling_groups), file=sys.stderr)
 
@@ -463,15 +463,22 @@ rule insert_size_metrics:
         bam="final/bam/{name}.bam"
     log:
         "log/final/{name}.insertsizes.txt.log"
-    shell:
-        "picard"
-        " CollectInsertSizeMetrics"
-        " I={input.bam}"
-        " O={output.txt}"
-        " HISTOGRAM_FILE={output.pdf}"
-        " MINIMUM_PCT=0.5"
-        " STOP_AFTER=10000000"
-        " 2> {log}"
+    run:
+        picard_cmd = (
+            "picard"
+            " CollectInsertSizeMetrics"
+            " I={input.bam}"
+            " O={output.txt}"
+            " HISTOGRAM_FILE={output.pdf}"
+            " MINIMUM_PCT=0.5"
+            " STOP_AFTER=10000000"
+            " 2> {log}"
+        )
+        shell(picard_cmd)
+        # Picard does not produce any output on empty BAM files
+        if not os.path.isfile(output.txt):
+            open(output.txt, "a").close()
+            open(output.pdf, "a").close()
 
 
 rule unscaled_bigwig:
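
The switch from shell: to run: here exists so Python can patch over a Picard quirk: CollectInsertSizeMetrics exits cleanly but writes no output at all for a BAM with no usable reads (a barcode dropout), which would otherwise make Snakemake fail the job with missing outputs. Outside Snakemake the same pattern looks roughly like this; a sketch with hypothetical paths, using Path.touch() in place of the open/close idiom:

import subprocess
from pathlib import Path

def collect_insert_sizes(bam: str, txt: str, pdf: str, log: str) -> None:
    # Picard exits successfully on an empty BAM but writes no output files.
    with open(log, "w") as logfile:
        subprocess.run(
            ["picard", "CollectInsertSizeMetrics",
             f"I={bam}", f"O={txt}", f"HISTOGRAM_FILE={pdf}",
             "MINIMUM_PCT=0.5", "STOP_AFTER=10000000"],
            stderr=logfile, check=True,
        )
    # Touch empty placeholders so downstream rules still find their inputs.
    for path in (txt, pdf):
        Path(path).touch(exist_ok=True)
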
@@ -582,7 +589,12 @@ rule extract_fragment_size:
         fragsize=temp("{base}.fragsize.txt")
     run:
         with open(output.fragsize, "w") as f:
-            print(int(parse_insert_size_metrics(input.insertsizes)["median_insert_size"]),
+            try:
+                is_metrics = parse_insert_size_metrics(input.insertsizes)
+            except StopIteration:
+                is_metrics = dict()
+
+            print(int(is_metrics.get("median_insert_size", "NA")),
                   file=f)
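
The StopIteration guard pairs with the empty placeholder files created above: a Picard metrics parser that advances an iterator looking for the "## METRICS CLASS" header finds nothing in a zero-byte file. A self-contained sketch of the idea, where parse_picard_metrics is a stand-in for minute's parse_insert_size_metrics; note that int("NA") would itself raise ValueError, so a fully defensive variant casts only real values:

def parse_picard_metrics(path: str) -> dict:
    # Minimal stand-in: find the header/value rows that follow Picard's
    # "## METRICS CLASS" marker. Raises StopIteration on an empty file.
    with open(path) as f:
        lines = iter(f)
        while not next(lines).startswith("## METRICS CLASS"):
            pass
        header = next(lines).rstrip("\n").lower().split("\t")
        values = next(lines).rstrip("\n").split("\t")
    return dict(zip(header, values))

def read_median_insert_size(path: str) -> str:
    try:
        metrics = parse_picard_metrics(path)
    except StopIteration:  # empty placeholder file from the rule above
        metrics = {}
    value = metrics.get("median_insert_size", "NA")
    # Cast only real numbers; int("NA") would raise ValueError.
    return value if value == "NA" else str(int(float(value)))
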


@@ -653,6 +665,11 @@ rule replicate_stats:
             library=wildcards.library,
             reference=wildcards.reference
         )
+        d["barcode"] = "."
+        mlib = get_maplib_by_name(maplibs, wildcards.library)
+        if isinstance(mlib.library, MultiplexedReplicate):
+            d["barcode"] = mlib.library.barcode
+
         for flagstat, name in [
             (input.mapped_flagstat, "raw_mapped"),
             (input.mapq_flagstat, "mapq_mapped"),
@@ -662,7 +679,11 @@
             d[name] = parse_flagstat(flagstat).mapped_reads
 
         d["raw_demultiplexed"] = read_int_from_file(input.demux_stats)
-        d["frac_mapq_filtered"] = (d["raw_mapped"] - d["mapq_mapped"]) / d["raw_mapped"]
+
+        try:
+            d["frac_mapq_filtered"] = (d["raw_mapped"] - d["mapq_mapped"]) / d["raw_mapped"]
+        except ZeroDivisionError:
+            d["frac_mapq_filtered"] = "NA"
 
         # Note that here we use unpaired reads as total/unique single-ended fragments, because we deduplicate
         # based on the 1st mate. However, if we used proper pairs the result would be very similar,
@@ -671,8 +692,13 @@
             parse_duplication_metrics(input.metrics)["unpaired_reads_examined"],
             parse_duplication_metrics(input.metrics)["unpaired_read_duplicates"])
 
+        try:
+            is_metrics = parse_insert_size_metrics(input.insertsizes)
+        except StopIteration:
+            is_metrics = dict()
+
         d["percent_duplication"] = parse_duplication_metrics(input.metrics)["percent_duplication"]
-        d["insert_size"] = parse_insert_size_metrics(input.insertsizes)["median_insert_size"]
+        d["insert_size"] = is_metrics.get("median_insert_size", "NA")
         with open(output.txt, "w") as f:
             print(*d.keys(), sep="\t", file=f)
             print(*d.values(), sep="\t", file=f)
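
All of these guards funnel failure into the literal string "NA", which then flows into the tab-separated stats instead of crashing the rule. The pattern, reduced to a hypothetical helper (safe_fraction is not part of the codebase):

from typing import Union

Stat = Union[float, str]  # a real number, or "NA" when it cannot be computed

def safe_fraction(numerator: int, denominator: int) -> Stat:
    # A barcode dropout yields zero mapped reads; report "NA" instead of crashing.
    try:
        return numerator / denominator
    except ZeroDivisionError:
        return "NA"

# Fraction of reads removed by the MAPQ filter, as in the rule above:
# safe_fraction(raw_mapped - mapq_mapped, raw_mapped)
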
@@ -693,7 +719,12 @@ rule pooled_stats:
         for key in ["raw_demultiplexed", "raw_mapped", "mapq_mapped", "dedup_mapped", "library_size", "percent_duplication", "frac_mapq_filtered"]:
             d[key] = "."
         d["final_mapped"] = parse_flagstat(input.final_flagstat).mapped_reads
-        d["insert_size"] = parse_insert_size_metrics(input.insertsizes)["median_insert_size"]
+
+        try:
+            is_metrics = parse_insert_size_metrics(input.insertsizes)
+        except StopIteration:
+            is_metrics = dict()
+        d["insert_size"] = is_metrics.get("median_insert_size", "NA")
         with open(output.txt, "w") as f:
             print(*d.keys(), sep="\t", file=f)
             print(*d.values(), sep="\t", file=f)
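
Each stats rule writes a two-line TSV, one header row and one value row, which the summary rule and the final MultiQC report pick up later. Reading such a file back is a one-liner with csv.DictReader; a sketch with a made-up path:

import csv

def read_stats_row(path: str) -> dict:
    # The stats files written above are two-line TSVs: header row, value row.
    with open(path, newline="") as f:
        return next(csv.DictReader(f, delimiter="\t"))

# row = read_stats_row("stats/pooled.txt")   # hypothetical path
# row["insert_size"] is a number as text, or "NA" after a dropout
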
@@ -708,6 +739,7 @@ rule replicate_stats_summary:
         header = [
             "map_id",
             "library",
+            "barcode",
             "reference",
             "raw_demultiplexed",
             "raw_mapped",
45 changes: 30 additions & 15 deletions src/minute/__init__.py
@@ -174,6 +174,10 @@ def get_all_pools(maplibs: Iterable[LibraryWithReference]) -> List[LibraryWithReference]:
 def get_all_replicates(maplibs: Iterable[LibraryWithReference]) -> List[LibraryWithReference]:
     return [m for m in maplibs if not isinstance(m.library, Pool)]
 
+def get_maplib_by_name(maplibs: Iterable[LibraryWithReference], name: str) -> LibraryWithReference:
+    for m in maplibs:
+        if m.library.name == name:
+            return m
 
 def make_references(config) -> Dict[str, Reference]:
     references = dict()
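
get_maplib_by_name does a linear scan and falls off the end, implicitly returning None, when no library matches; the Snakefile only calls it with wildcard names that are known to exist. A hedged usage sketch with simplified stand-in types:

from dataclasses import dataclass
from typing import Iterable, Optional

@dataclass
class Library:                  # stand-in for minute's library classes
    name: str
    barcode: str = "."

@dataclass
class LibraryWithReference:     # stand-in pairing a library with a reference
    library: Library

def get_maplib_by_name(maplibs: Iterable[LibraryWithReference], name: str) -> Optional[LibraryWithReference]:
    # Linear scan; returns None (implicitly) when the name is unknown.
    for m in maplibs:
        if m.library.name == name:
            return m

maplibs = [LibraryWithReference(Library("h3k4me3_rep1", "ACGTACGT"))]
mlib = get_maplib_by_name(maplibs, "h3k4me3_rep1")
barcode = mlib.library.barcode if mlib is not None else "."
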
@@ -278,6 +282,11 @@ def float_or_int(s):
             values.append('NA')
 
     result = {key.lower(): float_or_int(value) for key, value in zip(header, values)}
+
+    # Je reports a missing value as "?"
+    if result.get("percent_duplication") == "?":
+        result["percent_duplication"] = "NA"
+
     return result
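
float_or_int presumably coerces numeric strings and returns anything else unchanged, so Je's "?" placeholder would survive coercion and leak into the stats untouched; hence the explicit rewrite to "NA". A self-contained illustration, with float_or_int reimplemented here for the purpose:

def float_or_int(s: str):
    # Coerce numeric strings; leave non-numeric markers such as "?" untouched.
    for cast in (int, float):
        try:
            return cast(s)
        except ValueError:
            pass
    return s

row = dict(zip(["READ_PAIRS_EXAMINED", "PERCENT_DUPLICATION"], ["0", "?"]))
result = {k.lower(): float_or_int(v) for k, v in row.items()}
if result.get("percent_duplication") == "?":   # Je's missing-value marker
    result["percent_duplication"] = "NA"
# result == {"read_pairs_examined": 0, "percent_duplication": "NA"}
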


@@ -287,23 +296,29 @@ def compute_scaling(scaling_group, treatments, controls, infofile, genome_sizes,
     for pair, treatment_path, control_path, genome_size in zip(scaling_group.normalization_pairs, treatments, controls, genome_sizes):
         treatment_reads = parse_flagstat(treatment_path).mapped_reads
         control_reads = parse_flagstat(control_path).mapped_reads
-        if first:
-            scaling_factor = (
-                genome_size / fragment_size / treatment_reads * control_reads
-            )
-            treatment_reads_ref = treatment_reads
-            control_reads_ref = control_reads
-            first = False
-
-        sample_scaling_factor = scaling_factor / control_reads
-        scaled_treatment_reads = sample_scaling_factor * treatment_reads
-
-        # TODO factor this out
-        print(pair.treatment.name, treatment_reads, scaled_treatment_reads, pair.control.name, control_reads, sample_scaling_factor, scaling_group.name, sep="\t", file=infofile)
-
-        # TODO scaled.idxstats.txt file
-
-        yield sample_scaling_factor
+        try:
+            if first:
+                scaling_factor = (
+                    genome_size / fragment_size / treatment_reads * control_reads
+                )
+                treatment_reads_ref = treatment_reads
+                control_reads_ref = control_reads
+                first = False
+
+            sample_scaling_factor = scaling_factor / control_reads
+            scaled_treatment_reads = sample_scaling_factor * treatment_reads
+        except ZeroDivisionError:
+            sample_scaling_factor = "NA"
+            print(pair.treatment.name, treatment_reads, scaled_treatment_reads, pair.control.name, control_reads, sample_scaling_factor, scaling_group.name, sep="\t", file=infofile)
+            yield sample_scaling_factor
+        else:
+            # TODO factor this out
+            print(pair.treatment.name, treatment_reads, scaled_treatment_reads, pair.control.name, control_reads, sample_scaling_factor, scaling_group.name, sep="\t", file=infofile)
+
+            # TODO scaled.idxstats.txt file
+
+            yield sample_scaling_factor
 
 
 def parse_stats_fields(stats_file):
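
The scaling arithmetic is easier to see with concrete numbers: the first normalization pair in each group fixes scaling_factor, and every pair is then divided by its own control depth. A worked sketch with invented read counts:

genome_size = 2_700_000_000   # hypothetical mappable genome size
fragment_size = 200           # median insert size from the rule above
treatment_reads = 10_000_000  # first (reference) treatment library
control_reads = 8_000_000     # matching input/control library

# Fixed once per scaling group, from the first normalization pair:
scaling_factor = genome_size / fragment_size / treatment_reads * control_reads
# = 2.7e9 / 200 / 1e7 * 8e6 = 10_800_000.0

# Each pair then divides by its own control depth:
sample_scaling_factor = scaling_factor / control_reads            # 1.35
scaled_treatment_reads = sample_scaling_factor * treatment_reads  # 13_500_000.0

# With a barcode dropout (control_reads == 0) the division raises
# ZeroDivisionError, which the new code reports as "NA" instead of crashing.
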
@@ -454,7 +469,7 @@ def estimate_library_size(total_reads, duplicate_reads):
     """
     unique_reads = total_reads - duplicate_reads
     if total_reads == 0 or duplicate_reads == 0:
-        return None
+        return "NA"
 
     m = 1.0
     M = 100.0
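
The truncated remainder of this function is, judging by the m/M bounds, a port of Picard's EstimateLibraryComplexity: bisection on the Lander-Waterman equation C/X = 1 - exp(-N/X). Returning the string "NA" rather than None keeps the value printable in the tab-separated stats. A sketch of the whole algorithm under that assumption, not minute's exact code:

import math

def estimate_library_size(total_reads: int, duplicate_reads: int):
    # Bisection on the Lander-Waterman equation, as in Picard's
    # EstimateLibraryComplexity.
    unique_reads = total_reads - duplicate_reads
    if total_reads == 0 or duplicate_reads == 0:
        return "NA"

    def f(x: float) -> float:
        # Root of unique/x - 1 + exp(-total/x) gives the library size.
        return unique_reads / x - 1 + math.exp(-total_reads / x)

    m, M = 1.0, 100.0
    while f(M * unique_reads) >= 0:  # widen bracket until it straddles the root
        M *= 10.0
    for _ in range(40):              # 40 halvings give ample precision
        mid = (m + M) / 2.0
        if f(mid * unique_reads) > 0:
            m = mid
        else:
            M = mid
    return int(unique_reads * (m + M) / 2.0)
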
63 changes: 46 additions & 17 deletions src/minute/summary_plots.R
@@ -15,11 +15,12 @@ minute_scaled_grouped_barplot <- function(scaling_file) {
   scaling <- calculate_ratios_and_groups(scaling)
 
   ggplot(data = scaling) +
-    aes(x = rep_grp, y = msr, color = scaling_group, fill = scaling_group) +
+    aes(x = replace_delims_with_spaces(rep_grp), y = msr, color = scaling_group, fill = scaling_group) +
     geom_point(data = scaling[scaling$is_pool == FALSE, ]) +
     geom_bar(data = scaling[scaling$is_pool == TRUE, ], stat = "identity", alpha = 0.5) +
     style_minute_barplot() +
     theme(legend.position = "none") +
+    scale_x_discrete(labels = scales::label_wrap(20)) +
     labs(subtitle = "Points - Replicates; Bars - Pooled")
 }
 
@@ -35,18 +36,19 @@ minute_scaled_replicates_barplot <- function(scaling_file) {
   scaling <- read.table(scaling_file, sep="\t", header = T, comment.char = "")
   scaling <- calculate_ratios_and_groups(scaling)
   ggplot(data = scaling) +
-    aes(x = rep_grp, y = msr, fill = rep_number) +
+    aes(x = replace_delims_with_spaces(rep_grp), y = msr, fill = as.factor(rep_number)) +
     geom_bar(
       data = scaling[scaling$is_pool == FALSE, ],
       stat = "identity",
       alpha = 0.9,
       position = position_dodge2(preserve = "single"),
-      color = "#555555",
-      linewidth = 0.2
+      color = "#555555"
     ) +
     style_minute_barplot() +
     theme(legend.position = "bottom") +
-    scale_fill_distiller()
+    scale_x_discrete(labels = scales::label_wrap(20)) +
+    labs(fill = "Replicate", x = "") +
+    scale_fill_brewer(palette = "Blues")
 }
 

@@ -116,13 +118,17 @@ calculate_ratios_and_groups <- function(scaling) {
 #'
 #' @return A list of ggproto objects
 style_minute_barplot <- function() {
-  list(theme_classic(base_size = 10),
+  list(theme_classic(base_size = 8),
        facet_wrap(~scaling_group, scales = "free_x", ncol = 2),
        geom_hline(yintercept = 1, linetype = "dotted", alpha = 0.4),
-       theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)),
+       theme(
+         axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
+         strip.background = element_blank()
+       ),
        labs(title = "MINUTE-ChIP scaled global read levels",
-            x = "Sample",
-            y = "Minute-ChIP Scaled Fraction"))
+            x = "",
+            y = "Minute-ChIP Scaled Fraction")
+  )
 }


@@ -132,17 +138,36 @@
 style_barcode_representation <- function() {
   list(
     coord_flip(),
-    theme_minimal(base_size = 10),
+    theme_minimal(base_size = 8),
     theme(panel.grid.major.y = element_blank(),
           panel.grid.minor.y = element_blank(),
           axis.ticks.x = element_line(),
-          axis.ticks.length.x = grid::unit(2, "mm"),
+          axis.ticks.length.x = grid::unit(1, "mm"),
           legend.position = "bottom"),
     labs(title = "Barcode representation",
-         x = "")
+         x = "",
+         colour = "Condition",
+         fill = "Condition")
   )
 }
 
+#' Replace dashes, underscores and points with spaces for labelling
+#' @param s String to clean up
+#' @return A string
+replace_delims_with_spaces <- function(s) {
+  gsub("_|-|\\.", " ", s)
+}
+
+#' Split a string into shorter strings delimited by \n
+#'
+#' @param s String to split
+#' @param width Max width
+#' @return A string
+wrap_label <- function(s, width) {
+  s <- sapply(s, function(x) {paste0(strwrap(x, width = width), collapse = "\n")})
+  names(s) <- NULL
+  s
+}
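
Both helpers exist to make long sample names legible in plot labels: delimiters become spaces so the wrapper has break points, then wrap_label inserts newlines at a maximum width. For readers on the Python side of the pipeline, the same transformation looks like this (illustrative only; minute does this in R):

import re
import textwrap

def replace_delims_with_spaces(s: str) -> str:
    # Underscores, dashes and dots become spaces so wrapping has break points.
    return re.sub(r"[_\-.]", " ", s)

def wrap_label(s: str, width: int) -> str:
    # Equivalent of R's strwrap() followed by paste0(collapse = "\n").
    return "\n".join(textwrap.wrap(replace_delims_with_spaces(s), width=width))

# wrap_label("h3k4me3_mESC.untreated_rep1", 20)
# -> "h3k4me3 mESC\nuntreated rep1"
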

#' Main part of the stacked barcode representation plot.
#' Overlays the pool values with the replicate groups to have separate lines
@@ -156,13 +181,16 @@ stacked_replicate_groups_plot <- function(df_combined, value_column) {
   # using .groups is experimental, so it is probably better to silence
   options(dplyr.summarise.inform = FALSE)
 
+  df_combined <- df_combined %>%
+    mutate(condition = replace_delims_with_spaces(condition))
+
   df_blocks <- df_combined %>%
     group_by(scaling_group, rep_grp, condition) %>%
     summarise(total = sum(.data[[value_column]]))
 
   ggplot(
     df_combined,
-    aes(x=scaling_group, y=!!sym(value_column), fill=condition, color=condition)
+    aes(x=scaling_group, y=!!sym(value_column), fill=wrap_label(condition, 20), color=wrap_label(condition, 20))
   ) +
     geom_bar(stat="identity", color="white", linewidth=0.2, alpha = 0.8) +
     geom_bar(
@@ -171,11 +199,11 @@
       stat="identity",
       color="white",
       alpha = 0.5,
-      linewidth = 1.2
+      linewidth = 1,
+      linejoin = "mitre"
     )
 }
 
-
#' Calculate number of groups in a scalinginfo.txt file
#'
#' @param scaling_file Path to scalinginfo.txt file
Expand All @@ -187,10 +215,10 @@ get_scaling_groups_number <- function(scaling_file) {
scalinginfo <- snakemake@input[[1]]
ngroups <- get_scaling_groups_number(scalinginfo)

single_width <- 6
single_width <- 5

# Account for longer names
single_height <- 10
single_height <- 9

panel_width <- single_width * 2
panel_height <- ceiling(ngroups / 2) * single_height
@@ -252,3 +280,4 @@ ggsave(snakemake@output[[8]],
        height = 7,
        dpi = 300,
        units = "cm")
+