Improvements on final MultiQC reports #191

Merged · 9 commits · Nov 4, 2024

Changes from all commits
11 changes: 11 additions & 0 deletions CHANGES.md
@@ -1,5 +1,16 @@
# minute Changelog

## v0.9.1

### Features
* Handle barcode dropouts. Zero values appear in the final report, and
  they do not prevent the final stats report from being produced.
* Carry the barcode sequence through to stats_summary.txt

### Other

* Minor: Styling changes made to the final Minute barcode barplots

## v0.9.0

### Features
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -17,7 +17,7 @@ classifiers = [
"Topic :: Scientific/Engineering :: Bio-Informatics"
]
requires-python = ">=3.7"
version = "0.9.0"
version = "0.9.1"
dependencies = [
"ruamel.yaml",
"xopen",
60 changes: 46 additions & 14 deletions src/minute/Snakefile
@@ -20,6 +20,7 @@ from minute (
flatten_scaling_groups,
get_all_pools,
get_all_replicates,
get_maplib_by_name,
Pool,
MultiplexedReplicate,
estimate_library_size,
@@ -57,7 +58,6 @@ direct_libraries = [lib for lib in libraries if not isinstance(lib, MultiplexedR
scaling_groups = list(read_scaling_groups("groups.tsv", libraries))
maplibs = list(flatten_scaling_groups(scaling_groups))


if not is_snakemake_calling_itself():
print(format_metadata_overview(references, libraries, maplibs, scaling_groups), file=sys.stderr)

@@ -463,15 +463,22 @@ rule insert_size_metrics:
bam="final/bam/{name}.bam"
log:
"log/final/{name}.insertsizes.txt.log"
shell:
"picard"
" CollectInsertSizeMetrics"
" I={input.bam}"
" O={output.txt}"
" HISTOGRAM_FILE={output.pdf}"
" MINIMUM_PCT=0.5"
" STOP_AFTER=10000000"
" 2> {log}"
run:
picard_cmd = (
"picard"
" CollectInsertSizeMetrics"
" I={input.bam}"
" O={output.txt}"
" HISTOGRAM_FILE={output.pdf}"
" MINIMUM_PCT=0.5"
" STOP_AFTER=10000000"
" 2> {log}"
)
shell(picard_cmd)
# Picard produces no output for empty BAM files; create empty
# placeholder outputs so that downstream rules still find their inputs
if not os.path.isfile(output.txt):
open(output.txt, "a").close()
open(output.pdf, "a").close()

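The empty placeholders created above are what later trigger the StopIteration branches added in this PR: the insert-size parser reads up to Picard's metrics header and raises StopIteration when the file has no lines left. A minimal sketch of that interaction, assuming a simplified parser (`parse_insert_size_metrics_sketch` and the file name are illustrative, not the project's actual code):

```python
# Illustrative sketch only: a Picard-style metrics parser that raises
# StopIteration on an empty (touched) placeholder file.
def parse_insert_size_metrics_sketch(path):
    with open(path) as f:
        lines = (line.rstrip("\n") for line in f)
        line = next(lines)  # raises StopIteration if the file is empty
        while not line.startswith("## METRICS CLASS"):
            line = next(lines)
        header = next(lines).split("\t")
        values = next(lines).split("\t")
        return {h.lower(): v for h, v in zip(header, values)}

try:
    metrics = parse_insert_size_metrics_sketch("empty.insertsizes.txt")
except StopIteration:
    metrics = {}  # barcode dropout: downstream fields become "NA"
```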

rule unscaled_bigwig:
@@ -582,7 +589,12 @@ rule extract_fragment_size:
fragsize=temp("{base}.fragsize.txt")
run:
with open(output.fragsize, "w") as f:
print(int(parse_insert_size_metrics(input.insertsizes)["median_insert_size"]),
try:
is_metrics = parse_insert_size_metrics(input.insertsizes)
except StopIteration:
is_metrics = dict()

            insert_size = is_metrics.get("median_insert_size", "NA")
            print(insert_size if insert_size == "NA" else int(insert_size),
                file=f)


@@ -653,6 +665,11 @@ rule replicate_stats:
library=wildcards.library,
reference=wildcards.reference
)
d["barcode"] = "."
mlib = get_maplib_by_name(maplibs, wildcards.library)
if isinstance(mlib.library, MultiplexedReplicate):
d["barcode"] = mlib.library.barcode

for flagstat, name in [
(input.mapped_flagstat, "raw_mapped"),
(input.mapq_flagstat, "mapq_mapped"),
@@ -662,7 +679,11 @@
d[name] = parse_flagstat(flagstat).mapped_reads

d["raw_demultiplexed"] = read_int_from_file(input.demux_stats)
d["frac_mapq_filtered"] = (d["raw_mapped"] - d["mapq_mapped"]) / d["raw_mapped"]

try:
d["frac_mapq_filtered"] = (d["raw_mapped"] - d["mapq_mapped"]) / d["raw_mapped"]
except ZeroDivisionError:
d["frac_mapq_filtered"] = "NA"

# Note that we use single-end fragment counts as total/unique here, because we deduplicate
# based on the 1st mate. If we used proper pairs instead, the result would be very similar,
@@ -671,8 +692,13 @@
parse_duplication_metrics(input.metrics)["unpaired_reads_examined"],
parse_duplication_metrics(input.metrics)["unpaired_read_duplicates"])

try:
is_metrics = parse_insert_size_metrics(input.insertsizes)
except StopIteration:
is_metrics = dict()

d["percent_duplication"] = parse_duplication_metrics(input.metrics)["percent_duplication"]
d["insert_size"] = parse_insert_size_metrics(input.insertsizes)["median_insert_size"]
d["insert_size"] = is_metrics.get("median_insert_size", "NA")
with open(output.txt, "w") as f:
print(*d.keys(), sep="\t", file=f)
print(*d.values(), sep="\t", file=f)
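The "NA" sentinel pattern above recurs across these rules: any ratio whose denominator can be zero after a barcode dropout is written out as the string "NA" instead of aborting the workflow. A hypothetical helper in the same spirit (`safe_frac` is not part of the PR, just a sketch of the pattern):

```python
# Hypothetical helper mirroring the inline try/except above: return a
# fraction, or the "NA" sentinel when the denominator is zero.
def safe_frac(numerator, denominator):
    try:
        return numerator / denominator
    except ZeroDivisionError:
        return "NA"

# frac_mapq_filtered for a dropped-out barcode (zero raw mapped reads):
assert safe_frac(0 - 0, 0) == "NA"
# and for a normal library:
assert safe_frac(100 - 80, 100) == 0.2
```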
@@ -693,7 +719,12 @@ rule pooled_stats:
for key in ["raw_demultiplexed", "raw_mapped", "mapq_mapped", "dedup_mapped", "library_size", "percent_duplication", "frac_mapq_filtered"]:
d[key] = "."
d["final_mapped"] = parse_flagstat(input.final_flagstat).mapped_reads
d["insert_size"] = parse_insert_size_metrics(input.insertsizes)["median_insert_size"]

try:
is_metrics = parse_insert_size_metrics(input.insertsizes)
except StopIteration:
is_metrics = dict()
d["insert_size"] = is_metrics.get("median_insert_size", "NA")
with open(output.txt, "w") as f:
print(*d.keys(), sep="\t", file=f)
print(*d.values(), sep="\t", file=f)
@@ -708,6 +739,7 @@ rule replicate_stats_summary:
header = [
"map_id",
"library",
"barcode",
"reference",
"raw_demultiplexed",
"raw_mapped",
45 changes: 30 additions & 15 deletions src/minute/__init__.py
@@ -174,6 +174,10 @@ def get_all_pools(maplibs: Iterable[LibraryWithReference]) -> List[LibraryWithRe
def get_all_replicates(maplibs: Iterable[LibraryWithReference]) -> List[LibraryWithReference]:
return [m for m in maplibs if not isinstance(m.library, Pool)]

def get_maplib_by_name(maplibs: Iterable[LibraryWithReference], name: str) -> LibraryWithReference:
for m in maplibs:
if m.library.name == name:
return m

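As used in the Snakefile hunk above, the lookup is paired with an isinstance check so that non-multiplexed libraries keep the "." placeholder; because the loop simply falls through, a name with no match yields None. A hedged usage sketch (the library name is invented):

```python
# Hypothetical usage mirroring the replicate_stats rule: look up a maplib
# by wildcard name and record its barcode only for multiplexed replicates.
mlib = get_maplib_by_name(maplibs, "h3k4me3_rep1")
barcode = "."
if mlib is not None and isinstance(mlib.library, MultiplexedReplicate):
    barcode = mlib.library.barcode
```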
def make_references(config) -> Dict[str, Reference]:
references = dict()
@@ -278,6 +282,11 @@ def float_or_int(s):
values.append('NA')

result = {key.lower(): float_or_int(value) for key, value in zip(header, values)}

# Je outputs missing values as "?"
if result.get("percent_duplication") == "?":
result["percent_duplication"] = "NA"

return result

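This guards the duplication metrics specifically: a dropped-out barcode makes Je write "?" where a number is expected, which would otherwise leak into the stats tables. An illustrative check with a made-up metrics row:

```python
# Made-up metrics row: Je emits "?" for PERCENT_DUPLICATION when it
# examined no reads; the parser normalizes it to the "NA" sentinel.
result = {"percent_duplication": "?", "unpaired_reads_examined": 0}
if result.get("percent_duplication") == "?":
    result["percent_duplication"] = "NA"
assert result["percent_duplication"] == "NA"
```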

@@ -287,23 +296,29 @@ def compute_scaling(scaling_group, treatments, controls, infofile, genome_sizes,
for pair, treatment_path, control_path, genome_size in zip(scaling_group.normalization_pairs, treatments, controls, genome_sizes):
treatment_reads = parse_flagstat(treatment_path).mapped_reads
control_reads = parse_flagstat(control_path).mapped_reads
if first:
scaling_factor = (
genome_size / fragment_size / treatment_reads * control_reads
)
treatment_reads_ref = treatment_reads
control_reads_ref = control_reads
first = False

sample_scaling_factor = scaling_factor / control_reads
scaled_treatment_reads = sample_scaling_factor * treatment_reads

# TODO factor this out
print(pair.treatment.name, treatment_reads, scaled_treatment_reads, pair.control.name, control_reads, sample_scaling_factor, scaling_group.name, sep="\t", file=infofile)
try:
if first:
scaling_factor = (
genome_size / fragment_size / treatment_reads * control_reads
)
treatment_reads_ref = treatment_reads
control_reads_ref = control_reads
first = False

sample_scaling_factor = scaling_factor / control_reads
scaled_treatment_reads = sample_scaling_factor * treatment_reads
        except ZeroDivisionError:
            sample_scaling_factor = "NA"
            scaled_treatment_reads = "NA"
            print(pair.treatment.name, treatment_reads, scaled_treatment_reads, pair.control.name, control_reads, sample_scaling_factor, scaling_group.name, sep="\t", file=infofile)
yield sample_scaling_factor
else:
# TODO factor this out
print(pair.treatment.name, treatment_reads, scaled_treatment_reads, pair.control.name, control_reads, sample_scaling_factor, scaling_group.name, sep="\t", file=infofile)

# TODO scaled.idxstats.txt file
# TODO scaled.idxstats.txt file

yield sample_scaling_factor
yield sample_scaling_factor

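To make the scaling arithmetic concrete, here is a toy calculation with invented read counts; the first normalization pair fixes the group's scaling_factor, and each pair is then scaled by its own control depth:

```python
# All numbers are illustrative, not from a real run.
genome_size = 1_000_000
fragment_size = 200
treatment_reads = 50_000   # first (reference) pair
control_reads = 40_000

scaling_factor = genome_size / fragment_size / treatment_reads * control_reads
# 1_000_000 / 200 / 50_000 * 40_000 == 4000.0

sample_scaling_factor = scaling_factor / control_reads            # 0.1
scaled_treatment_reads = sample_scaling_factor * treatment_reads  # 5000.0
```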

def parse_stats_fields(stats_file):
Expand Down Expand Up @@ -454,7 +469,7 @@ def estimate_library_size(total_reads, duplicate_reads):
"""
unique_reads = total_reads - duplicate_reads
if total_reads == 0 or duplicate_reads == 0:
return None
return "NA"

m = 1.0
M = 100.0
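With this change the sentinel is consistent across the stats pipeline, so callers can print the estimate directly instead of special-casing None. A hedged illustration (values invented):

```python
# Both dropout-style inputs now yield the printable "NA" sentinel,
# matching the other stats fields in stats_summary.txt.
print(estimate_library_size(0, 0))      # NA: no reads at all
print(estimate_library_size(1000, 0))   # NA: no duplicates observed
```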
63 changes: 46 additions & 17 deletions src/minute/summary_plots.R
@@ -15,11 +15,12 @@ minute_scaled_grouped_barplot <- function(scaling_file) {
scaling <- calculate_ratios_and_groups(scaling)

ggplot(data = scaling) +
aes(x = rep_grp, y = msr, color = scaling_group, fill = scaling_group) +
aes(x = replace_delims_with_spaces(rep_grp), y = msr, color = scaling_group, fill = scaling_group) +
geom_point(data = scaling[scaling$is_pool == FALSE, ]) +
geom_bar(data = scaling[scaling$is_pool == TRUE, ], stat = "identity", alpha = 0.5) +
style_minute_barplot() +
theme(legend.position = "none") +
scale_x_discrete(labels = scales::label_wrap(20)) +
labs(subtitle = "Points - Replicates; Bars - Pooled")
}

@@ -35,18 +36,19 @@ minute_scaled_replicates_barplot <- function(scaling_file) {
scaling <- read.table(scaling_file, sep="\t", header = T, comment.char = "")
scaling <- calculate_ratios_and_groups(scaling)
ggplot(data = scaling) +
aes(x = rep_grp, y = msr, fill = rep_number) +
aes(x = replace_delims_with_spaces(rep_grp), y = msr, fill = as.factor(rep_number)) +
geom_bar(
data = scaling[scaling$is_pool == FALSE, ],
stat = "identity",
alpha = 0.9,
position = position_dodge2(preserve = "single"),
color = "#555555",
linewidth = 0.2
color = "#555555"
) +
style_minute_barplot() +
theme(legend.position = "bottom") +
scale_fill_distiller()
scale_x_discrete(labels = scales::label_wrap(20)) +
labs(fill = "Replicate", x = "") +
scale_fill_brewer(palette = "Blues")
}


@@ -116,13 +118,17 @@ calculate_ratios_and_groups <- function(scaling) {
#'
#' @return A list of ggproto objects
style_minute_barplot <- function() {
list(theme_classic(base_size = 10),
list(theme_classic(base_size = 8),
facet_wrap(~scaling_group, scales = "free_x", ncol = 2),
geom_hline(yintercept = 1, linetype = "dotted", alpha = 0.4),
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)),
theme(
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
strip.background = element_blank()
),
labs(title = "MINUTE-ChIP scaled global read levels",
x = "Sample",
y = "Minute-ChIP Scaled Fraction"))
x = "",
y = "Minute-ChIP Scaled Fraction")
)
}


Expand All @@ -132,17 +138,36 @@ style_minute_barplot <- function() {
style_barcode_representation <- function() {
list(
coord_flip(),
theme_minimal(base_size = 10),
theme_minimal(base_size = 8),
theme(panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
axis.ticks.x = element_line(),
axis.ticks.length.x = grid::unit(2, "mm"),
axis.ticks.length.x = grid::unit(1, "mm"),
legend.position = "bottom"),
labs(title = "Barcode representation",
x = "")
x = "",
colour = "Condition",
fill = "Condition")
)
}

#' Replace dashes, underscores and periods with spaces for labelling
#' @param s String to clean up
#' @return A string
replace_delims_with_spaces <- function(s) {
gsub("_|-|\\.", " ", s)
}

#' Split a string into shorter strings delimited by \n
#'
#' @param s String to split
#' @param width Max width
#' @return A string
wrap_label <- function(s, width) {
s <- sapply(s, function(x) {paste0(strwrap(x, width = width), collapse = "\n")})
names(s) <- NULL
s
}

#' Main part of the stacked barcode representation plot.
#' Overlays the pool values with the replicate groups to have separate lines
@@ -156,13 +181,16 @@ stacked_replicate_groups_plot <- function(df_combined, value_column) {
# The .groups argument of summarise() is experimental, so silence
# the resulting message instead
options(dplyr.summarise.inform = FALSE)

df_combined <- df_combined %>%
mutate(condition = replace_delims_with_spaces(condition))

df_blocks <- df_combined %>%
group_by(scaling_group, rep_grp, condition) %>%
summarise(total = sum(.data[[value_column]]))

ggplot(
df_combined,
aes(x=scaling_group, y=!!sym(value_column), fill=condition, color=condition)
aes(x=scaling_group, y=!!sym(value_column), fill=wrap_label(condition, 20), color=wrap_label(condition, 20))
) +
geom_bar(stat="identity", color="white", linewidth=0.2, alpha = 0.8) +
geom_bar(
@@ -171,11 +199,11 @@
stat="identity",
color="white",
alpha = 0.5,
linewidth = 1.2
linewidth = 1,
linejoin = "mitre"
)
}


#' Calculate number of groups in a scalinginfo.txt file
#'
#' @param scaling_file Path to scalinginfo.txt file
@@ -187,10 +215,10 @@ get_scaling_groups_number <- function(scaling_file) {
scalinginfo <- snakemake@input[[1]]
ngroups <- get_scaling_groups_number(scalinginfo)

single_width <- 6
single_width <- 5

# Account for longer names
single_height <- 10
single_height <- 9

panel_width <- single_width * 2
panel_height <- ceiling(ngroups / 2) * single_height
@@ -252,3 +280,4 @@ ggsave(snakemake@output[[8]],
height = 7,
dpi = 300,
units = "cm")