
Commit

Merge pull request #191 from elsasserlab/better-report
Improvements on final MultiQC reports
cnluzon authored Nov 4, 2024
2 parents 15b382b + a4e561a commit ef773b1
Showing 5 changed files with 134 additions and 47 deletions.
11 changes: 11 additions & 0 deletions CHANGES.md
@@ -1,5 +1,16 @@
 # minute Changelog
 
+## v0.9.1
+
+### Features
+* Handle barcode dropouts. Zero values now appear in the final report, and
+  they no longer prevent the final stats report from being produced.
+* Carry the barcode sequence through to stats_summary.txt
+
+### Other
+
+* Minor: styling changes to the final minute barcode barplots
+
 ## v0.9.0
 
 ### Features
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -17,7 +17,7 @@ classifiers = [
     "Topic :: Scientific/Engineering :: Bio-Informatics"
 ]
 requires-python = ">=3.7"
-version = "0.9.0"
+version = "0.9.1"
 dependencies = [
     "ruamel.yaml",
     "xopen",
60 changes: 46 additions & 14 deletions src/minute/Snakefile
@@ -20,6 +20,7 @@ from minute import (
     flatten_scaling_groups,
     get_all_pools,
     get_all_replicates,
+    get_maplib_by_name,
     Pool,
     MultiplexedReplicate,
     estimate_library_size,
@@ -57,7 +58,6 @@ direct_libraries = [lib for lib in libraries if not isinstance(lib, MultiplexedR
 scaling_groups = list(read_scaling_groups("groups.tsv", libraries))
 maplibs = list(flatten_scaling_groups(scaling_groups))
 
-
 if not is_snakemake_calling_itself():
     print(format_metadata_overview(references, libraries, maplibs, scaling_groups), file=sys.stderr)
 
@@ -463,15 +463,22 @@ rule insert_size_metrics:
         bam="final/bam/{name}.bam"
     log:
         "log/final/{name}.insertsizes.txt.log"
-    shell:
-        "picard"
-        " CollectInsertSizeMetrics"
-        " I={input.bam}"
-        " O={output.txt}"
-        " HISTOGRAM_FILE={output.pdf}"
-        " MINIMUM_PCT=0.5"
-        " STOP_AFTER=10000000"
-        " 2> {log}"
+    run:
+        picard_cmd = (
+            "picard"
+            " CollectInsertSizeMetrics"
+            " I={input.bam}"
+            " O={output.txt}"
+            " HISTOGRAM_FILE={output.pdf}"
+            " MINIMUM_PCT=0.5"
+            " STOP_AFTER=10000000"
+            " 2> {log}"
+        )
+        shell(picard_cmd)
+        # Picard does not produce any output on empty BAM files
+        if not os.path.isfile(output.txt):
+            open(output.txt, "a").close()
+            open(output.pdf, "a").close()
 
 
 rule unscaled_bigwig:
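
The switch from shell: to run: here exists so Python can patch over a Picard quirk: CollectInsertSizeMetrics exits cleanly but writes no output at all for a BAM with no usable reads (a barcode dropout), which would otherwise make Snakemake fail the job with missing outputs. Outside Snakemake the same pattern looks roughly like this; a sketch with hypothetical paths, using Path.touch() in place of the open/close idiom:

import subprocess
from pathlib import Path

def collect_insert_sizes(bam: str, txt: str, pdf: str, log: str) -> None:
    # Picard exits successfully on an empty BAM but writes no output files.
    with open(log, "w") as logfile:
        subprocess.run(
            ["picard", "CollectInsertSizeMetrics",
             f"I={bam}", f"O={txt}", f"HISTOGRAM_FILE={pdf}",
             "MINIMUM_PCT=0.5", "STOP_AFTER=10000000"],
            stderr=logfile, check=True,
        )
    # Touch empty placeholders so downstream rules still find their inputs.
    for path in (txt, pdf):
        Path(path).touch(exist_ok=True)
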
@@ -582,7 +589,12 @@ rule extract_fragment_size:
         fragsize=temp("{base}.fragsize.txt")
     run:
         with open(output.fragsize, "w") as f:
-            print(int(parse_insert_size_metrics(input.insertsizes)["median_insert_size"]),
+            try:
+                is_metrics = parse_insert_size_metrics(input.insertsizes)
+            except StopIteration:
+                is_metrics = dict()
+
+            print(int(is_metrics.get("median_insert_size", "NA")),
                   file=f)
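
The StopIteration guard pairs with the empty placeholder files created above: a Picard metrics parser that advances an iterator looking for the "## METRICS CLASS" header finds nothing in a zero-byte file. A self-contained sketch of the idea, where parse_picard_metrics is a stand-in for minute's parse_insert_size_metrics; note that int("NA") would itself raise ValueError, so a fully defensive variant casts only real values:

def parse_picard_metrics(path: str) -> dict:
    # Minimal stand-in: find the header/value rows that follow Picard's
    # "## METRICS CLASS" marker. Raises StopIteration on an empty file.
    with open(path) as f:
        lines = iter(f)
        while not next(lines).startswith("## METRICS CLASS"):
            pass
        header = next(lines).rstrip("\n").lower().split("\t")
        values = next(lines).rstrip("\n").split("\t")
    return dict(zip(header, values))

def read_median_insert_size(path: str) -> str:
    try:
        metrics = parse_picard_metrics(path)
    except StopIteration:  # empty placeholder file from the rule above
        metrics = {}
    value = metrics.get("median_insert_size", "NA")
    # Cast only real numbers; int("NA") would raise ValueError.
    return value if value == "NA" else str(int(float(value)))
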


@@ -653,6 +665,11 @@ rule replicate_stats:
             library=wildcards.library,
             reference=wildcards.reference
         )
+        d["barcode"] = "."
+        mlib = get_maplib_by_name(maplibs, wildcards.library)
+        if isinstance(mlib.library, MultiplexedReplicate):
+            d["barcode"] = mlib.library.barcode
+
         for flagstat, name in [
             (input.mapped_flagstat, "raw_mapped"),
             (input.mapq_flagstat, "mapq_mapped"),
@@ -662,7 +679,11 @@
             d[name] = parse_flagstat(flagstat).mapped_reads
 
         d["raw_demultiplexed"] = read_int_from_file(input.demux_stats)
-        d["frac_mapq_filtered"] = (d["raw_mapped"] - d["mapq_mapped"]) / d["raw_mapped"]
+
+        try:
+            d["frac_mapq_filtered"] = (d["raw_mapped"] - d["mapq_mapped"]) / d["raw_mapped"]
+        except ZeroDivisionError:
+            d["frac_mapq_filtered"] = "NA"
 
         # Note that here we use unpaired reads as total/unique single-ended fragments, because we deduplicate
         # based on the 1st mate. However, if we used proper pairs the result would be very similar,
@@ -671,8 +692,13 @@
             parse_duplication_metrics(input.metrics)["unpaired_reads_examined"],
             parse_duplication_metrics(input.metrics)["unpaired_read_duplicates"])
 
+        try:
+            is_metrics = parse_insert_size_metrics(input.insertsizes)
+        except StopIteration:
+            is_metrics = dict()
+
         d["percent_duplication"] = parse_duplication_metrics(input.metrics)["percent_duplication"]
-        d["insert_size"] = parse_insert_size_metrics(input.insertsizes)["median_insert_size"]
+        d["insert_size"] = is_metrics.get("median_insert_size", "NA")
         with open(output.txt, "w") as f:
             print(*d.keys(), sep="\t", file=f)
             print(*d.values(), sep="\t", file=f)
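
All of these guards funnel failure into the literal string "NA", which then flows into the tab-separated stats instead of crashing the rule. The pattern, reduced to a hypothetical helper (safe_fraction is not part of the codebase):

from typing import Union

Stat = Union[float, str]  # a real number, or "NA" when it cannot be computed

def safe_fraction(numerator: int, denominator: int) -> Stat:
    # A barcode dropout yields zero mapped reads; report "NA" instead of crashing.
    try:
        return numerator / denominator
    except ZeroDivisionError:
        return "NA"

# Fraction of reads removed by the MAPQ filter, as in the rule above:
# safe_fraction(raw_mapped - mapq_mapped, raw_mapped)
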
@@ -693,7 +719,12 @@ rule pooled_stats:
         for key in ["raw_demultiplexed", "raw_mapped", "mapq_mapped", "dedup_mapped", "library_size", "percent_duplication", "frac_mapq_filtered"]:
             d[key] = "."
         d["final_mapped"] = parse_flagstat(input.final_flagstat).mapped_reads
-        d["insert_size"] = parse_insert_size_metrics(input.insertsizes)["median_insert_size"]
+
+        try:
+            is_metrics = parse_insert_size_metrics(input.insertsizes)
+        except StopIteration:
+            is_metrics = dict()
+        d["insert_size"] = is_metrics.get("median_insert_size", "NA")
         with open(output.txt, "w") as f:
             print(*d.keys(), sep="\t", file=f)
             print(*d.values(), sep="\t", file=f)
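
Each stats rule writes a two-line TSV, one header row and one value row, which the summary rule and the final MultiQC report pick up later. Reading such a file back is a one-liner with csv.DictReader; a sketch with a made-up path:

import csv

def read_stats_row(path: str) -> dict:
    # The stats files written above are two-line TSVs: header row, value row.
    with open(path, newline="") as f:
        return next(csv.DictReader(f, delimiter="\t"))

# row = read_stats_row("stats/pooled.txt")   # hypothetical path
# row["insert_size"] is a number as text, or "NA" after a dropout
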
@@ -708,6 +739,7 @@ rule replicate_stats_summary:
         header = [
             "map_id",
             "library",
+            "barcode",
             "reference",
             "raw_demultiplexed",
             "raw_mapped",
45 changes: 30 additions & 15 deletions src/minute/__init__.py
@@ -174,6 +174,10 @@ def get_all_pools(maplibs: Iterable[LibraryWithReference]) -> List[LibraryWithReference]:
 def get_all_replicates(maplibs: Iterable[LibraryWithReference]) -> List[LibraryWithReference]:
     return [m for m in maplibs if not isinstance(m.library, Pool)]
 
+def get_maplib_by_name(maplibs: Iterable[LibraryWithReference], name: str) -> LibraryWithReference:
+    for m in maplibs:
+        if m.library.name == name:
+            return m
 
 def make_references(config) -> Dict[str, Reference]:
     references = dict()
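
get_maplib_by_name does a linear scan and falls off the end, implicitly returning None, when no library matches; the Snakefile only calls it with wildcard names that are known to exist. A hedged usage sketch with simplified stand-in types:

from dataclasses import dataclass
from typing import Iterable, Optional

@dataclass
class Library:                  # stand-in for minute's library classes
    name: str
    barcode: str = "."

@dataclass
class LibraryWithReference:     # stand-in pairing a library with a reference
    library: Library

def get_maplib_by_name(maplibs: Iterable[LibraryWithReference], name: str) -> Optional[LibraryWithReference]:
    # Linear scan; returns None (implicitly) when the name is unknown.
    for m in maplibs:
        if m.library.name == name:
            return m

maplibs = [LibraryWithReference(Library("h3k4me3_rep1", "ACGTACGT"))]
mlib = get_maplib_by_name(maplibs, "h3k4me3_rep1")
barcode = mlib.library.barcode if mlib is not None else "."
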
@@ -278,6 +282,11 @@ def float_or_int(s):
             values.append('NA')
 
     result = {key.lower(): float_or_int(value) for key, value in zip(header, values)}
+
+    # Je reports a missing value as "?"
+    if result.get("percent_duplication") == "?":
+        result["percent_duplication"] = "NA"
+
     return result
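
float_or_int presumably coerces numeric strings and returns anything else unchanged, so Je's "?" placeholder would survive coercion and leak into the stats untouched; hence the explicit rewrite to "NA". A self-contained illustration, with float_or_int reimplemented here for the purpose:

def float_or_int(s: str):
    # Coerce numeric strings; leave non-numeric markers such as "?" untouched.
    for cast in (int, float):
        try:
            return cast(s)
        except ValueError:
            pass
    return s

row = dict(zip(["READ_PAIRS_EXAMINED", "PERCENT_DUPLICATION"], ["0", "?"]))
result = {k.lower(): float_or_int(v) for k, v in row.items()}
if result.get("percent_duplication") == "?":   # Je's missing-value marker
    result["percent_duplication"] = "NA"
# result == {"read_pairs_examined": 0, "percent_duplication": "NA"}
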


@@ -287,23 +296,29 @@ def compute_scaling(scaling_group, treatments, controls, infofile, genome_sizes,
     for pair, treatment_path, control_path, genome_size in zip(scaling_group.normalization_pairs, treatments, controls, genome_sizes):
         treatment_reads = parse_flagstat(treatment_path).mapped_reads
         control_reads = parse_flagstat(control_path).mapped_reads
-        if first:
-            scaling_factor = (
-                genome_size / fragment_size / treatment_reads * control_reads
-            )
-            treatment_reads_ref = treatment_reads
-            control_reads_ref = control_reads
-            first = False
-
-        sample_scaling_factor = scaling_factor / control_reads
-        scaled_treatment_reads = sample_scaling_factor * treatment_reads
-
-        # TODO factor this out
-        print(pair.treatment.name, treatment_reads, scaled_treatment_reads, pair.control.name, control_reads, sample_scaling_factor, scaling_group.name, sep="\t", file=infofile)
-
-        # TODO scaled.idxstats.txt file
-
-        yield sample_scaling_factor
+        try:
+            if first:
+                scaling_factor = (
+                    genome_size / fragment_size / treatment_reads * control_reads
+                )
+                treatment_reads_ref = treatment_reads
+                control_reads_ref = control_reads
+                first = False
+
+            sample_scaling_factor = scaling_factor / control_reads
+            scaled_treatment_reads = sample_scaling_factor * treatment_reads
+        except ZeroDivisionError:
+            sample_scaling_factor = "NA"
+            print(pair.treatment.name, treatment_reads, scaled_treatment_reads, pair.control.name, control_reads, sample_scaling_factor, scaling_group.name, sep="\t", file=infofile)
+            yield sample_scaling_factor
+        else:
+            # TODO factor this out
+            print(pair.treatment.name, treatment_reads, scaled_treatment_reads, pair.control.name, control_reads, sample_scaling_factor, scaling_group.name, sep="\t", file=infofile)
+
+            # TODO scaled.idxstats.txt file
+
+            yield sample_scaling_factor
 
 
 def parse_stats_fields(stats_file):
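
The scaling arithmetic is easier to see with concrete numbers: the first normalization pair in each group fixes scaling_factor, and every pair is then divided by its own control depth. A worked sketch with invented read counts:

genome_size = 2_700_000_000   # hypothetical mappable genome size
fragment_size = 200           # median insert size from the rule above
treatment_reads = 10_000_000  # first (reference) treatment library
control_reads = 8_000_000     # matching input/control library

# Fixed once per scaling group, from the first normalization pair:
scaling_factor = genome_size / fragment_size / treatment_reads * control_reads
# = 2.7e9 / 200 / 1e7 * 8e6 = 10_800_000.0

# Each pair then divides by its own control depth:
sample_scaling_factor = scaling_factor / control_reads            # 1.35
scaled_treatment_reads = sample_scaling_factor * treatment_reads  # 13_500_000.0

# With a barcode dropout (control_reads == 0) the division raises
# ZeroDivisionError, which the new code reports as "NA" instead of crashing.
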
@@ -454,7 +469,7 @@ def estimate_library_size(total_reads, duplicate_reads):
     """
     unique_reads = total_reads - duplicate_reads
     if total_reads == 0 or duplicate_reads == 0:
-        return None
+        return "NA"
 
     m = 1.0
     M = 100.0
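
The truncated remainder of this function is, judging by the m/M bounds, a port of Picard's EstimateLibraryComplexity: bisection on the Lander-Waterman equation C/X = 1 - exp(-N/X). Returning the string "NA" rather than None keeps the value printable in the tab-separated stats. A sketch of the whole algorithm under that assumption, not minute's exact code:

import math

def estimate_library_size(total_reads: int, duplicate_reads: int):
    # Bisection on the Lander-Waterman equation, as in Picard's
    # EstimateLibraryComplexity.
    unique_reads = total_reads - duplicate_reads
    if total_reads == 0 or duplicate_reads == 0:
        return "NA"

    def f(x: float) -> float:
        # Root of unique/x - 1 + exp(-total/x) gives the library size.
        return unique_reads / x - 1 + math.exp(-total_reads / x)

    m, M = 1.0, 100.0
    while f(M * unique_reads) >= 0:  # widen bracket until it straddles the root
        M *= 10.0
    for _ in range(40):              # 40 halvings give ample precision
        mid = (m + M) / 2.0
        if f(mid * unique_reads) > 0:
            m = mid
        else:
            M = mid
    return int(unique_reads * (m + M) / 2.0)
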
63 changes: 46 additions & 17 deletions src/minute/summary_plots.R
@@ -15,11 +15,12 @@ minute_scaled_grouped_barplot <- function(scaling_file) {
   scaling <- calculate_ratios_and_groups(scaling)
 
   ggplot(data = scaling) +
-    aes(x = rep_grp, y = msr, color = scaling_group, fill = scaling_group) +
+    aes(x = replace_delims_with_spaces(rep_grp), y = msr, color = scaling_group, fill = scaling_group) +
     geom_point(data = scaling[scaling$is_pool == FALSE, ]) +
     geom_bar(data = scaling[scaling$is_pool == TRUE, ], stat = "identity", alpha = 0.5) +
     style_minute_barplot() +
     theme(legend.position = "none") +
+    scale_x_discrete(labels = scales::label_wrap(20)) +
     labs(subtitle = "Points - Replicates; Bars - Pooled")
 }
 
@@ -35,18 +36,19 @@ minute_scaled_replicates_barplot <- function(scaling_file) {
   scaling <- read.table(scaling_file, sep="\t", header = T, comment.char = "")
   scaling <- calculate_ratios_and_groups(scaling)
   ggplot(data = scaling) +
-    aes(x = rep_grp, y = msr, fill = rep_number) +
+    aes(x = replace_delims_with_spaces(rep_grp), y = msr, fill = as.factor(rep_number)) +
     geom_bar(
       data = scaling[scaling$is_pool == FALSE, ],
       stat = "identity",
       alpha = 0.9,
       position = position_dodge2(preserve = "single"),
-      color = "#555555",
-      linewidth = 0.2
+      color = "#555555"
     ) +
     style_minute_barplot() +
     theme(legend.position = "bottom") +
-    scale_fill_distiller()
+    scale_x_discrete(labels = scales::label_wrap(20)) +
+    labs(fill = "Replicate", x = "") +
+    scale_fill_brewer(palette = "Blues")
 }
 

@@ -116,13 +118,17 @@ calculate_ratios_and_groups <- function(scaling) {
 #'
 #' @return A list of ggproto objects
 style_minute_barplot <- function() {
-  list(theme_classic(base_size = 10),
+  list(theme_classic(base_size = 8),
        facet_wrap(~scaling_group, scales = "free_x", ncol = 2),
        geom_hline(yintercept = 1, linetype = "dotted", alpha = 0.4),
-       theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)),
+       theme(
+         axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
+         strip.background = element_blank()
+       ),
        labs(title = "MINUTE-ChIP scaled global read levels",
-            x = "Sample",
-            y = "Minute-ChIP Scaled Fraction"))
+            x = "",
+            y = "Minute-ChIP Scaled Fraction")
+  )
 }


@@ -132,17 +138,36 @@
 style_barcode_representation <- function() {
   list(
     coord_flip(),
-    theme_minimal(base_size = 10),
+    theme_minimal(base_size = 8),
     theme(panel.grid.major.y = element_blank(),
           panel.grid.minor.y = element_blank(),
           axis.ticks.x = element_line(),
-          axis.ticks.length.x = grid::unit(2, "mm"),
+          axis.ticks.length.x = grid::unit(1, "mm"),
           legend.position = "bottom"),
     labs(title = "Barcode representation",
-         x = "")
+         x = "",
+         colour = "Condition",
+         fill = "Condition")
   )
 }
 
+#' Replace dashes, underscores and points with spaces for labelling
+#' @param s String to clean up
+#' @return A string
+replace_delims_with_spaces <- function(s) {
+  gsub("_|-|\\.", " ", s)
+}
+
+#' Split a string into shorter strings delimited by \n
+#'
+#' @param s String to split
+#' @param width Max width
+#' @return A string
+wrap_label <- function(s, width) {
+  s <- sapply(s, function(x) {paste0(strwrap(x, width = width), collapse = "\n")})
+  names(s) <- NULL
+  s
+}
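
Both helpers exist to make long sample names legible in plot labels: delimiters become spaces so the wrapper has break points, then wrap_label inserts newlines at a maximum width. For readers on the Python side of the pipeline, the same transformation looks like this (illustrative only; minute does this in R):

import re
import textwrap

def replace_delims_with_spaces(s: str) -> str:
    # Underscores, dashes and dots become spaces so wrapping has break points.
    return re.sub(r"[_\-.]", " ", s)

def wrap_label(s: str, width: int) -> str:
    # Equivalent of R's strwrap() followed by paste0(collapse = "\n").
    return "\n".join(textwrap.wrap(replace_delims_with_spaces(s), width=width))

# wrap_label("h3k4me3_mESC.untreated_rep1", 20)
# -> "h3k4me3 mESC\nuntreated rep1"
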

#' Main part of the stacked barcode representation plot.
#' Overlays the pool values with the replicate groups to have separate lines
@@ -156,13 +181,16 @@ stacked_replicate_groups_plot <- function(df_combined, value_column) {
   # using .groups is experimental, so it is probably better to silence
   options(dplyr.summarise.inform = FALSE)
 
+  df_combined <- df_combined %>%
+    mutate(condition = replace_delims_with_spaces(condition))
+
   df_blocks <- df_combined %>%
     group_by(scaling_group, rep_grp, condition) %>%
     summarise(total = sum(.data[[value_column]]))
 
   ggplot(
     df_combined,
-    aes(x=scaling_group, y=!!sym(value_column), fill=condition, color=condition)
+    aes(x=scaling_group, y=!!sym(value_column), fill=wrap_label(condition, 20), color=wrap_label(condition, 20))
   ) +
     geom_bar(stat="identity", color="white", linewidth=0.2, alpha = 0.8) +
     geom_bar(
@@ -171,11 +199,11 @@
       stat="identity",
       color="white",
       alpha = 0.5,
-      linewidth = 1.2
+      linewidth = 1,
+      linejoin = "mitre"
     )
 }
 
-
#' Calculate number of groups in a scalinginfo.txt file
#'
#' @param scaling_file Path to scalinginfo.txt file
Expand All @@ -187,10 +215,10 @@ get_scaling_groups_number <- function(scaling_file) {
scalinginfo <- snakemake@input[[1]]
ngroups <- get_scaling_groups_number(scalinginfo)

single_width <- 6
single_width <- 5

# Account for longer names
single_height <- 10
single_height <- 9

panel_width <- single_width * 2
panel_height <- ceiling(ngroups / 2) * single_height
@@ -252,3 +280,4 @@ ggsave(snakemake@output[[8]],
        height = 7,
        dpi = 300,
        units = "cm")
+