From 66c687225c7fabea70e38fa21f7be2979089d789 Mon Sep 17 00:00:00 2001 From: chenv3 <43543446+chenv3@users.noreply.github.com> Date: Wed, 11 Oct 2023 17:14:04 -0400 Subject: [PATCH 01/18] Minor updates to summary file scripts changing python version and adding a sample name sort for generateSummaryFiles.py --- workflow/scripts/generateSummaryFiles.py | 1 + workflow/scripts/generateSummaryFiles_ATAC.py | 4 +--- workflow/scripts/generateSummaryFiles_MULTIOME.py | 4 +--- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/workflow/scripts/generateSummaryFiles.py b/workflow/scripts/generateSummaryFiles.py index dd9aec1..3b71a21 100755 --- a/workflow/scripts/generateSummaryFiles.py +++ b/workflow/scripts/generateSummaryFiles.py @@ -26,6 +26,7 @@ def createMetricsSummary(arg1): if not os.path.isdir(metricsPath): raise Exception('Error: provided metricsPath, {}, is not a path!'.format(metricsPath)) files = glob.glob('./*/outs/metrics_summary.csv') + files.sort() #Filter out aggregate runs if they exist workbook = xlsxwriter.Workbook(metricsPath + arg1+'.xlsx') diff --git a/workflow/scripts/generateSummaryFiles_ATAC.py b/workflow/scripts/generateSummaryFiles_ATAC.py index d181870..a5b39e7 100755 --- a/workflow/scripts/generateSummaryFiles_ATAC.py +++ b/workflow/scripts/generateSummaryFiles_ATAC.py @@ -1,8 +1,6 @@ -#!/usr/bin/env python2 +#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ -Created on Jul 31 2018 - @author: Vicky usage: python generateSummaryFiles.py metric_summary diff --git a/workflow/scripts/generateSummaryFiles_MULTIOME.py b/workflow/scripts/generateSummaryFiles_MULTIOME.py index 38e4c64..9904592 100755 --- a/workflow/scripts/generateSummaryFiles_MULTIOME.py +++ b/workflow/scripts/generateSummaryFiles_MULTIOME.py @@ -1,8 +1,6 @@ -#!/usr/bin/env python2 +#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ -Created on Jul 31 2018 - @author: Vicky usage: python generateSummaryFiles.py metric_summary From 964166c1218d55f4c0ecf7dc3c40fd1185381283 Mon Sep 17 00:00:00 2001 From: chenv3 <43543446+chenv3@users.noreply.github.com> Date: Wed, 11 Oct 2023 17:45:12 -0400 Subject: [PATCH 02/18] Add script to create CSV file to run GEX aggregate --- workflow/scripts/generateAggregateCSV_GEX.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100755 workflow/scripts/generateAggregateCSV_GEX.py diff --git a/workflow/scripts/generateAggregateCSV_GEX.py b/workflow/scripts/generateAggregateCSV_GEX.py new file mode 100755 index 0000000..d2af134 --- /dev/null +++ b/workflow/scripts/generateAggregateCSV_GEX.py @@ -0,0 +1,16 @@ +#!/usr/bin/env python3 + +import glob, sys, os + +def main(): + files = glob.glob('*/outs/molecule_info.h5') + aggrFile = open('AggregatedDatasets.csv', 'w') + + aggrFile.write('sample_id,molecule_h5\n') + files.sort() + for i in files: + aggrFile.write('%s,%s\n' % (i.split('/')[0], os.path.abspath(i))) + aggrFile.close() + +if __name__ == "__main__": + main() From ea9dedb3d0106d8088d9b9990db4fdaaba856b17 Mon Sep 17 00:00:00 2001 From: chenv3 <43543446+chenv3@users.noreply.github.com> Date: Wed, 11 Oct 2023 17:45:50 -0400 Subject: [PATCH 03/18] Add filter flag to pipeline and handle it in Snakefile --- cell-seek | 51 +++++++++++++++++++++++++++++++++++++++------- workflow/Snakefile | 1 + 2 files changed, 45 insertions(+), 7 deletions(-) diff --git a/cell-seek b/cell-seek index 39c8263..ddef0b9 100755 --- a/cell-seek +++ b/cell-seek @@ -247,7 +247,7 @@ def parsed_arguments(name, description): [--silent] [--threads THREADS] [--tmp-dir TMP_DIR] \\ [--libraries LIBRARIES] [--features FEATURES] \\ [--cmo-reference CMOREFERENCE] [--cmo-sample CMOSAMPLE] \\ - [--exclude-introns] \\ + [--exclude-introns] [--filter FILTER] \\ --input INPUT [INPUT ...] \\ --output OUTPUT \\ --version {{gex, ...}} \\ @@ -290,8 +290,8 @@ def parsed_arguments(name, description): {3}{4}Analysis options:{5} --libraries LIBRARIES Libraries file. A CSV file containing information about - each library. This file is used in feature barcode (cite), - multi, and multiome analysis.It contains each sample's + each library. This file is used in feature barcode (cite), + multi, and multiome analysis.It contains each sample's name, flowcell, demultiplexed name, and library type. Here is an example libraries.csv file: Name,Flowcell,Sample,Type @@ -359,7 +359,7 @@ def parsed_arguments(name, description): not contain whitespace. • sequence: Nucleotide barcode sequence associated with this hashtag. - • feature_type: Type of the feature. This should always be + • feature_type: Type of the feature. This should always be multiplexing capture. • read: Specifies which RNA sequencing read contains the Feature Barcode sequence. Must be R1 or R2, but @@ -386,15 +386,43 @@ def parsed_arguments(name, description): • sample_id: Unique sample ID for this hashtagged sample. Must not contain, whitespace, quote or comma characters. Each sample ID must be unique. - • cmo_ids: Unique CMO ID(s) that the sample is hashtagged + • cmo_ids: Unique CMO ID(s) that the sample is hashtagged with. Must match either entries in cmo_reference.csv file or 10x CMO IDs. Example: --cmo-sample cmo_sample.csv --exclude-introns - Exclude introns from the count alignment. This flag is - only applicable when dealing with gene expression related + Exclude introns from the count alignment. This flag is + only applicable when dealing with gene expression related data. Example: --exclude-introns + --filter FILTER + Filter threshold file. A CSV file containing the different + thresholds to be applied for individual samples within the + project during the QC analysis. The file should contain a + header row with Sample as the column name for the sample IDs, + and the name of each metric that will be filtered along with + if it is the high or low threshold for that metric. Each row + is then the entries for each sample that the manual thresholds + will be applied. If no file is provided then the default + thresholds will be used. If a cell is left blank for a sample + then that sample would not be filtered based on that criteria. + This flag is currently only applicable when dealing with GEX + projects. + Here is an example filter.csv file: + Sample,nFeature_RNA_low,nFeature_RNA_high,percent.mito_high + sample1,500,6000,15 + sample2,500,6000,5 + sample4,500,6000,5 + where: + • Sample: Unique sample ID that should match the sample name + used for Cell Ranger count. + • nFeature_RNA_low,nFeature_RNA_high,percent.mito_high: Example + entries that can be used for manual thresholding. The column + names need to be formatted as metadataname_high/low. Entries + that ends with high will be treated as the upper threshold. + Entries that ends with low will be treated as the lower + threshold. Valid metadata names include nCount_RNA, + nFeature_RNA, and percent.mito. {3}{4}Orchestration options:{5} --mode {{slurm,local}} @@ -608,6 +636,15 @@ def parsed_arguments(name, description): help = argparse.SUPPRESS ) + # Thresholds to use for filtering in QC Analysis + subparser_run.add_argument( + '--filter', + # Check if the file exists and if it is readable + type = lambda file: permissions(parser, file, os.R_OK), + required = False, + help = argparse.SUPPRESS + ) + # Orchestration Options # Execution Method, run locally # on a compute node or submit to diff --git a/workflow/Snakefile b/workflow/Snakefile index 66ea0d8..a003c99 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -30,6 +30,7 @@ exclude_introns = str_bool( # Use introns for pre mRNA config['options']['exclude_introns'] # default: False ) input_fastq = config['options']['input'] +filter_file = config['options']['filter'] # Filter threshold file for QC analysis (not used in all pipelines) if 'libraries' in config: lib_samples = list(config['libraries'].keys()) # Libraries file samples pipeline_output = [] From 0e82724d6f3961b20165964599810745b3e78685 Mon Sep 17 00:00:00 2001 From: chenv3 <43543446+chenv3@users.noreply.github.com> Date: Wed, 11 Oct 2023 17:46:38 -0400 Subject: [PATCH 04/18] Add aggregate and initial QC to GEX pipeline; add R script for initial sample QC --- workflow/rules/gex.smk | 81 +++++++++++- workflow/scripts/seuratSampleQC.R | 209 ++++++++++++++++++++++++++++++ 2 files changed, 289 insertions(+), 1 deletion(-) create mode 100644 workflow/scripts/seuratSampleQC.R diff --git a/workflow/rules/gex.smk b/workflow/rules/gex.smk index a96d0a9..6be0964 100644 --- a/workflow/rules/gex.smk +++ b/workflow/rules/gex.smk @@ -12,11 +12,18 @@ pipeline_output += expand( sample=samples ) +pipeline_output += ['aggregate.complete'] + +pipeline_output += expand( + join(workpath, "seurat", "{sample}", "seur_cluster.rds"), + sample=samples +) + +# Function definitions def filterFastq(wildcards): return(','.join(set([os.path.dirname(i) for i in input_fastq if len(re.findall(f"{wildcards.sample}_[\w]*R2[\w]*.fastq.gz", i)) > 0]))) -# Function defitions def count_intron(wildcards): """ Wrapper to decide whether to include introns for counting. @@ -27,6 +34,18 @@ def count_intron(wildcards): else: return('') +def filterFile(wildcards): + """ + Wrapper to decide whether to provide a filter file for Seurat + QC analysis. + See config['options']['filter'] for the encoded value. + """ + if filter_file == "None": + return("") + else: + return(f"--filterfile {filter_file}") + + rule count: output: @@ -74,3 +93,63 @@ rule summaryFiles: """ python {params.summarize} """ + +rule aggregateCSV: + input: + expand(join(workpath, "{sample}", "outs", "web_summary.html"), sample=samples) + output: + join(workpath, "AggregatedDatasets.csv") + params: + rname = "aggregateCSV", + script = join("workflow", "scripts", "generateAggregateCSV_GEX.py"), + analysis = workpath + shell: + """ + python {params.script} {params.analysis} + """ + +rule aggregate: + input: + csv=join(workpath, "AggregatedDatasets.csv") + output: + touch("aggregate.complete") + log: + err="run_10x_aggregate.err", + log="run_10x_aggregate.log" + params: + rname = "aggregate", + id = "AggregateDatasets" + shell: + """ + cellranger aggr \\ + --id {params.id} \\ + --csv {input.csv} \\ + --normalize=none \\ + 2>{log.err} 1>{log.log} + """ + +rule seuratQC: + input: + join(workpath, "{sample}", "outs", "web_summary.html") + output: + rds = join(workpath, "seurat", "{sample}", "seur_cluster.rds"), + cell_filter = join(workpath, "seurat", "{sample}", "cell_filter_info.csv") + log: + join("seurat", "{sample}", "seurat.log") + params: + rname = "seuratQC", + sample = "{sample}", + outdir = join(workpath, "seurat", "{sample}"), + data = join(workpath, "{sample}", "outs", "filtered_feature_bc_matrix"), + seurat = join("workflow", "scripts", "seuratSampleQC.R"), + filter = filterFile + shell: + """ + module load R/4.3.0 + Rscript {params.seurat} \\ + --workdir {params.outdir} \\ + --datapath {params.data} \\ + --sample {params.sample} \\ + {params.filter} \\ + > {log} + """ diff --git a/workflow/scripts/seuratSampleQC.R b/workflow/scripts/seuratSampleQC.R new file mode 100644 index 0000000..f2522ac --- /dev/null +++ b/workflow/scripts/seuratSampleQC.R @@ -0,0 +1,209 @@ +library(Seurat) +library(ggplot2) +library(gridExtra) +library(SingleR) +library(scRNAseq) +library(scater) +library(optparse) +library(dplyr) +library(stringr) + +option_list <- list( + make_option(c("-w", "--workdir"), type='character', action='store', default=NA, + help="Path to the working directory"), + make_option(c("-d", "--datapath"), type='character', action='store', default=NA, + help="Path to the 10X output filtered features directory"), + make_option(c("-s", "--sample"), type='character', action='store', default="sample", + help="Sample name to be stored within Seurat metadata"), + make_option(c("-p", "--project"), type='character', action='store', default="project", + help="Project name to be stored within Seurat metadata"), + make_option(c("-f", "--filterfile"), type='character', action='store', default=NA, + help="Project name to be stored within Seurat metadata") +) + +opt <- parse_args(OptionParser(option_list=option_list)) + +data <- Read10X(opt$datapath) + +seur <- CreateSeuratObject(counts=data, project = opt$project) + +seur$Sample <- opt$sample + +dir.create(opt$workdir, recursive=TRUE) +setwd(opt$workdir) + +figures <- list() + +## ----Pre-Filter Gene Plot---- +seur[["percent.mito"]] <- PercentageFeatureSet(seur, pattern="^[Mm][Tt]-") + +plot1 <- FeatureScatter(seur, group.by='Sample', feature1 = "nCount_RNA", feature2 = "percent.mito") + NoLegend() +plot2 <- FeatureScatter(seur, group.by='Sample', feature1 = "nCount_RNA", feature2 = "nFeature_RNA") + NoLegend() + +png("PreFilter_Gene_Plot.png", height=5, width=10, units='in', res=300) +plot1+plot2 +dev.off() + +figures$PreFilter_Gene_Plot <- plot1+plot2 + + +## ----Cell Quality Thresholds - Default---- +thresh <- list() +defaultThreshold <- function(seur) { + thresh <- list() + thresh['nFeature_RNA_low'] <- expm1(median(log1p(seur$nFeature_RNA)) - 3*mad(log1p(seur$nFeature_RNA))) %>% round + thresh['nFeature_RNA_low'] <- expm1(median(log1p(seur$nFeature_RNA)) - 3*mad(log1p(seur$nFeature_RNA))) %>% round + thresh['nFeature_RNA_high'] <- expm1(median(log1p(seur$nFeature_RNA)) + 3*mad(log1p(seur$nFeature_RNA))) %>% round + thresh['nCount_RNA_low'] <- expm1(median(log1p(seur$nCount_RNA)) - 3*mad(log1p(seur$nCount_RNA))) %>% round + thresh['nCount_RNA_high'] <- expm1(median(log1p(seur$nCount_RNA)) + 3*mad(log1p(seur$nCount_RNA))) %>% round + thresh['percent.mito_high'] = expm1(median(log1p(seur$percent.mito)) + 3*mad(log1p(seur$percent.mito))) %>% round + + cellsToRemove <- colnames(seur)[which(seur$nFeature_RNA < thresh['nFeature_RNA_low'] | seur$nFeature_RNA > thresh['nFeature_RNA_high'])] + cellsToRemove <- union(cellsToRemove, colnames(seur)[which(seur$nCount_RNA < thresh['nCount_RNA_low'] | seur$nCount_RNA > thresh['nCount_RNA_high'])]) + cellsToRemove <- union(cellsToRemove, colnames(seur)[which(seur$percent.mito > thresh['percent.mito_high'])]) + + + thresh['numCellsRemove'] <- length(cellsToRemove) + thresh['pctCellsRemove'] <- length(cellsToRemove) / dim(seur)[2] * 100 + return(list(threshold=thresh, filter=cellsToRemove)) +} + +if (!is.na(opt$filterfile)){ + thresholds <- read.csv(opt$filterfile) + index <- grep("Sample", colnames(thresholds), ignore.case=T) + if (sum(thresholds[,index] == opt$sample) == 1) { + thresh_orig <- thresholds[which(thresholds[,index] == opt$sample),] + thresh_orig[index] <- NULL + + thresh <- list() + cellsToRemove <- character() + for (i in colnames(thresh_orig)) { + try({ + if (length(grep('_high$', i, ignore.case=T, value=T)) > 0 | length(grep('_low$', i, ignore.case=T, value=T)) > 0) { + colname <- (i %>% strsplit(split='_'))[[1]] %>% head(n=-1) %>% paste(collapse='_') + direction <- (i %>% strsplit(split='_'))[[1]] %>% tail(n=1) %>% str_to_lower + if(direction == "low") { + cellsToRemove <- union(cellsToRemove, colnames(seur)[which(seur[[colname]] < thresh_orig[[i]])]) + thresh[i] <- thresh_orig[i] + } + if (direction == "high") { + cellsToRemove <- union(cellsToRemove, colnames(seur)[which(seur[[colname]] > thresh_orig[[i]])]) + thresh[i] <- thresh_orig[i] + } + } + }) + } + thresh['numCellsRemove'] <- length(cellsToRemove) + thresh['pctCellsRemove'] <- length(cellsToRemove) / dim(seur)[2] * 100 + } else { + result <- defaultThreshold(seur) + thresh <- result$threshold + cellsToRemove <- result$filter + } +} else { + result <- defaultThreshold(seur) + thresh <- result$threshold + cellsToRemove <- result$filter +} + +write.csv(thresh, 'cell_filter_info.csv', row.names=FALSE) + +## ----Pre-Filter RNA Violin Plot------- +doVlnPlot <- function(aspect, seur, thresh) { + temp_plot <- VlnPlot(seur, group.by='Sample', features=aspect) + NoLegend() + if (length(grep(paste0('^', aspect, '_low$'), colnames(thresh), ignore.case=T)) > 0) { + try( + temp_plot <- temp_plot + geom_hline(yintercept=thresh[grep(paste0('^', aspect, '_low$'), colnames(thresh), ignore.case=T)][[1]], linetype="dashed") + ) + } + if (length(grep(paste0('^', aspect, '_high$'), colnames(thresh), ignore.case=T)) > 0) { + try( + temp_plot <- temp_plot + geom_hline(yintercept=thresh[grep(paste0('^', aspect, '_high$'), colnames(thresh), ignore.case=T)][[1]], linetype="dashed") + ) + } + return(temp_plot) +} + +plots <- sapply(c("nFeature_RNA", "nCount_RNA", "percent.mito"), function(x) doVlnPlot(aspect=x, seur=seur, thresh=thresh)) + +png("PreFilter_VlnPlot_RNA.png", height=7, width=7, units='in', res=300) +do.call("grid.arrange", c(plots, nrow=1)) +dev.off() + +figures$PreFilter_VlnPlot_RNA <- do.call("grid.arrange", c(plots, nrow=1)) + +## ----Pre-Filter UMAP Plot------- +seur <- NormalizeData(seur, normalization.method = "LogNormalize", scale.factor = 10000) +seur <- FindVariableFeatures(seur, selection.method = "vst", nfeatures = 2000) +all.genes <- rownames(seur) +seur <- ScaleData(seur, features = all.genes) +seur <- RunPCA(seur, features = VariableFeatures(object = seur)) +seur <- FindNeighbors(seur, dims = 1:30) +seur <- RunUMAP(seur, reduction = 'pca', dims = 1:30, assay = 'RNA') +seur <- FindClusters(seur, resolution = 0.8, algorithm=3, verbose = FALSE) + +png("PreFilter_UMAP_RNA.png", width=1800, height=1600, res = 300) +DimPlot(seur, reduction='umap', label = TRUE) + ggtitle("Pre-Filter UMAP") + theme(plot.title = element_text(hjust=0.5)) +dev.off() + +png("PreFilter_UMAP_RNA_Filter.png", width=1800, height=1600, res = 300) +DimPlot(seur, reduction='umap', label = TRUE, cells.highlight=list("Filtered Cells"=cellsToRemove)) + ggtitle("Pre-Filter UMAP - Filtered Cells") + theme(plot.title = element_text(hjust = 0.5)) +dev.off() + +figures$PreFilter_UMAP_RNA <- DimPlot(seur, reduction='umap', label = TRUE) + ggtitle("Pre-Filter UMAP") + theme(plot.title = element_text(hjust=0.5)) +figures$PreFilter_UMAP_RNA_Filter <- DimPlot(seur, reduction='umap', label = TRUE, cells.highlight=list("Filtered Cells"=cellsToRemove)) + ggtitle("Pre-Filter UMAP - Filtered Cells") + theme(plot.title = element_text(hjust = 0.5)) + + +seur <- subset(seur, cells = cellsToRemove, inver=T) + +## ----Post-Filter Gene Plot---- +#Post-Filter Plots +plot1 <- FeatureScatter(seur, group.by='Sample', feature1 = "nCount_RNA", feature2 = "percent.mito") +plot2 <- FeatureScatter(seur, group.by='Sample', feature1 = "nCount_RNA", feature2 = "nFeature_RNA") +png("PostFilter_Gene_Plot.png", height=5, width=10, units='in', res=300) +plot1+plot2 +dev.off() + +figures$PostFilter_Gene_Plot <- plot1+plot2 + + +## ----Post-Filter RNA Violin Plot------- +png("PostFilter_VlnPlot_RNA.png", height=7, width=7, units='in', res=300) +VlnPlot(seur, group.by='Sample', features = c("nFeature_RNA", "nCount_RNA", "percent.mito"), ncol = 3) +dev.off() + +figures$PostFilter_VlnPlot_RNA <- VlnPlot(seur, features = c("nFeature_RNA", "nCount_RNA", "percent.mito"), ncol = 3) + +## ----RNA Normalizing and Clustering---- +seur <- NormalizeData(seur, normalization.method = "LogNormalize", scale.factor = 10000) +seur <- FindVariableFeatures(seur, selection.method = "vst", nfeatures = 2000) +all.genes <- rownames(seur) +seur <- ScaleData(seur, features = all.genes) +seur <- RunPCA(seur, npcs=50, features = VariableFeatures(object = seur)) +seur <- FindNeighbors(seur, dims = 1:30) +seur <- RunUMAP(seur, reduction = 'pca', dims = 1:30, assay = 'RNA') + +for(resolution in seq(0.2,0.8,0.2)){ + seur <- FindClusters(seur, resolution = resolution) +} + +## ----Elbow Plot---- +png("ElbowPlot.png", height=7, width=7, units='in', res=300) +ElbowPlot(seur, ndims=50) +dev.off() + +figures$ElbowPlot <- ElbowPlot(seur) + +## ----UMAP Plots---- +for (resolution in grep('_res.', colnames(seur@meta.data), value=T)) { + png(paste0("UMAP_", gsub("_snn", "", resolution), ".png"), width=1800, height=1600, res = 300) + print(DimPlot(seur, reduction='umap', group.by=resolution, label=TRUE) + ggtitle(paste0("UMAP ", gsub("_snn_", " - ", resolution))) + theme(plot.title = element_text(hjust = 0.5))) + dev.off() + figures[[paste0("UMAP_", gsub("_snn", "", resolution))]] <- DimPlot(seur, reduction='umap', group.by=resolution, label=TRUE) + ggtitle(paste0("UMAP ", gsub("_snn_", " - ", resolution))) + theme(plot.title = element_text(hjust = 0.5)) +} + + +saveRDS(seur, 'seur_cluster.rds') +#saveRDS(figures, 'seur_figures.rds') + +sessionInfo() From e5dda37eeabfd7737af1cd2e5826bef9d454367c Mon Sep 17 00:00:00 2001 From: chenv3 <43543446+chenv3@users.noreply.github.com> Date: Thu, 12 Oct 2023 15:08:59 -0400 Subject: [PATCH 05/18] Add report for initial GEX sample QC --- workflow/rules/gex.smk | 25 ++++ workflow/scripts/seuratSampleQCReport.Rmd | 163 ++++++++++++++++++++++ 2 files changed, 188 insertions(+) create mode 100644 workflow/scripts/seuratSampleQCReport.Rmd diff --git a/workflow/rules/gex.smk b/workflow/rules/gex.smk index 6be0964..fffa56b 100644 --- a/workflow/rules/gex.smk +++ b/workflow/rules/gex.smk @@ -12,13 +12,21 @@ pipeline_output += expand( sample=samples ) +# CellRanger aggregate analysis pipeline_output += ['aggregate.complete'] +# Seurat inital sample QC pipeline_output += expand( join(workpath, "seurat", "{sample}", "seur_cluster.rds"), sample=samples ) +#Seurat sample QC reports +pipeline_output += expand( + join(workpath, "seurat", "{sample}", "{sample}_QC_Report.html"), + sample=samples +) + # Function definitions def filterFastq(wildcards): return(','.join(set([os.path.dirname(i) for i in input_fastq if len(re.findall(f"{wildcards.sample}_[\w]*R2[\w]*.fastq.gz", i)) > 0]))) @@ -153,3 +161,20 @@ rule seuratQC: {params.filter} \\ > {log} """ + +rule seuratQCReport: + input: + rds = rules.seuratQC.output.rds, + cell_filter = rules.seuratQC.output.cell_filter + output: + report = join(workpath, "seurat", "{sample}", "{sample}_QC_Report.html") + params: + rname = "seuratQCReport", + sample = "{sample}", + seuratdir = join(workpath, "seurat", "{sample}"), + script = join("workflow", "scripts", "seuratSampleQCReport.Rmd") + shell: + """ + module load R/4.3.0 + R -e "rmarkdown::render('{params.script}', params=list(seuratdir='{params.seuratdir}', sample='{params.sample}'), output_file='{output.report}')" + """ diff --git a/workflow/scripts/seuratSampleQCReport.Rmd b/workflow/scripts/seuratSampleQCReport.Rmd new file mode 100644 index 0000000..3d82d9b --- /dev/null +++ b/workflow/scripts/seuratSampleQCReport.Rmd @@ -0,0 +1,163 @@ +--- +title: "Preliminary QC Report for Sample `r params$sample`" +subtitle: "Initial Sample Preprocessing" +date: '`r Sys.Date()`' +output: + html_document: + toc: true + toc_float: + collapsed: false + number_sections: true + code-fold: true + toc_depth: 3 + fig_height: 5 + fig_width: 6 +params: + seuratdir: seurat + sample: sample +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = FALSE, message=FALSE, warning = FALSE, results='asis') +options(knitr.graphics.error = FALSE) +``` + +```{r Packages, message=FALSE} +library(knitr) +library(stringr) +library(flextable) +library(dplyr) +# library(kableExtra) +# library(scater) +``` + +```{r Input, include=FALSE} +seuratdir <- params$seuratdir +``` + +```{r, include=FALSE} +# Init Step to make sure that the dependencies are loaded +#htmltools::tagList(DT::datatable(library_key)) +#data <- read.csv('/data/NIAMS_IDSS/projects/NIAMS-33/library_patient.csv') +#htmltools::tagList(DT::datatable(data, extensions='Buttons', options=list(dom='lBfrtip', buttons = c('copy', 'csv', 'excel'), lengthMenu = list(c(10,25,50,-1), c(10,25,50,"All"))))) + +``` + + +# QC Analysis + +The data that was generated by Cell Ranger was pulled into Seurat for initial processing. + +## Pre-Filter Violin Plots + +Violin plots of the number of genes associated with a cell, number of unique reads associated with a cell, and the percentage of counts in a cell belonging to mitochondrial genes were created. Cells with extremely low or high number of reads or counts may indicate potential low quality cells and multiplets. Cells with high percentage of mitochondrial genes indicates dead or dying cells that may not be of interest + +```{r, results='asis', fig.width=5, fig.height=7, out.width="70%"} + +cat(paste0("![](", file.path(seuratdir, "PreFilter_VlnPlot_RNA.png)")), "\n") +cat("\n") +``` + +## Pre-Filter Gene Plots + +```{r, results='asis', fig.width=5, fig.height=7, out.width="70%"} + +cat(paste0("![](", file.path(seuratdir, "PreFilter_Gene_Plot.png)")), "\n") +cat("\n") + +``` + +## Pre-Filter UMAP Plot {.tabset} + +The following are the UMAP plots for the sample that was generated prior to filtering cells. + +```{r, results='asis', fig.width=5, fig.height=7, out.width="70%"} +if (file.exists(file.path(seuratdir, 'PreFilter_UMAP_RNA.png'))) { + cat("### Clustering\n\n") + cat(paste0("![](", file.path(seuratdir, "PreFilter_UMAP_RNA.png)")), "\n") +} +cat("\n\n") +if(file.exists(file.path(seuratdir, 'PreFilter_UMAP_RNA_Filter.png'))) { + cat("### Filtered Cells\n\n") + cat(paste0("![](", file.path(seuratdir, "PreFilter_UMAP_RNA_Filter.png)")), "\n") +} +cat("\n\n") + +``` + +## Cell Filter Thresholds + +Thresholds were calculated and used to filter out potential low quality cells and multiplets. The following are the thresholds that were set for each set of statistics. + +```{r Cell Filter, warning=FALSE, message=FALSE, results='asis', fig.width=5, fig.height=7, out.width="70%"} + +thresh <- read.csv(file.path(seuratdir, 'cell_filter_info.csv')) + +printed <- c(apply(expand.grid(c("nFeature_RNA", "nCount_RNA", "percent.mito"), c("low", "high")), 1, paste, collapse='_'), "numCellsRemove", "pctCellsRemove") + +get_value <- function(thresh, value) { + if (is.null(thresh[[value]])) {NA} else{sprintf("%.0f", thresh[[value]])} +} +cat("GEX Gene Count Thresholds:", paste0("(", get_value(thresh, "nFeature_RNA_low"), ", ", get_value(thresh, "nFeature_RNA_high"), ")"), "\n\n") + +cat("GEX Read Count Thresholds:", paste0("(", get_value(thresh, "nCount_RNA_low"), ", ", get_value(thresh, "nCount_RNA_high"), ")"), "\n\n") + +cat("Mitochondrial Percentage Thresholds:", paste0("(", get_value(thresh, "percent.mito_low"), ", ", get_value(thresh, "percent.mito_high"), ")"), "\n\n") + +cat("Total cells filtered:", get_value(thresh, "numCellsRemove"), "\n\n") + +cat("Percentage cells filtered:", get_value(thresh, "pctCellsRemove"), "\n\n") + +if (length(setdiff(names(thresh), printed) > 0)) { + cat("The following additional thresholds were also used: \n\n") + for (value in setdiff(names(thresh), printed)) { + cat(paste0(value, ": ", thresh[[value]], "\n\n")) + cat("\n\n") + } +} +``` + + +## Post-Filter Violin Plots {.tabset} + +The following are the resulting violin plots generated after filtering the cells. + +```{r, results='asis', fig.width=5, fig.height=7, out.width="70%"} +cat(paste0("![](", file.path(seuratdir, 'PostFilter_VlnPlot_RNA.png)')), "\n") + +``` + +## Post-Filter Gene Plots + +```{r, results='asis', fig.width=5, fig.height=7, out.width="70%"} + +cat(paste0("![](", file.path(seuratdir, "PostFilter_Gene_Plot.png)")), "\n") +cat("\n") + +``` + +## Elbow Plot + +```{r, results='asis', fig.width=5, fig.height=7, out.width="70%"} + +cat(paste0("![](", file.path(seuratdir, "ElbowPlot.png)")), "\n") +cat("\n") + +``` + +## Post-Filter UMAP Plot {.tabset} + +The following are the UMAP plots for the sample that was generated after filtering out the potential low quality and multiplet cells. + +```{r, results='asis', fig.width=5, fig.height=7, out.width="70%"} +for (filename in Sys.glob(file.path(seuratdir, "UMAP_RNA_res*.png"))) { + resolution <- strsplit(tools::file_path_sans_ext(basename(filename)), 'res.')[[1]][2] + cat("### Resolution", resolution, "\n\n") + + cat(paste0("![](", filename, ')'), "\n\n") +} + + +``` + +
From 383fbcd904173e8b82053dc72c3c1346dfd30ffe Mon Sep 17 00:00:00 2001 From: chenv3 <43543446+chenv3@users.noreply.github.com> Date: Thu, 12 Oct 2023 15:42:12 -0400 Subject: [PATCH 06/18] Add cell filter summary for GEX samples --- workflow/rules/gex.smk | 25 ++++++++++++++++++++++--- workflow/scripts/cellFilterSummary.R | 26 ++++++++++++++++++++++++++ 2 files changed, 48 insertions(+), 3 deletions(-) create mode 100644 workflow/scripts/cellFilterSummary.R diff --git a/workflow/rules/gex.smk b/workflow/rules/gex.smk index fffa56b..519aaf1 100644 --- a/workflow/rules/gex.smk +++ b/workflow/rules/gex.smk @@ -13,7 +13,7 @@ pipeline_output += expand( ) # CellRanger aggregate analysis -pipeline_output += ['aggregate.complete'] +pipeline_output += [join(workpath, 'aggregate.complete')] # Seurat inital sample QC pipeline_output += expand( @@ -21,12 +21,15 @@ pipeline_output += expand( sample=samples ) -#Seurat sample QC reports +# Seurat sample QC reports pipeline_output += expand( join(workpath, "seurat", "{sample}", "{sample}_QC_Report.html"), sample=samples ) +# Cell Filter Summary File +pipeline_output += [join(workpath, "Project_Cell_Filters.csv")] + # Function definitions def filterFastq(wildcards): return(','.join(set([os.path.dirname(i) for i in input_fastq if len(re.findall(f"{wildcards.sample}_[\w]*R2[\w]*.fastq.gz", i)) > 0]))) @@ -120,7 +123,7 @@ rule aggregate: input: csv=join(workpath, "AggregatedDatasets.csv") output: - touch("aggregate.complete") + touch(join(workpath, "aggregate.complete")) log: err="run_10x_aggregate.err", log="run_10x_aggregate.log" @@ -178,3 +181,19 @@ rule seuratQCReport: module load R/4.3.0 R -e "rmarkdown::render('{params.script}', params=list(seuratdir='{params.seuratdir}', sample='{params.sample}'), output_file='{output.report}')" """ + +rule cellFilterSummary: + input: + cell_filters = expand(rules.seuratQC.output.cell_filter, sample=samples) + output: + cell_filter_summary = join(workpath, "Project_Cell_Filters.csv") + params: + rname = "cellFilterSummary", + seuratdir = join(workpath, "seurat"), + filename = "cell_filter_info.csv", + script = join("workflow", "scripts", "cellFilterSummary.R") + shell: + """ + module load R/4.3.0 + Rscript {params.script} --datapath {params.seuratdir} --filename {params.filename} --output {output.cell_filter_summary} + """ diff --git a/workflow/scripts/cellFilterSummary.R b/workflow/scripts/cellFilterSummary.R new file mode 100644 index 0000000..798d559 --- /dev/null +++ b/workflow/scripts/cellFilterSummary.R @@ -0,0 +1,26 @@ +library(optparse) +library(dplyr) + +option_list <- list( + make_option(c("-d", "--datapath"), type='character', action='store', default='./', + help="Path to where the CSV output files are stored"), + make_option(c("-f", "--filename"), type='character', action='store', default='cell_filter_info.csv', + help="Name of the files to search for"), + make_option(c("-o", "--output"), type='character', action='store', default='Project_Cell_Filters.csv', + help="Name of the output file") +) +opt <- parse_args(OptionParser(option_list=option_list)) + +filenames <- Sys.glob(file.path(opt$datapath, '*', opt$filename)) +filters <- read.csv(filenames[1]) +filters$Sample <- basename(dirname(filenames[1])) + +if(length(filenames) > 1) { + for (filename in filenames[2:length(filenames)]) { + data <- read.csv(filename) + data$Sample <- basename(dirname(filename)) + filters <- full_join(filters, data) + } +} + +write.csv(filters[,c("Sample", setdiff(colnames(filters), "Sample"))], opt$output, row.names=FALSE) From c61ddabb458d21dc663901cc320a7ef465b26480 Mon Sep 17 00:00:00 2001 From: chenv3 <43543446+chenv3@users.noreply.github.com> Date: Wed, 18 Oct 2023 11:55:05 -0400 Subject: [PATCH 07/18] Add cluster configuration for GEX seuratQC rule --- config/cluster.json | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/config/cluster.json b/config/cluster.json index 8605fc6..993861f 100644 --- a/config/cluster.json +++ b/config/cluster.json @@ -15,5 +15,10 @@ "threads": "16", "mem": "150g", "time": "2-00:00:00" + }, + "seuratQC": { + "threads": "8", + "mem": "150g", + "time": "1-00:00:00" } } From 4d434d00224028d8f8950a7992ec7706fd0a5779 Mon Sep 17 00:00:00 2001 From: chenv3 <43543446+chenv3@users.noreply.github.com> Date: Tue, 24 Oct 2023 11:48:14 -0400 Subject: [PATCH 08/18] Extract tmpdir config parameter in Snakefile; Change how the GEX Seurat QC report is run to prevent parallel jobs from interfering with each other --- workflow/Snakefile | 1 + workflow/rules/gex.smk | 12 ++++++++---- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/workflow/Snakefile b/workflow/Snakefile index a003c99..bf4719c 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -17,6 +17,7 @@ from scripts.common import ( #Global workflow variables configfile: 'config.json' +tmpdir = config['options']['tmp_dir'] pipeline = config['options']['version'] inputs = config['options']['input'] samples = list(set([re.sub("_S[0-9]+_L00[0-9]", "", i) for i in config['samples']])) diff --git a/workflow/rules/gex.smk b/workflow/rules/gex.smk index 519aaf1..6b19080 100644 --- a/workflow/rules/gex.smk +++ b/workflow/rules/gex.smk @@ -57,10 +57,9 @@ def filterFile(wildcards): return(f"--filterfile {filter_file}") - rule count: output: - join(workpath, "{sample}", "outs", "web_summary.html") + html = join(workpath, "{sample}", "outs", "web_summary.html") log: err = "run_{sample}_10x_cellranger_count.err", log ="run_{sample}_10x_cellranger_count.log" @@ -130,6 +129,7 @@ rule aggregate: params: rname = "aggregate", id = "AggregateDatasets" + envmodules: config["tools"]["cellranger"] shell: """ cellranger aggr \\ @@ -175,11 +175,14 @@ rule seuratQCReport: rname = "seuratQCReport", sample = "{sample}", seuratdir = join(workpath, "seurat", "{sample}"), - script = join("workflow", "scripts", "seuratSampleQCReport.Rmd") + tmpdir = tmpdir, + script = join(workpath, "workflow", "scripts", "seuratSampleQCReport.Rmd") shell: """ module load R/4.3.0 - R -e "rmarkdown::render('{params.script}', params=list(seuratdir='{params.seuratdir}', sample='{params.sample}'), output_file='{output.report}')" + cd {params.tmpdir} + cp {params.script} ./{params.sample}.Rmd + R -e "rmarkdown::render('{params.sample}.Rmd', params=list(seuratdir='{params.seuratdir}', sample='{params.sample}'), output_file='{output.report}')" """ rule cellFilterSummary: @@ -197,3 +200,4 @@ rule cellFilterSummary: module load R/4.3.0 Rscript {params.script} --datapath {params.seuratdir} --filename {params.filename} --output {output.cell_filter_summary} """ + From 0993ee0aa512cf57c901f62d24142b09090015be Mon Sep 17 00:00:00 2001 From: chenv3 <43543446+chenv3@users.noreply.github.com> Date: Wed, 25 Oct 2023 09:42:57 -0400 Subject: [PATCH 09/18] Add aggregate and create-bam flags to pipeline wrapper; add support for aggrgate and create-bam flags to GEX pipeline so that Cell Ranger aggregate only runs when flag is used and no bam file is created by default --- cell-seek | 45 ++++++++++++++++++++++++++++++++++++++---- workflow/Snakefile | 4 ++++ workflow/rules/gex.smk | 35 +++++++++++++++++++++++++++----- 3 files changed, 75 insertions(+), 9 deletions(-) diff --git a/cell-seek b/cell-seek index ddef0b9..b12a998 100755 --- a/cell-seek +++ b/cell-seek @@ -245,9 +245,10 @@ def parsed_arguments(name, description): [--dry-run] [--job-name JOB_NAME] [--mode {{slurm,local}}] \\ [--sif-cache SIF_CACHE] [--singularity-cache SINGULARITY_CACHE] \\ [--silent] [--threads THREADS] [--tmp-dir TMP_DIR] \\ - [--libraries LIBRARIES] [--features FEATURES] \\ - [--cmo-reference CMOREFERENCE] [--cmo-sample CMOSAMPLE] \\ - [--exclude-introns] [--filter FILTER] \\ + [--aggregate {{mapped, none}}][--libraries LIBRARIES] \\ + [--features FEATURES] [--cmo-reference CMOREFERENCE] \\ + [--cmo-sample CMOSAMPLE] [--exclude-introns] [--filter FILTER] \\ + [--create-bam] \\ --input INPUT [INPUT ...] \\ --output OUTPUT \\ --version {{gex, ...}} \\ @@ -288,6 +289,17 @@ def parsed_arguments(name, description): options: hg38, mm10. Example: --genome hg38 {3}{4}Analysis options:{5} + --aggregate {{mapped,none}} + Cell Ranger aggregate. This option defines the + normalization mode that should be used. Mapped is what + Cell Ranger would run by default, which subsamples reads + from higher depth samples until each library type has an + equal number of reads per cell that are confidently mapped. + None means to not normalize at all. If this flag is not + used then aggregate will not be run. To run Cell Ranger + aggregate, please select one of the following options: + mapped, none. + Example: --aggregate mapped --libraries LIBRARIES Libraries file. A CSV file containing information about each library. This file is used in feature barcode (cite), @@ -423,6 +435,12 @@ def parsed_arguments(name, description): Entries that ends with low will be treated as the lower threshold. Valid metadata names include nCount_RNA, nFeature_RNA, and percent.mito. + --create-bam + Create bam files. By default the no-bam flag is used when running + Cell Ranger. Use this flag to ensure that a bam file is created for + each sample during analysis. This flag is only applicable when + dealing with gene expression related data. + Example: --create-bam {3}{4}Orchestration options:{5} --mode {{slurm,local}} @@ -565,7 +583,7 @@ def parsed_arguments(name, description): '--version', type = str.lower, required = True, - default = "slurm", + default = "gex", choices = ['gex', 'cite', 'multi', 'vdj', 'atac', 'multiome'], help = argparse.SUPPRESS ) @@ -636,6 +654,16 @@ def parsed_arguments(name, description): help = argparse.SUPPRESS ) + # How to run Cell Ranger aggregate + subparser_run.add_argument( + '--aggregate', + type = str.lower, + required = False, + default = "", + choices = ['none', 'mapped'], + help = argparse.SUPPRESS + ) + # Thresholds to use for filtering in QC Analysis subparser_run.add_argument( '--filter', @@ -645,6 +673,15 @@ def parsed_arguments(name, description): help = argparse.SUPPRESS ) + # Create BAM file during run + subparser_run.add_argument( + '--create-bam', + action = 'store_true', + required = False, + default = False, + help = argparse.SUPPRESS + ) + # Orchestration Options # Execution Method, run locally # on a compute node or submit to diff --git a/workflow/Snakefile b/workflow/Snakefile index bf4719c..cde3ea5 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -30,6 +30,10 @@ cmo_sample = config['options']['cmo_sample'] # CMO Sample files for cellrang exclude_introns = str_bool( # Use introns for pre mRNA, config['options']['exclude_introns'] # default: False ) +create_bam = str_bool( # Create BAM files during Cell Ranger analysis, + config['options']['create_bam'] # default: False +) +aggr = config['options']['aggregate'] input_fastq = config['options']['input'] filter_file = config['options']['filter'] # Filter threshold file for QC analysis (not used in all pipelines) if 'libraries' in config: diff --git a/workflow/rules/gex.smk b/workflow/rules/gex.smk index 6b19080..8227268 100644 --- a/workflow/rules/gex.smk +++ b/workflow/rules/gex.smk @@ -13,7 +13,8 @@ pipeline_output += expand( ) # CellRanger aggregate analysis -pipeline_output += [join(workpath, 'aggregate.complete')] +if aggr != "": + pipeline_output += [join(workpath, 'aggregate.complete')] # Seurat inital sample QC pipeline_output += expand( @@ -38,13 +39,35 @@ def filterFastq(wildcards): def count_intron(wildcards): """ Wrapper to decide whether to include introns for counting. - See config['options']['exclude-introns'] for the encoded value. + See config['options']['exclude_introns'] for the encoded value. """ if exclude_introns: return('--include-introns false') else: return('') +def count_bam(wildcards): + """ + Wrapper to decide whether to create BAM files during Cell Ranger alignment. + See config['options']['create_bam'] for the encoded value. + """ + if create_bam: + return('') + else: + return('--no-bam') + +def aggr_norm(wildcards): + """ + Wrapper to decide which normalization method to use for Cell Ranger aggregate. + If no option was provided while running then aggregate will not be run. + See config['options']['aggregate'] for the encoded value. + """ + supported = ['mapped', 'none'] + if aggr.lower() in supported: + return(aggr.lower()) + else: + raise Exception(f"\nAn unsupported input was provided for the aggregate flag. Please check the aggregate value under options in the config file to change it so that either the entry is left blank or is one of the following: {supported}\n\nThe currently provided value is: {aggr}\n") + def filterFile(wildcards): """ Wrapper to decide whether to provide a filter file for Seurat @@ -69,6 +92,7 @@ rule count: prefix = "{sample}", transcriptome = config["references"][genome]["gex_transcriptome"], excludeintrons = count_intron, + createbam = count_bam, fastqs = filterFastq envmodules: config["tools"]["cellranger"] shell: @@ -85,6 +109,7 @@ rule count: --transcriptome {params.transcriptome} \\ --fastqs {params.fastqs} \\ {params.excludeintrons} \\ + {params.createbam} \\ 2>{log.err} 1>{log.log} """ @@ -128,14 +153,15 @@ rule aggregate: log="run_10x_aggregate.log" params: rname = "aggregate", - id = "AggregateDatasets" + id = "AggregateDatasets", + norm = aggr_norm envmodules: config["tools"]["cellranger"] shell: """ cellranger aggr \\ --id {params.id} \\ --csv {input.csv} \\ - --normalize=none \\ + --normalize={params.norm} \\ 2>{log.err} 1>{log.log} """ @@ -200,4 +226,3 @@ rule cellFilterSummary: module load R/4.3.0 Rscript {params.script} --datapath {params.seuratdir} --filename {params.filename} --output {output.cell_filter_summary} """ - From debd415f8f2168a3d2971686533619af8f9a70a4 Mon Sep 17 00:00:00 2001 From: chenv3 <43543446+chenv3@users.noreply.github.com> Date: Wed, 25 Oct 2023 13:59:09 -0400 Subject: [PATCH 10/18] Add cleanup rules for GEX pipeline --- workflow/rules/gex.smk | 38 +++++++++++++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/workflow/rules/gex.smk b/workflow/rules/gex.smk index 8227268..e56a8a8 100644 --- a/workflow/rules/gex.smk +++ b/workflow/rules/gex.smk @@ -12,10 +12,20 @@ pipeline_output += expand( sample=samples ) -# CellRanger aggregate analysis +# CellRanger counts intermediate cleanup +pipeline_output += expand( + join(workpath, "cleanup", "{sample}.samplecleanup"), + sample=samples +) + +#CellRanger aggregate if aggr != "": + # CellRanger aggregate analysis pipeline_output += [join(workpath, 'aggregate.complete')] + # CellRanger aggregate intermediate cleanup + pipeline_output += [join(workpath, "cleanup", "aggregate.aggregatecleanup")] + # Seurat inital sample QC pipeline_output += expand( join(workpath, "seurat", "{sample}", "seur_cluster.rds"), @@ -226,3 +236,29 @@ rule cellFilterSummary: module load R/4.3.0 Rscript {params.script} --datapath {params.seuratdir} --filename {params.filename} --output {output.cell_filter_summary} """ + +rule sampleCleanup: + input: + html = rules.count.output.html + output: + cleanup = touch(join(workpath, "cleanup", "{sample}.samplecleanup")) + params: + rname = "sampleCleanup", + cr_temp = join(workpath, "{sample}", "SC_RNA_COUNTER_CS") + shell: + """ + rm -r {params.cr_temp} + """ + +rule aggregateCleanup: + input: + rules.aggregate.output + output: + cleanup = touch(join(workpath, "cleanup", "{sample}.aggregatecleanup")) + params: + rname = "aggregateCleanup", + cr_temp = join(workpath, "AggregateDatasets", "SC_RNA_AGGREGATOR_CS") + shell: + """ + rm -r {params.cr_temp} + """ From 3da4d7c2ef5f710ac12c4483507a2061b4c14b93 Mon Sep 17 00:00:00 2001 From: chenv3 <43543446+chenv3@users.noreply.github.com> Date: Wed, 25 Oct 2023 15:54:25 -0400 Subject: [PATCH 11/18] Add eample of filter flag in help documentation --- cell-seek | 1 + 1 file changed, 1 insertion(+) diff --git a/cell-seek b/cell-seek index b12a998..a3abb62 100755 --- a/cell-seek +++ b/cell-seek @@ -435,6 +435,7 @@ def parsed_arguments(name, description): Entries that ends with low will be treated as the lower threshold. Valid metadata names include nCount_RNA, nFeature_RNA, and percent.mito. + Example: --filter filter.csv --create-bam Create bam files. By default the no-bam flag is used when running Cell Ranger. Use this flag to ensure that a bam file is created for From 241dad4d3ecb3e30bafd7a77446bf250ed485201 Mon Sep 17 00:00:00 2001 From: chenv3 <43543446+chenv3@users.noreply.github.com> Date: Thu, 26 Oct 2023 12:53:40 -0400 Subject: [PATCH 12/18] Add sample cleanup rules to CITE and VDJ pipelines; Change CITE pipeline BAM file generation to off by default and add support for create-bam flag --- workflow/rules/cite.smk | 31 ++++++++++++++++++++++++++++++- workflow/rules/vdj.smk | 20 +++++++++++++++++++- 2 files changed, 49 insertions(+), 2 deletions(-) diff --git a/workflow/rules/cite.smk b/workflow/rules/cite.smk index de67097..5e47ce3 100755 --- a/workflow/rules/cite.smk +++ b/workflow/rules/cite.smk @@ -18,6 +18,12 @@ pipeline_output += expand( sample=lib_samples ) +# CellRanger counts intermediate cleanup +pipeline_output += expand( + join(workpath, "cleanup", "{sample}.samplecleanup"), + sample=lib_samples +) + # Get set of input paths input_paths = [os.path.dirname(p) for p in inputs] input_paths_set = [] @@ -38,6 +44,15 @@ def count_intron(wildcards): else: return('') +def count_bam(wildcards): + """ + Wrapper to decide whether to create BAM files during Cell Ranger alignment. + See config['options']['create_bam'] for the encoded value. + """ + if create_bam: + return('') + else: + return('--no-bam') # Rule definitions rule librariesCSV: @@ -72,7 +87,8 @@ rule count: prefix = "{sample}", # numcells = lambda wildcards:s2c[wildcards.sample], transcriptome = config["references"][genome]["cite_transcriptome"], - introns = count_intron + introns = count_intron, + createbam = count_bam envmodules: config["tools"]["cellranger"] shell: """ @@ -88,6 +104,7 @@ rule count: --libraries={input.lib} \\ --feature-ref={input.features} \\ {params.introns} \\ + {params.createbam} \\ 2>{log.err} 1>{log.log} """ @@ -204,3 +221,15 @@ rule seurat_aggregate_rmd_report: R -e "rmarkdown::render('{params.seurat}', params=list(workdir = '{params.outdir}', sample='{params.sample}'), output_file = '{params.html}')" """ +rule sampleCleanup: + input: + rules.count.output + output: + cleanup = touch(join(workpath, "cleanup", "{sample}.samplecleanup")) + params: + rname = "sampleCleanup", + cr_temp = join(workpath, "{sample}", "SC_RNA_COUNTER_CS") + shell: + """ + rm -r {params.cr_temp} + """ diff --git a/workflow/rules/vdj.smk b/workflow/rules/vdj.smk index ef45b23..e401fc5 100644 --- a/workflow/rules/vdj.smk +++ b/workflow/rules/vdj.smk @@ -12,6 +12,12 @@ pipeline_output += expand( sample=samples ) +# CellRanger counts intermediate cleanup +pipeline_output += expand( + join(workpath, "cleanup", "{sample}.samplecleanup"), + sample=samples +) + def filterFastq(wildcards): return(','.join(set([os.path.dirname(i) for i in input_fastq if len(re.findall(f"{wildcards.sample}_[\w]*R2[\w]*.fastq.gz", i)) > 0]))) @@ -48,7 +54,6 @@ rule count: 2>{log.err} 1>{log.log} """ -print(expand(join(workpath, "finalreport", "summaries", "{sample}_web_summary.html"), sample=samples)) rule summaryFiles: input: expand(join(workpath, "{sample}", "outs", "web_summary.html"), sample=samples) @@ -64,3 +69,16 @@ rule summaryFiles: """ python {params.summarize} """ + +rule sampleCleanup: + input: + rules.count.output + output: + cleanup = touch(join(workpath, "cleanup", "{sample}.samplecleanup")) + params: + rname = "sampleCleanup", + cr_temp = join(workpath, "{sample}", "SC_VDJ_ASSEMBLER_CS") + shell: + """ + rm -r {params.cr_temp} + """ From 4e8e51a74287c1d6695c53a695e18172c6ed5d2a Mon Sep 17 00:00:00 2001 From: chenv3 <43543446+chenv3@users.noreply.github.com> Date: Thu, 26 Oct 2023 12:57:59 -0400 Subject: [PATCH 13/18] Edit generate summary file scripts to not check for intermediate cell ranger folder, and base and ATAC versions had column header code edited to keep 10x column header order for additional columns --- workflow/scripts/generateSummaryFiles.py | 2 +- workflow/scripts/generateSummaryFiles_ATAC.py | 4 +--- workflow/scripts/generateSummaryFiles_MULTI.py | 2 -- workflow/scripts/generateSummaryFiles_MULTIOME.py | 2 -- 4 files changed, 2 insertions(+), 8 deletions(-) diff --git a/workflow/scripts/generateSummaryFiles.py b/workflow/scripts/generateSummaryFiles.py index 3b71a21..9810754 100755 --- a/workflow/scripts/generateSummaryFiles.py +++ b/workflow/scripts/generateSummaryFiles.py @@ -56,7 +56,7 @@ def createMetricsSummary(arg1): if len(set(finalheaders).difference(header)) == 0: finalheaders = header else: - finalheaders = header + list(set(finalheaders).difference(header)) + finalheaders = finalheaders + [heading for heading in header if heading in set(header).difference(finalheaders)] row = 1 samples = list() diff --git a/workflow/scripts/generateSummaryFiles_ATAC.py b/workflow/scripts/generateSummaryFiles_ATAC.py index a5b39e7..0de9967 100755 --- a/workflow/scripts/generateSummaryFiles_ATAC.py +++ b/workflow/scripts/generateSummaryFiles_ATAC.py @@ -24,8 +24,6 @@ def createMetricsSummary(arg1): if not os.path.isdir(metricsPath): raise files = glob.glob('./*/outs/summary.csv') - #Filter out aggregate runs if they exist - files = [i for i in files if i.split('/')[1] in [j.split('/')[1] for j in glob.glob('./*/*COUNTER*')]] files.sort() workbook = xlsxwriter.Workbook(metricsPath + arg1+'.xlsx') @@ -55,7 +53,7 @@ def createMetricsSummary(arg1): if len(set(finalheaders).difference(header)) == 0: finalheaders = header else: - finalheaders = header + list(set(finalheaders).difference(header)) + finalheaders = finalheaders + [heading for heading in header if heading in set(header).difference(finalheaders)] row = 1 #samples = list() diff --git a/workflow/scripts/generateSummaryFiles_MULTI.py b/workflow/scripts/generateSummaryFiles_MULTI.py index 96c19e3..95f9bf1 100755 --- a/workflow/scripts/generateSummaryFiles_MULTI.py +++ b/workflow/scripts/generateSummaryFiles_MULTI.py @@ -28,8 +28,6 @@ def createMetricsSummary(arg1): files = glob.glob('./*/outs/per_sample_outs/*/metrics_summary.csv') - #Filter out aggregate runs if they exist - files = [i for i in files if i.split('/')[1] in [j.split('/')[1] for j in glob.glob('./*/*MULTI*')]] files.sort() stats = dict() diff --git a/workflow/scripts/generateSummaryFiles_MULTIOME.py b/workflow/scripts/generateSummaryFiles_MULTIOME.py index 9904592..8b0290e 100755 --- a/workflow/scripts/generateSummaryFiles_MULTIOME.py +++ b/workflow/scripts/generateSummaryFiles_MULTIOME.py @@ -33,8 +33,6 @@ def createMetricsSummary(arg1): if not os.path.isdir(metricsPath): raise files = glob.glob('./*/outs/summary.csv') - #Filter out aggregate runs if they exist - files = [i for i in files if i.split('/')[1] in [j.split('/')[1] for j in glob.glob('./*/*COUNTER*')]] files.sort() workbook = xlsxwriter.Workbook(metricsPath + arg1+'.xlsx') From 62441bb67833e911977664685d8879654910b793 Mon Sep 17 00:00:00 2001 From: chenv3 <43543446+chenv3@users.noreply.github.com> Date: Fri, 27 Oct 2023 12:18:53 -0400 Subject: [PATCH 14/18] Add cleanup functionality to ATAC pipeline --- workflow/rules/atac.smk | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/workflow/rules/atac.smk b/workflow/rules/atac.smk index d35040f..584fc17 100644 --- a/workflow/rules/atac.smk +++ b/workflow/rules/atac.smk @@ -12,6 +12,12 @@ pipeline_output += expand( sample=samples ) +# CellRanger counts intermediate cleanup +pipeline_output += expand( + join(workpath, "cleanup", "{sample}.samplecleanup"), + sample=samples +) + def filterFastq(wildcards): return(','.join(set([os.path.dirname(i) for i in input_fastq if len(re.findall(f"{wildcards.sample}_[\w]*R2[\w]*.fastq.gz", i)) > 0]))) @@ -63,3 +69,16 @@ rule summaryFiles: """ python {params.summarize} """ + +rule sampleCleanup: + input: + rules.count.output + output: + cleanup = touch(join(workpath, "cleanup", "{sample}.samplecleanup")) + params: + rname = "sampleCleanup", + cr_temp = join(workpath, "{sample}", "SC_ATAC_COUNTER_CS") + shell: + """ + rm -r {params.cr_temp} + """ From 8265e3045f61dbaf87389c91526d86890b681066 Mon Sep 17 00:00:00 2001 From: chenv3 <43543446+chenv3@users.noreply.github.com> Date: Fri, 27 Oct 2023 12:20:24 -0400 Subject: [PATCH 15/18] Add cleanup functionality and change BAM file generation to off by default and add support for create-bam flag --- workflow/rules/multi.smk | 21 +++++++++++++++++++++ workflow/scripts/write_multiconfig.py | 8 +++++++- 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/workflow/rules/multi.smk b/workflow/rules/multi.smk index 5a5b451..ba70dc3 100644 --- a/workflow/rules/multi.smk +++ b/workflow/rules/multi.smk @@ -15,6 +15,11 @@ pipeline_output += expand( # Output from summaryFiles pipeline_output += [join(workpath, "finalreport", "metric_summary.xlsx")] +# CellRanger counts intermediate cleanup +pipeline_output += expand( + join(workpath, "cleanup", "{sample}.samplecleanup"), + sample=lib_samples +) # Get set of input paths @@ -37,6 +42,9 @@ def conditional_flags(wildcards): if exclude_introns: flags.append('--exclude_introns') + if create_bam: + flags.append('--create_bam') + # if forcecells: # flags.append('--force') @@ -136,3 +144,16 @@ rule summaryFiles: """ python {params.summarize} """ + +rule sampleCleanup: + input: + html = rules.multi.output + output: + cleanup = touch(join(workpath, "cleanup", "{sample}.samplecleanup")) + params: + rname = "sampleCleanup", + cr_temp = join(workpath, "{sample}", "SC_MULTI_CS") + shell: + """ + rm -r {params.cr_temp} + """ diff --git a/workflow/scripts/write_multiconfig.py b/workflow/scripts/write_multiconfig.py index 323a0ce..ca4a7a7 100755 --- a/workflow/scripts/write_multiconfig.py +++ b/workflow/scripts/write_multiconfig.py @@ -35,6 +35,8 @@ def main(raw_args=None): help="Use force-cells flag instead of expect-cells") parser.add_argument("-i", "--exclude_introns", action="store_true", help="Set include-introns flag to false") + parser.add_argument("-b", "--create_bam", action="store_true", + help="Set no-bam flag to false") args = parser.parse_args(raw_args) @@ -52,6 +54,10 @@ def main(raw_args=None): spamwriter.writerow(['cmo-set', args.cmoref]) if args.exclude_introns: spamwriter.writerow(['include-introns', 'false']) + if args.create_bam: + spamwriter.writerow(['no-bam', 'false']) + else: + spamwriter.writerow(['no-bam', 'true']) if args.feature != None: spamwriter.writerow([]) @@ -74,7 +80,7 @@ def main(raw_args=None): line = line.strip().split(',') spamwriter.writerow([line[1], line[0], 'Any', line[2]]) - if args.cmoref != None or args.cmosample != None: + if args.cmoref != None or args.cmosample != None: spamwriter.writerow([]) spamwriter.writerow(['[samples]']) spamwriter.writerow(['sample_id', 'cmo_ids', 'description']) From e387da2613575ae7ea83c7dd8119f6ba116eba8c Mon Sep 17 00:00:00 2001 From: chenv3 <43543446+chenv3@users.noreply.github.com> Date: Thu, 16 Nov 2023 12:24:22 -0500 Subject: [PATCH 16/18] Add intermediate folder cleanup to multiome pipeline --- workflow/rules/multiome.smk | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/workflow/rules/multiome.smk b/workflow/rules/multiome.smk index 8383b93..bccadda 100644 --- a/workflow/rules/multiome.smk +++ b/workflow/rules/multiome.smk @@ -17,6 +17,14 @@ pipeline_output += expand( join(workpath, "finalreport", "summaries", "{sample}_web_summary.html"), sample=lib_samples ) +pipeline_output += [join(workpath, "finalreport", "metric_summary.xlsx")] + +# CellRanger counts intermediate cleanup +pipeline_output += expand( + join(workpath, "cleanup", "{sample}.samplecleanup"), + sample=samples +) + # Get set of input paths input_paths = [os.path.dirname(p) for p in inputs] @@ -39,6 +47,17 @@ def count_introns(wildcards): return('') +def count_bam(wildcards): + """ + Wrapper to decide whether to create BAM files during Cell Ranger alignment - currently unused + See config['options']['create_bam'] for the encoded value. + """ + if create_bam: + return('') + else: + return('--no-bam') + + # Rule definitions rule librariesCSV: output: @@ -103,3 +122,15 @@ rule summaryFiles: python {params.summarize} """ +rule sampleCleanup: + input: + rules.count.output + output: + cleanup = touch(join(workpath, "cleanup", "{sample}.samplecleanup")) + params: + rname = "sampleCleanup", + cr_temp = join(workpath, "{sample}", "SC_ATAC_GEX_COUNTER_CS") + shell: + """ + rm -r {params.cr_temp} + """ From bb46f152eff93deb418028b96c136996bfabb1ee Mon Sep 17 00:00:00 2001 From: chenv3 <43543446+chenv3@users.noreply.github.com> Date: Thu, 16 Nov 2023 12:25:01 -0500 Subject: [PATCH 17/18] seuratSampleQC.R - fix typo and add additional resolutions for clustering --- workflow/scripts/seuratSampleQC.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflow/scripts/seuratSampleQC.R b/workflow/scripts/seuratSampleQC.R index f2522ac..be5e087 100644 --- a/workflow/scripts/seuratSampleQC.R +++ b/workflow/scripts/seuratSampleQC.R @@ -154,7 +154,7 @@ figures$PreFilter_UMAP_RNA <- DimPlot(seur, reduction='umap', label = TRUE) + gg figures$PreFilter_UMAP_RNA_Filter <- DimPlot(seur, reduction='umap', label = TRUE, cells.highlight=list("Filtered Cells"=cellsToRemove)) + ggtitle("Pre-Filter UMAP - Filtered Cells") + theme(plot.title = element_text(hjust = 0.5)) -seur <- subset(seur, cells = cellsToRemove, inver=T) +seur <- subset(seur, cells = cellsToRemove, invert=T) ## ----Post-Filter Gene Plot---- #Post-Filter Plots @@ -183,7 +183,7 @@ seur <- RunPCA(seur, npcs=50, features = VariableFeatures(object = seur)) seur <- FindNeighbors(seur, dims = 1:30) seur <- RunUMAP(seur, reduction = 'pca', dims = 1:30, assay = 'RNA') -for(resolution in seq(0.2,0.8,0.2)){ +for(resolution in c(seq(0.2,1.0,0.2), 1.5, 2.0)){ seur <- FindClusters(seur, resolution = resolution) } From b8ed5dfb63c973a058e20592815248e2028458b3 Mon Sep 17 00:00:00 2001 From: chenv3 <43543446+chenv3@users.noreply.github.com> Date: Thu, 16 Nov 2023 17:43:37 -0500 Subject: [PATCH 18/18] Update run documentation and organizing flags by version --- docs/usage/run.md | 734 +++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 662 insertions(+), 72 deletions(-) diff --git a/docs/usage/run.md b/docs/usage/run.md index 47091a8..febe43b 100644 --- a/docs/usage/run.md +++ b/docs/usage/run.md @@ -1,22 +1,20 @@ # cell-seek run -## 1. About +## 1. About The `cell-seek` executable is composed of several inter-related sub commands. Please see `cell-seek -h` for all available options. -This part of the documentation describes options and concepts for cell-seek run sub command in more detail. With minimal configuration, the **`run`** sub command enables you to start running cell-seek pipeline. +This part of the documentation describes options and concepts for cell-seek run sub command in more detail. The following page lists only the options applicable when selecting the --version gex flag. With minimal configuration, the **`run`** sub command enables you to start running cell-seek pipeline. -Setting up the cell-seek pipeline is fast and easy! In its most basic form, cell-seek run only has *two required inputs*. +Setting up the cell-seek pipeline is fast and easy! In its most basic form, cell-seek run only has *four required inputs*. ## 2. Synopsis ```text $ cell-seek run [--help] \ - [--mode {slurm,local}] [--job-name JOB_NAME] [--batch-id BATCH_ID] \ - [--tmp-dir TMP_DIR] [--silent] [--sif-cache SIF_CACHE] \ - [--singularity-cache SINGULARITY_CACHE] \ - [--dry-run] [--threads THREADS] \ - [--libraries LIBRARIES] [--features FEATURES] \ - [--cmo-reference CMOREFERENCE] [--cmo-sample CMOSAMPLE] \ - [--exclude-introns] \ + [--dry-run] [--job-name JOB_NAME] [--mode {{slurm,local}}] \ + [--sif-cache SIF_CACHE] [--singularity-cache SINGULARITY_CACHE] \ + [--silent] [--threads THREADS] [--tmp-dir TMP_DIR] \ + [--aggregate {{mapped, none}}] [--exclude-introns] \ + [--filter FILTER] [--create-bam] \ --input INPUT [INPUT ...] \ --output OUTPUT \ --version {gex, ...} \ @@ -27,18 +25,22 @@ The synopsis for each command shows its arguments and their usage. Optional argu A user **must** provide a list of FastQ (globbing is supported) to analyze via `--input` argument, an output directory to store results via `--output` argument, the version of the pipeline to run via `--version` argument, and the reference genome to use via `--genome` argument. -Use you can always use the `-h` option for information on a specific command. +Use you can always use the `-h` option for information on a specific command. -### 2.1 Required arguments +The following is a breakdown of the required and optional arguments for each of the versions of the pipeline. + +### 2.1 GEX + +#### 2.1.1 Required arguments Each of the following arguments are required. Failure to provide a required argument will result in a non-zero exit-code. `--input INPUT [INPUT ...]` > **Input FastQ file(s).** > *type: file(s)* -> +> > One or more FastQ files can be provided. The pipeline does NOT support single-end data. From the command-line, each input file should seperated by a space. Globbing is supported! This makes selecting FastQ files easy. Input FastQ files should always be gzipp-ed. -> +> > ***Example:*** `--input .tests/*.R?.fastq.gz` --- @@ -47,16 +49,16 @@ Each of the following arguments are required. Failure to provide a required argu > *type: path* > > This location is where the pipeline will create all of its output files, also known as the pipeline's working directory. If the provided output directory does not exist, it will be created automatically. -> +> > ***Example:*** `--output /data/$USER/cell-seek_out` --- - `--version {gex, vdj, cite, multi, atac, multiome}` + `--version gex` > **The version of the pipeline to run.** > *type: string* > -> This option selects the version of the pipeline to run. Can be chosen from the options gene expression (GEX), VDJ, feature barcode (CITE), Cell Ranger multi (Multi), ATAC, or multiome. -> +> This option selects the version of the pipeline to run. The documentation provided is based on choosing the option for gene expression (GEX). +> > ***Example:*** `--version gex` --- @@ -65,27 +67,169 @@ Each of the following arguments are required. Failure to provide a required argu > *type: string* > > This option defines the reference genome of the samples. cell-seek does comes bundled with prebuilt reference files for human and mouse samples, e.g. hg38 or mm10. Please select one of the following options: hg38, mm10 -> +> +> ***Example:*** `--genome hg38` + +#### 2.1.2 Analysis options + +Each of the following arguments are optional, and do not need to be provided. + +`--aggregate {mapped, none}` +> **Cell Ranger aggregate normalization.** +> *type: string* +> +> This option defines the normalization mode that should be used. Mapped is what Cell Ranger would run by default, which subsamples reads from higher depth samples until each library type has an equal number of reads per cell that are confidently mapped. None means to not normalize at all. If this flag is not used then aggregate will not be run. To run Cell Ranger aggregate, please select one of the following options: mapped, none. +> +> ***Example:*** `--aggregate mapped` + + +--- + `--exclude-introns` +> **Exclude introns from the count alignment.** +> *type: boolean flag* +> +> Turns off the option of including introns when performing alignment. This flag is only applicable when dealing with gene expression related data. +> +> ***Example:*** `--exclude-introns` + + +--- + `--create-bam` +> **Create bam files.** +> *type: boolean flag* +> +> By default the no-bam flag is used when running Cell Ranger. Use this flag to ensure that a bam file is created for each sample during analysis. This flag is only applicable when dealing with gene expression related data. +> +> ***Example:*** `--create-bam` + +--- + `--filter FILTER` +> **Filter threshold file.** +> *type: file* +> +> Filter threshold file. A CSV file containing the different thresholds to be applied for individual samples within the project during the QC analysis. The file should contain a header row with Sample as the column name for the sample IDs, and the name of each metric that will be filtered along with if it is the high or low threshold for that metric. Each row is then the entries for each sample that the manual thresholds will be applied. If no file is provided then the default thresholds will be used. If a cell is left blank for a sample then that sample would not be filtered based on that criteria. This flag is currently only applicable when dealing with GEX projects. +> +> *Here is an example filter.csv file:* +> ``` +> Sample,nFeature_RNA_low,nFeature_RNA_high,percent.mito_high +> sample1,500,6000,15 +> sample2,500,6000,5 +> sample4,500,6000,5 +> ``` +> +> *Where:* +> +> - *Sample:* Unique sample ID that should match the sample name used for Cell Ranger count. +> - *nFeature_RNA_low,nFeature_RNA_high,percent.mito_high:* Example entries that can be used for manual thresholding. The column names need to be formatted as metadataname_high/low. Entries that ends with high will be treated as the upper threshold. Entries that ends with low will be treated as the lower threshold. Valid metadata names include nCount_RNA, nFeature_RNA, and percent.mito. +> +> ***Example:*** `--filter filter.csv` + + + +### 2.2 VDJ + +#### 2.2.1 Required arguments + +Each of the following arguments are required. Failure to provide a required argument will result in a non-zero exit-code. + + `--input INPUT [INPUT ...]` +> **Input FastQ file(s).** +> *type: file(s)* +> +> One or more FastQ files can be provided. The pipeline does NOT support single-end data. From the command-line, each input file should seperated by a space. Globbing is supported! This makes selecting FastQ files easy. Input FastQ files should always be gzipp-ed. +> +> ***Example:*** `--input .tests/*.R?.fastq.gz` + +--- + `--output OUTPUT` +> **Path to an output directory.** +> *type: path* +> +> This location is where the pipeline will create all of its output files, also known as the pipeline's working directory. If the provided output directory does not exist, it will be created automatically. +> +> ***Example:*** `--output /data/$USER/cell-seek_out` + +--- + `--version vdj` +> **The version of the pipeline to run.** +> *type: string* +> +> This option selects the version of the pipeline to run. The documentation provided is based on choosing the option for VDJ. +> +> ***Example:*** `--version vdj` + +--- + `--genome {hg38, mm10}` +> **Reference genome.** +> *type: string* +> +> This option defines the reference genome of the samples. cell-seek does comes bundled with prebuilt reference files for human and mouse samples, e.g. hg38 or mm10. Please select one of the following options: hg38, mm10 +> > ***Example:*** `--genome hg38` -### 2.2 Analysis options +#### 2.2.2 Analysis options -Each of the following arguments are optional, and do not need to be provided. +The VDJ pipeline currently does not have any applicable analysis flags. - `--libraries LIBRARIES` +### 2.3 CITE + +#### 2.3.1 Required arguments + +Each of the following arguments are required. Failure to provide a required argument will result in a non-zero exit-code. + + `--input INPUT [INPUT ...]` +> **Input FastQ file(s).** +> *type: file(s)* +> +> One or more FastQ files can be provided. The pipeline does NOT support single-end data. From the command-line, each input file should seperated by a space. Globbing is supported! This makes selecting FastQ files easy. Input FastQ files should always be gzipp-ed. +> +> ***Example:*** `--input .tests/*.R?.fastq.gz` + +--- + `--output OUTPUT` +> **Path to an output directory.** +> *type: path* +> +> This location is where the pipeline will create all of its output files, also known as the pipeline's working directory. If the provided output directory does not exist, it will be created automatically. +> +> ***Example:*** `--output /data/$USER/cell-seek_out` + +--- + `--version cite` +> **The version of the pipeline to run.** +> *type: string* +> +> This option selects the version of the pipeline to run. The documentation provided is based on choosing the option for CITE-seek. +> +> ***Example:*** `--version cite` + +--- + `--genome {hg38, mm10}` +> **Reference genome.** +> *type: string* +> +> This option defines the reference genome of the samples. cell-seek does comes bundled with prebuilt reference files for human and mouse samples, e.g. hg38 or mm10. Please select one of the following options: hg38, mm10 +> +> ***Example:*** `--genome hg38` + + +--- +`--libraries LIBRARIES` > **Libraries file.** > *type: file* > -> A CSV file containing information about each library. This file is used in feature barcode (cite), multi, and multiome analysis. It contains each sample's name, flowcell, demultiplexed name, and library type. More information about the libraries file and its requirements can be found on the [10x Genomics website](https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/feature-bc-analysis). +> A CSV file containing information about each library. TIt contains each sample's name, flowcell, demultiplexed name, and library type. More information about the libraries file and its requirements can be found on the [10x Genomics website](https://www.10xgenomics.com/support/software/cell-ranger/analysis/running-pipelines/cr-feature-bc-analysis). > *Here is an example libraries.csv file:* > ``` > Name,Flowcell,Sample,Type > IL15_LNs,H7CNNBGXG,IL15_LNs,Gene Expression -> IL15_LNs,H7CT7BGXG,IL15_LNs_BC,Antibody Capture -> ``` +> IL15_LNs,H7CT7BGXG,IL15_LNs_ADT,Antibody Capture +> EGR1,H7CNNBGXG,EGR1_GEX,Gene Expression +> EGR1,H7CT7BGXG,EGR1_ADT,Antibody Capture +> ``` -> *Where:* +> *Where:* > - *Name:* name of the sample passed to CellRanger. > - *Flowcell:* The flowcell ID that contains the FASTQ files for this set of data. @@ -95,30 +239,30 @@ Each of the following arguments are optional, and do not need to be provided. > * CRISPR Guide Capture > * Antibody Capture > * Custom - +> > ***Example:*** `--libraries libraries.csv` --- - `--features FEATURES` +`--features FEATURES` > **Features file.** > *type: file* > -> A feature reference CSV file containing information for processing a feature barcode data. This file is used in feature barcode, and may be used in multi analysis. This file should contain a unique ID for the feature, a human readable name, sequence, feature type, read, and pattern. More information about the libraries file and its requirements can be found on the [10x Genomics website](https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/feature-bc-analysis). +> A feature reference CSV file containing information for processing a feature barcode data. This file should contain a unique ID for the feature, a human readable name, sequence, feature type, read, and pattern. More information about the libraries file and its requirements can be found on the [10x Genomics website](https://www.10xgenomics.com/support/software/cell-ranger/analysis/running-pipelines/cr-feature-bc-analysis). > *Here is an example features.csv file:* > ``` > id,name,sequence,feature_type,read,pattern > CITE_CD64,CD64,AGTGGG,Antibody Capture,R2,5PNN(BC) > CITE_CD8,CD8,TCACCGT,Antibody Capture,R2,5PNNN(BC) -> ``` +> ``` > *Where:* -> - *id:* Unique ID for this feature. Must not contain whitespace, quote or comma characters. Each ID must be unique and must not collide with a gene identifier from the transcriptome. -> - *name:* Human-readable name for this feature. Must not contain whitespace. -> - *sequence:* Nucleotide barcode sequence associated with this feature, e.g. the antibody barcode or sgRNA protospacer sequence. -> - *read:* Specifies which RNA sequencing read contains the Feature Barcode sequence. Must be R1 or R2, but in most cases R2 is the correct read. +> - *id:* Unique ID for this feature. Must not contain whitespace, quote or comma characters. Each ID must be unique and must not collide with a gene identifier from the transcriptome. +> - *name:* Human-readable name for this feature. Must not contain whitespace. +> - *sequence:* Nucleotide barcode sequence associated with this feature, e.g. the antibody barcode or sgRNA protospacer sequence. +> - *read:* Specifies which RNA sequencing read contains the Feature Barcode sequence. Must be R1 or R2, but in most cases R2 is the correct read. > - *pattern:* Specifies how to extract the sequence of the feature barcode from the read. > - *Type:* Type of the feature. List of supported options: @@ -126,70 +270,329 @@ Each of the following arguments are optional, and do not need to be provided. > * CRISPR Guide Capture > * Antibody Capture > * Custom +> +> ***Example:*** `--features features.csv` + +#### 2.3.2 Analysis options +`--exclude-introns` +> **Exclude introns from the count alignment.** +> *type: boolean flag* +> +> Turns off the option of including introns when performing alignment. This flag is only applicable when dealing with gene expression related data. +> +> ***Example:*** `--exclude-introns` + + +--- + `--create-bam` +> **Create bam files.** +> *type: boolean flag* +> +> By default the no-bam flag is used when running Cell Ranger. Use this flag to ensure that a bam file is created for each sample during analysis. This flag is only applicable when dealing with gene expression related data. +> +> ***Example:*** `--create-bam` + +### 2.4 MULTI + +There are multiple different combinations of library types that may result in the use of Cell Ranger `multi` analysis. Any combination that combines GEX and VDJ data for cell calls, or the use of HTO with the Cell Ranger hashtag caller would need `multi` analysis. + +#### 2.4.1 Required arguments + +Each of the following arguments are required. Failure to provide a required argument will result in a non-zero exit-code. + + `--input INPUT [INPUT ...]` +> **Input FastQ file(s).** +> *type: file(s)* +> +> One or more FastQ files can be provided. The pipeline does NOT support single-end data. From the command-line, each input file should seperated by a space. Globbing is supported! This makes selecting FastQ files easy. Input FastQ files should always be gzipp-ed. +> +> ***Example:*** `--input .tests/*.R?.fastq.gz` + +--- + `--output OUTPUT` +> **Path to an output directory.** +> *type: path* +> +> This location is where the pipeline will create all of its output files, also known as the pipeline's working directory. If the provided output directory does not exist, it will be created automatically. +> +> ***Example:*** `--output /data/$USER/cell-seek_out` + +--- + `--version multi` +> **The version of the pipeline to run.** +> *type: string* +> +> This option selects the version of the pipeline to run. The documentation provided is based on choosing the option for multi analysis. +> +> ***Example:*** `--version multi` + +--- + `--genome {hg38, mm10}` +> **Reference genome.** +> *type: string* +> +> This option defines the reference genome of the samples. cell-seek does comes bundled with prebuilt reference files for human and mouse samples, e.g. hg38 or mm10. Please select one of the following options: hg38, mm10 +> +> ***Example:*** `--genome hg38` + + +--- +`--libraries LIBRARIES` +> **Libraries file.** +> *type: file* +> +> A CSV file containing information about each library. TIt contains each sample's name, flowcell, demultiplexed name, and library type. More information about the libraries file and its requirements can be found on the [10x Genomics website](https://www.10xgenomics.com/support/software/cell-ranger/analysis/running-pipelines/cr-feature-bc-analysis). + +> *Here is an example libraries.csv file:* +> ``` +> Name,Flowcell,Sample,Type +> IL15_LNs,H7CNNBGXG,IL15_LNs,Gene Expression +> IL15_LNs,H7CT7BGXG,IL15_LNs_BC,Antibody Capture +> ``` + +> *Where:* + +> - *Name:* name of the sample passed to CellRanger. +> - *Flowcell:* The flowcell ID that contains the FASTQ files for this set of data. +> - *Sample:* Name that was used when demultiplexing, this should match the FASTQ files. +> - *Type:* library type for each sample. List of supported options: +> * Gene Expression +> * CRISPR Guide Capture +> * Antibody Capture +> * Custom +> +> ***Example:*** `--libraries libraries.csv` + +#### 2.4.2 Analysis options + +Each of the following arguments are optional, and do not need to be provided. + +`--features FEATURES` +> **Features file.** +> *type: file* +> +> A feature reference CSV file containing information for processing a feature barcode data. This file is used in feature barcode, and may be used in multi analysis. This file should contain a unique ID for the feature, a human readable name, sequence, feature type, read, and pattern. More information about the libraries file and its requirements can be found on the [10x Genomics website](https://www.10xgenomics.com/support/software/cell-ranger/analysis/running-pipelines/cr-feature-bc-analysis). + +> *Here is an example features.csv file:* +> ``` +> id,name,sequence,feature_type,read,pattern +> CITE_CD64,CD64,AGTGGG,Antibody Capture,R2,5PNN(BC) +> CITE_CD8,CD8,TCACCGT,Antibody Capture,R2,5PNNN(BC) +> ``` + +> *Where:* + +> - *id:* Unique ID for this feature. Must not contain whitespace, quote or comma characters. Each ID must be unique and must not collide with a gene identifier from the transcriptome. +> - *name:* Human-readable name for this feature. Must not contain whitespace. +> - *sequence:* Nucleotide barcode sequence associated with this feature, e.g. the antibody barcode or sgRNA protospacer sequence. +> - *read:* Specifies which RNA sequencing read contains the Feature Barcode sequence. Must be R1 or R2, but in most cases R2 is the correct read. +> - *pattern:* Specifies how to extract the sequence of the feature barcode from the read. + +> - *Type:* Type of the feature. List of supported options: +> * Gene Expression +> * CRISPR Guide Capture +> * Antibody Capture +> * Custom +> > ***Example:*** `--features features.csv` --- - `--cmo-reference CMOREFERENCE` +`--cmo-reference CMOREFERENCE` > **CMO reference file.** > *type: file* > -> A CMO reference CSV file containing information for processing hashtag data, which is used in multi analysis if custom hashtags will be processed. This file should contain a unique ID for the hashtag, a human readable name, sequence, feature type, read, and pattern. More information about the cmo reference file and its requirements can be found on the [10x Genomics website](https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/multi). +> A CMO reference CSV file containing information for processing hashtag data, which is used in multi analysis if custom hashtags will be processed. This file should contain a unique ID for the hashtag, a human readable name, sequence, feature type, read, and pattern. More information about the cmo reference file and its requirements can be found on the [10x Genomics website](https://www.10xgenomics.com/support/software/cell-ranger/analysis/cr-3p-multi). > *Here is an example cmo_reference.csv file:* > ``` > id,name,sequence,feature_type,read,pattern > CMO301,CMO301,ATGAGGAATTCCTGC,Multiplexing Capture,R2,5P(BC) > CMO302,CMO302,CATGCCAATAGAGCG,Multiplexing Capture,R2,5P(BC) -> ``` +> ``` > *Where:* -> - *id:* Unique ID for this feature. Must not contain whitespace, quote or comma characters. Each ID must be unique and must not collide with a gene identifier from the transcriptome. -> - *name:* Human-readable name for this feature. Must not contain whitespace. +> - *id:* Unique ID for this feature. Must not contain whitespace, quote or comma characters. Each ID must be unique and must not collide with a gene identifier from the transcriptome. +> - *name:* Human-readable name for this feature. Must not contain whitespace. > - *sequence:* Nucleotide barcode sequence associated with this hashtag -> - *feature_type: Type of the feature. This should always be multiplexing capture. -> - *read:* Specifies which RNA sequencing read contains the Feature Barcode sequence. Must be R1 or R2, but in most cases R2 is the correct read. +> - *feature_type: Type of the feature. This should always be multiplexing capture. +> - *read:* Specifies which RNA sequencing read contains the Feature Barcode sequence. Must be R1 or R2, but in most cases R2 is the correct read. > - *pattern:* Specifies how to extract the sequence of the feature barcode from the read. +> > ***Example:*** `--cmo-reference cmo_reference.csv` --- - `--cmo-sample CMOSAMPLE` +`--cmo-sample CMOSAMPLE` > **CMO sample file.** > *type: file* > -> A CMO sample CSV file containing sample to hashtag information used for multi analysis. CMO IDs should match the ones used in the CMO reference file. If no CMO reference is provided then CMO ID should match the ones used by 10x. The same CMO sample will be used on all multi libraries. This file should contain a unique sample ID for the sample, and the CMO IDs associated with that sample. If more than one CMO ID is associated with a sample then a | should be used to separate the tags. More information and examples about the samples section of the multi config file and its requirements can be found on the [10x Genomics website](https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/multi). +> A CMO sample CSV file containing sample to hashtag information used for multi analysis. CMO IDs should match the ones used in the CMO reference file. If no CMO reference is provided then CMO ID should match the ones used by 10x. The same CMO sample will be used on all multi libraries. This file should contain a unique sample ID for the sample, and the CMO IDs associated with that sample. If more than one CMO ID is associated with a sample then a | should be used to separate the tags. More information and examples about the samples section of the multi config file and its requirements can be found on the [10x Genomics website](https://www.10xgenomics.com/support/software/cell-ranger/analysis/cr-3p-multi). > *Here is an example cmo_sample.csv file:* > ``` > sample_id,cmo_ids > sample1,CMO301 > sample2,CMO302|CMO303 -> ``` +> ``` > *Where:* -> - *sample_id:* Unique sample ID for this hashtagged sample. Must not contain whitespace, quote or comma characters. Each sample ID must be unique. +> - *sample_id:* Unique sample ID for this hashtagged sample. Must not contain whitespace, quote or comma characters. Each sample ID must be unique. > - *cmo_ids:* Unique CMO ID(s) that the sample is hashtagged with. Must match either entries in the cmo_reference.csv file or 10x CMO IDs. +> > ***Example:*** `--cmo-sample cmo_sample.csv` --- - `--exclude-introns` +`--exclude-introns` > **Exclude introns from the count alignment.** > *type: boolean flag* > > Turns off the option of including introns when performing alignment. This flag is only applicable when dealing with gene expression related data. +> > ***Example:*** `--exclude-introns` - -### 2.3 Orchestration options -Each of the following arguments are optional, and do not need to be provided. + +--- +`--create-bam` +> **Create bam files.** +> *type: boolean flag* +> +> By default the no-bam flag is used when running Cell Ranger. Use this flag to ensure that a bam file is created for each sample during analysis. This flag is only applicable when dealing with gene expression related data. +> +> ***Example:*** `--create-bam` + + +### 2.5 ATAC + +#### 2.5.1 Required arguments + +Each of the following arguments are required. Failure to provide a required argument will result in a non-zero exit-code. + + `--input INPUT [INPUT ...]` +> **Input FastQ file(s).** +> *type: file(s)* +> +> One or more FastQ files can be provided. The pipeline does NOT support single-end data. From the command-line, each input file should seperated by a space. Globbing is supported! This makes selecting FastQ files easy. Input FastQ files should always be gzipp-ed. +> +> ***Example:*** `--input .tests/*.R?.fastq.gz` + +--- + `--output OUTPUT` +> **Path to an output directory.** +> *type: path* +> +> This location is where the pipeline will create all of its output files, also known as the pipeline's working directory. If the provided output directory does not exist, it will be created automatically. +> +> ***Example:*** `--output /data/$USER/cell-seek_out` + +--- + `--version atac` +> **The version of the pipeline to run.** +> *type: string* +> +> This option selects the version of the pipeline to run. The documentation provided is based on choosing the option for ATAC. +> +> ***Example:*** `--version atac` + +--- + `--genome {hg38, mm10}` +> **Reference genome.** +> *type: string* +> +> This option defines the reference genome of the samples. cell-seek does comes bundled with prebuilt reference files for human and mouse samples, e.g. hg38 or mm10. Please select one of the following options: hg38, mm10 +> +> ***Example:*** `--genome hg38` + + +#### 2.5.2 Analysis options + +The ATAC pipeline currently does not have any applicable analysis flags. + +### 2.6 Multiome + +#### 2.6.1 Required arguments + +Each of the following arguments are required. Failure to provide a required argument will result in a non-zero exit-code. + + `--input INPUT [INPUT ...]` +> **Input FastQ file(s).** +> *type: file(s)* +> +> One or more FastQ files can be provided. The pipeline does NOT support single-end data. From the command-line, each input file should seperated by a space. Globbing is supported! This makes selecting FastQ files easy. Input FastQ files should always be gzipp-ed. +> +> ***Example:*** `--input .tests/*.R?.fastq.gz` + +--- + `--output OUTPUT` +> **Path to an output directory.** +> *type: path* +> +> This location is where the pipeline will create all of its output files, also known as the pipeline's working directory. If the provided output directory does not exist, it will be created automatically. +> +> ***Example:*** `--output /data/$USER/cell-seek_out` + +--- + `--version multiome` +> **The version of the pipeline to run.** +> *type: string* +> +> This option selects the version of the pipeline to run. The documentation provided is based on choosing the option for multiome. +> +> ***Example:*** `--version multiome` + +--- + `--genome {hg38, mm10}` +> **Reference genome.** +> *type: string* +> +> This option defines the reference genome of the samples. cell-seek does comes bundled with prebuilt reference files for human and mouse samples, e.g. hg38 or mm10. Please select one of the following options: hg38, mm10 +> +> ***Example:*** `--genome hg38` + + +--- +`--libraries LIBRARIES` +> **Libraries file.** +> *type: file* +> +> A CSV file containing information about each library. TIt contains each sample's name, flowcell, demultiplexed name, and library type. More information about the libraries file and its requirements can be found on the [10x Genomics website](https://support.10xgenomics.com/single-cell-multiome-atac-gex/software/pipelines/latest/using/count#library-csv). + +> *Here is an example libraries.csv file:* +> ``` +> Name,Flowcell,Sample,Type +> IL15_LNs,H7CNNBGXG,IL15_LNs,Gene Expression +> IL15_LNs,H7CT7BGXG,IL15_LNs_BC,Antibody Capture +> ``` + +> *Where:* + +> - *Name:* name of the sample passed to CellRanger. +> - *Flowcell:* The flowcell ID that contains the FASTQ files for this set of data. +> - *Sample:* Name that was used when demultiplexing, this should match the FASTQ files. +> - *Type:* library type for each sample. List of supported options: +> * Gene Expression +> * CRISPR Guide Capture +> * Antibody Capture +> * Custom +> +> ***Example:*** `--libraries libraries.csv` + + +#### 2.6.2 Analysis options + +The multiome pipeline currently does not have any applicable analysis flags. + + +### 2.7 Orchestration options + +Each of the following arguments are optional, and do not need to be provided. `--dry-run` > **Dry run the pipeline.** > *type: boolean flag* -> +> > Displays what steps in the pipeline remain or will be run. Does not execute anything! > > ***Example:*** `--dry-run` @@ -198,7 +601,7 @@ Each of the following arguments are optional, and do not need to be provided. `--silent` > **Silence standard output.** > *type: boolean flag* -> +> > Reduces the amount of information directed to standard output when submitting master job to the job scheduler. Only the job id of the master job is returned. > > ***Example:*** `--silent` @@ -208,15 +611,15 @@ Each of the following arguments are optional, and do not need to be provided. > **Execution Method.** > *type: string* > *default: slurm* -> -> Execution Method. Defines the mode or method of execution. Vaild mode options include: slurm or local. -> +> +> Execution Method. Defines the mode or method of execution. Vaild mode options include: slurm or local. +> > ***slurm*** > The slurm execution method will submit jobs to the [SLURM workload manager](https://slurm.schedmd.com/). It is recommended running cell-seek in this mode as execution will be significantly faster in a distributed environment. This is the default mode of execution. > > ***local*** -> Local executions will run serially on compute instance. This is useful for testing, debugging, or when a users does not have access to a high performance computing environment. If this option is not provided, it will default to a local execution mode. -> +> Local executions will run serially on compute instance. This is useful for testing, debugging, or when a users does not have access to a high performance computing environment. If this option is not provided, it will default to a local execution mode. +> > ***Example:*** `--mode slurm` --- @@ -224,9 +627,9 @@ Each of the following arguments are optional, and do not need to be provided. > **Set the name of the pipeline's master job.** > *type: string* > *default: pl:cell-seek* -> +> > When submitting the pipeline to a job scheduler, like SLURM, this option always you to set the name of the pipeline's master job. By default, the name of the pipeline's master job is set to "pl:cell-seek". -> +> > ***Example:*** `--job-name pl_id-42` --- @@ -235,8 +638,8 @@ Each of the following arguments are optional, and do not need to be provided. > *type: path* > *default: `--output OUTPUT/.singularity`* > -> Singularity will cache image layers pulled from remote registries. This ultimately speeds up the process of pull an image from DockerHub if an image layer already exists in the singularity cache directory. By default, the cache is set to the value provided to the `--output` argument. Please note that this cache cannot be shared across users. Singularity strictly enforces you own the cache directory and will return a non-zero exit code if you do not own the cache directory! See the `--sif-cache` option to create a shareable resource. -> +> Singularity will cache image layers pulled from remote registries. This ultimately speeds up the process of pull an image from DockerHub if an image layer already exists in the singularity cache directory. By default, the cache is set to the value provided to the `--output` argument. Please note that this cache cannot be shared across users. Singularity strictly enforces you own the cache directory and will return a non-zero exit code if you do not own the cache directory! See the `--sif-cache` option to create a shareable resource. +> > ***Example:*** `--singularity-cache /data/$USER/.singularity` --- @@ -245,7 +648,7 @@ Each of the following arguments are optional, and do not need to be provided. > *type: path* > > Uses a local cache of SIFs on the filesystem. This SIF cache can be shared across users if permissions are set correctly. If a SIF does not exist in the SIF cache, the image will be pulled from Dockerhub and a warning message will be displayed. The `cell-seek cache` subcommand can be used to create a local SIF cache. Please see `cell-seek cache` for more information. This command is extremely useful for avoiding DockerHub pull rate limits. It also remove any potential errors that could occur due to network issues or DockerHub being temporarily unavailable. We recommend running cell-seek with this option when ever possible. -> +> > ***Example:*** `--singularity-cache /data/$USER/SIFs` --- @@ -253,9 +656,9 @@ Each of the following arguments are optional, and do not need to be provided. > **Max number of threads for each process.** > *type: int* > *default: 2* -> +> > Max number of threads for each process. This option is more applicable when running the pipeline with `--mode local`. It is recommended setting this vaule to the maximum number of CPUs available on the host machine. -> +> > ***Example:*** `--threads 12` @@ -264,24 +667,26 @@ Each of the following arguments are optional, and do not need to be provided. > **Max number of threads for each process.** > *type: path* > *default: `/lscratch/$SLURM_JOBID`* -> +> > Path on the file system for writing temporary output files. By default, the temporary directory is set to '/lscratch/$SLURM_JOBID' for backwards compatibility with the NIH's Biowulf cluster; however, if you are running the pipeline on another cluster, this option will need to be specified. Ideally, this path should point to a dedicated location on the filesystem for writing tmp files. On many systems, this location is set to somewhere in /scratch. If you need to inject a variable into this string that should NOT be expanded, please quote this options value in single quotes. -> +> > ***Example:*** `--tmp-dir /scratch/$USER/` -### 2.4 Miscellaneous options -Each of the following arguments are optional, and do not need to be provided. +### 2.8 Miscellaneous options +Each of the following arguments are optional, and do not need to be provided. `-h, --help` > **Display Help.** > *type: boolean flag* -> +> > Shows command's synopsis, help message, and an example command -> +> > ***Example:*** `--help` ## 3. Example -```bash + +### 3.1 GEX Example +```bash # Step 1.) Grab an interactive node, # do not run on head node! srun -N 1 -n 1 --time=1:00:00 --mem=8gb --cpus-per-task=2 --pty bash @@ -297,8 +702,8 @@ module load singularity snakemake --dry-run # Step 2B.) Run the cell-seek pipeline -# The slurm mode will submit jobs to -# the cluster. It is recommended running +# The slurm mode will submit jobs to +# the cluster. It is recommended running # the pipeline in this mode. ./cell-seek run --input .tests/*.R?.fastq.gz \ --output /data/$USER/output \ @@ -306,3 +711,188 @@ module load singularity snakemake --genome hg38 \ --mode slurm ``` + +### 3.2 VDJ Example +```bash +# Step 1.) Grab an interactive node, +# do not run on head node! +srun -N 1 -n 1 --time=1:00:00 --mem=8gb --cpus-per-task=2 --pty bash +module purge +module load singularity snakemake + +# Step 2A.) Dry-run the pipeline +./cell-seek run --input .tests/*.R?.fastq.gz \ + --output /data/$USER/output \ + --version vdj \ + --genome hg38 \ + --mode slurm \ + --dry-run + +# Step 2B.) Run the cell-seek pipeline +# The slurm mode will submit jobs to +# the cluster. It is recommended running +# the pipeline in this mode. +./cell-seek run --input .tests/*.R?.fastq.gz \ + --output /data/$USER/output \ + --version vdj \ + --genome hg38 \ + --mode slurm +``` + +### 3.3 CITE Example + +```bash +# Step 1.) Grab an interactive node, +# do not run on head node! +srun -N 1 -n 1 --time=1:00:00 --mem=8gb --cpus-per-task=2 --pty bash +module purge +module load singularity snakemake + +# Step 2A.) Dry-run the pipeline +./cell-seek run --input .tests/*.R?.fastq.gz \ + --output /data/$USER/output \ + --version cite \ + --genome hg38 \ + --libraries libraries.csv \ + --features features.csv \ + --mode slurm \ + --dry-run + +# Step 2B.) Run the cell-seek pipeline +# The slurm mode will submit jobs to +# the cluster. It is recommended running +# the pipeline in this mode. +./cell-seek run --input .tests/*.R?.fastq.gz \ + --output /data/$USER/output \ + --version cite \ + --genome hg38 \ + --libraries libraries.csv \ + --features features.csv \ + --mode slurm +``` + +### 3.4 MULTI Example + +#### 3.4.1 GEX and VDJ + +The following is an example of running `multi` when combining GEX and VDJ runs. + +```bash +# Step 1.) Grab an interactive node, +# do not run on head node! +srun -N 1 -n 1 --time=1:00:00 --mem=8gb --cpus-per-task=2 --pty bash +module purge +module load singularity snakemake + +# Step 2A.) Dry-run the pipeline +./cell-seek run --input .tests/*.R?.fastq.gz \ + --output /data/$USER/output \ + --version multi \ + --genome hg38 \ + --libraries libraries.csv \ + --mode slurm \ + --dry-run + +# Step 2B.) Run the cell-seek pipeline +# The slurm mode will submit jobs to +# the cluster. It is recommended running +# the pipeline in this mode. +./cell-seek run --input .tests/*.R?.fastq.gz \ + --output /data/$USER/output \ + --version multi \ + --genome hg38 \ + --libraries libraries.csv \ + --mode slurm +``` + +#### 3.4.2 Including HTO + +The following is an example of running `multi` while providing hashtag information. + +```bash +# Step 1.) Grab an interactive node, +# do not run on head node! +srun -N 1 -n 1 --time=1:00:00 --mem=8gb --cpus-per-task=2 --pty bash +module purge +module load singularity snakemake + +# Step 2A.) Dry-run the pipeline +./cell-seek run --input .tests/*.R?.fastq.gz \ + --output /data/$USER/output \ + --version multi \ + --genome hg38 \ + --libraries libraries.csv \ + --cmo-reference cmo_reference.csv \ + --mode slurm \ + --dry-run + +# Step 2B.) Run the cell-seek pipeline +# The slurm mode will submit jobs to +# the cluster. It is recommended running +# the pipeline in this mode. +./cell-seek run --input .tests/*.R?.fastq.gz \ + --output /data/$USER/output \ + --version multi \ + --genome hg38 \ + --libraries libraries.csv \ + --cmo-reference cmo_reference.csv \ + --mode slurm +``` + +### 3.5 ATAC Example + +```bash +# Step 1.) Grab an interactive node, +# do not run on head node! +srun -N 1 -n 1 --time=1:00:00 --mem=8gb --cpus-per-task=2 --pty bash +module purge +module load singularity snakemake + +# Step 2A.) Dry-run the pipeline +./cell-seek run --input .tests/*.R?.fastq.gz \ + --output /data/$USER/output \ + --version atac \ + --genome hg38 \ + --mode slurm \ + --dry-run + +# Step 2B.) Run the cell-seek pipeline +# The slurm mode will submit jobs to +# the cluster. It is recommended running +# the pipeline in this mode. +./cell-seek run --input .tests/*.R?.fastq.gz \ + --output /data/$USER/output \ + --version atac \ + --genome hg38 \ + --mode slurm +``` + +### 3.6 Multiome Example + +```bash +# Step 1.) Grab an interactive node, +# do not run on head node! +srun -N 1 -n 1 --time=1:00:00 --mem=8gb --cpus-per-task=2 --pty bash +module purge +module load singularity snakemake + +# Step 2A.) Dry-run the pipeline +./cell-seek run --input .tests/*.R?.fastq.gz \ + --output /data/$USER/output \ + --version multiome \ + --genome hg38 \ + --libraries libraries.csv \ + --mode slurm \ + --dry-run + +# Step 2B.) Run the cell-seek pipeline +# The slurm mode will submit jobs to +# the cluster. It is recommended running +# the pipeline in this mode. +./cell-seek run --input .tests/*.R?.fastq.gz \ + --output /data/$USER/output \ + --version multiome \ + --genome hg38 \ + --libraries libraries.csv \ + --mode slurm +```