Pre-qc V1 Refactor #11

Merged
merged 18 commits into from
Oct 16, 2023
Changes from 17 commits
6 changes: 6 additions & 0 deletions .gitignore
@@ -0,0 +1,6 @@
# Nextflow specific
*.uuid
**/.nextflow/

# Jetbrains
.idea/
106 changes: 101 additions & 5 deletions single_cell_RNAseq/README.md
@@ -36,9 +36,104 @@ Install nextflow

#### What do I need to configure?

Pipelines unfortunately require a large amount of input to run, such as:

- Directories containing FASTQ files
- The location of reference genomes
- Process-specific settings and flags

To see what all of these settings are, check out `nextflow.config`.
To actually supply parameters to this pipeline, pass a JSON file with these values
via Nextflow's `-params-file` option.
Some examples include: `example-inputs/param_1.json` and `example-inputs/param_2_v2.json`.

There is also a directory called `config/`, but those settings are specific to c4 and typically
do not need to be tweaked; `nextflow.config` imports them for you.

#### Data Generation

For those who are not familiar with sequencing and data generation, recall that:

1. We obtain $s$ biological samples (biospecimens). Each sample maps to a unique individual and
contains $c$ cells.
2. From each sample we isolate $n$ cells, then load those cells into the sequencer. When $s > 1$ this is
called pooling.
3. The sequencer runs and generates a library. Each library can contain $r$ reads.
4. We repeat steps 2-3 with a new set of cells.

Note: Pooling is very specific to Colabs.
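
These quantities map onto the pipeline's params JSON structure. The sketch below is illustrative, based on the field names in `example-inputs/param_2_v2.json`:

```groovy
// Illustrative mapping of the model above onto the params JSON
// (field names taken from example-inputs/param_2_v2.json):
//   pools[i]                            -> one pooled run (steps 2-3)
//   pools[i].nsamples                   -> s, the number of samples pooled
//   pools[i].libraries[j]               -> one library from that run
//   pools[i].libraries[j].ncells_loaded -> n, cells loaded for that library
```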

## Testing

### Regression tests

You will need nf-test installed. Please follow the instructions here: https://github.com/askimed/nf-test#installation

To run regression tests: `nf-test test tests/pipeline_pre_qc.nf.test`
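
nf-test cases are written in a Groovy-based DSL. A minimal sketch of what such a test file can look like (the names, script path, and assertion below are illustrative, not copied from this repo):

```groovy
// Sketch of a pipeline-level nf-test case
nextflow_pipeline {

    name "pre-qc regression"
    script "../pipeline_pre_qc.nf"

    test("pipeline completes successfully") {
        then {
            // nf-test exposes the finished run as `workflow`
            assert workflow.success
        }
    }
}
```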

### Test data

In general, test data should be located in: `/krummellab/data1/pipeline_test_data/`.

Currently we only have GEX (gene expression) data to use for testing our pipeline.

The fastqs and variants can be found in the following directory:

`/krummellab/data1/pipeline_test_data/assays/scRNA_seq/modality/gex/downsampled_jurkat_tcell/inputs/fastqs/`
`/krummellab/data1/pipeline_test_data/assays/scRNA_seq/modality/gex/downsampled_jurkat_tcell/inputs/variants/`

Ideally we want:

- CITE (Protein level data)
- BCR (B-cell receptor)
- TCR (T-cell receptor)


## Conventions


### Params

If you have a process that requires input derived from the `params` object, do NOT parse/extract it
at the beginning of the pipeline and supply those inputs to processes far downstream.

Ex:
```
some_param = params.collectMany { ... }
another_param = params.collectMany { ... }

PROCESS_1()

PROCESS_2(another_param)

PROCESS_3(some_param)
```

Do this instead:

```
// Extract each param right before the process that consumes it
some_param = params.collectMany { ... }

PROCESS_1(some_param)

another_param = params.collectMany { ... }

PROCESS_2(another_param)
```

It is much easier to determine which params belong to which processes this way.

If your params require heavy parsing and manipulation, it is probably a sign that your
param structure should be redesigned.
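
As a concrete sketch of this convention using the helpers added in `helpers/params_parse.nf` (the process name below is a placeholder, not a real process in this repo):

```groovy
include { get_library_ncells } from './helpers/params_parse.nf'

workflow {
    // Build the channel right next to the process that consumes it
    ch_ncells = Channel.fromList(get_library_ncells())  // emits [library_dir, ncells_loaded]
    COUNT_CELLS(ch_ncells)                              // COUNT_CELLS is hypothetical
}
```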

### Variable names

Groovy is JVM-esque, so it might have made sense to use `camelCase` for variable names.
However, most of the code written in this repo is `snake_case`, so let's continue with this
convention.

It is also helpful to give channel names the `ch_` prefix.
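
For example, a sketch using the functions added in `helpers/params_parse.nf`:

```groovy
// The ch_ prefix makes channels easy to distinguish from plain variables
ch_library_meta = Channel.fromList(get_pool_library_meta())
ch_h5_bam       = Channel.fromList(get_c4_h5_bam())
```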


### Initial SC-Seq Pipeline in Nextflow

@@ -65,10 +160,11 @@ By default, the nextflow working directory is:
`/c4/scratch/<user>/nextflow/<original_job_id>`
This directory is deleted on successful completion of an initial run. If the pipeline fails, you can either resume it with `run_repeat.sh` (which reuses the same working directory) or manually remove the directory. After a `run_repeat.sh` run, you must remove the directory manually.

#### Pre-QC Pipeline


#### Next steps:
The next steps for turning this into a usable pipeline are outlined below.
Please note that `pipeline_pre_qc.nf` uses a different json structure.
You can see an example in `example-inputs/param_2_v2.json`. Eventually all pipelines will conform to
this structure.

key additions:

2 changes: 1 addition & 1 deletion single_cell_RNAseq/config/container.config
@@ -3,7 +3,7 @@ params {
singularity_dir = "/krummellab/data1/singularity_images/"
cellranger = "${params.container.singularity_dir}/cellranger/6.0.2/cellranger.sif"
popscle = "${params.container.singularity_dir}/popscle/da70fc78da385ef049e0e890342acfd62842cae0/popscle.sif"
rsinglecell = "${params.container.singularity_dir}/RSingleCell/v3/RSingleCell.sif"
rsinglecell = "${params.container.singularity_dir}/RSingleCell/v4/RSingleCell.sif"
python = "${params.container.singularity_dir}/cytof/v3/cytof.sif"
}
}
49 changes: 49 additions & 0 deletions single_cell_RNAseq/example-inputs/param_2_v2.json
@@ -0,0 +1,49 @@
{ "project_dir" : "/krummellab/data1/amazzara/tutorial_lib_sep",
"settings" : {
"add_tcr" : false,
"add_bcr" : false,
"skip_cellranger": false,
"merge_for_demux" : true,
"merge_demux_dir" : "/krummellab/data1/amazzara/tutorial_lib_sep/freemuxlet_data/",
"demux_method" : "freemuxlet",
"run_doubletfinder" : true,
"mincell" : 3,
"minfeature" : 100,
"default_qc_cuts_dir": "/krummellab/data1/amazzara/sc_seq_pipeline/sc_seq_nextflow/example/",
"default_qc_cuts_file": "default_qc_cuts.csv",
"randomseed" : 21212,
"remove_demux_DBL": true,
"remove_all_DBL": true
},
"pools" : [
{
"name": "DM1",
"nsamples" : "2",
"vcf": "",
"libraries": [
{
"dir": "TEST-POOL-DM1-SCG1",
"ncells_loaded": 200,
"data_types": ["GEX"]
},
{
"dir": "TEST-POOL-DM1-SCG2",
"ncells_loaded": 200,
"data_types": ["GEX"]
}
]
},
{
"nsamples": "2",
"vcf": "",
"name": "DM2",
"libraries": [
{
"dir": "TEST-POOL-DM2-SCG1",
"ncells_loaded": 100,
"data_types": ["GEX"]
}
]
}
]
}
96 changes: 96 additions & 0 deletions single_cell_RNAseq/helpers/params_parse.nf
@@ -0,0 +1,96 @@
def get_c4_h5(library){
return file("${params.project_dir}/data/single_cell_GEX/processed/${library}/cellranger/raw_feature_bc_matrix.h5", checkIfExists: true)
}
def get_c4_bam(library){
return file("${params.project_dir}/data/single_cell_GEX/processed/${library}/cellranger/possorted_genome_bam.bam", checkIfExists: true)
}

def get_c4_h5_bam(){
return params.pools.collectMany {
pool -> pool.libraries.collect {
library -> [library.dir, get_c4_bam(library.dir), get_c4_h5(library.dir)]
}
}
}

def get_pool_library_meta(){
return params.pools.collectMany {
pool ->
return [
[
name: pool.name,
vcf: pool.vcf,
nsamples: pool.nsamples,
num_of_libraries: pool.libraries.size(),
lib_directories: pool.libraries*.dir
]
]
}
}

def get_libraries_data_type(){
return params.pools.collectMany {
pool -> pool.libraries.collect {
library -> [library.dir, library.data_types.join(",")]
}
}
}

def get_library_ncells(){
return params.pools.collectMany {
pool -> pool.libraries.collect {
library -> [library.dir, library.ncells_loaded]
}
}
}

def get_multi_library_by_pool() {
return get_pool_library_meta().findAll {it.num_of_libraries > 1}.collectMany { pool ->
pool.lib_directories.collect { dir ->
[dir, pool.name]
}
}
}

def get_single_library_by_pool() {
return get_pool_library_meta().findAll {it.num_of_libraries == 1}.collectMany { pool ->
pool.lib_directories.collect { dir ->
[dir, pool.name]
}
}
}

def get_library_by_pool() {
return get_pool_library_meta().collectMany { pool ->
pool.lib_directories.collect { dir ->
[dir, pool.name]
}
}
}


def get_library_by_sample_count() {
return get_pool_library_meta().collectMany { pool ->
pool.lib_directories.collect { dir ->
[dir, pool.nsamples]
}
}
}

def get_pool_by_sample_count() {
return get_pool_library_meta().collectMany { pool ->
[
[pool.name, pool.nsamples]
]
}
}

def get_pool_vcf() {
return get_pool_library_meta().collectMany { pool ->
[
[pool.name, pool.vcf]
]
}
}

4 changes: 4 additions & 0 deletions single_cell_RNAseq/helpers/utils.nf
@@ -0,0 +1,4 @@
def extractFileName(String path) {
def filename = path.split('/').last() // Split by '/' and get the last part which is the filename
return filename.split("\\.")[0] // Split the filename on dot and return the first part
}
30 changes: 26 additions & 4 deletions single_cell_RNAseq/modules/pipeline_tasks.nf
Expand Up @@ -249,6 +249,7 @@ process MERGE_DSC {
container "${params.container.python}"

input:
// TODO: expand out what these path files are...
tuple val(pool), path(pool_files)

output:
@@ -316,7 +317,6 @@ process FREEMUXLET_LIBRARY {
container "${params.container.popscle}"
containerOptions "-B ${params.ref.fmx_dir}"


input:
tuple val(library), val(nsamples), path(plp_files)

@@ -441,10 +441,29 @@ process UNMERGE_FMX {
process SEPARATE_FMX {
publishDir "${params.project_dir}/data/single_cell_GEX/processed/${library}/freemuxlet", mode: 'copy', pattern: "${library}*"
publishDir "${params.project_dir}/data/single_cell_GEX/logs/${library}/", mode: 'copy', pattern: ".command.log", saveAs: { filename -> "separate_fmx.log" }
input:
> @amadeovezz (Collaborator, Oct 3, 2023): this visual diff is strangely represented here. All I've done is copy the original SEPARATE_FMX and add a specific version for the pre-qc pipeline. I didn't want to modify the original SEPARATE_FMX, since it is used by other pipelines and I haven't tested it yet.
tuple val(library), path(library_files)

output:
tuple path("${library}.clust1.samples.gz"), path("${library}.clust1.vcf.gz"), path("${library}.lmix"), emit: fmx_files
tuple val(library), path("${library}.clust1.samples.reduced.tsv"), emit: sample_map
path(".command.log"), emit: log

"""
gunzip -f ${library}.clust1.samples.gz
awk {'printf (\"%s\t%s\t%s\t%s\t%s\\n\", \$2, \$3, \$4, \$5, \$6)'} ${library}.clust1.samples > ${library}.clust1.samples.reduced.tsv
gzip -f ${library}.clust1.samples
"""
}

// TODO: unify the two processes
process SEPARATE_FMX_PRE {
publishDir "${params.project_dir}/data/single_cell_GEX/processed/${library}/freemuxlet", mode: 'copy', pattern: "${library}*"
publishDir "${params.project_dir}/data/single_cell_GEX/logs/${library}/", mode: 'copy', pattern: ".command.log", saveAs: { filename -> "separate_fmx.log" }

input:
tuple val(library), path(vcf_file), path(sample_file), path(lmix_file)

output:
tuple path("${library}.clust1.samples.gz"), path("${library}.clust1.vcf.gz"), path("${library}.lmix"), emit: fmx_files
tuple val(library), path("${library}.clust1.samples.reduced.tsv"), emit: sample_map
@@ -458,6 +477,8 @@ process SEPARATE_FMX {
}




/*
* Step 3. Run DoubletFinder
*/
@@ -572,7 +593,8 @@ process SEURAT_ADD_TCR {
process SEURAT_QC {
publishDir "${params.project_dir}/data/single_cell_GEX/logs/${library}/", mode: 'copy', pattern: ".command.log", saveAs: { filename -> "seurat_qc.log" }
publishDir "${params.project_dir}/data/single_cell_GEX/processed/${library}/automated_processing", mode: 'copy', pattern: "${library}*"

// For testing
publishDir "${workDir}/data/single_cell_GEX/processed/${library}/automated_processing", mode: 'copy', pattern: "${library}*"

container "${params.container.rsinglecell}"
containerOptions "-B ${params.settings.default_qc_cuts_dir}"
6 changes: 6 additions & 0 deletions single_cell_RNAseq/nf-test.config
@@ -0,0 +1,6 @@
config {
testsDir "tests"
workDir ".nf-test"
configFile "nextflow.config"
profile "test"
}