From a1de00884278325a6e9a300899e7a78a0a78d55e Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Tue, 25 Jun 2019 09:27:28 -0700
Subject: [PATCH 01/11] Add separate step to clean featurecounts output to
 minimize memory needed for merging

---
 main.nf | 26 ++++++++++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

diff --git a/main.nf b/main.nf
index c453a9042..e043aeb7b 100644
--- a/main.nf
+++ b/main.nf
@@ -1007,7 +1007,7 @@ process featureCounts {
     file biotypes_header from ch_biotypes_header.collect()
 
     output:
-    file "${bam_featurecounts.baseName}_gene.featureCounts.txt" into geneCounts, featureCounts_to_merge
+    file "${bam_featurecounts.baseName}_gene.featureCounts.txt" into geneCounts, featureCounts_to_clean
     file "${bam_featurecounts.baseName}_gene.featureCounts.txt.summary" into featureCounts_logs
     file "${bam_featurecounts.baseName}_biotype_counts*mqc.{txt,tsv}" into featureCounts_biotype
 
@@ -1029,10 +1029,32 @@ process featureCounts {
     """
 }
 
+
+/*
+ * STEP 10 - Clean featurecounts
+ */
+process clean_featureCounts {
+    tag "${input_files[0].baseName - '.sorted'}"
+
+    input:
+    file input_files from featureCounts_to_clean.collect()
+
+    output:
+    file featureCounts_to_merge
+
+    script:
+    """
+    csvtk cut -t -f "-Start,-Chr,-End,-Length,-Strand" \\
+        | sed 's/Aligned.sortedByCoord.out.markDups.bam//g' \\
+        > $featureCounts_to_merge
+    """
+}
+
 /*
  * STEP 10 - Merge featurecounts
  */
 process merge_featureCounts {
+    label "mid_memory"
     tag "${input_files[0].baseName - '.sorted'}"
     publishDir "${params.outdir}/featureCounts", mode: 'copy'
 
@@ -1046,7 +1068,7 @@ process merge_featureCounts {
     //if we only have 1 file, just use cat and pipe output to csvtk. Else join all files first, and then remove unwanted column names.
     def merge = input_files instanceof Path ? 'cat' : 'csvtk join -t -f "Geneid,Start,Length,End,Chr,Strand,gene_name"'
     """
-    $merge $input_files | csvtk cut -t -f "-Start,-Chr,-End,-Length,-Strand" | sed 's/Aligned.sortedByCoord.out.markDups.bam//g' > merged_gene_counts.txt
+    $merge $input_files > merged_gene_counts.txt
     """
 }
 

From e60c8c27069b2c656db0b158d600b83b5c5e0ee2 Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Tue, 25 Jun 2019 09:38:11 -0700
Subject: [PATCH 02/11] Make salmon_merge also mid_memory

---
 main.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.nf b/main.nf
index e043aeb7b..9e5621a22 100644
--- a/main.nf
+++ b/main.nf
@@ -1110,7 +1110,7 @@ if (params.pseudo_aligner == 'salmon'){
         }
 
     process salmon_merge {
-      label 'low_memory'
+      label 'mid_memory'
       publishDir "${params.outdir}/salmon", mode: 'copy'
 
       input:

From aaab9ad69a15afd8240ea20ed75966e47f048495 Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Tue, 25 Jun 2019 09:50:07 -0700
Subject: [PATCH 03/11] Get clean_featureCounts to work with test data

---
 main.nf | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/main.nf b/main.nf
index 9e5621a22..b304a3f47 100644
--- a/main.nf
+++ b/main.nf
@@ -1034,19 +1034,20 @@ process featureCounts {
  * STEP 10 - Clean featurecounts
  */
 process clean_featureCounts {
-    tag "${input_files[0].baseName - '.sorted'}"
+    tag "${input_file[0].baseName - '.sorted'}"
 
     input:
-    file input_files from featureCounts_to_clean.collect()
+    file input_file from featureCounts_to_clean
 
     output:
-    file featureCounts_to_merge
+    file output into featureCounts_to_merge
 
     script:
+    output = "${input_file}_cleaned.txt"
     """
-    csvtk cut -t -f "-Start,-Chr,-End,-Length,-Strand" \\
+    csvtk cut -t -f "-Start,-Chr,-End,-Length,-Strand" $input_file \\
         | sed 's/Aligned.sortedByCoord.out.markDups.bam//g' \\
-        > $featureCounts_to_merge
+        > $output
     """
 }
 
@@ -1066,7 +1067,7 @@ process merge_featureCounts {
 
     script:
     //if we only have 1 file, just use cat and pipe output to csvtk. Else join all files first, and then remove unwanted column names.
-    def merge = input_files instanceof Path ? 'cat' : 'csvtk join -t -f "Geneid,Start,Length,End,Chr,Strand,gene_name"'
+    def merge = input_files instanceof Path ? 'cat' : 'csvtk join -t -f "Geneid,gene_name"'
     """
     $merge $input_files > merged_gene_counts.txt
     """

From 02d3ce9ee1d2880d7eb113ac17565b6cf4165b9d Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Tue, 25 Jun 2019 11:42:34 -0700
Subject: [PATCH 04/11] Use paste to merge everything

---
 main.nf | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/main.nf b/main.nf
index b304a3f47..46b610069 100644
--- a/main.nf
+++ b/main.nf
@@ -1040,14 +1040,19 @@ process clean_featureCounts {
     file input_file from featureCounts_to_clean
 
     output:
-    file output into featureCounts_to_merge
+    file counts into featureCounts_to_merge
+    file gene_ids into featureCounts_to_merge_ids
 
     script:
-    output = "${input_file}_cleaned.txt"
+    intermediate = 'intermediate.txt'
+    counts = "${input_file}_counts_only.txt"
+    gene_ids = "${input_file}_gene_ids.txt"
     """
     csvtk cut -t -f "-Start,-Chr,-End,-Length,-Strand" $input_file \\
         | sed 's/Aligned.sortedByCoord.out.markDups.bam//g' \\
-        > $output
+        > $intermediate
+    cut -f 3 $intermediate > $counts
+    cut -f 1,2 $intermediate > $gene_ids
     """
 }
 
@@ -1061,15 +1066,16 @@ process merge_featureCounts {
 
     input:
     file input_files from featureCounts_to_merge.collect()
+    file gene_ids from featureCounts_to_merge_ids.collect()
 
     output:
     file 'merged_gene_counts.txt' into featurecounts_merged
 
     script:
     //if we only have 1 file, just use cat and pipe output to csvtk. Else join all files first, and then remove unwanted column names.
-    def merge = input_files instanceof Path ? 'cat' : 'csvtk join -t -f "Geneid,gene_name"'
+    gene_ids_single = gene_ids[0]
     """
-    $merge $input_files > merged_gene_counts.txt
+    paste $gene_ids_single $input_files > merged_gene_counts.txt
     """
 }
 

From 5b2de6f649759181e27de24b8c92d1af7a96f4cf Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Wed, 26 Jun 2019 18:30:02 -0700
Subject: [PATCH 05/11] no "markdups" in filename

---
 main.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.nf b/main.nf
index 46b610069..ab5063a9f 100644
--- a/main.nf
+++ b/main.nf
@@ -1049,7 +1049,7 @@ process clean_featureCounts {
     gene_ids = "${input_file}_gene_ids.txt"
     """
     csvtk cut -t -f "-Start,-Chr,-End,-Length,-Strand" $input_file \\
-        | sed 's/Aligned.sortedByCoord.out.markDups.bam//g' \\
+        | sed 's/Aligned.sortedByCoord.out.bam//g' \\
         > $intermediate
     cut -f 3 $intermediate > $counts
     cut -f 1,2 $intermediate > $gene_ids

From 7bc52ec7998450cb8dd1984b9eff664f9aaf8858 Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Thu, 27 Jun 2019 07:44:26 -0700
Subject: [PATCH 06/11] Use unix-fu to merge featurecounts

---
 main.nf | 34 ++++------------------------------
 1 file changed, 4 insertions(+), 30 deletions(-)

diff --git a/main.nf b/main.nf
index ab5063a9f..30158d1ca 100644
--- a/main.nf
+++ b/main.nf
@@ -1007,7 +1007,7 @@ process featureCounts {
     file biotypes_header from ch_biotypes_header.collect()
 
     output:
-    file "${bam_featurecounts.baseName}_gene.featureCounts.txt" into geneCounts, featureCounts_to_clean
+    file "${bam_featurecounts.baseName}_gene.featureCounts.txt" into geneCounts, featureCounts_to_merge
     file "${bam_featurecounts.baseName}_gene.featureCounts.txt.summary" into featureCounts_logs
     file "${bam_featurecounts.baseName}_biotype_counts*mqc.{txt,tsv}" into featureCounts_biotype
 
@@ -1030,31 +1030,6 @@ process featureCounts {
 }
 
 
-/*
- * STEP 10 - Clean featurecounts
- */
-process clean_featureCounts {
-    tag "${input_file[0].baseName - '.sorted'}"
-
-    input:
-    file input_file from featureCounts_to_clean
-
-    output:
-    file counts into featureCounts_to_merge
-    file gene_ids into featureCounts_to_merge_ids
-
-    script:
-    intermediate = 'intermediate.txt'
-    counts = "${input_file}_counts_only.txt"
-    gene_ids = "${input_file}_gene_ids.txt"
-    """
-    csvtk cut -t -f "-Start,-Chr,-End,-Length,-Strand" $input_file \\
-        | sed 's/Aligned.sortedByCoord.out.bam//g' \\
-        > $intermediate
-    cut -f 3 $intermediate > $counts
-    cut -f 1,2 $intermediate > $gene_ids
-    """
-}
 
 /*
  * STEP 10 - Merge featurecounts
@@ -1066,16 +1041,15 @@ process merge_featureCounts {
 
     input:
     file input_files from featureCounts_to_merge.collect()
-    file gene_ids from featureCounts_to_merge_ids.collect()
 
     output:
     file 'merged_gene_counts.txt' into featurecounts_merged
 
     script:
-    //if we only have 1 file, just use cat and pipe output to csvtk. Else join all files first, and then remove unwanted column names.
-    gene_ids_single = gene_ids[0]
+    gene_ids = "<(cut -f1,2 ${input_files[0]})"
+    counts = input_files.collect{filename -> "<(cut -f3 ${filename})"}.join(" ")
     """
-    paste $gene_ids_single $input_files > merged_gene_counts.txt
+    paste $gene_ids $counts > merged_gene_counts.txt
     """
 }
 

From 8c4ad54a7b836bce70c5a0ac0f6621632d292a80 Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Thu, 27 Jun 2019 08:48:37 -0700
Subject: [PATCH 07/11] Remove first line of featurecounts files

---
 main.nf | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/main.nf b/main.nf
index 30158d1ca..b03e47203 100644
--- a/main.nf
+++ b/main.nf
@@ -1046,8 +1046,10 @@ process merge_featureCounts {
     file 'merged_gene_counts.txt' into featurecounts_merged
 
     script:
-    gene_ids = "<(cut -f1,2 ${input_files[0]})"
-    counts = input_files.collect{filename -> "<(cut -f3 ${filename})"}.join(" ")
+    gene_ids = "<(tail -n +2 ${input_files[0]} | cut -f1,2 )"
+    counts = input_files.collect{filename ->
+      // Remove first line and take third column
+      "<(tail -n +2 ${filename} | cut -f3)"}.join(" ")
     """
     paste $gene_ids $counts > merged_gene_counts.txt
     """

From d8242ec40937035904787a78f1e481370b1909e4 Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Thu, 27 Jun 2019 08:55:24 -0700
Subject: [PATCH 08/11] Remove csvtk from requirements

---
 environment.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/environment.yml b/environment.yml
index ee2d355ed..7571f1fe1 100644
--- a/environment.yml
+++ b/environment.yml
@@ -26,7 +26,6 @@ dependencies:
   - preseq=2.0.3
   - deeptools=3.2.1
   - gffread=0.11.4
-  - csvtk=0.18.2
   - qualimap=2.2.2c
   - rseqc=3.0.0
   - subread=1.6.4

From a3c30ad1c257f8c07fdcd9e84d8b4d3410e138ce Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Thu, 27 Jun 2019 08:55:44 -0700
Subject: [PATCH 09/11] add a note about redirection

---
 main.nf | 1 +
 1 file changed, 1 insertion(+)

diff --git a/main.nf b/main.nf
index b03e47203..e6af36b69 100644
--- a/main.nf
+++ b/main.nf
@@ -1046,6 +1046,7 @@ process merge_featureCounts {
     file 'merged_gene_counts.txt' into featurecounts_merged
 
     script:
+    // Redirection (the `<()`) for the win!
     gene_ids = "<(tail -n +2 ${input_files[0]} | cut -f1,2 )"
     counts = input_files.collect{filename ->
       // Remove first line and take third column

From beab23f95b700fb2c0a4e546dceceae6f4e684cd Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Thu, 27 Jun 2019 09:31:48 -0700
Subject: [PATCH 10/11] Use 7th column for gene name

---
 main.nf | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/main.nf b/main.nf
index e6af36b69..ccc54639d 100644
--- a/main.nf
+++ b/main.nf
@@ -1047,7 +1047,8 @@ process merge_featureCounts {
 
     script:
     // Redirection (the `<()`) for the win!
-    gene_ids = "<(tail -n +2 ${input_files[0]} | cut -f1,2 )"
+    // Geneid in 1st column and gene_name in 7th
+    gene_ids = "<(tail -n +2 ${input_files[0]} | cut -f1,7 )"
     counts = input_files.collect{filename ->
       // Remove first line and take third column
       "<(tail -n +2 ${filename} | cut -f3)"}.join(" ")

From 2be0b274e648290a6be23f956c8fb45f0eac31e8 Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Thu, 27 Jun 2019 19:40:15 -0700
Subject: [PATCH 11/11] actually use the gene counts to merge ..

---
 main.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.nf b/main.nf
index ccc54639d..7157007a4 100644
--- a/main.nf
+++ b/main.nf
@@ -1051,7 +1051,7 @@ process merge_featureCounts {
     gene_ids = "<(tail -n +2 ${input_files[0]} | cut -f1,7 )"
     counts = input_files.collect{filename ->
       // Remove first line and take third column
-      "<(tail -n +2 ${filename} | cut -f3)"}.join(" ")
+      "<(tail -n +2 ${filename} | sed 's:.sorted.bam::' | cut -f8)"}.join(" ")
     """
     paste $gene_ids $counts > merged_gene_counts.txt
     """