From a1de00884278325a6e9a300899e7a78a0a78d55e Mon Sep 17 00:00:00 2001 From: Olga Botvinnik <olga.botvinnik@gmail.com> Date: Tue, 25 Jun 2019 09:27:28 -0700 Subject: [PATCH 01/11] Add separate step to clean featurecounts output to minimize memory needed for merging --- main.nf | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/main.nf b/main.nf index c453a9042..e043aeb7b 100644 --- a/main.nf +++ b/main.nf @@ -1007,7 +1007,7 @@ process featureCounts { file biotypes_header from ch_biotypes_header.collect() output: - file "${bam_featurecounts.baseName}_gene.featureCounts.txt" into geneCounts, featureCounts_to_merge + file "${bam_featurecounts.baseName}_gene.featureCounts.txt" into geneCounts, featureCounts_to_clean file "${bam_featurecounts.baseName}_gene.featureCounts.txt.summary" into featureCounts_logs file "${bam_featurecounts.baseName}_biotype_counts*mqc.{txt,tsv}" into featureCounts_biotype @@ -1029,10 +1029,32 @@ process featureCounts { """ } + +/* + * STEP 10 - Clean featurecounts + */ +process clean_featureCounts { + tag "${input_files[0].baseName - '.sorted'}" + + input: + file input_files from featureCounts_to_clean.collect() + + output: + file featureCounts_to_merge + + script: + """ + csvtk cut -t -f "-Start,-Chr,-End,-Length,-Strand" \\ + | sed 's/Aligned.sortedByCoord.out.markDups.bam//g' \\ + > $featureCounts_to_merge + """ +} + /* * STEP 10 - Merge featurecounts */ process merge_featureCounts { + label "mid_memory" tag "${input_files[0].baseName - '.sorted'}" publishDir "${params.outdir}/featureCounts", mode: 'copy' @@ -1046,7 +1068,7 @@ process merge_featureCounts { //if we only have 1 file, just use cat and pipe output to csvtk. Else join all files first, and then remove unwanted column names. def merge = input_files instanceof Path ? 
'cat' : 'csvtk join -t -f "Geneid,Start,Length,End,Chr,Strand,gene_name"' """ - $merge $input_files | csvtk cut -t -f "-Start,-Chr,-End,-Length,-Strand" | sed 's/Aligned.sortedByCoord.out.markDups.bam//g' > merged_gene_counts.txt + $merge $input_files > merged_gene_counts.txt """ } From e60c8c27069b2c656db0b158d600b83b5c5e0ee2 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik <olga.botvinnik@gmail.com> Date: Tue, 25 Jun 2019 09:38:11 -0700 Subject: [PATCH 02/11] Make salmon_merge also mid_memory --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index e043aeb7b..9e5621a22 100644 --- a/main.nf +++ b/main.nf @@ -1110,7 +1110,7 @@ if (params.pseudo_aligner == 'salmon'){ } process salmon_merge { - label 'low_memory' + label 'mid_memory' publishDir "${params.outdir}/salmon", mode: 'copy' input: From aaab9ad69a15afd8240ea20ed75966e47f048495 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik <olga.botvinnik@gmail.com> Date: Tue, 25 Jun 2019 09:50:07 -0700 Subject: [PATCH 03/11] Get clean_featureCounts to work with test data --- main.nf | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/main.nf b/main.nf index 9e5621a22..b304a3f47 100644 --- a/main.nf +++ b/main.nf @@ -1034,19 +1034,20 @@ process featureCounts { * STEP 10 - Clean featurecounts */ process clean_featureCounts { - tag "${input_files[0].baseName - '.sorted'}" + tag "${input_file[0].baseName - '.sorted'}" input: - file input_files from featureCounts_to_clean.collect() + file input_file from featureCounts_to_clean output: - file featureCounts_to_merge + file output into featureCounts_to_merge script: + output = "${input_file}_cleaned.txt" """ - csvtk cut -t -f "-Start,-Chr,-End,-Length,-Strand" \\ + csvtk cut -t -f "-Start,-Chr,-End,-Length,-Strand" $input_file \\ | sed 's/Aligned.sortedByCoord.out.markDups.bam//g' \\ - > $featureCounts_to_merge + > $output """ } @@ -1066,7 +1067,7 @@ process merge_featureCounts { script: //if we only have 1 file, 
just use cat and pipe output to csvtk. Else join all files first, and then remove unwanted column names. - def merge = input_files instanceof Path ? 'cat' : 'csvtk join -t -f "Geneid,Start,Length,End,Chr,Strand,gene_name"' + def merge = input_files instanceof Path ? 'cat' : 'csvtk join -t -f "Geneid,gene_name"' """ $merge $input_files > merged_gene_counts.txt """ From 02d3ce9ee1d2880d7eb113ac17565b6cf4165b9d Mon Sep 17 00:00:00 2001 From: Olga Botvinnik <olga.botvinnik@gmail.com> Date: Tue, 25 Jun 2019 11:42:34 -0700 Subject: [PATCH 04/11] Use paste to merge everything --- main.nf | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/main.nf b/main.nf index b304a3f47..46b610069 100644 --- a/main.nf +++ b/main.nf @@ -1040,14 +1040,19 @@ process clean_featureCounts { file input_file from featureCounts_to_clean output: - file output into featureCounts_to_merge + file counts into featureCounts_to_merge + file gene_ids into featureCounts_to_merge_ids script: - output = "${input_file}_cleaned.txt" + intermediate = 'intermediate.txt' + counts = "${input_file}_counts_only.txt" + gene_ids = "${input_file}_gene_ids.txt" """ csvtk cut -t -f "-Start,-Chr,-End,-Length,-Strand" $input_file \\ | sed 's/Aligned.sortedByCoord.out.markDups.bam//g' \\ - > $output + > $intermediate + cut -f 3 $intermediate > $counts + cut -f 1,2 $intermediate > $gene_ids """ } @@ -1061,15 +1066,16 @@ process merge_featureCounts { input: file input_files from featureCounts_to_merge.collect() + file gene_ids from featureCounts_to_merge_ids.collect() output: file 'merged_gene_counts.txt' into featurecounts_merged script: //if we only have 1 file, just use cat and pipe output to csvtk. Else join all files first, and then remove unwanted column names. - def merge = input_files instanceof Path ? 
'cat' : 'csvtk join -t -f "Geneid,gene_name"' + gene_ids_single = gene_ids[0] """ - $merge $input_files > merged_gene_counts.txt + paste $gene_ids_single $input_files > merged_gene_counts.txt """ } From 5b2de6f649759181e27de24b8c92d1af7a96f4cf Mon Sep 17 00:00:00 2001 From: Olga Botvinnik <olga.botvinnik@gmail.com> Date: Wed, 26 Jun 2019 18:30:02 -0700 Subject: [PATCH 05/11] no "markdups" in filename --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 46b610069..ab5063a9f 100644 --- a/main.nf +++ b/main.nf @@ -1049,7 +1049,7 @@ process clean_featureCounts { gene_ids = "${input_file}_gene_ids.txt" """ csvtk cut -t -f "-Start,-Chr,-End,-Length,-Strand" $input_file \\ - | sed 's/Aligned.sortedByCoord.out.markDups.bam//g' \\ + | sed 's/Aligned.sortedByCoord.out.bam//g' \\ > $intermediate cut -f 3 $intermediate > $counts cut -f 1,2 $intermediate > $gene_ids From 7bc52ec7998450cb8dd1984b9eff664f9aaf8858 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik <olga.botvinnik@gmail.com> Date: Thu, 27 Jun 2019 07:44:26 -0700 Subject: [PATCH 06/11] Use unix-fu to merge featurecounts --- main.nf | 34 ++++------------------------------ 1 file changed, 4 insertions(+), 30 deletions(-) diff --git a/main.nf b/main.nf index ab5063a9f..30158d1ca 100644 --- a/main.nf +++ b/main.nf @@ -1007,7 +1007,7 @@ process featureCounts { file biotypes_header from ch_biotypes_header.collect() output: - file "${bam_featurecounts.baseName}_gene.featureCounts.txt" into geneCounts, featureCounts_to_clean + file "${bam_featurecounts.baseName}_gene.featureCounts.txt" into geneCounts, featureCounts_to_merge file "${bam_featurecounts.baseName}_gene.featureCounts.txt.summary" into featureCounts_logs file "${bam_featurecounts.baseName}_biotype_counts*mqc.{txt,tsv}" into featureCounts_biotype @@ -1030,31 +1030,6 @@ process featureCounts { } -/* - * STEP 10 - Clean featurecounts - */ -process clean_featureCounts { - tag "${input_file[0].baseName - '.sorted'}" - - 
input: - file input_file from featureCounts_to_clean - - output: - file counts into featureCounts_to_merge - file gene_ids into featureCounts_to_merge_ids - - script: - intermediate = 'intermediate.txt' - counts = "${input_file}_counts_only.txt" - gene_ids = "${input_file}_gene_ids.txt" - """ - csvtk cut -t -f "-Start,-Chr,-End,-Length,-Strand" $input_file \\ - | sed 's/Aligned.sortedByCoord.out.bam//g' \\ - > $intermediate - cut -f 3 $intermediate > $counts - cut -f 1,2 $intermediate > $gene_ids - """ -} /* * STEP 10 - Merge featurecounts @@ -1066,16 +1041,15 @@ process merge_featureCounts { input: file input_files from featureCounts_to_merge.collect() - file gene_ids from featureCounts_to_merge_ids.collect() output: file 'merged_gene_counts.txt' into featurecounts_merged script: - //if we only have 1 file, just use cat and pipe output to csvtk. Else join all files first, and then remove unwanted column names. - gene_ids_single = gene_ids[0] + gene_ids = "<(cut -f1,2 ${input_files[0]})" + counts = input_files.collect{filename -> "<(cut -f3 ${filename})"}.join(" ") """ - paste $gene_ids_single $input_files > merged_gene_counts.txt + paste $gene_ids $counts > merged_gene_counts.txt """ } From 8c4ad54a7b836bce70c5a0ac0f6621632d292a80 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik <olga.botvinnik@gmail.com> Date: Thu, 27 Jun 2019 08:48:37 -0700 Subject: [PATCH 07/11] Remove first line of featurecounts files --- main.nf | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/main.nf b/main.nf index 30158d1ca..b03e47203 100644 --- a/main.nf +++ b/main.nf @@ -1046,8 +1046,10 @@ process merge_featureCounts { file 'merged_gene_counts.txt' into featurecounts_merged script: - gene_ids = "<(cut -f1,2 ${input_files[0]})" - counts = input_files.collect{filename -> "<(cut -f3 ${filename})"}.join(" ") + gene_ids = "<(tail -n +2 ${input_files[0]} | cut -f1,2 )" + counts = input_files.collect{filename -> + // Remove first line and take third column + "<(tail -n +2 
${filename} | cut -f3)"}.join(" ") """ paste $gene_ids $counts > merged_gene_counts.txt """ From d8242ec40937035904787a78f1e481370b1909e4 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik <olga.botvinnik@gmail.com> Date: Thu, 27 Jun 2019 08:55:24 -0700 Subject: [PATCH 08/11] Remove csvtk from requirements --- environment.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/environment.yml b/environment.yml index ee2d355ed..7571f1fe1 100644 --- a/environment.yml +++ b/environment.yml @@ -26,7 +26,6 @@ dependencies: - preseq=2.0.3 - deeptools=3.2.1 - gffread=0.11.4 - - csvtk=0.18.2 - qualimap=2.2.2c - rseqc=3.0.0 - subread=1.6.4 From a3c30ad1c257f8c07fdcd9e84d8b4d3410e138ce Mon Sep 17 00:00:00 2001 From: Olga Botvinnik <olga.botvinnik@gmail.com> Date: Thu, 27 Jun 2019 08:55:44 -0700 Subject: [PATCH 09/11] add a note about redirection --- main.nf | 1 + 1 file changed, 1 insertion(+) diff --git a/main.nf b/main.nf index b03e47203..e6af36b69 100644 --- a/main.nf +++ b/main.nf @@ -1046,6 +1046,7 @@ process merge_featureCounts { file 'merged_gene_counts.txt' into featurecounts_merged script: + // Redirection (the `<()`) for the win! gene_ids = "<(tail -n +2 ${input_files[0]} | cut -f1,2 )" counts = input_files.collect{filename -> // Remove first line and take third column From beab23f95b700fb2c0a4e546dceceae6f4e684cd Mon Sep 17 00:00:00 2001 From: Olga Botvinnik <olga.botvinnik@gmail.com> Date: Thu, 27 Jun 2019 09:31:48 -0700 Subject: [PATCH 10/11] Use 7th column for gene name --- main.nf | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/main.nf b/main.nf index e6af36b69..ccc54639d 100644 --- a/main.nf +++ b/main.nf @@ -1047,7 +1047,8 @@ process merge_featureCounts { script: // Redirection (the `<()`) for the win!
- gene_ids = "<(tail -n +2 ${input_files[0]} | cut -f1,2 )" + // Geneid in 1st column and gene_name in 7th + gene_ids = "<(tail -n +2 ${input_files[0]} | cut -f1,7 )" counts = input_files.collect{filename -> // Remove first line and take third column "<(tail -n +2 ${filename} | cut -f3)"}.join(" ") From 2be0b274e648290a6be23f956c8fb45f0eac31e8 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik <olga.botvinnik@gmail.com> Date: Thu, 27 Jun 2019 19:40:15 -0700 Subject: [PATCH 11/11] actually use the gene counts to merge .. --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index ccc54639d..7157007a4 100644 --- a/main.nf +++ b/main.nf @@ -1051,7 +1051,7 @@ process merge_featureCounts { gene_ids = "<(tail -n +2 ${input_files[0]} | cut -f1,7 )" counts = input_files.collect{filename -> // Remove first line and take third column - "<(tail -n +2 ${filename} | cut -f3)"}.join(" ") + "<(tail -n +2 ${filename} | sed 's:.sorted.bam::' | cut -f8)"}.join(" ") """ paste $gene_ids $counts > merged_gene_counts.txt """