# bamToFastq.wdl

version 1.0

workflow bamToFastq {
    input {
        File bamFile
        String finalOutputDirectory = ""
        String fileNaming = "{ID}"
        String outputFilePrefix
    }

    parameter_meta {
        bamFile: "A BAM file with one or more readgroups"
        fileNaming: "The naming scheme for the extracted FASTQs"
        finalOutputDirectory: "Optional final output directory. Copy out fastq files using a task"
        outputFilePrefix: "prefix to use to identify bam summary files"
    }

    meta {
        author: "Murto Hilali, Lawrence Heisler"
        email: "mhilali@oicr.on.ca, lheisler@oicr.on.ca"
        description: "Given aligned reads in bam format, this workflow will backextract to generate fastq files based on the readgroups.  By default, the files are named based on the readgroup IDs, but a custom naming scheme based on other readgroup fields can be provided.  This takes the form of mixing text with {FD} symbols, where FD is a readgroup field (SM, PU, etc)"
        dependencies: [
            {
                name: "java/8",
                url: "https://www.java.com/en/download/manual.jsp"
            },
            {
                name: "samtools/1.16.1",
                url: "https://github.com/samtools/samtools/releases/tag/1.16.1"
            },
            {
                name: "picard/2.21.2",
                url: "https://broadinstitute.github.io/picard/command-line-overview.html"
            },
            {
                name: "python/3.6",
                url: "https://www.python.org/downloads/"
            },
            {
                name: "fastqc/0.11.9",
                url: "https://www.bioinformatics.babraham.ac.uk/projects/fastqc/"
            }
        ]
        output_meta: {
            bamFileFlagstat: {
                description: "A TXT file containing samtools stats output for the BAM file",
                vidarr_label: "bamFileFlagstat"
            },
            fastqs: "An optional array of fastq.gz files, provisioned only when no custom output directory is requested",
            fastqc: "An optional array of FastQC data files extracted from the FastQC reports",
            copyLog: {
                description: "log file from copy out task, provisioned if a custom output dir requested",
                vidarr_label: "copyLog"
            },
            summary: {
                description: "Summary file generated by processing fastQC and samstat report data",
                vidarr_label: "summary"
            }
        }
    }

    # Collect samtools stats for the input BAM and pull out its @RG header lines
    call examineBam {
      input:
        bamFile = bamFile,
        prefix = outputFilePrefix
    }

    ### Check the readgroup tags before doing anything else.
    ### By default this verifies that the ID tag is present, which it should be;
    ### if fileNaming references other tags, those are verified as well.
    call nameCheck {
      input:
        bamFile = bamFile,
        fileNaming = fileNaming
    }

    ### Proceed with back-extraction only if the name check passed
    if (nameCheck.valid) {
      # Picard SamToFastq is given a name-sorted BAM
      call unsortBam {
        input:
          bamFileToSort = bamFile
      }
      call backExtractByRG {
        input:
          bamFile = unsortBam.bamFile
      }
    }

    # =================================
    # We have a custom output directory
    # =================================
    if (finalOutputDirectory != "") {
      # Renaming is only needed when the naming scheme differs from the Picard default ({ID})
      if (fileNaming != "{ID}") {
        call renameFastqs as haveCustomDirRename {
          input:
              rawFastqs = backExtractByRG.rawFastqs,
              rgData = nameCheck.rgData
        }
      }

      # Prefer the renamed FASTQs; fall back to the raw per-readgroup files
      scatter(fq in select_first([haveCustomDirRename.modFastqs, backExtractByRG.rawFastqs])) {
         String fqidhd = basename(fq, ".fastq.gz")
         call reviewFastq as haveCustomDirReview {
           input:
             fastq = fq,
             id = fqidhd
         }

         call copyOutFastq as copyFastqs {
           input:
             Fastq = fq,
             outputDir = finalOutputDirectory
        }
      }

      Array[File] fastqcFilesHaveDir = flatten(haveCustomDirReview.fastqcDataFiles)
      scatter(fqc in fastqcFilesHaveDir) {
        call copyOutFastq as copyFastQC {
          input:
            Fastq = fqc,
            outputDir = finalOutputDirectory
        }
      }

      # The log records the messages of the FASTQ copy jobs only
      call composeLog {
        input:
            messages = copyFastqs.message
      }

    }


    # ========================================
    # No custom output dir, provision normally
    # ========================================
    if (finalOutputDirectory == "") {
      if (fileNaming != "{ID}") {
        call renameFastqs as noCustomDirRename {
          input:
              rawFastqs = backExtractByRG.rawFastqs,
              rgData = nameCheck.rgData
        }
      }

      # The FASTQs to provision: renamed files when a custom naming scheme is used,
      # otherwise the raw per-readgroup files. Keeping this in a declaration lets the
      # workflow output expose the files in both cases (previously the default
      # {ID} naming produced no fastqs output at all).
      Array[File] fastqsNoDir = select_first([noCustomDirRename.modFastqs, backExtractByRG.rawFastqs])

      scatter(fq in fastqsNoDir) {
         String fqidnd = basename(fq, ".fastq.gz")
         call reviewFastq as noCustomDirReview {
           input:
             fastq = fq,
             id = fqidnd
         }
      }

      Array[File] fastqcFilesNoDir = flatten(noCustomDirReview.fastqcDataFiles)

    }

    # Exactly one of the two branches above ran, so exactly one FastQC array is defined
    String bamId = basename(bamFile, ".bam")
    call summarize {
       input:
         id = bamId,
         samstats = examineBam.samstats,
         fastqc = select_first([fastqcFilesHaveDir, fastqcFilesNoDir])
    }

    # =============================================================================
    # QC files are always provisioned and also, copied if custom output dir passed:
    # =============================================================================
    if (finalOutputDirectory != "") {
      scatter(qcFile in [examineBam.samstats, summarize.summary]) {
        call copyOutFastq as copyQCfiles {
          input:
            Fastq = qcFile,
            outputDir = finalOutputDirectory
        }
      }
    }

    output {
      File bamFileFlagstat = examineBam.samstats
      Array[File]? fastqc = select_first([fastqcFilesHaveDir, fastqcFilesNoDir])
      Array[File]? fastqs = fastqsNoDir
      File summary = summarize.summary
      File? copyLog = composeLog.log
    }
}


# Re-sorts the input BAM by read name (samtools sort -n) so that it is no
# longer coordinate-ordered; the result is written to unsorted.bam.
task unsortBam {
    input {
        File bamFileToSort
        Int memory = 24
        Int timeout = 12
        String modules = "samtools/1.16.1"
    }

    command <<<
        samtools sort \
            -n \
            --threads 8 \
            -O bam \
            -o unsorted.bam \
            ~{bamFileToSort}
    >>>

    output {
        File bamFile = "unsorted.bam"
    }

    runtime {
        memory: "~{memory}G"
        timeout: "~{timeout}"
        modules: "~{modules}"
    }

    parameter_meta {
        bamFileToSort: "A BAM file with one or more readgroups"
        memory: "Memory allocated for this job"
        timeout: "Time in hours before task timeout"
        modules: "Required environment modules"
    }

    meta {
        output_meta: {
            bamFile: "unsorted bam file (sorted by readname)"
        }
    }
}


# Extracts the @RG header lines of the BAM into readgroups.tsv and writes a
# samtools stats report to <prefix>.samstats.txt.
task examineBam {
    input {
        File bamFile
        String prefix
        Int memory = 24
        Int timeout = 12
        String modules = "samtools/1.16.1"
    }

    parameter_meta {
        bamFile: "A BAM file with one or more readgroups"
        prefix: "String prepended to the samtools stats file name"
        modules: "Required environment modules"
        memory: "Memory allocated for this job"
        timeout: "Time in hours before task timeout"
    }

    command <<<
        set -euo pipefail
        # With pipefail, a BAM without any @RG line makes grep (and the task) fail
        samtools view -H "~{bamFile}" | grep -G "^@RG" > readgroups.tsv
        # prefix is user supplied, so the redirect target is quoted to survive spaces
        samtools stats --threads 8 "~{bamFile}" > "~{prefix}.samstats.txt"
    >>>

    runtime {
        modules: "~{modules}"
        memory: "~{memory}G"
        timeout: "~{timeout}"
    }

    output {
        File readGroups = "readgroups.tsv"
        File samstats = "~{prefix}.samstats.txt"
    }

    meta {
        output_meta: {
            samstats : "A TXT file generated by samtools stats",
            readGroups: "A TSV file containing the @RG header lines of the BAM file"
        }
    }
}

# Validates the requested FASTQ naming scheme against the @RG header lines of
# the BAM and predicts the file names the scheme will produce.
#
# Outputs:
#   rgData - printed Python dictionary mapping each readgroup ID to the list of
#            predicted FASTQ names for read 1 and read 2
#   valid  - content of isValid.txt; "false" is written right before an error is raised
task nameCheck {
    input {
        File bamFile
        String fileNaming
        String modules = "samtools/1.16.1"
        Int memory = 24
        Int timeout = 12
    }

    parameter_meta {
        bamFile: "The bam file from which to extract the readgroup information"
        fileNaming: "The naming scheme for the FASTQ files"
        modules: "Required environment modules"
        memory: "Memory allocated for this job"
        timeout: "Time in hours before task timeout"
    }

    command <<<
        set -euo pipefail

        samtools view -H ~{bamFile} | grep -G "^@RG" > readgroups.tsv
        python3 <<CODE
        import difflib
        import re

        fileNaming = "~{fileNaming}"

        def validate(valid="true"):
            with open("isValid.txt", "w") as isValid:
                isValid.write(valid)

        # Start out as valid; overwritten with "false" right before a failure is raised
        validate()

        # Parse every @RG line into a dictionary of TAG -> value.
        # Header fields are TAB separated and only the FIRST colon separates the
        # tag from its value, so values may themselves contain colons or spaces.
        readGroups = []
        with open("readgroups.tsv", "r") as rg_data:
            for line in rg_data.read().splitlines():
                rgFields = {}
                # the first column is the @RG record type itself
                for field in line.split("\t")[1:]:
                    tag, colon, value = field.partition(":")
                    if colon:
                        rgFields[tag] = value
                readGroups.append(rgFields)

        # Check No. 1:
        # Every tag referenced in fileNaming must exist in every readgroup
        inputs = re.findall(r"[^{\}]+(?=})", fileNaming)

        for rgFields in readGroups:
            if not all(i in rgFields for i in inputs):
                print("FAILED: Missing input tag")
                diff = difflib.Differ().compare(sorted(rgFields), inputs)
                errorMessage = '\n'.join(diff)
                validate("false")
                raise ValueError("Input tag does not exist:" + '\n' + errorMessage)

        # Check No. 2:
        # Predict the modified file names and fail if they are not unique
        fastqNames = set()
        rgData = {}

        for rgFields in readGroups:
            newName = fileNaming.format_map(rgFields)
            idData = []
            for readNum in [1, 2]:
                predictedFileName = f"{newName}_{readNum}.fastq.gz"
                if predictedFileName in fastqNames:
                    validate("false")
                    raise ValueError("File name results in non-unique names")
                fastqNames.add(predictedFileName)
                idData.append(predictedFileName)
            rgData[rgFields["ID"]] = idData

        # stdout is read back by the task as the rgData output
        print(rgData)
        CODE
    >>>

    runtime {
        modules: "~{modules}"
        memory: "~{memory}G"
        timeout: "~{timeout}"
    }

    output {
        String rgData = read_string(stdout())
        Boolean valid = read_boolean("isValid.txt")
    }

    meta {
        output_meta: {
            rgData: "A Python dictionary containing RG data",
            valid: "A boolean value to determine workflow continuation"
        }
    }
}

# Runs Picard SamToFastq on the BAM, writing one set of gzipped FASTQ files per
# readgroup (named after the readgroup ID) into the working directory.
task backExtractByRG {
    input {
        File bamFile
        String modules = "picard/2.21.2"
        Int memory = 24
        Int timeout = 96
    }

    parameter_meta {
        bamFile: "A BAM file with one or more readgroups"
        modules: "Required environment modules"
        memory: "Memory allocated for this job"
        timeout: "Time in hours before task timeout"
    }

    # JVM heap follows the memory request, leaving 4G of headroom
    # (20G for the default 24G, as before)
    Int javaHeap = if memory > 4 then memory - 4 else memory

    command <<<
        # Fail the task when Picard fails instead of provisioning partial output
        set -euo pipefail

        java -Xmx~{javaHeap}g -jar $PICARD_ROOT/picard.jar SamToFastq \
        INPUT=~{bamFile} \
        RG_TAG="ID" \
        OUTPUT_DIR=. \
        OUTPUT_PER_RG=true \
        COMPRESS_OUTPUTS_PER_RG=true \
        NON_PF=true \
        RE_REVERSE=true \
        VALIDATION_STRINGENCY=LENIENT

        # List the generated files so that they can be collected as outputs
        ls *.fastq.gz > outfilenames
    >>>

    runtime {
        modules: "~{modules}"
        memory: "~{memory}G"
        timeout: "~{timeout}"
    }

    output {
        Array[File]? rawFastqs = read_lines("outfilenames")
    }

    meta {
        output_meta: {
            rawFastqs: "An array of FASTQ files"
        }
    }
}


# Renames the per-readgroup FASTQ files (<ID>_<read>.fastq.gz) to the names
# listed in rgData and moves them into the working directory.
task renameFastqs {
    input {
        Array[File]? rawFastqs
        String rgData
        String modules = ""
        Int memory = 24
        Int timeout = 12
    }

    parameter_meta {
        rawFastqs: "An array of FASTQ files"
        rgData: "A Python dictionary mapping each readgroup ID to the list of new FASTQ names (read 1, read 2)"
        modules: "Required environment modules"
        memory: "Memory allocated for this job"
        timeout: "Time in hours before task timeout"
    }

    command <<<
        # Fail the task when the renaming script fails instead of listing partial results
        set -euo pipefail

        python3 <<CODE
        import ast
        import os

        rgData = ast.literal_eval("~{rgData}")
        fastqs = "~{sep=' ' rawFastqs}".split()
        suffix = ".fastq.gz"
        path = os.getcwd()

        for fastq in fastqs:
            # file name without the .fastq.gz extension
            fastqID = os.path.basename(fastq)
            if fastqID.endswith(suffix):
                fastqID = fastqID[:-len(suffix)]
            # the last character is the read number
            readNum = int(fastqID[-1])
            # drop the trailing _N to get the readgroup ID
            fastqID = fastqID[:-2]
            # look up the new name for this readgroup and read number
            newName = rgData[fastqID][readNum - 1]
            # move the file into the working directory under its new name
            os.rename(fastq, f"{path}/{newName}")
        CODE

        ls *.fastq.gz > outfilenames
    >>>

    runtime {
        modules: "~{modules}"
        memory: "~{memory}G"
        timeout: "~{timeout}"
    }

    output {
        Array[File]? modFastqs = read_lines("outfilenames")
    }

    meta {
        output_meta: {
            modFastqs: "FASTQs renamed in accordance to input"
        }
    }
}


# Runs FastQC on a single FASTQ file and extracts the fastqc_data text table
# from the zipped report into <id>_fastqc_data.txt.
task reviewFastq {
    input {
        File fastq
        String id
        String modules = "fastqc/0.11.9"
        Int memory = 24
        Int timeout = 12
    }

    parameter_meta {
        fastq: "the fastq file to review"
        id: "the expected id for the report, which should be the basename of the fastq file"
        modules: "Required environment modules"
        memory: "Memory allocated for this job"
        timeout: "Time in hours before task timeout"
    }

    command <<<
        # Without this, a failed fastqc/unzip would still leave an empty data file
        # behind (created by the redirect) and the task would report success
        set -euo pipefail

        ## run fastqc, this will generate an html report and zipped files, all in the working directory
        fastqc "~{fastq}" --outdir .
        ### locate the data file in the zipped output
        datafile=$(unzip -l "~{id}_fastqc.zip" | grep fastqc_data | sed 's/.* //')
        ### pull the text data out to a file
        unzip -p "~{id}_fastqc.zip" "$datafile" > "~{id}_fastqc_data.txt"

        ls *fastqc* > outfilenames
        ls *fastqc_data* > datafilenames
    >>>

    runtime {
        modules: "~{modules}"
        memory: "~{memory}G"
        timeout: "~{timeout}"
    }

    output {
        Array[File] fastqcDataFiles = read_lines("datafilenames")
        Array[File] fastqcFiles = read_lines("outfilenames")
    }

    meta {
        output_meta: {
            fastqcFiles: "FASTQC output, html report, txt data file and zipped content",
            fastqcDataFiles: "FASTQC extracted data tables, for summary task"
        }
    }
}


# Builds <id>.summary.txt from the samtools stats report of the input BAM and
# the FastQC data tables of the extracted FASTQ files: BAM read statistics,
# per-FASTQ read counts with their total, and per-FASTQ length distributions.
task summarize {
    input {
        String id
        File samstats
        Array[File]? fastqc
        Int memory = 24
        Int timeout = 12
    }

    parameter_meta {
        id: "bam file id (basename)"
        samstats: "the samstats file generated from the input bam file"
        fastqc: "the array of fastqc files"
        memory: "Memory allocated for this job"
        timeout: "Time in hours before task timeout"
    }

    command <<<
        summaryFile="~{id}.summary.txt"
        files="~{sep=' ' fastqc}"

        # Print the value of one "SN" (summary numbers) line of the samtools stats report
        snValue() {
          grep ^SN ~{samstats} | cut -f 2- | grep "$1" | cut -f2
        }

        ### samstats
        bamtotal=$(snValue "raw total sequences:")
        bampaired=$(snValue "reads paired:")
        lnavg=$(snValue "average length:")
        lnmax=$(snValue "maximum length:")
        insertsize=$(snValue "insert size average:")

        # Everything printed inside the braces lands in the summary file
        {
          echo "INPUT bam file stats"
          echo -e "bam total\t$bamtotal"
          echo -e "bam paired\t$bampaired"
          echo -e "bam mean readlength\t$lnavg"
          echo -e "bam max readlength\t$lnmax"
          echo -e "bam insertsize\t$insertsize"

          echo ""
          echo "fastq ReadCounts"
          totalreads=0
          for f in $files
          do
            fqName=$(basename $f "_fastqc_data.txt")
            count=$(grep "Total Sequences" $f | cut -f2)
            echo -e "$fqName\t$count"
            totalreads=$(($totalreads + $count))
          done
          echo -e "fastq total\t$totalreads"

          echo ""
          echo "fastq Sequence Length Distributions"
          for f in $files
          do
            fqName=$(basename $f "_fastqc_data.txt")
            echo $fqName
            # keep the lines between the two module headers, dropping the ">>" marker lines
            grep "Sequence Length Distribution" -A 999999999 $f | grep "Sequence Duplication Levels" -B 999999999 | grep -v ">>"
          done
        } > "$summaryFile"
    >>>

    runtime {
        memory: "~{memory}G"
        timeout: "~{timeout}"
    }

    output {
        File summary = "~{id}.summary.txt"
    }

    meta {
        output_meta: {
            summary: "A text file summary with fastq and bam readcounts and other metrics for review"
        }
    }
}

# =====================================================================
# This task either copies a (fastq) file to a destination directory or,
# if that directory does not exist, emits a message saying that the file
# could not be copied.
#
# It can be used to copy other kinds of files as well.
# =====================================================================
# Copies one file into outputDir when that directory exists; otherwise it only
# reports that the file was not provisioned. Either way the message printed to
# stdout is returned as the task output.
task copyOutFastq {
    input {
        File Fastq
        String outputDir
        Int memory = 24
        Int timeout = 12
    }

    parameter_meta {
        Fastq: "FASTQ file to copy"
        outputDir: "Output directory"
        memory: "Memory allocated for this job"
        timeout: "Time in hours before task timeout"
    }

    command <<<
        set -euxo pipefail
        # Paths are quoted so that directories or file names with spaces still work;
        # -d alone already implies that the path exists
        if [[ -d "~{outputDir}" ]]; then
          cp "~{Fastq}" "~{outputDir}"
          echo "File ~{basename(Fastq)} copied to ~{outputDir}"
        else
          # Deliberately not an error: the message ends up in the copy log instead
          echo "Final Output Directory was not configured, ~{basename(Fastq)} was not provisioned properly"
        fi
    >>>

    runtime {
        memory: "~{memory}G"
        timeout: "~{timeout}"
    }

    output {
        String message = read_string(stdout())
    }

    meta {
        output_meta: {
            message: "Message from the copy task"
        }
    }
}

# ========================================================
# Picks up messages from the copy task and writes them to a
# log file (this keeps a record for future reference)
# ========================================================
# Writes the messages collected from the copy tasks to copy_out.log, one
# message per line.
task composeLog {
    input {
        Array[String] messages
        Int memory = 4
        Int timeout = 2
    }

    parameter_meta {
        messages: "log messages from copyOut task"
        memory: "Memory allocated for this job"
        timeout: "Time in hours before task timeout"
    }

    command <<<
        python3 <<CODE
        # The messages are handed over through a file (one per line) rather than
        # being pasted into a Python string literal, so commas, quotes or
        # backslashes inside a message cannot corrupt the log or the script.
        with open("~{write_lines(messages)}", "r") as source:
            m_lines = source.read().splitlines()

        # Keep the historical behaviour for an empty message list: a single blank line
        if not m_lines:
            m_lines = [""]

        with open("copy_out.log", "w") as log:
            for line in m_lines:
                log.write(line + "\n")
        CODE
    >>>

    runtime {
        memory: "~{memory}G"
        timeout: "~{timeout}"
    }

    output {
        File log = "copy_out.log"
    }

    meta {
        output_meta: {
            log: "Log file with all messages from copyOut task"
        }
    }
}