#!/bin/bash

#SBATCH -p all
#SBATCH -D .
#SBATCH -N 1
#SBATCH -n 1
#SBATCH --cpus-per-task 15
#SBATCH -J water_only_viralrecon
#SBATCH -o water_only_viralrecon.out
#SBATCH -e water_only_viralrecon.err
#SBATCH --mail-type=ALL
#SBATCH --mail-user=rarthu3@emory.edu

#Revision 1.03 Mar 2, 2022 - add initial read count tab delim txt for Ludy
#Revision 1.02 Feb 22, 2022 - changes for viralrecon 2.4
#Revision 1.01 Feb 21, 2022 - change name to bcf consensus as we use viralrecon 2.3.1

#sister script for boolean for main submission script
#will be run in the water directory

# Record the working directory, the start time, and the FASTQ inputs found
# here, so the job log shows the context of this water-only run.
pwd
printf '%s\n' "Running water only analysis for a directory that should only have Water-*fastq.gz files in it"
date
printf '%s\n' "Doing an ls for files"
# The unquoted glob is intentional: list every file whose name ends in fastq.gz.
ls *fastq.gz

# --- Begin actual work ---
# Build samplesheet.csv from the FASTQ files in the current directory.
echo "Generate samplesheet.csv"
python fastq_dir_to_samplesheet.py . samplesheet.csv

# Actual analysis command. Options are listed one per line; the resulting
# argument vector is the same as a single long command line.
echo "Run analysis"
viralrecon_opts=(
  -profile docker
  --input samplesheet.csv
  --platform illumina
  --protocol amplicon
  --genome 'MN908947.3'
  --primer_bed swift_refv3_primers.bed
  --ivar_trim_offset 5
  --skip_assembly
  --skip_kraken2
  --multiqc_config '/home/rarthur/new_multiqc_config.yaml'
  --email 'rarthu3@emory.edu'
  --outdir ./results
)
# No exit-on-error option is set in this script, so the reporting steps that
# follow still run when nextflow returns a non-zero status.
nextflow run nf-core/viralrecon "${viralrecon_opts[@]}"

echo "Done at $(date)"

#This script should fail if the water samples have <1000 reads aligned to the
#covid19 reference genome.
#That is expected, and it is why these samples are separated from the actual
#analytical samples: a failure, or very few aligned reads, shows that the
#negative samples are true negative controls.

# Create the failure report for the water samples.
# The header line is written first (replacing any previous report). Then every
# "summary e-mail" line of the Nextflow log, plus the 20 lines after it, is
# appended, minus lines containing DEBUG, "{", PluginManager or Graphviz.
echo "Report of water samples that passed or failed the 1000 read threshold:" > water_error_summary.txt
grep -A20 "summary e-mail" .nextflow.log \
  | grep -v -e "DEBUG" -e "{" -e "PluginManager" -e "Graphviz" \
  >> water_error_summary.txt

# Package the consensus folder so it can be attached to the report e-mail.
# The fixed pipeline location is preferred. If it does not exist, fall back to
# the first directory named "consensus" that find locates; this lookup used to
# be computed and then ignored.
consensus_dir="results/variants/ivar/consensus/"
tarfolder=$(find . -name "consensus" -print | head -1)
if [[ ! -d "$consensus_dir" && -d "$tarfolder" ]]; then
  # Drop the leading "./" so archive member names have no "./" prefix.
  consensus_dir=${tarfolder#./}
fi
tar -zcf water_consensus_folder.tar.gz -C . "$consensus_dir"

# Run name = last two components of the working directory ("parent/dir").
# It is used in the subject and body of the report e-mail.
parent_path=${PWD%/*}
runname="${parent_path##*/}/${PWD##*/}"

# Grab the initial (raw) read counts from the MultiQC JSON data file and write
# them to a tab-delimited text file, starting with a header row.
printf 'Sample Name\t# of Raw Reads\n' > initial_read_counts.txt
# For every "Input reads" line, also take the line before it (grep -B1;
# presumably the sample name - confirm against the JSON layout). xargs joins
# the lines into one, then sed strips the JSON punctuation, splits entries on
# commas, and puts a tab between the name and the count.
# NOTE(review): the last expression 's/\.\0//g' looks intended to drop a
# trailing ".0" from the counts; "\0" inside a sed pattern is not portable, so
# confirm its behaviour with the sed in use before changing it.
grep -B1 "Input reads" results//multiqc/multiqc_data/multiqc_data.json \
  | xargs \
  | sed -e 's/: {//g' -e 's/ -- //g' -e 's/,/\n/g' -e 's/ # Input reads: /\t/g' -e 's/\.\0//g' \
  >> initial_read_counts.txt

# Send both reports plus the consensus tarball.
# The run name is expanded inside the quotes so that a path containing
# whitespace or glob characters cannot split the subject or body into
# several words.
email_subject="Water Samples failure report & initial read count summary for ${runname} attached"
email_body="Water error report for this run ${runname} is attached to this email. The aligned read count (if it exists) will be shown next to the corresponding sample. A tab-delimited text file containing the sample names and the number of raw reads is attached as well, and you can open it in Excel. If all samples failed to meet the 1000 read threshold, the consensus tarball will be useless, just an FYI. This is an automated email from an unmonitored email address. Please do not respond to this address."

# printf '%s\n' prints the body verbatim (no backslash interpretation).
# The mutt argument order is kept as it was: recipient first, then -a with the
# attachment list, terminated by "--".
printf '%s\n' "$email_body" \
  | mutt -s "$email_subject" rarthu3@emory.edu \
      -a water_error_summary.txt water_consensus_folder.tar.gz initial_read_counts.txt --