-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy path1_mapping.sh
387 lines (290 loc) · 18.5 KB
/
1_mapping.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
#!/bin/bash -l
#SBATCH -A naiss2023-5-222
#SBATCH -p devcore -n 1
#SBATCH -t 0-00:20:00
#SBATCH -J Cisco_QC
#SBATCH -e /proj/snic2020-2-19/private/cisco/qiaoling/European_Cisco_Genomic_Works/TEMP_outdir_running_logs/cisco_data-%j.err
#SBATCH -o /proj/snic2020-2-19/private/cisco/qiaoling/European_Cisco_Genomic_Works/TEMP_outdir_running_logs/cisco_data-%j.out
# This script is based on code by Jake Goodall.
# The following script aims to develop the low-pass EU Cisco data from fasta files through to genotype likelihood files.
# The sequencing was done in multiple batches so repeated samples will be combined post mapping
################################################################
## PHASE 0: Load the appropriate programs for running jobs ##
################################################################
# Load the programs required for the script
ml load bioinfo-tools
module load bwa/0.7.15
module load ANGSD/0.933
module load samtools/1.12
module load bamtools/2.5.1
module load FastQC/0.11.8
module load MultiQC/1.11
module load QualiMap/2.2.1
module load PCAngsd/0.982
module load trimmomatic/0.39
module load bowtie/1.2.3
module load bowtie2/2.4.5
module load picard/2.23.4
module load GATK/3.8-0
module load GATK/4.2.0.0
################################################################
## PHASE 1: Index the genome for mapping and setup symlinks ##
################################################################
### Sample and genome locations for housekeeping
# Individual RAW read data
# /proj/snic2020-2-19/private/cisco/reads/data_from_snpseq00071/snpseq00071/files/VB-3202/221007_A00181_0579_AHYMYLDSX3/
# RUN1 Symlinked Fasta Files
# /proj/snic2020-2-19/private/herring/users/jake/European_Cisco_Library_Quality_and_Preliminary_Structure/RUN1_Symlinks_to_individual_reads
# RUN2 Symlinked Fasta Files
# /proj/snic2020-2-19/private/herring/users/jake/European_Cisco_Library_Quality_and_Preliminary_Structure/RUN2_Symlinks_to_individual_reads
# RUN3 Symlinked Fasta Files
# /proj/snic2020-2-19/private/herring/users/jake/European_Cisco_Library_Quality_and_Preliminary_Structure/RUN3_Symlinks_to_individual_reads
# Cisco assembly location
#/proj/snic2020-2-19/private/cisco/qiaoling/new_assembly
### Generate an index for the cisco genome ###
#cd /proj/snic2020-2-19/private/cisco/qiaoling/new_assembly
#bwa index -p European_cisco_w_extra_data.ipa1.8.0.purged.primary "$cisco_genome"
#bowtie2-build $cisco_genome European_cisco_w_extra_data.ipa1.8.0.purged.primary
#
####################################################
## PHASE 2: Prepare the fasta files for mapping ##
####################################################
### Check whether the sample lists are being called properly
# Define variables for easier coding ###
cisco_genome_bwa_index='/proj/snic2020-2-19/private/cisco/qiaoling/new_assembly/European_cisco_w_extra_data.ipa1.8.0.purged.primary'
cisco_genome_bowtie_index='/proj/snic2020-2-19/private/cisco/qiaoling/new_assembly/European_cisco_w_extra_data.ipa1.8.0.purged.primary'
BASEDIR='/proj/snic2020-2-19/private/herring/users/jake/European_Cisco_Library_Quality_and_Preliminary_Structure'
ADAPTORS='/proj/snic2020-2-19/private/herring/users/jake/European_Cisco_Genomic_Works/FILES_for_processing_samples/Nextera_Mate_Pair_Adaptors.fasta'
SAMPLELIST='/proj/snic2020-2-19/private/herring/users/jake/European_Cisco_Genomic_Works/FILES_for_processing_samples/RUN1_Sample_List_MOD.txt' # Path to the sample list.
RAWFASTQSUFFIX1=_R1_001.fastq.gz # Suffix to raw fastq files. Use forward reads with paired-end data.
# Test whether the sample list is being input properly
#for SAMPLE in `cat $SAMPLELIST`; do
#
# echo $SAMPLE
# zcat $BASEDIR'/RUN1_Symlinks_to_individual_reads/'$SAMPLE$RAWFASTQSUFFIX1 | head -n 8
# echo ' '
#
#done
### Quality and adaptor trimming of raw reads ###
###################################################
RAWFASTQDIR=/proj/snic2020-2-19/private/herring/users/jake/European_Cisco_Library_Quality_and_Preliminary_Structure/RUN1_Symlinks_to_individual_reads/ # Path to raw fastq files.
RAWFASTQSUFFIX1=_R1_001.fastq.gz # Suffix to raw fastq files. Use forward reads with paired-end data.
RAWFASTQSUFFIX2=_R2_001.fastq.gz # Suffix to raw fastq files. Use reverse reads with paired-end data.
ADAPTERS='/proj/snic2020-2-19/private/herring/users/jake/European_Cisco_Genomic_Works/FILES_for_processing_samples/Nextera_Mate_Pair_Adaptors.fasta' # Path to a list of adapter/index sequences.
## Loop over each sample
for SAMPLE in `cat $SAMPLELIST`; do
SAMPLE_UNIQ_ID=$SAMPLE #define unique sample IDs
RAWFASTQ_ID=$RAWFASTQDIR$SAMPLE #data input path
SAMPLEADAPT='/proj/snic2020-2-19/private/cisco/qiaoling/Fasta_Files/RUN_1_trimmed/'$SAMPLE_UNIQ_ID #data output path
sbatch -J cisco_trim -e /proj/snic2020-2-19/private/cisco/qiaoling/European_Cisco_Genomic_Works/TEMP_outdir_running_logs/$SAMPLE-%j.err -p core -c 1 -t 0-24:00:00 -A naiss2023-5-222 --wrap="trimmomatic PE -threads 1 -phred33 $RAWFASTQ_ID$RAWFASTQSUFFIX1 $RAWFASTQ_ID$RAWFASTQSUFFIX2 \
$SAMPLEADAPT'_adapter_clipped_f_paired.fastq.gz' $SAMPLEADAPT'_adapter_clipped_f_unpaired.fastq.gz' $SAMPLEADAPT'_adapter_clipped_r_paired.fastq.gz' $SAMPLEADAPT'_adapter_clipped_r_unpaired.fastq.gz' \
'ILLUMINACLIP:'$ADAPTERS':2:30:10 MINLEN:40'"
done
######################################################
## PHASE 3: Map fasta to reference and create BAM ##
######################################################
### Map to the reference, sort, and quality filter ###
#######################################################
BASEDIR='/proj/snic2020-2-19/private/herring/users/jake/European_Cisco_Library_Quality_and_Preliminary_Structure'
SAMPLELIST=$BASEDIR'/sample_lists/TEST_MAPPING_SCRIPT/TESTMAP_Sample_list_RUN1.txt' # Path to a list of prefixes of the raw fastq files. It should be a subset of the the 1st column of the sample table.
SAMPLETABLE=$BASEDIR'/sample_lists/TEST_MAPPING_SCRIPT/TESTMAP_fastq_table_RUN1.tsv' # Path to a sample table where the 1st column is the prefix of the raw fastq files.
FASTQDIR='/proj/snic2020-2-19/private/cisco/qiaoling/Fasta_Files/RUN_1_trimmed/' # Path to the directory where fastq file are stored.
FASTQSUFFIX1=_adapter_clipped_f_paired.fastq.gz # Suffix to fastq files. Use forward reads with paired-end data.
FASTQSUFFIX2=_adapter_clipped_r_paired.fastq.gz # Suffix to fastq files. Use reverse reads with paired-end data.
MAPPINGPRESET=very-sensitive # The pre-set option to use for mapping in bowtie2 (very-sensitive for end-to-end (global) mapping [typically used when we have a full genome reference]
REFERENCE=/proj/snic2020-2-19/private/cisco/qiaoling/new_assembly/European_cisco_w_extra_data.ipa1.8.0.purged.primary.fa # Path to reference fasta file and file name
BOWTIE_INDEX='/proj/snic2020-2-19/private/cisco/qiaoling/new_assembly/European_cisco_w_extra_data.ipa1.8.0.purged.primary'
REFNAME=European_cisco.hifiasm0.16.1 # Reference name to add to output files, e.g. gadMor2
## Loop over each sample
for SAMPLEFILE in `cat $SAMPLELIST`; do
## Extract relevant values from a table of sample, sequencing, and lane ID (here in columns 4, 3, 2, respectively) for each sequenced library
SAMPLE_ID=`grep -P "${SAMPLEFILE}\t" $SAMPLETABLE | cut -f 4`
POP_ID=`grep -P "${SAMPLEFILE}\t" $SAMPLETABLE | cut -f 5`
SEQ_ID=`grep -P "${SAMPLEFILE}\t" $SAMPLETABLE | cut -f 3`
LANE_ID=`grep -P "${SAMPLEFILE}\t" $SAMPLETABLE | cut -f 2`
SAMPLE_UNIQ_ID=${SAMPLE_ID}_${POP_ID}_${LANE_ID} # When a sample has been sequenced in multiple lanes, we need to be able to identify the files from each run uniquely
## Extract data type from the sample table
DATATYPE=`grep -P "${SAMPLEFILE}\t" $SAMPLETABLE | cut -f 6`
## The input and output path and file prefix
SAMPLETOMAP=$FASTQDIR$SAMPLE_UNIQ_ID
SAMPLEBAM='/proj/snic2020-2-19/private/cisco/qiaoling/Bam_Files/Mapped_Rd1/'$SAMPLE_UNIQ_ID
## Define platform unit (PU), which is the lane number
PU=`grep -P "${SAMPLEFILE}\t" $SAMPLETABLE | cut -f 2`
## Define reference base name
REFBASENAME="${REFERENCE%.*}"
## Map reads to the reference
echo $SAMPLE_UNIQ_ID
# Map the paired-end reads
if [ $DATATYPE = pe ]; then
# We ignore the reads that get orphaned during adapter clipping because that is typically a very small proportion of reads. If a large proportion of reads get orphaned (loose their mate so they become single-end), these can be mapped in a separate step and the resulting bam files merged with the paired-end mapped reads.
sbatch -J cisco_trim -e /proj/snic2020-2-19/private/cisco/qiaoling/European_Cisco_Genomic_Works/Temp_Error_Reports/$SAMPLE_UNIQ_ID-%j.err -p core -c 1 -t 10-00:00:00 -A naiss2023-5-222 --wrap="bowtie2 -q --phred33 --$MAPPINGPRESET -p 1 -I 0 -X 1500 --fr --rg-id $SAMPLE_UNIQ_ID --rg SM:$SAMPLE_ID --rg LB:$SAMPLE_ID --rg PU:$PU --rg PL:ILLUMINA -x $BOWTIE_INDEX -1 $SAMPLETOMAP$FASTQSUFFIX1 -2 $SAMPLETOMAP$FASTQSUFFIX2 -S $SAMPLEBAM'_'$SEQ_ID'_'$DATATYPE'_bt2_'$REFNAME'.sam'
## Convert to bam file for storage (including all the mapped reads)
samtools view -bS -F 4 $SAMPLEBAM'_'$SEQ_ID'_'$DATATYPE'_bt2_'$REFNAME'.sam' > $SAMPLEBAM'_'$SEQ_ID'_'$DATATYPE'_bt2_'$REFNAME'.bam'
#rm -f $SAMPLEBAM'_'$SEQ_ID'_'$DATATYPE'_bt2_'$REFNAME'.sam'
# ## Filter the mapped reads (to onky retain reads with high mapping quality)
# # Filter bam files to remove poorly mapped reads (non-unique mappings and mappings with a quality score < 20) -- do we want the quality score filter??
samtools view -h -q 20 $SAMPLEBAM'_'$SEQ_ID'_'$DATATYPE'_bt2_'$REFNAME'.bam' | samtools view -buS - | samtools sort -o $SAMPLEBAM'_'$SEQ_ID'_'$DATATYPE'_bt2_'$REFNAME'_minq20_sorted.bam'"
fi
done
######################################################
## PHASE 4
######################################################
### Merge all run BAM files together ###
#####################################################
for SAMPLE in `cat /proj/snic2020-2-19/private/cisco/qiaoling/Bam_Files/Merge_Bams/Bam_Group_Names.txt`; do
sbatch -J cisco_merge -e /proj/snic2020-2-19/private/cisco/qiaoling/European_Cisco_Genomic_Works/Temp_Error_Reports/$SAMPLE-%j.err -p core -c 1 -t 5-00:00:00 -A naiss2023-5-222 --wrap="bamtools merge -list '/proj/snic2020-2-19/private/cisco/qiaoling/Bam_Files/Merge_Bams/'$SAMPLE'.txt' -out '/proj/snic2020-2-19/private/cisco/qiaoling/Bam_Files/Final_Merged_Bam/'$SAMPLE'_ALL_pe_bt2_European_cisco.hifiasm0.16.1_minq20_sorted.bam'"
done
######################################################
## PHASE 5
######################################################
### Deduplicate and clip overlapping read pairs ###
#####################################################
BAMLIST=/proj/snic2021-6-194/European_Cisco_LAndersson/Data_Files/Bam_Files/Final_Merged_Bam/All_Bam_List_MOD.txt # It's the same as the fastq setup so we can reuse this file
REFNAME=European_cisco.hifiasm0.16.1 # Reference name to add to output files
## Loop over each sample
for SAMPLEBAM in `cat $BAMLIST`; do
SAMPLE_UNIQ_ID=$SAMPLEBAM
## Remove duplicates and print dupstat file
## THIS ONLY NEEDS TO BE DONE IF YOU MERGED THE PE AND SE READS PREVIOUSLY
# We used to be able to just specify picard.jar on the CBSU server, but now we need to specify the path and version
# java -jar $PICARD_ROOT/picard.jar MarkDuplicates I=$BASEDIR'/Bam/'$SAMPLEBAM'_bt2_'$REFNAME'_minq20_sorted.bam' O=$BASEDIR'/Bam/'$SAMPLEBAM'_bt2_'$REFNAME'_minq20_sorted_dedup.bam' M=$BASEDIR'/Bam/'$SAMPLEBAM'_bt2_'$REFNAME'_minq20_sorted_dupstat.txt' VALIDATION_STRINGENCY=SILENT REMOVE_DUPLICATES=true
## Clip overlapping paired end reads (only necessary for paired-end data, so if you're only running se samples, you can comment this step out)
conda activate cisco_mapping
sbatch -J cisco_map -e /proj/snic2020-2-19/private/cisco/qiaoling/European_Cisco_Genomic_Works/Temp_Error_Reports/${SAMPLE_UNIQ_ID}%j.err -p core -c 1 -t 1-00:00:00 -A naiss2023-5-222 --wrap="bam clipOverlap --in '/proj/snic2020-2-19/private/cisco/qiaoling/Bam_Files/Final_Merged_Bam/'$SAMPLEBAM'_bt2_'$REFNAME'_minq20_sorted.bam' --out '/proj/snic2020-2-19/private/cisco/qiaoling/Bam_Files/Final_Merged_Bam/'$SAMPLEBAM'_bt2_'$REFNAME'_minq20_sorted_overlapclipped.bam' --stats"
done
######################################################
## PHASE 6 Indel Realignment
######################################################
#!/bin/bash -l
#SBATCH -A naiss2023-5-222
#SBATCH -p core
#SBATCH -n 8
#SBATCH -t 10-00:00:00
#SBATCH --array=1-336:1
#SBATCH -J idel_realignment
#SBATCH -e idel_realignment_%A_%a.err
#SBATCH -o idel_realignment_%A_%a.out
#SBATCH --mail-type=all
#SBATCH [email protected]
#Load modules
module load bioinfo-tools
module load samtools/1.12
module load bamtools/2.5.1
module load GATK-Queue/3.7
#Path to the directory where you have the bam-files
BASEDIR=/proj/snic2020-2-19/private/cisco/qiaoling/Bam_Files/Final_Merged_Bam/
cd $BASEDIR
# RUN THIS INSIDE THE SEQDIR ALREADY
# Refference genome
REFGENOME='/proj/snic2020-2-19/private/cisco/qiaoling/new_assembly/European_cisco_w_extra_data.ipa1.8.0.purged.primary.fa'
# List the files and get one file:
BAM_FILE=$(ls *_sorted_overlapclipped.bam | sed -n ${SLURM_ARRAY_TASK_ID}p)
OUTPUT=${BAM_FILE/_sorted_overlapclipped.bam/_sorted_overlapclipped_realigned.bam}
# Run programs
java -Xmx38g -jar /sw/bioinfo/GATK/3.7/GenomeAnalysisTK.jar \
-T RealignerTargetCreator \
-R $REFGENOME \
-I $BAM_FILE \
-o $BASEDIR/'all_samples_for_indel_realigner.intervals'
java -Xmx38g -jar /sw/bioinfo/GATK/3.7/GenomeAnalysisTK.jar \
-T IndelRealigner \
-R $REFGENOME \
-I $BAM_FILE \
-targetIntervals $BASEDIR/'all_samples_for_indel_realigner.intervals' \
-o $OUTPUT
# Index all samples running array idexes
samtools index $OUTPUT
#############################################################################
## PHASE 7 ### Check coverage stats with Qualimap multi-bam QC ###
#############################################################################
unset DISPLAY
## Qualimap statistics for all samples individually
BAMQC_SAMPLES_INPUT='/proj/snic2020-2-19/private/cisco/qiaoling/Bam_Files/Final_Merged_Bam/EU_Cisco_Sample_List_Clipped_realigned.txt'
qualimap multi-bamqc -data $BAMQC_SAMPLES_INPUT -r -outdir /proj/snic2020-2-19/private/cisco/qiaoling/Bam_Files/Bam/QC -outformat HTML
#############################################################################
## PHASE 8 ### visualization coverage stats with in R ###
#############################################################################
##### copy qualimap output from HTML, generate remapping.csv
setwd("E:/16-European_cisco/7_remapping")
library("ggplot2")
##load file
cisco_remapping <- read.csv("remapping.csv",header = TRUE)
head(cisco_remapping)
## plot
re_pop <- ggplot(cisco_remapping, aes(x=population, y=Coverage_mean_remapping,)) +
stat_boxplot(geom = 'errorbar',linewidth=0.2,cex=1)+
geom_boxplot(fill = '#8dd3c7' , color = "black")+
geom_hline(yintercept = 1.0, linetype = "solid", color = "black")+
geom_hline(yintercept = 0.5, linetype = "dashed", color = "red")+
theme_minimal() +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_rect(color = "black", fill = NA))
re_pop
re_insert <- ggplot(cisco_remapping, aes(x=population, y=Insert_size_median_remapping)) +
stat_boxplot(geom = 'errorbar',linewidth=0.2,cex=1)+
geom_boxplot(fill = '#bebada' , color = "black")+
theme_minimal() +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_rect(color = "black", fill = NA))
re_insert
re_ind <- ggplot(cisco_remapping, aes(x=Sample, y=Coverage_mean_remapping)) +
stat_boxplot(geom = 'errorbar',linewidth=0.2,cex=1)+
geom_boxplot(fill = '#8dd3c7' , color = "black")+
geom_hline(yintercept = 1.0, linetype = "solid", color = "black")+
geom_hline(yintercept = 0.5, linetype = "dashed", color = "red")+
theme_minimal() +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_rect(color = "black", fill = NA))
re_ind
#######Scatter plot
ggplot(data = cisco_remapping, aes(x = Coverage_mean_remapping, y = Insert_size_median_remapping)) +
geom_point(shape = 18, color = "#377eb8", size = 2) +
geom_smooth(method = "lm", se = FALSE) +
geom_smooth(method = "lm", se = FALSE, color = "#e41a1c")+
labs(x = "Mean Coverage", y = "Insert size median") +
theme_minimal()+
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_rect(color = "black", fill = NA))
####### compare
cisco_remapping_compare <- read.csv("remapping_compare.csv",header = TRUE)
head(cisco_remapping_compare)
Coverage <- ggplot(cisco_remapping_compare, aes(x=Mapping, y= Coverage_mean,fill=Mapping)) +
stat_boxplot(geom = 'errorbar',linewidth=0.2,cex=1)+
geom_boxplot()+
geom_hline(yintercept = 1.0, linetype = "solid", color = "black")+
geom_hline(yintercept = 0.5, linetype = "dashed", color = "red")+
theme_minimal() +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_rect(color = "black", fill = NA))
Coverage
Insert <- ggplot(cisco_remapping_compare, aes(x=Mapping, y=Insert_size_median,fill=Mapping)) +
stat_boxplot(geom = 'errorbar',linewidth=0.2,cex=1)+
geom_boxplot()+
theme_minimal() +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_rect(color = "black", fill = NA))
Insert
library(gridExtra)
grid.arrange(Coverage, Insert, ncol = 2)
#######plot col
Col_coverage<- ggplot(data = cisco_remapping_compare,aes(x = Sample,y = Coverage_mean,fill = Mapping))+
geom_col(position = 'dodge',
width = 0.5)+
facet_wrap(~Mapping,ncol = 2)+
theme(axis.text.x = element_blank())+
geom_hline(yintercept = 1.0, linetype = "solid", color = "black")+
geom_hline(yintercept = 0.5, linetype = "dashed", color = "red")
Col_coverage
Col_Insert<- ggplot(data = cisco_remapping_compare,aes(x = Sample,y = Insert_size_median,fill = Mapping))+
geom_col(position = 'dodge',
width = 0.5)+
facet_wrap(~Mapping,ncol = 2)+
theme(axis.text.x = element_blank())+
geom_hline(yintercept = 281.94, linetype = "solid", color = "black")
Col_Insert