-
Notifications
You must be signed in to change notification settings - Fork 15
/
Copy pathspike_in_calibration.csh
116 lines (102 loc) · 5.3 KB
/
spike_in_calibration.csh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
#!/bin/csh
#Spike-in calibration example C-shell script
#This is an EXAMPLE of one way to do spike-in calibration
#Seven arguments
# 1 2 3 4 5 6 7
#spike_calibrate.csh genome.bed spike_genome.bed scale output(bg|bga|d) genome_chr_lens_file min_len max_len
#Assumes Illumina paired-end sequencing
#Packages used: An alignment program (e.g. bowtie2, BWA, etc.), picard and bedtools
#Requires bed files of alignments to the experimental genome as well as to the spike-in genome
#If there is no spike-in, put "none" in argument 2.
#The only information used from the spike-in alignment is the number of fragments aligned
#(the number of lines in spike_genome.bed).
#min_len and max_len refer to the fragment lengths in genome.bed
#Format for genome_chr_lens_file is (this information is in the sam file headers):
#chr1 249250621
#chr2 243199373
#...
#Before running this script:
#1. Align Illumina fastq read files to both genomes producing two sam files.
# Any alignment program that produces a sam/bam file can be used.
# However the alignments are done, be careful that not too many reads are aligned
# to both genomes. You may want to map to a masked version of the spike-in genome
# to avoid cross-mapping to rDNA and mitochondrial DNA, but be aware that masked genomes
# eliminate all repeated regions (e.g. centromeric, transposons, etc.), some of which
# may be of interest.
#2. Optionally remove duplicate reads (e.g. use picard MarkDuplicates) from the sam file.
# Be aware that all duplicates will be removed since the program cannot determine the
# origin of duplicate reads. You should only remove duplicates when you are starting with
# very low amounts of material, such as when there are few cells. Unlike ChIP-seq, where
# sonication fragments DNA more-or-less at random, CUT&RUN cuts DNA precisely, and so some
# degree of identical cleavage events are expected, especially for sequence-specific DNA
# binding proteins.
#3. Extract properly aligned fragments from both genomes.
# a. Convert sam to bam format (picard or samtools)
# b. Extract aligned fragments from bam file (bedtools bamtobed)
# For subsequent processing, the length of each fragment is
# required in the bed file, this will probably need to be
# added. awk can be used to add fragment length, e.g.:
# cat bamtobed.bed | awk -v OFS='\t' '{len = $3 - $2; print $0, len }'
# Assuming that $2 is the actual start position - 1
#----------------------------------------------------------------------------------------------
#4. Spike-in calibration
# NOTE: spike-in calibration may not be appropriate for your sample. You DO need to do it
# when you need to compare samples, such as comparing treatment to control. The primary
# genome to spike-in genome ratio per cell is expected to be the same for all samples.
# The per base pair calculation used here to make a track is:
# scale * (primary_genome_mapped_count_at_bp)/(spike-in_genome_total_of_mapped_fragments)
# You can use any scale multiplier and any program you like to do the calculation, the
# example here uses bedtools genomecov.
if ($#argv < 7) then
echo "USAGE spike_calibrate.csh genome.bed spike_genome.bed scale output(bg|bga|d) genome_chr_lens min_len max_len"
echo "Spike-in calibration using bedtools genomecov"
echo "scale is an arbitrary large number used as multiplier (e.g. 10000)"
echo "If there is no spike-in, use spike_genome.bed = none"
echo "min_len and max_len refer to the lengths of fragments in genome.bed to calibrate"
echo "To calibrate all fragments use min_len = 1 and max_len = 1000"
echo "Output will be placed in the current directory"
exit(-1)
endif
set genome_bed = $1
set spike_bed = $2
set scale = $3
set report = $4
set genome_len = $5
set min_len = $6
set max_len = $7
echo $genome_bed $spike_bed $scale $report $min_len $max_len
if (!(-e $genome_bed) || (-z $genome_bed)) then
echo "$genome_bed not found or is empty"
exit(-1)
endif
if (!(-e $genome_len) || (-z $genome_len)) then
echo "$genome_len not found or is empty"
exit(-1)
endif
set temp = $genome_bed:t
set name = $temp:r
set output = $name.${min_len}-${max_len}.$report
echo "Output is in file $output in the current directory"
if (-e $spike_bed && !(-z $spike_bed)) then
set spike_count = `wc -l $spike_bed | awk '{print $1}'`
set scale_factor = `echo "$scale / $spike_count" | bc -l`
else
set spike_count = 0
set scale_factor = $scale
endif
echo scale_factor=$scale_factor
#Select fragments within the length range, assumes fragment length is in column 4 of the bed file
cat $genome_bed | awk -v min=$min_len -v max=$max_len '{if ($4 >= min && $4 <= max) print}' > $$.temp.bed
#Use genomecov to compute spike-in calibrated genome coverage
#see http://bedtools.readthedocs.io/en/latest/content/tools/genomecov.html
#The first position is start-1 and the last is end for bed files, -bg and -bga
#-bga prints zero intervals -bg doesn't, -d prints each bp starting from 1 (not 0)
#This is for IGV which doesn't recognize .bg or .bga files
if ($report == "bg" || $report == "bga") then
set output = $name.${min_len}-${max_len}.bedgraph
echo track type=bedGraph name=$name > $output
endif
bedtools genomecov -$report -scale $scale_factor -i $$.temp.bed -g $genome_len >> $output
unalias rm
rm $$.*
exit