forked from yezhengSTAT/mHiC
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmhic_step0-6.sh
173 lines (139 loc) · 7.24 KB
/
mhic_step0-6.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
#!/bin/bash
## mHi-C
## Author: Ye Zheng
## Contact: [email protected]
#######################################################
## Script to call each step of mHi-C
## Take ring stage of Plasmodium falciparum for example
## Update May 2018
#######################################################
projectPath="/path/to/your/project/mHiC/mHiC_demo" ## path to save all the raw data, intermediate files and outputs.
## ************************************************
## step 0 - Download raw data - Example shown here.
## ************************************************
echo "Start step 0 - downloading!"
## Download Plasmodium falciparum - TROPHOZOITES fastq files for both ends
id="TROPHOZOITES"
fastqPath="$projectPath/fastqFiles"
if [ ! -d "$fastqPath" ]; then
mkdir -p $fastqPath
fi
wget -r "ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM1215nnn/GSM1215593/suppl/GSM1215593_trimmedAndFiltered-TROPHOZOITES-XL-AGGG-L2_1.fastq.gz" -O $fastqPath/$id\_1.fastq.gz
wget -r "ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM1215nnn/GSM1215593/suppl/GSM1215593_trimmedAndFiltered-TROPHOZOITES-XL-AGGG-L2_2.fastq.gz" -O $fastqPath/$id\_2.fastq.gz
gunzip $fastqPath/$id\_1.fastq.gz
gunzip $fastqPath/$id\_2.fastq.gz
## remove end suffix to have matching id between two ends
mv $fastqPath/$id\_1.fastq $fastqPath/$id\_1.tmp.fastq
awk '{print $1}' $fastqPath/$id\_1.tmp.fastq > $fastqPath/$id\_1.fastq
mv $fastqPath/$id\_2.fastq $fastqPath/$id\_2.tmp.fastq
awk '{print $1}' $fastqPath/$id\_2.tmp.fastq > $fastqPath/$id\_2.fastq
rm -rf $fastqPath/$id\_*tmp.fastq
## Download IMR90
# id="SRR1658591"
# sraDir="/projects/sratoolkit.2.8.2-1-centos_linux64"
# path="/projects/fastqFiles"
# mkdir -p $path
# ## tar -zxvf sratoolkit.tar.gz
# wget -r ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByExp/sra/SRX/SRX764/SRX764954/SRR1658591/SRR1658591.sra -O $path/$id.sra
# $sraDir/bin/fastq-dump -F --split-files $path/$id.sra -O $path
## step 1-3: Can be run in parallel.
## ******************
## step 1: Alignment
## ******************
name="TROPHOZOITES"
ref="$projectPath/bin/PlasmoDB-9.0_Pfalciparum3D7_Genome.fasta"
bwaDir="/p/keles/yezheng/volumeA/Softwares/bwa-0.5.9" ##"$projectPath/Softwares/bwa-0.5.9" ## need downloading bwa software
samtoolsDir="/p/keles/yezheng/volumeA/Softwares/samtools" ## "$projectPath/Softwares/samtools-1.3" ## need dowloading samtools software
fastqDir="$projectPath/fastqFiles"
resultsDir="$projectPath/$name"
bin="$projectPath/bin"
nCores=8
seqLength=25
resolution=10000
saveFiles=0
cutsite="GATCGATC" ## for MboI ##"AAGCTAGCTT" for HindIII or (GATCGATC ******** ********) for multiple cutters.
## compile cutsite to trim chimeric reads
g++ -std=c++0x -o $bin/cutsite_trimming_mHiC $bin/cutsite_trimming_mHiC.cpp
## generate reference genome bwa index
$bwaDir/bwa index $ref
## alignment
echo "Start step 1 - alignment!"
bash s1_bwaAlignment.sh "$name" "$ref" "$bwaDir" "$samtoolsDir" "$fastqDir" "$resultsDir/s1" "$bin" "$nCores" "$resultsDir/mHiC.summary_w${resolution}_s1" "$saveFiles" "$seqLength" "${cutsite[@]}"
## **************************
## step 2: Read ends pairing
## **************************
name="TROPHOZOITES"
resultsDir="$projectPath/$name"
resolution=10000
echo "Start step 2 - joining read ends!"
python3 s2_joinEnd.py -r1 ${resultsDir}/s1/${name}_1.sam -r2 ${resultsDir}/s1/${name}_2.sam -o ${resultsDir}/s2/${name}.sam -sf $resultsDir/mHiC.summary_w${resolution}_s2
## *********************************
## step 3: Valid fragment filtering
## *********************************
name="TROPHOZOITES"
resultsDir="$projectPath/$name"
bin="$projectPath/bin"
refrag="MboI_resfrag.bed" #restriction fragment file
resolution=10000
lowerBound=$((resolution * 2))
refragL=50 #$((seqLength * 2))
refragU=500
echo "Start step 3 - categorize read pairs!"
python3 s3_categorizePairs.py -f ${bin}/${refrag} -r ${resultsDir}/s2/${name}.sam -o ${resultsDir}/s3 -l $refragL -u $refragU -d $lowerBound -m "window" -b $resolution -sf $resultsDir/mHiC.summary_w${resolution}_s3
# ## In case, chrM is not needed in downstream analysis
# awk -v OFS="\t" '$2 != "chrM" && $7!="chrM" {print $0}' $validP >$validP.noChrM
# rm -rf $validP
# mv $validP.noChrM $validP
## ***************************************
## step 4 - Remove duplicates and binning.
## ***************************************
name="TROPHOZOITES"
resultsDir="$projectPath/$name"
resolution=10000
bin="$projectPath/bin"
validP="${resultsDir}/s3/${name}.validPairs"
validI="${resultsDir}/s4/${name}.validPairs"
minCount=1 #min contact counts allowed
normMethod="KR" #1. "ICE" 2. "KR" 3."None"
ICEmappFile="${bin}/pfal3D7.MboI.w${resolution}" ## mappability file for ICE method
ICEminMap=0.5 ## min mappability threshold for ICE method
ICEmaxIter=150 ## maximum number of iteration for ICE method
KRchromSizeFile=${bin}/plasmodium.chrom.sizes ## chromosome size file for KR method
KRsparsePerc=10 ## remove *% of sparse regions for KR method
splitByChrom=1
saveSplitContact=0
chrList=$(seq 1 14) ## If the chromosomes start with "chr", you can just put in the array of chromosome number, such as (1 2 3) or ($(seq 1 22) X Y). If not, the prefix and chromosome number should be both given, such as (chromosome1 chromosome2 chromosome3) or it can be generated by chrList=(chromosome{1..22}). Such chromosome names should be consistent with those in reference genome and the chromosome size file for KR normalization.
echo "Start step 4 - duplicates removal and binning!"
bash s4_bin.sh "$validP" "$validI" "$bin" "$resolution" "$minCount" "$normMethod" "$ICEmappFile" "$ICEminMap" "$ICEmaxIter" "whole" "$KRchromSizeFile" "$KRsparsePerc" "$resultsDir/mHiC.summary_w${resolution}_s4" "$splitByChrom" "$saveSplitContact" "${chrList[@]}"
## **********************
## step 5 - Build prior.
## **********************
name="TROPHOZOITES"
bin="$projectPath/bin"
resultsDir="$projectPath/$name"
validI="${resultsDir}/s4/${name}.validPairs"
resolution=10000
splineBin=150
lower=10000
priorName="uniPrior"
normMethod="KR" #"ICE" ##"KR"
chromSizeFile=${bin}/plasmodium.chrom.sizes
contactFile=$validI.binPairCount.uni
echo "Starts step 5 - prior construction based on uni-reads only!"
python3 $bin/createFitHiCFragments-fixedsize.py --chrLens "$chromSizeFile" --resolution "$resolution" --outFile "$resultsDir/$name.uni.fragments.mHiC"
python3 s5_prior.py -f ${resultsDir}/$name.uni.fragments.mHiC -i $contactFile -o ${resultsDir} -t $validI.binPairCount.uni.KRnorm.bias -b $splineBin -L $lower -r $resolution -p 2
## ************************************************************************************
## step 6 - Generative model to assign probability to multi-reads potential alignments.
## ************************************************************************************
name="TROPHOZOITES"
resultsDir="$projectPath/$name"
resolution=10000
prior="${resultsDir}/s5_prior.mhic"
multi="${resultsDir}/s4/${name}.validPairs.MULTI.binPair.multi"
multiKeys="$resultsDir/s4/${name}.validPairs.MULTI.binPair.multiKeys"
uni="$resultsDir/s4/${name}.validPairs.binPairCount.uni"
filename="${name}.validPairs.binPair.multi"
threshold=0.5
echo "Starts step 6 - assign probability to multi-reads potential alignment positions !"
awk -v OFS="_" '{print $2, $3, $4, $5}' $multi | sort -u >$multiKeys
python3 s6_em.py -p $prior -u $uni -m $multi -mk $multiKeys -t $threshold -o "${resultsDir}/s6" -f $filename