-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathappend_non-standard-CADD.sh
109 lines (83 loc) · 2.42 KB
/
append_non-standard-CADD.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
#!/bin/bash
#
# append_non-standard-CADD.sh
# This script adds to the UDP's local database of non-standard CADD scores
# INPUTS: current non-standard file, file of non-standard scores to be appended
# OUTPUTS: appended non-standard file, updated index file
#
# Last modified on: 08/20/2015
# Last modified by: Mike Warburton
# usage: print the command-line help text (assumptions, options and outputs)
# to standard output. Takes no arguments.
usage() {
  local -a help_lines=(
    "This script adds web-scored variant data to the UDP's local database of non-standard CADD scores."
    "ASSUMPTIONS:"
    "The local CADD database is in /data/Udpdata(/Uploads)/Reference/CADD/v*.*/hg19/"
    "INPUT:"
    "-v <version number> The version number for the set of CADD scores (e.g. '1.3')"
    "-g <genome type> The genome type for the set of CADD scores - default is 'hg19'"
    "-n <new scores> The file containing the scores to be appended"
    "-u A flag to append to the current database in Uploads"
    "OUTPUT:"
    "A newly appended score file"
    "A new index file for that database file"
  )
  # One help line per output line, each terminated by a newline.
  printf '%s\n' "${help_lines[@]}"
}
# Check and collect input.
# The script accepts between 4 and 7 command-line words: -v and -n (each with
# a value) are required, -g <genome type> and the -u flag are optional.
if (( $# < 4 || $# > 7 )); then
  usage
  exit 1
fi

# Start from known values so that same-named variables inherited from the
# environment cannot masquerade as command-line input.
uploadsFlag=false
verNum=""
genomeType=""
newFile=""

while getopts "v:g:n:u" OPTION; do
  case "$OPTION" in
    v) verNum=$OPTARG ;;
    g) genomeType=$OPTARG ;;
    n) newFile=$OPTARG ;;
    u) uploadsFlag=true ;;
    # getopts reports unknown options and missing option values as '?'.
    *)
      usage
      exit 1
      ;;
  esac
done

# The version number and the file of new scores are mandatory.
if [[ -z "$verNum" || -z "$newFile" ]]; then
  usage
  exit 1
fi

# The genome type defaults to hg19 when -g is not given.
genomeType=${genomeType:-hg19}

# A gzip-compressed score file is decompressed in place (gunzip replaces the
# .gz file with its uncompressed counterpart) and newFile is pointed at the
# result. The test matches exactly the ".gz" suffix that is stripped below.
if [[ "$newFile" == *.gz ]]; then
  if ! gunzip -f -- "$newFile"; then
    echo "ERROR: could not decompress $newFile" >&2
    exit 1
  fi
  newFile=${newFile%.gz}
fi

# Stop early instead of merging a missing or unreadable score file.
if [[ ! -r "$newFile" ]]; then
  echo "ERROR: cannot read new score file: $newFile" >&2
  exit 1
fi
# Set variables
# The source database is read from the Reference tree, or from the Uploads
# tree when -u was given. The working copy and the new database are always
# written to the Uploads tree.
uploadsDir="/data/Udpdata/Uploads/Reference/CADD/v$verNum/$genomeType"
oDbGz="/data/Udpdata/Reference/CADD/v$verNum/$genomeType/non-standard_Variants.tsv.gz"
if [[ "$uploadsFlag" == true ]]; then
  oDbGz="$uploadsDir/non-standard_Variants.tsv.gz"
fi
oDB="$uploadsDir/non-standard_Variants.old.tsv"
nDB="$uploadsDir/non-standard_Variants.tsv"

# A failure in any stage of a pipeline must be visible to the checks below.
set -o pipefail

# die: print an error message to stderr and abort the script.
die() {
  echo "ERROR: $*" >&2
  exit 1
}

# header_line: print the line number of the first line containing "#CHROM"
# in file $1, or 0 when the file has no such line.
header_line() {
  local n
  n=$(grep -n -m1 -- "#CHROM" "$1" | cut -d':' -f1)
  printf '%s\n' "${n:-0}"
}

# Remove the intermediate files on every exit path, not only on success.
cleanup() {
  rm -f -- "$nDB.temp" "$oDB"
}
trap cleanup EXIT

# Unzip the non-standard CADD database into the Uploads directory
gunzip -c -- "$oDbGz" > "$oDB" || die "could not decompress $oDbGz"

# Concatenate and sort the score files
echo "Combining and sorting files"
oHdr=$(header_line "$oDB")
nHdr=$(header_line "$newFile")

# Merge the data rows (everything after each file's #CHROM line).
# Sort keys, tab-separated: chromosome numerically, then chromosome as text so
# that non-numeric names (which all compare as 0 numerically) stay grouped
# instead of being interleaved by position, then position numerically, then
# the third and fourth columns. tabix needs each chromosome's rows to be
# contiguous. uniq then drops exact duplicate lines.
{
  tail -n +"$((oHdr + 1))" -- "$oDB"
  tail -n +"$((nHdr + 1))" -- "$newFile"
} | LC_ALL=C sort -t $'\t' -k1,1n -k1,1 -k2,2n -k3,3 -k4,4 \
  | uniq > "$nDB.temp" \
  || die "could not combine and sort the score files"

# The new database is the old database's header followed by the merged rows.
{
  head -n "$oHdr" -- "$oDB"
  cat -- "$nDB.temp"
} > "$nDB" || die "could not write $nDB"

# Zip and index the new database
echo "Zipping new file"
bgzip -f "$nDB" || die "bgzip failed for $nDB"
echo "Indexing new file"
tabix -f -b 2 -e 2 "${nDB}.gz" || die "tabix failed for ${nDB}.gz"

# A failed group change is reported but does not invalidate the new database.
chgrp Udpbinfo "${nDB}.gz.tbi" \
  || echo "WARNING: could not change group of ${nDB}.gz.tbi" >&2

echo "Finished"