Skip to content

Commit

Permalink
improved intersection and GATK-to-GATK call set analysis scripts
Browse files Browse the repository at this point in the history
  * switch from using CPX call to re-interpreted calls
  • Loading branch information
SHuang-Broad committed Apr 9, 2018
1 parent de38636 commit 74f064a
Show file tree
Hide file tree
Showing 15 changed files with 432 additions and 256 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,13 @@

set -eu

if [[ "$#" -ne 3 ]]; then
if [[ "$#" -ne 5 ]]; then
echo "Please provide:"
echo "[1] absolute path to GATK VCF"
echo "[2] absolute path to the directory where analysis outputs are to be written to"
echo "[3] reference version (\"19\"|\"38\")"
echo "[3] absolute path to GATK VCF containing simple variants re-interpreted from 1-seg CPX calls"
echo "[4] absolute path to GATK VCF containing simple variants re-interpreted from multi-seg CPX calls"
echo "[5] reference version (\"19\"|\"38\")"
exit 1
fi

Expand All @@ -26,10 +28,32 @@ fi

mkdir -p "$OUTPUT_DIR"


GATK_one_seg_VCF=""
if [[ $3 == *.vcf.gz ]]; then
COMPRESSED=$(basename "$3")
pattern=".gz"
GATK_one_seg_VCF=${COMPRESSED//$pattern/}
bgzip -c -d "$3" > "$GATK_one_seg_VCF"
elif [[ $3 == *.vcf ]]; then
GATK_one_seg_VCF=$3
fi

GATK_multi_seg_VCF=""
if [[ $4 == *.vcf.gz ]]; then
COMPRESSED=$(basename "$4")
pattern=".gz"
GATK_multi_seg_VCF=${COMPRESSED//$pattern/}
bgzip -c -d "$4" > "$GATK_multi_seg_VCF"
elif [[ $4 == *.vcf ]]; then
GATK_multi_seg_VCF=$4
fi


PRIMARY_CONTIGS_PATTERN=""
if [[ $3 == "19" ]]; then
if [[ $5 == "19" ]]; then
PRIMARY_CONTIGS_PATTERN="^([0-9]{1,2}|X|Y) "
elif [[ $3 == "38" ]]; then
elif [[ $5 == "38" ]]; then
PRIMARY_CONTIGS_PATTERN="^chr([0-9]{1,2}|X|Y) "
else
echo "reference version must be either 19 or 38"
Expand Down Expand Up @@ -91,13 +115,15 @@ echo " they differ, very slightly, in their INSSEQ and/or DUPLICATED_SEQUENCE an

awk '{print $3}' "$GATK_PRIME_VAR_NO_WARN" | uniq -d > temp.duplicatedGATKIDs.ins.txt

grep -vf temp.duplicatedGATKIDs.ins.txt "$GATK_PRIME_VAR_NO_WARN" > \
"$GATK_PRIME_VAR_NO_WARN_UNIQUE_RECORDS"
grep -vf temp.duplicatedGATKIDs.ins.txt \
"$GATK_PRIME_VAR_NO_WARN" \
> "$GATK_PRIME_VAR_NO_WARN_UNIQUE_RECORDS"
echo "Number of variants in" "$GATK_PRIME_VAR_NO_WARN_UNIQUE_RECORDS"
wc -l "$GATK_PRIME_VAR_NO_WARN_UNIQUE_RECORDS" | awk '{print $1}'
echo

grep -f temp.duplicatedGATKIDs.ins.txt "$GATK_PRIME_VAR_NO_WARN" > \
parallel_grep 10 temp.duplicatedGATKIDs.ins.txt \
"$GATK_PRIME_VAR_NO_WARN" \
"$GATK_PRIME_VAR_NO_WARN_DUPLICATED_RECORDS"
echo "Number of variants in" "$GATK_PRIME_VAR_NO_WARN_DUPLICATED_RECORDS"
wc -l "$GATK_PRIME_VAR_NO_WARN_DUPLICATED_RECORDS" | awk '{print $1}'
Expand All @@ -106,6 +132,23 @@ echo
echo "#################################################"
echo "Done checking on"
echo " $1"

if [[ ! -z "${GATK_one_seg_VCF}" ]] && [[ ! -z "${GATK_multi_seg_VCF}" ]]; then

echo "Now merging $GATKVCF with $GATK_one_seg_VCF and $GATK_multi_seg_VCF"
echo

grep -v '^#' "$GATK_one_seg_VCF" > temp.oneseg.txt
grep -v '^#' "$GATK_multi_seg_VCF" > temp.multiseg.txt

cp "$GATK_PRIME_VAR_NO_WARN_UNIQUE_RECORDS" temp.major.txt
cat temp.major.txt temp.oneseg.txt temp.multiseg.txt | \
gsort -V \
> temp.merged.txt
mv temp.merged.txt "$GATK_PRIME_VAR_NO_WARN_UNIQUE_RECORDS"

echo "Done merging"
fi
echo "#################################################"
echo

Expand Down Expand Up @@ -191,8 +234,10 @@ if [[ $NUM != 0 ]]; then
wc -l tempVar.txt | awk '{print $1}'
mkdir -p "$OUTPUT_DIR""BND"
mv tempVar.txt "$OUTPUT_DIR""BND/GATK_primaryContigs_bnd.txt"
echo
else
echo "0"
fi
echo

########## clean up
rm -f temp*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -116,8 +116,8 @@ else
awk '{print $4}' "$OUTPUT_DIR""cleanDel.gatkVSmanta.intersection_f08r.txt" > temp.txt
grep -vf temp.txt "$OUTPUT_DIR""cleanDel.gatkVSpacbio.intersection_f08r.txt" | \
awk '{print $4}' | awk -F ';' '{print $1}' > \
"$OUTPUT_DIR""variants.discoveredByGATKandMANTA.missingInPacbio.gatkID.txt"
wc -l "$OUTPUT_DIR""variants.discoveredByGATKandMANTA.missingInPacbio.gatkID.txt"
"$OUTPUT_DIR""variants.discoveredByGATKandMANTA.missingInPacBio.gatkID.txt"
wc -l "$OUTPUT_DIR""variants.discoveredByGATKandMANTA.missingInPacBio.gatkID.txt"
fi

rm -f temp*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ expr "$NUM_GATK" - "$N"
echo

echo "######################"
echo "GATK vs Pacbio"
echo "GATK vs PacBio"

bedtools intersect \
-a "$GATKBED" \
Expand Down
87 changes: 52 additions & 35 deletions scripts/sv/evaluation/general/InsDupRPL/againstPacBio.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,44 +23,52 @@ if [[ "$#" -lt 9 ]]; then
fi

NAME=$1
caseID=$(echo "$NAME"|tr '[:upper:]' '[:lower:]')

TEST_INS_VCF=$2
TEST_DUP_VCF=$3
TEST_RPL_VCF=$4
TEST_INS_VAR=$2
TEST_DUP_VAR=$3
TEST_RPL_VAR=$4

PABIO_CHM_1_INS_VCF=$5
PABIO_CHM_13_INS_VCF=$6
PABIO_CHM_1_DEL_VCF=$7
PABIO_CHM_13_DEL_VCF=$8
PABIO_CHM_1_INS_VAR=$5
PABIO_CHM_13_INS_VAR=$6
PABIO_CHM_1_DEL_VAR=$7
PABIO_CHM_13_DEL_VAR=$8

OUTPUT_DIR=$9
if [[ ! $9 =~ .+/$ ]]; then
OUTPUT_DIR+="/"
fi

IDNAMES=$1"IDS"
COMPARISON=$1"vsPacbio"
COMPARISON=$1"vsPacBio"
echo "#################################################"
echo "$NAME VS PacBio"
echo

ins_against_ins "$TEST_INS_VCF" "$PABIO_CHM_1_INS_VCF" "CONTIG_DEPTH" "$COMPARISON" 1
dup_against_ins "$TEST_DUP_VCF" "$PABIO_CHM_1_INS_VCF" "CONTIG_DEPTH" "$COMPARISON" 1
rpl_against_ins "$TEST_RPL_VCF" "$PABIO_CHM_1_INS_VCF" "CONTIG_DEPTH" "$COMPARISON" 1
rpl_against_del "$TEST_RPL_VCF" "$PABIO_CHM_1_DEL_VCF" "CONTIG_DEPTH" "$COMPARISON" 1

ins_against_ins "$TEST_INS_VCF" "$PABIO_CHM_13_INS_VCF" "CONTIG_DEPTH" "$COMPARISON" 13
dup_against_ins "$TEST_DUP_VCF" "$PABIO_CHM_13_INS_VCF" "CONTIG_DEPTH" "$COMPARISON" 13
rpl_against_ins "$TEST_RPL_VCF" "$PABIO_CHM_13_INS_VCF" "CONTIG_DEPTH" "$COMPARISON" 13
rpl_against_del "$TEST_RPL_VCF" "$PABIO_CHM_13_DEL_VCF" "CONTIG_DEPTH" "$COMPARISON" 13


grep -v '^#' "$TEST_INS_VCF" | awk '{print $3}' > temp."$IDNAMES".ins.txt
grep -v '^#' "$TEST_DUP_VCF" | awk '{print $3}' > temp."$IDNAMES".dup.txt
grep -v '^#' "$TEST_RPL_VCF" | awk '{print $3}' > temp."$IDNAMES".rpl.txt
cat temp."$IDNAMES".ins.txt temp."$IDNAMES".dup.txt temp."$IDNAMES".rpl.txt | sort > temp."$IDNAMES".txt
# run simple intersection analysis which outputs variant id and matching PacBio POS (possible many)
ins_against_ins "$TEST_INS_VAR" "$PABIO_CHM_1_INS_VAR" "$COMPARISON" 1 "$caseID"
dup_against_ins "$TEST_DUP_VAR" "$PABIO_CHM_1_INS_VAR" "$COMPARISON" 1 "$caseID"
rpl_against_ins "$TEST_RPL_VAR" "$PABIO_CHM_1_INS_VAR" "$COMPARISON" 1 "$caseID"
rpl_against_del "$TEST_RPL_VAR" "$PABIO_CHM_1_DEL_VAR" "$COMPARISON" 1 "$caseID"

ins_against_ins "$TEST_INS_VAR" "$PABIO_CHM_13_INS_VAR" "$COMPARISON" 13 "$caseID"
dup_against_ins "$TEST_DUP_VAR" "$PABIO_CHM_13_INS_VAR" "$COMPARISON" 13 "$caseID"
rpl_against_ins "$TEST_RPL_VAR" "$PABIO_CHM_13_INS_VAR" "$COMPARISON" 13 "$caseID"
rpl_against_del "$TEST_RPL_VAR" "$PABIO_CHM_13_DEL_VAR" "$COMPARISON" 13 "$caseID"

# get IDS of test variant
awk '{print $3}' "$TEST_INS_VAR" > temp."$IDNAMES".ins.txt
awk '{print $3}' "$TEST_DUP_VAR" > temp."$IDNAMES".dup.txt
awk '{print $3}' "$TEST_RPL_VAR" > temp."$IDNAMES".rpl.txt
cat temp."$IDNAMES".ins.txt \
temp."$IDNAMES".dup.txt \
temp."$IDNAMES".rpl.txt | \
sort \
> temp.allTestVars.txt
echo "Total number of variants from $NAME to be tested:"
wc -l temp."$IDNAMES".txt | awk '{print $1}'
wc -l temp.allTestVars.txt | awk '{print $1}'

# get total overlaps from either cell lines
echo "Total overlaps on CHM1"
cat "$OUTPUT_DIR""insIns.""$COMPARISON""_1.window_w50.IDs.txt" \
"$OUTPUT_DIR""dupIns.""$COMPARISON""_1.window_l0r50.IDs.txt" \
Expand All @@ -77,20 +85,29 @@ cat "$OUTPUT_DIR""insIns.""$COMPARISON""_13.window_w50.IDs.txt" \
> temp_13.txt
wc -l temp_13.txt | awk '{print $1}'


caseID=$(echo "$NAME"|tr '[:upper:]' '[:lower:]')
caseID+="IDs"
cat temp_1.txt temp_13.txt > temp.txt

echo "among these, number of uniq $NAME IDs"
awk '{print $1}' temp.txt | sort | uniq > "$OUTPUT_DIR$caseID""WithMatchingPacBio.txt"
wc -l "$OUTPUT_DIR$caseID""WithMatchingPacBio.txt" | awk '{print $1}'
comm -23 temp."$IDNAMES".txt "$OUTPUT_DIR$caseID""WithMatchingPacBio.txt" \
# count number of uniq variants from test set and from PacBio calls
caseID+="IDs"
awk '{print $1}' temp.txt | \
sort | \
uniq \
> "$OUTPUT_DIR$caseID""WithMatchingPacBio.txt"
comm -23 \
temp.allTestVars.txt \
"$OUTPUT_DIR$caseID""WithMatchingPacBio.txt" \
> "$OUTPUT_DIR$caseID""NoMatchingPacBio.txt"

echo "and number of uniq PacBio IDs"
awk '{print $2}' temp.txt | sort | uniq > "$OUTPUT_DIR""combined.$COMPARISON.uniqPacbioIDs.txt"
wc -l "$OUTPUT_DIR""combined.$COMPARISON.uniqPacbioIDs.txt" | awk '{print $1}'
echo " among these, number of uniq $NAME IDs"
wc -l "$OUTPUT_DIR$caseID""WithMatchingPacBio.txt" | awk '{print $1}'
echo " and the number of variants without matching PacBio calls"
wc -l "$OUTPUT_DIR$caseID""NoMatchingPacBio.txt" | awk '{print $1}'

echo " and the number of uniq PacBio call locations"
awk '{print $2}' temp.txt | \
sort | \
uniq \
> "$OUTPUT_DIR""combined.$COMPARISON.uniqPacBioIDs.txt"
wc -l "$OUTPUT_DIR""combined.$COMPARISON.uniqPacBioIDs.txt" | awk '{print $1}'
echo

rm -f temp*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ if [[ "$#" -lt 9 ]]; then
echo "[4] absolute path to Manta duplication VCF"
echo "[5] absolute path to the directory where analysis outputs are to be written to"
echo "[6] absolute path to GATK active region interval bed file"
echo "[7] validataion call set tool name (\"Manta\"|\"Pacbio\")"
echo "[7] validataion call set tool name (\"Manta\"|\"PacBio\")"
echo "[8] absolute path to file listing uniq GATK IDs with overlapping Manta calls, and"
echo "[9] absolute path to file listing uniq Manta IDs with overlapping GATK calls, or"
echo "[8] absolute path to file listing uniq PacBio IDs with overlapping GATK calls, and"
Expand Down Expand Up @@ -86,20 +86,20 @@ if [[ $7 == "Manta" ]]; then
echo

rm -f temp*
elif [[ $7 == "Pacbio" ]]; then
elif [[ $7 == "PacBio" ]]; then

## validated Manta calls not caught by GATK
echo "Number of Manta calls validated by PacBio but not caught in the GATK call set:"
comm -13 "$8" "$9" |
awk -F ':' 'BEGIN{OFS=" "} {print $1, $2-1, $2}' > \
"$OUTPUT_DIR""mantaRecordsValidatedByPacbioUncaughtByGATK.bed"
wc -l "$OUTPUT_DIR""mantaRecordsValidatedByPacbioUncaughtByGATK.bed" | awk '{print $1}'
"$OUTPUT_DIR""mantaRecordsValidatedByPacBioUncaughtByGATK.bed"
wc -l "$OUTPUT_DIR""mantaRecordsValidatedByPacBioUncaughtByGATK.bed" | awk '{print $1}'
echo

## active region sensitivity
echo "Overlapping the unmatched, Pacbio-validated Manta calls with GATK-SV pipeline local assembly intervals"
echo "Overlapping the unmatched, PacBio-validated Manta calls with GATK-SV pipeline local assembly intervals"
bedtools window \
-a "$OUTPUT_DIR""mantaRecordsValidatedByPacbioUncaughtByGATK.bed" \
-a "$OUTPUT_DIR""mantaRecordsValidatedByPacBioUncaughtByGATK.bed" \
-b "$GATK_ACTIVE_REGIONS" \
-w 50 |
awk '{print $NF}' | \
Expand All @@ -111,8 +111,8 @@ elif [[ $7 == "Pacbio" ]]; then
## Manta and GATK calls un-validated by PacBio
echo "Number of calls shared by GATK and Manta but un-validated by PacBio:"
comm -23 "${10}" "${11}" > \
"$OUTPUT_DIR""recordsSharedByGATKandManta.unvalidatedByPacbio.gatkIDs.txt"
wc -l "$OUTPUT_DIR""recordsSharedByGATKandManta.unvalidatedByPacbio.gatkIDs.txt" | awk '{print $1}'
"$OUTPUT_DIR""recordsSharedByGATKandManta.unvalidatedByPacBio.gatkIDs.txt"
wc -l "$OUTPUT_DIR""recordsSharedByGATKandManta.unvalidatedByPacBio.gatkIDs.txt" | awk '{print $1}'
echo

else
Expand Down
Loading

0 comments on commit 74f064a

Please sign in to comment.