Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

use chrX and chrY to match sex chromosomes #713

Open
wants to merge 8 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 12 additions & 8 deletions src/WGD/bin/estimatePloidy.R
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ readMatrix <- function(INFILE){
#############################################################
#####Helper function to normalize contigs for a single sample
#############################################################
normalizeContigsPerSample <- function(mat, exclude=c("X", "Y"), ploidy=2){
normalizeContigsPerSample <- function(mat, exclude, ploidy=2){
#Convert vals to numeric
mat[, -c(1:3)] <- apply(mat[, -c(1:3)], 2, as.numeric)

Expand All @@ -53,7 +53,7 @@ normalizeContigsPerSample <- function(mat, exclude=c("X", "Y"), ploidy=2){
###########################################################################
#####Helper function to remove rows where >X% of samples have <=Y% coverage
###########################################################################
filterZeroBins <- function(mat, exclude=c("X", "Y"), minSamp=0.8, minCov=0.2){
filterZeroBins <- function(mat, exclude, minSamp=0.8, minCov=0.2){
#Convert vals to numeric
mat[, -c(1:3)] <- apply(mat[, -c(1:3)], 2, as.numeric)

Expand All @@ -74,7 +74,7 @@ filterZeroBins <- function(mat, exclude=c("X", "Y"), minSamp=0.8, minCov=0.2){
##################################################
#####Helper function to run PCA on a binCov matrix
##################################################
binCovPCA <- function(dat, exclude=c("X", "Y"), topPCs=10){
binCovPCA <- function(dat, exclude, topPCs=10){
#Runs PCA
PCA <- prcomp(dat[which(!(dat[, 1] %in% exclude)), -c(1:3)], center=T, scale=T)

Expand Down Expand Up @@ -961,6 +961,9 @@ option_list <- list(
make_option(c("--maxBatch"), type="integer", default=150,
help="maximum number of samples per batch (requires -k) [default: %default]",
metavar="integer")
make_option(c("--allosomes"), type="character", default=NULL,
help="file with allosome contigs, one per line",
metavar="character")
)

#Get command-line arguments & options
Expand All @@ -986,6 +989,7 @@ nPCs <- args$options$dimensions
batch.ideal <- args$options$batchSize
batch.min <- args$options$minBatch
batch.max <- args$options$batchMax
allosomes <- if (is.null(args$options$allosomes)) c("chrX", "chrY") else trimws(readLines(args$options$allosomes))

# ##Jan 2020 dev parameters (on local machine)
# # INFILE <- "/Users/rlc/scratch/1KGP_2504_sub_batch_10_ploidy_matrix.bed.gz"
Expand All @@ -1007,16 +1011,16 @@ if(!dir.exists(OUTDIR)){
#####PART 1: DATA PROCESSING#####
#Read, normalize, and clean coverage data
dat <- readMatrix(INFILE)
dat <- normalizeContigsPerSample(dat, exclude=c("X", "Y"), ploidy=2)
dat <- filterZeroBins(dat)
dat <- normalizeContigsPerSample(dat, exclude=allosomes, ploidy=2)
dat <- filterZeroBins(dat, exclude=allosomes)
chr.dat <- medianPerContigPerSample(dat)
# chr.dat.norm <- normalizeContigsPerMatrix(chr.dat, scale.exclude=c("X", "Y"))

#####PART 2: DOSAGE-BASED BATCHING#####
#Only run if kmeans is optioned
if(kmeans==T){
#Perform PCA on full matrix
PCs <- binCovPCA(dat, exclude=c("X", "Y"), topPCs=nPCs)
PCs <- binCovPCA(dat, exclude=allosomes, topPCs=nPCs)

#Cluster samples based on dosage PCA
#Note: tries this with seeds 1-100 (iterated sequentially) until first success
Expand Down Expand Up @@ -1193,7 +1197,7 @@ binwise.dat <- data.frame(colnames(dat)[-c(1:3)],
bin.IDs <- as.character(apply(dat[1:3], 1, paste, collapse="_", sep=""))
bin.IDs <- as.character(sapply(bin.IDs, function(ID){gsub(" ", "", ID, fixed=T)}))
colnames(binwise.dat) <- c("sample_id", bin.IDs)
sapply(setdiff(unique(dat[, 1]), c("X", "Y")), function(contig){
sapply(setdiff(unique(dat[, 1]), allosomes), function(contig){
png(paste(OUTDIR, "/estimated_CN_per_bin.all_samples.", contig, ".png", sep=""),
height=1250, width=2500, res=300)
plot.dat <- binwise.dat[, c(1, which(dat[, 1]==contig)+1)]
Expand All @@ -1209,7 +1213,7 @@ colnames(binwise.dat.males) <- c("sample_id", bin.IDs)
binwise.dat.females <- data.frame(chr.dat.females$sample_id,
t(dat[, which(colnames(dat) %in% chr.dat.females$sample_id)]))
colnames(binwise.dat.females) <- c("sample_id", bin.IDs)
sapply(intersect(unique(dat[, 1]), c("X", "Y")), function(contig){
sapply(intersect(unique(dat[, 1]), allosomes), function(contig){
#Males
png(paste(OUTDIR, "/estimated_CN_per_bin.chrX_lessThan_2copies.", contig, ".png", sep=""),
height=1250, width=2500, res=300)
Expand Down
5 changes: 5 additions & 0 deletions wdl/EvidenceQC.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@ workflow EvidenceQC {
# WGD files
File wgd_scoring_mask

String? chr_x
String? chr_y

# Runtime parameters
String sv_base_mini_docker
String sv_base_docker
Expand Down Expand Up @@ -88,6 +91,8 @@ workflow EvidenceQC {
input:
bincov_matrix = MakeBincovMatrix.merged_bincov,
batch = batch,
chr_x = chr_x,
chr_y = chr_y,
sv_base_mini_docker = sv_base_mini_docker,
sv_pipeline_qc_docker = sv_pipeline_qc_docker,
runtime_attr_score = ploidy_score_runtime_attr,
Expand Down
6 changes: 6 additions & 0 deletions wdl/GatherBatchEvidence.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,10 @@ workflow GatherBatchEvidence {
# QC files
Int matrix_qc_distance

# Allosomes
String? chr_x
String? chr_y

# Module metrics parameters
# Run module metrics workflow at the end - off by default for GatherBatchEvidence because of runtime/expense
Boolean? run_module_metrics
Expand Down Expand Up @@ -206,6 +210,8 @@ workflow GatherBatchEvidence {
input:
bincov_matrix = merged_bincov_,
batch = batch,
chr_x = chr_x,
chr_y = chr_y,
sv_base_mini_docker = sv_base_mini_docker,
sv_pipeline_qc_docker = sv_pipeline_qc_docker,
runtime_attr_score = ploidy_score_runtime_attr,
Expand Down
12 changes: 11 additions & 1 deletion wdl/PloidyEstimation.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ workflow Ploidy {
String batch
String sv_base_mini_docker
String sv_pipeline_qc_docker
String? chr_x
String? chr_y
RuntimeAttr? runtime_attr_score
RuntimeAttr? runtime_attr_build
}
Expand All @@ -27,6 +29,8 @@ workflow Ploidy {
input:
ploidy_matrix = BuildPloidyMatrix.ploidy_matrix,
batch = batch,
chr_x = chr_x,
chr_y = chr_y,
sv_pipeline_qc_docker = sv_pipeline_qc_docker,
runtime_attr_override = runtime_attr_score
}
Expand Down Expand Up @@ -94,10 +98,14 @@ task PloidyScore {
input {
File ploidy_matrix
String batch
String? chr_x
String? chr_y
String sv_pipeline_qc_docker
RuntimeAttr? runtime_attr_override
}

Array[String] allosomes = [select_first([chr_x, "chrX"]), select_first([chr_y, "chrY"])]

RuntimeAttr default_attr = object {
cpu_cores: 1,
mem_gb: 3.75,
Expand All @@ -114,7 +122,9 @@ task PloidyScore {

set -euo pipefail
mkdir ploidy_est
Rscript /opt/WGD/bin/estimatePloidy.R -z -O ./ploidy_est ~{ploidy_matrix}
Rscript /opt/WGD/bin/estimatePloidy.R -z -O ./ploidy_est \
--allosomes '~{write_lines(allosomes)}' \
~{ploidy_matrix}

#TODO: hotfix for "file changed as we read it" error caused by non-blocking system() calls in the R script
sleep 10
Expand Down
Loading