Skip to content

Commit

Permalink
Included PE/SR/BAF tests
Browse files Browse the repository at this point in the history
  • Loading branch information
kjaisingh committed Aug 30, 2024
1 parent d92bdd5 commit 7da11ee
Show file tree
Hide file tree
Showing 14 changed files with 85 additions and 17 deletions.
4 changes: 2 additions & 2 deletions src/RdTest/RdTest.R
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ option_list <- list(
make_option(c("-w", "--SampleIncludeList"), type="character", default=NULL,
help="Optional:Single column file with list of samples to include", metavar="character"),
make_option(c("--outlier_sample_ids"), type="character", default=NULL,
help="Optional:File containing outlier IDs, one per line", metavar="character")
help="Optional:Path to file containing outlier sample IDs", metavar="character")
)

opt_parser <- OptionParser(option_list = option_list)
Expand Down Expand Up @@ -437,7 +437,7 @@ loadData <- function(chr, start, end, coveragefile, medianfile, bins, verylargev
cov1 <- sampleFilterResult[[1]]
allnorm <- sampleFilterResult[[2]]

#Exclude outlier samples if there are non-outlier samples left
#Exclude outlier samples only if non-outlier samples exist
if (!is.null(outlier_sample_ids)) {
outlier_ids <- readLines(outlier_sample_ids)
non_outlier_columns <- !(names(cov1) %in% outlier_ids)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -265,7 +265,7 @@ def main():
parser.add_argument('-b', '--BAFtest')
parser.add_argument('-s', '--SRtest')
parser.add_argument('-p', '--PEtest')
parser.add_argument('-o', '--outlier-sample-ids')
parser.add_argument('-o', '--outlier-sample-ids', type=argparse.FileType('r'))
parser.add_argument('--batch-list', type=argparse.FileType('r'))
parser.add_argument('--segdups', required=True)
parser.add_argument('--rmsk', required=True)
Expand Down
18 changes: 16 additions & 2 deletions src/svtk/svtk/cli/baf_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
##########


def preprocess(chrom, start, end, tbx, samples, window=None):
def preprocess(chrom, start, end, tbx, samples, window=None, outlier_sample_ids=None):
"""
Report normalized BAFs in a set of samples across a desired region.
Expand Down Expand Up @@ -58,7 +58,13 @@ def preprocess(chrom, start, end, tbx, samples, window=None):
if bafs.empty:
return bafs, bafs
bafs.columns = ['chr', 'pos', 'baf', 'sample']

# Exclude outlier samples only if non-outlier samples exist
non_outlier_samples = [s for s in samples if s not in outlier_sample_ids]
if len(non_outlier_samples) > 0:
samples = non_outlier_samples
bafs = bafs[bafs['sample'].isin(samples)]

# print(bafs)
if bafs.empty:
return bafs, bafs
Expand Down Expand Up @@ -93,6 +99,8 @@ def main(argv):
parser.add_argument('file', help='Compiled snp file')
parser.add_argument('-b', '--batch',)
parser.add_argument('--index', help='Tabix index for remote bed')
parser.add_argument('--outlier-sample-ids', type=argparse.FileType('r'),
default=None, help='Path to file containing outlier sample IDs')
# help='Samples')

# Print help if no arguments specified
Expand All @@ -101,6 +109,11 @@ def main(argv):
sys.exit(1)
args = parser.parse_args(argv)

outlier_samples = set()
if args.outlier_sample_ids:
with open(args.outlier_sample_ids, 'r') as f:
outlier_samples = set([line.strip() for line in f])

# fi = args.file
# if args.vcf.startswith('s3://'):
# vcf_path = args.vcf[5:]
Expand Down Expand Up @@ -142,7 +155,8 @@ def main(argv):
type = dat[5]
try:
het_counts, called_bafs = preprocess(
chrom, start, end, tbx, samples=splist)
chrom, start, end, tbx, samples=splist, outlier_sample_ids=outlier_samples
)
except ValueError:
het_counts = pd.DataFrame()
called_bafs = pd.DataFrame()
Expand Down
19 changes: 16 additions & 3 deletions src/svtk/svtk/cli/pesr_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ def sr_test(argv):
'Same format as RdTest, one column per sample.')
parser.add_argument('--log', action='store_true', default=False,
help='Print progress log to stderr.')
parser.add_argument('--outlier-sample-ids', type=argparse.FileType('r'),
default=None, help='Path to file containing outlier sample IDs.')

# Print help if no arguments specified
if len(argv) == 0:
Expand Down Expand Up @@ -87,9 +89,14 @@ def sr_test(argv):
else:
medians = None

outlier_sample_ids = None
if args.outlier_sample_ids:
outlier_sample_ids = args.outlier_sample_ids

runner = SRTestRunner(vcf, countfile, fout, args.background, common=args.common,
window=args.window, ins_window=args.insertion_window,
whitelist=whitelist, medians=medians, log=args.log)
whitelist=whitelist, medians=medians, log=args.log,
outlier_sample_ids=outlier_sample_ids)
runner.run()


Expand Down Expand Up @@ -126,6 +133,8 @@ def pe_test(argv):
'Same format as RdTest, one column per sample.')
parser.add_argument('--log', action='store_true', default=False,
help='Print progress log to stderr.')
parser.add_argument('--outlier-sample-ids', type=argparse.FileType('r'),
default=None, help='Path to file containing outlier sample IDs.')

if len(argv) == 0:
parser.print_help()
Expand Down Expand Up @@ -163,8 +172,12 @@ def pe_test(argv):
else:
medians = None

runner = PETestRunner(vcf, discfile, fout, args.background, args.common,
args.window_in, args.window_out, whitelist, medians=medians, log=args.log)
outlier_sample_ids = None
if args.outlier_sample_ids:
outlier_sample_ids = args.outlier_sample_ids

runner = PETestRunner(vcf, discfile, fout, args.background, args.common, args.window_in, args.window_out,
whitelist, medians=medians, log=args.log, outlier_sample_ids=outlier_sample_ids)

runner.run()

Expand Down
6 changes: 3 additions & 3 deletions src/svtk/svtk/pesr/pe_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,8 +127,8 @@ def _get_coords(pos, strand):

class PETestRunner(PESRTestRunner):
def __init__(self, vcf, discfile, fout, n_background=160, common=False,
window_in=50, window_out=500,
whitelist=None, blacklist=None, medians=None, log=False):
window_in=50, window_out=500, whitelist=None, blacklist=None,
medians=None, log=False, outlier_sample_ids=None):
"""
vcf : pysam.VariantFile
discfile : pysam.TabixFile
Expand All @@ -137,7 +137,7 @@ def __init__(self, vcf, discfile, fout, n_background=160, common=False,
window_out, medians=medians)
self.fout = fout

super().__init__(vcf, common, n_background, whitelist, blacklist, log)
super().__init__(vcf, common, n_background, whitelist, blacklist, log, outlier_sample_ids)

def test_record(self, record):
if not self._strand_check(record):
Expand Down
13 changes: 12 additions & 1 deletion src/svtk/svtk/pesr/pesr_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ def null_score(self, null_val=0.0):

class PESRTestRunner:
def __init__(self, vcf, common=False, n_background=160, whitelist=None, blacklist=None,
log=False):
log=False, outlier_sample_ids=None):
self.vcf = vcf

self.common = common
Expand All @@ -105,6 +105,12 @@ def __init__(self, vcf, common=False, n_background=160, whitelist=None, blacklis

self.log = log

outlier_samples = set()
if outlier_sample_ids:
with open(outlier_sample_ids, 'r') as f:
outlier_samples = set([line.strip() for line in f])
self.outlier_sample_ids = outlier_samples

def run(self):
if self.log:
start = datetime.datetime.now()
Expand Down Expand Up @@ -136,6 +142,11 @@ def choose_background(self, record, whitelist=None, blacklist=None):
called = svu.get_called_samples(record)
background = [s for s in self.samples if s not in called]

# Exclude outlier samples only if non-outlier samples exist
non_outlier_called = [s for s in called if s not in self.outlier_sample_ids]
if len(non_outlier_called) > 0:
called = non_outlier_called

# Permit override of specified white/blacklists
whitelist = whitelist if whitelist is not None else self.whitelist
blacklist = blacklist if blacklist is not None else self.blacklist
Expand Down
4 changes: 2 additions & 2 deletions src/svtk/svtk/pesr/sr_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,7 @@ def _test_coord(self, coord, strand, chrom, samples, background, left_boundary,

class SRTestRunner(PESRTestRunner):
def __init__(self, vcf, countfile, fout, n_background=160, common=False, window=100, ins_window=50,
whitelist=None, blacklist=None, medians=None, log=False):
whitelist=None, blacklist=None, medians=None, log=False, outlier_sample_ids=None):
"""
vcf : pysam.VariantFile
countfile : pysam.TabixFile
Expand All @@ -196,7 +196,7 @@ def __init__(self, vcf, countfile, fout, n_background=160, common=False, window=
self.srtest = SRTest(countfile, common=common, window=window, ins_window=ins_window, medians=medians)
self.fout = fout

super().__init__(vcf, common, n_background, whitelist, blacklist, log)
super().__init__(vcf, common, n_background, whitelist, blacklist, log, outlier_sample_ids)

def test_record(self, record):
called, background = self.choose_background(record)
Expand Down
2 changes: 2 additions & 0 deletions wdl/BAFTest.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ workflow BAFTest {
Int split_size
File autosome_contigs
File ref_dict
File? outlier_sample_ids

String linux_docker
String sv_pipeline_docker
Expand All @@ -36,6 +37,7 @@ workflow BAFTest {
split_size = split_size,
chrom = autosome[0],
ref_dict = ref_dict,
outlier_sample_ids = outlier_sample_ids,
linux_docker = linux_docker,
sv_pipeline_docker = sv_pipeline_docker,
runtime_attr_baftest = runtime_attr_baftest,
Expand Down
7 changes: 6 additions & 1 deletion wdl/BAFTestChromosome.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ workflow BAFTestChromosome {
Int split_size
Int? suffix_len
File ref_dict
File? outlier_sample_ids

String linux_docker
String sv_pipeline_docker
Expand Down Expand Up @@ -45,6 +46,7 @@ workflow BAFTestChromosome {
samples = samples,
prefix = basename(split),
batch = batch,
outlier_sample_ids = outlier_sample_ids,
sv_pipeline_docker = sv_pipeline_docker,
runtime_attr_override = runtime_attr_baftest
}
Expand Down Expand Up @@ -72,10 +74,13 @@ task BAFTest {
Array[String] samples
String prefix
String batch
File? outlier_sample_ids
String sv_pipeline_docker
RuntimeAttr? runtime_attr_override
}
String outlier_arg = if defined(outlier_sample_ids) then "--outlier-sample-ids ${outlier_sample_ids}" else ""
parameter_meta {
baf_metrics: {
localization_optional: true
Expand Down Expand Up @@ -124,7 +129,7 @@ task BAFTest {
fi

tabix -s1 -b2 -e2 local.BAF.txt.gz
svtk baf-test ~{bed} local.BAF.txt.gz --batch batch.key > ~{prefix}.metrics
svtk baf-test ~{bed} local.BAF.txt.gz --batch batch.key ~{outlier_arg} > ~{prefix}.metrics

>>>
runtime {
Expand Down
3 changes: 3 additions & 0 deletions wdl/PETest.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ workflow PETest {
File male_only_variant_ids
File samples
Int common_cnv_size_cutoff
File? outlier_sample_ids

String sv_base_mini_docker
String linux_docker
Expand Down Expand Up @@ -52,6 +53,7 @@ workflow PETest {
allosome = false,
ref_dict = ref_dict,
common_cnv_size_cutoff = common_cnv_size_cutoff,
outlier_sample_ids = outlier_sample_ids,
sv_base_mini_docker = sv_base_mini_docker,
linux_docker = linux_docker,
sv_pipeline_docker = sv_pipeline_docker,
Expand Down Expand Up @@ -81,6 +83,7 @@ workflow PETest {
allosome = true,
ref_dict = ref_dict,
common_cnv_size_cutoff = common_cnv_size_cutoff,
outlier_sample_ids = outlier_sample_ids,
sv_base_mini_docker = sv_base_mini_docker,
linux_docker = linux_docker,
sv_pipeline_docker = sv_pipeline_docker,
Expand Down
9 changes: 8 additions & 1 deletion wdl/PETestChromosome.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ workflow PETestChromosome {
File samples
Boolean allosome
Int common_cnv_size_cutoff
File? outlier_sample_ids

String sv_base_mini_docker
String linux_docker
Expand Down Expand Up @@ -56,6 +57,7 @@ workflow PETestChromosome {
prefix = basename(split),
ref_dict = ref_dict,
common_model = false,
outlier_sample_ids = outlier_sample_ids,
sv_pipeline_docker = sv_pipeline_docker,
runtime_attr_override = runtime_attr_petest
}
Expand All @@ -70,6 +72,7 @@ workflow PETestChromosome {
prefix = basename(split),
ref_dict = ref_dict,
common_model = false,
outlier_sample_ids = outlier_sample_ids,
sv_pipeline_docker = sv_pipeline_docker,
runtime_attr_override = runtime_attr_petest
}
Expand All @@ -96,6 +99,7 @@ workflow PETestChromosome {
prefix = basename(split),
ref_dict = ref_dict,
common_model = false,
outlier_sample_ids = outlier_sample_ids,
sv_pipeline_docker = sv_pipeline_docker,
runtime_attr_override = runtime_attr_petest
}
Expand Down Expand Up @@ -134,6 +138,7 @@ workflow PETestChromosome {
ref_dict = ref_dict,
prefix = basename(split),
common_model = true,
outlier_sample_ids = outlier_sample_ids,
sv_pipeline_docker = sv_pipeline_docker,
runtime_attr_override = runtime_attr_petest
}
Expand Down Expand Up @@ -177,12 +182,14 @@ task PETest {
Boolean common_model
File ref_dict
String prefix
File? outlier_sample_ids
String sv_pipeline_docker
RuntimeAttr? runtime_attr_override
}
Int window = 1000
String common_arg = if common_model then "--common" else ""
String outlier_arg = if defined(outlier_sample_ids) then "--outlier-sample-ids ${outlier_sample_ids}" else ""
parameter_meta {
discfile: {
Expand Down Expand Up @@ -228,7 +235,7 @@ task PETest {
fi

tabix -s1 -b2 -e2 local.PE.txt.gz
svtk pe-test -o ~{window} ~{common_arg} --medianfile ~{medianfile} --samples ~{include_list} ~{vcf} local.PE.txt.gz ~{prefix}.stats
svtk pe-test -o ~{window} ~{common_arg} --medianfile ~{medianfile} --samples ~{include_list} ~{vcf} local.PE.txt.gz ~{prefix}.stats ~{outlier_arg}
>>>
runtime {
cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
Expand Down
3 changes: 3 additions & 0 deletions wdl/RDTestChromosome.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ workflow RDTestChromosome {
prefix = basename(split),
flags = flags,
ref_dict = ref_dict,
outlier_sample_ids = outlier_sample_ids,
sv_pipeline_docker = sv_pipeline_docker,
runtime_attr_override = runtime_attr_rdtest
}
Expand All @@ -72,6 +73,7 @@ workflow RDTestChromosome {
prefix = basename(split),
flags = flags,
ref_dict = ref_dict,
outlier_sample_ids = outlier_sample_ids,
sv_pipeline_docker = sv_pipeline_docker,
runtime_attr_override = runtime_attr_rdtest
}
Expand Down Expand Up @@ -100,6 +102,7 @@ workflow RDTestChromosome {
prefix = basename(split),
flags = flags,
ref_dict = ref_dict,
outlier_sample_ids = outlier_sample_ids,
sv_pipeline_docker = sv_pipeline_docker,
runtime_attr_override = runtime_attr_rdtest
}
Expand Down
3 changes: 3 additions & 0 deletions wdl/SRTest.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ workflow SRTest {
Boolean run_common
Int? common_cnv_size_cutoff # Required if run_common is true
File ref_dict
File? outlier_sample_ids

String sv_pipeline_docker
String linux_docker
Expand Down Expand Up @@ -55,6 +56,7 @@ workflow SRTest {
run_common = run_common,
common_cnv_size_cutoff = common_cnv_size_cutoff,
ref_dict = ref_dict,
outlier_sample_ids = outlier_sample_ids,
sv_base_mini_docker = sv_base_mini_docker,
linux_docker = linux_docker,
sv_pipeline_docker = sv_pipeline_docker,
Expand Down Expand Up @@ -85,6 +87,7 @@ workflow SRTest {
run_common = run_common,
common_cnv_size_cutoff = common_cnv_size_cutoff,
ref_dict = ref_dict,
outlier_sample_ids = outlier_sample_ids,
sv_base_mini_docker = sv_base_mini_docker,
linux_docker = linux_docker,
sv_pipeline_docker = sv_pipeline_docker,
Expand Down
Loading

0 comments on commit 7da11ee

Please sign in to comment.