Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Filter outlier samples in TrainGCNV and GenerateBatchMetrics #717

Open
wants to merge 44 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
44 commits
Select commit Hold shift + click to select a range
d5d5de4
Initial commit - excluded from gCNV training
kjaisingh Aug 26, 2024
1d76cec
corrected syntactical errors
kjaisingh Aug 26, 2024
c711532
Created test changes to RD R script
kjaisingh Aug 26, 2024
dd73a57
Satisfied WDL syntax constraints
kjaisingh Aug 26, 2024
c52a88e
Rolled back changes to use filtered_vcfs array
kjaisingh Aug 26, 2024
9715e16
Removed runtime attribute
kjaisingh Aug 26, 2024
2a26539
Integrated changes to FilterBatchSites
kjaisingh Aug 27, 2024
916e3f8
Resolve python linting error
kjaisingh Aug 27, 2024
cb9ca58
Undo changes to FilterBatchSites
kjaisingh Aug 27, 2024
97722bc
Added branch to Dockstore
kjaisingh Aug 27, 2024
c3d5b95
Minor change to enable docker sync
kjaisingh Aug 27, 2024
343af21
Use new GetFilteredSubsampledIndices workflow
kjaisingh Aug 27, 2024
2241fc8
Changes to RdTest.R script
kjaisingh Aug 28, 2024
1a6e4b9
Modified specified_cnv to include outlier param
kjaisingh Aug 28, 2024
a500cd0
Modified specified_cnv to include outlier param
kjaisingh Aug 28, 2024
a05d94f
Modified specified_cnv to include outlier param
kjaisingh Aug 28, 2024
89594d5
Syntactical error
kjaisingh Aug 29, 2024
d92bdd5
Include outlier_sample_ids in AggregateTests call
kjaisingh Aug 29, 2024
7da11ee
Included PE/SR/BAF tests
kjaisingh Aug 30, 2024
ead5afb
Minor script update
kjaisingh Aug 30, 2024
2d1a385
Modified references to outlier_sample_ids file
kjaisingh Aug 30, 2024
d775fd0
Modified outlier_arg to be passed directly
kjaisingh Sep 2, 2024
8c82ed5
Removed file-opening in argument declaration
kjaisingh Sep 2, 2024
0dd90db
Modified BAF & PE Test to circumvent matrix dimension problems
kjaisingh Sep 3, 2024
7f85a8f
Minor WDL changes to RDTestChromosome
kjaisingh Sep 3, 2024
2fd6532
Resolved conflicts
kjaisingh Sep 3, 2024
616dd07
Resolved trailing whitespace issues
kjaisingh Sep 3, 2024
5dea7e3
Used original CLI calls to tests
kjaisingh Sep 3, 2024
bd1220b
Added baf test call based on original
kjaisingh Sep 3, 2024
cbb3ea5
Modified reference to outlierSampleIds to use camel case
kjaisingh Sep 3, 2024
8f1eefe
Modified BAF test preprocess() to return filtered samples
kjaisingh Sep 3, 2024
71b1e02
Updated BAFTest to use correct CLI args
kjaisingh Sep 3, 2024
b154dc8
Removed additional ncol check
kjaisingh Sep 4, 2024
0e372a0
Removed gs copy operation
kjaisingh Sep 4, 2024
7d793de
Removed outlier filtering from specified_cnv
kjaisingh Sep 4, 2024
13897c9
Minor formatting
kjaisingh Sep 5, 2024
1b272d9
Modified BAF output format
kjaisingh Sep 7, 2024
0ef0e66
Modified RDTest to exclude outliers from background
kjaisingh Sep 19, 2024
528704f
Formatting fixes
kjaisingh Sep 19, 2024
de23b4b
Removed redundant deletion
kjaisingh Sep 19, 2024
4e0cc5b
Modified Rd test to allow for single row arrays
kjaisingh Sep 20, 2024
0be7af5
Modified directionality of row inclusion
kjaisingh Sep 20, 2024
9d763a5
Merge branch 'main' into kj/44_outlier_sample_removal
kjaisingh Oct 28, 2024
40248ae
Changes yet to be tested: removes outliers from reference calls too
kjaisingh Oct 28, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .github/.dockstore.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ workflows:
filters:
branches:
- main
- kj/44_outlier_sample_removal
tags:
- /.*/

Expand Down Expand Up @@ -51,6 +52,7 @@ workflows:
filters:
branches:
- main
- kj/44_outlier_sample_removal
tags:
- /.*/

Expand Down
25 changes: 19 additions & 6 deletions src/RdTest/RdTest.R
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,9 @@ option_list <- list(
make_option(c("-l", "--SampleExcludeList"), type="character", default=NULL,
help="Optional:Single column file with list of samples to exclude", metavar="character"),
make_option(c("-w", "--SampleIncludeList"), type="character", default=NULL,
help="Optional:Single column file with list of samples to include", metavar="character")
help="Optional:Single column file with list of samples to include", metavar="character"),
make_option(c("--outlierSampleIds"), type="character", default=NULL,
help="Optional:Path to file containing outlier sample IDs", metavar="character")
)

opt_parser <- OptionParser(option_list = option_list)
Expand Down Expand Up @@ -507,13 +509,14 @@ specified_cnv <- function(cnv_matrix, sampleIDs, cnvID, chr, start, end, cnvtype
##make sure first four columns are not modified##
columnswithsamp <- c(columnswithsamp, 1, 2, 3, 4)
genotype_matrix[1,-columnswithsamp] = 2
}
}

return(genotype_matrix)
}

###Kmeans multi-CNV Test##
##interval is measured by predicted copy state##
kMeans <-function(cnv_matrix,chr,start,end,cnvID,Kinterval,Kintervalstart,Kintervalend,outFolder,outputname)
kMeans <- function(cnv_matrix,chr,start,end,cnvID,Kinterval,Kintervalstart,Kintervalend,outFolder,outputname)
{
samplenames <- rownames(cnv_matrix)
#create Euclidean matrix###
Expand Down Expand Up @@ -931,7 +934,7 @@ plotJPG <- function(genotype_matrix,cnv_matrix,chr,start,end,cnvID,sampleIDs,out
}

##Provide genotype for VCF format##
genotype<- function(cnv_matrix,genotype_matrix,refgeno,chr,start,end,cnvID,sampleIDs,cnvtype,outFolder,outputname,plot_cnvmatrix)
genotype <- function(cnv_matrix,genotype_matrix,refgeno,chr,start,end,cnvID,sampleIDs,cnvtype,outFolder,outputname,plot_cnvmatrix)
{
##get depth intensities##
cnv_median <-c(create_groups(genotype_matrix, cnv_matrix)$Control,create_groups(genotype_matrix, cnv_matrix)$Treat)
Expand Down Expand Up @@ -1107,12 +1110,11 @@ runRdTest<-function(bed)
end = round(center + (sizefilter/2))
}

##Make sure region is in tabix##
if (end - start <= 0 )
{
end=start+1
}
##Make sure region is in tabix##


##Get Intensity Data##
if (exists("poorbincov")) {
Expand Down Expand Up @@ -1177,7 +1179,18 @@ runRdTest<-function(bed)
idsforsearch<-rownames(cnv_matrix)
samplestokeep<-match(unlist(strsplit(sampleIDs,",")),idsforsearch)
sampleIDs<-idsforsearch[na.omit(samplestokeep)]

#Exclude outlier samples only if non-outlier samples exist
if (!is.null(opt$outlierSampleIds)) {
outlier_ids <- readLines(opt$outlierSampleIds)
non_outlier_samples <- setdiff(sampleIDs, outlier_ids)
if (length(non_outlier_samples) > 0) {
sampleIDs <- non_outlier_samples
}
cnv_matrix <- cnv_matrix[!(rownames(cnv_matrix) %in% outlier_ids), , drop = FALSE]
}
samplesPrior <-unlist(strsplit(as.character(sampleIDs),split=","))

##Run K Test if Specified##
if (opt$runKmeans == TRUE) {
k_matrix<-kMeans(cnv_matrix,chr,start,end,cnvID,Kinterval,Kintervalstart,Kintervalend,outFolder,outputname)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -124,12 +124,17 @@ def fam_info_readin(fam_file):
return [fam, samp, fa, mo]


def process_metadata(variants, bed=False, batch_list=None):
def process_metadata(variants, bed=False, batch_list=None, outlier_sample_ids=None):
if bed:
samples = [s.strip() for s in batch_list.readlines()]
else:
samples = list(variants.header.samples)

outlier_set = set()
if outlier_sample_ids:
with open(outlier_sample_ids, 'r') as f:
outlier_set = set(line.strip() for line in f)

# parents = [s for s in samples if _is_parent(s)]
# children = [s for s in samples if _is_child(s)]
# n_parents = len(parents)
Expand Down Expand Up @@ -192,23 +197,11 @@ def process_metadata(variants, bed=False, batch_list=None):

# Flag variants specific to outlier samples
metadata['is_outlier_specific'] = False
for svtype in 'DEL DUP INV BND INS'.split():
counts = pd.DataFrame.from_dict(called_counts[svtype], orient='index')\
.reset_index()\
.rename(columns={'index': 'sample', 0: 'var_count'})
if counts.shape[0] == 0:
continue

q1 = counts.var_count.quantile(0.25)
q3 = counts.var_count.quantile(0.75)
thresh = q3 + 1.5 * (q3 - q1)
outliers = counts.loc[counts.var_count >= thresh, 'sample'].values

flagged = []
for var_name, samples in called_samples[svtype].items():
if samples.issubset(outliers):
flagged.append(var_name)
metadata.loc[metadata.name.isin(flagged), 'is_outlier_specific'] = True
if len(outlier_set) > 0:
for variants in called_samples.values():
for name, called in variants.items():
if called and called.issubset(outlier_set):
metadata.loc[metadata.name == name, 'is_outlier_specific'] = True

for col in 'start end svsize'.split():
metadata[col] = metadata[col].astype(int)
Expand Down Expand Up @@ -273,6 +266,7 @@ def main():
parser.add_argument('-b', '--BAFtest')
parser.add_argument('-s', '--SRtest')
parser.add_argument('-p', '--PEtest')
parser.add_argument('-o', '--outlier-sample-ids')
parser.add_argument('--batch-list', type=argparse.FileType('r'))
parser.add_argument('--segdups', required=True)
parser.add_argument('--rmsk', required=True)
Expand All @@ -290,7 +284,11 @@ def main():
variants = pysam.VariantFile(args.variants)
dtypes = 'PE SR RD BAF'.split()

metadata = process_metadata(variants, args.bed, args.batch_list)
outlier_sample_ids = None
if args.outlier_sample_ids:
outlier_sample_ids = args.outlier_sample_ids

metadata = process_metadata(variants, args.bed, args.batch_list, outlier_sample_ids)

# Calculate segdup coverage
bt = pbt.BedTool.from_dataframe(metadata['chrom start end'.split()])
Expand Down
63 changes: 38 additions & 25 deletions src/svtk/svtk/cli/baf_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
##########


def preprocess(chrom, start, end, tbx, samples, window=None):
def preprocess(chrom, start, end, tbx, samples, window=None, called_samples=None, outlier_sample_ids=None):
"""
Report normalized BAFs in a set of samples across a desired region.

Expand All @@ -33,55 +33,55 @@ def preprocess(chrom, start, end, tbx, samples, window=None):
called_bafs : pd.DataFrame
BAF of each called SNP within CNV
"""

# Load and filter SNP sites
if window is None:
window = end - start
# records = tbx.fetch(chrom, start - window, end + window,parser=pysam.asTuple())
# sites = deque()

bafs = deque()
if window < 1000000:
for record in tbx.fetch(chrom, max(1, start - window), end + window, parser=pysam.asTuple()):
bafs.append(np.array(record))
else:

for record in tbx.fetch(chrom, max(1, start - 1000000), start, parser=pysam.asTuple()):
bafs.append(np.array(record))
for record in tbx.fetch(chrom, (start + end) // 2 - 500000, (start + end) // 2 + 500000, parser=pysam.asTuple()):
bafs.append(np.array(record))
for record in tbx.fetch(chrom, end, end + 1000000, parser=pysam.asTuple()):
bafs.append(np.array(record))
bafs = np.array(bafs)
# if bafs.shape[0] == 0:
# return 0,0
bafs = pd.DataFrame(bafs)

bafs = pd.DataFrame(np.array(bafs))
if bafs.empty:
return bafs, bafs
return bafs, bafs, called_samples
bafs.columns = ['chr', 'pos', 'baf', 'sample']
bafs = bafs[bafs['sample'].isin(samples)]
# print(bafs)

bafs = bafs[bafs['sample'].isin(samples) & ~bafs['sample'].isin(outlier_sample_ids)]

# Exclude outlier samples from called samples only if non-outlier samples exist
non_outlier_called_samples = [s for s in called_samples if s not in outlier_sample_ids]
if non_outlier_called_samples:
retained_samples = non_outlier_called_samples
else:
retained_samples = called_samples

if bafs.empty:
return bafs, bafs
return bafs, bafs, retained_samples

bafs['pos'] = bafs.pos.astype(int)
bafs['baf'] = bafs.baf.astype(float)
# print(bafqs)
bafs.loc[bafs.pos <= start, 'region'] = 'before'
bafs.loc[bafs.pos >= end, 'region'] = 'after'
bafs.loc[(bafs.pos > start) & (bafs.pos < end), 'region'] = 'inside'
het_counts = bafs.groupby('sample region'.split()).size(
).reset_index().rename(columns={0: 'count'})
het_counts = het_counts.pivot_table(
values='count', index='sample', columns='region', fill_value=0)
het_counts = bafs.groupby('sample region'.split()).size().reset_index().rename(columns={0: 'count'})
het_counts = het_counts.pivot_table(values='count', index='sample', columns='region', fill_value=0)
het_counts = het_counts.reindex(samples).fillna(0).astype(int)
cols = 'before inside after sample'.split()
het_counts = het_counts.reset_index()
for colname in cols:
if colname not in het_counts.columns:
het_counts[colname] = 0
# het_counts = het_counts.reset_index()[cols]

# Report BAF for variants inside CNV
called_bafs = bafs.loc[bafs.region == 'inside'].copy()
return het_counts, called_bafs
return het_counts, called_bafs, retained_samples


def main(argv):
Expand All @@ -93,7 +93,7 @@ def main(argv):
parser.add_argument('file', help='Compiled snp file')
parser.add_argument('-b', '--batch',)
parser.add_argument('--index', help='Tabix index for remote bed')
# help='Samples')
parser.add_argument('--outlier-sample-ids', help='Path to file containing outlier sample IDs')

# Print help if no arguments specified
if len(argv) == 0:
Expand Down Expand Up @@ -125,6 +125,11 @@ def main(argv):
raise Exception('Must provide tabix index with remote URL')
tbx = pysam.TabixFile(args.file)

outlier_samples = set()
if args.outlier_sample_ids:
with open(args.outlier_sample_ids, 'r') as f:
outlier_samples = set(line.strip() for line in f)

# this is necessary to avoid stochasticity in calculation of KS statistic
np.random.seed(0)
random_state = np.random.RandomState(0)
Expand All @@ -141,8 +146,11 @@ def main(argv):
samplelist = samples.split(',')
type = dat[5]
try:
het_counts, called_bafs = preprocess(
chrom, start, end, tbx, samples=splist)
het_counts, called_bafs, samplelist = preprocess(
chrom, start, end, tbx, samples=splist,
outlier_sample_ids=outlier_samples,
called_samples=samplelist
)
except ValueError:
het_counts = pd.DataFrame()
called_bafs = pd.DataFrame()
Expand All @@ -152,7 +160,12 @@ def main(argv):
min(end - start, 1000000), random_state=random_state)
KS = KS2sample(called_bafs, samplelist)
ks, ksp = KS.test(samplelist)
mean, delp = Del.Ttest(samplelist)
try:
mean, delp = Del.Ttest(samplelist)
except Exception:
print(samplelist)
print(het_counts)
raise Exception
statis = Del.stats(samplelist)
line = chrom + '\t' + str(start) + '\t' + str(end) + '\t' + id + '\t' + samples + '\t' + type + '\t' + str(
mean) + ',' + str(delp) + "\t" + str(ks) + ',' + str(ksp) + '\t' + statis
Expand Down
19 changes: 16 additions & 3 deletions src/svtk/svtk/cli/pesr_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ def sr_test(argv):
'Same format as RdTest, one column per sample.')
parser.add_argument('--log', action='store_true', default=False,
help='Print progress log to stderr.')
parser.add_argument('--outlier-sample-ids', default=None,
help='Path to file containing outlier sample IDs.')

# Print help if no arguments specified
if len(argv) == 0:
Expand Down Expand Up @@ -87,9 +89,14 @@ def sr_test(argv):
else:
medians = None

outlier_sample_ids = None
if args.outlier_sample_ids:
outlier_sample_ids = args.outlier_sample_ids

runner = SRTestRunner(vcf, countfile, fout, args.background, common=args.common,
window=args.window, ins_window=args.insertion_window,
whitelist=whitelist, medians=medians, log=args.log)
whitelist=whitelist, medians=medians, log=args.log,
outlier_sample_ids=outlier_sample_ids)
runner.run()


Expand Down Expand Up @@ -126,6 +133,8 @@ def pe_test(argv):
'Same format as RdTest, one column per sample.')
parser.add_argument('--log', action='store_true', default=False,
help='Print progress log to stderr.')
parser.add_argument('--outlier-sample-ids', default=None,
help='Path to file containing outlier sample IDs.')

if len(argv) == 0:
parser.print_help()
Expand Down Expand Up @@ -163,8 +172,12 @@ def pe_test(argv):
else:
medians = None

runner = PETestRunner(vcf, discfile, fout, args.background, args.common,
args.window_in, args.window_out, whitelist, medians=medians, log=args.log)
outlier_sample_ids = None
if args.outlier_sample_ids:
outlier_sample_ids = args.outlier_sample_ids

runner = PETestRunner(vcf, discfile, fout, args.background, args.common, args.window_in, args.window_out,
whitelist, medians=medians, log=args.log, outlier_sample_ids=outlier_sample_ids)

runner.run()

Expand Down
6 changes: 3 additions & 3 deletions src/svtk/svtk/pesr/pe_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,8 +128,8 @@ def _get_coords(pos, strand):

class PETestRunner(PESRTestRunner):
def __init__(self, vcf, discfile, fout, n_background=160, common=False,
window_in=50, window_out=500,
whitelist=None, blacklist=None, medians=None, log=False):
window_in=50, window_out=500, whitelist=None, blacklist=None,
medians=None, log=False, outlier_sample_ids=None):
"""
vcf : pysam.VariantFile
discfile : pysam.TabixFile
Expand All @@ -138,7 +138,7 @@ def __init__(self, vcf, discfile, fout, n_background=160, common=False,
window_out, medians=medians)
self.fout = fout

super().__init__(vcf, common, n_background, whitelist, blacklist, log)
super().__init__(vcf, common, n_background, whitelist, blacklist, log, outlier_sample_ids)

def test_record(self, record):
if not self._strand_check(record):
Expand Down
15 changes: 13 additions & 2 deletions src/svtk/svtk/pesr/pesr_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ def null_score(self, null_val=0.0):

class PESRTestRunner:
def __init__(self, vcf, common=False, n_background=160, whitelist=None, blacklist=None,
log=False):
log=False, outlier_sample_ids=None):
self.vcf = vcf

self.common = common
Expand All @@ -105,6 +105,12 @@ def __init__(self, vcf, common=False, n_background=160, whitelist=None, blacklis

self.log = log

outlier_samples = set()
if outlier_sample_ids:
with open(outlier_sample_ids, 'r') as f:
outlier_samples = set([line.strip() for line in f])
self.outlier_sample_ids = outlier_samples

def run(self):
if self.log:
start = datetime.datetime.now()
Expand Down Expand Up @@ -134,7 +140,12 @@ def test_record(self, record):
def choose_background(self, record, whitelist=None, blacklist=None):
# Select called and background samples
called = svu.get_called_samples(record)
background = [s for s in self.samples if s not in called]
background = [s for s in self.samples if s not in called and s not in self.outlier_sample_ids]

# Exclude outlier samples only if non-outlier samples exist
non_outlier_called = [s for s in called if s not in self.outlier_sample_ids]
if len(non_outlier_called) > 0:
called = non_outlier_called

# Permit override of specified white/blacklists
whitelist = whitelist if whitelist is not None else self.whitelist
Expand Down
Loading
Loading