-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
8846da5
commit 2c4fc93
Showing
8 changed files
with
207 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,116 @@ | ||
#!/usr/bin/env python3 | ||
|
||
#split samples into a batch of approximately the desired number. | ||
#Take a list of VCFs and an integer, get samples from one vcf and split | ||
import argparse | ||
import subprocess | ||
#import math | ||
import os | ||
|
||
def get_options(): | ||
parser = argparse.ArgumentParser() | ||
parser.add_argument("--vcf_dir", type=str, help='VCF directory') | ||
parser.add_argument("--batch_size", type=int, help='desired batch size') | ||
parser.add_argument("--outdir", type=str, help='output directory containing sample lists') | ||
args = parser.parse_args() | ||
return args | ||
|
||
|
||
def runcommand(cmd): | ||
try: | ||
byteoutput = subprocess.check_output(cmd, shell=True) | ||
return byteoutput.decode('UTF-8').rstrip() | ||
except subprocess.CalledProcessError as e: | ||
print(e.output) | ||
errstring = "Error in command: " + cmd | ||
return errstring | ||
|
||
def find_and_split_samples(vcf_dir, batch_size, outdir): | ||
sample_list = get_sample_list(vcf_dir) | ||
sample_list_split = split_samples(sample_list, batch_size) | ||
write_sample_lists(sample_list_split, outdir) | ||
|
||
|
||
def get_sample_list(vcf_dir): | ||
vcf_files = os.listdir(vcf_dir) | ||
for vf in vcf_files: | ||
if vf.endswith('vcf.gz'): | ||
cmd = "bcftools query -l " + vcf_dir + '/' + vf | ||
samples = runcommand(cmd) | ||
sample_list = samples.split() | ||
return sample_list | ||
|
||
|
||
def optimise_batch_size(batch_size, list_length): | ||
optimised = 'no' | ||
upper_limit = 2 * batch_size #don't let batch size end up more than double original | ||
lower_limit = batch_size / 2 #don't let batch size end up less than half original | ||
target_remainder = 0.75 * batch_size #want last chunk to be at least 75% of the chosen batch size | ||
batch_increment = batch_size | ||
batch_decrement = batch_size | ||
stop_increment = 'no' | ||
stop_decrement = 'no' | ||
while optimised == 'no': | ||
if batch_increment > upper_limit: | ||
stop_increment = 'yes' | ||
if stop_increment == 'no': | ||
batch_increment+=1 | ||
last_chunk = list_length % batch_increment | ||
if last_chunk >= target_remainder: | ||
optimised == 'yes' | ||
return batch_increment | ||
|
||
if batch_decrement < lower_limit: | ||
stop_decrement = 'yes' | ||
if stop_decrement == 'no': | ||
batch_decrement-=1 | ||
last_chunk = list_length % batch_decrement | ||
if last_chunk >= target_remainder: | ||
optimised == 'yes' | ||
return batch_decrement | ||
|
||
if stop_increment == 'yes' and stop_decrement == 'yes': | ||
#can't find anything suitable - return the higher one | ||
print("remainder is too low") | ||
optimised == 'yes' | ||
return batch_increment | ||
|
||
|
||
def split_samples(sample_list, batch_size): | ||
''' | ||
Split sample list into batches of approximately the requested size. | ||
The idea is not to have the last batch being too small | ||
''' | ||
list_length = len(sample_list) | ||
#list_length = 15367 | ||
#n_chunks = list_length / batch_size | ||
last_chunk = list_length % batch_size #the last chunk will be the remainer and will be smaller than the others | ||
if last_chunk >= (0.75 * batch_size):#if the last chunk is big enough we are happy | ||
final_batch_size = batch_size | ||
else: | ||
final_batch_size = optimise_batch_size(batch_size, list_length) | ||
|
||
samples_split_list = [sample_list[i:i + final_batch_size] for i in range(0, len(sample_list), final_batch_size)] | ||
|
||
return samples_split_list | ||
|
||
|
||
def write_sample_lists(sample_list_split, outdir): | ||
i = 1 | ||
for lst in sample_list_split: | ||
sample_file = outdir + "/samples_" + str(i) + ".txt" | ||
with open(sample_file, 'w') as sf: | ||
sf.write(('\n').join(lst)) | ||
sf.write('\n') | ||
i+=1 | ||
|
||
|
||
def main(): | ||
args = get_options() | ||
find_and_split_samples(args.vcf_dir, args.batch_size, args.outdir) | ||
|
||
|
||
|
||
if __name__ == '__main__': | ||
main() | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
nextflow.enable.dsl=2 | ||
|
||
include { RUN_GLIMPSE } from './workflows/run_glimpse' | ||
|
||
workflow MAIN { | ||
RUN_GLIMPSE () | ||
} | ||
|
||
workflow { | ||
MAIN () | ||
} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
process SPLIT_SAMPLES { | ||
|
||
label 'process_low' | ||
|
||
input: | ||
val(batch_size) | ||
path(vcf_dir) | ||
|
||
output: | ||
path('samples_*.txt'), emit: sample_lists | ||
|
||
script: | ||
""" | ||
split_samples.py --vcf_dir ${vcf_dir} --batch_size ${batch_size} --outdir . | ||
""" | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
name: split_samples | ||
description: Split samples in a VCF into batches | ||
keywords: | ||
- cutadapt | ||
input: | ||
- batch_size: | ||
type: var | ||
description: Target batch size | ||
- vcf_list: | ||
type: path | ||
description: List of input VCFs | ||
output: | ||
- out: | ||
type: path | ||
description: files containing sample lists | ||
pattern: "sample_*.txt" | ||
authors: | ||
- "@Ruth Eberhardt" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
process SPLIT_VCFS { | ||
|
||
input: | ||
tuple path(vcf), path(sample_list) | ||
|
||
output: | ||
path('vcf_sample_subset.vcf.gz') | ||
|
||
script: | ||
""" | ||
bcftools view -S $sample_list $vcf -Oz -o vcf_sample_subset.vcf.gz | ||
""" | ||
|
||
} |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
params { | ||
|
||
// number of samples to be processed in each batch. Recommended 100-200 | ||
batch_size = 10 | ||
//vcf_list = "/lustre/scratch126/humgen/teams/hgi/users/re3/blended_genomes_exomes/joint_calling/batch1_2/per_chromosome/vcf_list.txt" ///file containing list of inpout VCFs, expects per chromosome VCFs with the same samples | ||
vcf_dir = "/lustre/scratch126/humgen/teams/hgi/users/re3/blended_genomes_exomes/glimpse_pipe_test/test_vcfs/" | ||
workdir = "/lustre/scratch126/humgen/teams/hgi/users/re3/blended_genomes_exomes/glimpse_pipe_test/work" | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
include { SPLIT_SAMPLES } from '../modules/local/split_samples/main' | ||
include { SPLIT_VCFS } from '../modules/local/split_vcfs/main' | ||
|
||
workflow RUN_GLIMPSE { | ||
|
||
SPLIT_SAMPLES(params.batch_size, params.vcf_dir) | ||
|
||
vcfs = Channel.fromPath("${params.vcf_dir}/*.vcf.gz") | ||
// pairs = vcfs.combine(SPLIT_SAMPLES.out.sample_lists.flatten()) | ||
// pairs.view() | ||
|
||
vcfs | ||
.combine(SPLIT_SAMPLES.out.sample_lists.flatten()) | ||
| SPLIT_VCFS | ||
|
||
|
||
|
||
|
||
//SPLIT_VCFS(SPLIT_SAMPLES.out.sample_lists, params.vcf_list) | ||
|
||
} |