Skip to content

Commit

Permalink
initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
RuthEberhardt committed Apr 24, 2024
1 parent 8846da5 commit 2c4fc93
Show file tree
Hide file tree
Showing 8 changed files with 207 additions and 0 deletions.
116 changes: 116 additions & 0 deletions bin/split_samples.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
#!/usr/bin/env python3

#split samples into a batch of approximately the desired number.
#Take a list of VCFs and an integer, get samples from one vcf and split
import argparse
import subprocess
#import math
import os

def get_options():
parser = argparse.ArgumentParser()
parser.add_argument("--vcf_dir", type=str, help='VCF directory')
parser.add_argument("--batch_size", type=int, help='desired batch size')
parser.add_argument("--outdir", type=str, help='output directory containing sample lists')
args = parser.parse_args()
return args


def runcommand(cmd):
try:
byteoutput = subprocess.check_output(cmd, shell=True)
return byteoutput.decode('UTF-8').rstrip()
except subprocess.CalledProcessError as e:
print(e.output)
errstring = "Error in command: " + cmd
return errstring

def find_and_split_samples(vcf_dir, batch_size, outdir):
sample_list = get_sample_list(vcf_dir)
sample_list_split = split_samples(sample_list, batch_size)
write_sample_lists(sample_list_split, outdir)


def get_sample_list(vcf_dir):
vcf_files = os.listdir(vcf_dir)
for vf in vcf_files:
if vf.endswith('vcf.gz'):
cmd = "bcftools query -l " + vcf_dir + '/' + vf
samples = runcommand(cmd)
sample_list = samples.split()
return sample_list


def optimise_batch_size(batch_size, list_length):
optimised = 'no'
upper_limit = 2 * batch_size #don't let batch size end up more than double original
lower_limit = batch_size / 2 #don't let batch size end up less than half original
target_remainder = 0.75 * batch_size #want last chunk to be at least 75% of the chosen batch size
batch_increment = batch_size
batch_decrement = batch_size
stop_increment = 'no'
stop_decrement = 'no'
while optimised == 'no':
if batch_increment > upper_limit:
stop_increment = 'yes'
if stop_increment == 'no':
batch_increment+=1
last_chunk = list_length % batch_increment
if last_chunk >= target_remainder:
optimised == 'yes'
return batch_increment

if batch_decrement < lower_limit:
stop_decrement = 'yes'
if stop_decrement == 'no':
batch_decrement-=1
last_chunk = list_length % batch_decrement
if last_chunk >= target_remainder:
optimised == 'yes'
return batch_decrement

if stop_increment == 'yes' and stop_decrement == 'yes':
#can't find anything suitable - return the higher one
print("remainder is too low")
optimised == 'yes'
return batch_increment


def split_samples(sample_list, batch_size):
'''
Split sample list into batches of approximately the requested size.
The idea is not to have the last batch being too small
'''
list_length = len(sample_list)
#list_length = 15367
#n_chunks = list_length / batch_size
last_chunk = list_length % batch_size #the last chunk will be the remainer and will be smaller than the others
if last_chunk >= (0.75 * batch_size):#if the last chunk is big enough we are happy
final_batch_size = batch_size
else:
final_batch_size = optimise_batch_size(batch_size, list_length)

samples_split_list = [sample_list[i:i + final_batch_size] for i in range(0, len(sample_list), final_batch_size)]

return samples_split_list


def write_sample_lists(sample_list_split, outdir):
i = 1
for lst in sample_list_split:
sample_file = outdir + "/samples_" + str(i) + ".txt"
with open(sample_file, 'w') as sf:
sf.write(('\n').join(lst))
sf.write('\n')
i+=1


def main():
args = get_options()
find_and_split_samples(args.vcf_dir, args.batch_size, args.outdir)



if __name__ == '__main__':
main()

12 changes: 12 additions & 0 deletions main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
nextflow.enable.dsl=2

include { RUN_GLIMPSE } from './workflows/run_glimpse'

workflow MAIN {
RUN_GLIMPSE ()
}

workflow {
MAIN ()
}

17 changes: 17 additions & 0 deletions modules/local/split_samples/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
process SPLIT_SAMPLES {

label 'process_low'

input:
val(batch_size)
path(vcf_dir)

output:
path('samples_*.txt'), emit: sample_lists

script:
"""
split_samples.py --vcf_dir ${vcf_dir} --batch_size ${batch_size} --outdir .
"""

}
18 changes: 18 additions & 0 deletions modules/local/split_samples/meta.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
name: split_samples
description: Split samples in a VCF into batches
keywords:
- cutadapt
input:
- batch_size:
type: var
description: Target batch size
- vcf_list:
type: path
description: List of input VCFs
output:
- out:
type: path
description: files containing sample lists
pattern: "sample_*.txt"
authors:
- "@Ruth Eberhardt"
14 changes: 14 additions & 0 deletions modules/local/split_vcfs/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
process SPLIT_VCFS {

input:
tuple path(vcf), path(sample_list)

output:
path('vcf_sample_subset.vcf.gz')

script:
"""
bcftools view -S $sample_list $vcf -Oz -o vcf_sample_subset.vcf.gz
"""

}
Empty file.
9 changes: 9 additions & 0 deletions nextflow.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
params {

// number of samples to be processed in each batch. Recommended 100-200
batch_size = 10
//vcf_list = "/lustre/scratch126/humgen/teams/hgi/users/re3/blended_genomes_exomes/joint_calling/batch1_2/per_chromosome/vcf_list.txt" ///file containing list of inpout VCFs, expects per chromosome VCFs with the same samples
vcf_dir = "/lustre/scratch126/humgen/teams/hgi/users/re3/blended_genomes_exomes/glimpse_pipe_test/test_vcfs/"
workdir = "/lustre/scratch126/humgen/teams/hgi/users/re3/blended_genomes_exomes/glimpse_pipe_test/work"

}
21 changes: 21 additions & 0 deletions workflows/run_glimpse.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
include { SPLIT_SAMPLES } from '../modules/local/split_samples/main'
include { SPLIT_VCFS } from '../modules/local/split_vcfs/main'

workflow RUN_GLIMPSE {

SPLIT_SAMPLES(params.batch_size, params.vcf_dir)

vcfs = Channel.fromPath("${params.vcf_dir}/*.vcf.gz")
// pairs = vcfs.combine(SPLIT_SAMPLES.out.sample_lists.flatten())
// pairs.view()

vcfs
.combine(SPLIT_SAMPLES.out.sample_lists.flatten())
| SPLIT_VCFS




//SPLIT_VCFS(SPLIT_SAMPLES.out.sample_lists, params.vcf_list)

}

0 comments on commit 2c4fc93

Please sign in to comment.