initial commit

wtsi-hgi · Apr 24, 2024 · 2c4fc93 · 2c4fc93
1 parent 8846da5
commit 2c4fc93
Show file tree

Hide file tree

Showing 8 changed files with 207 additions and 0 deletions.
diff --git a/bin/split_samples.py b/bin/split_samples.py
@@ -0,0 +1,116 @@
+#!/usr/bin/env python3
+
+#split samples into a batch of approximately the desired number. 
+#Take a list of VCFs and an integer, get samples from one vcf and split
+import argparse
+import subprocess
+#import math
+import os
+
+def get_options():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--vcf_dir", type=str, help='VCF directory')
+    parser.add_argument("--batch_size", type=int, help='desired batch size')
+    parser.add_argument("--outdir", type=str, help='output directory containing sample lists')
+    args = parser.parse_args()
+    return args
+
+
+def runcommand(cmd):
+    try:
+        byteoutput = subprocess.check_output(cmd, shell=True)
+        return byteoutput.decode('UTF-8').rstrip()
+    except subprocess.CalledProcessError as e:
+        print(e.output)
+        errstring = "Error in command: " + cmd
+        return errstring
+
+def find_and_split_samples(vcf_dir, batch_size, outdir):
+    sample_list = get_sample_list(vcf_dir)
+    sample_list_split = split_samples(sample_list, batch_size)
+    write_sample_lists(sample_list_split, outdir)
+
+
+def get_sample_list(vcf_dir): 
+    vcf_files = os.listdir(vcf_dir)  
+    for vf in vcf_files:
+        if vf.endswith('vcf.gz'):
+            cmd = "bcftools query -l " + vcf_dir + '/' + vf
+            samples = runcommand(cmd)
+            sample_list = samples.split()
+            return sample_list
+
+
+def optimise_batch_size(batch_size, list_length):
+    optimised = 'no'
+    upper_limit = 2 * batch_size #don't let batch size end up more than double original
+    lower_limit = batch_size / 2 #don't let batch size end up less than half original
+    target_remainder = 0.75 * batch_size #want last chunk to be at least 75% of the chosen batch size
+    batch_increment = batch_size
+    batch_decrement = batch_size
+    stop_increment = 'no'
+    stop_decrement = 'no'
+    while optimised == 'no':
+        if batch_increment > upper_limit:
+            stop_increment = 'yes'
+        if stop_increment == 'no':
+            batch_increment+=1
+            last_chunk = list_length % batch_increment
+            if last_chunk >= target_remainder:
+                optimised == 'yes'
+                return batch_increment
+
+        if batch_decrement < lower_limit:
+            stop_decrement = 'yes'
+        if stop_decrement == 'no':
+            batch_decrement-=1
+            last_chunk = list_length % batch_decrement
+            if last_chunk >= target_remainder:
+                optimised == 'yes'
+                return batch_decrement
+
+        if stop_increment == 'yes' and stop_decrement == 'yes':
+            #can't find anything suitable - return the higher one
+            print("remainder is too low")
+            optimised == 'yes'
+            return batch_increment
+
+
+def split_samples(sample_list, batch_size):
+    '''
+    Split sample list into batches of approximately the requested size. 
+    The idea is not to have the last batch being too small
+    '''
+    list_length = len(sample_list)
+    #list_length = 15367
+    #n_chunks = list_length / batch_size
+    last_chunk = list_length % batch_size #the last chunk will be the remainer and will be smaller than the others
+    if last_chunk >= (0.75 * batch_size):#if the last chunk is big enough we are happy
+        final_batch_size = batch_size
+    else:
+        final_batch_size = optimise_batch_size(batch_size, list_length)
+
+    samples_split_list = [sample_list[i:i + final_batch_size] for i in range(0, len(sample_list), final_batch_size)] 
+
+    return samples_split_list
+
+
+def write_sample_lists(sample_list_split, outdir):
+    i = 1
+    for lst in sample_list_split:
+        sample_file = outdir + "/samples_" + str(i) + ".txt"
+        with open(sample_file, 'w') as sf:
+            sf.write(('\n').join(lst))
+            sf.write('\n')
+        i+=1
+
+
+def main():
+    args = get_options()
+    find_and_split_samples(args.vcf_dir, args.batch_size, args.outdir)
+
+
+
+if __name__ == '__main__':
+    main() 
+
diff --git a/main.nf b/main.nf
@@ -0,0 +1,12 @@
+nextflow.enable.dsl=2
+
+include { RUN_GLIMPSE } from './workflows/run_glimpse'
+
+workflow MAIN {
+    RUN_GLIMPSE ()
+}
+
+workflow {
+    MAIN ()
+}
+
diff --git a/modules/local/split_samples/main.nf b/modules/local/split_samples/main.nf
@@ -0,0 +1,17 @@
+process SPLIT_SAMPLES {
+
+    label 'process_low'
+
+    input:
+    val(batch_size)
+    path(vcf_dir)
+
+    output:
+    path('samples_*.txt'), emit: sample_lists
+
+    script:
+    """
+    split_samples.py --vcf_dir ${vcf_dir} --batch_size ${batch_size} --outdir .
+    """
+
+}
diff --git a/modules/local/split_samples/meta.yml b/modules/local/split_samples/meta.yml
@@ -0,0 +1,18 @@
+name: split_samples
+description: Split samples in a VCF into batches
+keywords:
+  - cutadapt
+input:
+  - batch_size:
+      type: var
+      description: Target batch size
+  - vcf_list:
+      type: path
+      description: List of input VCFs
+output:
+  - out:
+      type: path
+      description: files containing sample lists
+      pattern: "sample_*.txt"
+authors:
+  - "@Ruth Eberhardt"
diff --git a/modules/local/split_vcfs/main.nf b/modules/local/split_vcfs/main.nf
@@ -0,0 +1,14 @@
+process SPLIT_VCFS {
+
+    input:
+    tuple path(vcf), path(sample_list)
+
+    output:
+    path('vcf_sample_subset.vcf.gz')
+
+    script:
+    """
+    bcftools view -S $sample_list $vcf -Oz -o vcf_sample_subset.vcf.gz
+    """
+
+}
diff --git a/modules/local/split_vcfs/meta.yml b/modules/local/split_vcfs/meta.yml
diff --git a/nextflow.config b/nextflow.config
@@ -0,0 +1,9 @@
+params {
+
+    // number of samples to be processed in each batch. Recommended 100-200
+    batch_size = 10 
+    //vcf_list = "/lustre/scratch126/humgen/teams/hgi/users/re3/blended_genomes_exomes/joint_calling/batch1_2/per_chromosome/vcf_list.txt" ///file containing list of inpout VCFs, expects per chromosome VCFs with the same samples
+    vcf_dir = "/lustre/scratch126/humgen/teams/hgi/users/re3/blended_genomes_exomes/glimpse_pipe_test/test_vcfs/"
+    workdir = "/lustre/scratch126/humgen/teams/hgi/users/re3/blended_genomes_exomes/glimpse_pipe_test/work"
+
+}
diff --git a/workflows/run_glimpse.nf b/workflows/run_glimpse.nf
@@ -0,0 +1,21 @@
+include { SPLIT_SAMPLES } from '../modules/local/split_samples/main'
+include { SPLIT_VCFS } from '../modules/local/split_vcfs/main'
+
+workflow RUN_GLIMPSE {
+
+    SPLIT_SAMPLES(params.batch_size, params.vcf_dir)
+
+    vcfs = Channel.fromPath("${params.vcf_dir}/*.vcf.gz")
+    // pairs = vcfs.combine(SPLIT_SAMPLES.out.sample_lists.flatten())
+    // pairs.view()
+
+    vcfs
+        .combine(SPLIT_SAMPLES.out.sample_lists.flatten())
+        | SPLIT_VCFS
+
+
+
+
+    //SPLIT_VCFS(SPLIT_SAMPLES.out.sample_lists, params.vcf_list)
+
+}