diff --git a/docs/usage.md b/docs/usage.md index 7c5f4e0..8f91754 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -31,6 +31,16 @@ CONTROL_REP1,AEG588A1_S1_L003_R1_001.fastq.gz,AEG588A1_S1_L003_R2_001.fastq.gz CONTROL_REP1,AEG588A1_S1_L004_R1_001.fastq.gz,AEG588A1_S1_L004_R2_001.fastq.gz ``` +### Create `samplesheet.csv` using helper script + +When you have many samples, manually creating `samplesheet.csv` can be tedious and error-prone. There is a python script [samplesheet.py](../scripts/samplesheet.py) in the scripts directory that can help you extract the path of all paired-end fastq files in the specified folders and create a samplesheet.csv file. + +``` +python scripts/samplesheet.py folder1,folder2 +``` + +The sample name is extracted from the prefix of the fastq files. You may still need to manually change the sample column. + ## Running the pipeline The typical command for running the pipeline is as follows: diff --git a/scripts/samplesheet.py b/scripts/samplesheet.py new file mode 100644 index 0000000..374d23c --- /dev/null +++ b/scripts/samplesheet.py @@ -0,0 +1,53 @@ +#!/usr/bin/env python +""" +samplesheet.py folder1,folder2 +read all paired fastq files in folders and create a samplesheet.csv file. +""" +import csv +import os +import argparse + +def get_read(folder): + read1_list = [f'_1', f'R1', f'R1_001'] + read2_list = [f'_2', f'R2', f'R2_001'] + fq_list = ['fq', 'fastq'] + suffix1_list = [ + f'{x}.{y}.gz' + for x in read1_list + for y in fq_list + ] + suffix2_list = [ + f'{x}.{y}.gz' + for x in read2_list + for y in fq_list + ] + + for file in os.listdir(folder): + r1 = os.path.join(folder, file) + if os.path.isfile(r1): + for suffix1, suffix2 in zip(suffix1_list, suffix2_list): + if file.endswith(suffix1): + prefix = file[:-len(suffix1)] + r2 = os.path.join(folder, prefix+suffix2) + if not os.path.exists(r2): + print("warning: {r1} exists, but {r2} not exists. continue") + continue + prefix = prefix.split("_")[0] + print(prefix, r1, r2) + yield prefix,r1,r2 + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('folders') + args = parser.parse_args() + with open('samplesheet.csv', 'w', newline='') as f: + writer = csv.writer(f) + writer.writerow(['sample', 'fastq_1', 'fastq_2']) + for folder in args.folders.split(','): + folder = os.path.abspath(folder) + for prefix, r1, r2 in get_read(folder): + writer.writerow([prefix, r1, r2]) + print("samplesheet.csv created") + +