diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..c1e4294 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,34 @@ +notifications: + slack: wtsi-cgpit:ptUMR1tkNyZJYd9TpGoss8WR + email: false + +env: + - CC=gcc + +addons: + apt: + packages: + - build-essential + - autoconf + - bsdtar + - time + - curl + - libcurl4-openssl-dev + - nettle-dev + - zlib1g-dev + - libncurses5-dev + - libpstreams-dev + - unzip + - libpng12-dev + - libexpat1-dev + +install: true + +language: perl + +perl: + - "5.22" + +script: + - ./setup.sh ~/wtsi-opt + - ~/wtsi-opt/bin/samtools view # dump usage to show intact diff --git a/MANIFEST b/MANIFEST index 03c4145..607ab83 100644 --- a/MANIFEST +++ b/MANIFEST @@ -1,3 +1,5 @@ +.travis.yml +bin/bam2bedgraph bin/bam_stats.pl bin/bam_to_sra_sub.pl bin/bamToBw.pl @@ -9,6 +11,7 @@ bin/gnos_pull.pl bin/monitor.pl bin/xam_coverage_bins.pl bin/xml_to_bas.pl +c/bam2bedgraph.c c/bam_access.c c/bam_access.h c/bam_stats.c @@ -27,6 +30,7 @@ c/dbg.h c/khash.h c/reheadSQ.c CHANGES.md +dists/patch/Bio-BigFile_build.patch dists/snappy-1.1.2.tar.gz docs.tar.gz examples/gnos_pull.ini diff --git a/README.md b/README.md index 6c55ad0..f8e59dc 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,11 @@ ICGC-TCGA-PCAP ============== -NGS reference implementations and helper code for the ICGC/TCGA Pan-Cancer Analysis Project +NGS reference implementations and helper code for the ICGC/TCGA Pan-Cancer Analysis Project. + +| Master | Dev | +|---|---| +| [![Build Status](https://travis-ci.org/ICGC-TCGA-PanCancer/PCAP-core.svg?branch=master)](https://travis-ci.org/ICGC-TCGA-PanCancer/PCAP-core) | [![Build Status](https://travis-ci.org/ICGC-TCGA-PanCancer/PCAP-core.svg?branch=dev)](https://travis-ci.org/ICGC-TCGA-PanCancer/PCAP-core) | This repository contains code to run genomic alignments of paired end data and subsequent calling algorithms. diff --git a/bin/bamToBw.pl b/bin/bamToBw.pl index 6df35c8..bc86eb1 100755 --- a/bin/bamToBw.pl +++ b/bin/bamToBw.pl @@ -82,6 +82,7 @@ sub setup { 'r|reference=s' => \$opts{'reference'}, 'p|process=s' => \$opts{'process'}, 'i|index=i' => \$opts{'index'}, + 'f|filter=i' => \$opts{'filter'}, 'j|jobs' => \$opts{'jobs'}, ) or pod2usage(2); @@ -106,6 +107,7 @@ sub setup { delete $opts{'process'} unless(defined $opts{'process'}); delete $opts{'index'} unless(defined $opts{'index'}); + delete $opts{'filter'} unless(defined $opts{'filter'}); # now safe to apply defaults $opts{'threads'} = 1 unless(defined $opts{'threads'}); @@ -160,10 +162,13 @@ =head1 SYNOPSIS Required parameters: -bam -b BAM/CRAM file to be processed. -outdir -o Folder to output result to. - -threads -t Number of threads to use. [1] -reference -r Path to genome.fa. - Actually using fa.fai but for convention just provide '.fa' file + Optional parameters: + -threads -t Number of threads to use. [1] + -filter -f Ignore reads with the filter flags [int] + Targeted processing: -process -p Only process this step then exit, optionally set -index bamToBw - Per chromosome BigWigs diff --git a/bin/bam_stats.pl b/bin/bam_stats.pl index 0758be8..01b08a1 100755 --- a/bin/bam_stats.pl +++ b/bin/bam_stats.pl @@ -32,11 +32,8 @@ use Config; # so we can see if threads are enabled use Data::Dumper; -BEGIN { - if($Config{useithreads}) { - require threads; - } -}; +our $CAN_USE_THREADS = 0; +$CAN_USE_THREADS = eval 'use threads; 1'; use PCAP::Bam::Stats; @@ -54,15 +51,20 @@ BEGIN $out_location = 'STDOUT'; } + if($opts->{'threads'} > 1 && $CAN_USE_THREADS == 0) { + warn "Threading is not available perl component will run as a single process"; + $opts->{'threads'} = 1; + } + try{ my $stats; - if($opts->{'threads'} > 1 && $Config{useithreads}) { + if($opts->{'threads'} > 1 && $CAN_USE_THREADS) { for my $thread(0..($opts->{'threads'}-1)) { my ($thr) = threads->create(\&stat_thread, $opts, $thread); } - sleep 2 while(threads->list(threads::running) > 0); + sleep 2 while(threads->list(threads::running()) > 0); my @bas_objs; - for my $thr(threads->list(threads::joinable)) { + for my $thr(threads->list(threads::joinable())) { push @bas_objs, $thr->join; if(my $err = $thr->error) { die "Thread error: $err\n"; } } diff --git a/bin/bwa_mem.pl b/bin/bwa_mem.pl index 6dfdb83..a86031a 100755 --- a/bin/bwa_mem.pl +++ b/bin/bwa_mem.pl @@ -97,6 +97,8 @@ sub setup { 'p|process=s' => \$opts{'process'}, 'i|index=i' => \$opts{'index'}, 'b|bwa=s' => \$opts{'bwa'}, + 'c|cram' => \$opts{'cram'}, + 'sc|scramble=s' => \$opts{'scramble'}, ) or pod2usage(2); pod2usage(-verbose => 1, -exitval => 0) if(defined $opts{'h'}); @@ -126,6 +128,9 @@ sub setup { delete $opts{'process'} unless(defined $opts{'process'}); delete $opts{'index'} unless(defined $opts{'index'}); delete $opts{'bwa'} unless(defined $opts{'bwa'}); + delete $opts{'scramble'} unless(defined $opts{'scramble'}); + + PCAP::Cli::opt_requires_opts('scramble', \%opts, ['cram']); # now safe to apply defaults $opts{'threads'} = 1 unless(defined $opts{'threads'}); @@ -189,10 +194,13 @@ =head1 SYNOPSIS -threads -t Number of threads to use. [1] Optional parameters: - -fragment -f Split input into fragements of X million repairs + -fragment -f Split input into fragements of X million repairs [10] -nomarkdup -n Don't mark duplicates - -bwa -b Single quoted string of parameters to pass to BWA - - overrides all defaults except '-t,-p,-R' + -cram -c Output cram, see '-sc' + -scramble -sc Single quoted string of parameters to pass to Scramble when '-c' used + - '-I,-O' are used internally and should not be provided + -bwa -b Single quoted string of additional parameters to pass to BWA + - '-t,-p,-R' are used internally and should not be provided Targeted processing: -process -p Only process this step then exit, optionally set -index diff --git a/bin/gnos_pull.pl b/bin/gnos_pull.pl index 9eae0bc..eab9cf6 100755 --- a/bin/gnos_pull.pl +++ b/bin/gnos_pull.pl @@ -43,6 +43,9 @@ use Proc::PID::File; use Data::Dumper; +my $CAN_USE_THREADS = 0; +$CAN_USE_THREADS = eval 'use threads; 1'; + use PCAP; use PCAP::Cli; @@ -99,7 +102,7 @@ sub load_config { } if(exists $options->{'COMPOSITE_FILTERS'}->{'not_sanger_workflow'}) { my %bl_workflows = map { $_ => 1 } split /[\n,]/, $options->{'COMPOSITE_FILTERS'}->{'not_sanger_workflow'}; - $options->{'COMPOSITE_FILTERS'}->{'not_sanger_workflow'} = [keys \%bl_workflows]; + $options->{'COMPOSITE_FILTERS'}->{'not_sanger_workflow'} = [keys %bl_workflows]; } croak sprintf q{'KEY_FILE' Ssection is absent from %s}, $options->{'config'} unless($cfg->SectionExists('KEY_FILES')); @@ -132,8 +135,6 @@ sub load_config { return 1; } -use threads; - sub pull_data { my ($options, $to_process) = @_; @@ -153,6 +154,10 @@ sub pull_data { } my $thread_count = $options->{'threads'}; + if($CAN_USE_THREADS == 0) { + warn "Threading is not available perl component will run as a single process"; + $thread_count = 1; + } while(@{$to_process} > 0) { @@ -170,13 +175,13 @@ sub pull_data { warn "Submitting: $donor->{donor_unique_id}\n"; if($thread_count > 1) { - if(threads->list(threads::all) < $thread_count) { + if(threads->list(threads::all()) < $thread_count) { threads->create($code_ref, $options, $donor, $orig_base, $donor_base); # don't sleep if not full yet - next if(threads->list(threads::all) < $thread_count); + next if(threads->list(threads::all()) < $thread_count); } - sleep 1 while(threads->list(threads::joinable) == 0); - for my $thr(threads->list(threads::joinable)) { + sleep 1 while(threads->list(threads::joinable()) == 0); + for my $thr(threads->list(threads::joinable())) { $thr->join; if(my $err = $thr->error) { die "Thread error: $err\n"; } } @@ -187,8 +192,8 @@ sub pull_data { } if($thread_count > 1) { # last gasp for any remaining threads - sleep 1 while(threads->list(threads::running) > 0); - for my $thr(threads->list(threads::joinable)) { + sleep 1 while(threads->list(threads::running()) > 0); + for my $thr(threads->list(threads::joinable())) { $thr->join; if(my $err = $thr->error) { die "Thread error: $err\n"; } } diff --git a/c/Makefile b/c/Makefile index 223215f..8d5c8b4 100644 --- a/c/Makefile +++ b/c/Makefile @@ -42,7 +42,7 @@ LFLAGS?= -L$(HTSTMP) # define any libraries to link into executable: # if I want to link in libraries (libx.so or libx.a) I use the -llibname # option, something like (this will link in libmylib.so and libm.so: -LIBS =-lhts -lpthread -lz -lm +LIBS =-lhts -lpthread -lz -lm -ldl # define the C source files SRCS = ./bam_access.c ./bam_stats_output.c ./bam_stats_calcs.c @@ -66,6 +66,7 @@ MD := mkdir BAM_STATS_TARGET=../bin/bam_stats CAT_TARGET=../bin/bwcat SQ_TARGET=../bin/reheadSQ +BAM2BG_TARGET=../bin/bam2bedgraph # # The following part of the makefile is generic; it can be used to @@ -77,7 +78,7 @@ SQ_TARGET=../bin/reheadSQ .NOTPARALLEL: test -all: clean pre make_htslib_tmp $(BAM_STATS_TARGET) test remove_htslib_tmp $(CAT_TARGET) $(SQ_TARGET) +all: clean pre make_htslib_tmp $(BAM_STATS_TARGET) $(BAM2BG_TARGET) test remove_htslib_tmp $(CAT_TARGET) $(SQ_TARGET) @echo bam_stats, reheadSQ and bwcat compiled. $(BAM_STATS_TARGET): $(OBJS) @@ -86,6 +87,9 @@ $(BAM_STATS_TARGET): $(OBJS) $(CAT_TARGET): $(CC) $(CAT_INCLUDES) $(CFLAGS) ./bwcat.c $(CAT_LFLAGS) -lBigWig -lz -lcurl -lm -lgnutls -ltasn1 -lhogweed -lnettle -lgmp -lp11-kit -o $(CAT_TARGET) +$(BAM2BG_TARGET): $(OBJS) + $(CC) $(CFLAGS) $(INCLUDES) -o $(BAM2BG_TARGET) $(OBJS) $(LFLAGS) $(LIBS) ./bam2bedgraph.c + $(SQ_TARGET): $(CC) $(CFLAGS) ./reheadSQ.c -o $(SQ_TARGET) diff --git a/c/bam2bedgraph.c b/c/bam2bedgraph.c new file mode 100644 index 0000000..a78cc05 --- /dev/null +++ b/c/bam2bedgraph.c @@ -0,0 +1,228 @@ +#include +#include +#include +#include +#include "htslib/sam.h" + +static char *input_file = NULL; +static char *output_file = NULL; +static char *region = NULL; +static int filter = 0; + +typedef struct { + uint32_t ltid; + int lstart,lcoverage,lpos,beg,end; + htsFile *in; + hts_idx_t *idx; + bam_hdr_t *head; + FILE *out; +} tmpstruct_t; + +void print_version (int exit_code){ + printf ("%s\n",VERSION); + exit(exit_code); +} + +int check_exist(char *fname){ + FILE *fp; + if((fp = fopen(fname,"r"))){ + fclose(fp); + return 1; + } + return 0; +} + +void print_usage (int exit_code){ + + printf ("Usage: bam2bedgraph -i input.[cr|b]am -o file [-r region] [-h] [-v]\n\n"); + printf ("Create a BEDGraph file of genomic coverage. BAM file must be sorted.\n"); + printf ("-i --input Path to bam/cram input file. [default: stdin]\n"); + printf ("-o --output File path for output. [default: stdout]\n\n"); + printf ("Optional:\n"); + printf ("-r --region Region in bam to access.\n"); + printf ("-f --filter Ignore reads with the filter flags [int].\n"); + printf ("Other:\n"); + printf ("-h --help Display this usage information.\n"); + printf ("-v --version Prints the version number.\n\n"); + exit(exit_code); +} + +void options(int argc, char *argv[]){ + const struct option long_opts[] = + { + {"version", no_argument, 0, 'v'}, + {"help",no_argument,0,'h'}, + {"input",required_argument,0,'i'}, + {"region",required_argument,0,'r'}, + {"filter",required_argument,0,'f'}, + {"output",required_argument,0,'o'}, + {"rna",no_argument,0, 'a'}, + { NULL, 0, NULL, 0} + + }; //End of declaring opts + + int index = 0; + int iarg = 0; + + //Iterate through options + while((iarg = getopt_long(argc, argv, "i:o:r:f:vh", long_opts, &index)) != -1){ + switch(iarg){ + case 'i': + input_file = optarg; + break; + + case 'o': + output_file = optarg; + break; + + case 'r': + region = optarg; + break; + + case 'f': + if(sscanf(optarg, "%i", &filter) != 1){ + printf("Error parsing -f argument '%s'. Should be an integer > 0",optarg); + print_usage(1); + } + break; + case 'h': + print_usage(0); + break; + + case 'v': + print_version(0); + break; + + case '?': + print_usage (1); + break; + + default: + print_usage (1); + + }; // End of args switch statement + + }//End of iteration through options + + //Do some checking to ensure required arguments were passed and are accessible files + if (input_file==NULL || strcmp(input_file,"/dev/stdin")==0) { + input_file = "-"; // htslib recognises this as a special case + } + if (strcmp(input_file,"-") != 0) { + if(check_exist(input_file) != 1){ + printf("Input file (-i) %s does not exist.\n",input_file); + print_usage(1); + } + } + if (output_file==NULL || strcmp(output_file,"/dev/stdout")==0) { + output_file = "-"; // we recognise this as a special case + } + + return; +} + +// callback for bam_plbuf_init() +static int pileup_func(uint32_t tid, uint32_t position, int n, const bam_pileup1_t *pl, void *data) +{ + tmpstruct_t *tmp = (tmpstruct_t*)data; + int pos = (int)position; + int coverage = n; + int i; + for (i=0;iltid != tid || tmp->lcoverage != coverage || pos > tmp->lpos+1) { + if (tmp->lpos > 0 && tmp->lcoverage > 0) + fprintf(tmp->out,"%s\t%d\t%d\t%d\n", tmp->head->target_name[tmp->ltid], tmp->lstart,tmp->lpos+1, tmp->lcoverage); + tmp->ltid = tid; + tmp->lstart = pos; + tmp->lcoverage = coverage; + } + tmp->lpos = pos; + return 0; +} + +int main(int argc, char *argv[]){ + options(argc, argv); + tmpstruct_t tmp; + bam_plp_t buf = NULL; + bam1_t *b = NULL; + hts_itr_t *iter = NULL; + tmp.beg = 0; tmp.end = 0x7fffffff; + tmp.lstart = 0; + tmp.lcoverage = 0; + tmp.ltid = 0; + tmp.lpos = 0; + + if (strcmp(output_file,"-")==0) { + tmp.out = stdout; + } else { + tmp.out = fopen(output_file,"w"); + } + if(tmp.out == NULL){ + fprintf(stderr,"Failed to open output file for %s writing.",output_file); + return 1; + } + tmp.in = hts_open(input_file, "r"); + if (tmp.in == 0) { + fprintf(stderr, "Fail to open [CR|B]AM file %s\n", input_file); + return 1; + } + tmp.head = sam_hdr_read(tmp.in); + buf = bam_plp_init(0,0); + //Iterate through each read in bam file. + b = bam_init1(); + if(region == NULL){ + tmp.idx=NULL; + int reto; + while((reto = sam_read1(tmp.in, tmp.head, b)) >= 0){ + if((b->core.flag & filter)>0) continue; //Skip if this is a filtered read + int ret, n_plp, tid, pos; + const bam_pileup1_t *plp; + ret = bam_plp_push(buf, b); + if (ret < 0) break; + while ((plp = bam_plp_next(buf, &tid, &pos, &n_plp)) != 0) + pileup_func(tid, pos, n_plp, plp, &tmp); + } + bam_plp_push(buf,0); + }else{ + + tmp.idx = sam_index_load(tmp.in,input_file); + if (tmp.idx == 0) { + fprintf(stderr, "Fail to open [CR|B]AM index for file %s\n", input_file); + return 1; + } + iter = sam_itr_querys(tmp.idx, tmp.head, region); + int result; + while ((result = sam_itr_next(tmp.in, iter, b)) >= 0) { + if((b->core.flag & filter)>0) continue; //Skip if this is a filtered read + int ret, n_plp, tid, pos; + const bam_pileup1_t *plp; + ret = bam_plp_push(buf, b); + if (ret < 0) break; + while ((plp = bam_plp_next(buf, &tid, &pos, &n_plp)) != 0){ + pileup_func(tid, pos, n_plp, plp, &tmp); + } + } + bam_plp_push(buf,0); + } + + //Check we've written everything... + int ret, n_plp, tid, pos; + const bam_pileup1_t *plp; + while ((plp = bam_plp_next(buf, &tid, &pos, &n_plp)) != 0){ + pileup_func(tid, pos, n_plp, plp, &tmp); + } + + fprintf(tmp.out,"%s\t%d\t%d\t%d\n", tmp.head->target_name[tmp.ltid], tmp.lstart,tmp.lpos+1, tmp.lcoverage); + bam_plp_destroy(buf); + bam_destroy1(b); + fflush(tmp.out); + fclose(tmp.out); + if(iter) sam_itr_destroy(iter); + if(tmp.idx) hts_idx_destroy(tmp.idx); + if(tmp.in) hts_close(tmp.in); + if(tmp.head) bam_hdr_destroy(tmp.head); + return 0; +} + diff --git a/dists/patch/Bio-BigFile_build.patch b/dists/patch/Bio-BigFile_build.patch new file mode 100644 index 0000000..2acbfaf --- /dev/null +++ b/dists/patch/Bio-BigFile_build.patch @@ -0,0 +1,11 @@ +--- Build.PL 2016-08-03 16:36:03.408448000 +0100 ++++ Build.PL.new 2016-08-03 16:38:09.324215000 +0100 +@@ -16,7 +16,7 @@ + dist_abstract => "Manipulate Jim Kent's BigWig and BigBed index files for genomic features.", + license => 'perl', + include_dirs => [$jk_include], +- extra_linker_flags => ["$jk_lib/$LibFile",'-lz','-lssl'], ++ extra_linker_flags => ["$jk_lib/$LibFile",'-lz','-lssl','-pthread'], + + extra_compiler_flags=>[ + # turn off warnings originating in Perl's Newx* calls diff --git a/docs.tar.gz b/docs.tar.gz index ee1db10..aff935a 100644 Binary files a/docs.tar.gz and b/docs.tar.gz differ diff --git a/lib/PCAP.pm b/lib/PCAP.pm index 5c022da..4c51aa7 100644 --- a/lib/PCAP.pm +++ b/lib/PCAP.pm @@ -26,7 +26,7 @@ use base 'Exporter'; use FindBin qw($Bin); use File::Which qw(which); -our $VERSION = '3.0.1'; +our $VERSION = '3.1.0'; our @EXPORT = qw($VERSION _which); const my $LICENSE => @@ -82,6 +82,7 @@ const my %UPGRADE_PATH => ( # all earlier versions need full upgrade '2.4.0' => 'biobambam', '2.5.0' => 'biobambam', '3.0.0' => '', + '3.1.0' => '', ); sub license { diff --git a/lib/PCAP/Bam.pm b/lib/PCAP/Bam.pm index 2aa8a8c..d6bfa7d 100644 --- a/lib/PCAP/Bam.pm +++ b/lib/PCAP/Bam.pm @@ -36,8 +36,11 @@ use Data::UUID; use PCAP::Threaded; const my $BAMCOLLATE => q{(%s colsbs=268435456 collate=1 reset=1 exclude=SECONDARY,QCFAIL,SUPPLEMENTARY classes=F,F2 T=%s filename=%s level=1 > %s)}; -const my $BAMBAM_DUP => q{ index=1 md5=1 tmpfile=%s O=%s M=%s markthreads=%s }; -const my $BAMBAM_MERGE => q{ tmpfile=%s md5filename=%s.md5 indexfilename=%s.bai index=1 md5=1 > %s}; +const my $BAMBAM_DUP => q{ %s index=1 md5=1 tmpfile=%s O=%s M=%s markthreads=%s }; +const my $BAMBAM_MERGE => q{ %s tmpfile=%s md5filename=%s.md5 indexfilename=%s.bai index=1 md5=1 > %s}; +const my $BAMBAM_DUP_CRAM => q{ %s tmpfile=%s M=%s markthreads=%s level=0| scramble -r %s -I bam -O cram %s | tee %s | samtools index - %s.crai}; +const my $BAMBAM_MERGE_CRAM => q{ %s tmpfile=%s level=0 | scramble -r %s -I bam -O cram %s | tee %s | samtools index - %s.crai}; +const my $CRAM_CHKSUM => q{md5sum %s | perl -ne '/^(\S+)/; print "$1";' > %s.md5}; const my $BAM_STATS => q{ -i %s -o %s}; sub new { @@ -96,45 +99,92 @@ sub merge_and_mark_dup { my ($options, $source) = @_; my $tmp = $options->{'tmp'}; my $marked = File::Spec->catdir($options->{'outdir'}, $options->{'sample'}); - $marked .= '.bam'; - return $marked if PCAP::Threaded::success_exists(File::Spec->catdir($tmp, 'progress'), 0); - my $helper_threads = $options->{'threads'}-1; - # uncoverable branch true - # uncoverable branch false - $helper_threads = 1 if($helper_threads < 1); - # uncoverable branch true - # uncoverable branch false - my $command; - if(defined $options->{'nomarkdup'} && $options->{'nomarkdup'} == 1) { - $command = _which('bammerge') || die "Unable to find 'bammarkduplicates' in path"; - $command .= sprintf $BAMBAM_MERGE, File::Spec->catfile($tmp, 'biormdup'), - $marked, - $marked, - $marked; + + my @commands; + + if($options->{'cram'}) { + $marked .= '.cram'; } else { - my $met = "$marked.met"; - $command = _which('bammarkduplicates2') || die "Unable to find 'bammarkduplicates' in path"; - $command .= sprintf $BAMBAM_DUP, File::Spec->catfile($tmp, 'biormdup'), - $marked, - $met, - $helper_threads; + $marked .= '.bam'; } + return $marked if PCAP::Threaded::success_exists(File::Spec->catdir($tmp, 'progress'), 0); + my $helper_threads = $options->{'threads'}-1; + + $helper_threads = 1 if($helper_threads < 1); + + my $input_str = q{}; if(defined $source) { opendir(my $dh, $source); while(my $file = readdir $dh) { next unless($file =~ m/_sorted\.bam$/); - $command .= ' I='.File::Spec->catfile($source, $file); + $input_str .= ' I='.File::Spec->catfile($source, $file); } closedir $dh; } else { for(@{$options->{'meta_set'}}) { - $command .= ' I='.$_->tstub.'_sorted.bam'; + $input_str .= ' I='.$_->tstub.'_sorted.bam'; } } - PCAP::Threaded::external_process_handler(File::Spec->catdir($tmp, 'logs'), $command, 0); + + my $bbb_tmp = File::Spec->catfile($tmp, 'biormdup'); + + if(defined $options->{'nomarkdup'} && $options->{'nomarkdup'} == 1) { + $commands[0] = _which('bammerge') || die "Unable to find 'bammerge' in path"; + + if($options->{'cram'}) { + my $add_sc = $options->{'scramble'} || q{}; + $commands[0] .= sprintf $BAMBAM_MERGE_CRAM, + $input_str, + $bbb_tmp, + $options->{'reference'}, + $add_sc, + $marked, + $marked; + } + else { + $commands[0] .= sprintf $BAMBAM_MERGE, + $input_str, + $bbb_tmp, + $marked, + $marked, + $marked; + } + } + else { + my $met = "$marked.met"; + $commands[0] = _which('bammarkduplicates2') || die "Unable to find 'bammarkduplicates' in path"; + if($options->{'cram'}) { + my $add_sc = $options->{'scramble'} || q{}; + $commands[0] .= sprintf $BAMBAM_DUP_CRAM, + $input_str, + $bbb_tmp, + $met, + $helper_threads, + $options->{'reference'}, + $add_sc, + $marked, + $marked; + } + else { + $commands[0] .= sprintf $BAMBAM_DUP, + $input_str, + $bbb_tmp, + $marked, + $met, + $helper_threads; + } + } + + if($options->{'cram'}) { + unshift @commands, 'set -o pipefail'; + push @commands, sprintf $CRAM_CHKSUM, $marked, $marked; + push @commands, 'set +o pipefail'; + } + + PCAP::Threaded::external_process_handler(File::Spec->catdir($tmp, 'logs'), \@commands, 0); PCAP::Threaded::touch_success(File::Spec->catdir($tmp, 'progress'), 0); return $marked; } @@ -143,11 +193,13 @@ sub bam_stats { # uncoverable subroutine my $options = shift; my $tmp = $options->{'tmp'}; - my $bam = File::Spec->catdir($options->{'outdir'}, $options->{'sample'}).'.bam';; - my $bas = "$bam.bas"; + my $ext = '.bam'; + $ext = '.cram' if($options->{'cram'}); + my $xam = File::Spec->catdir($options->{'outdir'}, $options->{'sample'}).$ext; + my $bas = "$xam.bas"; return $bas if PCAP::Threaded::success_exists(File::Spec->catdir($tmp, 'progress'), 0); my $command = _which('bam_stats') || die "Unable to find 'bam_stats' in path"; - $command .= sprintf $BAM_STATS, $bam, $bas; + $command .= sprintf $BAM_STATS, $xam, $bas; PCAP::Threaded::external_process_handler(File::Spec->catdir($tmp, 'logs'), $command, 0); PCAP::Threaded::touch_success(File::Spec->catdir($tmp, 'progress'), 0); return $bas; diff --git a/lib/PCAP/BigWig.pm b/lib/PCAP/BigWig.pm index 9f11f1d..b2ac2d7 100644 --- a/lib/PCAP/BigWig.pm +++ b/lib/PCAP/BigWig.pm @@ -38,6 +38,9 @@ sub bamToBw { return if(exists $options->{'index'} && $index != $options->{'index'}); return if PCAP::Threaded::success_exists(File::Spec->catdir($tmp, 'progress'), $index); + my $filter = 3844; # see https://broadinstitute.github.io/picard/explain-flags.html + $filter = $options->{'filter'} if(exists $options->{'filter'}); + my @seqs = @{$options->{'sequences'}}; my $iter = 1; for my $seq(@seqs) { @@ -46,11 +49,10 @@ sub bamToBw { my $outfile = q{'}.File::Spec->catfile($options->{'tmp'}, $seq.'.bw').q{'}; my $command = q{bash -c "set pipefail; }; - $command .= _which('samtools'); - $command .= q{ view -T }.$options->{'reference'}; - $command .= q{ -F 3844}; # don't include data that set these flags - $command .= q{ -ub }.$options->{'bam'}.q{ '}.$seq.q{'}; - $command .= ' | '._which('bam2bedgraph').' - '; + $command .= _which('bam2bedgraph'); + $command .= q{ -f }.$filter; + $command .= q{ -r }.$seq; + $command .= q{ -i }.$options->{'bam'}; $command .= ' | '; $command .= _which('wigToBigWig'); $command .= ' -fixedSummaries -keepAllChromosomes stdin '.$options->{'reference'}.'.fai '.$outfile; diff --git a/lib/PCAP/Bwa.pm b/lib/PCAP/Bwa.pm index 08d62de..27c9d9a 100644 --- a/lib/PCAP/Bwa.pm +++ b/lib/PCAP/Bwa.pm @@ -247,12 +247,12 @@ sub bwa_mem { # uncoverable branch true # uncoverable branch false $interleaved_fq = q{ -p}, unless($input->paired_fq); - if(exists $options->{'bwa'}) { - $bwa .= sprintf $BWA_MEM, $options->{'bwa'}, $interleaved_fq, $rg_line, $threads, $options->{'reference'}; - } - else { - $bwa .= sprintf $BWA_MEM, '-T 0', $interleaved_fq, $rg_line, $threads, $options->{'reference'}; - } + + my $add_options = q{}; + $add_options = $options->{'bwa'} if(exists $options->{'bwa'}); + + $bwa .= sprintf $BWA_MEM, $add_options, $interleaved_fq, $rg_line, $threads, $options->{'reference'}; + # uncoverable branch true # uncoverable branch false if($input->paired_fq) { diff --git a/lib/PCAP/Threaded.pm b/lib/PCAP/Threaded.pm index ae60996..6d0470a 100644 --- a/lib/PCAP/Threaded.pm +++ b/lib/PCAP/Threaded.pm @@ -27,17 +27,16 @@ use autodie qw(:all); use English qw( -no_match_vars ); use warnings FATAL => 'all'; use Carp qw( croak ); -use Config; # so we can see if threads are enabled use File::Spec; use File::Path qw(make_path); use Try::Tiny qw(try catch finally); use Capture::Tiny qw(capture); use IO::File; use Const::Fast qw(const); +use Scalar::Util qw(looks_like_number); -BEGIN { - if($Config{useithreads}) { use threads; } -}; +our $CAN_USE_THREADS = 0; +$CAN_USE_THREADS = eval 'use threads; 1'; const my $SCRIPT_OCT_MODE => 0777; @@ -45,15 +44,16 @@ our $OUT_ERR = 1; sub new { my ($class, $max_threads) = @_; - unless($Config{useithreads}) { - warn "Threading is not available perl component will run as a single process"; - $max_threads = 1; - } + croak "Number of threads was NAN: $max_threads" if(defined $max_threads && !looks_like_number($max_threads)); unless(defined $max_threads) { warn "Thread count not defined, defaulting to 1.\n"; $max_threads = 1; } - croak "Number of threads was NAN: $max_threads" if($max_threads !~ m/^[[:digit:]]+$/xms); + if($max_threads > 1 && $CAN_USE_THREADS == 0) { + warn "Threading is not available perl component will run as a single process"; + $max_threads = 1; + } + my $self = {'threads' => $max_threads, 'join_interval' => 1,}; bless $self, $class; @@ -109,24 +109,24 @@ sub run { my $thread_count = $self->{'functions'}->{$function_name}->{'threads'}; # uncoverable branch true - if($thread_count > 1 && $Config{useithreads}) { + if($thread_count > 1 && $CAN_USE_THREADS) { # reserve 0 for when people want to use 'success_exists/touch_success' for non-threaded steps # makes it easy to see in progress area which steps are threaded my $index = 1; while($index <= $iterations) { - while(threads->list(threads::all) < $thread_count && $index <= $iterations) { + while(threads->list(threads::all()) < $thread_count && $index <= $iterations) { threads->create($function_ref, $index++, @params); last if($index > $iterations); } - sleep $self->thread_join_interval while(threads->list(threads::joinable) == 0); - for my $thr(threads->list(threads::joinable)) { + sleep $self->thread_join_interval while(threads->list(threads::joinable()) == 0); + for my $thr(threads->list(threads::joinable())) { $thr->join; if(my $err = $thr->error) { die "Thread error: $err\n"; } } } # last gasp for any remaining threads - sleep $self->thread_join_interval while(threads->list(threads::running) > 0); - for my $thr(threads->list(threads::joinable)) { + sleep $self->thread_join_interval while(threads->list(threads::running()) > 0); + for my $thr(threads->list(threads::joinable())) { $thr->join; if(my $err = $thr->error) { die "Thread error: $err\n"; } } diff --git a/prerelease.sh b/prerelease.sh index f8c817a..73d5aba 100755 --- a/prerelease.sh +++ b/prerelease.sh @@ -37,7 +37,7 @@ export HARNESS_PERL_SWITCHES=-MDevel::Cover=-db,reports,-select='^lib/*\.pm$',-i rm -rf reports docs pm_to_blib blib cover -delete mkdir -p docs/reports_text -prove -w -I ./lib +prove -w -j 9 -I ./lib echo -e '\n\n### Generating test/pod coverage reports ###\n' # removed 'condition' from coverage as '||' 'or' doesn't work properly diff --git a/setup.sh b/setup.sh index b04db40..481f630 100755 --- a/setup.sh +++ b/setup.sh @@ -1,21 +1,25 @@ #!/bin/bash -SOURCE_BWA="https://github.com/lh3/bwa/archive/0.7.12.tar.gz" +SOURCE_BWA="https://github.com/lh3/bwa/archive/v0.7.15.tar.gz" SOURCE_SAMTOOLS="https://github.com/samtools/samtools/releases/download/1.3.1/samtools-1.3.1.tar.bz2" -# for bamstats -SOURCE_HTSLIB="https://github.com/samtools/htslib/archive/1.3.1.tar.gz" +# for bamstats and Bio::DB::HTS +SOURCE_HTSLIB="https://github.com/samtools/htslib/releases/download/1.3.1/htslib-1.3.1.tar.bz2" + +# Bio::DB::HTS +SOURCE_BIOBDHTS="https://github.com/Ensembl/Bio-HTS/archive/2.3.tar.gz" # for bigwig SOURCE_JKENT_BIN="https://github.com/ENCODE-DCC/kentUtils/raw/master/bin/linux.x86_64" # for Bio::DB::BigWig -SOURCE_KENTSRC="http://hgdownload.cse.ucsc.edu/admin/jksrc.zip" +SOURCE_KENTSRC="ftp://ftp.sanger.ac.uk/pub/cancer/legacy-dependancies/jksrc.v334.zip" +SOURCE_BIGFILE="http://www.cpan.org/authors/id/L/LD/LDS/Bio-BigFile-1.07.tar.gz" # for fast merging of per-chr BW files -SOURCE_LIB_BW="https://github.com/dpryan79/libBigWig/archive/0.1.6.tar.gz" +SOURCE_LIB_BW="https://github.com/dpryan79/libBigWig/archive/b36da5a06bffcc1b33c369e078b82f84625fd212.tar.gz" # for biobambam -SOURCE_BBB_BIN_DIST="https://github.com/gt1/biobambam2/releases/download/2.0.50-release-20160705161609/biobambam2-2.0.50-release-20160705161609-x86_64-etch-linux-gnu.tar.gz" +SOURCE_BBB_BIN_DIST="https://github.com/gt1/biobambam2/releases/download/2.0.54-release-20160802163650/biobambam2-2.0.54-release-20160802163650-x86_64-etch-linux-gnu.tar.gz" BIODBHTS_INSTALL="https://raw.githubusercontent.com/Ensembl/Bio-HTS/master/INSTALL.pl" @@ -31,10 +35,11 @@ get_distro () { echo "I don't understand the file type for $1" exit 1 fi + rm -f $1.$EXT if hash curl 2>/dev/null; then - curl -sS -o $1.$EXT -L $2 + curl --retry 10 -sS -o $1.$EXT -L $2 else - wget -nv -O $1.$EXT $2 + wget --tries=10 -nv -O $1.$EXT $2 fi } @@ -98,10 +103,16 @@ if ! ( perl -MExtUtils::MakeMaker -e 1 >/dev/null 2>&1); then echo "WARNING: Your Perl installation does not seem to include a complete set of core modules. Attempting to cope with this, but if installation fails please make sure that at least ExtUtils::MakeMaker is installed. For most users, the best way to do this is to use your system's package manager: apt, yum, fink, homebrew, or similar." fi -perlmods=( "File::ShareDir" "File::ShareDir::Install" "Const::Fast" "File::Which" ) -for i in "${perlmods[@]}" ; do - $CPANM --mirror http://cpan.metacpan.org -l $INST_PATH $i -done +if [ -e $SETUP_DIR/basePerlDeps.success ]; then + echo "Previously installed base perl deps..." +else + perlmods=( "ExtUtils::CBuilder" "Module::Build~0.42" "File::ShareDir" "File::ShareDir::Install" "Const::Fast" "File::Which" "LWP::UserAgent" "Bio::Root::Version~1.006009001") + for i in "${perlmods[@]}" ; do + $CPANM -v --no-interactive --notest --mirror http://cpan.metacpan.org -l $INST_PATH $i + done + touch $SETUP_DIR/basePerlDeps.success +fi + # figure out the upgrade path COMPILE=`echo 'nothing' | perl -I lib -MPCAP -ne 'print PCAP::upgrade_path($_);'` @@ -110,16 +121,51 @@ if [ -e "$INST_PATH/lib/perl5/PCAP.pm" ]; then fi COMPILE=",$COMPILE," -#Need to add CaVEMan stuff here... will depend on samtools too (for now). +echo -n "Get htslib ..." +if [ -e $SETUP_DIR/htslibGet.success ]; then + echo " already staged ..."; +else + echo + cd $SETUP_DIR + get_distro "htslib" $SOURCE_HTSLIB + touch $SETUP_DIR/htslibGet.success +fi + +echo -n "Building Bio::DB::HTS ..." +if [ -e $SETUP_DIR/biohts.success ]; then + echo " previously installed ..."; +else + echo + cd $SETUP_DIR + rm -rf bioDbHts + get_distro "bioDbHts" $SOURCE_BIOBDHTS + mkdir -p bioDbHts/htslib + tar --strip-components 1 -C bioDbHts -zxf bioDbHts.tar.gz + tar --strip-components 1 -C bioDbHts/htslib -jxf $SETUP_DIR/htslib.tar.bz2 + cd bioDbHts/htslib + perl -pi -e 'if($_ =~ m/^CFLAGS/ && $_ !~ m/\-fPIC/i){chomp; s/#.+//; $_ .= " -fPIC -Wno-unused -Wno-unused-result\n"};' Makefile + make -j$CPU + rm -f libhts.so* + cd ../ + env HTSLIB_DIR=$SETUP_DIR/bioDbHts/htslib perl Build.PL --install_base=$INST_PATH + ./Build test + ./Build install + cd $SETUP_DIR + rm -f bioDbHts.tar.gz + touch $SETUP_DIR/biohts.success +fi + echo -n "Building jkentUtils ..." if [ -e $SETUP_DIR/jkentUtils.success ]; then - echo -n " previously installed ..."; + echo " previously installed ..."; else + echo cd $SETUP_DIR if [[ `uname -m` == x86_64 ]] ; then get_file $INST_PATH/bin/wigToBigWig $SOURCE_JKENT_BIN/wigToBigWig chmod +x $INST_PATH/bin/wigToBigWig + touch $SETUP_DIR/jkentUtils.success else if [ ! -e $INST_DIR/bin/wigToBigWig ]; then echo "Binaries only available for x86_64, please install wigToBigWig from kentUtils: https://github.com/ENCODE-DCC/kentUtils" @@ -127,68 +173,79 @@ else fi fi fi -echo echo -n "Building htslib ..." if [ -e $SETUP_DIR/htslib.success ]; then - echo -n " previously installed ..."; + echo " previously installed ..."; else - cd $SETUP_DIR - get_distro "htslib" $SOURCE_HTSLIB + echo mkdir -p htslib - tar --strip-components 1 -C htslib -zxf htslib.tar.gz - make -C htslib -j$CPU + tar --strip-components 1 -C htslib -jxf htslib.tar.bz2 + cd htslib + ./configure --enable-plugins --enable-libcurl --prefix=$INST_PATH + make -j$CPU + make install + cd $SETUP_DIR touch $SETUP_DIR/htslib.success fi -echo + +export HTSLIB=$INST_PATH + +cd $INIT_DIR + +if [[ ",$COMPILE," == *,samtools,* ]] ; then + echo -n "Building samtools ..." + if [ -e $SETUP_DIR/samtools.success ]; then + echo " previously installed ..."; + else + echo + cd $SETUP_DIR + rm -rf samtools + get_distro "samtools" $SOURCE_SAMTOOLS + mkdir -p samtools + tar --strip-components 1 -C samtools -xjf samtools.tar.bz2 + cd samtools + ./configure --enable-plugins --enable-libcurl --prefix=$INST_PATH + make -j$CPU all all-htslib + make install all all-htslib + cd $SETUP_DIR + rm -f samtools.tar.bz2 + touch $SETUP_DIR/samtools.success + fi +else + echo "samtools - No change between PCAP versions" +fi echo -n "Building libBigWig ..." if [ -e $SETUP_DIR/libBigWig.success ]; then - echo -n " previously installed ..."; + echo " previously installed ..."; else + echo cd $SETUP_DIR get_distro "libBigWig" $SOURCE_LIB_BW mkdir -p libBigWig tar --strip-components 1 -C libBigWig -zxf libBigWig.tar.gz make -C libBigWig -j$CPU install prefix=$INST_PATH rm -f $INST_PATH/lib/libBigWig.so + rm -f libBigWig.tar.gz touch $SETUP_DIR/libBigWig.success fi -echo - -export HTSLIB="$SETUP_DIR/htslib" - -echo -n "Building bam_stats ..." -if [ -e $SETUP_DIR/bam_stats.success ]; then - echo -n " previously installed ..."; -else - cd $INIT_DIR - make -C c clean - make -C c -j$CPU prefix=$INST_PATH - cp bin/bam_stats $INST_PATH/bin/. - cp bin/bwcat $INST_PATH/bin/. - cp bin/reheadSQ $INST_PATH/bin/. - touch $SETUP_DIR/bam_stats.success - # need to clean up as will clash with other version - rm -rf $SAMTOOLS - make -C c clean -fi -echo cd $SETUP_DIR if [[ ",$COMPILE," == *,bwa,* ]] ; then echo -n "Building BWA ..." if [ -e $SETUP_DIR/bwa.success ]; then - echo -n " previously installed ..." + echo " previously installed ..." else - get_distro "bwa" $SOURCE_BWA - mkdir -p bwa - tar --strip-components 1 -C bwa -zxf bwa.tar.gz - make -C bwa -j$CPU - cp bwa/bwa $INST_PATH/bin/. - touch $SETUP_DIR/bwa.success + echo + get_distro "bwa" $SOURCE_BWA + mkdir -p bwa + tar --strip-components 1 -C bwa -zxf bwa.tar.gz + make -C bwa -j$CPU + cp bwa/bwa $INST_PATH/bin/. + rm -f bwa.tar.gz + touch $SETUP_DIR/bwa.success fi - echo else echo "BWA - No change between PCAP versions" fi @@ -198,6 +255,7 @@ if [[ ",$COMPILE," == *,biobambam,* ]] ; then if [ -e $SETUP_DIR/biobambam2.success ]; then echo " previously installed2 ..." else + echo cd $SETUP_DIR get_distro "biobambam2" $SOURCE_BBB_BIN_DIST mkdir -p biobambam2 @@ -208,8 +266,8 @@ if [[ ",$COMPILE," == *,biobambam,* ]] ; then cp -r biobambam2/etc/* $INST_PATH/etc/. cp -r biobambam2/lib/* $INST_PATH/lib/. cp -r biobambam2/share/* $INST_PATH/share/. + rm -f biobambam2.tar.gz touch $SETUP_DIR/biobambam2.success - echo fi else echo "biobambam - No change between PCAP versions" @@ -217,79 +275,73 @@ fi cd $INIT_DIR -if [[ ",$COMPILE," == *,samtools,* ]] ; then - echo -n "Building samtools ..." - if [ -e $SETUP_DIR/samtools.success ]; then - echo -n " previously installed ..."; - else - cd $SETUP_DIR - get_distro "samtools" $SOURCE_SAMTOOLS - mkdir -p samtools - tar --strip-components 1 -C samtools -xjf samtools.tar.bz2 - cd samtools - ./configure --enable-plugins --enable-libcurl --prefix=$INST_PATH - make all all-htslib - make install install-htslib - touch $SETUP_DIR/samtools.success - fi - echo -else - echo "samtools - No change between PCAP versions" -fi - -cd $INIT_DIR - -if [[ ",$COMPILE," == *,samtools,* ]] ; then - echo -n "Building Bio::DB::HTS ..." - if [ -e $SETUP_DIR/biohts.success ]; then - echo " previously installed ..."; - else - cd $SETUP_DIR - $CPANM --mirror http://cpan.metacpan.org --notest -l $INST_PATH Module::Build Bio::Root::Version - # now Bio::DB::HTS - get_file "INSTALL.pl" $BIODBHTS_INSTALL - perl -I $PERL5LIB INSTALL.pl --prefix $INST_PATH --static - rm -f INSTALL.pl - touch $SETUP_DIR/biohts.success - fi - echo -else - echo "Bio::DB::HTS - No change between PCAP versions" # based on samtools tag -fi - echo -n "Building kentsrc + Bio::DB::BigFile ..." if [ -e $SETUP_DIR/kentsrc.success ]; then echo " previously installed ..."; else + echo cd $SETUP_DIR + rm -rf kent kentsrc.zip get_distro "kentsrc" $SOURCE_KENTSRC unzip -q kentsrc.zip perl -pi -e 's/(\s+CFLAGS=)$/${1}-fPIC/' kent/src/inc/common.mk cd kent/src/lib export MACHTYPE=i686 # for a 64-bit system - make + make -j$CPU cd ../ export KENT_SRC=`pwd` cd $SETUP_DIR - $CPANM --mirror http://cpan.metacpan.org -l $INST_PATH Bio::DB::BigFile + mkdir -p bigfile + get_distro "bigfile" $SOURCE_BIGFILE + tar --strip-components 1 -C bigfile -zxf bigfile.tar.gz + cd bigfile + chmod u+w Build.PL + patch -p1 Build.PL < $INIT_DIR/dists/patch/Bio-BigFile_build.patch + perl Build.PL --install_base=$INST_PATH + ./Build + ./Build test + ./Build install + rm -f kentsrc.zip + touch $SETUP_DIR/kentsrc.success fi cd $INIT_DIR -echo -n "Installing Perl prerequisites ..." -$CPANM --mirror http://cpan.metacpan.org --notest -l $INST_PATH --installdeps . -echo +echo -n "Building PCAP-c ..." +if [ -e $SETUP_DIR/bam_stats.success ]; then + echo " previously installed ..."; +else + echo + cd $INIT_DIR + make -C c clean + env HTSLIB=$SETUP_DIR/htslib make -C c -j$CPU prefix=$INST_PATH + cp bin/bam_stats $INST_PATH/bin/. + cp bin/bwcat $INST_PATH/bin/. + cp bin/reheadSQ $INST_PATH/bin/. + cp bin/bam2bedgraph $INST_PATH/bin/. + touch $SETUP_DIR/bam_stats.success + make -C c clean +fi + +cd $INIT_DIR + +echo -n "Building PCAP_perlPrereq ..." +if [ -e $SETUP_DIR/PCAP_perlPrereq.success ]; then + echo "PCAP_perlPrereq previously installed ..."; +else + echo + $CPANM -v --no-interactive --mirror http://cpan.metacpan.org --notest -l $INST_PATH --installdeps . + touch $SETUP_DIR/PCAP_perlPrereq.success +fi echo -n "Installing PCAP ..." -$CPANM --mirror http://cpan.metacpan.org -l $INST_PATH . +$CPANM -v --no-interactive --mirror http://cpan.metacpan.org -l $INST_PATH . echo # cleanup all junk rm -rf $SETUP_DIR rm -rf $INIT_DIR/bin/biobambam - - echo echo echo "Please add the following to beginning of path:"