Skip to content

Commit

Permalink
vcfcov: clearer VCF sample format
Browse files Browse the repository at this point in the history
make-pipeline.pl:
- add --single-colour option to avoid inferring edges

vcfcov.c:
- report sum(covgs) / expected_n_kmers
- don't report previously used coverage / nkmers numbers
- add command string to output VCF header
- print stats on total number of kmers used
- updated tests to reflect changes

libs/carrays:
- add libs/carrays library to quickly get array median using qselect()

util.c:
- remove functionality that has moved to carrays

General:
- comparison functions use cmp() macro instead of weird (long) casting
- use gca_median() instead of sorting then picking median
  • Loading branch information
noporpoise committed Oct 28, 2015
1 parent c5c8b27 commit 2b13c5e
Show file tree
Hide file tree
Showing 41 changed files with 621 additions and 717 deletions.
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -66,3 +66,6 @@
[submodule "libs/vcfhp"]
path = libs/vcf-slim
url = https://github.com/noporpoise/vcf-slim
[submodule "libs/carrays"]
path = libs/carrays
url = https://github.com/noporpoise/carrays.git
3 changes: 2 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ LIB_HTS=libs/htslib/libhts.a
LIB_ALIGN=libs/seq-align/src/libalign.a
# LIB_STRS=libs/string_buffer/libstrbuf.a
LIB_STRS=libs/string_buffer/string_buffer.o
LIB_CARRAYS=libs/carrays/carrays.o

MISC_SRCS=$(wildcard libs/misc/*.c)
MISC_HDRS=$(wildcard libs/misc/*.h)
Expand All @@ -90,7 +91,7 @@ endif
INCS=-I libs -I $(IDIR_HTS) -I $(IDIR_SEQ) $(EXTRA_INCS)

# Library linking
LIB_OBJS=$(LIB_MISC) $(LIB_STRS) $(LIB_HTS) $(LIB_ALIGN) libs/cJSON/cJSON.o
LIB_OBJS=$(LIB_MISC) $(LIB_STRS) $(LIB_CARRAYS) $(LIB_HTS) $(LIB_ALIGN) libs/cJSON/cJSON.o
LINK=-lpthread -lz -lm

CFLAGS = -std=c99 -Wall -Wextra
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,7 @@ Bundled libraries may have different licenses:
* [seq-align](https://github.com/noporpoise/seq-align) (Public Domain)
* [seq_file](https://github.com/noporpoise/seq_file) (Public Domain)
* [sort_r](https://github.com/noporpoise/sort_r) (Public Domain)
* [carrays](https://github.com/noporpoise/carrays) (Public Domain)
* [string_buffer](https://github.com/noporpoise/string_buffer) (Public Domain)
* [xxHash](https://github.com/Cyan4973/xxHash.git) (BSD)

Expand Down
26 changes: 22 additions & 4 deletions libs/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ SHELL=/bin/bash
# make all <- compile all libraries (deps + analysis)
# make clean <- clean all libraries

CORE=xxHash htslib string_buffer bit_array seq_file seq-align msg-pool sort_r madcrowlib misc
CORE=xxHash htslib string_buffer bit_array seq_file seq-align msg-pool sort_r madcrowlib carrays misc
OTHER=bcftools samtools bwa readsim bioinf-perl maximal_substrs vcf-slim

ALLTGTS=$(CORE) $(OTHER)
Expand Down Expand Up @@ -62,14 +62,31 @@ sort_r: sort_r/Makefile
madcrowlib: madcrowlib/Makefile
cd madcrowlib && $(MAKE) all

maximal_substrs:
carrays: carrays/Makefile
cd carrays && $(MAKE) all

maximal_substrs: maximal_substrs/Makefile
cd maximal_substrs && $(MAKE)

misc: misc/Makefile
cd misc && $(MAKE)

vcf-slim: vcf-slim/Makefile htslib
cd vcf-slim && $(MAKE) all test
cd vcf-slim && $(MAKE) all

#
# Run tests
#
test:
cd htslib && $(MAKE) test
cd bit_array && $(MAKE) test
cd string_buffer && $(MAKE) test
cd seq-align && $(MAKE) test
cd msg-pool && $(MAKE) test
cd sort_r && $(MAKE) test
cd carrays && $(MAKE) test
cd vcf-slim && $(MAKE) test
cd madcrowlib && $(MAKE) test

#
# Clean
Expand All @@ -87,8 +104,9 @@ clean:
cd msg-pool && $(MAKE) clean
cd sort_r && $(MAKE) clean
cd madcrowlib && $(MAKE) clean
cd carrays && $(MAKE) clean
cd maximal_substrs && $(MAKE) clean
cd misc && $(MAKE) clean

.PHONY: all clean update core
.PHONY: all clean test update core
.PHONY: $(ALLTGTS)
2 changes: 1 addition & 1 deletion libs/bcftools
1 change: 1 addition & 0 deletions libs/carrays
Submodule carrays added at daab0e
2 changes: 1 addition & 1 deletion libs/htslib
Submodule htslib updated 2 files
+2 −2 htslib/vcfutils.h
+1 −1 synced_bcf_reader.c
2 changes: 1 addition & 1 deletion libs/madcrowlib
3 changes: 0 additions & 3 deletions libs/misc/jenkins.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,4 @@ static inline uint32_t jenkins_one_at_a_time_hash(const uint8_t *key, size_t len
return jenkins_finish(hash);
}

// 2 ops per byte h = strhash_fast_mix(h,x)
#define strhash_fast_mix(h,x) ((h) * 37 + (x))

#endif /* JENKINS_H_ */
2 changes: 1 addition & 1 deletion libs/samtools
Submodule samtools updated 1 files
+47 −35 samtools.1
60 changes: 25 additions & 35 deletions scripts/make-pipeline.pl
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ sub print_usage
Options:
-r,--ref <ref.fa> Reference sequence
-1,--single-colour Build as single sample (not multicoloured graph)
-s,--stampy <path/stampy.py> Use stampy instead of BWA to place variants
-S,--stampy-base <B> Stampy hashes <B>.stidx and <B>.sthash
Expand Down Expand Up @@ -68,11 +69,13 @@ sub print_usage

my $stampy;
my $stampy_base; # base to stampy hash files (.stidx, .sthash)
my $single_colour = 0;

# Parse command line args
while(@ARGV > 3) {
my $arg = shift;
if($arg =~ /^(-r|--ref)$/ && !defined($ref_path)) { $ref_path = shift; }
elsif($arg =~ /^(-1|--single-colour)$/ && !$single_colour) { $single_colour = 1; }
elsif($arg =~ /^(-s|--stampy)$/ && !defined($stampy)) { $stampy = shift; }
elsif($arg =~ /^(-S|--stampy-base)$/ && !defined($stampy_base)) { $stampy_base = shift; }
else { print_usage("Unknown argument: $arg"); }
Expand Down Expand Up @@ -147,7 +150,6 @@ sub print_usage
# NTHREADS=<nthreads> Number of threads to use
# USE_LINKS=<B> <B> is "yes" or "no"
# JOINT_CALLING=<B> <B> is "yes" or "no"
# SINGLE_SAMPLE=<1|0> running with single sample (cannot be merged later!)
# MATEPAIR=<MP> MP can be FF,FR,RF,RR (default: FR)
# MIN_FRAG_LEN=<L> minimum fragment length bp (=read+gap+read)
# MAX_FRAG_LEN=<L> maximum fragment length bp (=read+gap+read)
Expand Down Expand Up @@ -262,12 +264,6 @@ sub print_usage
FQ_CUTOFF=10
HP_CUTOFF=0
# Set this to non-blank (e.g. 1) to stop infer edges from running.
# Links generated with this set will not be usable in multicolour graphs.
# If you have only one sample this will improve the calls.
# Do not set it if you have more sample.
SINGLE_SAMPLE=0
SEQ_PREFS=--fq-cutoff $(FQ_CUTOFF) --cut-hp $(HP_CUTOFF) --matepair $(MATEPAIR)
BRK_REF_KMERS=10
Expand Down Expand Up @@ -343,30 +339,15 @@ sub print_usage
JOINT=1
endif
# INFER_EDGES is default on
ifeq ($(SINGLE_SAMPLE),no)
INFER_EDGES=1
endif
ifeq ($(SINGLE_SAMPLE),false)
INFER_EDGES=1
endif
ifeq ($(SINGLE_SAMPLE),0)
INFER_EDGES=1
endif
ifndef SINGLE_SAMPLE
INFER_EDGES=1
endif
# LINKS is defined iff we are using links
# JOINT is defined iff we are doing joint calling
# INFER_EDGES is defined iff we are inferring edges
ifndef INFER_EDGES
BREAKPOINTS_ARGS:=$(BREAKPOINTS_ARGS) --no-ref-edges
endif
';

if($single_colour) {
print "# Must not load edges with --single-colour\n";
print 'BREAKPOINTS_ARGS:=$(BREAKPOINTS_ARGS) --no-ref-edges'."\n";
}

for my $k (@kmers) {
print "# Files at k=$k\n";
print "RAW_GRAPHS_K$k=".join(' ', map {"$proj/k$k/graphs/$_->{'name'}.raw.ctx"} @samples)."\n";
Expand Down Expand Up @@ -598,9 +579,9 @@ sub merge_vcf_list
print "$proj/k$k/graphs/%.raw.covg.csv: $proj/k$k/graphs/%.clean.ctx\n";
print "$proj/k$k/graphs/%.clean.ctx: $proj/k$k/graphs/%.raw.ctx\n";
print "\t$ctx clean \$(CTX_ARGS) \$(KMER_CLEANING_ARGS) --covg-before $proj/k$k/graphs/\$*.raw.covg.csv -o \$@ \$< >& \$@.log\n";
print "ifdef INFER_EDGES\n";
print "\t$ctx inferedges \$(CTX_ARGS) \$@ >& $proj/k$k/graphs/\$*.inferedges.ctx.log\n";
print "endif\n";
if(!$single_colour) {
print "\t$ctx inferedges \$(CTX_ARGS) \$@ >& $proj/k$k/graphs/\$*.inferedges.ctx.log\n";
}
print "\n";

# Dump unitigs
Expand Down Expand Up @@ -690,25 +671,34 @@ sub merge_vcf_list
for my $k (@kmers) {
my $ctx = get_mccortex($k);
my $link_args = get_p_args($k);
# If $single_colour, we can't load more than one graph WITH LINKS
my $hapcol = defined($ref_path) ? "--haploid ".scalar(@samples) : '';
my $hapcol1by1_links = defined($ref_path) && !$single_colour ? "--haploid 1" : '';
my $hapcol1by1_plain = defined($ref_path) ? "--haploid 1" : '';
my $refgraph = $single_colour ? "" : '$(REF_GRAPH_K'.$k.')';

# joint bubble calling
print "# bubble calls k=$k joint+links\n";
print "$proj/k$k/bubbles/joint.bub.gz: \$(CLEAN_GRAPHS_K$k) \$(REF_GRAPH_K$k) \$(CLEAN_PE_LINKS_K$k) | \$(DIRS)\n";
print "\t$ctx bubbles \$(CTX_ARGS) \$(BUBBLES_ARGS) $hapcol -o \$@ $link_args \$(CLEAN_GRAPHS_K$k) \$(REF_GRAPH_K$k) >& \$@.log\n\n";
if(!$single_colour) {
print "$proj/k$k/bubbles/joint.bub.gz: \$(CLEAN_GRAPHS_K$k) \$(REF_GRAPH_K$k) \$(CLEAN_PE_LINKS_K$k) | \$(DIRS)\n";
print "\t$ctx bubbles \$(CTX_ARGS) \$(BUBBLES_ARGS) $hapcol -o \$@ $link_args \$(CLEAN_GRAPHS_K$k) \$(REF_GRAPH_K$k) >& \$@.log\n\n";
} else {
print "$proj/k$k/bubbles/joint.bub.gz:\n";
print "\t>&2 echo 'Cannot create joint bubble calls with links using --single-colour' && exit 1\n"
}

print "# bubble calls k=$k joint+nolinks\n";
print "$proj/k$k/bubbles_plain/joint.bub.gz: \$(CLEAN_GRAPHS_K$k) \$(REF_GRAPH_K$k) | \$(DIRS)\n";
print "\t$ctx bubbles \$(CTX_ARGS) \$(BUBBLES_ARGS) $hapcol -o \$@ \$(CLEAN_GRAPHS_K$k) \$(REF_GRAPH_K$k) >& \$@.log\n\n";

# 1by1 bubble calling
print "# bubble calls k=$k 1by1+links\n";
print "$proj/k$k/bubbles/%.bub.gz: $proj/k$k/graphs/%.clean.ctx \$(REF_GRAPH_K$k) $proj/k$k/links/%.pe.clean.ctp.gz\n";
print "\t$ctx bubbles \$(CTX_ARGS) \$(BUBBLES_ARGS) --haploid 1 -o \$@ -p $proj/k$k/links/\$*.pe.clean.ctp.gz \$< \$(REF_GRAPH_K$k) >& \$@.log\n\n";
print "$proj/k$k/bubbles/%.bub.gz: $proj/k$k/graphs/%.clean.ctx $refgraph $proj/k$k/links/%.pe.clean.ctp.gz\n";
print "\t$ctx bubbles \$(CTX_ARGS) \$(BUBBLES_ARGS) $hapcol1by1_links -o \$@ -p $proj/k$k/links/\$*.pe.clean.ctp.gz \$< $refgraph >& \$@.log\n\n";

print "# bubble calls k=$k 1by1+nolinks\n";
print "$proj/k$k/bubbles_plain/%.bub.gz: $proj/k$k/graphs/%.clean.ctx \$(REF_GRAPH_K$k)\n";
print "\t$ctx bubbles \$(CTX_ARGS) \$(BUBBLES_ARGS) --haploid 1 -o \$@ \$< \$(REF_GRAPH_K$k) >& \$@.log\n\n";
print "\t$ctx bubbles \$(CTX_ARGS) \$(BUBBLES_ARGS) $hapcol1by1_plain -o \$@ \$< \$(REF_GRAPH_K$k) >& \$@.log\n\n";
}

# Some things require a reference to be used
Expand Down
2 changes: 1 addition & 1 deletion src/basic/binary_seq.c
Original file line number Diff line number Diff line change
Expand Up @@ -261,5 +261,5 @@ int binary_seqs_cmp(const uint8_t *arr0, size_t len0,
if((ret = x - y) != 0) return ret;
o+=2; b += (o == 8); o &= 7;
}
return (long)len0 - (long)len1;
return cmp(len0, len1);
}
4 changes: 2 additions & 2 deletions src/basic/file_filter.c
Original file line number Diff line number Diff line change
Expand Up @@ -241,11 +241,11 @@ void file_filter_add(FileFilter *fltr, uint32_t from, uint32_t into)
// Sort array of Filter structs
//
static inline int _into_cmp(const void *a, const void *b) {
return (long)((const Filter*)a)->into - ((const Filter*)b)->into;
return cmp(((const Filter*)a)->into, ((const Filter*)b)->into);
}

static inline int _from_cmp(const void *a, const void *b) {
return (long)((const Filter*)a)->from - ((const Filter*)b)->from;
return cmp(((const Filter*)a)->from, ((const Filter*)b)->from);
}

#define filters_sort_by_into(fltr) qsort((fltr)->filter.b, file_filter_num(fltr), sizeof(fltr->filter.b[0]), _into_cmp)
Expand Down
5 changes: 3 additions & 2 deletions src/commands/ctx_links.c
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
#include "gpath_save.h"
#include "clean_graph.h"

#include "carrays/carrays.h" // gca_median_size()

const char links_usage[] =
"usage: "CMD" links [options] <in.ctp.gz>\n"
"\n"
Expand Down Expand Up @@ -78,9 +80,8 @@ static size_t print_cutoffs(size_t *sumcovgs, size_t *cutoffs, size_t len)
for(i = 1; i < len; i++) printf(",%zu", sumcovgs[i]);
printf("\ncutoffs=%zu", cutoffs[0]);
for(i = 1; i < len; i++) printf(",%zu", cutoffs[i]);
qsort(cutoffs, len, sizeof(cutoffs[0]), cmp_size);
printf("\n");
return MEDIAN(cutoffs, len);
return gca_median_size(cutoffs, len);
}

static void print_suggest_cutoff(size_t hist_distsize, size_t hist_covgsize,
Expand Down
Loading

0 comments on commit 2b13c5e

Please sign in to comment.