diff --git a/Makefile b/Makefile index dd4ccaebbd7..21727bcc12d 100644 --- a/Makefile +++ b/Makefile @@ -51,7 +51,6 @@ STATIC_FLAGS=-static -static-libstdc++ -static-libgcc # These are put into libvg. OBJ = -OBJ += cpp/vg.pb.o OBJ += $(OBJ_DIR)/gssw_aligner.o OBJ += $(OBJ_DIR)/vg.o OBJ += $(OBJ_DIR)/index.o @@ -170,7 +169,6 @@ DEPS += $(LIB_DIR)/libsnappy.a DEPS += $(LIB_DIR)/librocksdb.a DEPS += $(INC_DIR)/gcsa/gcsa.h DEPS += $(LIB_DIR)/libgcsa2.a -DEPS += $(OBJ_DIR)/progress_bar.o DEPS += $(OBJ_DIR)/Fasta.o DEPS += $(LIB_DIR)/libhts.a DEPS += $(LIB_DIR)/libxg.a @@ -188,7 +186,7 @@ DEPS += $(LIB_DIR)/libpinchesandcacti.a DEPS += $(INC_DIR)/globalDefs.hpp DEPS += $(LIB_DIR)/libraptor2.a DEPS += $(INC_DIR)/sha1.hpp -DEPS += $(OBJ_DIR)/sha1.o +DEPS += $(INC_DIR)/progress_bar.hpp ifneq ($(shell uname -s),Darwin) DEPS += $(LIB_DIR)/libtcmalloc_minimal.a @@ -261,8 +259,11 @@ $(INC_DIR)/gcsa/gcsa.h: $(LIB_DIR)/libgcsa2.a $(LIB_DIR)/libgcsa2.a: $(LIB_DIR)/libsdsl.a $(wildcard $(GCSA2_DIR)/*.cpp) $(wildcard $(GCSA2_DIR)/*.hpp) +. ./source_me.sh && cd $(GCSA2_DIR) && cat Makefile | grep -v VERBOSE_STATUS_INFO >Makefile.quiet && $(MAKE) -f Makefile.quiet libgcsa2.a && mv libgcsa2.a $(CWD)/$(LIB_DIR) && cp -r include/gcsa $(CWD)/$(INC_DIR)/ +$(INC_DIR)/progress_bar.hpp: $(PROGRESS_BAR_DIR)/progress_bar.hpp + +cp $(PROGRESS_BAR_DIR)/progress_bar.hpp $(CWD)/$(INC_DIR) + $(OBJ_DIR)/progress_bar.o: - +cd $(PROGRESS_BAR_DIR) && $(MAKE) && cp progress_bar.o $(CWD)/$(OBJ_DIR) && cp *.h* $(CWD)/$(INC_DIR) + +cd $(PROGRESS_BAR_DIR) && $(MAKE) && cp progress_bar.o $(CWD)/$(OBJ_DIR) $(OBJ_DIR)/Fasta.o: +cd $(FASTAHACK_DIR) && $(MAKE) && mv Fasta.o $(CWD)/$(OBJ_DIR) && cp Fasta.h $(CWD)/$(INC_DIR) @@ -312,9 +313,11 @@ $(INC_DIR)/globalDefs.hpp: $(LIB_DIR)/libsdsl.a $(LIB_DIR)/libraptor2.a: +cd $(RAPTOR_DIR)/build && cmake .. && $(MAKE) && cp src/libraptor2.a $(CWD)/$(LIB_DIR) && mkdir -p $(CWD)/$(INC_DIR)/raptor2 && cp src/*.h $(CWD)/$(INC_DIR)/raptor2 -$(INC_DIR)/sha1.hpp: $(OBJ_DIR)/sha1.o +$(INC_DIR)/sha1.hpp: $(SHA1_DIR)/sha1.hpp + +cp $(SHA1_DIR)/*.h* $(CWD)/$(INC_DIR)/ + $(OBJ_DIR)/sha1.o: $(SHA1_DIR)/sha1.cpp $(SHA1_DIR)/sha1.hpp - +$(CXX) $(CXXFLAGS) -c -o $@ $< $(LD_INCLUDE_FLAGS) && cp $(SHA1_DIR)/*.h* $(CWD)/$(INC_DIR)/ + +$(CXX) $(CXXFLAGS) -c -o $@ $< $(LD_INCLUDE_FLAGS) # Auto-versioning $(INC_DIR)/vg_git_version.hpp: .git diff --git a/scripts/setup-server b/scripts/setup-server index b3dae9c475c..a0d86ad928b 100644 --- a/scripts/setup-server +++ b/scripts/setup-server @@ -16,8 +16,8 @@ sudo cp jq-linux64 /usr/local/bin/jq sudo chmod +x /usr/local/bin/jq # get the indexes -mkdir pan && cd pan && dx cat ekg_vg_hacking:/hs37d5.vg.index_l16St16e4k16Z6000X3.tar | tar vx && cd ~ -mkdir ref && cd ref && dx cat ekg_vg_hacking:/hs37d5.ref.vg.index_l16St16e4k16Z6000X3.tar | tar vx && cd ~ +mkdir -p pan && cd pan && dx cat ekg_vg_hacking:/hs37d5.vg.index_l16St16e4k16Z6000X3.tar | tar vx && cd ~ +mkdir -p ref && cd ref && dx cat ekg_vg_hacking:/hs37d5.ref.vg.index_l16St16e4k16Z6000X3.tar | tar vx && cd ~ dx cat ekg_vg_hacking:/hs37d5.tar | tar vx # gcc fix diff --git a/src/main.cpp b/src/main.cpp index 989b5065170..b5c6cc53896 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -1356,9 +1356,9 @@ void help_msga(char** argv) { << " -q, --max-target-x N skip cluster subgraphs with length > N*read_length (default: 100; 0=unset)" << endl << " -I, --max-multimaps N if N>1, keep N best mappings of each band, resolve alignment by DP (default: 1)" << endl << " -V, --mem-reseed N reseed SMEMs longer than this length to find non-supermaximal MEMs inside them" << endl - << " set to -1 to estimate as 2x min mem length (default: -1/estimated)" << endl + << " set to -1 to estimate as 1.5x min mem length (default: -1/estimated)" << endl << " -7, --min-cluster-length N require this much sequence in a cluster to consider it" << endl - << " set to -1 to estimate as 2.5x min mem length (default: -1/estimated)" << endl + << " set to -1 to estimate as 1.5x min mem length (default: 0)" << endl << "index generation:" << endl << " -K, --idx-kmer-size N use kmers of this size for building the GCSA indexes (default: 16)" << endl << " -O, --idx-no-recomb index only embedded paths, not recombinations of them" << endl @@ -1431,7 +1431,7 @@ int main_msga(int argc, char** argv) { int sens_step = 5; float chance_match = 0.05; int mem_reseed_length = -1; - int min_cluster_length = -1; + int min_cluster_length = 0; int c; optind = 2; // force optind past command positional argument @@ -1810,10 +1810,10 @@ int main_msga(int argc, char** argv) { : mapper->random_match_length(chance_match)); mapper->mem_reseed_length = (mem_reseed_length > 0 ? mem_reseed_length : (mem_reseed_length == 0 ? 0 - : round(2 * mapper->min_mem_length))); + : round(1.5 * mapper->min_mem_length))); mapper->min_cluster_length = (min_cluster_length > 0 ? min_cluster_length : (min_cluster_length == 0 ? 0 - : round(2.5 * mapper->min_mem_length))); + : round(1.5 * mapper->min_mem_length))); mapper->hit_max = hit_max; mapper->greedy_accept = greedy_accept; mapper->max_target_factor = max_target_factor; @@ -4948,7 +4948,7 @@ void help_map(char** argv) { << " max, mean, stdev, orientation (1=same, 0=flip), direction (1=forward, 0=backward)" << endl << " -2, --fragment-sigma N calculate fragment size as mean(buf)+sd(buf)*N where buf is the buffer of perfect pairs we use (default: 10e)" << endl << " -p, --pair-window N maximum distance between properly paired reads in node ID space" << endl - << " -u, --extra-multimaps N examine N extra mappings looking for a consistent read pairing (default: 100)" << endl + << " -u, --extra-multimaps N examine N extra mappings looking for a consistent read pairing (default: 64)" << endl << " -U, --always-rescue rescue each imperfectly-mapped read in a pair off the other" << endl << " -O, --top-pairs-only only produce paired alignments if both sides of the pair are top-scoring individually" << endl << "generic mapping parameters:" << endl @@ -4957,7 +4957,7 @@ void help_map(char** argv) { << " -n, --context-depth N follow this many edges out from each thread for alignment (default: 7)" << endl << " -M, --max-multimaps N produce up to N alignments for each read (default: 1)" << endl << " -3, --softclip-trig N trigger graph extension and realignment when either end has softclips (default: 0)" << endl - << " -m, --hit-max N ignore kmers or MEMs who have >N hits in our index (default: 10000)" << endl + << " -m, --hit-max N ignore kmers or MEMs who have >N hits in our index (default: 512)" << endl << " -c, --clusters N use at most the largest N ordered clusters of the kmer graph for alignment (default: all)" << endl << " -C, --cluster-min N require at least this many kmer hits in a cluster to attempt alignment (default: 1)" << endl << " -H, --max-target-x N skip cluster subgraphs with length > N*read_length (default: 100; unset: 0)" << endl @@ -4971,12 +4971,13 @@ void help_map(char** argv) { << "maximal exact match (MEM) mapper:" << endl << " This algorithm is used when --kmer-size is not specified and a GCSA index is given" << endl << " -L, --min-mem-length N ignore MEMs shorter than this length (default: estimated minimum where [-F] of hits are by chance)" << endl - << " -F, --chance-match N set the minimum MEM length so ~ this fraction of min-length hits will by by chance (default: 0.05)" << endl + << " -8, --chance-match N set the minimum MEM length so ~ this fraction of min-length hits will by by chance (default: 0.05)" << endl << " -Y, --max-mem-length N ignore MEMs longer than this length by stopping backward search (default: 0/unset)" << endl << " -V, --mem-reseed N reseed SMEMs longer than this length to find non-supermaximal MEMs inside them" << endl - << " set to -1 to estimate as 2x min mem length (default: -1/estimated)" << endl + << " set to -1 to estimate as 1.5x min mem length (default: -1/estimated)" << endl << " -7, --min-cluster-length N require this much sequence in a cluster to consider it" << endl - << " set to -1 to estimate as 2.5x min mem length (default: -1/estimated)" << endl + << " set to -1 to estimate as 1.5x min mem length (default: 0)" << endl + << " -F, --drop-chain N drop clusters of MEMs shorter than FLOAT fraction of the longest overlapping cluster (default: 0.5)" << endl << " -6, --fast-reseed use fast SMEM reseeding" << endl << " -a, --id-clustering use id clustering to drive the mapper, rather than MEM-threading" << endl << " -5, --unsmoothly don't smooth alignments after patching" << endl @@ -5010,7 +5011,7 @@ int main_map(int argc, char** argv) { string read_file; string hts_file; bool keep_secondary = false; - int hit_max = 100; + int hit_max = 512; int max_multimaps = 1; int thread_count = 1; int thread_ex = 10; @@ -5035,7 +5036,7 @@ int main_map(int argc, char** argv) { int softclip_threshold = 0; int max_mem_length = 0; int min_mem_length = -1; - int min_cluster_length = -1; + int min_cluster_length = 0; float random_match_chance = 0.05; int mem_reseed_length = -1; bool mem_threading = true; @@ -5047,7 +5048,7 @@ int main_map(int argc, char** argv) { int gap_extend = 1; int full_length_bonus = 5; bool qual_adjust_alignments = false; - int extra_multimaps = 32; + int extra_multimaps = 64; int max_mapping_quality = 60; int method_code = 1; string gam_input; @@ -5062,6 +5063,7 @@ int main_map(int argc, char** argv) { float chance_match = 0.05; bool smooth_alignments = true; bool use_fast_reseed = false; + float drop_chain = 0.5; int c; optind = 2; // force optind past command positional argument @@ -5127,13 +5129,14 @@ int main_map(int argc, char** argv) { {"fragment-sigma", required_argument, 0, '2'}, {"full-l-bonus", required_argument, 0, 'T'}, {"no-cluster-mq", no_argument, 0, '4'}, - {"chance-match", required_argument, 0, 'F'}, + {"chance-match", required_argument, 0, '8'}, {"unsmoothly", no_argument, 0, '5'}, + {"drop-chain", required_argument, 0, 'F'}, {0, 0, 0, 0} }; int option_index = 0; - c = getopt_long (argc, argv, "s:I:j:hd:x:g:c:r:m:k:M:t:DX:F:S:Jb:KR:N:if:p:B:h:G:C:A:E:Q:n:P:UOl:e:T:L:Y:H:Z:q:z:o:y:1u:v:wW:a2:3:V:4567:", + c = getopt_long (argc, argv, "s:I:j:hd:x:g:c:r:m:k:M:t:DX:F:S:Jb:KR:N:if:p:B:h:G:C:A:E:Q:n:P:UOl:e:T:L:Y:H:Z:q:z:o:y:1u:v:wW:a2:3:V:4567:8:", long_options, &option_index); @@ -5259,10 +5262,14 @@ int main_map(int argc, char** argv) { debug = true; break; - case 'F': + case '8': chance_match = atof(optarg); break; + case 'F': + drop_chain = atof(optarg); + break; + case 'G': gam_input = optarg; break; @@ -5571,14 +5578,15 @@ int main_map(int argc, char** argv) { m->kmer_min = kmer_min; m->min_identity = min_score; m->softclip_threshold = softclip_threshold; + m->drop_chain = drop_chain; m->min_mem_length = (min_mem_length > 0 ? min_mem_length : m->random_match_length(chance_match)); m->mem_reseed_length = (mem_reseed_length > 0 ? mem_reseed_length : (mem_reseed_length == 0 ? 0 - : round(2 * m->min_mem_length))); + : round(1.5 * m->min_mem_length))); m->min_cluster_length = (min_cluster_length > 0 ? min_cluster_length : (min_cluster_length == 0 ? 0 - : round(2 * m->min_mem_length))); + : round(1.5 * m->min_mem_length))); if (debug && i == 0) { cerr << "[vg map] : min_mem_length = " << m->min_mem_length << ", mem_reseed_length = " << m->mem_reseed_length diff --git a/src/mapper.cpp b/src/mapper.cpp index 337837560e9..200f8c75db2 100644 --- a/src/mapper.cpp +++ b/src/mapper.cpp @@ -60,6 +60,7 @@ Mapper::Mapper(Index* idex, , use_cluster_mq(false) , smooth_alignments(true) , simultaneous_pair_alignment(true) + , drop_chain(0.5) { init_aligner(default_match, default_mismatch, default_gap_open, default_gap_extension); init_node_cache(); @@ -1770,11 +1771,13 @@ pair, vector> Mapper::align_paired_multi_simul( } } #endif - + auto to_drop = clusters_to_drop(clusters); vector > alns; //pair, vector > alns; int multimaps = 0; for (auto& cluster : clusters) { + // skip if we've filtered the cluster + if (to_drop.count(&cluster)) continue; // stop if we have enough multimaps if (multimaps > total_multimaps) { break; } // skip if we've got enough multimaps to get MQ and we're under the min cluster length @@ -2071,6 +2074,53 @@ Mapper::average_node_length(void) { return (double) xindex->seq_length / (double) xindex->node_count; } +bool Mapper::mems_overlap(const MaximalExactMatch& mem1, + const MaximalExactMatch& mem2) { + // we overlap if we are not completely separated + return !(mem1.end <= mem2.begin + || mem2.end <= mem1.begin); +} + +bool Mapper::clusters_overlap(const vector& cluster1, + const vector& cluster2) { + for (auto& mem1 : cluster1) { + for (auto& mem2 : cluster2) { + if (mems_overlap(mem1, mem2)) { + return true; + } + } + } + return false; +} + +set* > Mapper::clusters_to_drop(const vector >& clusters) { + set* > to_drop; + for (int i = 0; i < clusters.size(); ++i) { + // establish overlaps with longer clusters for all clusters + auto& this_cluster = clusters[i]; + int t = cluster_coverage(this_cluster); + int b = -1; + int l = t; + for (int j = i; j >= 0; --j) { + if (j == i) continue; + // are we overlapping? + auto& other_cluster = clusters[j]; + if (clusters_overlap(this_cluster, other_cluster)) { + int c = cluster_coverage(other_cluster); + if (c > l) { + l = c; + b = j; + } + } + } + if (b >= 0 && (float) t / (float) l < drop_chain) { + to_drop.insert(&this_cluster); + } + } + return to_drop; +} + + // returns the SMEM clusters which are consistent with the distribution of the MEMs in the read // provided some tolerance +/- // uses the exact matches to generate as much of each alignment as possible @@ -2214,13 +2264,16 @@ Mapper::mems_pos_clusters_to_alignments(const Alignment& aln, vector alns; int multimaps = 0; for (auto& cluster : clusters) { + // filtered out due to overlap with longer chain + if (to_drop.count(&cluster)) continue; if (++multimaps > total_multimaps) { break; } // skip this if we don't have sufficient cluster coverage and we have at least two alignments // which we can use to estimate mapping quality diff --git a/src/mapper.hpp b/src/mapper.hpp index 65ed854ea99..b570e69e0fc 100644 --- a/src/mapper.hpp +++ b/src/mapper.hpp @@ -278,6 +278,14 @@ class Mapper { vector mems_pos_clusters_to_alignments(const Alignment& aln, vector& mems, int additional_multimaps, double& cluster_mq); // helper for computing the number of bases in the query covered by a cluster int cluster_coverage(const vector& cluster); + // helper to tell if mems are ovelapping + bool mems_overlap(const MaximalExactMatch& mem1, + const MaximalExactMatch& mem2); + // helper to tell if clusters have any overlap + bool clusters_overlap(const vector& cluster1, + const vector& cluster2); + // use mapper parameters to determine which clusters we should drop + set* > clusters_to_drop(const vector >& clusters); // takes the input alignment (with seq, etc) so we have reference to the base sequence // for reconstruction the alignments from the SMEMs @@ -518,6 +526,7 @@ class Mapper { float perfect_pair_identity_threshold; int8_t full_length_alignment_bonus; bool simultaneous_pair_alignment; + float drop_chain; // drop chains shorter than this fraction of the longest overlapping chain };