Skip to content

Commit

Permalink
Merge pull request #718 from vgteam/drop-chains
Browse files Browse the repository at this point in the history
Drop chains
  • Loading branch information
ekg authored Mar 13, 2017
2 parents 6d63861 + 52de333 commit 37f68b6
Show file tree
Hide file tree
Showing 5 changed files with 101 additions and 28 deletions.
15 changes: 9 additions & 6 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,6 @@ STATIC_FLAGS=-static -static-libstdc++ -static-libgcc

# These are put into libvg.
OBJ =
OBJ += cpp/vg.pb.o
OBJ += $(OBJ_DIR)/gssw_aligner.o
OBJ += $(OBJ_DIR)/vg.o
OBJ += $(OBJ_DIR)/index.o
Expand Down Expand Up @@ -170,7 +169,6 @@ DEPS += $(LIB_DIR)/libsnappy.a
DEPS += $(LIB_DIR)/librocksdb.a
DEPS += $(INC_DIR)/gcsa/gcsa.h
DEPS += $(LIB_DIR)/libgcsa2.a
DEPS += $(OBJ_DIR)/progress_bar.o
DEPS += $(OBJ_DIR)/Fasta.o
DEPS += $(LIB_DIR)/libhts.a
DEPS += $(LIB_DIR)/libxg.a
Expand All @@ -188,7 +186,7 @@ DEPS += $(LIB_DIR)/libpinchesandcacti.a
DEPS += $(INC_DIR)/globalDefs.hpp
DEPS += $(LIB_DIR)/libraptor2.a
DEPS += $(INC_DIR)/sha1.hpp
DEPS += $(OBJ_DIR)/sha1.o
DEPS += $(INC_DIR)/progress_bar.hpp

ifneq ($(shell uname -s),Darwin)
DEPS += $(LIB_DIR)/libtcmalloc_minimal.a
Expand Down Expand Up @@ -261,8 +259,11 @@ $(INC_DIR)/gcsa/gcsa.h: $(LIB_DIR)/libgcsa2.a
$(LIB_DIR)/libgcsa2.a: $(LIB_DIR)/libsdsl.a $(wildcard $(GCSA2_DIR)/*.cpp) $(wildcard $(GCSA2_DIR)/*.hpp)
+. ./source_me.sh && cd $(GCSA2_DIR) && cat Makefile | grep -v VERBOSE_STATUS_INFO >Makefile.quiet && $(MAKE) -f Makefile.quiet libgcsa2.a && mv libgcsa2.a $(CWD)/$(LIB_DIR) && cp -r include/gcsa $(CWD)/$(INC_DIR)/

$(INC_DIR)/progress_bar.hpp: $(PROGRESS_BAR_DIR)/progress_bar.hpp
+cp $(PROGRESS_BAR_DIR)/progress_bar.hpp $(CWD)/$(INC_DIR)

$(OBJ_DIR)/progress_bar.o:
+cd $(PROGRESS_BAR_DIR) && $(MAKE) && cp progress_bar.o $(CWD)/$(OBJ_DIR) && cp *.h* $(CWD)/$(INC_DIR)
+cd $(PROGRESS_BAR_DIR) && $(MAKE) && cp progress_bar.o $(CWD)/$(OBJ_DIR)

$(OBJ_DIR)/Fasta.o:
+cd $(FASTAHACK_DIR) && $(MAKE) && mv Fasta.o $(CWD)/$(OBJ_DIR) && cp Fasta.h $(CWD)/$(INC_DIR)
Expand Down Expand Up @@ -312,9 +313,11 @@ $(INC_DIR)/globalDefs.hpp: $(LIB_DIR)/libsdsl.a
$(LIB_DIR)/libraptor2.a:
+cd $(RAPTOR_DIR)/build && cmake .. && $(MAKE) && cp src/libraptor2.a $(CWD)/$(LIB_DIR) && mkdir -p $(CWD)/$(INC_DIR)/raptor2 && cp src/*.h $(CWD)/$(INC_DIR)/raptor2

$(INC_DIR)/sha1.hpp: $(OBJ_DIR)/sha1.o
$(INC_DIR)/sha1.hpp: $(SHA1_DIR)/sha1.hpp
+cp $(SHA1_DIR)/*.h* $(CWD)/$(INC_DIR)/

$(OBJ_DIR)/sha1.o: $(SHA1_DIR)/sha1.cpp $(SHA1_DIR)/sha1.hpp
+$(CXX) $(CXXFLAGS) -c -o $@ $< $(LD_INCLUDE_FLAGS) && cp $(SHA1_DIR)/*.h* $(CWD)/$(INC_DIR)/
+$(CXX) $(CXXFLAGS) -c -o $@ $< $(LD_INCLUDE_FLAGS)

# Auto-versioning
$(INC_DIR)/vg_git_version.hpp: .git
Expand Down
4 changes: 2 additions & 2 deletions scripts/setup-server
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@ sudo cp jq-linux64 /usr/local/bin/jq
sudo chmod +x /usr/local/bin/jq

# get the indexes
mkdir pan && cd pan && dx cat ekg_vg_hacking:/hs37d5.vg.index_l16St16e4k16Z6000X3.tar | tar vx && cd ~
mkdir ref && cd ref && dx cat ekg_vg_hacking:/hs37d5.ref.vg.index_l16St16e4k16Z6000X3.tar | tar vx && cd ~
mkdir -p pan && cd pan && dx cat ekg_vg_hacking:/hs37d5.vg.index_l16St16e4k16Z6000X3.tar | tar vx && cd ~
mkdir -p ref && cd ref && dx cat ekg_vg_hacking:/hs37d5.ref.vg.index_l16St16e4k16Z6000X3.tar | tar vx && cd ~
dx cat ekg_vg_hacking:/hs37d5.tar | tar vx

# gcc fix
Expand Down
44 changes: 26 additions & 18 deletions src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1356,9 +1356,9 @@ void help_msga(char** argv) {
<< " -q, --max-target-x N skip cluster subgraphs with length > N*read_length (default: 100; 0=unset)" << endl
<< " -I, --max-multimaps N if N>1, keep N best mappings of each band, resolve alignment by DP (default: 1)" << endl
<< " -V, --mem-reseed N reseed SMEMs longer than this length to find non-supermaximal MEMs inside them" << endl
<< " set to -1 to estimate as 2x min mem length (default: -1/estimated)" << endl
<< " set to -1 to estimate as 1.5x min mem length (default: -1/estimated)" << endl
<< " -7, --min-cluster-length N require this much sequence in a cluster to consider it" << endl
<< " set to -1 to estimate as 2.5x min mem length (default: -1/estimated)" << endl
<< " set to -1 to estimate as 1.5x min mem length (default: 0)" << endl
<< "index generation:" << endl
<< " -K, --idx-kmer-size N use kmers of this size for building the GCSA indexes (default: 16)" << endl
<< " -O, --idx-no-recomb index only embedded paths, not recombinations of them" << endl
Expand Down Expand Up @@ -1431,7 +1431,7 @@ int main_msga(int argc, char** argv) {
int sens_step = 5;
float chance_match = 0.05;
int mem_reseed_length = -1;
int min_cluster_length = -1;
int min_cluster_length = 0;

int c;
optind = 2; // force optind past command positional argument
Expand Down Expand Up @@ -1810,10 +1810,10 @@ int main_msga(int argc, char** argv) {
: mapper->random_match_length(chance_match));
mapper->mem_reseed_length = (mem_reseed_length > 0 ? mem_reseed_length
: (mem_reseed_length == 0 ? 0
: round(2 * mapper->min_mem_length)));
: round(1.5 * mapper->min_mem_length)));
mapper->min_cluster_length = (min_cluster_length > 0 ? min_cluster_length
: (min_cluster_length == 0 ? 0
: round(2.5 * mapper->min_mem_length)));
: round(1.5 * mapper->min_mem_length)));
mapper->hit_max = hit_max;
mapper->greedy_accept = greedy_accept;
mapper->max_target_factor = max_target_factor;
Expand Down Expand Up @@ -4948,7 +4948,7 @@ void help_map(char** argv) {
<< " max, mean, stdev, orientation (1=same, 0=flip), direction (1=forward, 0=backward)" << endl
<< " -2, --fragment-sigma N calculate fragment size as mean(buf)+sd(buf)*N where buf is the buffer of perfect pairs we use (default: 10e)" << endl
<< " -p, --pair-window N maximum distance between properly paired reads in node ID space" << endl
<< " -u, --extra-multimaps N examine N extra mappings looking for a consistent read pairing (default: 100)" << endl
<< " -u, --extra-multimaps N examine N extra mappings looking for a consistent read pairing (default: 64)" << endl
<< " -U, --always-rescue rescue each imperfectly-mapped read in a pair off the other" << endl
<< " -O, --top-pairs-only only produce paired alignments if both sides of the pair are top-scoring individually" << endl
<< "generic mapping parameters:" << endl
Expand All @@ -4957,7 +4957,7 @@ void help_map(char** argv) {
<< " -n, --context-depth N follow this many edges out from each thread for alignment (default: 7)" << endl
<< " -M, --max-multimaps N produce up to N alignments for each read (default: 1)" << endl
<< " -3, --softclip-trig N trigger graph extension and realignment when either end has softclips (default: 0)" << endl
<< " -m, --hit-max N ignore kmers or MEMs who have >N hits in our index (default: 10000)" << endl
<< " -m, --hit-max N ignore kmers or MEMs who have >N hits in our index (default: 512)" << endl
<< " -c, --clusters N use at most the largest N ordered clusters of the kmer graph for alignment (default: all)" << endl
<< " -C, --cluster-min N require at least this many kmer hits in a cluster to attempt alignment (default: 1)" << endl
<< " -H, --max-target-x N skip cluster subgraphs with length > N*read_length (default: 100; unset: 0)" << endl
Expand All @@ -4971,12 +4971,13 @@ void help_map(char** argv) {
<< "maximal exact match (MEM) mapper:" << endl
<< " This algorithm is used when --kmer-size is not specified and a GCSA index is given" << endl
<< " -L, --min-mem-length N ignore MEMs shorter than this length (default: estimated minimum where [-F] of hits are by chance)" << endl
<< " -F, --chance-match N set the minimum MEM length so ~ this fraction of min-length hits will by by chance (default: 0.05)" << endl
<< " -8, --chance-match N set the minimum MEM length so ~ this fraction of min-length hits will by by chance (default: 0.05)" << endl
<< " -Y, --max-mem-length N ignore MEMs longer than this length by stopping backward search (default: 0/unset)" << endl
<< " -V, --mem-reseed N reseed SMEMs longer than this length to find non-supermaximal MEMs inside them" << endl
<< " set to -1 to estimate as 2x min mem length (default: -1/estimated)" << endl
<< " set to -1 to estimate as 1.5x min mem length (default: -1/estimated)" << endl
<< " -7, --min-cluster-length N require this much sequence in a cluster to consider it" << endl
<< " set to -1 to estimate as 2.5x min mem length (default: -1/estimated)" << endl
<< " set to -1 to estimate as 1.5x min mem length (default: 0)" << endl
<< " -F, --drop-chain N drop clusters of MEMs shorter than FLOAT fraction of the longest overlapping cluster (default: 0.5)" << endl
<< " -6, --fast-reseed use fast SMEM reseeding" << endl
<< " -a, --id-clustering use id clustering to drive the mapper, rather than MEM-threading" << endl
<< " -5, --unsmoothly don't smooth alignments after patching" << endl
Expand Down Expand Up @@ -5010,7 +5011,7 @@ int main_map(int argc, char** argv) {
string read_file;
string hts_file;
bool keep_secondary = false;
int hit_max = 100;
int hit_max = 512;
int max_multimaps = 1;
int thread_count = 1;
int thread_ex = 10;
Expand All @@ -5035,7 +5036,7 @@ int main_map(int argc, char** argv) {
int softclip_threshold = 0;
int max_mem_length = 0;
int min_mem_length = -1;
int min_cluster_length = -1;
int min_cluster_length = 0;
float random_match_chance = 0.05;
int mem_reseed_length = -1;
bool mem_threading = true;
Expand All @@ -5047,7 +5048,7 @@ int main_map(int argc, char** argv) {
int gap_extend = 1;
int full_length_bonus = 5;
bool qual_adjust_alignments = false;
int extra_multimaps = 32;
int extra_multimaps = 64;
int max_mapping_quality = 60;
int method_code = 1;
string gam_input;
Expand All @@ -5062,6 +5063,7 @@ int main_map(int argc, char** argv) {
float chance_match = 0.05;
bool smooth_alignments = true;
bool use_fast_reseed = false;
float drop_chain = 0.5;

int c;
optind = 2; // force optind past command positional argument
Expand Down Expand Up @@ -5127,13 +5129,14 @@ int main_map(int argc, char** argv) {
{"fragment-sigma", required_argument, 0, '2'},
{"full-l-bonus", required_argument, 0, 'T'},
{"no-cluster-mq", no_argument, 0, '4'},
{"chance-match", required_argument, 0, 'F'},
{"chance-match", required_argument, 0, '8'},
{"unsmoothly", no_argument, 0, '5'},
{"drop-chain", required_argument, 0, 'F'},
{0, 0, 0, 0}
};

int option_index = 0;
c = getopt_long (argc, argv, "s:I:j:hd:x:g:c:r:m:k:M:t:DX:F:S:Jb:KR:N:if:p:B:h:G:C:A:E:Q:n:P:UOl:e:T:L:Y:H:Z:q:z:o:y:1u:v:wW:a2:3:V:4567:",
c = getopt_long (argc, argv, "s:I:j:hd:x:g:c:r:m:k:M:t:DX:F:S:Jb:KR:N:if:p:B:h:G:C:A:E:Q:n:P:UOl:e:T:L:Y:H:Z:q:z:o:y:1u:v:wW:a2:3:V:4567:8:",
long_options, &option_index);


Expand Down Expand Up @@ -5259,10 +5262,14 @@ int main_map(int argc, char** argv) {
debug = true;
break;

case 'F':
case '8':
chance_match = atof(optarg);
break;

case 'F':
drop_chain = atof(optarg);
break;

case 'G':
gam_input = optarg;
break;
Expand Down Expand Up @@ -5571,14 +5578,15 @@ int main_map(int argc, char** argv) {
m->kmer_min = kmer_min;
m->min_identity = min_score;
m->softclip_threshold = softclip_threshold;
m->drop_chain = drop_chain;
m->min_mem_length = (min_mem_length > 0 ? min_mem_length
: m->random_match_length(chance_match));
m->mem_reseed_length = (mem_reseed_length > 0 ? mem_reseed_length
: (mem_reseed_length == 0 ? 0
: round(2 * m->min_mem_length)));
: round(1.5 * m->min_mem_length)));
m->min_cluster_length = (min_cluster_length > 0 ? min_cluster_length
: (min_cluster_length == 0 ? 0
: round(2 * m->min_mem_length)));
: round(1.5 * m->min_mem_length)));
if (debug && i == 0) {
cerr << "[vg map] : min_mem_length = " << m->min_mem_length
<< ", mem_reseed_length = " << m->mem_reseed_length
Expand Down
57 changes: 55 additions & 2 deletions src/mapper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ Mapper::Mapper(Index* idex,
, use_cluster_mq(false)
, smooth_alignments(true)
, simultaneous_pair_alignment(true)
, drop_chain(0.5)
{
init_aligner(default_match, default_mismatch, default_gap_open, default_gap_extension);
init_node_cache();
Expand Down Expand Up @@ -1770,11 +1771,13 @@ pair<vector<Alignment>, vector<Alignment>> Mapper::align_paired_multi_simul(
}
}
#endif

auto to_drop = clusters_to_drop(clusters);
vector<pair<Alignment, Alignment> > alns;
//pair<vector<Alignment>, vector<Alignment> > alns;
int multimaps = 0;
for (auto& cluster : clusters) {
// skip if we've filtered the cluster
if (to_drop.count(&cluster)) continue;
// stop if we have enough multimaps
if (multimaps > total_multimaps) { break; }
// skip if we've got enough multimaps to get MQ and we're under the min cluster length
Expand Down Expand Up @@ -2071,6 +2074,53 @@ Mapper::average_node_length(void) {
return (double) xindex->seq_length / (double) xindex->node_count;
}

bool Mapper::mems_overlap(const MaximalExactMatch& mem1,
const MaximalExactMatch& mem2) {
// we overlap if we are not completely separated
return !(mem1.end <= mem2.begin
|| mem2.end <= mem1.begin);
}

bool Mapper::clusters_overlap(const vector<MaximalExactMatch>& cluster1,
const vector<MaximalExactMatch>& cluster2) {
for (auto& mem1 : cluster1) {
for (auto& mem2 : cluster2) {
if (mems_overlap(mem1, mem2)) {
return true;
}
}
}
return false;
}

set<const vector<MaximalExactMatch>* > Mapper::clusters_to_drop(const vector<vector<MaximalExactMatch> >& clusters) {
set<const vector<MaximalExactMatch>* > to_drop;
for (int i = 0; i < clusters.size(); ++i) {
// establish overlaps with longer clusters for all clusters
auto& this_cluster = clusters[i];
int t = cluster_coverage(this_cluster);
int b = -1;
int l = t;
for (int j = i; j >= 0; --j) {
if (j == i) continue;
// are we overlapping?
auto& other_cluster = clusters[j];
if (clusters_overlap(this_cluster, other_cluster)) {
int c = cluster_coverage(other_cluster);
if (c > l) {
l = c;
b = j;
}
}
}
if (b >= 0 && (float) t / (float) l < drop_chain) {
to_drop.insert(&this_cluster);
}
}
return to_drop;
}


// returns the SMEM clusters which are consistent with the distribution of the MEMs in the read
// provided some tolerance +/-
// uses the exact matches to generate as much of each alignment as possible
Expand Down Expand Up @@ -2214,13 +2264,16 @@ Mapper::mems_pos_clusters_to_alignments(const Alignment& aln, vector<MaximalExac
}
}
#endif

auto to_drop = clusters_to_drop(clusters);

// for up to our required number of multimaps
// make the perfect-match alignment for the SMEM cluster
// then fix it up with DP on the little bits between the alignments
vector<Alignment> alns;
int multimaps = 0;
for (auto& cluster : clusters) {
// filtered out due to overlap with longer chain
if (to_drop.count(&cluster)) continue;
if (++multimaps > total_multimaps) { break; }
// skip this if we don't have sufficient cluster coverage and we have at least two alignments
// which we can use to estimate mapping quality
Expand Down
9 changes: 9 additions & 0 deletions src/mapper.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,14 @@ class Mapper {
vector<Alignment> mems_pos_clusters_to_alignments(const Alignment& aln, vector<MaximalExactMatch>& mems, int additional_multimaps, double& cluster_mq);
// helper for computing the number of bases in the query covered by a cluster
int cluster_coverage(const vector<MaximalExactMatch>& cluster);
// helper to tell if mems are ovelapping
bool mems_overlap(const MaximalExactMatch& mem1,
const MaximalExactMatch& mem2);
// helper to tell if clusters have any overlap
bool clusters_overlap(const vector<MaximalExactMatch>& cluster1,
const vector<MaximalExactMatch>& cluster2);
// use mapper parameters to determine which clusters we should drop
set<const vector<MaximalExactMatch>* > clusters_to_drop(const vector<vector<MaximalExactMatch> >& clusters);

// takes the input alignment (with seq, etc) so we have reference to the base sequence
// for reconstruction the alignments from the SMEMs
Expand Down Expand Up @@ -518,6 +526,7 @@ class Mapper {
float perfect_pair_identity_threshold;
int8_t full_length_alignment_bonus;
bool simultaneous_pair_alignment;
float drop_chain; // drop chains shorter than this fraction of the longest overlapping chain

};

Expand Down

0 comments on commit 37f68b6

Please sign in to comment.