Skip to content

Commit

Permalink
Merge branch 'master' of github.com:vgteam/vg
Browse files Browse the repository at this point in the history
  • Loading branch information
ekg committed Mar 10, 2017
2 parents 13db3dc + 1014c7c commit fdd117d
Show file tree
Hide file tree
Showing 14 changed files with 1,624 additions and 554 deletions.
13 changes: 9 additions & 4 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ OBJ += $(OBJ_DIR)/variant_adder.o
OBJ += $(OBJ_DIR)/name_mapper.o
OBJ += $(OBJ_DIR)/graph_synchronizer.o
OBJ += $(OBJ_DIR)/vg_algorithms.o
OBJ += $(OBJ_DIR)/nested_traversal_finder.o

# These aren't put into libvg. But they do go into the main vg binary to power its self-test.
UNITTEST_OBJ =
Expand Down Expand Up @@ -373,13 +374,13 @@ $(OBJ_DIR)/entropy.o: $(SRC_DIR)/entropy.cpp $(SRC_DIR)/entropy.hpp $(DEPS)

$(OBJ_DIR)/pileup.o: $(SRC_DIR)/pileup.cpp $(SRC_DIR)/pileup.hpp $(INC_DIR)/stream.hpp $(SRC_DIR)/vg.hpp $(SRC_DIR)/progressive.hpp $(SRC_DIR)/json2pb.h $(DEPS)

$(OBJ_DIR)/caller.o: $(SRC_DIR)/caller.cpp $(SRC_DIR)/caller.hpp $(SRC_DIR)/vg.hpp $(SRC_DIR)/progressive.hpp $(INC_DIR)/stream.hpp $(SRC_DIR)/json2pb.h $(SRC_DIR)/pileup.hpp $(DEPS)
$(OBJ_DIR)/caller.o: $(SRC_DIR)/caller.cpp $(SRC_DIR)/caller.hpp $(SRC_DIR)/vg.hpp $(SRC_DIR)/progressive.hpp $(INC_DIR)/stream.hpp $(SRC_DIR)/json2pb.h $(SRC_DIR)/pileup.hpp $(SRC_DIR)/snarls.hpp $(DEPS)

$(OBJ_DIR)/call2vcf.o: $(SRC_DIR)/call2vcf.cpp $(SRC_DIR)/caller.hpp $(SRC_DIR)/nodeside.hpp $(SRC_DIR)/nodetraversal.hpp $(SRC_DIR)/snarls.hpp $(SRC_DIR)/path_index.hpp $(SRC_DIR)/distributions.hpp $(DEPS)
$(OBJ_DIR)/call2vcf.o: $(SRC_DIR)/call2vcf.cpp $(SRC_DIR)/caller.hpp $(SRC_DIR)/nodeside.hpp $(SRC_DIR)/nodetraversal.hpp $(SRC_DIR)/snarls.hpp $(SRC_DIR)/path_index.hpp $(SRC_DIR)/distributions.hpp $(SRC_DIR)/genotypekit.hpp $(SRC_DIR)/snarls.hpp $(DEPS)

$(OBJ_DIR)/genotyper.o: $(SRC_DIR)/genotyper.cpp $(SRC_DIR)/genotyper.hpp $(SRC_DIR)/vg.hpp $(SRC_DIR)/progressive.hpp $(SRC_DIR)/stream.hpp $(SRC_DIR)/path_index.hpp $(SRC_DIR)/json2pb.h $(DEPS) $(INC_DIR)/sparsehash/sparse_hash_map $(SRC_DIR)/bubbles.hpp $(SRC_DIR)/distributions.hpp $(SRC_DIR)/utility.hpp

$(OBJ_DIR)/genotypekit.o: $(SRC_DIR)/genotypekit.cpp $(SRC_DIR)/genotypekit.hpp $(DEPS) $(SRC_DIR)/vg.hpp $(SRC_DIR)/progressive.hpp $(SRC_DIR)/utility.hpp $(SRC_DIR)/distributions.hpp $(DEPS)
$(OBJ_DIR)/genotypekit.o: $(SRC_DIR)/genotypekit.cpp $(SRC_DIR)/genotypekit.hpp $(DEPS) $(SRC_DIR)/vg.hpp $(SRC_DIR)/progressive.hpp $(SRC_DIR)/utility.hpp $(SRC_DIR)/distributions.hpp $(SRC_DIR)/snarls.hpp $(DEPS)

$(OBJ_DIR)/position.o: $(SRC_DIR)/position.cpp $(SRC_DIR)/position.hpp $(CPP_DIR)/vg.pb.h $(SRC_DIR)/vg.hpp $(SRC_DIR)/progressive.hpp $(SRC_DIR)/json2pb.h $(DEPS)

Expand Down Expand Up @@ -442,7 +443,11 @@ $(OBJ_DIR)/feature_set.o: $(SRC_DIR)/feature_set.cpp $(SRC_DIR)/feature_set.hpp

$(OBJ_DIR)/simplifier.o: $(SRC_DIR)/simplifier.cpp $(SRC_DIR)/simplifier.hpp $(SRC_DIR)/progressive.hpp $(SRC_DIR)/utility.hpp $(SRC_DIR)/feature_set.hpp $(SRC_DIR)/path.hpp $(SRC_DIR)/path_index.hpp $(SRC_DIR)/distributions.hpp $(DEPS)

$(OBJ_DIR)/vg_algorithms.o: $(ALGORITHMS_SRC_DIR)/vg_algorithms.cpp $(ALGORITHMS_SRC_DIR)/vg_algorithms.hpp
$(OBJ_DIR)/nested_traversal_finder.o: $(SRC_DIR)/nested_traversal_finder.cpp $(SRC_DIR)/nested_traversal_finder.hpp $(SRC_DIR)/genotypekit.hpp $(SRC_DIR)/snarls.hpp $(DEPS)

### Algorithms ###

$(OBJ_DIR)/vg_algorithms.o: $(ALGORITHMS_SRC_DIR)/vg_algorithms.cpp $(ALGORITHMS_SRC_DIR)/vg_algorithms.hpp $(DEPS)
. ./source_me.sh && $(CXX) $(CXXFLAGS) -c -o $@ $< $(LD_INCLUDE_FLAGS) $(LD_LIB_FLAGS) $(ROCKSDB_LDFLAGS)

###################################
Expand Down
1,002 changes: 559 additions & 443 deletions src/call2vcf.cpp

Large diffs are not rendered by default.

8 changes: 7 additions & 1 deletion src/caller.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -371,6 +371,12 @@ class Call2Vcf {
bool is_reference(const SnarlTraversal& trav, AugmentedGraph& augmented, const PathIndex& primary_path);

// Option variables

// Should we output in VCF (true) or Protobuf Locus (false) format?
bool convert_to_vcf = true;
// How big should our output buffer be?
size_t locus_buffer_size = 1000;

// What's the name of the reference path in the graph?
string refPathName = "";
// What name should we give the contig in the VCF file?
Expand Down Expand Up @@ -412,7 +418,7 @@ class Call2Vcf {
// Should we drop variants that would overlap old ones? TODO: we really need
// a proper system for accounting for usage of graph material.
bool suppress_overlaps = false;
// Should we use average support instead minimum support for our calculations?
// Should we use average support instead of minimum support for our calculations?
bool useAverageSupport = false;
// What's the max ref length of a site that we genotype as a whole instead
// of splitting?
Expand Down
139 changes: 97 additions & 42 deletions src/genotypekit.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -620,21 +620,67 @@ vector<SnarlTraversal> TrivialTraversalFinder::find_traversals(const Snarl& site
}

RepresentativeTraversalFinder::RepresentativeTraversalFinder(AugmentedGraph& augmented,
SnarlManager& snarl_manager, PathIndex& index, size_t max_depth,
SnarlManager& snarl_manager, PathIndex& primary_path_index, size_t max_depth,
size_t max_bubble_paths) : augmented(augmented), snarl_manager(snarl_manager),
index(index), max_depth(max_depth), max_bubble_paths(max_bubble_paths) {
primary_path_index(primary_path_index), max_depth(max_depth), max_bubble_paths(max_bubble_paths) {

// Nothing to do!

}

Path RepresentativeTraversalFinder::find_backbone(const Snarl& site) {

// TODO: this cheats and uses certain things that happen to be true about
// the TrivialTraversalFinder in order to work.

// Find a traversal, ignoring the fact that child sites ought to own their
// nodes.
TrivialTraversalFinder finder(augmented.graph);
auto traversals = finder.find_traversals(site);
assert(!traversals.empty());
auto& traversal = traversals.front();

// Convert it into a path that includes the boundry nodes
Path to_return;
*to_return.add_mapping() = to_mapping(traversal.snarl().start(), augmented.graph);
for (size_t i = 0; i < traversal.visits_size(); i++) {
*to_return.add_mapping() = to_mapping(traversal.visits(i), augmented.graph);
}
*to_return.add_mapping() = to_mapping(traversal.snarl().end(), augmented.graph);

return to_return;

}

vector<SnarlTraversal> RepresentativeTraversalFinder::find_traversals(const Snarl& site) {

// TODO: we can only do ultrabubbles right now. Other snarls may not have
// traversals through from end to end.
assert(site.type() == ULTRABUBBLE);

// Get its nodes and edges (including all child sites)
// We may need to make a new index for a backbone for this site, if it's not
// on the primary path.
unique_ptr<PathIndex> backbone_index;

if (!primary_path_index.by_id.count(site.start().node_id()) || !primary_path_index.by_id.count(site.end().node_id())) {
// This site is not strung along the primary path, so we will need to
// generate a backbone traversal of it to structure our search for
// representative traversals (because they always want to come back to
// the backbone as soon as possible).

// TODO: we don't handle children correctly (we just glom them into
// ourselves).
Path backbone = find_backbone(site);

// Index the backbone (but don't bother with the sequence)
backbone_index = unique_ptr<PathIndex>(new PathIndex(backbone));
}

// Determnine what path will be the path we use to scaffold the traversals:
// the primary path index by default, or the backbone index if we needed one.
PathIndex& index = (backbone_index.get() != nullptr ? *backbone_index : primary_path_index);

// Get the site's nodes and edges (including all child sites)
pair<unordered_set<Node*>, unordered_set<Edge*>> contents = snarl_manager.deep_contents(&site, augmented.graph, true);

// Copy its node set
Expand All @@ -643,8 +689,7 @@ vector<SnarlTraversal> RepresentativeTraversalFinder::find_traversals(const Snar
// Trace the ref path through the site
vector<NodeTraversal> ref_path_for_site;

// First figure where the site starts and ends in the primary path
// TODO: support off-path sites
// First figure where the site starts and ends in the selected path
size_t site_start = index.by_id.at(site.start().node_id()).first;
size_t site_end = index.by_id.at(site.end().node_id()).first;

Expand All @@ -668,7 +713,7 @@ vector<SnarlTraversal> RepresentativeTraversalFinder::find_traversals(const Snar
// Find the reference node starting here or later.
auto found = index.by_start.lower_bound(ref_node_start);
if(found == index.by_start.end()) {
throw runtime_error("No ref node found when tracing through site!");
throw runtime_error("No backbone node found when tracing through site!");
}
if((*found).first > index.by_id.at(site.end().node_id()).first) {
// The next reference node we can find is out of the space
Expand Down Expand Up @@ -699,7 +744,7 @@ vector<SnarlTraversal> RepresentativeTraversalFinder::find_traversals(const Snar
// Make sure none of the nodes in the site that we didn't visit
// while tracing along the ref path are on the ref path.
if(index.by_id.count(node->id())) {
cerr << "Node " << node->id() << " is on ref path at "
cerr << "Node " << node->id() << " is on backbone path at "
<< index.by_id.at(node->id()).first << " but not traced in site "
<< to_node_traversal(site.start(), augmented.graph) << " to "
<< to_node_traversal(site.end(), augmented.graph) << endl;
Expand All @@ -724,10 +769,10 @@ vector<SnarlTraversal> RepresentativeTraversalFinder::find_traversals(const Snar
assert(contents.first.count(traversal.node));
#ifdef debug
if(index.by_id.count(traversal.node->id())) {
cerr << "Path member " << traversal << " lives on ref at "
cerr << "Path member " << traversal << " lives on backbone at "
<< index.by_id.at(traversal.node->id()).first << endl;
} else {
cerr << "Path member " << traversal << " does not live on ref" << endl;
cerr << "Path member " << traversal << " does not live on backbone" << endl;
}
#endif
}
Expand Down Expand Up @@ -761,7 +806,7 @@ vector<SnarlTraversal> RepresentativeTraversalFinder::find_traversals(const Snar
if(ref_path_index == ref_path_for_site.size()) {
// Still couldn't find it!
stringstream err;
err << "Couldn't find " << path.back() << " in ref path of site "
err << "Couldn't find " << path.back() << " in backbone path of site "
<< to_node_traversal(site.start(), augmented.graph)
<< " to " << to_node_traversal(site.end(), augmented.graph) << endl;
throw runtime_error(err.str());
Expand All @@ -788,12 +833,12 @@ vector<SnarlTraversal> RepresentativeTraversalFinder::find_traversals(const Snar
}

if(index.by_id.count(node->id())) {
// Don't try to pathfind to the reference for reference nodes.
// Don't try to pathfind to the backbone for backbone nodes.
continue;
}

// TODO: allow bubbles that don't backend into the primary path
pair<Support, vector<NodeTraversal>> sup_path = find_bubble(node, nullptr);
// Find bubbles that backend into the backbone path
pair<Support, vector<NodeTraversal>> sup_path = find_bubble(node, nullptr, index);

vector<NodeTraversal>& path = sup_path.second;

Expand All @@ -818,14 +863,14 @@ vector<SnarlTraversal> RepresentativeTraversalFinder::find_traversals(const Snar
// Go through all the edges

if(!index.by_id.count(edge->from()) || !index.by_id.count(edge->to())) {
// Edge doesn't touch reference at both ends. Don't use it
// Edge doesn't touch backbone at both ends. Don't use it
// because for some reason it makes performance worse
// overall.
continue;
}

// Find a path based around this edge
pair<Support, vector<NodeTraversal>> sup_path = find_bubble(nullptr, edge);
pair<Support, vector<NodeTraversal>> sup_path = find_bubble(nullptr, edge, index);
vector<NodeTraversal>& path = sup_path.second;

#ifdef debug
Expand Down Expand Up @@ -907,7 +952,7 @@ vector<SnarlTraversal> RepresentativeTraversalFinder::find_traversals(const Snar
return unique_traversals;
}

pair<Support, vector<NodeTraversal>> RepresentativeTraversalFinder::find_bubble(Node* node, Edge* edge) {
pair<Support, vector<NodeTraversal>> RepresentativeTraversalFinder::find_bubble(Node* node, Edge* edge, PathIndex& index) {

// What are we going to find our left and right path halves based on?
NodeTraversal left_traversal;
Expand All @@ -934,8 +979,8 @@ pair<Support, vector<NodeTraversal>> RepresentativeTraversalFinder::find_bubble(
// Find paths on both sides, with nodes on the primary path at the outsides
// and this edge in the middle. Returns path lengths and paths in pairs in a
// set.
auto leftPaths = bfs_left(left_traversal);
auto rightPaths = bfs_right(right_traversal);
auto leftPaths = bfs_left(left_traversal, index);
auto rightPaths = bfs_right(right_traversal, index);

// Find a combination of two paths which gets us to the reference in a
// consistent orientation (meaning that when you look at the ending nodes'
Expand Down Expand Up @@ -1121,7 +1166,8 @@ Support RepresentativeTraversalFinder::min_support_in_path(const list<NodeTraver
return minSupport;
}

set<pair<size_t, list<NodeTraversal>>> RepresentativeTraversalFinder::bfs_left(NodeTraversal node, bool stopIfVisited) {
set<pair<size_t, list<NodeTraversal>>> RepresentativeTraversalFinder::bfs_left(NodeTraversal node,
PathIndex& index, bool stopIfVisited) {

// Holds partial paths we want to return, with their lengths in bp.
set<pair<size_t, list<NodeTraversal>>> toReturn;
Expand All @@ -1141,8 +1187,11 @@ set<pair<size_t, list<NodeTraversal>>> RepresentativeTraversalFinder::bfs_left(N
// Mark this traversal as already queued
alreadyQueued.insert(node);

#ifdef debug
// How many ticks have we spent searching?
size_t searchTicks = 0;
#endif

// Track how many options we have because size may be O(n).
size_t stillToExtend = toExtend.size();

Expand Down Expand Up @@ -1222,10 +1271,11 @@ set<pair<size_t, list<NodeTraversal>>> RepresentativeTraversalFinder::bfs_left(N
return toReturn;
}

set<pair<size_t, list<NodeTraversal>>> RepresentativeTraversalFinder::bfs_right(NodeTraversal node, bool stopIfVisited) {
set<pair<size_t, list<NodeTraversal>>> RepresentativeTraversalFinder::bfs_right(NodeTraversal node, PathIndex& index,
bool stopIfVisited) {

// Look left from the backward version of the node.
auto toConvert = bfs_left(node.reverse(), stopIfVisited);
auto toConvert = bfs_left(node.reverse(), index, stopIfVisited);

// Since we can't modify set records in place, we need to do a copy
set<pair<size_t, list<NodeTraversal>>> toReturn;
Expand Down Expand Up @@ -1253,7 +1303,7 @@ size_t RepresentativeTraversalFinder::bp_length(const list<NodeTraversal>& path)
return length;
}

int total(const Support& support) {
double total(const Support& support) {
return support.forward() + support.reverse();
}

Expand Down Expand Up @@ -1289,30 +1339,35 @@ Support& operator+=(Support& one, const Support& other) {
return one;
}

Support operator*(const Support& support, const size_t& scale) {
Support prod;
prod.set_forward(support.forward() * scale);
prod.set_reverse(support.reverse() * scale);
prod.set_left(support.left() * scale);
prod.set_right(support.right() * scale);

// log-scaled quality can just be multiplied
prod.set_quality(support.quality() * scale);

return prod;
bool operator< (const Support& a, const Support& b) {
return total(a) < total(b);
}

bool operator> (const Support& a, const Support& b) {
return total(a) > total(b);
}

Support operator*(const size_t& scale, const Support& support) {
Support prod;
prod.set_forward(support.forward() * scale);
prod.set_reverse(support.reverse() * scale);
prod.set_left(support.left() * scale);
prod.set_right(support.right() * scale);
ostream& operator<<(ostream& stream, const Support& support) {
return stream << support.forward() << "," << support.reverse();
}

string to_vcf_genotype(const Genotype& gt) {
// Emit parts into this stream
stringstream stream;

// log-scaled quality can just be multiplied
prod.set_quality(support.quality() * scale);
for (size_t i = 0; i < gt.allele_size(); i++) {
// For each allele called as present in the genotype

// Put it in the string
stream << gt.allele(i);

if (i + 1 != gt.allele_size()) {
// Write a separator after all but the last one
stream << (gt.is_phased() ? '|' : '/');
}
}

return prod;
return stream.str();
}


Expand Down
Loading

0 comments on commit fdd117d

Please sign in to comment.