Skip to content

Commit

Permalink
Merge pull request #56 from OndrejSladky/reorganization
Browse files Browse the repository at this point in the history
Project cleanup and reorganization
  • Loading branch information
OndrejSladky authored Dec 28, 2023
2 parents c5b7f17 + 1220778 commit e54b0d9
Show file tree
Hide file tree
Showing 20 changed files with 249 additions and 231 deletions.
4 changes: 2 additions & 2 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
[submodule "googletest"]
path = googletest
[submodule "tests/googletest"]
path = tests/googletest
url = https://github.com/google/googletest
7 changes: 4 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,10 @@

CXX= g++
CXXFLAGS= -g -Wall -Wno-unused-function -std=c++17 -O2
GTEST= googletest/googletest
LDFLAGS= -lz
SRC= src
TESTS= tests
GTEST= $(TESTS)/googletest/googletest

all: kmercamel

Expand All @@ -27,8 +28,8 @@ kmercamel: $(SRC)/main.cpp $(SRC)/$(wildcard *.cpp *.h *.hpp) src/version.h
$(CXX) $(CXXFLAGS) $(SRC)/main.cpp -o $@ $(LDFLAGS)
cp kmercamel 🐫 || true

kmercameltest: $(SRC)/unittest.cpp gtest-all.o $(SRC)/$(wildcard *.cpp *.h *.hpp)
$(CXX) $(CXXFLAGS) -isystem $(GTEST)/include -I $(GTEST)/include $(SRC)/unittest.cpp gtest-all.o -pthread -o $@ $(LDFLAGS)
kmercameltest: $(TESTS)/unittest.cpp gtest-all.o $(SRC)/$(wildcard *.cpp *.h *.hpp) $(TESTS)/$(wildcard *.cpp *.h *.hpp)
$(CXX) $(CXXFLAGS) -isystem $(GTEST)/include -I $(GTEST)/include $(TESTS)/unittest.cpp gtest-all.o -pthread -o $@ $(LDFLAGS)

gtest-all.o: $(GTEST)/src/gtest-all.cc $(wildcard *.cpp *.h *.hpp)
$(CXX) $(CXXFLAGS) -isystem $(GTEST)/include -I $(GTEST)/include -I $(GTEST) -DGTEST_CREATE_SHARED_LIBRARY=1 -c -pthread $(GTEST)/src/gtest-all.cc -o $@
Expand Down
1 change: 0 additions & 1 deletion googletest
Submodule googletest deleted from 93f08b
105 changes: 105 additions & 0 deletions src/ac_automaton.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
#pragma once

#include <list>
#include <vector>

#include "models.h"

constexpr int INVALID_STATE = -1;
struct ACState {
// List of k-mers whose prefix this state is.
std::list<size_t> supporters;
// Indexes of the states longer by the corresponding nucleotide.
int forwardEdges[4] = {INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE};
// Where to go if searching failed.
int backwardEdge = 0;
// Length of the corresponding string.
int depth = 0;
// Index of the state in the automaton's *states* list.
int id = 0;
};

struct ACAutomaton {
std::vector<ACState> states;
// Indices of the states where i-th k-mer ends.
std::vector<int> endStateIndices;
// Reversed BFS order.
std::vector<int> reversedOrdering;

/// Append a new state and return its ID.
int AddState(const int depth) {
ACState newState;
newState.depth = depth;
newState.id = (int)states.size();
states.push_back(newState);
return (int)states.size() - 1;
}

/// Generate the trie from the given k-mers and set *endStateIndices*.
void ConstructTrie(const std::vector<KMer> &kMers) {
endStateIndices = std::vector<int>(kMers.size());
// Initialize the root.
AddState(0);
for (size_t i = 0; i < kMers.size(); ++i) states[0].supporters.push_back(i);
for (size_t i = 0; i < kMers.size(); ++i) {
int state = 0;
for (char c : kMers[i].value) {
int index = NucleotideToInt(c);
// If the next state does not yet exist, create it.
if (states[state].forwardEdges[index] == INVALID_STATE) {
int newState = AddState(states[state].depth + 1);
states[state].forwardEdges[index] = newState;
}
state = states[state].forwardEdges[index];
states[state].supporters.push_back(i);
}
endStateIndices[i] = state;
}

// Create a forward edge from the root to itself so that the AC Step always finds a valid forward edge.
for (int & forwardEdge : states[0].forwardEdges) {
if (forwardEdge == INVALID_STATE) {
forwardEdge = 0;
}
}
}

/// Do one step of the AC algorithm from the given state with a nucleotide with a given index.
int Step (int state, const int index) {
while (states[state].forwardEdges[index] == INVALID_STATE) {
state = states[state].backwardEdge;
}
return states[state].forwardEdges[index];
}

/// Construct the fail edges for the trie that already had been created.
/// Also construct the reversed BFS ordering.
void ConstructBackwardEdges () {
reversedOrdering = std::vector<int> (states.size(), 0);
std::queue<int> q;
for (int & forwardEdge : states[0].forwardEdges) {
if (forwardEdge != 0) {
q.push(forwardEdge);
}
}
size_t orderingIndex = reversedOrdering.size() - size_t(2);
while (!q.empty()) {
int state = q.front();
reversedOrdering[orderingIndex--] = state;
q.pop();
for (int i = 0; i < 4; ++i) {
int nextState = states[state].forwardEdges[i];
if (nextState != INVALID_STATE && nextState != 0) {
states[nextState].backwardEdge = Step(states[state].backwardEdge, i);
q.push(nextState);
}
}
}
}

/// Construct the Aho-Corasick automaton.
void Construct (const std::vector<KMer> &kMers) {
ConstructTrie(kMers);
ConstructBackwardEdges();
}
};
7 changes: 4 additions & 3 deletions src/greedy.h → src/global.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
#pragma once
#include "models.h"

#include <vector>
#include <iostream>
Expand All @@ -8,6 +7,7 @@
#include <algorithm>
#include <cstdint>

#include "models.h"
#include "kmers.h"
#include "khash.h"

Expand Down Expand Up @@ -185,12 +185,13 @@ void SuperstringFromPath(const overlapPath &hamiltonianPath, const std::vector<i
of << unmaskedNucleotides;
}

/// Get the approximated shortest superstring of the given k-mers using the GREEDY algorithm.
/// Get the approximated shortest superstring of the given k-mers using the global greedy algorithm.
///
/// This runs in O(n k), where n is the number of k-mers.
/// If complements are provided, treat k-mer and its complement as identical.
/// If this is the case, k-mers are expected not to contain both k-mer and its complement.
/// Warning: this will destroy kMers.
void Greedy(std::vector<int64_t> &kMers, std::ostream& of, int k, bool complements) {
void Global(std::vector<int64_t> &kMers, std::ostream& of, int k, bool complements) {
if (kMers.empty()) {
throw std::invalid_argument("input cannot be empty");
}
Expand Down
108 changes: 4 additions & 104 deletions src/greedy_ac.h → src/global_ac.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,109 +7,12 @@
#include <list>
#include <sstream>
#include <fstream>
#include <algorithm>

#include "models.h"
#include "kmers.h"
#include "greedy.h"
#include "ac_automaton.h"

constexpr int INVALID_STATE = -1;
struct ACState {
// List of k-mers whose prefix this state is.
std::list<size_t> supporters;
// Indexes of the states longer by the corresponding nucleotide.
int forwardEdges[4] = {INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE};
// Where to go if searching failed.
int backwardEdge = 0;
// Length of the corresponding string.
int depth = 0;
// Index of the state in the automaton's *states* list.
int id = 0;
};

struct ACAutomaton {
std::vector<ACState> states;
// Indices of the states where i-th k-mer ends.
std::vector<int> endStateIndices;
// Reversed BFS order.
std::vector<int> reversedOrdering;

/// Append a new state and return its ID.
int AddState(const int depth) {
ACState newState;
newState.depth = depth;
newState.id = (int)states.size();
states.push_back(newState);
return (int)states.size() - 1;
}

/// Generate the trie from the given k-mers and set *endStateIndices*.
void ConstructTrie(const std::vector<KMer> &kMers) {
endStateIndices = std::vector<int>(kMers.size());
// Initialize the root.
AddState(0);
for (size_t i = 0; i < kMers.size(); ++i) states[0].supporters.push_back(i);
for (size_t i = 0; i < kMers.size(); ++i) {
int state = 0;
for (char c : kMers[i].value) {
int index = NucleotideToInt(c);
// If the next state does not yet exist, create it.
if (states[state].forwardEdges[index] == INVALID_STATE) {
int newState = AddState(states[state].depth + 1);
states[state].forwardEdges[index] = newState;
}
state = states[state].forwardEdges[index];
states[state].supporters.push_back(i);
}
endStateIndices[i] = state;
}

// Create a forward edge from the root to itself so that the AC Step always finds a valid forward edge.
for (int & forwardEdge : states[0].forwardEdges) {
if (forwardEdge == INVALID_STATE) {
forwardEdge = 0;
}
}
}

/// Do one step of the AC algorithm from the given state with a nucleotide with a given index.
int Step (int state, const int index) {
while (states[state].forwardEdges[index] == INVALID_STATE) {
state = states[state].backwardEdge;
}
return states[state].forwardEdges[index];
}

/// Construct the fail edges for the trie that already had been created.
/// Also construct the reversed BFS ordering.
void ConstructBackwardEdges () {
reversedOrdering = std::vector<int> (states.size(), 0);
std::queue<int> q;
for (int & forwardEdge : states[0].forwardEdges) {
if (forwardEdge != 0) {
q.push(forwardEdge);
}
}
size_t orderingIndex = reversedOrdering.size() - size_t(2);
while (!q.empty()) {
int state = q.front();
reversedOrdering[orderingIndex--] = state;
q.pop();
for (int i = 0; i < 4; ++i) {
int nextState = states[state].forwardEdges[i];
if (nextState != INVALID_STATE && nextState != 0) {
states[nextState].backwardEdge = Step(states[state].backwardEdge, i);
q.push(nextState);
}
}
}
}

/// Construct the Aho-Corasick automaton.
void Construct (const std::vector<KMer> &kMers) {
ConstructTrie(kMers);
ConstructBackwardEdges();
}
};

/// Represents oriented edge in the overlap graph.
struct OverlapEdge {
Expand Down Expand Up @@ -180,12 +83,9 @@ std::string Suffix(const KMer &kMer, const int overlap) {
void SuperstringFromPath(const std::vector<OverlapEdge> &hamiltonianPath, const std::vector<KMer> &kMers, std::ostream& of, const int k) {
std::vector<OverlapEdge> edgeFrom (kMers.size(), OverlapEdge{size_t(-1),size_t(-1), -1});
std::vector<bool> isStart(kMers.size(), false);
// Compute the resulting length of the superstring.
int result_length = kMers.size() * k;
for (auto edge : hamiltonianPath) {
isStart[edge.firstIndex] = true;
edgeFrom[edge.firstIndex] = edge;
result_length -= edge.overlapLength;
}
for (auto edge : hamiltonianPath) {
isStart[edge.secondIndex] = false;
Expand Down Expand Up @@ -218,10 +118,10 @@ void SuperstringFromPath(const std::vector<OverlapEdge> &hamiltonianPath, const
of << unmaskedNucleotides;
}

/// Get the approximated shortest superstring of the given k-mers using the GREEDY algorithm with Aho-Corasick automaton.
/// Get the approximated shortest superstring of the given k-mers using the global greedy algorithm with Aho-Corasick automaton.
/// This runs in O(n k), where n is the number of k-mers.
/// If complements are provided, it is expected that kMers do not contain both k-mer and its reverse complement.
void GreedyAC(std::vector<KMer> kMers, std::ostream& of, bool complements) {
void GlobalAC(std::vector<KMer> kMers, std::ostream& of, bool complements) {
if (kMers.empty()) {
throw std::invalid_argument("input cannot be empty");
}
Expand Down
10 changes: 7 additions & 3 deletions src/generalized_simplitigs.h → src/local.h
Original file line number Diff line number Diff line change
Expand Up @@ -86,9 +86,13 @@ void NextGeneralizedSimplitig(kh_S64_t *kMers, int64_t begin, std::ostream& of,
of.flush();
}

/// Compute the generalized simplitigs greedily.
/// This runs in O(n d_max ^ k), where n is the number of k-mers, but for practical uses it is fast.
void GreedyGeneralizedSimplitigs(kh_S64_t *kMers, std::ostream& of, int k, int d_max, bool complements) {
/// Get the approximated shortest superstring of the given k-mers using the local greedy algorithm.
///
/// This runs in O(n d_max ^ k), where n is the number of k-mers, but for practical uses it is faster than AC version.
/// If complements are provided, treat k-mer and its complement as identical.
/// If this is the case, k-mers are expected not to contain both k-mer and its complement.
/// Warning: this will destroy kMers.
void Local(kh_S64_t *kMers, std::ostream& of, int k, int d_max, bool complements) {
size_t lastIndex = 0;
while(true) {
int64_t begin = nextKMer(kMers, lastIndex);
Expand Down
10 changes: 6 additions & 4 deletions src/generalized_simplitigs_ac.h → src/local_ac.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
#pragma once
#include "models.h"
#include "greedy_ac.h"

#include <string>
#include <vector>
Expand All @@ -9,6 +7,9 @@
#include <algorithm>
#include <fstream>

#include "models.h"
#include "ac_automaton.h"

/// Find the index of the first extending k-mer from the incidentKMers which is not forbidden.
/// Mark this k-mer forbidden and remove all the k-mers in incidentKMers which are forbidden.
/// The latter is done in order not to increase time complexity by repeatedly iterating over forbidden k-mers.
Expand All @@ -31,10 +32,11 @@ size_t ExtensionAC(std::vector<bool> &forbidden, std::list<size_t> &incidentKMer
return -1;
}

/// Compute the generalized simplitigs greedily using the Aho-Corasick automaton.
/// Get the approximated shortest superstring of the given k-mers using the local greedy algorithm with Aho-Corasick automaton.
///
/// This runs in O(n k), where n is the number of k-mers.
/// If complements are provided, it is expected that kMers do not contain both k-mer and its reverse complement.
void GreedyGeneralizedSimplitigsAC(std::vector<KMer> kMers, std::ostream& of, int k, int d_max, bool complements) {
void LocalAC(std::vector<KMer> kMers, std::ostream& of, int k, int d_max, bool complements) {
// Add complementary k-mers.
size_t n = kMers.size();
kMers.resize(n * (1 + complements));
Expand Down
16 changes: 8 additions & 8 deletions src/main.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#include "greedy_ac.h"
#include "greedy.h"
#include "generalized_simplitigs.h"
#include "generalized_simplitigs_ac.h"
#include "global_ac.h"
#include "global.h"
#include "local.h"
#include "local_ac.h"
#include "parser.h"
#include "streaming.h"
#include "output.h"
Expand Down Expand Up @@ -129,9 +129,9 @@ int main(int argc, char **argv) {
auto kMerVec = kMersToVec(kMers);
kh_destroy_S64(kMers);
PartialPreSort(kMerVec, k);
Greedy(kMerVec, *of, k, complements);
Global(kMerVec, *of, k, complements);
}
else GreedyGeneralizedSimplitigs(kMers, *of, k, d_max, complements);
else Local(kMers, *of, k, d_max, complements);
} else {
auto data = ReadFasta(path);
if (data.empty()) {
Expand All @@ -144,10 +144,10 @@ int main(int argc, char **argv) {
auto kMers = ConstructKMers(data, k, complements);
WriteName(k, *of);
if (algorithm == "globalAC") {
GreedyAC(kMers, *of, complements);
GlobalAC(kMers, *of, complements);
}
else if (algorithm == "localAC") {
GreedyGeneralizedSimplitigsAC(kMers, *of, k, d_max, complements);
LocalAC(kMers, *of, k, d_max, complements);
}
else {
std::cerr << "Algorithm '" << algorithm << "' not supported." << std::endl;
Expand Down
Loading

0 comments on commit e54b0d9

Please sign in to comment.