Merge pull request #73 from OndrejSladky/include-large

Include large KmerCamel
OndrejSladky · Aug 21, 2024 · 2c96272 · 2c96272
2 parents b067518 + 72905d8
commit 2c96272
Show file tree

Hide file tree

Showing 19 changed files with 314 additions and 243 deletions.
diff --git a/Makefile b/Makefile
@@ -10,7 +10,7 @@ GTEST=       $(TESTS)/googletest/googletest
 DATA=        data
 
 
-all: kmercamel kmercamel-large
+all: kmercamel
 
 test: cpptest converttest verify
 
@@ -34,10 +34,6 @@ kmercamel: $(SRC)/main.cpp $(SRC)/$(wildcard *.cpp *.h *.hpp) src/version.h
 	$(CXX) $(CXXFLAGS) $(SRC)/main.cpp -o $@ $(LDFLAGS)
 	cp kmercamel  🐫 || true
 
-kmercamel-large: $(SRC)/main.cpp $(SRC)/$(wildcard *.cpp *.h *.hpp) src/version.h
-	./create-version.sh
-	$(CXX) $(CXXFLAGS) $(SRC)/main.cpp -o $@ $(LDFLAGS) $(LARGEFLAGS)
-
 kmercameltest: $(TESTS)/unittest.cpp gtest-all.o $(SRC)/$(wildcard *.cpp *.h *.hpp) $(TESTS)/$(wildcard *.cpp *.h *.hpp)
 	$(CXX) $(CXXFLAGS) -isystem $(GTEST)/include -I $(GTEST)/include $(TESTS)/unittest.cpp gtest-all.o -pthread -o $@ $(LDFLAGS)
 
@@ -55,6 +51,7 @@ clean:
 	rm -f kmercamel
 	rm -f 🐫 || true
 	rm -f kmercameltest
+	rm -f kmercameltest-large
 	rm -r -f ./bin
 	rm -f gtest-all.o
 	rm -f src/version.h
diff --git a/README.md b/README.md
@@ -20,8 +20,7 @@ They come in two different implementations (their results may differ due to the
 Note that at this point only the implementations with hash table are optimized and that the Aho-Corasick automaton
 based versions of the algorithms are only experimental.
 
-The hashing based implementations of the default KmerCamel🐫 (`./kmercamel`) support $k$-mer with $k$ at most 31,
-whereas the larger KmerCamel🐫 (`./kmercamel-large`) supports $k$-mers with $k$ at most 63 (at the cost of slight slowdown).
+The hashing based implementations of KmerCamel🐫 support $k$-mer with $k$ at most 63,
 
 All algorithms can be used to either work in the unidirectional model or in the bidirectional model
 (i.e. treat $k$-mer and its reverse complement as the same; in this case either of them appears in the result).
@@ -62,8 +61,8 @@ on macOS.
 
 The program has the following arguments:
 
-- `-p path_to_fasta` - the path to fasta file. This is a required argument.
-- `-k value_of_k` - the size of one k-mer. This is a required argument.
+- `-p path_to_fasta` - the path to fasta file (can be `gzip`ed). This is a required argument.
+- `-k value_of_k` - the size of one k-mer (up to 63). This is a required argument.
 - `-a algorithm` - the algorithm which should be run. Either `global` or `globalAC` for Global Greedy, `local` or `localAC` for Local Greedy.
 The versions with AC use Aho-Corasick automaton. Default `global`.
 - `-o output_path` - the path to output file. If not specified, output is printed to stdout.
@@ -79,27 +78,18 @@ The output contains the resulting superstring - capital letters indicate that at
 For example:
 
 ```
-./kmercamel -p ./spneumoniae.fa -a local -k 12 -d 7 -c
+./kmercamel -p ./spneumoniae.fa -a local -k 31 -d 5 -c
 ```
 
-runs the Local Greedy in the bidirectional model on the streptococcus fasta file with `k=12` and `d=7`.
+runs the Local Greedy in the bidirectional model on the streptococcus fasta file with `k=31` and `d=5`.
 
 Alternatively, if your operating system supports it, you can run `./🐫` instead of `./kmercamel`.
 
-Currently, KmerCamel🐫 does not support gziped files as an input.
-A possible workaround is to use `gzcat` and process substitution.
-
-```
-./kmercamel -k 13 -p <(gzcat fasta_file.fa.gz)
-```
-
-Note: on some systems you might need to use the name `zcat` instead of `gzcat`.
-
 ### Mask optimization
 
 For mask optimization, run the subcommand `optimize` with the following arguments:
 
-- `p path_to_fasta` - the path to fasta file. This is a required argument.
+- `p path_to_fasta` - the path to fasta file (can be `gzip`ed). This is a required argument.
 - `k k_value` - the size of one k-mer. This is a required argument.
 - `a algorithm` - the algorithm for mask optimization. Either `ones` for maximizing the number of 1s, `runs` for minimizing the number of runs of 1s, `runsapprox` for approximately minimizing the number of runs of 1s, or `zeros` for maximizing the number of 0s. Default `ones`.
 - `o output_path` - the path to output file. If not specified, output is printed to stdout.
@@ -110,26 +100,11 @@ For mask optimization, run the subcommand `optimize` with the following argument
 For example:
 
 ```
-./kmercamel optimize -p ./global-spneumoniae.fa -k 12 -a runs -c
+./kmercamel optimize -p ./global-spneumoniae.fa -k 31 -a runs -c
 ```
 
 minimizes the number of runs of 1s in the mask of the superstring computed by Global Greedy in the bidirectional model on the streptococcus fasta file with `k=12`.
 
-Note: currently mask optimization cannot read gziped files, nor can the proccess substititution be used.
-
-### Large $k$-mers
-
-The default version of KmerCamel🐫 does not support $k > 31$. For those values, use the large KmerCamel🐫,
-which supports $k < 64$.
-
-For example:
-
-```
-./kmercamel-large -p ./spneumoniae.fa -a global -k 63 -c
-```
-
-Note: for smaller $k$ it is recommended to use default KmerCamel🐫 as it is faster.
-
 ### Turn off memory optimizations for Global
 
 In order to reduce the memory footprint of hash-table based Global Greedy,

diff --git a/src/ac/kmers_ac.h b/src/ac/kmers_ac.h
@@ -1,6 +1,7 @@
 #pragma once
 #include <string>
 #include <vector>
+#include "../kmers.h"
 
 struct KMer {
     std::string value;
@@ -42,3 +43,13 @@ int NucleotideToInt (char c) {
     }
 }
 
+/// Convert the given k-mer to its representation as a number.
+kmer64_t KMerToNumber(const KMer &kMer) {
+    kmer64_t ret = 0;
+    for (char c : kMer.value) {
+        ret <<= 2;
+        ret |= NucleotideToInt(c);
+    }
+    return ret;
+}
+
diff --git a/src/ac/streaming.h b/src/ac/streaming.h
@@ -5,7 +5,7 @@
 #include <cassert>
 #include <algorithm>
 
-#include "../kmers.h"
+#include "kmers_ac.h"
 
 void Push(std::ostream &of, const std::string &current, int k, int32_t used) {
     if (current.size() == static_cast<size_t>(k) && used != 0) {
@@ -40,7 +40,7 @@ void Streaming(std::string &path, std::ostream &of, int k, bool complements) {
                         if (kMer.size() == size_t(k + 1)) kMer=kMer.substr(1);
                         assert(kMer.size() <= size_t(k));;
                         if (kMer.length() == size_t (k) ) {
-                            int64_t encoded = KMerToNumber(KMer{kMer});
+                            uint64_t encoded = KMerToNumber(KMer{kMer});
                             auto rc = ReverseComplement(encoded, k);
                             auto rc2 = NumberToKMer(rc, k);
                             bool contained = (kMers.count(encoded) > 0) || ((complements) && kMers.count(ReverseComplement(encoded, k)) > 0);

diff --git a/src/global.h b/src/global.h
@@ -28,6 +28,7 @@ constexpr int SORT_FIRST_BITS_DEFAULT = 8;
 typedef std::pair<std::vector<size_t>, std::vector<unsigned char>> overlapPath;
 
 /// Rearrange the k-mers so that k-mers next to each other in sorted order appear close so that they are in the same bucket.
+template <typename kmer_t>
 void PartialPreSort(std::vector<kmer_t> &vals, int k) {
     int SORT_FIRST_BITS = std::min(2 * k, SORT_FIRST_BITS_DEFAULT);
     kmer_t DIFFERENT_PREFIXES_COUNT = kmer_t(1) << SORT_FIRST_BITS;
@@ -56,7 +57,8 @@ void PartialPreSort(std::vector<kmer_t> &vals, int k) {
 /// If complements are provided, treat k-mer and its complement as identical.
 /// If this is the case, k-mers are expected to contain only one k-mer from a complement pair.
 /// Moreover, if so, the resulting Hamiltonian path contains two superstrings which are reverse complements of one another.
-overlapPath OverlapHamiltonianPath (std::vector<kmer_t> &kMers, int k, bool complements) {
+template <typename kmer_t, typename kh_wrapper_t>
+overlapPath OverlapHamiltonianPath (kh_wrapper_t wrapper, std::vector<kmer_t> &kMers, int k, bool complements) {
     size_t n = kMers.size();
     size_t kMersCount = n * (1 + complements);
     size_t batchSize = kMersCount / MEMORY_REDUCTION_FACTOR + 1;
@@ -72,13 +74,13 @@ overlapPath OverlapHamiltonianPath (std::vector<kmer_t> &kMers, int k, bool comp
     for (size_t i = 0; i < n; ++i) {
         first[i] = last[i] = i;
     }
-    khash_t(P64)  *prefixes = kh_init(P64);
-    kh_resize(P64, prefixes, (kMersCount / MEMORY_REDUCTION_FACTOR + 1 ) * 100 / 77 );
+    auto *prefixes = wrapper.kh_init_map();
+    wrapper.kh_resize_map(prefixes, (kMersCount / MEMORY_REDUCTION_FACTOR + 1 ) * 100 / 77 );
     for (int d = k - 1; d >= 0; --d) {
         // In order to reduce memory requirements, the prefixes are not processed at once, but in batches.
         // As a cost, this slows down the algorithm.
         for (int part = 0; part < MEMORY_REDUCTION_FACTOR; part++) {
-            kh_clear(P64, prefixes);
+            wrapper.kh_clear_map(prefixes);
             for (size_t i = 0; i < batchSize; ++i) {
                 next[i] = (size_t)-1;
             }
@@ -88,20 +90,19 @@ overlapPath OverlapHamiltonianPath (std::vector<kmer_t> &kMers, int k, bool comp
                 if (!prefixForbidden[i]) {
                     next[i - from] = -1;
                     kmer_t prefix = BitPrefix(access(kMers,i), k, d);
-                    auto prefix_key = kh_get(P64, prefixes, prefix);
+                    auto prefix_key = wrapper.kh_get_from_map(prefixes, prefix);
                     if (prefix_key != kh_end(prefixes)) {
                         next[i - from] = kh_val(prefixes, prefix_key);
                     } else {
                         int ret;
-                        prefix_key = kh_put(P64, prefixes, prefix, &ret);
-
+                        prefix_key = wrapper.kh_put_to_map(prefixes, prefix, &ret);
                     }
                     kh_value(prefixes, prefix_key) = i;
                 }
             for (size_t i = 0; i < kMersCount; ++i)
                 if (!suffixForbidden[i]) {
                     kmer_t suffix = BitSuffix(access(kMers, i), d);
-                    auto suffix_key = kh_get(P64, prefixes, suffix);
+                    auto suffix_key = wrapper.kh_get_from_map(prefixes, suffix);
                     if (suffix_key == kh_end(prefixes)) continue;
                     size_t previous, j;
                     previous = j = kh_val(prefixes, suffix_key);
@@ -138,7 +139,7 @@ overlapPath OverlapHamiltonianPath (std::vector<kmer_t> &kMers, int k, bool comp
         }
     }
 
-    kh_destroy(P64, prefixes);
+    wrapper.kh_destroy_map(prefixes);
     delete[](next);
     delete[](first);
     delete[](last);
@@ -148,6 +149,7 @@ overlapPath OverlapHamiltonianPath (std::vector<kmer_t> &kMers, int k, bool comp
 /// Construct the superstring and its mask from the given overlapPath path in the overlap graph.
 /// If reverse complements are considered and the overlapPath path contains two paths which are reverse complements of one another,
 /// return only one of them.
+template <typename kmer_t>
 void SuperstringFromPath(const overlapPath &hamiltonianPath, const std::vector<kmer_t> &kMers, std::ostream& of, const int k, const bool complements) {
     size_t kMersCount = kMers.size() * (1 + complements);
     auto edgeFrom = hamiltonianPath.first;
@@ -189,11 +191,12 @@ void SuperstringFromPath(const overlapPath &hamiltonianPath, const std::vector<k
 /// If complements are provided, treat k-mer and its complement as identical.
 /// If this is the case, k-mers are expected not to contain both k-mer and its complement.
 /// Warning: this will destroy kMers.
-void Global(std::vector<kmer_t> &kMers, std::ostream& of, int k, bool complements) {
+template <typename kmer_t, typename kh_wrapper_t>
+void Global(kh_wrapper_t wrapper, std::vector<kmer_t> &kMers, std::ostream& of, int k, bool complements) {
     if (kMers.empty()) {
         throw std::invalid_argument("input cannot be empty");
     }
-    auto hamiltonianPath = OverlapHamiltonianPath(kMers, k, complements);
+    auto hamiltonianPath = OverlapHamiltonianPath(wrapper, kMers, k, complements);
     SuperstringFromPath(hamiltonianPath, kMers, of, k, complements);
 }
 

diff --git a/src/khash_utils.h b/src/khash_utils.h
@@ -16,41 +16,78 @@
 #define KHASH_SET_INIT_INT128(name)										\
 	KHASH_INIT(name, __uint128_t, char, 0, kh_int128_hash_func, kh_int128_hash_equal)
 
-#ifdef LARGE_KMERS
-    // Use 128-bit integers for k-mers to allow for larger k.
-    KHASH_SET_INIT_INT128(S64)
-    KHASH_MAP_INIT_INT128(P64, size_t)
-    KHASH_MAP_INIT_INT128(O64, size_t)
-#else
-    // Use 64-bits integers for k-mers for faster operations and less memory usage.
-    KHASH_SET_INIT_INT64(S64)
-    KHASH_MAP_INIT_INT64(P64, size_t)
-    KHASH_MAP_INIT_INT64(O64, size_t)
-#endif
-
+// Use 128-bit integers for k-mers to allow for larger k.
+KHASH_SET_INIT_INT128(S128)
+KHASH_MAP_INIT_INT128(P128, size_t)
+// Use 64-bits integers for k-mers for faster operations and less memory usage.
+KHASH_SET_INIT_INT64(S64)
+KHASH_MAP_INIT_INT64(P64, size_t)
+
+#define INIT_KHASH_WRAPPER(type) \
+    struct kmer_dict##type##_t { \
+        inline kh_S##type##_t *kh_init_set() { \
+            return kh_init_S##type(); \
+        }                         \
+        inline khint_t kh_get_from_set(kh_S##type##_t *set, kmer##type##_t key) { \
+            return kh_get_S##type(set, key); \
+        }                        \
+        inline khint_t kh_put_to_set(kh_S##type##_t *set, kmer##type##_t key, int *ret) { \
+            return kh_put_S##type(set, key, ret); \
+        }                        \
+        inline void kh_del_from_set(kh_S##type##_t *set, khint_t key) { \
+            kh_del_S##type(set, key); \
+        }                        \
+        inline kh_P##type##_t *kh_init_map() { \
+            return kh_init_P##type(); \
+        }                         \
+        inline khint_t kh_get_from_map(kh_P##type##_t *map, kmer##type##_t key) { \
+            return kh_get_P##type(map, key); \
+        }                        \
+        inline khint_t kh_put_to_map(kh_P##type##_t *map, kmer##type##_t key, int *ret) { \
+            return kh_put_P##type(map, key, ret); \
+        }                        \
+        inline void kh_del_from_map(kh_P##type##_t *map, khint_t key) { \
+            kh_del_P##type(map, key); \
+        }                        \
+        inline void kh_destroy_map(kh_P##type##_t *map) { \
+            kh_destroy_P##type(map); \
+        }                        \
+        inline void kh_clear_map(kh_P##type##_t *map) { \
+            kh_clear_P##type(map); \
+        }                        \
+        inline void kh_resize_map(kh_P##type##_t *map, khint_t size) { \
+            kh_resize_P##type(map, size); \
+        }                        \
+    };
+
+INIT_KHASH_WRAPPER(64)
+INIT_KHASH_WRAPPER(128)
 
 /// Determine whether the k-mer or its reverse complement is present.
-bool containsKMer(kh_S64_t *kMers, kmer_t kMer, int k, bool complements) {
-    bool ret = kh_get_S64(kMers, kMer) != kh_end(kMers);
-    if (complements) ret |= kh_get_S64(kMers, ReverseComplement(kMer, k )) != kh_end(kMers);
+template <typename kmer_t, typename kh_S_t, typename kh_wrapper_t>
+bool containsKMer(kh_S_t *kMers, kh_wrapper_t wrapper, kmer_t kMer, int k, bool complements) {
+    bool ret = wrapper.kh_get_from_set(kMers, kMer) != kh_end(kMers);
+    if (complements) ret |= wrapper.kh_get_from_set(kMers, ReverseComplement(kMer, k )) != kh_end(kMers);
     return ret;
 }
 
 /// Remove the k-mer and its reverse complement.
-void eraseKMer(kh_S64_t *kMers, kmer_t kMer, int k, bool complements) {
-    auto key = kh_get_S64(kMers, kMer);
+template <typename kmer_t, typename kh_S_t, typename kh_wrapper_t>
+void eraseKMer(kh_S_t *kMers, kh_wrapper_t wrapper, kmer_t kMer, int k, bool complements) {
+    auto key = wrapper.kh_get_from_set(kMers, kMer);
     if (key != kh_end(kMers)) {
-        kh_del_S64(kMers, key);
+        wrapper.kh_del_from_set(kMers, key);
     }
     if (complements) {
         kmer_t reverseComplement = ReverseComplement(kMer, k);
-        key = kh_get_S64(kMers, reverseComplement);
-        if (key != kh_end(kMers)) kh_del_S64(kMers, key);
+        key = wrapper.kh_get_from_set(kMers, reverseComplement);
+        if (key != kh_end(kMers)) wrapper.kh_del_from_set(kMers, key);
     }
 }
 
 /// Return the next k-mer in the k-mer set and update the index.
-kmer_t nextKMer(kh_S64_t *kMers, size_t &lastIndex) {
+template <typename kmer_t, typename kh_S_t>
+kmer_t nextKMer(kh_S_t *kMers, [[maybe_unused]] kmer_t _, size_t &lastIndex) {
     for (size_t i = kh_begin(kMers) + lastIndex; i != kh_end(kMers); ++i, ++lastIndex) {
         if (!kh_exist(kMers, i)) continue;
         return kh_key(kMers, i);
@@ -60,7 +97,8 @@ kmer_t nextKMer(kh_S64_t *kMers, size_t &lastIndex) {
 }
 
 /// Construct a vector of the k-mer set in an arbitrary order.
-std::vector<kmer_t> kMersToVec(kh_S64_t *kMers) {
+template <typename kmer_t, typename kh_S_t>
+std::vector<kmer_t> kMersToVec(kh_S_t *kMers, [[maybe_unused]] kmer_t _) {
     std::vector<kmer_t> res(kh_size(kMers));
     size_t index = 0;
     for (auto i = kh_begin(kMers); i != kh_end(kMers); ++i) {
@@ -73,17 +111,18 @@ std::vector<kmer_t> kMersToVec(kh_S64_t *kMers) {
 /// Add an interval with given index to the given k-mer.
 ///
 /// [intervalsForKMer] store the intervals and [intervals] maps the k-mer to the index in [intervalsForKMer].
-bool appendInterval(kh_O64_t *intervals, std::vector<std::list<size_t>> &intervalsForKMer, kmer_t kMer, size_t index, int k, bool complements) {
+template <typename kmer_t, typename kh_P_t, typename kh_wrapper_t>
+bool appendInterval(kh_P_t *intervals, kh_wrapper_t wrapper, std::vector<std::list<size_t>> &intervalsForKMer, kmer_t kMer, size_t index, int k, bool complements) {
     if (complements) kMer = std::min(kMer, ReverseComplement(kMer, k));
-    auto key = kh_get_O64(intervals, kMer);
+    auto key = wrapper.kh_get_from_map(intervals, kMer);
     if (key == kh_end(intervals)) {
         int ret;
-        kh_put_O64(intervals, kMer, &ret);
-        key = kh_get_O64(intervals, kMer);
+        wrapper.kh_put_to_map(intervals, kMer, &ret);
+        key = wrapper.kh_get_from_map(intervals, kMer);
         kh_value(intervals, key) = intervalsForKMer.size();
         intervalsForKMer.emplace_back(std::list<size_t>());
     }
-    key = kh_get_O64(intervals, kMer);
+    key = wrapper.kh_get_from_map(intervals, kMer);
     auto position = kh_value(intervals, key);
     if (intervalsForKMer[position].empty() || intervalsForKMer[position].back() != index) {
         intervalsForKMer[position].push_back(index);