Skip to content

Commit

Permalink
Sourmash into kSpider (#26)
Browse files Browse the repository at this point in the history
* validate bin with sig

* bins indexing implemented

* pairwise minor modifications

* dumping modes

* minor fix

* 🐛 handle gzipped sigs

* sig to bin

* sigs to bins update

* :fix: all parallel

* refactor

* remove print inside parallel loop

* :fix: validate

* skip converted files

* sequential loading

* legends to phmap

* check invalid bins

* bins indexing done

* update kProcessor submodule

* filter by abundance

* delete kProcessor submodule

* modify kProcessor

* new json parser

* adapt the new json parser changes

* modify json import

* update kProcessor

* update kProcessor

* update kProcessor

* update kProcessor

* update kProcessor

* update kp

* update CMAKE flags

* print colors size

* more stats

* more options

* fix

* representative sketches

* reorganize
  • Loading branch information
mr-eyes authored Dec 15, 2022
1 parent 5445276 commit 6f6aedc
Show file tree
Hide file tree
Showing 38 changed files with 3,815 additions and 1,677 deletions.
2 changes: 1 addition & 1 deletion .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,4 @@
url = https://github.com/adishavit/argh
[submodule "lib/kProcessor"]
path = lib/kProcessor
url = https://github.com/dib-lab/kProcessor.git
url = git@github.com:dib-lab/kProcessor.git
32 changes: 28 additions & 4 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@ project (
VERSION 2.0.0
)

set(default_build_type "Release")
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17 -lstdc++fs -fPIC -lgomp -lrt -fopenmp -O3 -Ofast")
set(default_build_type "RELEASE")
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17 -lstdc++fs -fPIC -lgomp -lrt -fopenmp -Ofast")
# ----------------------------------------------------------------------------
# kProcessor Setup
# ----------------------------------------------------------------------------
Expand All @@ -31,13 +31,13 @@ include_directories(${kProcessor_INCLUDE_PATH})

include_directories("${PROJECT_SOURCE_DIR}/include")
include_directories("${PROJECT_SOURCE_DIR}/lib/argh")
include_directories("${PROJECT_SOURCE_DIR}/lib/json_parser")
include_directories("${PROJECT_SOURCE_DIR}/lib/json_parser/lib/include/")
add_subdirectory("lib/zstr")

set(PHMAP_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/lib/kProcessor/ThirdParty/kmerDecoder/lib/parallel-hashmap")
include_directories("${PHMAP_INCLUDE_DIRS}")

add_library(kSpider STATIC src/pairwise.cpp src/index.cpp src/fastx_to_kf.cpp src/sourmash_indexing.cpp ${PROJECT_SOURCE_DIR}/lib/kProcessor/include/kProcessor)
add_library(kSpider STATIC src/pairwise.cpp src/index.cpp src/fastx_to_kf.cpp src/sourmash_indexing.cpp src/bins_indexing.cpp ${PROJECT_SOURCE_DIR}/lib/kProcessor/include/kProcessor)
set_target_properties(kSpider PROPERTIES POSITION_INDEPENDENT_CODE 1 CXX_STANDARD 17)
target_link_libraries (kSpider kProcessor z)
target_link_libraries (kSpider kProcessor z zstr::zstr)
Expand All @@ -49,6 +49,30 @@ target_include_directories(kSpider INTERFACE ${PHMAP_INCLUDE_DIRS} ${PROJECT_SOU
add_executable(pairwise pairwise.cpp)
target_link_libraries(pairwise kSpider kProcessor z)

add_executable(index_bins bins.cpp)
target_link_libraries(index_bins kSpider kProcessor z)

add_executable(validate validate.cpp)
target_link_libraries(validate kSpider kProcessor z)

add_executable(dump_bin export_bin.cpp)
target_link_libraries(dump_bin kSpider kProcessor z)

add_executable(check_bin check_bin.cpp)
target_link_libraries(check_bin kSpider kProcessor z)

add_executable(dump_sig export_sig.cpp)
target_link_libraries(dump_sig kSpider kProcessor z)

add_executable(sigs_to_bins sigs_to_bins.cpp)
target_link_libraries(sigs_to_bins kSpider kProcessor z)

add_executable(sig_to_bin sig_to_bin.cpp)
target_link_libraries(sig_to_bin kSpider kProcessor z zstr::zstr)

add_executable(repr_sketches apps/repr_sketches.cpp)
target_link_libraries(repr_sketches z kProcessor)

# add_executable(index_kframes apps/index_kframes.cpp)
# target_link_libraries(index_kframes kSpider kProcessor z)

Expand Down
44 changes: 44 additions & 0 deletions apps/repr_sketches.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
#include <boost/algorithm/string.hpp>
#include <cstdint>
#include <unordered_map>
#include <parallel_hashmap/phmap.h>

using namespace boost::algorithm;
using namespace std;

// Ordering predicate for std::sort over (key, value) pairs: returns true when
// `a` must come before `b`, i.e. when a's `second` member is strictly larger.
// Sorting with it therefore yields descending order by `second`.
bool comp(pair<uint64_t,uint64_t> a, pair<uint64_t,uint64_t> b) {
    const bool a_ranks_first = b.second < a.second;
    return a_ranks_first;
}


int main(int argc, char** argv) {
ifstream fin(argv[1]);
phmap::flat_hash_map<uint64_t, uint64_t> count;
string line;
getline(fin, line); // skip header.
while (getline(fin, line)) {
// Split line into tab-separated parts
vector<string> parts;
split(parts, line, boost::is_any_of("\t"));
float containment = stof(parts[4]);
if (containment > 0.20) {
uint64_t from_node = stoi(parts[0]);
uint64_t to_node = stoi(parts[1]);
count[from_node]++;
count[to_node]++;
}

}
fin.close();

std::vector<std::pair<uint64_t, uint64_t>> elems(count.begin(), count.end());
std::sort(elems.begin(), elems.end(), comp);

for (auto& [k, v] : elems) {
cout << k << ": " << v << endl;
}
}
26 changes: 26 additions & 0 deletions bins.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
#include "kSpider.hpp"

#include <cstdint>
#include <limits>
#include <stdexcept>
#include <string>

// Parses an unsigned base-10 integer from `value`.
//
// Every character must be an ASCII digit: no sign, whitespace or other
// characters are accepted.
//
// @param value  Text to parse.
// @return       The parsed number.
// @throws std::invalid_argument  if `value` is empty or contains a non-digit.
// @throws std::out_of_range      if the number does not fit in a uint64_t.
inline uint64_t to_uint64_t(std::string const& value) {
    if (value.empty()) {
        throw std::invalid_argument("to_uint64_t: empty string");
    }

    constexpr uint64_t max_value = std::numeric_limits<uint64_t>::max();
    uint64_t result = 0;
    for (const char c : value) {
        if (c < '0' || c > '9') {
            throw std::invalid_argument("to_uint64_t: not an unsigned decimal number: " + value);
        }
        const uint64_t digit = static_cast<uint64_t>(c - '0');
        // Check before multiplying so the accumulator never wraps around.
        if (result > (max_value - digit) / 10) {
            throw std::out_of_range("to_uint64_t: value does not fit in 64 bits: " + value);
        }
        result = result * 10 + digit;
    }
    return result;
}

int main(int argc, char** argv) {
if(argc < 6){
cout << "args: <bins_dir> <kSize> <output_prefix> <initial_reserve_size> <legend_reserve>\n";
exit(1);
}
string bins_dir = argv[1];
int kSize = stoi(argv[2]);
string output_prefix = argv[3];
uint64_t reserve_size = to_uint64_t(argv[4]);
uint64_t legend_reserve = to_uint64_t(argv[5]);

kSpider::bins_indexing(bins_dir, kSize, output_prefix, reserve_size, legend_reserve);
}
31 changes: 31 additions & 0 deletions check_bin.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#include <iostream>
#include <cstdint>
#include <chrono>
#include "parallel_hashmap/phmap.h"
#include <ctime>
#include<omp.h>
#include <glob.h>
#include <string>
#include <stdexcept>
#include "parallel_hashmap/phmap_dump.h"
#include <cstdlib>

using namespace std;
// using namespace phmap;


int main(int argc, char** argv) {

if (argc != 2) {
cout << "run: ./check_bin <bin>" << endl;
exit(1);
}

string bin_path = argv[1];
phmap::flat_hash_set<uint64_t> table_in;
phmap::BinaryInputArchive ar_in(bin_path.c_str());
table_in.phmap_load(ar_in);


cout << "VALID_BIN: " << table_in.size();
}
33 changes: 33 additions & 0 deletions export_bin.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#include <iostream>
#include <cstdint>
#include <chrono>
#include "parallel_hashmap/phmap.h"
#include <ctime>
#include<omp.h>
#include <glob.h>
#include <string>
#include <stdexcept>
#include "parallel_hashmap/phmap_dump.h"
#include <cstdlib>

using namespace std;
// using namespace phmap;


// Entry point of the dump_bin tool.
//
// Takes exactly one argument, the path of a bin file. Loads it into a
// phmap::flat_hash_set<uint64_t>, reports the number of loaded elements on
// stderr, and writes every stored hash to stdout, one per line.
int main(int argc, char** argv) {

    if (argc != 2) {
        cout << "run: ./dump_bin <bin>" << endl;
        exit(1);
    }

    string bin_path = argv[1];

    phmap::flat_hash_set<uint64_t> table_in;
    phmap::BinaryInputArchive ar_in(bin_path.c_str());
    table_in.phmap_load(ar_in);
    cerr << "loaded bin size: " << table_in.size() << endl;

    // '\n' instead of endl: the emitted bytes are identical, but stdout is no
    // longer flushed once per hash.
    for (const uint64_t& hash : table_in) cout << hash << '\n';

}
55 changes: 55 additions & 0 deletions export_sig.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
#include <iostream>
#include <cstdint>
#include <chrono>
#include "parallel_hashmap/phmap.h"
#include <ctime>
#include <omp.h>
#include "cpp-json/json.h"
#include "zstr.hpp"
#include <glob.h>
#include <string>
#include <stdexcept>
#include "parallel_hashmap/phmap_dump.h"
#include <cstdlib>

using namespace std;
// using namespace phmap;

typedef std::chrono::high_resolution_clock Time;


int main(int argc, char** argv) {

if (argc != 3) {
cout << "run: ./dump_sig <sig> <kSize>" << endl;
exit(1);
}

string sig_path = argv[1];
int kSize = stoi(argv[2]);

phmap::flat_hash_set<uint64_t> tmp_hashes;

auto begin_time = Time::now();
zstr::ifstream sig_stream(sig_path);
json::value json = json::parse(sig_stream);
auto sourmash_sig = json[0]["signatures"];
const json::array& sig_array = as_array(sourmash_sig);
for (auto it = sig_array.begin(); it != sig_array.end(); ++it) {
const json::value& v = *it;
if (v["ksize"] == kSize) {
const json::array& mins = as_array(v["mins"]);
auto mins_it = mins.begin();
while (mins_it != mins.end()) {
tmp_hashes.insert(json::to_number<uint64_t>(*mins_it));
mins_it++;
}
}
break;
}


for (const uint64_t& hash : tmp_hashes) cout << hash << endl;


}
1 change: 1 addition & 0 deletions include/kSpider.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ namespace kSpider {
void index_dayhoff(int kSize, string fasta_file, string names_file, int chunk_size, string index_prefix);
void index_datasets(string kfs_dir);
void sourmash_sigs_indexing(string sigs_dir, int kSize);
// Called by the index_bins tool (bins.cpp), which forwards its command-line
// arguments <bins_dir> <kSize> <output_prefix> <initial_reserve_size>
// <legend_reserve> to these five parameters, in that order.
// NOTE(review): kmers_reserve / colors_reserve presumably pre-size internal
// containers — confirm against src/bins_indexing.cpp.
void bins_indexing(string bins_dir, int selective_kSize, string output_prefix, uint64_t kmers_reserve, uint64_t colors_reserve);
void paired_end_to_kDataFrame(string r1_file_name, string r2_file_name, int kSize, int chunk_size, int downsampling_ratio, bool remove_singletones);
void single_end_to_kDataFrame(string r1_file_name, int kSize, int chunk_size, int downsampling_ration, bool remove_singletones);
void protein_to_kDataFrame(string r1_file_name, int kSize, int chunk_size, bool is_dayhoff, string output_prefix, int downsampling_ration = 1);
Expand Down
5 changes: 5 additions & 0 deletions lib/json_parser/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Top-level CMake script of the cpp-json project (C++ only).
cmake_minimum_required (VERSION 3.0)
project(cpp-json CXX)

# Delegate to the CMakeLists.txt files of the lib/ and test/ subdirectories.
add_subdirectory(lib)
add_subdirectory(test)
Loading

0 comments on commit 6f6aedc

Please sign in to comment.