Skip to content

Commit

Permalink
Merge pull request #443 from eseiler/misc/use_sketches
Browse files Browse the repository at this point in the history
[MISC] Use hll sketches to estimate bin sizes
  • Loading branch information
eseiler authored Oct 17, 2024
2 parents da75ebe + 8bd2678 commit 25802fd
Show file tree
Hide file tree
Showing 8 changed files with 56 additions and 19 deletions.
15 changes: 9 additions & 6 deletions src/argument_parsing/compute_bin_size.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
#include <seqan3/search/views/minimiser_hash.hpp>

#include <hibf/build/bin_size_in_bits.hpp>
#include <hibf/contrib/robin_hood.hpp>
#include <hibf/sketch/hyperloglog.hpp>

#include <raptor/adjust_seed.hpp>
#include <raptor/argument_parsing/compute_bin_size.hpp>
Expand Down Expand Up @@ -94,14 +94,17 @@ size_t kmer_count_from_sequence_files(std::vector<std::vector<std::string>> cons

auto worker = [&callback, &reader](auto && zipped_view)
{
robin_hood::unordered_flat_set<uint64_t> kmers;
auto insert_it = std::inserter(kmers, kmers.end());
seqan::hibf::sketch::hyperloglog sketch{15u};

for (auto && [file_names, bin_number] : zipped_view)
{
kmers.clear();
reader.hash_into(file_names, insert_it);
callback(kmers.size());
sketch.reset();
reader.for_each_hash(file_names,
[&sketch](auto && hash)
{
sketch.add(hash);
});
callback(sketch.estimate());
}
};

Expand Down
13 changes: 9 additions & 4 deletions src/build/max_count_per_partition.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
#include <algorithm>
#include <fstream>

#include <hibf/sketch/hyperloglog.hpp>

#include <raptor/build/max_count_per_partition.hpp>
#include <raptor/call_parallel_on_bins.hpp>
#include <raptor/file_reader.hpp>
Expand Down Expand Up @@ -41,17 +43,20 @@ std::vector<size_t> max_count_per_partition(partition_config const & cfg,
auto worker = [&callback, &reader, &cfg](auto && zipped_view)
{
std::vector<size_t> max_kmer_counts(cfg.partitions);
std::vector<size_t> kmer_counts(cfg.partitions);
std::vector<seqan::hibf::sketch::hyperloglog> kmer_counts(cfg.partitions, 15u);

for (auto && [file_names, bin_number] : zipped_view)
{
reader.for_each_hash(file_names,
[&](auto && hash)
{
++kmer_counts[cfg.hash_partition(hash)];
kmer_counts[cfg.hash_partition(hash)].add(hash);
});
for (size_t i = 0; i < cfg.partitions; ++i)
max_kmer_counts[i] = std::max<size_t>(max_kmer_counts[i], kmer_counts[i]);
std::ranges::fill(kmer_counts, 0u);
{
max_kmer_counts[i] = std::max<size_t>(max_kmer_counts[i], kmer_counts[i].estimate());
kmer_counts[i].reset();
}
}
callback(max_kmer_counts);
};
Expand Down
Binary file modified test/data/128bins19window.index
Binary file not shown.
Binary file modified test/data/1bins19window.index
Binary file not shown.
Binary file modified test/data/64bins19window.index
Binary file not shown.
27 changes: 25 additions & 2 deletions test/include/raptor/test/cli_test.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -260,7 +260,8 @@ struct raptor_base : public cli_test
template <typename data_t = raptor::index_structure::ibf>
static inline void compare_index(std::filesystem::path const & expected_result,
std::filesystem::path const & actual_result,
compare_extension const compare_ext = compare_extension::yes)
compare_extension const compare_ext = compare_extension::yes,
is_preprocessed const preprocessed = is_preprocessed::no)
{
constexpr bool is_ibf = std::same_as<data_t, raptor::index_structure::ibf>;
constexpr bool is_hibf = std::same_as<data_t, raptor::index_structure::hibf>;
Expand All @@ -287,7 +288,29 @@ struct raptor_base : public cli_test
if constexpr (is_ibf)
{
auto const &expected_ibf{expected_index.ibf()}, actual_ibf{actual_index.ibf()};
EXPECT_TRUE(expected_ibf == actual_ibf) << debug_ibfs(expected_ibf, actual_ibf);
if (!preprocessed)
{
EXPECT_TRUE(expected_ibf == actual_ibf) << debug_ibfs(expected_ibf, actual_ibf);
}
else
{
// Preprocessed IBFs use minimiser files. The counts are exact.
// Without minimiser files, HLL sketches are used, which are estimates.
auto check_diff = [](double const expected_value, double const actual_value) -> bool
{
static constexpr double rel_error{0.005};
double const diff = std::abs(expected_value - actual_value);
double const abs_error = rel_error * expected_value;
return diff < abs_error;
};

bool const is_same = expected_ibf.hash_function_count() == actual_ibf.hash_function_count()
&& expected_ibf.bin_count() == actual_ibf.bin_count()
&& check_diff(expected_ibf.bin_size(), actual_ibf.bin_size())
&& check_diff(expected_ibf.bit_size(), actual_ibf.bit_size());

EXPECT_TRUE(is_same) << debug_ibfs(expected_ibf, actual_ibf);
}
}
else
{
Expand Down
10 changes: 5 additions & 5 deletions test/unit/api/compute_bin_size.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,34 +14,34 @@ TEST_F(compute_bin_size, single_file)
{
raptor::build_arguments const config{.bin_path = {{data("bin1.fa")}}};
size_t const result = raptor::compute_bin_size(config);
EXPECT_EQ(result, 3011u); // max_count = 381
EXPECT_EQ(result, 3011u); // exact count = 381, exact size = 3011
}

TEST_F(compute_bin_size, multiple_files)
{
raptor::build_arguments const config{.bin_path = {{data("bin1.fa"), data("bin2.fa")}}};
size_t const result = raptor::compute_bin_size(config);
EXPECT_EQ(result, 3794u); // max_count = 480
EXPECT_EQ(result, 3801u); // exact count = 480, exact size = 3794
}

TEST_F(compute_bin_size, multiple_records)
{
raptor::build_arguments const config{.bin_path = {{data("multi_record_bin.fa")}}};
size_t const result = raptor::compute_bin_size(config);
EXPECT_EQ(result, 3794u); // max_count = 480
EXPECT_EQ(result, 3801u); // exact count = 480, exact size = 3794
}

TEST_F(compute_bin_size, multiple_files_multiple_records)
{
raptor::build_arguments const config{.bin_path = {{data("multi_record_bin.fa"), data("multi_record_bin.fa")}}};
size_t const result = raptor::compute_bin_size(config);
EXPECT_EQ(result, 3794u); // max_count = 480
EXPECT_EQ(result, 3801u); // exact count = 480, exact size = 3794
}

TEST_F(compute_bin_size, mixed)
{
raptor::build_arguments const config{
.bin_path = {{data("multi_record_bin.fa")}, {data("bin3.fa")}, {data("bin4.fa")}}};
size_t const result = raptor::compute_bin_size(config);
EXPECT_EQ(result, 3794u); // max_count = 480
EXPECT_EQ(result, 3801u); // exact count = 480, exact size = 3794
}
10 changes: 8 additions & 2 deletions test/unit/cli/search/search_ibf_preprocessing_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,10 @@ TEST_P(search_ibf_preprocessing, pipeline)
EXPECT_EQ(result2.err, std::string{});
RAPTOR_ASSERT_ZERO_EXIT(result2);

compare_index(ibf_path(number_of_repeated_bins, window_size), "raptor.index", compare_extension::no);
compare_index(ibf_path(number_of_repeated_bins, window_size),
"raptor.index",
compare_extension::no,
is_preprocessed::yes);

cli_test_result const result3 = execute_app("raptor",
"search",
Expand Down Expand Up @@ -126,7 +129,10 @@ TEST_P(search_ibf_preprocessing, pipeline_compressed_bins)
EXPECT_EQ(result2.err, std::string{});
RAPTOR_ASSERT_ZERO_EXIT(result2);

compare_index(ibf_path(number_of_repeated_bins, window_size), "raptor.index", compare_extension::no);
compare_index(ibf_path(number_of_repeated_bins, window_size),
"raptor.index",
compare_extension::no,
is_preprocessed::yes);

cli_test_result const result3 = execute_app("raptor",
"search",
Expand Down

0 comments on commit 25802fd

Please sign in to comment.