Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Io bam performance #3106

Merged
merged 7 commits into from
Jan 6, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ If possible, provide tooling that performs the changes, e.g. a shell-script.
#### I/O
* Empty SAM/BAM files must at least write a header to ensure a valid file
([\#3081](https://github.com/seqan/seqan3/pull/3081)).
* Reading SAM/BAM files is 2x faster than before
([\#3106](https://github.com/seqan/seqan3/pull/3106)).

## API changes

Expand Down
4 changes: 3 additions & 1 deletion include/seqan3/io/detail/misc_input.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,9 @@ inline auto make_secondary_istream(std::basic_istream<char_t> & primary_stream,

// unget all read chars.
for (size_t i = 0; i < read_chars; ++i)
primary_stream.unget();
primary_stream.unget(); // If you unget() more characters than are present in the get area, badbit is set.

assert(primary_stream.good() && "`unget()` was called too many times on primary_stream.");

std::string extension{};
if (filename.has_extension())
Expand Down
43 changes: 18 additions & 25 deletions include/seqan3/io/sam_file/detail/cigar.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -78,46 +78,39 @@ inline void update_alignment_lengths(int32_t & ref_length,

/*!\brief Parses a cigar string into a vector of operation-count pairs (e.g. (M, 3)).
* \ingroup io_sam_file
* \tparam cigar_input_type The type of a single pass input view over the cigar string; must model
* std::ranges::input_range.
* \param[in] cigar_input The single pass input view over the cigar string to parse.
* \param[in] cigar_str The cigar string to parse.
*
* \returns A tuple of size three containing (1) std::vector over seqan3::cigar, that describes
* the alignment, (2) the aligned reference length, (3) the aligned query sequence length.
* \returns A std::vector over seqan3::cigar that describes the alignment.
*
* \details
*
* For example, the cigar string "1H4M1D2M2S" is parsed into the operations
* `[(H,1), (M,4), (D,1), (M,2), (S,2)]` (aligned reference length 7, aligned query length 6).
*/
// NOTE(review): this span shows the removed range-based implementation and the added std::string_view
// implementation interleaved (two signatures, two loop headers, two return statements); it is not a single
// compilable definition as rendered here.
template <typename cigar_input_type>
SEQAN3_WORKAROUND_LITERAL std::tuple<std::vector<cigar>, int32_t, int32_t> parse_cigar(cigar_input_type && cigar_input)
SEQAN3_WORKAROUND_LITERAL std::vector<cigar> parse_cigar(std::string_view const cigar_str)
{
std::vector<cigar> operations{};
std::array<char, 20> buffer{}; // buffer to parse numbers with from_chars. Biggest number should fit in uint64_t
char cigar_operation{};
uint32_t cigar_count{};
int32_t ref_length{}, seq_length{}; // length of aligned part for ref and query
std::vector<seqan3::cigar> cigar_vector{};

// A lone '*' yields an empty cigar vector.
if (cigar_str == "*")
return cigar_vector;

// transform input into a single input view if it isn't already
auto cigar_view = cigar_input | views::single_pass_input;
uint32_t cigar_count{};
// [ptr, end) is the not-yet-parsed remainder of cigar_str.
char const * ptr = cigar_str.data();
char const * const end = ptr + cigar_str.size();

// parse the rest of the cigar
// -------------------------------------------------------------------------------------------------------------
while (std::ranges::begin(cigar_view) != std::ranges::end(cigar_view)) // until stream is not empty
while (ptr < end)
{
auto buff_end = (std::ranges::copy(cigar_view | detail::take_until_or_throw(!is_digit), buffer.data())).out;
cigar_operation = *std::ranges::begin(cigar_view);
++std::ranges::begin(cigar_view);
auto const res = std::from_chars(ptr, end, cigar_count); // reads number up to next character

if (res.ec != std::errc{})
throw format_error{"Corrupted cigar string."};

if (std::from_chars(buffer.begin(), buff_end, cigar_count).ec != std::errc{})
throw format_error{"Corrupted cigar string encountered"};
// NOTE(review): if the digits run up to the end of cigar_str (e.g. "12"), res.ptr == end at this point, so
// the `*res.ptr` dereference below reads one past the view and ptr becomes end + 1. Consider checking
// `res.ptr == end` and throwing format_error before dereferencing — TODO confirm with callers.
ptr = res.ptr + 1; // skip cigar operation character

update_alignment_lengths(ref_length, seq_length, cigar_operation, cigar_count);
operations.emplace_back(cigar_count, cigar::operation{}.assign_char(cigar_operation));
// The character following the count is the operation; it is converted with assign_char_strictly_to.
cigar_vector.emplace_back(cigar_count, seqan3::assign_char_strictly_to(*res.ptr, seqan3::cigar::operation{}));
}

return {operations, ref_length, seq_length};
return cigar_vector;
}

/*!\brief Transforms a vector of cigar elements into a string representation.
Expand Down
89 changes: 39 additions & 50 deletions include/seqan3/io/sam_file/detail/format_sam_base.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,14 +76,14 @@ class format_sam_base

int32_t soft_clipping_at_front(std::vector<cigar> const & cigar_vector) const;

template <typename stream_view_t>
void read_byte_field(stream_view_t && stream_view, std::byte & byte_target);

template <typename stream_view_type, std::ranges::forward_range target_range_type>
void read_forward_range_field(stream_view_type && stream_view, target_range_type & target);

template <typename stream_view_t, arithmetic arithmetic_target_type>
void read_arithmetic_field(stream_view_t && stream_view, arithmetic_target_type & arithmetic_target);
template <std::ranges::forward_range target_range_type>
void read_forward_range_field(std::string_view const str, target_range_type & target);

template <arithmetic arithmetic_target_type>
void read_arithmetic_field(std::string_view const & str, arithmetic_target_type & arithmetic_target);

template <typename stream_view_type, typename ref_ids_type, typename ref_seqs_type>
void read_header(stream_view_type && stream_view,
Expand Down Expand Up @@ -179,36 +179,6 @@ inline int32_t format_sam_base::soft_clipping_at_front(std::vector<cigar> const
return sc_front;
}

/*!\brief Parses a hexadecimal character sequence into a std::byte by means of std::from_chars.
 * \tparam stream_view_t The type of the stream as a view.
 *
 * \param[in, out] stream_view The stream view whose characters are consumed.
 * \param[out]     byte_target The std::byte object that receives the parsed value.
 *
 * \throws seqan3::format_error if the characters in stream_view cannot be converted completely into a value of
 *         type uint8_t, or if the parsed value does not fit into uint8_t.
 */
template <typename stream_view_t>
inline void format_sam_base::read_byte_field(stream_view_t && stream_view, std::byte & byte_target)
{
    // std::from_chars only works on `char const *`, so the characters are staged in arithmetic_buffer first.
    auto const buffer_end = std::ranges::copy(stream_view, arithmetic_buffer.data()).out;

    // Materialises the buffered characters for error messages; only evaluated on the throwing paths.
    auto const buffered_text = [&]()
    {
        return std::string(arithmetic_buffer.begin(), buffer_end);
    };

    // There is no std::from_chars overload for std::byte: parse into uint8_t (base 16) and convert afterwards.
    uint8_t parsed_value{};
    std::from_chars_result const result = std::from_chars(arithmetic_buffer.begin(), buffer_end, parsed_value, 16);

    bool const fully_consumed = (result.ptr == buffer_end);

    if (result.ec == std::errc::invalid_argument || !fully_consumed)
    {
        throw format_error{std::string("[CORRUPTED SAM FILE] The string '") + buffered_text()
                           + "' could not be cast into type uint8_t."};
    }

    if (result.ec == std::errc::result_out_of_range)
    {
        throw format_error{std::string("[CORRUPTED SAM FILE] Casting '") + buffered_text()
                           + "' into type uint8_t would cause an overflow."};
    }

    byte_target = std::byte{parsed_value};
}

/*!\brief Reads a range by copying from stream_view to target, converting values with seqan3::views::char_to.
* \tparam stream_view_type The type of the stream as a view.
* \tparam target_range_type The type of range to parse from input; must model std::ranges::forward_range.
Expand Down Expand Up @@ -238,32 +208,51 @@ inline void format_sam_base::read_forward_range_field(stream_view_type && stream
}
}

/*!\brief Reads from `str` to `target`, converting values with seqan3::views::char_to.
* \tparam target_range_type The type of range to parse from input; must model std::ranges::forward_range.
*
* \param[in, out] str The string_view to parse.
* \param[out] target The range to store the parsed sequence.
*/
template <std::ranges::forward_range target_range_type>
inline void format_sam_base::read_forward_range_field(std::string_view const str, target_range_type & target)
{
if (str.size() == 1 && str[0] == '*') // '*' denotes empty field
return;

if constexpr (std::assignable_from<target_range_type, std::string_view>)
{
target = str;
}
else
{
target.resize(str.size());
for (size_t i = 0; i < str.size(); ++i)
target[i] = assign_char_to(str[i], std::ranges::range_value_t<target_range_type>{});
}
}

/*!\brief Reads arithmetic fields using std::from_chars.
* \tparam stream_view_t The type of the stream as a view.
* \tparam arithmetic_target_type The type of value to parse from input; must model seqan3::arithmetic.
*
* \param[in, out] stream_view The stream view to iterate over.
* \param[in] str The string_view to parse.
* \param[out] arithmetic_target The arithmetic value object to store the parsed value.
*
* \throws seqan3::format_error if the character sequence in stream_view cannot be successfully converted to a value
* \throws seqan3::format_error if the character sequence in str cannot be successfully converted to a value
* of type arithmetic_target_type.
*/
// NOTE(review): this span shows the removed stream-view implementation and the added std::string_view
// implementation interleaved (two signatures, two from_chars calls, duplicated throw statements); it is not a
// single compilable definition as rendered here.
template <typename stream_view_t, arithmetic arithmetic_target_type>
inline void format_sam_base::read_arithmetic_field(stream_view_t && stream_view,
template <arithmetic arithmetic_target_type>
inline void format_sam_base::read_arithmetic_field(std::string_view const & str,
arithmetic_target_type & arithmetic_target)
{
// unfortunately std::from_chars only accepts char const * so we need a buffer.
auto [ignore, end] = std::ranges::copy(stream_view, arithmetic_buffer.data());
(void)ignore;
std::from_chars_result res = std::from_chars(arithmetic_buffer.begin(), end, arithmetic_target);
// NOTE(review): std::from_chars takes `char const *`; passing str.begin()/str.end() relies on
// std::string_view::const_iterator being a raw pointer, which is implementation-defined. The form
// `str.data(), str.data() + str.size()` would not depend on that — TODO confirm supported standard libraries.
std::from_chars_result res = std::from_chars(str.begin(), str.end(), arithmetic_target);

// Reject input that is not a number at all or that has unparsed trailing characters.
if (res.ec == std::errc::invalid_argument || res.ptr != end)
throw format_error{std::string("[CORRUPTED SAM FILE] The string '")
+ std::string(arithmetic_buffer.begin(), end) + "' could not be cast into type "
+ detail::type_name_as_string<arithmetic_target_type>};
if (res.ec == std::errc::invalid_argument || res.ptr != str.end())
throw format_error{std::string("[CORRUPTED SAM FILE] The string '") + std::string(str.begin(), str.end())
+ "' could not be cast into type " + detail::type_name_as_string<arithmetic_target_type>};

// Reject values that parse as a number but do not fit into arithmetic_target_type.
if (res.ec == std::errc::result_out_of_range)
throw format_error{std::string("[CORRUPTED SAM FILE] Casting '") + std::string(arithmetic_buffer.begin(), end)
throw format_error{std::string("[CORRUPTED SAM FILE] Casting '") + std::string(str.begin(), str.end())
+ "' into type " + detail::type_name_as_string<arithmetic_target_type>
+ " would cause an overflow."};
}
Expand Down Expand Up @@ -291,7 +280,7 @@ inline void format_sam_base::read_header(stream_view_type && stream_view,
{
auto it = std::ranges::begin(stream_view);
auto end = std::ranges::end(stream_view);
std::vector<char> string_buffer{};
std::string string_buffer{};

auto make_tag = [](uint8_t char1, uint8_t char2) constexpr
{
Expand Down
Loading