Skip to content

Commit

Permalink
Merge pull request #163 from Irallia/TEST/add_tests_for_tandem_duplic…
Browse files Browse the repository at this point in the history
…ations

[TEST] Add tests for tandem duplications
  • Loading branch information
Irallia authored Sep 27, 2021
2 parents f31f991 + ea7d188 commit a19055b
Show file tree
Hide file tree
Showing 8 changed files with 251 additions and 97 deletions.
26 changes: 26 additions & 0 deletions include/modules/sv_detection_methods/analyze_split_read_method.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,32 @@ void split_string(std::string const & str, Container & cont, char const delim =
*/
void retrieve_aligned_segments(std::string const & sa_string, std::vector<AlignedSegment> & aligned_segments);

/*! \brief Calculates the end respectively start position of two consecutive mates (current and next
* [AlignedSegment](\ref AlignedSegment)) depending on their orientation and corrects the position of the second
* mate, if they are overlapping.
*
* \param[in] current - current [AlignedSegment](\ref AlignedSegment)
* \param[in] next - consecutive next [AlignedSegment](\ref AlignedSegment)
* \param[in] distance_on_read - distance between the consecutive aligned segments on the read
*
* \returns std::pair(mate1_pos, mate2_pos) - a pair of the resulting mate positions
*
* \details \verbatim
Case 1: --current--> --next-->
Case 2: --current--> <--next--
Case 3: <--current-- --next-->
Case 4: <--current-- <--next--
| |
mate1_pos mate2_pos
overlapping Case: --current--> distance_on_read < 0
--next-->
||
mate1_pos mate2_pos
\endverbatim
*/
std::pair<int32_t, int32_t> get_mate_positions(AlignedSegment current, AlignedSegment next, int32_t distance_on_read);

/*! \brief Build junctions out of aligned_segments.
*
* \param[in] aligned_segments - vector of [aligned_segments](\ref AlignedSegment)
Expand Down
97 changes: 49 additions & 48 deletions src/modules/sv_detection_methods/analyze_split_read_method.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,14 +53,42 @@ void retrieve_aligned_segments(std::string const & sa_string, std::vector<Aligne
}
}

std::pair<int32_t, int32_t> get_mate_positions(AlignedSegment current, AlignedSegment next, int32_t distance_on_read)
{
int32_t mate1_pos;
if (current.orientation == strand::forward)
mate1_pos = current.get_reference_end();
else
mate1_pos = current.get_reference_start();
int32_t mate2_pos;
if (next.orientation == strand::forward)
{
// Correct position of mate 2 for overlapping alignment segments:
// Trim alignment of `next` segment at the start to remove overlap
if (distance_on_read < 0)
mate2_pos = next.get_reference_start() - distance_on_read;
else
mate2_pos = next.get_reference_start();
}
else
{
// Correct position of mate 2 for overlapping alignment segments:
// Trim alignment of `next` segment at the end to remove overlap
if (distance_on_read < 0)
mate2_pos = next.get_reference_end() + distance_on_read;
else
mate2_pos = next.get_reference_end();
}
return std::pair(mate1_pos, mate2_pos);
}

void analyze_aligned_segments(std::vector<AlignedSegment> const & aligned_segments,
std::vector<Junction> & junctions,
seqan3::dna5_vector const & query_sequence,
std::string const & read_name,
int32_t const min_length,
int32_t const max_overlap)
{
size_t tandem_dup_count = 0;
for (size_t i = 1; i < aligned_segments.size(); i++)
{
AlignedSegment current = aligned_segments[i-1];
Expand All @@ -70,58 +98,37 @@ void analyze_aligned_segments(std::vector<AlignedSegment> const & aligned_segmen
// of the read is lower than the given threshold
if (distance_on_read >= -max_overlap)
{
int32_t mate1_pos;
if (current.orientation == strand::forward)
mate1_pos = current.get_reference_end();
else
mate1_pos = current.get_reference_start();
int32_t mate2_pos;
if (next.orientation == strand::forward)
{
// Correct position of mate 2 for overlapping alignment segments:
// Trim alignment of `next` segment at the start to remove overlap
if (distance_on_read < 0)
mate2_pos = next.get_reference_start() - distance_on_read;
else
mate2_pos = next.get_reference_start();
}
else
{
// Correct position of mate 2 for overlapping alignment segments:
// Trim alignment of `next` segment at the end to remove overlap
if (distance_on_read < 0)
mate2_pos = next.get_reference_end() + distance_on_read;
else
mate2_pos = next.get_reference_end();
}
auto [ mate1_pos, mate2_pos ] = get_mate_positions(current, next, distance_on_read);
int32_t distance_on_ref = mate2_pos - mate1_pos - 1;
// Check that the two consecutive alignment segments either
// map to different reference sequences (e.g. translocation, interspersed duplication),
// have a different orientation (e.g. inversion)
// have a large distance on the reference (e.g. deletion, inversion, tandem duplication), or
// have a large distance on the read (e.g. insertion)
if (current.ref_name != next.ref_name || //TODO / QUESTION (irallia 23.06.2021): What about translocation on the same ref?
std::abs(distance_on_ref) >= min_length ||
if (current.ref_name != next.ref_name || // 1: Segments not on same chromosome -> translocation
std::abs(distance_on_ref) >= min_length || // 3: Distance on reference >= minimum SV size -> 4
distance_on_read >= min_length)
{
Breakend mate1{current.ref_name,
mate1_pos,
current.orientation};
Breakend mate2{next.ref_name,
mate2_pos,
next.orientation};
if (distance_on_read < 0)
{
// No inserted sequence between overlapping alignment segments
junctions.emplace_back(mate1, mate2, ""_dna5, tandem_dup_count, read_name);
}
else
Breakend mate1{current.ref_name, mate1_pos, current.orientation};
Breakend mate2{next.ref_name, mate2_pos, next.orientation};
size_t tandem_dup_count = 0;
// if novel inserted sequence
if (distance_on_read > 0)
{
auto inserted_bases = query_sequence | seqan3::views::slice(current.get_query_end(),
next.get_query_start());
junctions.emplace_back(mate1, mate2, inserted_bases, tandem_dup_count, read_name);
if (gVerbose)
seqan3::debug_stream << "INS: " << junctions.back() << "\n";
}
else
{
// No inserted sequence between overlapping alignment segments
junctions.emplace_back(mate1, mate2, ""_dna5, tandem_dup_count, read_name);
if (gVerbose)
seqan3::debug_stream << "BND: " << junctions.back() << "\n";

}
if (gVerbose)
seqan3::debug_stream << "BND: " << junctions.back() << "\n";
}
}
}
Expand All @@ -138,17 +145,11 @@ void analyze_sa_tag(std::string const & query_name,
cmd_arguments const & args,
std::vector<Junction> & junctions)
{

std::vector<AlignedSegment> aligned_segments{};
strand strand = (hasFlagReverseComplement(flag) ? strand::reverse : strand::forward);
aligned_segments.push_back(AlignedSegment{strand, ref_name, pos, mapq, cigar});
retrieve_aligned_segments(sa_tag, aligned_segments);
// sort by query start, query end, mapping quality (in this order):
std::sort(aligned_segments.begin(), aligned_segments.end());
analyze_aligned_segments(aligned_segments,
junctions,
seq,
query_name,
args.min_var_length,
args.max_overlap);
analyze_aligned_segments(aligned_segments, junctions, seq, query_name, args.min_var_length, args.max_overlap);
}
Loading

0 comments on commit a19055b

Please sign in to comment.