Skip to content

Commit

Permalink
feat: Change alts to alt
Browse files Browse the repository at this point in the history
BREAKING CHANGE: VariantRecord.alts no longer contains a list; it has been replaced by VariantRecord.alt
  • Loading branch information
Rapsssito committed Apr 4, 2022
1 parent 2fe8b37 commit 8e1dd86
Show file tree
Hide file tree
Showing 7 changed files with 21 additions and 20 deletions.
11 changes: 6 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

While there is some agreement on how to label SNV and indel variants, this is not the case for structural variants. In the current scenario, differences in labeling between variant callers make comparisons between structural variants difficult. This package provides a unified interface to extract variants (including structural variants) from VCFs generated by different variant callers. Apart from reading the VCF file, the `variant_extractor` **adds a preprocessing layer to homogenize the variants** extracted from the file. This way, the variants can be used in downstream analysis in a consistent way. For more information about the homogenization process, check the [homogenization rules](#homogenization-rules) section.

**WARNING: VCF entries with multiple ALT values are ignored.**
**WARNING: VCF entries with multiple ALT values are ignored. It is highly recommended to normalize the VCF file before processing it.**

## Table of contents<!-- omit in toc -->
- [Usage](#usage)
Expand Down Expand Up @@ -31,8 +31,8 @@ extractor = VariantExtractor()
# Read the VCF file
variants = extractor.read_vcf('/path/to/file.vcf')
# Iterate through the variants
for variant_type, variant_record in variants:
print(f'Found variant of type {variant_type.name()}: {variant_record.contig}:{variant_record.pos}')
for variant_record in variants:
print(f'Found variant of type {variant_record.variant_type.name}: {variant_record.contig}:{variant_record.pos}')
```

For a complete list of examples, see the [examples](./examples/) directory.
Expand All @@ -47,7 +47,8 @@ The `VariantExtractor.read_vcf()` method returns a list of `VariantRecord`. The
| `end` | `int` | End position of the variant in the contig (same as `pos` for TRN and SNV) |
| `id` | `str` | Record identifier |
| `ref` | `str` | Reference sequence |
| `alt` | `list` | List of alternative sequences |
| `alt` | `str` | Alternative sequence |
| `qual` | `Optional[float]` | Quality score for the assertion made in ALT |
| `filter` | `pysam.VariantRecordFilter` | Filter |
| `info` | `pysam.VariantRecordInfo` | Dictionary of information fields |
| `variant_type` | [`VariantType`](#varianttype) | Variant type inferred |
Expand Down Expand Up @@ -91,7 +92,7 @@ The `VariantType` enum is a container for the information about the type of the
The `variant_extractor` package provides a unified interface to extract variants (including structural variants) from VCF files generated by different variant callers. The variants are homogenized and returned by applying the following rules:

### SNVs
Entries with `REF/ALT` of the same length are treated like SNVs. If the `REF/ALT` sequences are longer than one nucleotide, they are divided into multiple SNVs. For example:
Entries with `REF/ALT` of the same length are treated like SNVs. If the `REF/ALT` sequences are longer than one nucleotide, they are divided into multiple atomic SNVs. For example:

| CHROM | POS | ID | REF | ALT | FILTER |
| ----- | --- | ------- | --- | --- | ------ |
Expand Down
4 changes: 2 additions & 2 deletions examples/vcf_to_bamsurgeon.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def variant_callback(variant_record):
# TODO: What with records with different REF/ALT sizes
if variant_record.variant_type == VariantType.SNV:
output_file_snv.write(
f'{variant_record.contig} {variant_record.pos} {variant_record.pos} {VAF} {variant_record.alts[0]}\n')
f'{variant_record.contig} {variant_record.pos} {variant_record.pos} {VAF} {variant_record.alt}\n')
else:
# Add prefix or suffix as insertion. Ex: AAAGGTC[1:12121[
insertion_prefix = ''
Expand Down Expand Up @@ -86,7 +86,7 @@ def variant_callback(variant_record):
else variant_record.pos - variant_record.end
dna_sequence = generate_random_dna(insert_length)
else:
dna_sequence = variant_record.alts[0]
dna_sequence = variant_record.alt
# Check if INDEL
if insert_length < INDEL_THRESHOLD:
output_file_indel.write(
Expand Down
2 changes: 1 addition & 1 deletion examples/vcf_to_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
start_chrom = variant_record.contig.replace('chr', '')
start = variant_record.pos
ref = variant_record.ref
alt = variant_record.alts[0]
alt = variant_record.alt
length = variant_record.end - variant_record.pos
end = variant_record.end
if variant_record.alt_sv_bracket:
Expand Down
4 changes: 2 additions & 2 deletions src/variant_extractor/VariantExtractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,9 +104,9 @@ def __handle_standard_record(self, vcf_record):
if vcf_record.variant_type == VariantType.SNV:
# REF=CTT ALT=ATG -> Normalize to 3 SNVs
for i in range(len(vcf_record.ref)):
if vcf_record.alts[0][i] != vcf_record.ref[i]:
if vcf_record.alt[i] != vcf_record.ref[i]:
new_vcf_record = vcf_record._replace(
ref=vcf_record.ref[i], pos=i+vcf_record.pos, end=i+vcf_record.pos, alts=[vcf_record.alts[0][i]])
ref=vcf_record.ref[i], pos=i+vcf_record.pos, end=i+vcf_record.pos, alt=vcf_record.alt[i])
self.__variants.append(new_vcf_record)
else:
# INS or DEL
Expand Down
8 changes: 4 additions & 4 deletions src/variant_extractor/_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,10 @@ def _permute_bracket_sv(variant_record):
alt_bracket = ']' if variant_record.alt_sv_bracket.bracket == '[' else '['
alt_contig = variant_record.contig
alt_pos = variant_record.pos
new_alts = [f'{alt_prefix}{alt_bracket}{alt_contig}:{alt_pos}{alt_bracket}{alt_suffix}']
new_alt = f'{alt_prefix}{alt_bracket}{alt_contig}:{alt_pos}{alt_bracket}{alt_suffix}'
alt_sv_bracket = BracketSVRecord(alt_prefix, alt_bracket, alt_contig, alt_pos, alt_suffix)
variant_record = variant_record._replace(contig=new_contig, pos=new_pos,
end=new_end, alts=new_alts, alt_sv_bracket=alt_sv_bracket)
end=new_end, alt=new_alt, alt_sv_bracket=alt_sv_bracket)
return variant_record


Expand All @@ -51,11 +51,11 @@ def _convert_inv_to_bracket(variant_record):
alt_1 = f'.]{variant_record.contig}:{variant_record.end}]'
alt_sv_bracket_1 = BracketSVRecord('.', ']', variant_record.contig, variant_record.end, None)
variant_record_1 = variant_record._replace(
pos=variant_record.pos-1, id=variant_record.id+'_1', ref='.', alts=[alt_1], alt_sv_bracket=alt_sv_bracket_1)
pos=variant_record.pos-1, id=variant_record.id+'_1', ref='.', alt=alt_1, alt_sv_bracket=alt_sv_bracket_1)

alt_2 = f'[{variant_record.contig}:{variant_record.end+1}[{variant_record.ref}'
alt_sv_bracket_2 = BracketSVRecord(None, '[', variant_record.contig, variant_record.pos, variant_record.ref)
variant_record_2 = variant_record._replace(
end=variant_record.end+1, id=variant_record.id+'_2', alts=[alt_2], alt_sv_bracket=alt_sv_bracket_2)
end=variant_record.end+1, id=variant_record.id+'_2', alt=alt_2, alt_sv_bracket=alt_sv_bracket_2)

return variant_record_1, variant_record_2
8 changes: 4 additions & 4 deletions src/variant_extractor/_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def _parse_bracket_sv(rec):
variant_type = VariantType.INV

# Create new record
vcf_record = VariantRecord(rec.contig, rec.pos, end_pos, rec.id, rec.ref, rec.alts,
vcf_record = VariantRecord(rec.contig, rec.pos, end_pos, rec.id, rec.ref, rec.alts[0],
rec.qual, rec.filter, rec.info, variant_type, alt_sv_bracket, None)
return vcf_record

Expand Down Expand Up @@ -77,7 +77,7 @@ def _parse_shorthand_sv(rec):
raise ValueError(f'Unknown variant type: {alt_type}')

# Create new record
vcf_record = VariantRecord(rec.contig, rec.pos, rec.stop, rec.id, rec.ref, rec.alts,
vcf_record = VariantRecord(rec.contig, rec.pos, rec.stop, rec.id, rec.ref, rec.alts[0],
rec.qual, rec.filter, rec.info, variant_type, None, alt_sv_shorthand)
return vcf_record

Expand All @@ -88,7 +88,7 @@ def _parse_sgl_sv(rec):
return None
variant_type = VariantType.SGL
# Create new record
vcf_record = VariantRecord(rec.contig, rec.pos, rec.stop, rec.id, rec.ref, rec.alts,
vcf_record = VariantRecord(rec.contig, rec.pos, rec.stop, rec.id, rec.ref, rec.alts[0],
rec.qual, rec.filter, rec.info, variant_type, None, None)
return vcf_record

Expand All @@ -104,6 +104,6 @@ def _parse_standard_record(rec):
else:
variant_type = VariantType.DEL
# Create new record
vcf_record = VariantRecord(rec.contig, rec.pos, rec.stop, rec.id, rec.ref, rec.alts,
vcf_record = VariantRecord(rec.contig, rec.pos, rec.stop, rec.id, rec.ref, rec.alts[0],
rec.qual, rec.filter, rec.info, variant_type, None, None)
return vcf_record
4 changes: 2 additions & 2 deletions src/variant_extractor/variants.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,8 @@ class VariantRecord(NamedTuple):
"""Record identifier"""
ref: str
"""Reference sequence"""
alts: List[str]
"""List of alternative sequences"""
alt: str
"""Alternative sequence"""
qual: Optional[float]
"""Quality score for the assertion made in ALT"""
filter: pysam.VariantRecordFilter
Expand Down

0 comments on commit 8e1dd86

Please sign in to comment.