Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add an option to include call name in BQ table. #677

Merged
merged 2 commits into from
Oct 1, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 12 additions & 8 deletions gcp_variant_transforms/beam_io/vcf_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,11 +186,13 @@ class VariantCall():
variant. It may include associated information such as quality and phasing.
"""

def __init__(self, sample_id=None, genotype=None, phaseset=None, info=None):
# type: (int, List[int], str, Dict[str, Any]) -> None
def __init__(
self, sample_id=None, name=None, genotype=None, phaseset=None, info=None):
# type: (int, str, List[int], str, Dict[str, Any]) -> None
"""Initialize the :class:`VariantCall` object.

Args:
sample_id: Hashed ID for the call name.
name: The name of the call.
genotype: The genotype of this variant call as specified by the VCF
schema. The values are either `0` representing the reference, or a
Expand All @@ -207,13 +209,16 @@ def __init__(self, sample_id=None, genotype=None, phaseset=None, info=None):
header FORMAT.
"""
self.sample_id = sample_id
self.name = name
self.genotype = genotype or []
self.phaseset = phaseset
self.info = info or {}

def __eq__(self, other):
return ((self.sample_id, self.genotype, self.phaseset, self.info) ==
(other.sample_id, other.genotype, other.phaseset, other.info))
return (
(self.sample_id, self.name, self.genotype, self.phaseset, self.info) ==
(other.sample_id, other.name, other.genotype, other.phaseset,
other.info))

def __lt__(self, other):
if self.sample_id != other.sample_id:
Expand All @@ -238,9 +243,8 @@ def __ne__(self, other):
return not self == other

def __repr__(self):
return ', '.join(
[str(s) for s in [
self.sample_id, self.genotype, self.phaseset, self.info]])
return ', '.join([str(s) for s in [
self.sample_id, self.name, self.genotype, self.phaseset, self.info]])


class VcfParser():
Expand Down Expand Up @@ -664,6 +668,6 @@ def _get_variant_calls(self, samples):
if phaseset is None and sample.phased and len(genotype) > 1:
phaseset = DEFAULT_PHASESET_VALUE
encoded_name = self._lookup_encoded_sample_name(name)
calls.append(VariantCall(encoded_name, genotype, phaseset, info))
calls.append(VariantCall(encoded_name, name, genotype, phaseset, info))

return calls
2 changes: 1 addition & 1 deletion gcp_variant_transforms/beam_io/vcf_parser_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ class VariantCallTest(unittest.TestCase):

def _default_variant_call(self):
return vcfio.VariantCall(
sample_id=hash_name('Sample1'), genotype=[1, 0],
sample_id=hash_name('Sample1'), name='Sample1', genotype=[1, 0],
phaseset=vcfio.DEFAULT_PHASESET_VALUE, info={'GQ': 48})

def test_variant_call_order(self):
Expand Down
78 changes: 50 additions & 28 deletions gcp_variant_transforms/beam_io/vcfio_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,11 +93,11 @@ def _get_sample_variant_1(file_name='', use_1_based_coordinate=False,
quality=50, filters=['PASS'],
info={'AF': [0.5, 0.1], 'NS': 1, 'SVTYPE': ['BÑD']})
variant.calls.append(
vcfio.VariantCall(sample_id=hash_name_method('Sample1'), genotype=[0, 0],
info={'GQ': 48}))
vcfio.VariantCall(sample_id=hash_name_method('Sample1'), name='Sample1',
genotype=[0, 0], info={'GQ': 48}))
variant.calls.append(
vcfio.VariantCall(sample_id=hash_name_method('Sample2'), genotype=[1, 0],
info={'GQ': 20}))
vcfio.VariantCall(sample_id=hash_name_method('Sample2'), name='Sample2',
genotype=[1, 0], info={'GQ': 20}))

return variant

Expand All @@ -119,11 +119,12 @@ def _get_sample_variant_2(file_name='', use_1_based_coordinate=False,
alternate_bases=[], names=['rs1234'], quality=40,
filters=['q10', 's50'], info={'NS': 2})
variant.calls.append(
vcfio.VariantCall(sample_id=hash_name_method('Sample1'), genotype=[-1, 0],
phaseset=vcfio.DEFAULT_PHASESET_VALUE, info={'GQ': 48}))
vcfio.VariantCall(sample_id=hash_name_method('Sample1'), name='Sample1',
genotype=[-1, 0], phaseset=vcfio.DEFAULT_PHASESET_VALUE,
info={'GQ': 48}))
variant.calls.append(
vcfio.VariantCall(sample_id=hash_name_method('Sample2'), genotype=[0, -1],
info={'GQ': None}))
vcfio.VariantCall(sample_id=hash_name_method('Sample2'), name='Sample2',
genotype=[0, -1], info={'GQ': None}))
return variant


Expand All @@ -142,10 +143,10 @@ def _get_sample_variant_3(file_name='', use_1_based_coordinate=False,
reference_bases='C', alternate_bases=['<SYMBOLIC>'], quality=49,
filters=['q10'], info={'AF': [0.5]})
variant.calls.append(
vcfio.VariantCall(sample_id=hash_name_method('Sample1'), genotype=[0, 1],
phaseset='1', info={'GQ': 45}))
vcfio.VariantCall(sample_id=hash_name_method('Sample1'), name='Sample1',
genotype=[0, 1], phaseset='1', info={'GQ': 45}))
variant.calls.append(
vcfio.VariantCall(sample_id=hash_name_method('Sample2'),
vcfio.VariantCall(sample_id=hash_name_method('Sample2'), name='Sample2',
genotype=[vcfio.MISSING_GENOTYPE_VALUE],
info={'GQ': None}))
return variant
Expand All @@ -157,8 +158,8 @@ def _get_sample_non_variant(use_1_based_coordinate=False):
reference_name='19', start=1233 + use_1_based_coordinate, end=1236,
reference_bases='C', alternate_bases=['<NON_REF>'], quality=50)
non_variant.calls.append(
vcfio.VariantCall(sample_id=hash_name('Sample1'), genotype=[0, 0],
info={'GQ': 99}))
vcfio.VariantCall(sample_id=hash_name('Sample1'), name='Sample1',
genotype=[0, 0], info={'GQ': 99}))

return non_variant

Expand Down Expand Up @@ -389,7 +390,6 @@ def test_single_file_verify_details_without_encoding(self):
self._create_temp_file_and_return_records_with_file_name(
_SAMPLE_HEADER_LINES + [VCF_LINE_1, VCF_LINE_2, VCF_LINE_3],
sample_name_encoding=SampleNameEncoding.NONE))

variant_1 = _get_sample_variant_1(file_name='', use_hashing=False)
variant_2 = _get_sample_variant_2(file_name='Name1', use_hashing=False)
variant_3 = _get_sample_variant_3(file_name=file_name, use_hashing=False)
Expand Down Expand Up @@ -451,9 +451,11 @@ def test_no_info(self):
expected_variant = Variant(reference_name='chr19', start=122, end=123)
expected_variant.calls.append(
VariantCall(sample_id=hash_name('Sample1'),
name='Sample1',
genotype=[vcfio.MISSING_GENOTYPE_VALUE]))
expected_variant.calls.append(
VariantCall(sample_id=hash_name('Sample2'),
name='Sample2',
genotype=[vcfio.MISSING_GENOTYPE_VALUE]))
read_data = self._create_temp_file_and_read_records(
_SAMPLE_HEADER_LINES + [record_line])
Expand All @@ -476,16 +478,20 @@ def test_info_numbers_and_types(self):
info={'HA': ['a1', 'a2'], 'HG': [1, 2, 3], 'HR': ['a', 'b', 'c'],
'HF': True, 'HU': [0.1]})
variant_1.calls.append(VariantCall(sample_id=hash_name('Sample1'),
name='Sample1',
genotype=[1, 0]))
variant_1.calls.append(VariantCall(sample_id=hash_name('Sample2'),
name='Sample2',
genotype=[0, 1]))
variant_2 = Variant(
reference_name='19', start=123, end=124, reference_bases='A',
alternate_bases=['T'],
info={'HG': [3, 4, 5], 'HR': ['d', 'e'], 'HU': [1.1, 1.2]})
variant_2.calls.append(VariantCall(sample_id=hash_name('Sample1'),
name='Sample1',
genotype=[0, 0]))
variant_2.calls.append(VariantCall(sample_id=hash_name('Sample2'),
name='Sample2',
genotype=[0, 1]))
read_data = self._create_temp_file_and_read_records(
info_headers + _SAMPLE_HEADER_LINES[1:] + record_lines)
Expand All @@ -509,8 +515,10 @@ def test_use_of_representative_header(self):
reference_name='19', start=1, end=2, reference_bases='A',
alternate_bases=['T'], info={'HU': ['a', 'b']})
variant.calls.append(VariantCall(sample_id=hash_name('Sample1'),
name='Sample1',
genotype=[0, 0]))
variant.calls.append(VariantCall(sample_id=hash_name('Sample2'),
name='Sample2',
genotype=[0, 1]))

# `file_headers` is used.
Expand Down Expand Up @@ -547,13 +555,13 @@ def test_use_of_representative_header_two_files(self):
reference_name='9', start=1, end=2, reference_bases='A',
alternate_bases=['T'], info={'HU': ['a', 'b']})
variant_1.calls.append(VariantCall(sample_id=hash_name('Sample1'),
genotype=[0, 0]))
name='Sample1', genotype=[0, 0]))

variant_2 = Variant(
reference_name='19', start=1, end=2, reference_bases='A',
alternate_bases=['T'], info={'HU': ['a', 'b']})
variant_2.calls.append(VariantCall(sample_id=hash_name('Sample2'),
genotype=[0, 1]))
name='Sample2', genotype=[0, 1]))

read_data_1 = self._create_temp_file_and_read_records(
file_content_1, representative_header_lines)
Expand All @@ -574,15 +582,19 @@ def test_end_info_key(self):
reference_name='19', start=122, end=1111, reference_bases='A',
alternate_bases=['T'])
variant_1.calls.append(VariantCall(sample_id=hash_name('Sample1'),
name='Sample1',
genotype=[1, 0]))
variant_1.calls.append(VariantCall(sample_id=hash_name('Sample2'),
name='Sample2',
genotype=[0, 1]))
variant_2 = Variant(
reference_name='19', start=122, end=123, reference_bases='A',
alternate_bases=['T'])
variant_2.calls.append(VariantCall(sample_id=hash_name('Sample1'),
name='Sample1',
genotype=[0, 1]))
variant_2.calls.append(VariantCall(sample_id=hash_name('Sample2'),
name='Sample2',
genotype=[1, 1]))
read_data = self._create_temp_file_and_read_records(
[end_info_header_line] + _SAMPLE_HEADER_LINES[1:] + record_lines)
Expand All @@ -597,8 +609,10 @@ def test_end_info_key_unknown_number(self):
reference_name='19', start=122, end=1111, reference_bases='A',
alternate_bases=['T'])
variant_1.calls.append(VariantCall(sample_id=hash_name('Sample1'),
name='Sample1',
genotype=[1, 0]))
variant_1.calls.append(VariantCall(sample_id=hash_name('Sample2'),
name='Sample2',
genotype=[0, 1]))
read_data = self._create_temp_file_and_read_records(
[end_info_header_line] + _SAMPLE_HEADER_LINES[1:] + record_lines)
Expand All @@ -613,8 +627,10 @@ def test_end_info_key_unknown_number_invalid(self):
reference_name='19', start=122, end=150, reference_bases='A',
alternate_bases=['T'])
variant.calls.append(VariantCall(sample_id=hash_name('Sample1'),
name='Sample1',
genotype=[1, 0]))
variant.calls.append(VariantCall(sample_id=hash_name('Sample2'),
name='Sample2',
genotype=[0, 1]))
read_data = self._create_temp_file_and_read_records(
[end_info_header_line] + _SAMPLE_HEADER_LINES[1:] +
Expand Down Expand Up @@ -646,19 +662,19 @@ def test_custom_phaseset(self):
reference_name='19', start=122, end=123, reference_bases='A',
alternate_bases=['T'])
variant_1.calls.append(
VariantCall(sample_id=hash_name('Sample1'), genotype=[1, 0],
phaseset='1111'))
VariantCall(sample_id=hash_name('Sample1'), name='Sample1',
genotype=[1, 0], phaseset='1111'))
variant_1.calls.append(VariantCall(sample_id=hash_name('Sample2'),
genotype=[0, 1]))
name='Sample2', genotype=[0, 1]))
variant_2 = Variant(
reference_name='19', start=120, end=121, reference_bases='A',
alternate_bases=['T'])
variant_2.calls.append(
VariantCall(sample_id=hash_name('Sample1'), genotype=[1, 0],
phaseset='2222'))
VariantCall(sample_id=hash_name('Sample1'), name='Sample1',
genotype=[1, 0], phaseset='2222'))
variant_2.calls.append(
VariantCall(sample_id=hash_name('Sample2'), genotype=[0, 1],
phaseset='2222'))
VariantCall(sample_id=hash_name('Sample2'), name='Sample2',
genotype=[0, 1], phaseset='2222'))
read_data = self._create_temp_file_and_read_records(
[phaseset_header_line] + _SAMPLE_HEADER_LINES[1:] + record_lines)
self.assertEqual(2, len(read_data))
Expand All @@ -681,11 +697,13 @@ def test_format_numbers(self):
alternate_bases=['T', 'C'])
expected_variant.calls.append(VariantCall(
sample_id=hash_name('Sample1'),
name='Sample1',
genotype=[1, 0],
info={'FU': ['a1'], 'F1': 3, 'F2': ['a', 'b'], 'AO': [1],
'AD': [3, 4]}))
expected_variant.calls.append(VariantCall(
sample_id=hash_name('Sample2'),
name='Sample2',
genotype=[0, 1],
info={'FU': ['a2', 'a3'], 'F1': 4, 'F2': ['b', 'c'], 'AO': [1, 2],
'AD':[3]}))
Expand Down Expand Up @@ -889,10 +907,12 @@ def test_missing_info_key(self):
coder = self._get_coder()
variant = Variant()
variant.calls.append(VariantCall(sample_id=hash_name('Sample1'),
name='Sample1',
genotype=[0, 1],
info={'GQ': 10, 'AF': 20}))
variant.calls.append(VariantCall(
sample_id=hash_name('Sample2'), genotype=[0, 1], info={'AF': 20}))
variant.calls.append(VariantCall(sample_id=hash_name('Sample2'),
name='Sample2', genotype=[0, 1],
info={'AF': 20}))
expected = ('. . . . . . . . GT:AF:GQ 0/1:20:10 '
'0/1:20:.\n')

Expand All @@ -903,6 +923,7 @@ def test_info_list(self):
coder = self._get_coder()
variant = Variant()
variant.calls.append(VariantCall(sample_id=hash_name('Sample'),
name='Sample',
genotype=[0, 1],
info={'LI': [1, None, 3]}))
expected = '. . . . . . . . GT:LI 0/1:1,.,3\n'
Expand All @@ -928,7 +949,8 @@ def test_empty_sample_calls(self):
coder = self._get_coder()
variant = Variant()
variant.calls.append(
VariantCall(sample_id=hash_name('Sample2'), genotype=-1))
VariantCall(sample_id=hash_name('Sample2'), name='Sample2',
genotype=-1))
expected = '. . . . . . . . GT .\n'
self._assert_variant_lines_equal(
coder.encode(variant).decode('utf-8'), expected)
Expand All @@ -937,7 +959,7 @@ def test_missing_genotype(self):
coder = self._get_coder()
variant = Variant()
variant.calls.append(
VariantCall(sample_id=hash_name('Sample'),
VariantCall(sample_id=hash_name('Sample'), name='Sample',
genotype=[1, vcfio.MISSING_GENOTYPE_VALUE]))
expected = '. . . . . . . . GT 1/.\n'

Expand All @@ -948,7 +970,7 @@ def test_triploid_genotype(self):
coder = self._get_coder()
variant = Variant()
variant.calls.append(VariantCall(
sample_id=hash_name('Sample'), genotype=[1, 0, 1]))
sample_id=hash_name('Sample'), name='Sample', genotype=[1, 0, 1]))
expected = '. . . . . . . . GT 1/0/1\n'

self._assert_variant_lines_equal(
Expand Down
15 changes: 12 additions & 3 deletions gcp_variant_transforms/libs/bigquery_row_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,13 +43,15 @@ def __init__(
schema_descriptor, # type: bigquery_schema_descriptor.SchemaDescriptor
conflict_resolver=None,
# type: vcf_field_conflict_resolver.ConflictResolver
null_numeric_value_replacement=None # type: int
null_numeric_value_replacement=None, # type: int
include_call_name=False # type: bool
):
# type: (...) -> None
self._schema_descriptor = schema_descriptor
self._conflict_resolver = conflict_resolver
self._bigquery_field_sanitizer = bigquery_sanitizer.FieldSanitizer(
null_numeric_value_replacement)
self._include_call_name = include_call_name

def _get_bigquery_field_entry(
self,
Expand Down Expand Up @@ -224,7 +226,9 @@ def get_rows(self,
num_calls_in_row = 0
for call in variant.calls:
call_record, empty = self._get_call_record(
call, call_record_schema_descriptor, allow_incompatible_records)
call,
call_record_schema_descriptor,
allow_incompatible_records)

if omit_empty_sample_calls and empty:
continue
Expand All @@ -240,7 +244,7 @@ def _get_call_record(
self,
call, # type: vcfio.VariantCall
schema_descriptor, # type: bigquery_schema_descriptor.SchemaDescriptor
allow_incompatible_records, # type: bool
allow_incompatible_records # type: bool
):
# type: (...) -> (Dict[str, Any], bool)
call_record, is_empty = super()._get_call_record(
Expand All @@ -249,6 +253,11 @@ def _get_call_record(
bigquery_util.ColumnKeyConstants.CALLS_SAMPLE_ID:
self._bigquery_field_sanitizer.get_sanitized_field(call.sample_id)
})
if self._include_call_name:
call_record.update({
bigquery_util.ColumnKeyConstants.CALLS_NAME:
self._bigquery_field_sanitizer.get_sanitized_field(call.name)
})
return call_record, is_empty

def _get_base_row_from_variant(self, variant, allow_incompatible_records):
Expand Down
Loading