From 07c12534fa951cbe7c70c7b34f955f9ad6cd12ee Mon Sep 17 00:00:00 2001 From: tjmier Date: Sun, 16 Jun 2024 11:38:13 -0500 Subject: [PATCH 1/5] Added a new parameter 'chain_id' to the function 'get_sequence' in the module 'biotite.structure.io.pdb' to return a dictionary mapping chain_id to the sequence --- src/biotite/structure/io/pdbx/convert.py | 31 +++++++++++++++++++----- 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/src/biotite/structure/io/pdbx/convert.py b/src/biotite/structure/io/pdbx/convert.py index f5c896175..9d011acb0 100644 --- a/src/biotite/structure/io/pdbx/convert.py +++ b/src/biotite/structure/io/pdbx/convert.py @@ -110,8 +110,8 @@ def _filter(category, index): }) -def get_sequence(pdbx_file, data_block=None): - """ +def get_sequence(pdbx_file, data_block=None, chain_ids=False): + """"" Get the protein and nucleotide sequences from the ``entity_poly.pdbx_seq_one_letter_code_can`` entry. @@ -131,12 +131,18 @@ def get_sequence(pdbx_file, data_block=None): file. If the data block object is passed directly to `pdbx_file`, this parameter is ignored. + chain_ids : bool, optional + If True, a dictionary mapping chain IDs to sequences is returned. + If False or not given, all sequences are returned. + Default is False. Returns ------- - sequences : list of Sequence - The protein and nucleotide sequences for each entity - (equivalent to chains in most cases). + sequences : list of Sequence or dict + If `chain_ids` is False, returns a list of protein and nucleotide + sequences for each entity. + If `chain_ids` is True, returns a dictionary where each key is a + chain ID and each value is the corresponding sequence. """ block = _get_block(pdbx_file, data_block) @@ -148,7 +154,20 @@ def get_sequence(pdbx_file, data_block=None): sequence = _convert_string_to_sequence(string, stype) if sequence is not None: sequences.append(sequence) - return sequences + + if chain_ids: + strand_ids = poly_category['pdbx_strand_id'].as_array(str) + strand_ids = [strand_id.split(",") for strand_id in strand_ids] + + strand_ids_to_seq_dict = {} + for entity, strand_ids in enumerate(strand_ids): + for strand_id in strand_ids: + strand_ids_to_seq_dict[strand_id] = sequences[entity-1] + + return strand_ids_to_seq_dict + + else: + return sequences def get_model_count(pdbx_file, data_block=None): From d5eb03391aa3c9f0c57b21b26cd80f50a92732d6 Mon Sep 17 00:00:00 2001 From: tjmier Date: Mon, 17 Jun 2024 13:57:21 -0500 Subject: [PATCH 2/5] updated get_sequence to return dict of sequences with entity_poly.pdbx_strand_id as keys. Updated test_pdx.py test_get_sequence to reflect returning a dict instead of a list --- src/biotite/structure/io/pdbx/convert.py | 59 ++++++++++++------------ tests/structure/test_pdbx.py | 24 +++++----- 2 files changed, 42 insertions(+), 41 deletions(-) diff --git a/src/biotite/structure/io/pdbx/convert.py b/src/biotite/structure/io/pdbx/convert.py index 9d011acb0..15665c767 100644 --- a/src/biotite/structure/io/pdbx/convert.py +++ b/src/biotite/structure/io/pdbx/convert.py @@ -110,7 +110,7 @@ def _filter(category, index): }) -def get_sequence(pdbx_file, data_block=None, chain_ids=False): +def get_sequence(pdbx_file, data_block=None): """"" Get the protein and nucleotide sequences from the ``entity_poly.pdbx_seq_one_letter_code_can`` entry. @@ -131,43 +131,44 @@ def get_sequence(pdbx_file, data_block=None, chain_ids=False): file. If the data block object is passed directly to `pdbx_file`, this parameter is ignored. - chain_ids : bool, optional - If True, a dictionary mapping chain IDs to sequences is returned. - If False or not given, all sequences are returned. - Default is False. Returns ------- - sequences : list of Sequence or dict - If `chain_ids` is False, returns a list of protein and nucleotide - sequences for each entity. - If `chain_ids` is True, returns a dictionary where each key is a - chain ID and each value is the corresponding sequence. + sequence_dict : Dictionary of Sequences + Dictionary keys are derived from ``entity_poly.pdbx_strand_id`` + (often equivalent to chain_id and atom_site.auth_asym_id + in most cases). Dictionary values are sequences. + + Notes + ----- + The ``entity_poly.pdbx_seq_one_letter_code_can`` field contains the initial + complete sequence. If the structure represents a truncated or spliced + version of this initial sequence, it will include only a subset of the + initial sequence. Use biotite.structure.get_residues to retrieve only + the residues that are represented in the structure. """ block = _get_block(pdbx_file, data_block) - poly_category= block["entity_poly"] + seq_string = poly_category["pdbx_seq_one_letter_code_can"].as_array(str) seq_type = poly_category["type"].as_array(str) - sequences = [] - for string, stype in zip(seq_string, seq_type): - sequence = _convert_string_to_sequence(string, stype) - if sequence is not None: - sequences.append(sequence) - - if chain_ids: - strand_ids = poly_category['pdbx_strand_id'].as_array(str) - strand_ids = [strand_id.split(",") for strand_id in strand_ids] - - strand_ids_to_seq_dict = {} - for entity, strand_ids in enumerate(strand_ids): - for strand_id in strand_ids: - strand_ids_to_seq_dict[strand_id] = sequences[entity-1] - - return strand_ids_to_seq_dict + + sequences = [ + _convert_string_to_sequence(string, stype) + for string, stype in zip(seq_string, seq_type) + if _convert_string_to_sequence(string, stype) is not None + ] - else: - return sequences + strand_ids = poly_category['pdbx_strand_id'].as_array(str) + strand_ids = [strand_id.split(",") for strand_id in strand_ids] + + sequence_dict = { + strand_id: sequence + for sequence, strand_ids in zip(sequences, strand_ids) + for strand_id in strand_ids + } + + return sequence_dict def get_model_count(pdbx_file, data_block=None): diff --git a/tests/structure/test_pdbx.py b/tests/structure/test_pdbx.py index 8523b8b94..68d166271 100644 --- a/tests/structure/test_pdbx.py +++ b/tests/structure/test_pdbx.py @@ -457,17 +457,17 @@ def test_get_sequence(format): File = pdbx.BinaryCIFFile pdbx_file = File.read(join(data_dir("structure"), f"5ugo.{format}")) - sequences = pdbx.get_sequence(pdbx_file) + sequences_1 = pdbx.get_sequence(pdbx_file) pdbx_file = File.read(join(data_dir("structure"), f"4gxy.{format}")) - sequences += pdbx.get_sequence(pdbx_file) - assert str(sequences[0]) == "CCGACGGCGCATCAGC" - assert type(sequences[0]) is seq.NucleotideSequence - assert str(sequences[1]) == "GCTGATGCGCC" - assert type(sequences[1]) is seq.NucleotideSequence - assert str(sequences[2]) == "GTCGG" - assert type(sequences[2]) is seq.NucleotideSequence + sequences_2 = pdbx.get_sequence(pdbx_file) + assert str(sequences_1['T']) == "CCGACGGCGCATCAGC" + assert type(sequences_1['T']) is seq.NucleotideSequence + assert str(sequences_1['P']) == "GCTGATGCGCC" + assert type(sequences_1['P']) is seq.NucleotideSequence + assert str(sequences_1['D']) == "GTCGG" + assert type(sequences_1['D']) is seq.NucleotideSequence assert ( - str(sequences[3]) == "MSKRKAPQETLNGGITDMLTELANFEKNVSQAIHKYN" + str(sequences_1['A']) == "MSKRKAPQETLNGGITDMLTELANFEKNVSQAIHKYN" "AYRKAASVIAKYPHKIKSGAEAKKLPGVGTKIAEKIDEFLATGKLRKLEKIRQD" "DTSSSINFLTRVSGIGPSAARKFVDEGIKTLEDLRKNEDKLNHHQRIGLKYFGD" "FEKRIPREEMLQMQDIVLNEVKKVDSEYIATVCGSFRRGAESSGDMDVLLTHPS" @@ -475,14 +475,14 @@ def test_get_sequence(format): "RIDIRLIPKDQYYCGVLYFTGSDIFNKNMRAHALEKGFTINEYTIRPLGVTGVA" "GEPLPVDSEKDIFDYIQWKYREPKDRSE" ) - assert type(sequences[3]) is seq.ProteinSequence + assert type(sequences_1['A']) is seq.ProteinSequence assert ( - str(sequences[4]) == "GGCGGCAGGTGCTCCCGACCCTGCGGTCGGGAGTTAA" + str(sequences_2['A']) == "GGCGGCAGGTGCTCCCGACCCTGCGGTCGGGAGTTAA" "AAGGGAAGCCGGTGCAAGTCCGGCACGGTCCCGCCACTGTGACGGGGAGTCGCC" "CCTCGGGATGTGCCACTGGCCCGAAGGCCGGGAAGGCGGAGGGGCGGCGAGGAT" "CCGGAGTCAGGAAACCTGCCTGCCGTC" ) - assert type(sequences[4]) is seq.NucleotideSequence + assert type(sequences_2['A']) is seq.NucleotideSequence def test_bcif_encoding(): From e79237e28d0a60fe1aae6dc79de55a775ec16ebb Mon Sep 17 00:00:00 2001 From: tjmier Date: Tue, 18 Jun 2024 19:05:57 -0500 Subject: [PATCH 3/5] update local branch with remote branch --- src/biotite/structure/io/pdbx/convert.py | 31 +++++++++++++++++++----- 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/src/biotite/structure/io/pdbx/convert.py b/src/biotite/structure/io/pdbx/convert.py index f5c896175..9d011acb0 100644 --- a/src/biotite/structure/io/pdbx/convert.py +++ b/src/biotite/structure/io/pdbx/convert.py @@ -110,8 +110,8 @@ def _filter(category, index): }) -def get_sequence(pdbx_file, data_block=None): - """ +def get_sequence(pdbx_file, data_block=None, chain_ids=False): + """"" Get the protein and nucleotide sequences from the ``entity_poly.pdbx_seq_one_letter_code_can`` entry. @@ -131,12 +131,18 @@ def get_sequence(pdbx_file, data_block=None): file. If the data block object is passed directly to `pdbx_file`, this parameter is ignored. + chain_ids : bool, optional + If True, a dictionary mapping chain IDs to sequences is returned. + If False or not given, all sequences are returned. + Default is False. Returns ------- - sequences : list of Sequence - The protein and nucleotide sequences for each entity - (equivalent to chains in most cases). + sequences : list of Sequence or dict + If `chain_ids` is False, returns a list of protein and nucleotide + sequences for each entity. + If `chain_ids` is True, returns a dictionary where each key is a + chain ID and each value is the corresponding sequence. """ block = _get_block(pdbx_file, data_block) @@ -148,7 +154,20 @@ def get_sequence(pdbx_file, data_block=None): sequence = _convert_string_to_sequence(string, stype) if sequence is not None: sequences.append(sequence) - return sequences + + if chain_ids: + strand_ids = poly_category['pdbx_strand_id'].as_array(str) + strand_ids = [strand_id.split(",") for strand_id in strand_ids] + + strand_ids_to_seq_dict = {} + for entity, strand_ids in enumerate(strand_ids): + for strand_id in strand_ids: + strand_ids_to_seq_dict[strand_id] = sequences[entity-1] + + return strand_ids_to_seq_dict + + else: + return sequences def get_model_count(pdbx_file, data_block=None): From 997469956d6159a3b751d4da6688363845fedb81 Mon Sep 17 00:00:00 2001 From: tjmier Date: Tue, 18 Jun 2024 19:15:23 -0500 Subject: [PATCH 4/5] fix merge mistake --- src/biotite/structure/io/pdbx/convert.py | 39 +----------------------- 1 file changed, 1 insertion(+), 38 deletions(-) diff --git a/src/biotite/structure/io/pdbx/convert.py b/src/biotite/structure/io/pdbx/convert.py index 718214712..b91b3a0d5 100644 --- a/src/biotite/structure/io/pdbx/convert.py +++ b/src/biotite/structure/io/pdbx/convert.py @@ -110,11 +110,7 @@ def _filter(category, index): }) -<<<<<<< HEAD def get_sequence(pdbx_file, data_block=None): -======= -def get_sequence(pdbx_file, data_block=None, chain_ids=False): ->>>>>>> master """"" Get the protein and nucleotide sequences from the ``entity_poly.pdbx_seq_one_letter_code_can`` entry. @@ -135,14 +131,9 @@ def get_sequence(pdbx_file, data_block=None, chain_ids=False): file. If the data block object is passed directly to `pdbx_file`, this parameter is ignored. - chain_ids : bool, optional - If True, a dictionary mapping chain IDs to sequences is returned. - If False or not given, all sequences are returned. - Default is False. Returns ------- -<<<<<<< HEAD sequence_dict : Dictionary of Sequences Dictionary keys are derived from ``entity_poly.pdbx_strand_id`` (often equivalent to chain_id and atom_site.auth_asym_id @@ -155,20 +146,13 @@ def get_sequence(pdbx_file, data_block=None, chain_ids=False): version of this initial sequence, it will include only a subset of the initial sequence. Use biotite.structure.get_residues to retrieve only the residues that are represented in the structure. -======= - sequences : list of Sequence or dict - If `chain_ids` is False, returns a list of protein and nucleotide - sequences for each entity. - If `chain_ids` is True, returns a dictionary where each key is a - chain ID and each value is the corresponding sequence. ->>>>>>> master """ + block = _get_block(pdbx_file, data_block) poly_category= block["entity_poly"] seq_string = poly_category["pdbx_seq_one_letter_code_can"].as_array(str) seq_type = poly_category["type"].as_array(str) -<<<<<<< HEAD sequences = [ _convert_string_to_sequence(string, stype) @@ -186,27 +170,6 @@ def get_sequence(pdbx_file, data_block=None, chain_ids=False): } return sequence_dict -======= - sequences = [] - for string, stype in zip(seq_string, seq_type): - sequence = _convert_string_to_sequence(string, stype) - if sequence is not None: - sequences.append(sequence) - - if chain_ids: - strand_ids = poly_category['pdbx_strand_id'].as_array(str) - strand_ids = [strand_id.split(",") for strand_id in strand_ids] - - strand_ids_to_seq_dict = {} - for entity, strand_ids in enumerate(strand_ids): - for strand_id in strand_ids: - strand_ids_to_seq_dict[strand_id] = sequences[entity-1] - - return strand_ids_to_seq_dict - - else: - return sequences ->>>>>>> master def get_model_count(pdbx_file, data_block=None): From 89297ac1334d1e28242ea6dab60eca4d5730f2f8 Mon Sep 17 00:00:00 2001 From: tjmier Date: Tue, 18 Jun 2024 21:28:50 -0500 Subject: [PATCH 5/5] updated get_sequence, dependent tests, and example --- doc/examples/scripts/sequence/residue_coevolution.py | 2 +- src/biotite/structure/io/pdbx/convert.py | 2 +- tests/database/test_rcsb.py | 2 +- tests/structure/test_sequence.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/examples/scripts/sequence/residue_coevolution.py b/doc/examples/scripts/sequence/residue_coevolution.py index a45e7bdb9..e1f2f7329 100644 --- a/doc/examples/scripts/sequence/residue_coevolution.py +++ b/doc/examples/scripts/sequence/residue_coevolution.py @@ -58,7 +58,7 @@ # Get structure and sequence pdbx_file = pdbx.CIFFile.read(rcsb.fetch("1GUU", "mmcif")) -sequence = pdbx.get_sequence(pdbx_file)[0] +sequence = pdbx.get_sequence(pdbx_file)['A'] # 'use_author_fields' is set to false, # to ensure that values in the 'res_id' annotation point to the sequence structure = pdbx.get_structure(pdbx_file, model=1, use_author_fields=False) diff --git a/src/biotite/structure/io/pdbx/convert.py b/src/biotite/structure/io/pdbx/convert.py index b91b3a0d5..0a149d186 100644 --- a/src/biotite/structure/io/pdbx/convert.py +++ b/src/biotite/structure/io/pdbx/convert.py @@ -157,7 +157,6 @@ def get_sequence(pdbx_file, data_block=None): sequences = [ _convert_string_to_sequence(string, stype) for string, stype in zip(seq_string, seq_type) - if _convert_string_to_sequence(string, stype) is not None ] strand_ids = poly_category['pdbx_strand_id'].as_array(str) @@ -167,6 +166,7 @@ def get_sequence(pdbx_file, data_block=None): strand_id: sequence for sequence, strand_ids in zip(sequences, strand_ids) for strand_id in strand_ids + if sequence is not None } return sequence_dict diff --git a/tests/database/test_rcsb.py b/tests/database/test_rcsb.py index 715bb0dc5..415b713a4 100644 --- a/tests/database/test_rcsb.py +++ b/tests/database/test_rcsb.py @@ -142,7 +142,7 @@ def test_search_field(field, molecular_definition, params, ref_ids): def test_search_sequence(): IDENTIY_CUTOFF = 0.9 pdbx_file = pdbx.PDBxFile.read(join(data_dir("structure"), "1l2y.cif")) - ref_sequence = pdbx.get_sequence(pdbx_file)[0] + ref_sequence = pdbx.get_sequence(pdbx_file)['A'] query = rcsb.SequenceQuery( ref_sequence, "protein", min_identity=IDENTIY_CUTOFF ) diff --git a/tests/structure/test_sequence.py b/tests/structure/test_sequence.py index 2d4efb516..098958824 100644 --- a/tests/structure/test_sequence.py +++ b/tests/structure/test_sequence.py @@ -53,7 +53,7 @@ def test_pdbx_sequence_consistency(path): def _find_best_match(sequence, ref_sequences): best_alignment = None best_identity = 0.0 - for ref_sequence in ref_sequences: + for ref_sequence in ref_sequences.values(): if type(sequence) != type(ref_sequence): continue if isinstance(sequence, seq.ProteinSequence):