From 1e60b91ec7f62fecae4ab1e9374efb7c66c1b9d3 Mon Sep 17 00:00:00 2001 From: vandrw Date: Wed, 15 Nov 2023 20:16:08 +0100 Subject: [PATCH 1/3] fix #104 Add check for dot symbol and warn user --- selfies/utils/encoding_utils.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/selfies/utils/encoding_utils.py b/selfies/utils/encoding_utils.py index 411c3534..05347c75 100644 --- a/selfies/utils/encoding_utils.py +++ b/selfies/utils/encoding_utils.py @@ -47,7 +47,16 @@ def selfies_to_encoding( selfies += "[nop]" * (pad_to_len - len_selfies(selfies)) # integer encode - char_list = split_selfies(selfies) + char_list = list(split_selfies(selfies)) + + # Check if SELFIES string contains unconnected molecules + if "." in list(char_list) and not "." in vocab_stoi: + raise ValueError( + "The SELFIES string contains two unconnected molecules " + "(given by the '.' character), but vocab_stoi does not " + "contain the '.' key. Please add it or separate the molecules." + ) + integer_encoded = [vocab_stoi[char] for char in char_list] if enc_type == "label": From 1d22f1d517188c1f858609c5416d2ee310a3b847 Mon Sep 17 00:00:00 2001 From: vandrw Date: Sun, 19 Nov 2023 12:54:33 +0100 Subject: [PATCH 2/3] Move search for dot symbol in try-except --- selfies/utils/encoding_utils.py | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/selfies/utils/encoding_utils.py b/selfies/utils/encoding_utils.py index 05347c75..a2aabf31 100644 --- a/selfies/utils/encoding_utils.py +++ b/selfies/utils/encoding_utils.py @@ -47,17 +47,20 @@ def selfies_to_encoding( selfies += "[nop]" * (pad_to_len - len_selfies(selfies)) # integer encode - char_list = list(split_selfies(selfies)) - - # Check if SELFIES string contains unconnected molecules - if "." in list(char_list) and not "." in vocab_stoi: - raise ValueError( - "The SELFIES string contains two unconnected molecules " - "(given by the '.' character), but vocab_stoi does not " - "contain the '.' key. Please add it or separate the molecules." - ) - - integer_encoded = [vocab_stoi[char] for char in char_list] + char_list = split_selfies(selfies) + + try: + integer_encoded = [vocab_stoi[char] for char in char_list] + except KeyError as e: + if e.args[0] == ".": + raise KeyError( + "The SELFIES string contains two unconnected molecules " + "(given by the '.' character), but vocab_stoi does not " + "contain the '.' key. Please add it to the vocabulary " + "or separate the molecules." + ) + raise KeyError(e.args[0]) + if enc_type == "label": return integer_encoded From 00756c686b0e1368b8559500d62992ec4c33abdf Mon Sep 17 00:00:00 2001 From: vandrw Date: Wed, 22 Nov 2023 00:40:45 +0100 Subject: [PATCH 3/3] Add one-pass check for unconnected molecules --- selfies/utils/encoding_utils.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/selfies/utils/encoding_utils.py b/selfies/utils/encoding_utils.py index a2aabf31..05d31c1c 100644 --- a/selfies/utils/encoding_utils.py +++ b/selfies/utils/encoding_utils.py @@ -47,21 +47,18 @@ def selfies_to_encoding( selfies += "[nop]" * (pad_to_len - len_selfies(selfies)) # integer encode - char_list = split_selfies(selfies) - - try: - integer_encoded = [vocab_stoi[char] for char in char_list] - except KeyError as e: - if e.args[0] == ".": + integer_encoded = [] + for char in split_selfies(selfies): + if (char == ".") and ("." not in vocab_stoi): raise KeyError( "The SELFIES string contains two unconnected molecules " "(given by the '.' character), but vocab_stoi does not " "contain the '.' key. Please add it to the vocabulary " "or separate the molecules." ) - raise KeyError(e.args[0]) - + integer_encoded.append(vocab_stoi[char]) + if enc_type == "label": return integer_encoded