Skip to content

Commit

Permalink
Fix improper quote escaping
Browse files Browse the repository at this point in the history
  • Loading branch information
padix-key committed Sep 1, 2024
1 parent de8c985 commit f794895
Show file tree
Hide file tree
Showing 2 changed files with 78 additions and 50 deletions.
126 changes: 76 additions & 50 deletions src/biotite/structure/io/pdbx/cif.py
Original file line number Diff line number Diff line change
Expand Up @@ -370,7 +370,7 @@ def deserialize(text, expect_whitespace=True):
if category_name is None:
raise DeserializationError("Failed to parse category name")

lines = _to_single(lines, is_looped)
lines = _to_single(lines)
if is_looped:
category_dict = CIFCategory._deserialize_looped(lines, expect_whitespace)
else:
Expand Down Expand Up @@ -439,11 +439,28 @@ def _deserialize_single(lines):
Process a category where each field has a single value.
"""
category_dict = {}
for line in lines:
line_i = 0
while line_i < len(lines):
line = lines[line_i]
parts = _split_one_line(line)
column_name = parts[0].split(".")[1]
column = parts[1]
category_dict[column_name] = CIFColumn(column)
if len(parts) == 2:
# Standard case -> name and value in one line
name_part, value_part = parts
line_i += 1
elif len(parts) == 1:
# Value is a multiline value on the next line
name_part = parts[0]
parts = _split_one_line(lines[line_i + 1])
if len(parts) == 1:
value_part = parts[0]
else:
raise DeserializationError(f"Failed to parse line '{line}'")
line_i += 2
elif len(parts) == 0:
raise DeserializationError("Empty line within category")
else:
raise DeserializationError(f"Failed to parse line '{line}'")
category_dict[name_part.split(".")[1]] = CIFColumn(value_part)
return category_dict

@staticmethod
Expand All @@ -468,7 +485,7 @@ def _deserialize_looped(lines, expect_whitespace):
data_lines = lines[i:]
# Rows may be split over multiple lines -> do not rely on
# row-line-alignment at all and simply cycle through columns
column_names = itertools.cycle(column_names)
column_indices = itertools.cycle(range(len(column_names)))
for data_line in data_lines:
# If whitespace is expected in quote protected values,
# use regex-based _split_one_line() to split
Expand All @@ -485,9 +502,18 @@ def _deserialize_looped(lines, expect_whitespace):
):
values[k] = values[k][1:-1]
for val in values:
column_name = next(column_names)
column_index = next(column_indices)
column_name = column_names[column_index]
category_dict[column_name].append(val)

# Check if all columns have the same length
# Otherwise, this would indicate a parsing error or an invalid CIF file
column_index = next(column_indices)
if column_index != 0:
raise DeserializationError(
"Category contains columns with different lengths"
)

return category_dict

def _serialize_single(self):
Expand All @@ -496,7 +522,7 @@ def _serialize_single(self):
# "+3" Because of three whitespace chars after longest key
req_len = max_len + 3
return [
key.ljust(req_len) + _multiline(_quote(column.as_item()))
key.ljust(req_len) + _escape(column.as_item())
for key, column in zip(keys, self.values())
]

Expand All @@ -508,7 +534,7 @@ def _serialize_looped(self):
array = column.as_array(str)
# Quote before measuring the number of chars,
# as the quote characters modify the length
array = np.array([_multiline(_quote(element)) for element in array])
array = np.array([_escape(element) for element in array])
column_arrays.append(array)

# Number of characters the longest string in the column needs
Expand Down Expand Up @@ -927,52 +953,50 @@ def _is_loop_start(line):
return line.startswith("loop_")


def _to_single(lines, is_looped):
"""
def _to_single(lines):
r"""
Convert multiline values into singleline values
(in terms of 'lines' list elements).
Linebreaks are preserved.
Linebreaks are preserved as ``'\n'`` characters within a list element.
The initial ``';'`` character is also preserved, while the final ``';'`` character
is removed.
"""
processed_lines = [None] * len(lines)
in_i = 0
out_i = 0
while in_i < len(lines):
if lines[in_i][0] == ";":
# Multiline value
multi_line_str = lines[in_i][1:]
j = in_i + 1
while lines[j] != ";":
# Preserve linebreaks
multi_line_str += "\n" + lines[j]
j += 1
if is_looped:
# Create a line for the multiline string only
processed_lines[out_i] = f"'{multi_line_str}'"
out_i += 1
processed_lines = []
in_multi_line = False
mutli_line_value = []
for line in lines:
# Multiline value are enclosed by ';' at the start of the beginning and end line
if line[0] == ";":
if not in_multi_line:
# Start of multiline value
in_multi_line = True
mutli_line_value.append(line)
else:
# Append multiline string to previous line
processed_lines[out_i - 1] += " " + f"'{multi_line_str}'"
in_i = j + 1

elif not is_looped and lines[in_i][0] != "_":
# Singleline value in the line after the corresponding key
processed_lines[out_i - 1] += " " + lines[in_i]
in_i += 1

# End of multiline value
in_multi_line = False
# The current line contains only the end character ';'
# Hence this line is not added to the processed lines
processed_lines.append("\n".join(mutli_line_value))
mutli_line_value = []
else:
# Normal singleline value in the same row as the key
processed_lines[out_i] = lines[in_i]
in_i += 1
out_i += 1

return [line for line in processed_lines if line is not None]
if in_multi_line:
mutli_line_value.append(line)
else:
processed_lines.append(line)
return processed_lines


def _quote(value):
def _escape(value):
"""
A less secure but much quicker version of ``shlex.quote()``.
Escape special characters in a value to make it compatible with CIF.
"""
if len(value) == 0:
if "\n" in value:
# A value with linebreaks must be represented as multiline value
return _multiline(value)
elif "'" in value and '"' in value:
# If both quote types are present, you cannot use them for escaping
return _multiline(value)
elif len(value) == 0:
return "''"
elif value[0] == "_":
return "'" + value + "'"
Expand All @@ -990,19 +1014,21 @@ def _quote(value):

def _multiline(value):
"""
Convert a string containing linebreaks into CIF-compatible
Convert a string that may contain linebreaks into CIF-compatible
multiline string.
"""
if "\n" in value:
return "\n;" + value + "\n;\n"
return value
return "\n;" + value + "\n;\n"


def _split_one_line(line):
"""
Split a line into its fields.
Embedded quotes (' or ") are supported, e.g. `'a dog's life'` is read as `a dog's life`.
"""
# Special case of multiline value, where the line starts with ';'
if line[0] == ";":
return [line[1:]]

# Define the patterns for different types of fields
single_quote_pattern = r"('(?:'(?! )|[^'])*')(?:\s|$)"
double_quote_pattern = r'("(?:"(?! )|[^"])*")(?:\s|$)'
Expand Down
2 changes: 2 additions & 0 deletions tests/structure/test_pdbx.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ def test_get_model_count(format):
'"',
"te\nxt",
"\t",
"""single"anddouble"marks""",
"""single' and double" marks with whitespace""",
],
[False, True],
),
Expand Down

0 comments on commit f794895

Please sign in to comment.