Fix improper quote escaping

biotite-dev · Sep 1, 2024 · f794895 · f794895
1 parent de8c985
commit f794895
Show file tree

Hide file tree

Showing 2 changed files with 78 additions and 50 deletions.
diff --git a/src/biotite/structure/io/pdbx/cif.py b/src/biotite/structure/io/pdbx/cif.py
@@ -370,7 +370,7 @@ def deserialize(text, expect_whitespace=True):
         if category_name is None:
             raise DeserializationError("Failed to parse category name")
 
-        lines = _to_single(lines, is_looped)
+        lines = _to_single(lines)
         if is_looped:
             category_dict = CIFCategory._deserialize_looped(lines, expect_whitespace)
         else:
@@ -439,11 +439,28 @@ def _deserialize_single(lines):
         Process a category where each field has a single value.
         """
         category_dict = {}
-        for line in lines:
+        line_i = 0
+        while line_i < len(lines):
+            line = lines[line_i]
             parts = _split_one_line(line)
-            column_name = parts[0].split(".")[1]
-            column = parts[1]
-            category_dict[column_name] = CIFColumn(column)
+            if len(parts) == 2:
+                # Standard case -> name and value in one line
+                name_part, value_part = parts
+                line_i += 1
+            elif len(parts) == 1:
+                # Value is a multiline value on the next line
+                name_part = parts[0]
+                parts = _split_one_line(lines[line_i + 1])
+                if len(parts) == 1:
+                    value_part = parts[0]
+                else:
+                    raise DeserializationError(f"Failed to parse line '{line}'")
+                line_i += 2
+            elif len(parts) == 0:
+                raise DeserializationError("Empty line within category")
+            else:
+                raise DeserializationError(f"Failed to parse line '{line}'")
+            category_dict[name_part.split(".")[1]] = CIFColumn(value_part)
         return category_dict
 
     @staticmethod
@@ -468,7 +485,7 @@ def _deserialize_looped(lines, expect_whitespace):
         data_lines = lines[i:]
         # Rows may be split over multiple lines -> do not rely on
         # row-line-alignment at all and simply cycle through columns
-        column_names = itertools.cycle(column_names)
+        column_indices = itertools.cycle(range(len(column_names)))
         for data_line in data_lines:
             # If whitespace is expected in quote protected values,
             # use regex-based _split_one_line() to split
@@ -485,9 +502,18 @@ def _deserialize_looped(lines, expect_whitespace):
                     ):
                         values[k] = values[k][1:-1]
             for val in values:
-                column_name = next(column_names)
+                column_index = next(column_indices)
+                column_name = column_names[column_index]
                 category_dict[column_name].append(val)
 
+        # Check if all columns have the same length
+        # Otherwise, this would indicate a parsing error or an invalid CIF file
+        column_index = next(column_indices)
+        if column_index != 0:
+            raise DeserializationError(
+                "Category contains columns with different lengths"
+            )
+
         return category_dict
 
     def _serialize_single(self):
@@ -496,7 +522,7 @@ def _serialize_single(self):
         # "+3" Because of three whitespace chars after longest key
         req_len = max_len + 3
         return [
-            key.ljust(req_len) + _multiline(_quote(column.as_item()))
+            key.ljust(req_len) + _escape(column.as_item())
             for key, column in zip(keys, self.values())
         ]
 
@@ -508,7 +534,7 @@ def _serialize_looped(self):
             array = column.as_array(str)
             # Quote before measuring the number of chars,
             # as the quote characters modify the length
-            array = np.array([_multiline(_quote(element)) for element in array])
+            array = np.array([_escape(element) for element in array])
             column_arrays.append(array)
 
         # Number of characters the longest string in the column needs
@@ -927,52 +953,50 @@ def _is_loop_start(line):
     return line.startswith("loop_")
 
 
-def _to_single(lines, is_looped):
-    """
+def _to_single(lines):
+    r"""
     Convert multiline values into singleline values
     (in terms of 'lines' list elements).
-    Linebreaks are preserved.
+    Linebreaks are preserved as ``'\n'`` characters within a list element.
+    The initial ``';'`` character is also preserved, while the final ``';'`` character
+    is removed.
     """
-    processed_lines = [None] * len(lines)
-    in_i = 0
-    out_i = 0
-    while in_i < len(lines):
-        if lines[in_i][0] == ";":
-            # Multiline value
-            multi_line_str = lines[in_i][1:]
-            j = in_i + 1
-            while lines[j] != ";":
-                # Preserve linebreaks
-                multi_line_str += "\n" + lines[j]
-                j += 1
-            if is_looped:
-                # Create a line for the multiline string only
-                processed_lines[out_i] = f"'{multi_line_str}'"
-                out_i += 1
+    processed_lines = []
+    in_multi_line = False
+    mutli_line_value = []
+    for line in lines:
+        # Multiline value are enclosed by ';' at the start of the beginning and end line
+        if line[0] == ";":
+            if not in_multi_line:
+                # Start of multiline value
+                in_multi_line = True
+                mutli_line_value.append(line)
             else:
-                # Append multiline string to previous line
-                processed_lines[out_i - 1] += " " + f"'{multi_line_str}'"
-            in_i = j + 1
-
-        elif not is_looped and lines[in_i][0] != "_":
-            # Singleline value in the line after the corresponding key
-            processed_lines[out_i - 1] += " " + lines[in_i]
-            in_i += 1
-
+                # End of multiline value
+                in_multi_line = False
+                # The current line contains only the end character ';'
+                # Hence this line is not added to the processed lines
+                processed_lines.append("\n".join(mutli_line_value))
+                mutli_line_value = []
         else:
-            # Normal singleline value in the same row as the key
-            processed_lines[out_i] = lines[in_i]
-            in_i += 1
-            out_i += 1
-
-    return [line for line in processed_lines if line is not None]
+            if in_multi_line:
+                mutli_line_value.append(line)
+            else:
+                processed_lines.append(line)
+    return processed_lines
 
 
-def _quote(value):
+def _escape(value):
     """
-    A less secure but much quicker version of ``shlex.quote()``.
+    Escape special characters in a value to make it compatible with CIF.
     """
-    if len(value) == 0:
+    if "\n" in value:
+        # A value with linebreaks must be represented as multiline value
+        return _multiline(value)
+    elif "'" in value and '"' in value:
+        # If both quote types are present, you cannot use them for escaping
+        return _multiline(value)
+    elif len(value) == 0:
         return "''"
     elif value[0] == "_":
         return "'" + value + "'"
@@ -990,19 +1014,21 @@ def _quote(value):
 
 def _multiline(value):
     """
-    Convert a string containing linebreaks into CIF-compatible
+    Convert a string that may contain linebreaks into CIF-compatible
     multiline string.
     """
-    if "\n" in value:
-        return "\n;" + value + "\n;\n"
-    return value
+    return "\n;" + value + "\n;\n"
 
 
 def _split_one_line(line):
     """
     Split a line into its fields.
     Supporting embedded quotes (' or "), like `'a dog's life'` to  `a dog's life`
     """
+    # Special case of multiline value, where the line starts with ';'
+    if line[0] == ";":
+        return [line[1:]]
+
     # Define the patterns for different types of fields
     single_quote_pattern = r"('(?:'(?! )|[^'])*')(?:\s|$)"
     double_quote_pattern = r'("(?:"(?! )|[^"])*")(?:\s|$)'

diff --git a/tests/structure/test_pdbx.py b/tests/structure/test_pdbx.py
@@ -44,6 +44,8 @@ def test_get_model_count(format):
             '"',
             "te\nxt",
             "\t",
+            """single"anddouble"marks""",
+            """single' and double" marks with whitespace""",
         ],
         [False, True],
     ),