Handle sloppy cross references more and less generically

A previous commit adjusted readSymbol() to skip leading whitespace in order to avoid errors with sloppy cross-references. This did not fix the handling of literals such as numbers and booleans in readObject(), because those are not read via readSymbol(). Also, adjusting the very low-level readSymbol() function might cause unintended side effects elsewhere.

So instead, this change moves the skipping of leading whitespace into readObject(), so that it affects all types of referenced objects equally, but not every symbol lookup.

Signed-off-by: Michael Weiser <[email protected]>
hatching · Jul 2, 2020 · 90720a4 · 90720a4
1 parent f50847f
commit 90720a4
Showing 1 changed file with 5 additions and 1 deletion.
diff --git a/peepdf/PDFCore.py b/peepdf/PDFCore.py
@@ -7857,6 +7857,11 @@ def readObject(self, content, objectType=None, forceMode=False, looseMode=False)
         pdfObject = None
         oldCounter = self.charCounter
         self.charCounter = 0
+        # skip leading whitespace in case of sloppy reference offsets
+        self.readSpaces(content)
+        if self.charCounter > 0:
+            content = content[self.charCounter:]
+            self.charCounter = 0
         if objectType is not None:
             objectsTypeArray = [self.delimiters[i][2] for i in range(len(self.delimiters))]
             index = objectsTypeArray.index(objectType)
@@ -8011,7 +8016,6 @@ def readSymbol(self, string, symbol, deleteSpaces=True):
             errorMessage = 'EOF while looking for symbol "'+symbol+'"'
             pdfFile.addError(errorMessage)
             return (-1, errorMessage)
-        self.readSpaces(string)
         while string[self.charCounter] == '%':
             ret = self.readUntilEndOfLine(string)
             if ret[0] == -1: