Skip to content

Commit

Permalink
Do not crash on ASCII85 in inline images and properly support their colorspaces (pdfminer#1010)
Browse files Browse the repository at this point in the history
  • Loading branch information
dhdaines authored Jul 15, 2024
1 parent 88139ad commit 1a8bd2f
Show file tree
Hide file tree
Showing 7 changed files with 39 additions and 6 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
- `ValueError` when corrupt PDF specifies an invalid mediabox ([#987](https://github.com/pdfminer/pdfminer.six/pull/987))
- `RecursionError` when corrupt PDF specifies a recursive /Pages object ([#998](https://github.com/pdfminer/pdfminer.six/pull/998))
- `TypeError` when corrupt PDF specifies text-positioning operators with invalid values ([#1000](https://github.com/pdfminer/pdfminer.six/pull/1000))
- inline image parsing fails when stream data contains "EI\n" ([#1008](https://github.com/pdfminer/pdfminer.six/issues/1008))

### Removed

Expand Down
12 changes: 10 additions & 2 deletions pdfminer/image.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
LITERAL_DEVICE_CMYK,
LITERAL_DEVICE_GRAY,
LITERAL_DEVICE_RGB,
LITERAL_INLINE_DEVICE_GRAY,
LITERAL_INLINE_DEVICE_RGB,
)
from pdfminer.pdfexceptions import PDFValueError
from pdfminer.pdftypes import (
Expand Down Expand Up @@ -125,10 +127,16 @@ def export_image(self, image: LTImage) -> str:
elif image.bits == 1:
name = self._save_bmp(image, width, height, (width + 7) // 8, image.bits)

elif image.bits == 8 and LITERAL_DEVICE_RGB in image.colorspace:
elif image.bits == 8 and (
LITERAL_DEVICE_RGB in image.colorspace
or LITERAL_INLINE_DEVICE_RGB in image.colorspace
):
name = self._save_bmp(image, width, height, width * 3, image.bits * 3)

elif image.bits == 8 and LITERAL_DEVICE_GRAY in image.colorspace:
elif image.bits == 8 and (
LITERAL_DEVICE_GRAY in image.colorspace
or LITERAL_INLINE_DEVICE_GRAY in image.colorspace
):
name = self._save_bmp(image, width, height, width, image.bits)

elif len(filters) == 1 and filters[0][0] in LITERALS_FLATE_DECODE:
Expand Down
4 changes: 4 additions & 0 deletions pdfminer/pdfcolor.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@
# Full names of the device color spaces.
LITERAL_DEVICE_GRAY = LIT("DeviceGray")
LITERAL_DEVICE_RGB = LIT("DeviceRGB")
LITERAL_DEVICE_CMYK = LIT("DeviceCMYK")
# Abbreviated color space names for inline images. Each LITERAL_INLINE_DEVICE_*
# is the short spelling that pairs with the LITERAL_DEVICE_* constant of the
# same suffix above: G / DeviceGray, RGB / DeviceRGB, CMYK / DeviceCMYK.
LITERAL_INLINE_DEVICE_GRAY = LIT("G")
LITERAL_INLINE_DEVICE_RGB = LIT("RGB")
LITERAL_INLINE_DEVICE_CMYK = LIT("CMYK")


class PDFColorSpace:
Expand Down
17 changes: 14 additions & 3 deletions pdfminer/pdfinterp.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
)
from pdfminer.pdfpage import PDFPage
from pdfminer.pdftypes import (
LITERALS_ASCII85_DECODE,
PDFObjRef,
PDFStream,
dict_value,
Expand Down Expand Up @@ -331,11 +332,21 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None:
if len(objs) % 2 != 0:
error_msg = f"Invalid dictionary construct: {objs!r}"
raise PSTypeError(error_msg)
d = {literal_name(k): v for (k, v) in choplist(2, objs)}
(pos, data) = self.get_inline_data(pos + len(b"ID "))
d = {literal_name(k): resolve1(v) for (k, v) in choplist(2, objs)}
eos = b"EI"
filter = d.get("F", None)
if filter is not None:
if isinstance(filter, PSLiteral):
filter = [filter]
if filter[0] in LITERALS_ASCII85_DECODE:
eos = b"~>"
(pos, data) = self.get_inline_data(pos + len(b"ID "), target=eos)
if eos != b"EI": # it may be necessary for decoding
data += eos
obj = PDFStream(d, data)
self.push((pos, obj))
self.push((pos, self.KEYWORD_EI))
if eos == b"EI": # otherwise it is still in the stream
self.push((pos, self.KEYWORD_EI))
except PSTypeError:
if settings.STRICT:
raise
Expand Down
4 changes: 3 additions & 1 deletion pdfminer/psparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -450,7 +450,9 @@ def _parse_string_1(self, s: bytes, i: int) -> int:
return i + 1

elif self.oct:
self._curtoken += bytes((int(self.oct, 8),))
chrcode = int(self.oct, 8)
assert chrcode < 256, "Invalid octal %s (%d)" % (repr(self.oct), chrcode)
self._curtoken += bytes((chrcode,))
self._parse1 = self._parse_string
return i

Expand Down
Binary file added samples/contrib/issue-1008-inline-ascii85.pdf
Binary file not shown.
7 changes: 7 additions & 0 deletions tests/test_tools_pdf2txt.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,3 +184,10 @@ def test_contrib_issue_495_pdfobjref(self):
filepath = absolute_sample_path("contrib/issue_495_pdfobjref.pdf")
image_files = self.extract_images(filepath)
assert image_files[0].endswith("jpg")

def test_contrib_issue_1008_inline(self):
    """Extract the inline images from the issue #1008 ASCII85 sample.

    The sample must yield exactly 23 image files, and every one of them
    must have been written with a ``.bmp`` extension.
    """
    sample_pdf = absolute_sample_path("contrib/issue-1008-inline-ascii85.pdf")
    extracted = self.extract_images(sample_pdf)
    assert len(extracted) == 23
    for image_name in extracted:
        assert image_name.endswith(".bmp")

0 comments on commit 1a8bd2f

Please sign in to comment.