diff --git a/CHANGELOG.md b/CHANGELOG.md index a83d1a2c..8316c857 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,9 +10,11 @@ All notable changes to this project will be documented in this file. The format - Add support for PDF 1.3 logical structure via `Page.structure_tree` (h/t @dhdaines). ([#963](https://github.com/jsvine/pdfplumber/pulls/963)) - Add "gswin64c" as another possible Ghostscript executable in `repair.py` (h/t @echedey-ls). ([#1032](https://github.com/jsvine/pdfplumber/issues/1030)) - Re-add `Page.close()` method, have `PDF.close()` close all pages as well, and improve relevant documentation (h/t @luketudge). ([#1042](https://github.com/jsvine/pdfplumber/issues/1042)) +- Add `force_mediabox` parameter to `Page.to_image(...)`. ([#1054](https://github.com/jsvine/pdfplumber/issues/1054)) ### Fixed +- Standardize handling of cropbox, fixing various issues with PageImage. ([#1054](https://github.com/jsvine/pdfplumber/issues/1054)) - Fix `Page.get_textmap` caching to allow for `extra_attrs=[...]`, by preconverting list kwargs to tuples. ([#1030](https://github.com/jsvine/pdfplumber/issues/1030)) diff --git a/README.md b/README.md index 8e9741cc..75034f9e 100644 --- a/README.md +++ b/README.md @@ -273,6 +273,7 @@ To turn any page (including cropped pages) into an `PageImage` object, call `my_ - `width`: The desired image width in pixels. Default: unset, determined by `resolution`. Type: `int`. - `height`: The desired image width in pixels. Default: unset, determined by `resolution`. Type: `int`. - `antialias`: Whether to use antialiasing when creating the image. Setting to `True` creates images with less-jagged text and graphics, but with larger file sizes. Default: `False`. Type: `bool`. +- `force_mediabox`: Use the page's `.mediabox` dimensions, rather than the `.cropbox` dimensions. Default: `False`. Type: `bool`. 
For instance: diff --git a/pdfplumber/display.py b/pdfplumber/display.py index f8caa5c0..44723d70 100644 --- a/pdfplumber/display.py +++ b/pdfplumber/display.py @@ -78,8 +78,12 @@ def __init__( original: Optional[PIL.Image.Image] = None, resolution: Union[int, float] = DEFAULT_RESOLUTION, antialias: bool = False, + force_mediabox: bool = False, ): self.page = page + self.root = page if page.is_original else page.root_page + self.resolution = resolution + if original is None: self.original = get_page_image( stream=page.pdf.stream, @@ -92,43 +96,49 @@ def __init__( else: self.original = original - if page.is_original: - self.root = page - cropped = False - else: - self.root = page.root_page - cropped = page.root_page.bbox != page.bbox + self.scale = self.original.size[0] / (page.cropbox[2] - page.cropbox[0]) - self.resolution = resolution - self.scale = self.original.size[0] / self.root.width - - if cropped: - cropbox = ( - int((page.bbox[0] - page.root_page.bbox[0]) * self.scale), - int((page.bbox[1] - page.root_page.bbox[1]) * self.scale), - int((page.bbox[2] - page.root_page.bbox[0]) * self.scale), - int((page.bbox[3] - page.root_page.bbox[1]) * self.scale), + # This value represents the coordinates of the page, + # in page-unit values, that will be displayed. + self.bbox = ( + page.bbox + if page.bbox != page.mediabox + else (page.mediabox if force_mediabox else page.cropbox) + ) + + # If this value is different than the *Page*'s .cropbox + # (e.g., because the mediabox differs from the cropbox or + # because we've used Page.crop(...)), then we'll need to + # crop the initially-converted image. 
+ if page.bbox != page.cropbox: + crop_dims = self._reproject_bbox(page.cropbox) + bbox_dims = self._reproject_bbox(self.bbox) + self.original = self.original.crop( + ( + bbox_dims[0] - crop_dims[0], + bbox_dims[1] - crop_dims[1], + bbox_dims[2] - crop_dims[0], + bbox_dims[3] - crop_dims[1], + ) ) - self.original = self.original.crop(cropbox) + self.reset() - def _reproject_bbox(self, bbox: T_bbox) -> T_bbox: + def _reproject_bbox(self, bbox: T_bbox) -> Tuple[int, int, int, int]: x0, top, x1, bottom = bbox _x0, _top = self._reproject((x0, top)) _x1, _bottom = self._reproject((x1, bottom)) return (_x0, _top, _x1, _bottom) - def _reproject(self, coord: T_point) -> T_point: + def _reproject(self, coord: T_point) -> Tuple[int, int]: """ Given an (x0, top) tuple from the *root* coordinate system, return an (x0, top) tuple in the *image* coordinate system. """ x0, top = coord - px0, ptop = self.page.bbox[:2] - rx0, rtop = self.root.bbox[:2] - _x0 = (x0 + rx0 - px0) * self.scale - _top = (top + rtop - ptop) * self.scale - return (_x0, _top) + _x0 = (x0 - self.bbox[0]) * self.scale + _top = (top - self.bbox[1]) * self.scale + return (int(_x0), int(_top)) def reset(self) -> "PageImage": self.annotated = PIL.Image.new("RGB", self.original.size) @@ -202,7 +212,7 @@ def draw_vline( stroke: T_color = DEFAULT_STROKE, stroke_width: int = DEFAULT_STROKE_WIDTH, ) -> "PageImage": - points = (location, self.page.bbox[1], location, self.page.bbox[3]) + points = (location, self.bbox[1], location, self.bbox[3]) self.draw.line(self._reproject_bbox(points), fill=stroke, width=stroke_width) return self @@ -222,7 +232,7 @@ def draw_hline( stroke: T_color = DEFAULT_STROKE, stroke_width: int = DEFAULT_STROKE_WIDTH, ) -> "PageImage": - points = (self.page.bbox[0], location, self.page.bbox[2], location) + points = (self.bbox[0], location, self.bbox[2], location) self.draw.line(self._reproject_bbox(points), fill=stroke, width=stroke_width) return self diff --git a/pdfplumber/page.py 
b/pdfplumber/page.py index 3ad1938c..c458ab92 100644 --- a/pdfplumber/page.py +++ b/pdfplumber/page.py @@ -185,6 +185,27 @@ def new_func(**kwargs: Any) -> TextMap: return new_func +def _normalize_box(box_raw: T_bbox, rotation: T_num = 0) -> T_bbox: + # Per PDF Reference 3.8.4: "Note: Although rectangles are + # conventionally specified by their lower-left and upper-right + # corners, it is acceptable to specify any two diagonally opposite + # corners." + x0, x1 = sorted((box_raw[0], box_raw[2])) + y0, y1 = sorted((box_raw[1], box_raw[3])) + if rotation in [90, 270]: + return (y0, x0, y1, x1) + else: + return (x0, y0, x1, y1) + + +# PDF coordinate spaces refer to an origin in the bottom-left of the +# page; pdfplumber flips this vertically, so that the origin is in the +# top-left. +def _invert_box(box_raw: T_bbox, mb_height: T_num) -> T_bbox: + x0, y0, x1, y1 = box_raw + return (x0, mb_height - y1, x1, mb_height - y0) + + class Page(Container): cached_properties: List[str] = Container.cached_properties + ["_layout"] is_original: bool = True @@ -201,35 +222,34 @@ def __init__( self.root_page = self self.page_obj = page_obj self.page_number = page_number - _rotation = resolve_all(self.page_obj.attrs.get("Rotate", 0)) or 0 - self.rotation = _rotation % 360 - self.page_obj.rotate = self.rotation self.initial_doctop = initial_doctop - cropbox = page_obj.attrs.get("CropBox") - mediabox = page_obj.attrs.get("MediaBox") + def get_attr(key: str, default: Any = None) -> Any: + ref = page_obj.attrs.get(key) + return default if ref is None else resolve_all(ref) - self.cropbox = resolve_all(cropbox) if cropbox is not None else None - self.mediabox = resolve_all(mediabox) or self.cropbox - m = self.mediabox + # Per PDF Reference Table 3.27: "The number of degrees by which the + # page should be rotated clockwise when displayed or printed. The value + # must be a multiple of 90. 
Default value: 0" + _rotation = get_attr("Rotate", 0) + self.rotation = _rotation % 360 - self.bbox: T_bbox = ( - ( - min(m[1], m[3]), - min(m[0], m[2]), - max(m[1], m[3]), - max(m[0], m[2]), - ) - if self.rotation in [90, 270] - else ( - min(m[0], m[2]), - min(m[1], m[3]), - max(m[0], m[2]), - max(m[1], m[3]), + mb_raw = _normalize_box(get_attr("MediaBox"), self.rotation) + mb_height = mb_raw[3] - mb_raw[1] + + self.mediabox = _invert_box(mb_raw, mb_height) + + if "CropBox" in page_obj.attrs: + self.cropbox = _invert_box( + _normalize_box(get_attr("CropBox"), self.rotation), mb_height ) - ) + else: + self.cropbox = self.mediabox + + # Page.bbox defaults to self.mediabox, but can be altered by Page.crop(...) + self.bbox = self.mediabox - # https://rednafi.com/python/lru_cache_on_methods/ + # See https://rednafi.com/python/lru_cache_on_methods/ self.get_textmap = textmap_cacher(self._get_textmap) def close(self) -> None: @@ -542,6 +562,7 @@ def to_image( width: Optional[Union[int, float]] = None, height: Optional[Union[int, float]] = None, antialias: bool = False, + force_mediabox: bool = False, ) -> "PageImage": """ You can pass a maximum of 1 of the following: @@ -562,7 +583,10 @@ def to_image( resolution = 72 * height / self.height return PageImage( - self, resolution=resolution or DEFAULT_RESOLUTION, antialias=antialias + self, + resolution=resolution or DEFAULT_RESOLUTION, + antialias=antialias, + force_mediabox=force_mediabox, ) def to_dict(self, object_types: Optional[List[str]] = None) -> Dict[str, Any]: @@ -597,6 +621,8 @@ def __init__(self, parent_page: Page): self.pdf = parent_page.pdf self.page_obj = parent_page.page_obj self.page_number = parent_page.page_number + self.mediabox = parent_page.mediabox + self.cropbox = parent_page.cropbox self.flush_cache(Container.cached_properties) self.get_textmap = textmap_cacher(self._get_textmap) diff --git a/tests/pdfs/issue-1054-example.pdf b/tests/pdfs/issue-1054-example.pdf new file mode 100644 index 
00000000..90917d11 Binary files /dev/null and b/tests/pdfs/issue-1054-example.pdf differ diff --git a/tests/test_basics.py b/tests/test_basics.py index 5c9e1c68..02bdb12e 100644 --- a/tests/test_basics.py +++ b/tests/test_basics.py @@ -148,7 +148,7 @@ def test_rotation(self): assert rotated.pages[0].width == 612 assert rotated.pages[0].height == 1008 - assert rotated.pages[0].cropbox == self.pdf.pages[0].cropbox + assert rotated.pages[0].cropbox != self.pdf.pages[0].cropbox assert rotated.pages[0].bbox != self.pdf.pages[0].bbox def test_password(self): diff --git a/tests/test_display.py b/tests/test_display.py index faf3d162..bbcee301 100644 --- a/tests/test_display.py +++ b/tests/test_display.py @@ -73,6 +73,14 @@ def test_cropped(self): im = self.pdf.pages[0].crop((10, 20, 30, 50)).to_image() assert im.original.size == (20, 30) + def test_cropbox(self): + path = os.path.join(HERE, "pdfs/issue-1054-example.pdf") + with pdfplumber.open(path) as pdf: + im = pdf.pages[0].to_image() + assert im.original.size == (596, 842) + im = pdf.pages[0].to_image(force_mediabox=True) + assert im.original.size == (2227, 2923) + def test_copy(self): assert self.im.copy().original == self.im.original