diff --git a/README.md b/README.md index 94df63f7..474eda2c 100644 --- a/README.md +++ b/README.md @@ -43,7 +43,7 @@ The output will be a CSV containing info about every character, line, and rectan |----------|-------------| |`--format [format]`| `csv` or `json`. The `json` format returns slightly more information; it includes PDF-level metadata and height/width information about each page.| |`--pages [list of pages]`| A space-delimited, `1`-indexed list of pages or hyphenated page ranges. E.g., `1, 11-15`, which would return data for pages 1, 11, 12, 13, 14, and 15.| -|`--types [list of object types to extract]`| Choices are `char`, `anno`, `line`, `curve`, `rect`, `rect_edge`. Defaults to `char`, `anno`, `line`, `curve`, `rect`.| +|`--types [list of object types to extract]`| Choices are `char`, `line`, `curve`, `rect`, `rect_edge`. Defaults to `char`, `line`, `curve`, `rect`.| ## Python library @@ -106,16 +106,17 @@ The `pdfplumber.Page` class is at the core of `pdfplumber`. Most things you'll d Each instance of `pdfplumber.PDF` and `pdfplumber.Page` provides access to four types of PDF objects. The following properties each return a Python list of the matching objects: - `.chars`, each representing a single text character. -- `.annos`, each representing a single annotation-text character. - `.lines`, each representing a single 1-dimensional line. - `.rects`, each representing a single 2-dimensional rectangle. - `.curves`, each representing a series of connected points. - `.images`, each representing an image. - `.figures`, each representing a figure. +- `.annots`, each representing a single PDF annotation (cf. 
Section 8.4 of the [official PDF specification](https://www.adobe.com/content/dam/acom/en/devnet/acrobat/pdfs/pdf_reference_1-7.pdf) for details). +- `.hyperlinks`, each representing a single PDF annotation of the subtype `Link` and having a `URI` action attribute. Each object is represented as a simple Python `dict`, with the following properties: -#### `char` / `anno` properties +#### `char` properties | Property | Description | |----------|-------------| @@ -134,7 +135,7 @@ Each object is represented as a simple Python `dict`, with the following propert |`top`| Distance of top of character from top of page.| |`bottom`| Distance of bottom of the character from top of page.| |`doctop`| Distance of top of character from top of document.| -|`object_type`| "char" / "anno"| +|`object_type`| "char"| #### `line` properties diff --git a/pdfplumber/container.py b/pdfplumber/container.py index 46da1fb7..e4836741 100644 --- a/pdfplumber/container.py +++ b/pdfplumber/container.py @@ -35,10 +35,6 @@ def figures(self): def chars(self): return self.objects.get("char", []) - @property - def annos(self): - return self.objects.get("anno", []) - @property def rect_edges(self): if hasattr(self, "_rect_edges"): diff --git a/pdfplumber/page.py b/pdfplumber/page.py index bd316803..e4a78a20 100644 --- a/pdfplumber/page.py +++ b/pdfplumber/page.py @@ -1,5 +1,5 @@ from . 
import utils -from .utils import resolve_all +from .utils import resolve, resolve_all from .table import TableFinder from .container import Container @@ -57,6 +57,34 @@ def layout(self): self._layout = self.pdf.process_page(self.page_obj) return self._layout + @property + def annots(self): + def parse(annot): + data = resolve(annot.resolve()) + rect = self.decimalize(resolve_all(data["Rect"])) + parsed = { + "page_number": self.page_number, + "doctop": self.initial_doctop + self.height - rect[3], + "top": self.height - rect[3], + "x0": rect[0], + "bottom": self.height - rect[1], + "x1": rect[2], + "width": rect[2] - rect[0], + "height": rect[3] - rect[1], + "data": data, + } + uri = data.get("A", {}).get("URI") + if uri is not None: + parsed["URI"] = uri.decode("utf-8") + return parsed + + raw = resolve(self.page_obj.annots) or [] + return list(map(parse, raw)) + + @property + def hyperlinks(self): + return [a for a in self.annots if "URI" in a] + @property def objects(self): if hasattr(self, "_objects"): diff --git a/pdfplumber/pdf.py b/pdfplumber/pdf.py index 3a515604..9668e13e 100644 --- a/pdfplumber/pdf.py +++ b/pdfplumber/pdf.py @@ -3,6 +3,7 @@ from .utils import decode_text import pathlib +import itertools from pdfminer.pdfparser import PDFParser from pdfminer.pdfdocument import PDFDocument from pdfminer.pdfpage import PDFPage @@ -87,3 +88,13 @@ def objects(self): all_objects[kind] = all_objects.get(kind, []) + p.objects[kind] self._objects = all_objects return self._objects + + @property + def annots(self): + gen = (p.annots for p in self.pages) + return list(itertools.chain(*gen)) + + @property + def hyperlinks(self): + gen = (p.hyperlinks for p in self.pages) + return list(itertools.chain(*gen)) diff --git a/tests/pdfs/pdffill-demo.pdf b/tests/pdfs/pdffill-demo.pdf new file mode 100644 index 00000000..dcc7eb32 Binary files /dev/null and b/tests/pdfs/pdffill-demo.pdf differ diff --git a/tests/test_basics.py b/tests/test_basics.py index 4fcbddb2..792728ec 
100644 --- a/tests/test_basics.py +++ b/tests/test_basics.py @@ -35,6 +35,18 @@ def test_objects(self): assert len(self.pdf.chars) assert len(self.pdf.rects) assert len(self.pdf.lines) + assert len(self.pdf.rect_edges) + # Ensure that caching is working: + assert id(self.pdf._rect_edges) == id(self.pdf.rect_edges) + + def test_annots(self): + # via http://www.pdfill.com/example/pdf_drawing_new.pdf + path = os.path.join(HERE, "pdfs/pdffill-demo.pdf") + with pdfplumber.open(path) as pdf: + assert len(pdf.annots) + assert len(pdf.hyperlinks) == 17 + uri = "http://www.pdfill.com/pdf_drawing.html" + assert pdf.hyperlinks[0]["URI"] == uri def test_crop_and_filter(self): def test(obj):