Add .annots and .hyperlinks, replacing .annos

.annos was non-functional, based on a misunderstanding of how Annotation objects were represented in the PDF object. Also shifts language from "annos" to "annots" to mirror pdfminer's nomenclature.
jsvine · Jul 31, 2020 · aa03961 · aa03961
1 parent 7a90630
commit aa03961
Show file tree

Hide file tree

Showing 6 changed files with 57 additions and 9 deletions.
diff --git a/README.md b/README.md
@@ -43,7 +43,7 @@ The output will be a CSV containing info about every character, line, and rectan
 |----------|-------------|
 |`--format [format]`| `csv` or `json`. The `json` format returns slightly more information; it includes PDF-level metadata and height/width information about each page.|
 |`--pages [list of pages]`| A space-delimited, `1`-indexed list of pages or hyphenated page ranges. E.g., `1, 11-15`, which would return data for pages 1, 11, 12, 13, 14, and 15.|
-|`--types [list of object types to extract]`| Choices are `char`, `anno`, `line`, `curve`, `rect`, `rect_edge`. Defaults to `char`, `anno`, `line`, `curve`, `rect`.|
+|`--types [list of object types to extract]`| Choices are `char`, `line`, `curve`, `rect`, `rect_edge`. Defaults to `char`, `line`, `curve`, `rect`.|
 
 ## Python library
 
@@ -106,16 +106,17 @@ The `pdfplumber.Page` class is at the core of `pdfplumber`. Most things you'll d
 Each instance of `pdfplumber.PDF` and `pdfplumber.Page` provides access to several types of PDF objects. The following properties each return a Python list of the matching objects:
 
 - `.chars`, each representing a single text character.
-- `.annos`, each representing a single annotation-text character.
 - `.lines`, each representing a single 1-dimensional line.
 - `.rects`, each representing a single 2-dimensional rectangle.
 - `.curves`, each representing a series of connected points.
 - `.images`, each representing an image.
 - `.figures`, each representing a figure.
+- `.annots`, each representing a single PDF annotation (cf. Section 8.4 of the [official PDF specification](https://www.adobe.com/content/dam/acom/en/devnet/acrobat/pdfs/pdf_reference_1-7.pdf) for details).
+- `.hyperlinks`, each representing a single PDF annotation of the subtype `Link` and having a `URI` action attribute.
 
 Each object is represented as a simple Python `dict`, with the following properties:
 
-#### `char` / `anno` properties
+#### `char` properties
 
 | Property | Description |
 |----------|-------------|
@@ -134,7 +135,7 @@ Each object is represented as a simple Python `dict`, with the following propert
 |`top`| Distance of top of character from top of page.|
 |`bottom`| Distance of bottom of the character from top of page.|
 |`doctop`| Distance of top of character from top of document.|
-|`object_type`| "char" / "anno"|
+|`object_type`| "char"|
 
 #### `line` properties
 

diff --git a/pdfplumber/container.py b/pdfplumber/container.py
@@ -35,10 +35,6 @@ def figures(self):
     def chars(self):
         return self.objects.get("char", [])
 
-    @property
-    def annos(self):
-        return self.objects.get("anno", [])
-
     @property
     def rect_edges(self):
         if hasattr(self, "_rect_edges"):

diff --git a/pdfplumber/page.py b/pdfplumber/page.py
@@ -1,5 +1,5 @@
 from . import utils
-from .utils import resolve_all
+from .utils import resolve, resolve_all
 from .table import TableFinder
 from .container import Container
 
@@ -57,6 +57,34 @@ def layout(self):
         self._layout = self.pdf.process_page(self.page_obj)
         return self._layout
 
+    @property
+    def annots(self):
+        def parse(annot):
+            data = resolve(annot.resolve())
+            rect = self.decimalize(resolve_all(data["Rect"]))
+            parsed = {
+                "page_number": self.page_number,
+                "doctop": self.initial_doctop + self.height - rect[3],
+                "top": self.height - rect[3],
+                "x0": rect[0],
+                "bottom": self.height - rect[1],
+                "x1": rect[2],
+                "width": rect[2] - rect[0],
+                "height": rect[3] - rect[1],
+                "data": data,
+            }
+            uri = data.get("A", {}).get("URI")
+            if uri is not None:
+                parsed["URI"] = uri.decode("utf-8")
+            return parsed
+
+        raw = resolve(self.page_obj.annots) or []
+        return list(map(parse, raw))
+
+    @property
+    def hyperlinks(self):
+        return [a for a in self.annots if "URI" in a]
+
     @property
     def objects(self):
         if hasattr(self, "_objects"):

diff --git a/pdfplumber/pdf.py b/pdfplumber/pdf.py
@@ -3,6 +3,7 @@
 from .utils import decode_text
 
 import pathlib
+import itertools
 from pdfminer.pdfparser import PDFParser
 from pdfminer.pdfdocument import PDFDocument
 from pdfminer.pdfpage import PDFPage
@@ -87,3 +88,13 @@ def objects(self):
                 all_objects[kind] = all_objects.get(kind, []) + p.objects[kind]
         self._objects = all_objects
         return self._objects
+
+    @property
+    def annots(self):
+        gen = (p.annots for p in self.pages)
+        return list(itertools.chain(*gen))
+
+    @property
+    def hyperlinks(self):
+        gen = (p.hyperlinks for p in self.pages)
+        return list(itertools.chain(*gen))
diff --git a/tests/pdfs/pdffill-demo.pdf b/tests/pdfs/pdffill-demo.pdf
diff --git a/tests/test_basics.py b/tests/test_basics.py
@@ -35,6 +35,18 @@ def test_objects(self):
         assert len(self.pdf.chars)
         assert len(self.pdf.rects)
         assert len(self.pdf.lines)
+        assert len(self.pdf.rect_edges)
+        # Ensure that caching is working:
+        assert id(self.pdf._rect_edges) == id(self.pdf.rect_edges)
+
+    def test_annots(self):
+        # via http://www.pdfill.com/example/pdf_drawing_new.pdf
+        path = os.path.join(HERE, "pdfs/pdffill-demo.pdf")
+        with pdfplumber.open(path) as pdf:
+            assert len(pdf.annots)
+            assert len(pdf.hyperlinks) == 17
+            uri = "http://www.pdfill.com/pdf_drawing.html"
+            assert pdf.hyperlinks[0]["URI"] == uri
 
     def test_crop_and_filter(self):
         def test(obj):