Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Caret annotations: initial support #102

Merged
merged 4 commits into from
Dec 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 16 additions & 6 deletions pdfannots/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,7 @@ def _mkannotation(
"""
Given a PDF annotation, capture relevant fields and construct an Annotation object.

Refer to Section 8.4 of the PDF spec:
https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/pdf_reference_archives/PDFReference.pdf
Refer to Section 8.4 of the PDF reference (version 1.7).
"""

subtype = pa.get('Subtype')
Expand Down Expand Up @@ -85,13 +84,17 @@ def _mkannotation(
rect = pdftypes.resolve1(pa.get('Rect'))

# QuadPoints are defined only for "markup" annotations (Highlight, Underline, StrikeOut,
# Squiggly), where they specify the quadrilaterals (boxes) covered by the annotation.
# Squiggly, Caret), where they specify the quadrilaterals (boxes) covered by the annotation.
quadpoints = pdftypes.resolve1(pa.get('QuadPoints'))

author = pdftypes.resolve1(pa.get('T'))
if author is not None:
author = pdfminer.utils.decode_text(author)

name = pdftypes.resolve1(pa.get('NM'))
if name is not None:
name = pdfminer.utils.decode_text(name)

created = None
dobj = pa.get('CreationDate')
# some pdf apps set modification date, but not creation date
Expand All @@ -103,8 +106,9 @@ def _mkannotation(
createds = pdfminer.utils.decode_text(createds)
created = decode_datetime(createds)

return Annotation(page, annot_type, quadpoints, rect,
contents, author=author, created=created, color=rgb)
return Annotation(page, annot_type, quadpoints=quadpoints, rect=rect, name=name,
contents=contents, author=author, created=created, color=rgb,
in_reply_to_ref=pa.get('IRT'))


def _get_outlines(doc: PDFDocument) -> typ.Iterator[Outline]:
Expand Down Expand Up @@ -383,6 +387,10 @@ def emit_progress(msg: str) -> None:
o.resolve(page)
page.outlines.append(o)

# Dict from object ID (in the ObjRef) to Annotation object
# This is used while post-processing to resolve inter-annotation references
annots_by_objid: typ.Dict[int, Annotation] = {}

# Construct Annotation objects, and append them to the page.
for pa in pdftypes.resolve1(pdfpage.annots) if pdfpage.annots else []:
if isinstance(pa, pdftypes.PDFObjRef):
Expand All @@ -391,6 +399,8 @@ def emit_progress(msg: str) -> None:
annot = _mkannotation(annot_dict, page)
if annot is not None:
page.annots.append(annot)
assert pa.objid not in annots_by_objid
annots_by_objid[pa.objid] = annot
else:
logger.warning("Unknown annotation: %s", pa)

Expand All @@ -410,7 +420,7 @@ def emit_progress(msg: str) -> None:

# Give the annotations a chance to update their internals
for a in page.annots:
a.postprocess()
a.postprocess(annots_by_objid)

emit_progress("\n")

Expand Down
5 changes: 4 additions & 1 deletion pdfannots/printer/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ def annot_to_dict(
assert annot.pos

result = {
"name": annot.name,
"type": annot.subtype.name,
"page": annot.pos.page.pageno + 1,
"page_label": annot.pos.page.label,
Expand All @@ -23,7 +24,9 @@ def annot_to_dict(
"contents": annot.contents,
"author": annot.author,
"created": annot.created.strftime('%Y-%m-%dT%H:%M:%S') if annot.created else None,
"color": ('#' + annot.color.ashex()) if annot.color else None
"color": ('#' + annot.color.ashex()) if annot.color else None,
"in_reply_to": (annot.in_reply_to.name if annot.in_reply_to and annot.in_reply_to.name
else None),
}

# Remove keys with None values in nested dictionary and return
Expand Down
34 changes: 24 additions & 10 deletions pdfannots/printer/markdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,11 +216,17 @@ def format_annot(
document: Document,
extra: typ.Optional[str] = None
) -> str:
# Limited support for Caret annotations with a single "reply" of type StrikeOut
contents = annot.contents
if (annot.subtype == AnnotationType.Caret and annot.replies
and annot.replies[0].subtype == AnnotationType.StrikeOut):
annot = annot.replies[0]
if annot.contents:
logger.warning("Ignored StrikeOut comment: %s", annot.contents)

# capture item text and contents (i.e. the comment), and split the latter into paragraphs
text = annot.gettext(self.remove_hyphens) or ''
comment = ([l for l in annot.contents.splitlines() if l]
if annot.contents else [])
comment = [l for l in contents.splitlines() if l] if contents else []

if annot.has_context():
assert annot.subtype == AnnotationType.StrikeOut
Expand Down Expand Up @@ -270,13 +276,13 @@ def emit_body(
self,
document: Document
) -> typ.Iterator[str]:
for a in document.iter_annots():
for a in document.iter_annots(include_replies=False):
yield self.format_annot(a, document, a.subtype.name)


class GroupedMarkdownPrinter(MarkdownPrinter):
ANNOT_NITS = frozenset({
AnnotationType.Squiggly, AnnotationType.StrikeOut, AnnotationType.Underline})
ANNOT_NITS = frozenset({AnnotationType.Caret, AnnotationType.Squiggly,
AnnotationType.StrikeOut, AnnotationType.Underline})
ALL_SECTIONS = ["highlights", "comments", "nits"]

def __init__(
Expand Down Expand Up @@ -316,12 +322,12 @@ def fmt_header(name: str, level: int = 2) -> str:
return prefix + header + " " + name + "\n"

# Partition annotations into nits, comments, and highlights.
nits = []
comments = []
highlights = [] # When grouping by color, this holds only the undefined annotations
nits: typ.List[Annotation] = []
comments: typ.List[Annotation] = []
highlights: typ.List[Annotation] = [] # When grouping by color holds only undefined annots
highlights_by_color: typ.DefaultDict[RGB, typ.List[Annotation]] = defaultdict(list)

for a in document.iter_annots():
for a in document.iter_annots(include_replies=False):
if a.subtype in self.ANNOT_NITS:
nits.append(a)
elif a.contents:
Expand Down Expand Up @@ -355,5 +361,13 @@ def fmt_header(name: str, level: int = 2) -> str:
if nits and secname == 'nits':
yield fmt_header("Nits")
for a in nits:
extra = "suggested deletion" if a.subtype == AnnotationType.StrikeOut else None
extra = None
if a.subtype == AnnotationType.Caret:
if a.replies and a.replies[0].subtype == AnnotationType.StrikeOut:
extra = "suggested replacement"
else:
extra = "suggested insertion"
elif a.subtype == AnnotationType.StrikeOut:
extra = "suggested deletion"

yield self.format_annot(a, document, extra)
70 changes: 49 additions & 21 deletions pdfannots/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,9 @@ def __init__(self, x0: float, y0: float, x1: float, y1: float):
self.y0 = y0
self.y1 = y1

def __repr__(self) -> str:
    """Return a debug string listing the (x0, y0) and (x1, y1) coordinate pairs."""
    coords = (self.x0, self.y0, self.x1, self.y1)
    return '<Box (%f,%f) (%f,%f)>' % coords

@staticmethod
def from_item(item: LTComponent) -> Box:
"""Construct a Box from the bounding box of a given PDF component."""
Expand Down Expand Up @@ -261,6 +264,8 @@ class AnnotationType(enum.Enum):
StrikeOut = enum.auto()
Underline = enum.auto()

Caret = enum.auto()

# A single rectangle, that is abused by some Apple tools to render custom
# highlights. We do not attempt to capture the affected text.
Square = enum.auto()
Expand All @@ -274,35 +279,43 @@ class Annotation(ObjectWithPos):
A PDF annotation, and its extracted text.

Attributes:
subtype PDF annotation type
contents Contents of the annotation in the PDF (e.g. comment/description)
text Text in the order captured (use gettext() for a cleaner form)
author Author of the annotation
created Timestamp the annotation was created
color RGB color of the annotation
contents Contents of the annotation in the PDF (e.g. comment/description)
created Timestamp the annotation was created
in_reply_to Reference to another annotation on the page that this is "in reply to"
last_charseq Sequence number of the most recent character in text
name If present, uniquely identifies this annotation among others on the page
replies Annotations replying to this one (reverse of in_reply_to)
subtype PDF annotation type
text Text in the order captured (use gettext() for a cleaner form)

Attributes updated only for StrikeOut annotations:
Attributes updated for StrikeOut and Caret annotations:
pre_context Text captured just prior to the beginning of 'text'
post_context Text captured just after the end of 'text'
"""

contents: typ.Optional[str]
boxes: typ.List[Box]
text: typ.List[str]
contents: typ.Optional[str]
in_reply_to: typ.Optional[Annotation]
pre_context: typ.Optional[str]
post_context: typ.Optional[str]
replies: typ.List[Annotation]
text: typ.List[str]

def __init__(
self,
page: Page,
subtype: AnnotationType,
quadpoints: typ.Optional[typ.Sequence[float]] = None,
rect: typ.Optional[BoxCoords] = None,
contents: typ.Optional[str] = None,
*,
author: typ.Optional[str] = None,
created: typ.Optional[datetime.datetime] = None,
color: typ.Optional[RGB] = None):
color: typ.Optional[RGB] = None,
contents: typ.Optional[str] = None,
in_reply_to_ref: typ.Optional[PDFObjRef] = None,
name: typ.Optional[str] = None,
quadpoints: typ.Optional[typ.Sequence[float]] = None,
rect: typ.Optional[BoxCoords] = None):

# Construct boxes from quadpoints
boxes = []
Expand All @@ -324,16 +337,22 @@ def __init__(
super().__init__(pos)

# Initialise the attributes
self.subtype = subtype
self.contents = contents if contents else None
self.author = author
self.created = created
self.text = []
self.color = color
self.pre_context = None
self.post_context = None
self.boxes = boxes
self.color = color
self.contents = contents if contents else None
self.created = created
self.name = name
self.last_charseq = 0
self.post_context = None
self.pre_context = None
self.replies = []
self.subtype = subtype
self.text = []

# The in_reply_to reference will be resolved in postprocess()
self._in_reply_to_ref = in_reply_to_ref
self.in_reply_to = None

def __repr__(self) -> str:
return ('<Annotation %s %r%s%s>' %
Expand Down Expand Up @@ -394,8 +413,15 @@ def get_context(self, remove_hyphens: bool = False) -> typ.Tuple[str, str]:
return (merge_lines(self.pre_context or '', remove_hyphens, strip_space=False),
merge_lines(self.post_context or '', remove_hyphens, strip_space=False))

def postprocess(self) -> None:
def postprocess(self, annots_by_objid: typ.Dict[int, Annotation]) -> None:
"""Update internal state once all text and context has been captured."""
# Resolve the in_reply_to object reference to its annotation
if self._in_reply_to_ref is not None:
assert self.in_reply_to is None # This should be called once only
self.in_reply_to = annots_by_objid.get(self._in_reply_to_ref.objid)
if self.in_reply_to is not None:
self.in_reply_to.replies.append(self)

# The Skim PDF reader (https://skim-app.sourceforge.io/) creates annotations whose
# default initial contents are a copy of the selected text. Unless the user goes to
# the trouble of editing each annotation, this goes badly for us because we have
Expand Down Expand Up @@ -466,10 +492,12 @@ class Document:
def __init__(self) -> None:
self.pages = []

def iter_annots(self) -> typ.Iterator[Annotation]:
def iter_annots(self, *, include_replies: bool = True) -> typ.Iterator[Annotation]:
"""Iterate over all the annotations in the document."""
for p in self.pages:
yield from p.annots
for a in p.annots:
if include_replies or not a.in_reply_to:
yield a

def nearest_outline(
self,
Expand Down
15 changes: 15 additions & 0 deletions tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,21 @@ def test(self) -> None:
self.assertEqual(self.annots[0].gettext(), None)


class CaretAnnotations(ExtractionTestBase):
    """Checks extraction of caret.pdf, where a StrikeOut annotation replies to a Caret."""

    filename = 'caret.pdf'

    def test(self) -> None:
        # The count is checked first so the index lookups below cannot go out of range.
        self.assertEqual(len(self.annots), 5)

        # The struck-out text that the caret's contents are meant to replace.
        strikeout = self.annots[1]
        self.assertEqual(strikeout.subtype, AnnotationType.StrikeOut)
        self.assertEqual(strikeout.gettext(), 'Adobe Acrobat Reader')

        # The caret carries the replacement text in its contents.
        caret = self.annots[4]
        self.assertEqual(caret.subtype, AnnotationType.Caret)
        self.assertEqual(caret.contents, 'Google Chrome')

        # The reply link must be resolved in both directions, and only between these two.
        self.assertEqual(strikeout.in_reply_to, caret)
        self.assertEqual(caret.replies, [strikeout])
        self.assertEqual(strikeout.replies, [])
        self.assertEqual(caret.in_reply_to, None)


class PrinterTestBase(unittest.TestCase):
filename = 'hotos17.pdf'

Expand Down
Binary file added tests/caret.pdf
Binary file not shown.
Loading