Skip to content

Commit

Permalink
Add .curve_edges, use by default in table-detect.
Browse files Browse the repository at this point in the history
Most of the groundwork was already there to add a PDF/Page.curve_edges
property. And, inspired by
#858 and related issues,
we now include 0/90/180/270-degree oriented curve segments in the
default table-detection strategy. As before, you can still switch to the
"lines_strict" strategy to use only lines defined as such (rather than also using
rect and curve edges).
  • Loading branch information
jsvine committed Apr 13, 2023
1 parent 4b37397 commit 6f6b465
Show file tree
Hide file tree
Showing 6 changed files with 45 additions and 16 deletions.
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,9 @@ my_char_rotation = my_char_ctm.skew_x
|`non_stroking_color`|The curve’s fill color.|
|`object_type`| "curve"|

Additionally, both `pdfplumber.PDF` and `pdfplumber.Page` provide access to two derived lists of objects: `.rect_edges` (which decomposes each rectangle into its four lines) and `.edges` (which combines `.rect_edges` with `.lines`).
#### Derived properties

Additionally, both `pdfplumber.PDF` and `pdfplumber.Page` provide access to several derived lists of objects: `.rect_edges` (which decomposes each rectangle into its four lines), `.curve_edges` (which decomposes each `curve` object into the line segments connecting its successive points), and `.edges` (which combines `.rect_edges`, `.curve_edges`, and `.lines`).

#### `image` properties

Expand Down
12 changes: 10 additions & 2 deletions pdfplumber/container.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@


class Container(object):
cached_properties = ["_rect_edges", "_edges", "_objects"]
cached_properties = ["_rect_edges", "_curve_edges", "_edges", "_objects"]

@property
def pages(self) -> Optional[List[Any]]:
Expand Down Expand Up @@ -73,12 +73,20 @@ def rect_edges(self) -> T_obj_list:
self._rect_edges: T_obj_list = list(chain(*rect_edges_gen))
return self._rect_edges

@property
def curve_edges(self) -> T_obj_list:
    """Edge objects derived from this container's curves.

    Every curve in ``self.curves`` is passed through
    ``utils.curve_to_edges`` and the per-curve results are flattened into
    a single list. The list is built on first access and stored on
    ``self._curve_edges``; later accesses return that stored list.
    """
    if not hasattr(self, "_curve_edges"):
        edges_per_curve = map(utils.curve_to_edges, self.curves)
        self._curve_edges: T_obj_list = list(chain.from_iterable(edges_per_curve))
    return self._curve_edges

@property
def edges(self) -> T_obj_list:
    """All edges of this container, combined into one list.

    The result is the concatenation, in this order, of:

    1. each object in ``self.lines`` converted with ``utils.line_to_edge``,
    2. ``self.rect_edges``,
    3. ``self.curve_edges``.

    The list is built on first access and stored on ``self._edges``;
    later accesses return that stored list.
    """
    if hasattr(self, "_edges"):
        return self._edges
    line_edges = list(map(utils.line_to_edge, self.lines))
    # Single assignment only: the earlier ``rect_edges + line_edges`` form
    # omitted curve edges and was overwritten immediately anyway.
    self._edges: T_obj_list = line_edges + self.rect_edges + self.curve_edges
    return self._edges

@property
Expand Down
14 changes: 8 additions & 6 deletions pdfplumber/utils/geometry.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,7 @@ def curve_to_edges(curve: T_obj) -> T_obj_list:
point_pairs = zip(curve["pts"], curve["pts"][1:])
return [
{
"object_type": "curve_edge",
"x0": min(p0[0], p1[0]),
"x1": max(p0[0], p1[0]),
"top": min(p0[1], p1[1]),
Expand Down Expand Up @@ -253,12 +254,13 @@ def line_to_edge(line: T_obj) -> T_obj:


def obj_to_edges(obj: T_obj) -> T_obj_list:
    """Return the list of edges that ``obj`` contributes.

    Dispatches on ``obj["object_type"]``:

    - a type containing ``"_edge"`` (e.g. ``"rect_edge"``, ``"curve_edge"``)
      is already an edge and is returned unchanged in a one-element list;
    - ``"line"`` is converted with ``line_to_edge``;
    - ``"rect"`` and ``"curve"`` are decomposed with ``rect_to_edges`` and
      ``curve_to_edges`` respectively.

    Raises:
        KeyError: if ``obj`` has no ``"object_type"`` key, or its value is
            none of the types listed above.
    """
    t = obj["object_type"]
    if "_edge" in t:
        # Already an edge: pass it through rather than decomposing it again.
        return [obj]
    elif t == "line":
        return [line_to_edge(obj)]
    else:
        return {"rect": rect_to_edges, "curve": curve_to_edges}[t](obj)


def filter_edges(
Expand Down
Binary file added tests/pdfs/table-curves-example.pdf
Binary file not shown.
18 changes: 11 additions & 7 deletions tests/test_basics.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,14 @@ class Test(unittest.TestCase):
def setup_class(self):
path = os.path.join(HERE, "pdfs/nics-background-checks-2015-11.pdf")
self.pdf = pdfplumber.open(path)
# via http://www.pdfill.com/example/pdf_drawing_new.pdf
path_2 = os.path.join(HERE, "pdfs/pdffill-demo.pdf")
self.pdf_2 = pdfplumber.open(path_2)

@classmethod
def teardown_class(self):
# Close both PDFs opened in setup_class (self.pdf and self.pdf_2).
self.pdf.close()
self.pdf_2.close()

def test_metadata(self):
metadata = self.pdf.metadata
Expand All @@ -38,18 +42,18 @@ def test_objects(self):
assert len(self.pdf.rects)
assert len(self.pdf.lines)
assert len(self.pdf.rect_edges)
assert len(self.pdf_2.curve_edges)
# Ensure that caching is working:
assert id(self.pdf._rect_edges) == id(self.pdf.rect_edges)
assert id(self.pdf_2._curve_edges) == id(self.pdf_2.curve_edges)
assert id(self.pdf.pages[0]._layout) == id(self.pdf.pages[0].layout)

def test_annots(self):
# via http://www.pdfill.com/example/pdf_drawing_new.pdf
path = os.path.join(HERE, "pdfs/pdffill-demo.pdf")
with pdfplumber.open(path) as pdf:
assert len(pdf.annots)
assert len(pdf.hyperlinks) == 17
uri = "http://www.pdfill.com/pdf_drawing.html"
assert pdf.hyperlinks[0]["uri"] == uri
pdf = self.pdf_2
assert len(pdf.annots)
assert len(pdf.hyperlinks) == 17
uri = "http://www.pdfill.com/pdf_drawing.html"
assert pdf.hyperlinks[0]["uri"] == uri

path = os.path.join(HERE, "pdfs/annotations.pdf")
with pdfplumber.open(path) as pdf:
Expand Down
13 changes: 13 additions & 0 deletions tests/test_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,3 +211,16 @@ def test_discussion_539_null_value(self):
}
assert page.extract_table(table_settings)
assert page.extract_tables(table_settings)

def test_table_curves(self):
    """Check table extraction on a PDF whose page contains curve objects.

    With default settings exactly one table is expected; with the vertical
    strategy set to "lines_strict" no table is expected.
    """
    # See https://github.com/jsvine/pdfplumber/discussions/808
    pdf_path = os.path.join(HERE, "pdfs/table-curves-example.pdf")
    with pdfplumber.open(pdf_path) as pdf:
        first_page = pdf.pages[0]
        assert len(first_page.curves)

        found = first_page.extract_tables()
        assert len(found) == 1
        table = found[0]
        assert table[-2][-2] == "Uncommon"

        strict_settings = {"vertical_strategy": "lines_strict"}
        assert len(first_page.extract_tables(strict_settings)) == 0

0 comments on commit 6f6b465

Please sign in to comment.