Skip to content

Commit

Permalink
Add snap_x/y_tolerance to table-extraction settings
Browse files Browse the repository at this point in the history
Based largely on @dustindall's work in PR #51, adapted to current code.

Also resolves issue #475.
  • Loading branch information
jsvine committed Dec 1, 2021
1 parent 156bb4f commit 7ed4742
Show file tree
Hide file tree
Showing 4 changed files with 52 additions and 13 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ All notable changes to this project will be documented in this file. The format
- Add `.extract_text(layout=True)`, an *experimental feature* which attempts to mimic the structural layout of the text on the page. ([#10](https://github.com/jsvine/pdfplumber/issues/10))
- Add `utils.merge_bboxes(bboxes)`, which returns the smallest bounding box that contains all bounding boxes in the `bboxes` argument. ([f8d5e70](https://github.com/jsvine/pdfplumber/commit/f8d5e70a509aa9ed3ee565d7d3f97bb5ec67f5a5))
- Add `--precision` argument to CLI ([#520](https://github.com/jsvine/pdfplumber/pull/520))
- Add `snap_x_tolerance` and `snap_y_tolerance` to table extraction settings. ([#51](https://github.com/jsvine/pdfplumber/pull/51) + [#475](https://github.com/jsvine/pdfplumber/issues/475)) [h/t @dustindall]

## Changed
- Upgrade `pdfminer.six` from `20200517` to `20211012`; see [that library's changelog](https://github.com/pdfminer/pdfminer.six/blob/develop/CHANGELOG.md) for details, but a key difference is an improvement in how it assigns `line`, `rect`, and `curve` objects. (Diagonal two-point lines, for instance, are now `line` objects instead of `curve` objects.) ([#515](https://github.com/jsvine/pdfplumber/pull/515))
Expand Down
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -310,6 +310,8 @@ By default, `extract_tables` uses the page's vertical and horizontal lines (or r
"explicit_vertical_lines": [],
"explicit_horizontal_lines": [],
"snap_tolerance": 3,
"snap_x_tolerance": 3,
"snap_y_tolerance": 3,
"join_tolerance": 3,
"edge_min_length": 3,
"min_words_vertical": 3,
Expand All @@ -330,7 +332,7 @@ By default, `extract_tables` uses the page's vertical and horizontal lines (or r
|`"horizontal_strategy"`| Either `"lines"`, `"lines_strict"`, `"text"`, or `"explicit"`. See explanation below.|
|`"explicit_vertical_lines"`| A list of vertical lines that explicitly demarcate cells in the table. Can be used in combination with any of the strategies above. Items in the list should be either numbers — indicating the `x` coordinate of a line the full height of the page — or `line`/`rect`/`curve` objects.|
|`"explicit_horizontal_lines"`| A list of horizontal lines that explicitly demarcate cells in the table. Can be used in combination with any of the strategies above. Items in the list should be either numbers — indicating the `y` coordinate of a line the full width of the page — or `line`/`rect`/`curve` objects.|
|`"snap_tolerance"`| Parallel lines within `snap_tolerance` pixels will be "snapped" to the same horizontal or vertical position.|
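|`"snap_tolerance"`, `"snap_x_tolerance"`, `"snap_y_tolerance"`| Parallel lines within `snap_tolerance` pixels will be "snapped" to the same horizontal or vertical position. `snap_x_tolerance` applies to vertical lines (snapped along `x`) and `snap_y_tolerance` applies to horizontal lines (snapped along `y`); each falls back to `snap_tolerance` when not set.|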
|`"join_tolerance"`| Line segments on the same infinite line, and whose ends are within `join_tolerance` of one another, will be "joined" into a single line segment.|
|`"edge_min_length"`| Edges shorter than `edge_min_length` will be discarded before attempting to reconstruct the table.|
|`"min_words_vertical"`| When using `"vertical_strategy": "text"`, at least `min_words_vertical` words must share the same alignment.|
Expand Down
33 changes: 23 additions & 10 deletions pdfplumber/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,20 @@
DEFAULT_MIN_WORDS_HORIZONTAL = 1


def snap_edges(edges, tolerance=DEFAULT_SNAP_TOLERANCE):
def snap_edges(
edges, x_tolerance=DEFAULT_SNAP_TOLERANCE, y_tolerance=DEFAULT_SNAP_TOLERANCE
):
"""
Given a list of edges, snap vertical edges within `x_tolerance` pixels of
one another, and horizontal edges within `y_tolerance` pixels of one
another, to their positional average.
"""
v, h = [list(filter(lambda x: x["orientation"] == o, edges)) for o in ("v", "h")]
by_orientation = {"v": [], "h": []}
for e in edges:
by_orientation[e["orientation"]].append(e)

snap = utils.snap_objects
snapped = snap(v, "x0", tolerance) + snap(h, "top", tolerance)
return snapped
snapped_v = utils.snap_objects(by_orientation["v"], "x0", x_tolerance)
snapped_h = utils.snap_objects(by_orientation["h"], "top", y_tolerance)
return snapped_v + snapped_h


def join_edge_group(edges, orientation, tolerance=DEFAULT_JOIN_TOLERANCE):
Expand Down Expand Up @@ -47,7 +51,7 @@ def join_edge_group(edges, orientation, tolerance=DEFAULT_JOIN_TOLERANCE):
return joined


def merge_edges(edges, snap_tolerance, join_tolerance):
def merge_edges(edges, snap_x_tolerance, snap_y_tolerance, join_tolerance):
"""
Using the `snap_edges` and `join_edge_group` methods above,
merge a list of edges into a more "seamless" list.
Expand All @@ -59,8 +63,8 @@ def get_group(edge):
else:
return ("v", edge["x0"])

if snap_tolerance > 0:
edges = snap_edges(edges, snap_tolerance)
if snap_x_tolerance > 0 or snap_y_tolerance > 0:
edges = snap_edges(edges, snap_x_tolerance, snap_y_tolerance)

if join_tolerance > 0:
_sorted = sorted(edges, key=get_group)
Expand Down Expand Up @@ -412,6 +416,8 @@ def char_in_bbox(char, bbox):
"explicit_vertical_lines": [],
"explicit_horizontal_lines": [],
"snap_tolerance": DEFAULT_SNAP_TOLERANCE,
"snap_x_tolerance": None,
"snap_y_tolerance": None,
"join_tolerance": DEFAULT_JOIN_TOLERANCE,
"edge_min_length": 3,
"min_words_vertical": DEFAULT_MIN_WORDS_VERTICAL,
Expand Down Expand Up @@ -475,6 +481,8 @@ def resolve_table_settings(table_settings={}):
for var, fallback in [
("text_x_tolerance", "text_tolerance"),
("text_y_tolerance", "text_tolerance"),
("snap_x_tolerance", "snap_tolerance"),
("snap_y_tolerance", "snap_tolerance"),
("intersection_x_tolerance", "intersection_tolerance"),
("intersection_y_tolerance", "intersection_tolerance"),
]:
Expand Down Expand Up @@ -573,10 +581,15 @@ def get_edges(self):

edges = list(v) + list(h)

if settings["snap_tolerance"] > 0 or settings["join_tolerance"] > 0:
if (
settings["snap_x_tolerance"] > 0
or settings["snap_y_tolerance"] > 0
or settings["join_tolerance"] > 0
):
edges = merge_edges(
edges,
snap_tolerance=settings["snap_tolerance"],
snap_x_tolerance=settings["snap_x_tolerance"],
snap_y_tolerance=settings["snap_y_tolerance"],
join_tolerance=settings["join_tolerance"],
)
return utils.filter_edges(edges, min_length=settings["edge_min_length"])
27 changes: 25 additions & 2 deletions tests/test_ca_warn_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,11 +81,34 @@ def test_edge_merging(self):
p0 = self.pdf.pages[0]
assert len(p0.edges) == 364
assert (
len(table.merge_edges(p0.edges, snap_tolerance=3, join_tolerance=3)) == 46
len(
table.merge_edges(
p0.edges, snap_x_tolerance=3, snap_y_tolerance=3, join_tolerance=3
)
)
== 46
)
assert (
len(
table.merge_edges(
p0.edges, snap_x_tolerance=0, snap_y_tolerance=3, join_tolerance=3
)
)
== 94
)
assert (
len(
table.merge_edges(
p0.edges, snap_x_tolerance=3, snap_y_tolerance=0, join_tolerance=3
)
)
== 174
)

def test_vertices(self):
p0 = self.pdf.pages[0]
edges = table.merge_edges(p0.edges, snap_tolerance=3, join_tolerance=3)
edges = table.merge_edges(
p0.edges, snap_x_tolerance=3, snap_y_tolerance=3, join_tolerance=3
)
ixs = table.edges_to_intersections(edges)
assert len(ixs.keys()) == 304 # 38x8

0 comments on commit 7ed4742

Please sign in to comment.