From 8a5d858b6090a81997b55eac510e968e9b73406b Mon Sep 17 00:00:00 2001 From: Jeremy Singer-Vine Date: Wed, 1 Apr 2020 09:13:18 -0400 Subject: [PATCH] Improve/fix extract_text bug, esp. re. vert. text Previously, utils.extract_text(...) returned incorrect results in certain cases when vertical text was present, as observed in https://github.com/jsvine/pdfplumber/pull/192. This commit fixes that by first segregating vertical and horizontal text (via "upright" char attribute) before clustering characters. It also adds two parameters, horizontal_ltr and vertical_ttb, to give users control over whether words are meant to be read left-to-right and/or top-to-bottom vs. their opposites. --- README.md | 2 +- pdfplumber/utils.py | 48 ++++++++++++++++++++++++++++++++++----------- 2 files changed, 38 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 5ddc7b8e..97e5be39 100644 --- a/README.md +++ b/README.md @@ -94,7 +94,7 @@ The `pdfplumber.Page` class is at the core of `pdfplumber`. Most things you'll d |`.within_bbox(bounding_box)`| Similar to `.crop`, but only retains objects that fall *entirely* within the bounding box.| |`.filter(test_function)`| Returns a version of the page with only the `.objects` for which `test_function(obj)` returns `True`.| |`.extract_text(x_tolerance=0, y_tolerance=0)`| Collates all of the page's character objects into a single string. Adds spaces where the difference between the `x1` of one character and the `x0` of the next is greater than `x_tolerance`. Adds newline characters where the difference between the `doctop` of one character and the `doctop` of the next is greater than `y_tolerance`.| -|`.extract_words(x_tolerance=0, y_tolerance=0)`| Returns a list of all word-looking things and their bounding boxes. 
Words are considered to be sequences of characters where the difference between the `x1` of one character and the `x0` of the next is less than or equal to `x_tolerance` *and* where the `doctop` of one character and the `doctop` of the next is less than or equal to `y_tolerance`.| +|`.extract_words(x_tolerance=0, y_tolerance=0, horizontal_ltr=True, vertical_ttb=True)`| Returns a list of all word-looking things and their bounding boxes. Words are considered to be sequences of characters where (for "upright" characters) the difference between the `x1` of one character and the `x0` of the next is less than or equal to `x_tolerance` *and* where the `doctop` of one character and the `doctop` of the next is less than or equal to `y_tolerance`. A similar approach is taken for non-upright characters, but instead measuring the vertical, rather than horizontal, distances between them. The parameters `horizontal_ltr` and `vertical_ttb` indicate whether the words should be read from left-to-right (for horizontal words) / top-to-bottom (for vertical words).| |`.extract_tables(table_settings)`| Extracts tabular data from the page. For more details see "[Extracting tables](#extracting-tables)" below.| |`.to_image(**conversion_kwargs)`| Returns an instance of the `PageImage` class. For more details, see "[Visual debugging](#visual-debugging)" below. For conversion_kwargs, see [here](http://docs.wand-py.org/en/latest/wand/image.html#wand.image.Image).| diff --git a/pdfplumber/utils.py b/pdfplumber/utils.py index 4a8cf812..7f581f8a 100644 --- a/pdfplumber/utils.py +++ b/pdfplumber/utils.py @@ -164,10 +164,13 @@ def bbox_to_rect(bbox): "bottom": bbox[3] } + def extract_words(chars, x_tolerance=DEFAULT_X_TOLERANCE, y_tolerance=DEFAULT_Y_TOLERANCE, - keep_blank_chars=False + keep_blank_chars=False, + horizontal_ltr = True, # Should words be read left-to-right? + vertical_ttb = True, # Should vertical words be read top-to-bottom? 
): x_tolerance = decimalize(x_tolerance) @@ -183,10 +186,13 @@ def process_word_chars(chars): "text": "".join(map(itemgetter("text"), chars)) } - - def get_line_words(chars, tolerance=DEFAULT_X_TOLERANCE): + def get_line_words(chars, is_upright, tolerance=DEFAULT_X_TOLERANCE): get_text = itemgetter("text") - chars_sorted = sorted(chars, key=itemgetter("x0")) + min_key = "x0" if is_upright else "top" + max_key = "x1" if is_upright else "bottom" + + chars_sorted = sorted(chars, key=itemgetter(min_key)) + words = [] current_word = [] @@ -200,23 +206,43 @@ def get_line_words(chars, tolerance=DEFAULT_X_TOLERANCE): current_word.append(char) else: last_char = current_word[-1] - if char["x0"] > (last_char["x1"] + tolerance): + if char[min_key] > (last_char[max_key] + tolerance): words.append(current_word) current_word = [] current_word.append(char) if len(current_word) > 0: - words.append(current_word) + if upright: + if horizontal_ltr: + sorted_chars = current_word + else: + sorted_chars = sorted(current_word, key = lambda x: -x["x1"]) + else: + if vertical_ttb: + sorted_chars = sorted(current_word, key = itemgetter("doctop")) + else: + sorted_chars = sorted(current_word, key = lambda x: -x["bottom"]) + + words.append(sorted_chars) + processed_words = list(map(process_word_chars, words)) return processed_words - chars = to_list(chars) - doctop_clusters = cluster_objects(chars, "doctop", y_tolerance) + chars_by_upright = { 1: [], 0: [] } + words = [] + for char in to_list(chars): + chars_by_upright[char.get("upright", 1)].append(char) + + for upright, char_group in chars_by_upright.items(): + clusters = cluster_objects( + char_group, + "doctop" if upright else "x0", + y_tolerance, # Still use y-tolerance here, even for vertical words + ) - nested = [ get_line_words(line_chars, tolerance=x_tolerance) - for line_chars in doctop_clusters ] + for line_chars in clusters: + words += get_line_words(line_chars, upright, tolerance = x_tolerance) - words = 
list(itertools.chain(*nested)) return words def extract_text(chars,