Improve/fix extract_text bug, esp. re. vert. text

Previously, utils.extract_text(...) returned incorrect results in certain cases when vertical text was present, as observed in #192. This commit fixes that by first segregating vertical and horizontal text (via "upright" char attribute) before clustering characters. It also adds two parameters, horizontal_ltr and vertical_ttb, to give users control over whether words are meant to be read left-to-right and/or top-to-bottom vs. their opposites.
jsvine · Apr 1, 2020 · 8a5d858 · 8a5d858
1 parent b498df2
commit 8a5d858
Show file tree

Hide file tree

Showing 2 changed files with 38 additions and 12 deletions.
diff --git a/README.md b/README.md
@@ -94,7 +94,7 @@ The `pdfplumber.Page` class is at the core of `pdfplumber`. Most things you'll d
 |`.within_bbox(bounding_box)`| Similar to `.crop`, but only retains objects that fall *entirely* within the bounding box.|
 |`.filter(test_function)`| Returns a version of the page with only the `.objects` for which `test_function(obj)` returns `True`.|
 |`.extract_text(x_tolerance=0, y_tolerance=0)`| Collates all of the page's character objects into a single string. Adds spaces where the difference between the `x1` of one character and the `x0` of the next is greater than `x_tolerance`. Adds newline characters where the difference between the `doctop` of one character and the `doctop` of the next is greater than `y_tolerance`.|
-|`.extract_words(x_tolerance=0, y_tolerance=0)`| Returns a list of all word-looking things and their bounding boxes. Words are considered to be sequences of characters where the difference between the `x1` of one character and the `x0` of the next is less than or equal to `x_tolerance` *and* where the `doctop` of one character and the `doctop` of the next is less than or equal to `y_tolerance`.|
+|`.extract_words(x_tolerance=0, y_tolerance=0, horizontal_ltr=True, vertical_ttb=True)`| Returns a list of all word-looking things and their bounding boxes. Words are considered to be sequences of characters where (for "upright" characters) the difference between the `x1` of one character and the `x0` of the next is less than or equal to `x_tolerance` *and* where the `doctop` of one character and the `doctop` of the next is less than or equal to `y_tolerance`. A similar approach is taken for non-upright characters, but instead measuring the vertical, rather than horizontal, distances between them. The parameters `horizontal_ltr` and `vertical_ttb` indicate whether the words should be read from left-to-right (for horizontal words) / top-to-bottom (for vertical words).|
 |`.extract_tables(table_settings)`| Extracts tabular data from the page. For more details see "[Extracting tables](#extracting-tables)" below.|
 |`.to_image(**conversion_kwargs)`| Returns an instance of the `PageImage` class. For more details, see "[Visual debugging](#visual-debugging)" below. For conversion_kwargs, see [here](http://docs.wand-py.org/en/latest/wand/image.html#wand.image.Image).|
 

diff --git a/pdfplumber/utils.py b/pdfplumber/utils.py
@@ -164,10 +164,13 @@ def bbox_to_rect(bbox):
         "bottom": bbox[3]
     }
 
+
 def extract_words(chars,
     x_tolerance=DEFAULT_X_TOLERANCE,
     y_tolerance=DEFAULT_Y_TOLERANCE,
-    keep_blank_chars=False
+    keep_blank_chars=False,
+    horizontal_ltr = True, # Should words be read left-to-right?
+    vertical_ttb = True, # Should vertical words be read top-to-bottom?
     ):
 
     x_tolerance = decimalize(x_tolerance)
@@ -183,10 +186,13 @@ def process_word_chars(chars):
             "text": "".join(map(itemgetter("text"), chars))
         }
 
-
-    def get_line_words(chars, tolerance=DEFAULT_X_TOLERANCE):
+    def get_line_words(chars, is_upright, tolerance=DEFAULT_X_TOLERANCE):
         get_text = itemgetter("text")
-        chars_sorted = sorted(chars, key=itemgetter("x0"))
+        min_key = "x0" if is_upright else "top"
+        max_key = "x1" if is_upright else "bottom"
+
+        chars_sorted = sorted(chars, key=itemgetter(min_key))
+
         words = []
         current_word = []
 
@@ -200,23 +206,43 @@ def get_line_words(chars, tolerance=DEFAULT_X_TOLERANCE):
                 current_word.append(char)
             else:
                 last_char = current_word[-1]
-                if char["x0"] > (last_char["x1"] + tolerance):
+                if char[min_key] > (last_char[max_key] + tolerance):
                     words.append(current_word)
                     current_word = []
                 current_word.append(char)
 
         if len(current_word) > 0:
-            words.append(current_word)
+            if upright:
+                if horizontal_ltr:
+                    sorted_chars = current_word
+                else:
+                    sorted_chars = sorted(current_word, key = lambda x: -x["x1"])
+            else:
+                if vertical_ttb:
+                    sorted_chars = sorted(current_word, key = itemgetter("doctop"))
+                else:
+                    sorted_chars = sorted(current_word, key = lambda x: -x["bottom"])
+
+            words.append(sorted_chars)
+
         processed_words = list(map(process_word_chars, words))
         return processed_words
 
-    chars = to_list(chars)
-    doctop_clusters = cluster_objects(chars, "doctop", y_tolerance)
+    chars_by_upright = { 1: [], 0: [] }
+    words = []
+    for char in to_list(chars):
+        chars_by_upright[char.get("upright", 1)].append(char)
+
+    for upright, char_group in chars_by_upright.items():
+        clusters = cluster_objects(
+            char_group,
+            "doctop" if upright else "x0",
+            y_tolerance, # Still use y-tolerance here, even for vertical words
+        )
 
-    nested = [ get_line_words(line_chars, tolerance=x_tolerance)
-        for line_chars in doctop_clusters ]
+        for line_chars in clusters:
+            words += get_line_words(line_chars, upright, tolerance = x_tolerance)
 
-    words = list(itertools.chain(*nested))
     return words
 
 def extract_text(chars,