From 6acd580ceb50b376e4e6c520ba23a2f03b557ad9 Mon Sep 17 00:00:00 2001 From: Jeremy Singer-Vine Date: Tue, 11 Apr 2023 16:35:18 -0400 Subject: [PATCH] Make char_begins_new_word more explicit & rigorous This commit aims to fix two things: - The semi-crypticness of the previous version of char_begins_new_word - The inconsistency (vs. the rest of the approach) in how the method was comparing "top" to "bottom" for interline comparisons, instead of "top" to "top", as rightly and helpfully pointed out by @bellma-lilly in https://github.com/jsvine/pdfplumber/discussions/840 Based on the unit tests, this shouldn't change the output of `pdfplumber` in the vast majority of use cases. It might affect some output in edge-cases, for which I apologize for any inconvenience and which I hope is balanced out by this more consistent approach's benefits in the long run. --- pdfplumber/utils/text.py | 74 +++++++++++++++++++++++++++++++++++----- 1 file changed, 66 insertions(+), 8 deletions(-) diff --git a/pdfplumber/utils/text.py b/pdfplumber/utils/text.py index ea65b6b5..e286a9c6 100644 --- a/pdfplumber/utils/text.py +++ b/pdfplumber/utils/text.py @@ -265,18 +265,76 @@ def merge_chars(self, ordered_chars: T_obj_list) -> T_obj: def char_begins_new_word( self, prev_char: T_obj, - next_char: T_obj, + curr_char: T_obj, ) -> bool: + """This method takes several factors into account to determine if + `curr_char` represents the beginning of a new word: + + - Whether the text is "upright" (i.e., non-rotated) + - Whether the user has specified that horizontal text runs + left-to-right (default) or right-to-left, as represented by + self.horizontal_ltr + - Whether the user has specified that vertical text runs + top-to-bottom (default) or bottom-to-top, as represented by + self.vertical_ttb + - The x0, top, x1, and bottom attributes of prev_char and + curr_char + - The self.x_tolerance and self.y_tolerance settings. 
Note: In + this case, x/y refer to those directions for non-rotated text. + For vertical text, they are flipped. A more accurate terminology + might be "*intra*line character distance tolerance" and + "*inter*line character distance tolerance" + + An important note: The *intra*line distance is measured from the + *end* of the previous character to the *beginning* of the current + character, while the *inter*line distance is measured from the + *top* of the previous character to the *top* of the current + character. The reasons for this are partly repository-historical, + and partly logical, as successive text lines' bounding boxes often + overlap slightly (and we don't want that overlap to be interpreted + as the two lines being the same line). + + The upright-ness of the character determines the attributes to + compare, while horizontal_ltr/vertical_ttb determine the direction + of the comparison. + """ + + # Note: Due to the grouping step earlier in the process, + # curr_char["upright"] will always equal prev_char["upright"]. 
+ if curr_char["upright"]: + inter_tol = self.y_tolerance + intra_tol = self.x_tolerance + + inter_attr = "top" + intra_attr_min = "x0" + intra_attr_max = "x1" + + if self.horizontal_ltr: + char_min = prev_char + char_max = curr_char + else: + char_min = curr_char + char_max = prev_char + else: + inter_tol = self.x_tolerance + intra_tol = self.y_tolerance - upright = prev_char["upright"] - intraline_tol = self.x_tolerance if upright else self.y_tolerance - interline_tol = self.y_tolerance if upright else self.x_tolerance + inter_attr = "x0" + intra_attr_min = "top" + intra_attr_max = "bottom" + + if self.vertical_ttb: + char_min = curr_char + char_max = prev_char + else: + char_min = prev_char + char_max = curr_char return bool( - (next_char["x0"] > prev_char["x1"] + intraline_tol) - or (next_char["x1"] < prev_char["x0"] - intraline_tol) - or (next_char["top"] > prev_char["bottom"] + interline_tol) - or (next_char["bottom"] < prev_char["top"] - interline_tol) + # Intraline test + (char_max[intra_attr_min] > char_min[intra_attr_max] + intra_tol) + # Interline test + or (char_max[inter_attr] > char_min[inter_attr] + inter_tol) ) def iter_chars_to_words(