diff --git a/README.md b/README.md index 72f4f4c1..5f68704b 100644 --- a/README.md +++ b/README.md @@ -100,7 +100,7 @@ The `pdfplumber.Page` class is at the core of `pdfplumber`. Most things you'll d |`.within_bbox(bounding_box, relative=False)`| Similar to `.crop`, but only retains objects that fall *entirely* within the bounding box.| |`.filter(test_function)`| Returns a version of the page with only the `.objects` for which `test_function(obj)` returns `True`.| |`.extract_text(x_tolerance=3, y_tolerance=3)`| Collates all of the page's character objects into a single string. Adds spaces where the difference between the `x1` of one character and the `x0` of the next is greater than `x_tolerance`. Adds newline characters where the difference between the `doctop` of one character and the `doctop` of the next is greater than `y_tolerance`.| -|`.extract_words(x_tolerance=3, y_tolerance=3, horizontal_ltr=True, vertical_ttb=True)`| Returns a list of all word-looking things and their bounding boxes. Words are considered to be sequences of characters where (for "upright" characters) the difference between the `x1` of one character and the `x0` of the next is less than or equal to `x_tolerance` *and* where the `doctop` of one character and the `doctop` of the next is less than or equal to `y_tolerance`. A similar approach is taken for non-upright characters, but instead measuring the vertical, rather than horizontal, distances between them. The parameters `horizontal_ltr` and `vertical_ttb` indicate whether the words should be read from left-to-right (for horizontal words) / top-to-bottom (for vertical words).| +|`.extract_words(x_tolerance=3, y_tolerance=3, keep_blank_chars=False, horizontal_ltr=True, vertical_ttb=True, extra_attrs=[])`| Returns a list of all word-looking things and their bounding boxes. 
Words are considered to be sequences of characters where (for "upright" characters) the difference between the `x1` of one character and the `x0` of the next is less than or equal to `x_tolerance` *and* where the `doctop` of one character and the `doctop` of the next is less than or equal to `y_tolerance`. A similar approach is taken for non-upright characters, but instead measuring the vertical, rather than horizontal, distances between them. The parameters `horizontal_ltr` and `vertical_ttb` indicate whether the words should be read from left-to-right (for horizontal words) / top-to-bottom (for vertical words). Changing `keep_blank_chars` to `True` will mean that blank characters are treated as part of a word, not as a space between words. Passing a list of `extra_attrs` (e.g., `["fontname", "size"]`) will restrict each word to characters that share exactly the same value for each of those attributes, and the resulting word dicts will indicate those attributes.| |`.extract_tables(table_settings)`| Extracts tabular data from the page. For more details see "[Extracting tables](#extracting-tables)" below.| |`.to_image(**conversion_kwargs)`| Returns an instance of the `PageImage` class. For more details, see "[Visual debugging](#visual-debugging)" below. For conversion_kwargs, see [here](http://docs.wand-py.org/en/latest/wand/image.html#wand.image.Image).| diff --git a/pdfplumber/utils.py b/pdfplumber/utils.py index 6db98c78..29ab9ec2 100644 --- a/pdfplumber/utils.py +++ b/pdfplumber/utils.py @@ -205,86 +205,103 @@ def bbox_to_rect(bbox): return {"x0": bbox[0], "top": bbox[1], "x1": bbox[2], "bottom": bbox[3]} -def extract_words( - chars, - x_tolerance=DEFAULT_X_TOLERANCE, - y_tolerance=DEFAULT_Y_TOLERANCE, - keep_blank_chars=False, - horizontal_ltr=True, # Should words be read left-to-right? - vertical_ttb=True, # Should vertical words be read top-to-bottom? 
-): +def merge_chars(ordered_chars, extra_attrs=[]): + x0, top, x1, bottom = objects_to_bbox(ordered_chars) + + word = { + "text": "".join(map(itemgetter("text"), ordered_chars)), + "x0": x0, + "x1": x1, + "top": top, + "bottom": bottom, + "upright": ordered_chars[0]["upright"], + } - x_tolerance = decimalize(x_tolerance) - y_tolerance = decimalize(y_tolerance) + for key in extra_attrs: + word[key] = ordered_chars[0][key] - def process_word_chars(chars, upright): - x0, top, x1, bottom = objects_to_bbox(chars) + return word - return { - "x0": x0, - "x1": x1, - "top": top, - "bottom": bottom, - "upright": upright, - "text": "".join(map(itemgetter("text"), chars)), - } - def get_line_words(chars, upright, tolerance): - get_text = itemgetter("text") - if upright: - min_key, max_key = ("x0", "x1") if horizontal_ltr else ("x1", "x0") - else: - min_key, max_key = ("top", "bottom") if vertical_ttb else ("bottom", "top") +def cluster_line_chars( + chars, tolerance, keep_blank_chars=False, min_key="x0", max_key="x1", sort_asc=True +): + get_text = itemgetter("text") + + words = [] + current_word = [] - words = [] - current_word = [] + comp_fn = gt if sort_asc else lt + tol_fn = add if sort_asc else sub - asc_order = (upright and horizontal_ltr) or (not upright and vertical_ttb) + def sort_key(x): + return tol_fn(0, x[min_key]) - comp_fn = gt if asc_order else lt - tol_fn = add if asc_order else sub + sorted_chars = sorted(chars, key=sort_key) - def sort_key(x): - return tol_fn(0, x[min_key]) + for char in sorted_chars: + if not keep_blank_chars and get_text(char).isspace(): + if len(current_word) > 0: + words.append(current_word) + current_word = [] + elif len(current_word) == 0: + current_word.append(char) + else: + last_char = current_word[-1] + prev_pos = tol_fn(last_char[max_key], tolerance) + if comp_fn(char[min_key], prev_pos): + words.append(current_word) + current_word = [] + current_word.append(char) + + if len(current_word) > 0: + words.append(current_word) - 
sorted_chars = sorted(chars, key=sort_key) + return words - for char in sorted_chars: - if not keep_blank_chars and get_text(char).isspace(): - if len(current_word) > 0: - words.append(current_word) - current_word = [] - else: - pass - elif len(current_word) == 0: - current_word.append(char) - else: - last_char = current_word[-1] - prev_pos = tol_fn(last_char[max_key], tolerance) - if comp_fn(char[min_key], prev_pos): - words.append(current_word) - current_word = [] - current_word.append(char) - if len(current_word) > 0: - words.append(current_word) +def extract_words( + chars, + x_tolerance=DEFAULT_X_TOLERANCE, + y_tolerance=DEFAULT_Y_TOLERANCE, + keep_blank_chars=False, + horizontal_ltr=True, # Should words be read left-to-right? + vertical_ttb=True, # Should vertical words be read top-to-bottom? + extra_attrs=[], +): - return [process_word_chars(chars, upright) for chars in words] + x_tolerance = decimalize(x_tolerance) + y_tolerance = decimalize(y_tolerance) - chars_by_upright = {True: [], False: []} words = [] - for char in to_list(chars): - chars_by_upright[char.get("upright", False)].append(char) + grouped = itertools.groupby(chars, itemgetter("upright", *extra_attrs)) + + for keyvals, char_group in grouped: + upright = keyvals[0] if len(extra_attrs) else keyvals - for upright, char_group in chars_by_upright.items(): clusters = cluster_objects( char_group, "doctop" if upright else "x0", y_tolerance, # Still use y-tolerance here, even for vertical words ) + sort_asc = (upright and horizontal_ltr) or (not upright and vertical_ttb) + min_key, max_key = ("x0", "x1") if upright else ("top", "bottom") + + if not sort_asc: + min_key, max_key = max_key, min_key + for line_chars in clusters: - words += get_line_words(line_chars, upright, tolerance=x_tolerance) + word_clusters = cluster_line_chars( + line_chars, + # Still use x-tolerance here, even for vertical words + tolerance=x_tolerance, + keep_blank_chars=keep_blank_chars, + min_key=min_key, + max_key=max_key, + 
sort_asc=sort_asc, + ) + words += [merge_chars(c, extra_attrs) for c in word_clusters] return words diff --git a/tests/test_utils.py b/tests/test_utils.py index ece054a8..b64c1b0f 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -68,11 +68,20 @@ def test_extract_words(self): with pdfplumber.open(path) as pdf: p = pdf.pages[0] words = p.extract_words(vertical_ttb=False) + words_attr = p.extract_words(vertical_ttb=False, extra_attrs = [ "size" ]) + words_w_spaces = p.extract_words(vertical_ttb=False, keep_blank_chars=True) words_rtl = p.extract_words(horizontal_ltr=False) assert words[0]["text"] == "Agaaaaa:" + + assert "size" not in words[0] + assert float(words_attr[0]["size"]) == 9.960 + + assert words_w_spaces[0]["text"] == "Agaaaaa: AAAA" + vertical = [w for w in words if w["upright"] == 0] assert vertical[0]["text"] == "Aaaaaabag8" + assert words_rtl[1]["text"] == "baaabaaA/AAA" def test_extract_text(self):