Skip to content

Commit

Permalink
fix: pdf2text imports
Browse files Browse the repository at this point in the history
  • Loading branch information
dvdblk committed Oct 26, 2023
1 parent 2492a62 commit 90ff310
Show file tree
Hide file tree
Showing 4 changed files with 9 additions and 7 deletions.
9 changes: 6 additions & 3 deletions experiments/alex/pdf2text/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,12 @@ class PDFConverter:
Note:
The full conversion pipeline is as follows:
1. `extract` text and formats from PDF (including tables and images)
a. `preprocess` the text (remove linebreaks, page numbers, headers, footers, etc.)
b. `postprocess` the text (pdf specific postprocessing e.g. remove first few pages of intro, blank pages, etc.)
1. `extract` text and formats from PDF (including text from tables and images)
a. `preprocess` the text: works on individual text paragraphs, tables, and images
(remove line breaks, join words, remove whitespace, etc.)
b. `postprocess` the text: works on full pages of preprocessed text
(e.g. remove the first few pages of intro, blank pages,
and page numbers)
2. (optional) `translate`
3. `format` the text to the desired output
"""
Expand Down
3 changes: 1 addition & 2 deletions experiments/alex/pdf2text/extract.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
from typing import Any, Generator, List, Tuple

import pdfplumber
from pdf2text.models import ExtractedPage
from pdf2text.preprocessing.manager import PreprocessorManager
from pdfminer.high_level import extract_pages, extract_text
from pdfminer.layout import LTChar, LTFigure, LTItem, LTRect, LTTextContainer

from .model import ExtractedPage


# TODO: Move this to formatter.py
def table_converter(table):
Expand Down
2 changes: 1 addition & 1 deletion experiments/alex/pdf2text/preprocessing/decorator.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from pdf2text.model import PDFElementType
from pdf2text.models import PDFElementType

_preprocessors = {
PDFElementType.TABLE: [],
Expand Down
2 changes: 1 addition & 1 deletion experiments/alex/pdf2text/preprocessing/manager.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from typing import Optional

from pdf2text.model import PDFElementType
from pdf2text.models import PDFElementType
from pdf2text.preprocessing.decorator import _preprocessors, _register_preprocessor
from pdf2text.preprocessing.result import (
ModifiedPreprocessResult,
Expand Down

0 comments on commit 90ff310

Please sign in to comment.