diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5425c5d3..1a52aac0 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 ### Changed
 
 - Using absolute instead of relative imports ([[#995](https://github.com/pdfminer/pdfminer.six/pull/995)])
+- Allow suppression of `(cid:N)` in `pdf2txt.py` ([#1070](https://github.com/pdfminer/pdfminer.six/pull/1070))
 
 ### Deprecated
 
diff --git a/samples/contrib/issue-1056-cid.pdf b/samples/contrib/issue-1056-cid.pdf
new file mode 100644
index 00000000..76c610a6
Binary files /dev/null and b/samples/contrib/issue-1056-cid.pdf differ
diff --git a/tests/test_tools_pdf2txt.py b/tests/test_tools_pdf2txt.py
index e80a5e69..ec0be9be 100644
--- a/tests/test_tools_pdf2txt.py
+++ b/tests/test_tools_pdf2txt.py
@@ -68,6 +68,21 @@ def test_contrib_issue_350(self):
         """
         run("contrib/issue-00352-asw-oct96-p41.pdf")
 
+    def test_contrib_issue_1056(self):
+        """Test fix to pdf2txt.py mentioned in
+        https://github.com/pdfminer/pdfminer.six/issues/1056"""
+        with TemporaryFilePath() as output_file_name:
+            pdf2txt.main(
+                [
+                    "--ignore-unmapped",
+                    f"-o{output_file_name}",
+                    absolute_sample_path("contrib/issue-1056-cid.pdf"),
+                ]
+            )
+            with open(output_file_name) as infh:
+                for line in infh:
+                    assert "(cid:" not in line
+
     def test_scancode_patchelf(self):
         """Regression test for https://github.com/euske/pdfminer/issues/96"""
         run("scancode/patchelf.pdf")
diff --git a/tools/pdf2txt.py b/tools/pdf2txt.py
index 3341690c..8344b1d8 100755
--- a/tools/pdf2txt.py
+++ b/tools/pdf2txt.py
@@ -9,6 +9,7 @@
 from typing import Any, Container, Iterable, List, Optional
 
 import pdfminer.high_level
+from pdfminer.converter import PDFLayoutAnalyzer
 from pdfminer.layout import LAParams
 from pdfminer.pdfexceptions import PDFValueError
 from pdfminer.utils import AnyIO
@@ -277,6 +278,13 @@ def create_parser() -> argparse.ArgumentParser:
         help="Remove control statement from text. "
         "Only used when output_type is xml.",
     )
+    output_params.add_argument(
+        "--ignore-unmapped",
+        "-I",
+        default=False,
+        action="store_true",
+        help="Ignore unmapped characters rather than outputting (cid:N) in the text",
+    )
 
     return parser
 
@@ -309,6 +317,11 @@ def parse_args(args: Optional[List[str]]) -> argparse.Namespace:
             if parsed_args.outfile.endswith(override):
                 parsed_args.output_type = alttype
 
+    if parsed_args.ignore_unmapped:
+        # NOTE: this rebinds the method on the class itself, so the override
+        # is process-wide and outlives this call to parse_args().
+        PDFLayoutAnalyzer.handle_undefined_char = lambda *_: ""  # type: ignore
+
     return parsed_args
 
 