Skip to content

Commit

Permalink
ROB: cope with str returned from get_data in cmap
Browse files Browse the repository at this point in the history
fixes py-pdf#1379
I have not been able to identify why get_data returns a str here instead of bytes as usual,
so I prefer to convert the value locally.
  • Loading branch information
pubpub-zz committed Oct 5, 2022
1 parent 9d870a2 commit f93f087
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 0 deletions.
2 changes: 2 additions & 0 deletions PyPDF2/_cmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,8 @@ def parse_to_unicode(

def prepare_cm(ft: DictionaryObject) -> bytes:
cm: bytes = cast(DecodedStreamObject, ft["/ToUnicode"]).get_data()
if isinstance(cm, str):
cm = cm.encode()
# we need to prepare cm before due to missing return line in pdf printed to pdf from word
cm = (
cm.strip()
Expand Down
7 changes: 7 additions & 0 deletions tests/test_cmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,3 +84,10 @@ def test_iss1370():
name = "cmap1370.pdf"
reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
reader.pages[0].extract_text()


def test_iss1379():
    """Regression test for py-pdf#1379.

    Downloads the sample PDF attached to the issue and extracts the text of
    its third page. The test makes no assertion on the extracted text; it
    passes as long as ``extract_text`` does not raise.

    Named after issue #1379 so that it does not shadow the separate
    ``test_iss1370`` regression test defined earlier in this module.
    """
    url = "https://github.com/py-pdf/PyPDF2/files/9712729/02voc.pdf"
    name = "02voc.pdf"
    reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
    reader.pages[2].extract_text()

0 comments on commit f93f087

Please sign in to comment.