-
-
Notifications
You must be signed in to change notification settings - Fork 310
/
utils.py
48 lines (37 loc) · 1.45 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
# Copyright 2015-2021 Akretion France
# @author: Alexis de Lattre <[email protected]>
# Copyright 2022 Camptocamp SA
# @author: Simone Orsi <[email protected]>
# License LGPL-3.0 or later (http://www.gnu.org/licenses/lgpl).
import logging
from io import BytesIO
from struct import error as StructError
from lxml import etree
try:
from PyPDF2.errors import PdfReadError
except ImportError:
from PyPDF2.utils import PdfReadError
from odoo.tools.pdf import OdooPdfFileReader
_logger = logging.getLogger(__name__)
class PDFParser:
    """Extract the XML attachments embedded in a PDF document."""

    def __init__(self, pdf_file):
        """Keep the raw PDF content for later parsing.

        :param pdf_file: binary PDF file content (it is wrapped in a
            ``BytesIO`` when the attachments are read)
        """
        self.pdf_file = pdf_file

    def get_xml_files(self):
        """Collect the embedded attachments that parse as XML.

        Every attachment reported by the PDF reader is handed to
        ``etree.fromstring``. Attachments that cannot be handled are
        skipped and only reported at debug level.

        :returns: a dict like {$filename: $parsed_xml_file_obj}; empty
            when no attachment could be parsed.
        """
        xml_by_name = {}
        with BytesIO(self.pdf_file) as stream:
            reader = OdooPdfFileReader(stream, strict=False)
            for name, payload in reader.getAttachments():
                try:
                    root = etree.fromstring(payload)
                    xml_by_name[name] = root
                except Exception:
                    # Best effort: an attachment that fails here is simply
                    # not part of the result.
                    _logger.debug("Non XML file found in PDF")
        if xml_by_name:
            _logger.debug("Valid XML files found in PDF: %s", list(xml_by_name))
        return xml_by_name

    def get_xml_files_swallable_exceptions(self):
        """Return a tuple of exception classes.

        NOTE(review): presumably these are the errors a caller may catch
        and ignore around ``get_xml_files`` -- confirm against callers.

        :returns: ``(NotImplementedError, StructError, PdfReadError)``
        """
        swallable = (NotImplementedError, StructError, PdfReadError)
        return swallable