Skip to content
This repository has been archived by the owner on Jan 6, 2025. It is now read-only.

[WIP] PDFHandler accepts file like objects #189

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 7 additions & 6 deletions camelot/handlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,13 @@
import os
import sys

from builtins import str
from PyPDF2 import PdfFileReader, PdfFileWriter

from .core import TableList
from .parsers import Stream, Lattice
from .utils import (TemporaryDirectory, get_page_layout, get_text_objects,
get_rotation)
get_rotation, is_url)


class PDFHandler(object):
Expand All @@ -27,11 +28,11 @@ class PDFHandler(object):
Password for decryption.

"""
def __init__(self, filename, pages='1', password=None):
self.filename = filename
if not filename.lower().endswith('.pdf'):
def __init__(self, io, pages='1', password=None):
self.io = io
if isinstance(self.io, str) and not self.io.endswith('.pdf'):
raise NotImplementedError("File format not supported")
self.pages = self._get_pages(self.filename, pages)
self.pages = self._get_pages(self.io, pages)
if password is None:
self.password = ''
else:
Expand Down Expand Up @@ -149,7 +150,7 @@ def parse(self, flavor='lattice', **kwargs):
tables = []
with TemporaryDirectory() as tempdir:
for p in self.pages:
self._save_page(self.filename, p, tempdir)
self._save_page(self.io, p, tempdir)
pages = [os.path.join(tempdir, 'page-{0}.pdf'.format(p))
for p in self.pages]
parser = Lattice(**kwargs) if flavor == 'lattice' else Stream(**kwargs)
Expand Down
37 changes: 36 additions & 1 deletion camelot/utils.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,16 @@
from __future__ import division
from future.standard_library import hooks
import shutil
import tempfile
import warnings
from itertools import groupby
from operator import itemgetter
from contextlib import contextmanager, closing

with hooks():
from urllib.parse import (urlparse, uses_relative, uses_netloc, uses_params,
urlencode, urljoin)
from urllib.request import urlopen

import numpy as np

Expand All @@ -17,7 +24,8 @@
from pdfminer.layout import (LAParams, LTAnno, LTChar, LTTextLineHorizontal,
LTTextLineVertical)


_VALID_URLS = set(uses_relative + uses_netloc + uses_params)
_VALID_URLS.discard('')
stream_kwargs = [
'columns',
'row_close_tol',
Expand Down Expand Up @@ -639,3 +647,30 @@ def get_text_objects(layout, ltype="char", t=None):
except AttributeError:
pass
return t


def is_url(url):
    """Report whether `url` carries a scheme listed in ``_VALID_URLS``.

    Parameters
    ----------
    url : str or unicode
        Candidate string to inspect.

    Returns
    -------
    isurl : bool
        True when the scheme parsed from `url` is a member of
        ``_VALID_URLS``; False otherwise, including when parsing raises.
    """
    try:
        parsed = urlparse(url)
        recognised = parsed.scheme in _VALID_URLS
    except Exception:
        # Best-effort check: anything that cannot be parsed (for example a
        # non-string argument) is simply treated as "not a URL".
        recognised = False
    return recognised


@contextmanager
def pdf_file_reader(io):
    """Context manager yielding a readable binary stream for `io`.

    Parameters
    ----------
    io : str or file-like object
        One of:

        * an object exposing a ``read`` attribute -- yielded unchanged and
          left open on exit, since the caller owns it;
        * a string for which ``is_url`` returns True -- opened with
          ``urlopen`` and closed on exit;
        * anything else -- passed to ``open(io, 'rb')`` as a local path and
          closed on exit.

    Yields
    ------
    f : file-like object
        Stream from which the PDF bytes can be read.

    Raises
    ------
    IOError / OSError
        Propagated from ``open`` or ``urlopen`` when the source cannot be
        opened.
    """
    if hasattr(io, 'read'):
        # Already a stream: hand it back as-is. Checking this first also
        # avoids feeding a non-string object to the URL parser.
        yield io
    elif is_url(io):
        with closing(urlopen(io)) as f:
            yield f
    else:
        # Previously this branch was a bare ``pass``, so the generator never
        # yielded and ``with pdf_file_reader(path)`` raised RuntimeError.
        with open(io, 'rb') as f:
            yield f
17 changes: 9 additions & 8 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
click>=6.7
matplotlib>=2.2.3
numpy>=1.13.3
opencv-python>=3.4.2.17
openpyxl>=2.5.8
pandas>=0.23.4
pdfminer.six>=20170720
PyPDF2>=1.26.0
click>=6.7
matplotlib>=2.2.3
numpy>=1.13.3
opencv-python>=3.4.2.17
openpyxl>=2.5.8
pandas>=0.23.4
pdfminer.six>=20170720
PyPDF2>=1.26.0
future>=0.16.0