forked from RZachLamberty/mtg_data
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcommon.py
executable file
·75 lines (57 loc) · 1.85 KB
/
common.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Module: common.py
Author: zlamberty
Created: 2016-02-28
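Description:
common access values: the local html cache directory and a url download
helper that caches the downloaded pages on disk
Usage:
import common
root = common.url2html('http://example.com/page.html')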
"""
import logging
import os
import lxml.html
import requests
# ----------------------------- #
# Module Constants #
# ----------------------------- #
# local html caching: default directory in which url2html stores downloaded
# pages so that repeat requests for the same url can be read back from disk
HTML_DIR = os.path.join(os.sep, 'var', 'data', 'local_html_cache')
# module-level logger, named after this module
LOGGER = logging.getLogger(__name__)
# ----------------------------- #
# utility #
# ----------------------------- #
def url2html(url, localdir=HTML_DIR, forcerefresh=False, hidden=True,
             session=requests):
    """General purpose download tool; will save html files locally instead of
    making re-requests

    The local file is named after the last path component of the url, so two
    urls sharing that component also share one cache entry.

    args:
        url (str): url to request
        localdir (str): directory in which we will save files; it is created
            on demand when a page is about to be cached
            (default: common.HTML_DIR)
        forcerefresh (bool): whether or not we ignore local copy
        hidden (bool): whether or not files are saved as hidden locally
            (i.e. with a leading '.' in the file name)
        session (requests.Session): handy for multiple request scenarios;
            only its `get` method is used (default: the `requests` module)

    returns:
        lxml.html: parsed xml object obtained from possibly-cached raw html

    raises:
        None explicitly; errors from the request, the file system or the
        lxml parser propagate to the caller

    """
    # what is the local name? a url ending in '/' has an empty last path
    # component, which would make the cache "file" the directory itself, so
    # fall back to the component in front of the trailing slash(es)
    basename = os.path.basename(url) or os.path.basename(url.rstrip('/'))
    localname = os.path.join(
        localdir, '{}{}'.format('.' if hidden else '', basename)
    )

    # serve the local copy unless told otherwise (forcerefresh) or there is
    # no readable local copy
    if not forcerefresh and os.access(localname, os.R_OK):
        with open(localname, 'rb') as fp:
            return lxml.html.fromstring(fp.read())

    LOGGER.debug('active download of url: {}'.format(url))
    resp = session.get(url)

    # only cache successful responses: a cached error page would otherwise be
    # served on every later call until forcerefresh is used. objects without
    # an `ok` attribute are cached unconditionally, as before
    if getattr(resp, 'ok', True):
        # the cache directory may not exist yet on a fresh machine
        if not os.path.isdir(localdir):
            os.makedirs(localdir)
        with open(localname, 'wb') as fp:
            fp.write(resp.content)
    else:
        LOGGER.warning(
            'not caching unsuccessful download of url: {}'.format(url)
        )

    # the caller still gets whatever the server sent, parsed
    return lxml.html.fromstring(resp.content)