Skip to content

Commit

Permalink
User-Agent Detection Fix + New-Style rewriting on by default + Depend…
Browse files Browse the repository at this point in the history
…ency Update (2.6.6) (#708)

* js rewriting: default to modern js-proxy based rewriting, and use legacy rewriting only if the browser is older than the minimum supported version, as suggested in #707
* user-agent detection: use ua_parser for user-agent detection instead of the obsolete werkzeug.useragents module, which also did not support browser versions >= 100
* tests: additional tests for rewriting with various user-agents, defaulting to new-style rewriting for unknown browsers
* dockerfile: Update Dockerfile to use py3.8
* tests: skip s3 tests dependent on commoncrawl data (for now, need better s3 tests).
* bump to 2.6.6, update CHANGES
  • Loading branch information
ikreymer authored Apr 11, 2022
1 parent 63ac82e commit 403167f
Show file tree
Hide file tree
Showing 8 changed files with 89 additions and 44 deletions.
8 changes: 8 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
@@ -1,3 +1,11 @@
pywb 2.6.6 changelist
~~~~~~~~~~~~~~~~~~~~~

* dependency: don't use obsolete werkzeug useragent package `#704 <https://github.com/webrecorder/pywb/pull/704>`_
* fix user-agent detection: use ua-parser module, default to new js-proxy mode, unless older browser detected `#707 <https://github.com/webrecorder/pywb/pull/707>`_
* fix tests: disable broken s3 tests for now
* Dockerfile: use python 3.8 by default

pywb 2.6.5 changelist
~~~~~~~~~~~~~~~~~~~~~

Expand Down
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
ARG PYTHON=python:3.7.2
ARG PYTHON=python:3.8

FROM $PYTHON

Expand Down
53 changes: 32 additions & 21 deletions pywb/rewrite/default_rewriter.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from pywb import DEFAULT_RULES_FILE

import copy
from werkzeug.useragents import UserAgent
from ua_parser import user_agent_parser


# ============================================================================
Expand All @@ -34,7 +34,7 @@ class DefaultRewriter(BaseContentRewriter):

'css': CSSRewriter,

'js': JSLocationOnlyRewriter,
'js': JSWombatProxyRewriter,
'js-proxy': JSNoneRewriter,
'js-worker': JSWorkerRewriter,

Expand Down Expand Up @@ -119,33 +119,44 @@ def __init__(self, *args, **kwargs):
super(RewriterWithJSProxy, self).__init__(*args, **kwargs)

def get_rewriter(self, rw_type, rwinfo=None):
    """Select the rewriter class for ``rw_type``.

    Only the ``'js'`` type is special-cased, and only when ``rwinfo`` is
    provided; every other request is delegated to the parent class.

    :param rw_type: rewriter type key (e.g. ``'js'``, ``'css'``)
    :param rwinfo: rewrite info object; its ``url_rewriter.rewrite_opts``
        is passed to ``ua_no_obj_proxy`` to inspect the user agent
    :return: ``JSLocationOnlyRewriter`` when ``ua_no_obj_proxy`` reports
        that the legacy rewriter is needed, otherwise
        ``JSWombatProxyRewriter`` (or the parent's choice for non-js types)
    """
    if rw_type != 'js' or not rwinfo:
        return super(RewriterWithJSProxy, self).get_rewriter(rw_type, rwinfo)

    # check if the old, non-proxy rewriter should be used for this user agent
    if self.ua_no_obj_proxy(rwinfo.url_rewriter.rewrite_opts):
        return JSLocationOnlyRewriter

    # otherwise, return the default, js proxy-capable rewriter
    return JSWombatProxyRewriter

def ua_no_obj_proxy(self, opts):
    """Return True if the legacy (non js-proxy) rewriter should be used.

    The user agent is taken from ``opts['ua']`` if present; otherwise
    ``opts['ua_string']`` is parsed with
    ``user_agent_parser.ParseUserAgent``. The result is read as a mapping
    with ``'family'`` and ``'major'`` keys.

    :param opts: rewrite options mapping, may contain ``'ua'`` or
        ``'ua_string'``
    :return: ``True`` only when the browser family is listed below and its
        major version is lower than the listed minimum; ``False`` for
        missing, unknown or unparseable user agents, so that new-style
        rewriting remains the default
    """
    ua = opts.get('ua')
    if not ua:
        ua_string = opts.get('ua_string')
        if ua_string:
            ua = user_agent_parser.ParseUserAgent(ua_string)

    # no user-agent information available: keep the js-proxy default
    if not ua:
        return False

    # minimum major version per (lower-cased) browser family
    supported = {
        'chrome': 49,
        'firefox': 44,
        'safari': 10,
        'opera': 36,
        'edge': 12,
        # threshold above any real IE release, so IE always gets legacy mode
        'ie': 1000,
    }

    family = (ua.get("family") or "").lower()
    min_vers = supported.get(family)
    if not min_vers:
        # unknown browser family: assume it is modern
        return False

    try:
        ua_version = int(ua.get("major", 0))
    except (TypeError, ValueError):
        # missing (None) or non-numeric major version: assume modern
        return False

    return ua_version < min_vers
14 changes: 7 additions & 7 deletions pywb/rewrite/test/test_content_rewriter.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

from pywb.rewrite.wburl import WbUrl
from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.rewrite.default_rewriter import DefaultRewriter, RewriterWithJSProxy
from pywb.rewrite.default_rewriter import RewriterWithJSProxy

from pywb import get_test_dir

Expand All @@ -39,8 +39,7 @@ def headers(request):
class TestContentRewriter(object):
@classmethod
def setup_class(self):
self.content_rewriter = DefaultRewriter()
self.js_proxy_content_rewriter = RewriterWithJSProxy()
self.content_rewriter = RewriterWithJSProxy()

def _create_response_record(self, url, headers, payload, warc_headers):
writer = BufferWARCWriter()
Expand All @@ -65,7 +64,6 @@ def rewrite_record(self, headers, content, ts, url='http://example.com/',
record = self._create_response_record(url, headers, content, warc_headers)

wburl = WbUrl(ts + '/' + (request_url or url))
url_rewriter = UrlRewriter(wburl, prefix)

cdx = CDXObject()
cdx['url'] = url
Expand All @@ -79,11 +77,13 @@ def insert_func(rule, cdx):
return ''

if use_js_proxy:
rewriter = self.js_proxy_content_rewriter
rewrite_opts = {}
else:
rewriter = self.content_rewriter
rewrite_opts = {'ua_string': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/10.0 Safari/537.36'}

return rewriter(record, url_rewriter, cookie_rewriter=None,
url_rewriter = UrlRewriter(wburl, prefix, rewrite_opts=rewrite_opts)

return self.content_rewriter(record, url_rewriter, cookie_rewriter=None,
head_insert_func=insert_func,
cdx=cdx,
environ=environ)
Expand Down
5 changes: 3 additions & 2 deletions pywb/utils/test/test_loaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@

test_cdx_dir = get_test_dir() + 'cdx/'


@pytest.mark.skip("skip for now, made need different s3 source")
def test_s3_read_1():
pytest.importorskip('boto3')

Expand All @@ -112,13 +112,14 @@ def test_s3_read_1():
assert reader.readline() == b'WARC/1.0\r\n'
assert reader.readline() == b'WARC-Type: response\r\n'

@pytest.mark.skip("skip for now, made need different s3 source")
def test_s3_read_2():
pytest.importorskip('boto3')

res = BlockLoader().load('s3://commoncrawl/crawl-data/CC-MAIN-2015-11/index.html')

buff = res.read()
assert len(buff) == 2082
assert len(buff) == 2330

reader = DecompressingBufferedReader(BytesIO(buff))
assert reader.readline() == b'<!DOCTYPE html>\n'
Expand Down
2 changes: 1 addition & 1 deletion pywb/version.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__version__ = '2.6.5'
__version__ = '2.6.6'

if __name__ == '__main__':
print(__version__)
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ jinja2<3.0.0
surt>=0.3.1
brotlipy
pyyaml
werkzeug==1.0.1
werkzeug
webencodings
gevent==20.9.0
webassets==0.12.1
Expand All @@ -16,3 +16,4 @@ fakeredis<1.0
tldextract
python-dateutil
markupsafe<2.1.0
ua_parser
46 changes: 35 additions & 11 deletions tests/test_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -269,17 +269,41 @@ def test_replay_js_obj_proxy(self, fmod):
assert resp.content_length != 0
assert resp.content_type == 'application/x-javascript'

# test with Chrome user agent
resp = self.get('/pywb/20140126200625{0}/http://www.iana.org/_js/2013.1/jquery.js', fmod,
headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'})
assert 'let window = _____WB$wombat$assign$function_____(' in resp.text

def test_replay_js_ie11_no_obj_proxy(self, fmod):
# IE11 user-agent, no proxy
resp = self.get('/pywb/20140126200625{0}/http://www.iana.org/_js/2013.1/jquery.js', fmod,
headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko'})

assert 'let window = _____WB$wombat$assign$function_____(' not in resp.text
# User agents that must all receive new-style (js-proxy) rewriting.
# Every entry ends with a comma: without it, adjacent string literals are
# silently concatenated into a single bogus user-agent string.
user_agents = [
    # chrome
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.3071.115 Safari/537.36',
    # firefox
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:98.0) Gecko/20100101 Firefox/98.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:98.0) Gecko/20100101 Firefox/100.0',
    # safari
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15',
    # other
    'some-custom-browser',
]

# test with each user-agent
for ua in user_agents:
resp = self.get('/pywb/20140126200625{0}/http://www.iana.org/_js/2013.1/jquery.js', fmod,
headers={'User-Agent': ua})

assert 'let window = _____WB$wombat$assign$function_____(' in resp.text

def test_replay_js_no_obj_proxy(self, fmod):
    """Old or unsupported browsers must get legacy (non js-proxy) rewriting.

    Each user agent below is requested separately and the response must
    not contain the wombat proxy-assignment preamble.
    """
    # Every entry ends with a comma: without it, adjacent string literals
    # are concatenated and only one combined user agent would be tested.
    user_agents = [
        # IE11 user-agent, no proxy
        "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
        # old chrome
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/19.0.3071.115 Safari/537.36',
        # old firefox
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:98.0) Gecko/20100101 Firefox/12.0',
    ]

    for ua in user_agents:
        resp = self.get('/pywb/20140126200625{0}/http://www.iana.org/_js/2013.1/jquery.js', fmod,
                        headers={'User-Agent': ua})

        assert 'let window = _____WB$wombat$assign$function_____(' not in resp.text

def test_replay_non_exact(self, fmod):
# non-exact mode, don't redirect to exact capture
Expand Down

0 comments on commit 403167f

Please sign in to comment.