Skip to content

Commit

Permalink
R6 - Various Fixes (webrecorder#540)
Browse files Browse the repository at this point in the history
* fixes for RC6:
- blockrecordloader: ensure record stream is closed after parsing one record 
- wrap HttpLoader streams in StreamClosingReader(), which should close the connection even if the stream is not fully consumed
- simplify no_except_close
may help with ukwa/ukwa-pywb#53
- iframe: add allow fullscreen, autoplay
- wombat: update to latest, filter out custom wombat props from getOwnPropertyNames
- rules: add rule for vimeo

* cdx formatting: fix output=text to return plain text / non-cdxj output

* auto fetch fix:
- update to latest wombat to fix auto-fetch in rewriting mode
- fix /proxy-fetch/ endpoint for proxy mode recording, switch proxy-fetch to run in recording mode
- don't use the global ('g') flag on the rewrite regex (rwRe), to allow repeated checks

* rewriter html check: peek 1024 bytes to determine if page is html instead of 128

* fix jinja2 dependency for py2
  • Loading branch information
ikreymer authored Feb 21, 2020
1 parent fa021ee commit 92e459b
Show file tree
Hide file tree
Showing 18 changed files with 84 additions and 38 deletions.
25 changes: 15 additions & 10 deletions pywb/apps/frontendapp.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ def __init__(self, config_file=None, custom_config=None):

self.proxy_prefix = None # the URL prefix to be used for the collection with proxy mode (e.g. /coll/id_/)
self.proxy_coll = None # the name of the collection that has proxy mode enabled
self.proxy_record = False # indicate if proxy recording
self.init_proxy(config)

self.init_recorder(config.get('recorder'))
Expand Down Expand Up @@ -627,17 +628,21 @@ def init_proxy(self, config):
if proxy_coll in self.warcserver.list_fixed_routes():
raise Exception('Can not record into fixed collection')

proxy_coll += self.RECORD_ROUTE
proxy_route = proxy_coll + self.RECORD_ROUTE
if not config.get('recorder'):
config['recorder'] = 'live'

self.proxy_record = True

else:
logging.info('Proxy enabled for collection "{0}"'.format(proxy_coll))
self.proxy_record = False
proxy_route = proxy_coll

if proxy_config.get('enable_content_rewrite', True):
self.proxy_prefix = '/{0}/bn_/'.format(proxy_coll)
self.proxy_prefix = '/{0}/bn_/'.format(proxy_route)
else:
self.proxy_prefix = '/{0}/id_/'.format(proxy_coll)
self.proxy_prefix = '/{0}/id_/'.format(proxy_route)

self.proxy_default_timestamp = proxy_config.get('default_timestamp')
if self.proxy_default_timestamp:
Expand Down Expand Up @@ -686,14 +691,14 @@ def proxy_fetch(self, env, url):
return WbResponse.options_response(env)

# ensure full URL
request_url = env['REQUEST_URI']
# replace with /id_ so we do not get rewritten
url = request_url.replace('/proxy-fetch', '/id_')
# update WSGI environment object
env['REQUEST_URI'] = self.proxy_coll + url
env['PATH_INFO'] = env['PATH_INFO'].replace('/proxy-fetch', self.proxy_coll + '/id_')
url = env['REQUEST_URI'].split('/proxy-fetch/', 1)[-1]

env['REQUEST_URI'] = self.proxy_prefix + url
env['PATH_INFO'] = self.proxy_prefix + env['PATH_INFO'].split('/proxy-fetch/', 1)[-1]

# make request using normal serve_content
response = self.serve_content(env, self.proxy_coll, url)
response = self.serve_content(env, self.proxy_coll, url, record=self.proxy_record)

# for WR
if isinstance(response, WbResponse):
response.add_access_control_headers(env=env)
Expand Down
2 changes: 1 addition & 1 deletion pywb/rewrite/content_rewriter.py
Original file line number Diff line number Diff line change
Expand Up @@ -488,7 +488,7 @@ def _resolve_text_type(self, text_type):
else:
return text_type

buff = self.read_and_keep(128)
buff = self.read_and_keep(1024)

# check if doesn't start with a tag, then likely not html
if self.TAG_REGEX.match(buff):
Expand Down
2 changes: 1 addition & 1 deletion pywb/rewrite/regex_rewriters.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ def __init__(self):
# rewriting 'this.' special properties access, not on new line (no ;)
(r'(?<![$.])\s*this\b(?=(?:\.(?:{0})\b))'.format(prop_str), self.replace_str(this_rw), 0),
# rewrite '= this' or ', this'
(r'(?<=[=,])\s*this\b\s*(?![.$])', self.replace_str(this_rw), 0),
(r'(?<=[=,])\s*this\b\s*(?![:.$])', self.replace_str(this_rw), 0),
# rewrite ')(this)'
('\}(?:\s*\))?\s*\(this\)', self.replace_str(this_rw), 0),
# rewrite this in && or || expr?
Expand Down
3 changes: 3 additions & 0 deletions pywb/rewrite/test/test_regex_rewriters.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,9 @@
>>> _test_js_obj_proxy('return this.foo')
'return this.foo'
>>> _test_js_obj_proxy('{foo: bar, this: other}')
'{foo: bar, this: other}'
>>> _test_js_obj_proxy(r'this.$location = http://example.com/')
'this.$location = http://example.com/'
Expand Down
2 changes: 1 addition & 1 deletion pywb/rules.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -344,7 +344,7 @@ rules:
- videoFileId
- signature

- url_prefix: 'net,akamaized,gcs-vimeo)/'
- url_prefix: ['net,akamaized,gcs-vimeo)/', 'net,akamaized,vod)/']

fuzzy_lookup:
match: '([/\d]+\.mp4)$'
Expand Down
17 changes: 11 additions & 6 deletions pywb/static/autoFetchWorker.js
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ var config = {
rwRe: null,
defaultFetchOptions: {
cache: 'force-cache',
mode: null
mode: 'cors'
}
};

Expand Down Expand Up @@ -53,7 +53,7 @@ if (!config.haveFetch) {
xhr.onreadystatechange = function() {
if (xhr.readyState === 4) {
if (!config.havePromise) {
fetchDoneOrErrored();
fetchDone();
}
resolve();
}
Expand All @@ -78,7 +78,7 @@ if (location.search.indexOf('init') !== -1) {
config.prefix = init.prefix;
config.mod = init.mod;
config.prefixMod = init.prefix + init.mod;
config.rwRe = new RegExp(init.rwRe, 'g');
config.rwRe = new RegExp(init.rwRe);
config.relative = init.prefix.split(location.origin)[1];
config.schemeless = '/' + config.relative;
})();
Expand All @@ -101,11 +101,16 @@ self.onmessage = function(event) {

function noop() {}

function fetchDoneOrErrored() {
function fetchDone() {
runningFetches -= 1;
fetchFromQ();
}

/**
 * Rejection handler for a failed fetch: logs a warning that includes
 * the error, then calls {@link fetchDone}.
 * @param {*} err - the rejection reason supplied by the failed promise
 */
function fetchErrored(err) {
console.warn("Fetch Failed: " + err);
fetchDone();
}

/**
* Fetches the supplied URL and increments the {@link runningFetches} variable
* to represent an inflight request.
Expand All @@ -130,8 +135,8 @@ function fetchURL(toBeFetched) {
}

fetch(url, options)
.then(fetchDoneOrErrored)
.catch(fetchDoneOrErrored);
.then(fetchDone)
.catch(fetchErrored);
}

function queueOrFetch(toBeFetched) {
Expand Down
2 changes: 1 addition & 1 deletion pywb/static/wombat.js

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pywb/templates/frame_insert.html
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
<body style="margin: 0px; padding: 0px;">

<div id="wb_iframe_div">
<iframe id="replay_iframe" frameborder="0" seamless="seamless" scrolling="yes" class="wb_iframe"></iframe>
<iframe id="replay_iframe" frameborder="0" seamless="seamless" scrolling="yes" class="wb_iframe" allow="autoplay; fullscreen"></iframe>
</div>
<script>
var cframe = new ContentFrame({"url": "{{ url }}" + window.location.hash,
Expand Down
26 changes: 19 additions & 7 deletions pywb/utils/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,25 +6,22 @@
from warcio.utils import BUFF_SIZE


# =============================================================================
def no_except_close(closable):
    """Close the supplied object, swallowing any exception raised.

    Calls ``close()`` and then, when the object provides one,
    ``release_conn()`` (in case it is a requests raw stream). The two
    calls are attempted independently, so a failing ``close()`` does not
    prevent the connection from being released, and ``release_conn()``
    is invoked at most once.

    :param closable: The object to be closed; ``None`` is ignored
    :rtype: None
    """
    # Only skip a missing object: a real stream that merely evaluates as
    # falsy still has to be closed.
    if closable is None:
        return

    try:
        closable.close()
    except Exception:
        # deliberate best-effort: cleanup must never raise to the caller
        pass

    try:
        release_conn = getattr(closable, 'release_conn', None)
        if release_conn is not None:
            release_conn()
    except Exception:
        # same best-effort policy as for close()
        pass

Expand Down Expand Up @@ -121,3 +118,18 @@ def read(self, length=None):
def readline(self, length=None):
self._skip()
return super(OffsetLimitReader, self).readline(length)


# ============================================================================
class StreamClosingReader(object):
    """Wrap a stream so that closing it goes through ``no_except_close``.

    ``read()`` and ``readline()`` are forwarded unchanged to the wrapped
    stream; ``close()`` hands the wrapped stream to ``no_except_close``.
    The reader can also be used as a context manager, in which case the
    wrapped stream is closed on exit even if it was not fully consumed.

    :param stream: the underlying stream object to wrap
    """
    def __init__(self, stream):
        self.stream = stream

    def read(self, length=None):
        """Read from the wrapped stream.

        :param length: passed through as-is to the stream's ``read()``
        :return: whatever the wrapped stream's ``read()`` returns
        """
        return self.stream.read(length)

    def readline(self, length=None):
        """Read one line from the wrapped stream.

        :param length: passed through as-is to the stream's ``readline()``
        :return: whatever the wrapped stream's ``readline()`` returns
        """
        return self.stream.readline(length)

    def close(self):
        """Close the wrapped stream via ``no_except_close``.

        :rtype: None
        """
        no_except_close(self.stream)

    def __enter__(self):
        return self

    def __exit__(self, *exc_info):
        self.close()
        # never suppress an exception raised inside the ``with`` body
        return False
4 changes: 2 additions & 2 deletions pywb/utils/loaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@

from io import open, BytesIO
from warcio.limitreader import LimitReader
from pywb.utils.io import no_except_close
from pywb.utils.io import no_except_close, StreamClosingReader

try:
import boto3
Expand Down Expand Up @@ -355,7 +355,7 @@ def load(self, url, offset, length):

r = self.session.get(url, headers=headers, stream=True)
r.raise_for_status()
return r.raw
return StreamClosingReader(r.raw)


# =================================================================
Expand Down
2 changes: 1 addition & 1 deletion pywb/version.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__version__ = '2.4.0-rc5'
__version__ = '2.4.0-rc6'

if __name__ == '__main__':
print(__version__)
7 changes: 5 additions & 2 deletions pywb/warcserver/index/cdxobject.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,10 +181,13 @@ def to_text(self, fields=None):
:param fields: list of field names to output.
"""
if fields is None:
return str(self) + '\n'
if self.cdxline:
return to_native_str(self.cdxline, 'utf-8') + '\n'

fields = six.iterkeys(self)

try:
result = ' '.join(str(self[x]) for x in fields) + '\n'
result = ' '.join(str(self.get(x, '-')) for x in fields) + '\n'
except KeyError as ke:
msg = 'Invalid field "{0}" found in fields= argument'
msg = msg.format(str(ke))
Expand Down
1 change: 1 addition & 0 deletions pywb/warcserver/index/test/test_cdxops.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201249 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 771773 iana.warc.gz 117166 198285 iana.warc.gz
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201240 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 551 757988 iana.warc.gz 117166 198285 iana.warc.gz
>>> cdx_ops_test('http://iana.org/_js/2013.1/jquery.js', reverse = True, resolveRevisits = True, limit = 1)
org,iana)/_js/2013.1/jquery.js 20140126201307 https://www.iana.org/_js/2013.1/jquery.js application/x-javascript 200 AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO - - 543 778507 iana.warc.gz 33449 7311 iana.warc.gz
Expand Down
2 changes: 1 addition & 1 deletion pywb/warcserver/resource/pathresolvers.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def __call__(self, filename, cdx):
if hasattr(cdx, '_formatter') and cdx._formatter:
full_path = cdx._formatter.format(full_path)

path = full_path + filename
path = os.path.join(full_path, filename)
if '*' not in path:
return path

Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ six
warcio>=1.7.1
requests
redis<3.0
jinja2
jinja2<3.0.0
surt>=0.3.1
brotlipy
pyyaml
Expand Down
13 changes: 13 additions & 0 deletions tests/test_cdx_server_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,19 @@ def test_exact_url_json(self):
assert len(lines) == 3, resp.text
assert len(list(map(json.loads, lines))) == 3

def test_exact_url_plain_text(self):
"""
exact url query with output='text': expects a 200 text/plain
response that contains no '{' and has exactly 3 lines
"""
resp = self.query('http://www.iana.org/', output='text')

assert resp.status_code == 200
assert resp.content_type == 'text/plain'
assert '{' not in resp.text

lines = resp.text.splitlines()
assert len(lines) == 3, resp.text

def test_prefix_match(self):
"""
prefix match test
Expand Down
8 changes: 6 additions & 2 deletions tests/test_proxy.py
Original file line number Diff line number Diff line change
Expand Up @@ -430,12 +430,16 @@ def test_include_auto_fetch_worker_not_wombat(self, scheme):


# ============================================================================
class TestProxyAutoFetchWorkerEndPoints(BaseTestProxy):
class TestProxyAutoFetchWorkerEndPoints(CollsDirMixin, BaseTestProxy):
@classmethod
def setup_class(cls):
super(TestProxyAutoFetchWorkerEndPoints, cls).setup_class(
proxy_opts={'enable_wombat': True}, config_opts={'enable_auto_fetch': True}
coll='test2',
config_file='config_test_record.yaml',
proxy_opts={'enable_wombat': True}, config_opts={'enable_auto_fetch': True},
recording=True
)
manager(['init', 'test2'])

def test_proxy_fetch_options_request(self, scheme):
expected_origin = '{0}://example.com'.format(scheme)
Expand Down
2 changes: 1 addition & 1 deletion wombat

0 comments on commit 92e459b

Please sign in to comment.