Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

memento timegate: Fix for timegate in framed replay #564

Merged
merged 1 commit into from
Jun 8, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 25 additions & 16 deletions pywb/apps/rewriterapp.py
Original file line number Diff line number Diff line change
Expand Up @@ -345,6 +345,7 @@ def render_content(self, wb_url, kwargs, environ):
content_rw, is_proxy)

response = None
keep_frame_response = False

# prefer overrides custom response?
if pref_mod is not None:
Expand All @@ -360,13 +361,22 @@ def render_content(self, wb_url, kwargs, environ):
else:
wb_url.mod = pref_mod
else:
# don't return top-frame response for timegate with exact redirects
kwargs['is_timegate_redir'] = is_timegate and redirect_to_exact
response = self.handle_custom_response(environ, wb_url,
full_prefix, host_prefix,
kwargs)
if kwargs.get('output'):
response = self.handle_timemap(wb_url, kwargs, full_prefix)

if response:
elif wb_url.is_query():
response = self.handle_query(environ, wb_url, kwargs, full_prefix)

else:
# don't return top-frame response for timegate with exact redirects
if not (is_timegate and redirect_to_exact):
keep_frame_response = is_timegate and not redirect_to_exact and not is_proxy
response = self.handle_custom_response(environ, wb_url,
full_prefix, host_prefix,
kwargs)


if response and not keep_frame_response:
return self.format_response(response, wb_url, full_prefix, is_timegate, is_proxy)

if is_proxy:
Expand Down Expand Up @@ -443,6 +453,11 @@ def render_content(self, wb_url, kwargs, environ):

return self.send_redirect(new_path, url_parts, urlrewriter)

# return top-frame timegate response, with timestamp from cdx
if response and keep_frame_response:
no_except_close(r.raw)
return self.format_response(response, wb_url, full_prefix, is_timegate, is_proxy, cdx['timestamp'])

stream = BufferedReader(r.raw, block_size=BUFF_SIZE)
record = self.loader.parse_record_stream(stream,
ensure_http_headers=True)
Expand Down Expand Up @@ -560,7 +575,7 @@ def render_content(self, wb_url, kwargs, environ):

return response

def format_response(self, response, wb_url, full_prefix, is_timegate, is_proxy):
def format_response(self, response, wb_url, full_prefix, is_timegate, is_proxy, timegate_closest_ts=None):
memento_ts = None
if not isinstance(response, WbResponse):
content_type = 'text/html'
Expand All @@ -569,13 +584,13 @@ def format_response(self, response, wb_url, full_prefix, is_timegate, is_proxy):
if not self.is_framed_replay(wb_url):
content_type += '; charset=utf-8'
else:
memento_ts = wb_url.timestamp
memento_ts = timegate_closest_ts or wb_url.timestamp

response = WbResponse.text_response(response, content_type=content_type)

if self.enable_memento and response.status_headers.statusline.startswith('200'):
self._add_memento_links(wb_url.url, full_prefix, None, memento_ts,
response.status_headers, is_timegate, is_proxy)
response.status_headers, is_timegate, is_proxy, is_memento=not is_timegate)
return response

def _add_memento_links(self, url, full_prefix, memento_dt, memento_ts,
Expand Down Expand Up @@ -873,13 +888,7 @@ def get_top_frame_params(self, wb_url, kwargs):
return {'metadata': kwargs.get('metadata', {})}

def handle_custom_response(self, environ, wb_url, full_prefix, host_prefix, kwargs):
if kwargs.get('output'):
return self.handle_timemap(wb_url, kwargs, full_prefix)

if wb_url.is_query():
return self.handle_query(environ, wb_url, kwargs, full_prefix)

if self.is_framed_replay(wb_url) and not kwargs.get('is_timegate_redir'):
if self.is_framed_replay(wb_url):
extra_params = self.get_top_frame_params(wb_url, kwargs)
return self.frame_insert_view.get_top_frame(wb_url,
full_prefix,
Expand Down
31 changes: 31 additions & 0 deletions tests/test_memento.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,28 @@ def test_memento_top_frame(self):
assert '"20140127171238"' in resp.text
assert '"http://www.iana.org/"' in resp.text, resp.text

def test_memento_top_frame_timegate(self):
    """Check the headers returned for a top-frame timegate request.

    Requests the stylesheet url with no timestamp, then expects VARY to be
    present, MEMENTO_DATETIME to be absent, and the response links to
    include both the ``mp_`` memento link for 20140127171239 and the
    timegate link for the same url.
    """
    resp = self.testapp.get('/pywb/http://www.iana.org/_css/2013.1/screen.css')
    headers = resp.headers

    # vary header must be set
    assert VARY in headers

    # the top frame is not really a memento, so no memento datetime header
    assert MEMENTO_DATETIME not in headers

    # values used to build the expected memento / timegate links
    memento_dt = 'Mon, 27 Jan 2014 17:12:39 GMT'
    target_url = 'http://www.iana.org/_css/2013.1/screen.css'

    found_links = self.get_links(resp)

    # memento link
    expected_memento = self.make_memento_link(target_url, '20140127171239', memento_dt, 'mp_', include_coll=False)
    assert expected_memento in found_links

    # timegate link
    expected_timegate = self.make_timegate_link(target_url, '')
    assert expected_timegate in found_links


def test_memento_content_replay_exact(self, fmod):
resp = self.get('/pywb/20140127171238{0}/http://www.iana.org/', fmod)

Expand Down Expand Up @@ -175,6 +197,15 @@ def test_timemap_error_invalid_format(self):
resp = self._timemap_get('/pywb/timemap/foo/http://example.com', status=400)
assert resp.json == {'message': 'output=foo not supported'}

def test_timegate_error_not_found(self):
    """Check that a 404 timegate response carries no memento headers.

    Requests a url expected to produce a 404 and verifies that none of
    VARY, MEMENTO_DATETIME or 'Link' appear in the response headers.
    """
    resp = self.testapp.get('/pywb/http://example.com/x-not-found', status=404)
    assert resp.status_code == 404

    # No Memento Headers
    for header_name in (VARY, MEMENTO_DATETIME, 'Link'):
        assert header_name not in resp.headers

def test_error_bad_accept_datetime(self):
"""
400 response for bad accept_datetime
Expand Down