Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Response bodies can be empty or missing + HttpParser refactoring #688

Merged
merged 5 commits into from
Nov 6, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
300 changes: 162 additions & 138 deletions proxy/http/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,35 +43,59 @@


class HttpParser:
"""HTTP request/response parser."""
"""HTTP request/response parser.

TODO: Make me zero-copy by using memoryview.
Currently due to chunk/buffer handling we
are not able to utilize memoryview
efficiently.

For this to happen we must store `buffer`
as List[memoryview] instead of raw bytes and
update parser to work accordingly.
"""

def __init__(self, parser_type: int) -> None:
self.type: int = parser_type
self.state: int = httpParserStates.INITIALIZED

self.host: Optional[bytes] = None
self.port: Optional[int] = None
self.path: Optional[bytes] = None
self.method: Optional[bytes] = None
self.code: Optional[bytes] = None
self.reason: Optional[bytes] = None
self.version: Optional[bytes] = None

# Total size of raw bytes passed for parsing
self.total_size: int = 0

# Buffer to hold unprocessed bytes
self.buffer: bytes = b''

# Keys are lower case header names
# Values are 2-tuples containing the original
# header name and its value as received.
self.headers: Dict[bytes, Tuple[bytes, bytes]] = {}
self.body: Optional[bytes] = None

self.method: Optional[bytes] = None
self.url: Optional[urlparse.SplitResultBytes] = None
self.code: Optional[bytes] = None
self.reason: Optional[bytes] = None
self.version: Optional[bytes] = None

self.chunk_parser: Optional[ChunkParser] = None

# Exposing host, port and path directly cleans up developer APIs, because
# Python's urlparse.urlsplit behaves differently for an incoming proxy request
# and an incoming web request. The web request is the case that is broken.
self.host: Optional[bytes] = None
self.port: Optional[int] = None
self.path: Optional[bytes] = None
# TODO: Deprecate me, we don't need this in core.
#
# Deprecated since v2.4.0
#
# This is mostly for developers so that they can directly
# utilize a url object, but is unnecessary as parser
# provides all the necessary parsed information.
#
# But developers can utilize urlsplit or whatever
# library they are using when necessary. This will certainly
# give some performance boost as url parsing won't be needed
# for every request/response object.
#
# (except query string and fragments)
self._url: Optional[urlparse.SplitResultBytes] = None

@classmethod
def request(cls: Type[T], raw: bytes) -> T:
Expand Down Expand Up @@ -116,157 +140,51 @@ def set_url(self, url: bytes) -> None:
# with urlsplit, which expects a fully qualified url.
if self.method == httpMethods.CONNECT:
url = b'https://' + url
self.url = urlparse.urlsplit(url)
self.set_line_attributes()

def set_line_attributes(self) -> None:
    """Derive ``host``, ``port`` and ``path`` from the parsed request URL.

    Does nothing for parsers that are not request parsers.

    NOTE(review): nesting was reconstructed from a flattened diff; confirm
    that ``path`` is only meant to be set for request parsers.

    Raises:
        KeyError: when the parser is a request parser but has no URL.
    """
    if self.type != httpParserTypes.REQUEST_PARSER:
        return

    parsed = self.url
    if not parsed:
        raise KeyError(
            'Invalid request. Method: %r, Url: %r' %
            (self.method, self.url),
        )

    self.host = parsed.hostname
    if self.method == httpMethods.CONNECT:
        # CONNECT falls back to 443 only when no port was given at all.
        self.port = parsed.port if parsed.port is not None else 443
    else:
        # Any falsy port (missing or 0) falls back to the default HTTP port.
        self.port = parsed.port if parsed.port else DEFAULT_HTTP_PORT
    self.path = self.build_path()
self._url = urlparse.urlsplit(url)
self._set_line_attributes()

def is_chunked_encoded(self) -> bool:
    """Tell whether the message declares a chunked transfer encoding.

    Returns ``True`` only when a ``transfer-encoding`` header is stored and
    its value, compared case-insensitively, is exactly ``chunked``.
    """
    if b'transfer-encoding' not in self.headers:
        return False
    # Stored entries are (original header name, value); index 1 is the value.
    encoding = self.headers[b'transfer-encoding'][1]
    return encoding.lower() == b'chunked'

def content_expected(self) -> bool:
    """Tell whether a ``content-length`` header announces a non-empty body.

    Returns ``False`` when the header is absent or its integer value is
    not greater than zero.
    """
    if b'content-length' not in self.headers:
        return False
    declared = int(self.header(b'content-length'))
    return declared > 0

def body_expected(self) -> bool:
return (
b'content-length' in self.headers and
int(self.header(b'content-length')) > 0
) or \
self.is_chunked_encoded()
return self.content_expected() or self.is_chunked_encoded()

def parse(self, raw: bytes) -> None:
"""Parses Http request out of raw bytes.

Check HttpParser state after parse has successfully returned."""
Check for `HttpParser.state` after `parse` has successfully returned.
"""
self.total_size += len(raw)
raw = self.buffer + raw
self.buffer = b''

more = len(raw) > 0
self.buffer, more = b'', len(raw) > 0
while more and self.state != httpParserStates.COMPLETE:
if self.state in (
httpParserStates.HEADERS_COMPLETE,
httpParserStates.RCVING_BODY,
):
if b'content-length' in self.headers:
self.state = httpParserStates.RCVING_BODY
if self.body is None:
self.body = b''
total_size = int(self.header(b'content-length'))
received_size = len(self.body)
self.body += raw[:total_size - received_size]
if self.body and \
len(self.body) == int(self.header(b'content-length')):
self.state = httpParserStates.COMPLETE
more, raw = len(raw) > 0, raw[total_size - received_size:]
elif self.is_chunked_encoded():
if not self.chunk_parser:
self.chunk_parser = ChunkParser()
raw = self.chunk_parser.parse(raw)
if self.chunk_parser.state == chunkParserStates.COMPLETE:
self.body = self.chunk_parser.body
self.state = httpParserStates.COMPLETE
more = False
else:
raise NotImplementedError(
'Parser shouldn\'t have reached here. ' +
'This can happen when content length header is missing but their is a body in the payload',
)
else:
more, raw = self.process(raw)
# Comparing with `>= HEADERS_COMPLETE` also covers the RCVING_BODY state
more, raw = self._process_body(raw) \
if self.state >= httpParserStates.HEADERS_COMPLETE else \
self._process_line_and_headers(raw)
self.buffer = raw

def process(self, raw: bytes) -> Tuple[bool, bytes]:
    """Consume one line from ``raw`` and advance the parser state.

    Returns ``False`` (with the bytes handed back by ``find_http_line``)
    when no line could be extracted; otherwise returns whether any bytes
    remain after the line, together with those bytes.
    """
    line, rest = find_http_line(raw)
    if line is None:
        return False, rest

    states = httpParserStates
    if self.state == states.INITIALIZED:
        # The very first line is the request/status line.
        self.process_line(line)
        self.state = states.LINE_RCVD
    elif self.state in (states.LINE_RCVD, states.RCVING_HEADERS):
        # LINE_RCVD state is equivalent to RCVING_HEADERS
        if self.state == states.LINE_RCVD:
            self.state = states.RCVING_HEADERS
        if line.strip() == b'':
            # A blank line terminates the header section.
            self.state = states.HEADERS_COMPLETE
        else:
            self.process_header(line)

    # Two ways to finish without a body:
    #  * a response line with no headers at all, e.g.
    #    HTTP/1.1 200 Connection established\r\n\r\n
    #  * headers are done, no body is expected, and nothing is left to read.
    if (
        self.state == states.LINE_RCVD and
        self.type == httpParserTypes.RESPONSE_PARSER and
        rest == CRLF
    ) or (
        self.state == states.HEADERS_COMPLETE and
        not self.body_expected() and
        rest == b''
    ):
        self.state = states.COMPLETE

    return len(rest) > 0, rest

def process_line(self, raw: bytes) -> None:
    """Split the first line of a message and store its components.

    For a request parser the tokens are method, URL and version; for any
    other parser they are version, status code and the reason phrase
    (everything after the second token, re-joined with whitespace).
    """
    tokens = raw.split(WHITESPACE)
    if self.type != httpParserTypes.REQUEST_PARSER:
        self.version = tokens[0]
        self.code = tokens[1]
        self.reason = WHITESPACE.join(tokens[2:])
        return
    # The method must be stored before set_url() runs, since set_url()
    # checks it for CONNECT.
    self.method = tokens[0].upper()
    self.set_url(tokens[1])
    self.version = tokens[2]

def process_header(self, raw: bytes) -> None:
    """Split a header line at its first colon and record the header.

    Both the name and the value are stripped of surrounding whitespace;
    colons after the first one stay part of the value.
    """
    name, _, remainder = raw.partition(COLON)
    self.add_headers([(name.strip(), remainder.strip())])

def build_path(self) -> bytes:
    """Assemble the request path from the parsed URL.

    Returns ``b'/None'`` when no URL is set. Otherwise returns the URL path
    (``b'/'`` when empty), followed by ``?query`` and ``#fragment`` for
    whichever of those components is non-empty.
    """
    parsed = self.url
    if not parsed:
        return b'/None'
    pieces = [b'/' if parsed.path == b'' else parsed.path]
    if parsed.query != b'':
        pieces.append(b'?' + parsed.query)
    if parsed.fragment != b'':
        pieces.append(b'#' + parsed.fragment)
    return b''.join(pieces)

def build(self, disable_headers: Optional[List[bytes]] = None, for_proxy: bool = False) -> bytes:
"""Rebuild the request object."""
assert self.method and self.version and self.path and self.type == httpParserTypes.REQUEST_PARSER
if disable_headers is None:
disable_headers = DEFAULT_DISABLE_HEADERS
body: Optional[bytes] = ChunkParser.to_chunks(self.body) \
if self.is_chunked_encoded() and self.body else \
self.body
body: Optional[bytes] = self._get_body_or_chunks()
path = self.path
if for_proxy:
assert self.url and self.host and self.port and self.path
assert self._url and self.host and self.port and self.path
path = (
self.url.scheme +
self._url.scheme +
COLON + SLASH + SLASH +
self.host +
COLON +
str(self.port).encode() +
self.path
) if self.method != httpMethods.CONNECT else (self.host + COLON + str(self.port).encode())

return build_http_request(
self.method, path, self.version,
headers={} if not self.headers else {
Expand All @@ -278,16 +196,15 @@ def build(self, disable_headers: Optional[List[bytes]] = None, for_proxy: bool =

def build_response(self) -> bytes:
"""Rebuild the response object."""
assert self.code and self.version and self.body and self.type == httpParserTypes.RESPONSE_PARSER
assert self.code and self.version and self.type == httpParserTypes.RESPONSE_PARSER
return build_http_response(
status_code=int(self.code),
protocol_version=self.version,
reason=self.reason,
headers={} if not self.headers else {
self.headers[k][0]: self.headers[k][1] for k in self.headers
},
body=self.body if not self.is_chunked_encoded(
) else ChunkParser.to_chunks(self.body),
body=self._get_body_or_chunks(),
)

def has_host(self) -> bool:
Expand All @@ -305,3 +222,110 @@ def is_connection_upgrade(self) -> bool:
return self.version == HTTP_1_1 and \
self.has_header(b'Connection') and \
self.has_header(b'Upgrade')

def _process_body(self, raw: bytes) -> Tuple[bool, bytes]:
    """Consume body bytes from ``raw`` once the headers are complete.

    With a ``content-length`` header, bytes are appended to ``self.body``
    up to the declared length. With a chunked transfer encoding, the bytes
    are handed to a lazily created ``ChunkParser``.

    Returns:
        A ``(more, raw)`` pair: whether the caller should keep looping,
        and the bytes that were not consumed.

    Raises:
        NotImplementedError: when there is neither a ``content-length``
            header nor a chunked transfer encoding.
    """
    if b'content-length' in self.headers:
        self.state = httpParserStates.RCVING_BODY
        if self.body is None:
            self.body = b''
        # Parse the declared length once and reuse it for the checks below.
        total_size = int(self.header(b'content-length'))
        pending = total_size - len(self.body)
        self.body += raw[:pending]
        # Complete as soon as the declared length is reached, including a
        # declared length of zero. The previous `self.body and ...` guard
        # never completed a zero-length body, so trailing bytes made the
        # caller's loop spin forever without consuming anything.
        if len(self.body) == total_size:
            self.state = httpParserStates.COMPLETE
        # `more` is derived from the input size before the consumed prefix
        # is dropped, as before.
        more, raw = len(raw) > 0, raw[pending:]
    elif self.is_chunked_encoded():
        if not self.chunk_parser:
            self.chunk_parser = ChunkParser()
        raw = self.chunk_parser.parse(raw)
        if self.chunk_parser.state == chunkParserStates.COMPLETE:
            self.body = self.chunk_parser.body
            self.state = httpParserStates.COMPLETE
        # Bind `more` on every path through this branch so the return
        # below cannot hit an unbound local while chunks are incomplete.
        more = False
    else:
        raise NotImplementedError(
            'Parser shouldn\'t have reached here. ' +
            'This can happen when content length header is missing but their is a body in the payload',
        )
    return more, raw

def _process_line_and_headers(self, raw: bytes) -> Tuple[bool, bytes]:
    """Consume one line from ``raw`` and advance the parser state.

    Returns ``False`` (with the bytes handed back by ``find_http_line``)
    when no line could be extracted; otherwise returns whether any bytes
    remain after the line, together with those bytes.
    """
    line, leftover = find_http_line(raw)
    if line is None:
        return False, leftover

    if self.state == httpParserStates.INITIALIZED:
        # The very first line is the request/status line.
        self._process_line(line)
        self.state = httpParserStates.LINE_RCVD
    elif self.state in (
        httpParserStates.LINE_RCVD,
        httpParserStates.RCVING_HEADERS,
    ):
        # LINE_RCVD state is equivalent to RCVING_HEADERS
        if self.state == httpParserStates.LINE_RCVD:
            self.state = httpParserStates.RCVING_HEADERS
        if line.strip() != b'':
            self._process_header(line)
        else:
            # A blank line terminates the header section.
            self.state = httpParserStates.HEADERS_COMPLETE

    if self.state == httpParserStates.LINE_RCVD:
        # A server may send a response line without any header or body,
        # e.g. HTTP/1.1 200 Connection established\r\n\r\n
        if self.type == httpParserTypes.RESPONSE_PARSER and leftover == CRLF:
            self.state = httpParserStates.COMPLETE
    elif self.state == httpParserStates.HEADERS_COMPLETE:
        # Headers are done; finish now if no body follows and nothing is left.
        if not self.body_expected() and leftover == b'':
            self.state = httpParserStates.COMPLETE

    return len(leftover) > 0, leftover

def _process_line(self, raw: bytes) -> None:
    """Split the first line of a message and store its components.

    For a request parser the tokens are method, URL and version; for any
    other parser they are version, status code and the reason phrase
    (everything after the second token, re-joined with whitespace).
    """
    fields = raw.split(WHITESPACE)
    is_request = self.type == httpParserTypes.REQUEST_PARSER
    if is_request:
        # The method must be stored before set_url() runs, since set_url()
        # checks it for CONNECT.
        self.method = fields[0].upper()
        self.set_url(fields[1])
        self.version = fields[2]
        return
    self.version = fields[0]
    self.code = fields[1]
    self.reason = WHITESPACE.join(fields[2:])

def _process_header(self, raw: bytes) -> None:
    """Split a header line at its first colon and record the header.

    Both the name and the value are stripped of surrounding whitespace;
    colons after the first one stay part of the value.
    """
    key, _, value = raw.partition(COLON)
    header = (key.strip(), value.strip())
    self.add_headers([header])

def _get_body_or_chunks(self) -> Optional[bytes]:
    """Return the body in the form it should be written out.

    A non-empty body of a chunked-encoded message is passed through
    ``ChunkParser.to_chunks``; in every other case ``self.body`` is
    returned unchanged (including ``None`` or empty bytes).
    """
    if self.body and self.is_chunked_encoded():
        return ChunkParser.to_chunks(self.body)
    return self.body

def _set_line_attributes(self) -> None:
    """Derive ``host``, ``port`` and ``path`` from the parsed request URL.

    Does nothing for parsers that are not request parsers.

    NOTE(review): nesting was reconstructed from a flattened diff; confirm
    that ``path`` is only meant to be set for request parsers.

    Raises:
        KeyError: when the parser is a request parser but has no URL.
    """
    if self.type != httpParserTypes.REQUEST_PARSER:
        return

    split_url = self._url
    if not split_url:
        raise KeyError(
            'Invalid request. Method: %r, Url: %r' %
            (self.method, self._url),
        )

    if self.method == httpMethods.CONNECT:
        # CONNECT falls back to 443 only when no port was given at all.
        fallback_needed = split_url.port is None
        fallback = 443
    else:
        # Any falsy port (missing or 0) falls back to the default HTTP port.
        fallback_needed = not split_url.port
        fallback = DEFAULT_HTTP_PORT

    self.host = split_url.hostname
    self.port = fallback if fallback_needed else split_url.port
    self.path = self._build_path()

def _build_path(self) -> bytes:
    """Assemble the request path from the parsed URL.

    Returns ``b'/None'`` when no URL is set. Otherwise returns the URL path
    (``b'/'`` when empty), followed by ``?query`` and ``#fragment`` for
    whichever of those components is non-empty.
    """
    split_url = self._url
    if not split_url:
        return b'/None'
    result = split_url.path if split_url.path != b'' else b'/'
    for marker, component in (
        (b'?', split_url.query),
        (b'#', split_url.fragment),
    ):
        if component != b'':
            result += marker + component
    return result
4 changes: 3 additions & 1 deletion proxy/plugin/modify_chunk_response.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,9 @@ def handle_upstream_chunk(self, chunk: memoryview) -> memoryview:
self.response.parse(chunk.tobytes())
# If response is complete, modify and dispatch to client
if self.response.state == httpParserStates.COMPLETE:
self.response.body = b'\n'.join(self.DEFAULT_CHUNKS) + b'\n'
# Avoid setting a body for responses where content is not expected
if self.response.content_expected():
self.response.body = b'\n'.join(self.DEFAULT_CHUNKS) + b'\n'
self.client.queue(memoryview(self.response.build_response()))
return memoryview(b'')

Expand Down
Loading