Skip to content

Commit

Permalink
Performance: Allow large sets of JSON docs without StackOverflows
Browse files (browse the repository at this point in the history)
  • Loading branch information
bblommers committed May 10, 2024
1 parent ca7faeb commit 9b48238
Show file tree
Hide file tree
Showing 4 changed files with 77 additions and 59 deletions.
13 changes: 5 additions & 8 deletions py_partiql_parser/_internal/from_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,15 +88,12 @@ def get_source_data(self, documents: Dict[str, str]) -> Any:
from_query
].endswith("]")

source_data = JsonParser().parse(documents[from_query])
source_data = list(JsonParser().parse(documents[from_query]))

if doc_is_list:
return {"_1": source_data}
return {"_1": source_data[0]}
elif from_alias:
if isinstance(source_data, list):
return [CaseInsensitiveDict({from_alias: doc}) for doc in source_data]
else:
return CaseInsensitiveDict({from_alias: source_data})
return [CaseInsensitiveDict({from_alias: doc}) for doc in source_data]
else:
return source_data

Expand Down Expand Up @@ -135,10 +132,10 @@ def _get_nested_source_data(self, documents: Dict[str, Any]) -> Any:
doc_is_list = source_data[new_key].startswith("[") and source_data[
new_key
].endswith("]")
source_data = JsonParser().parse(source_data[new_key])
source_data = list(JsonParser().parse(source_data[new_key])) # type: ignore
if root_doc and doc_is_list:
# AWS behaviour when the root-document is a list
source_data = {"_1": source_data}
source_data = {"_1": source_data[0]} # type: ignore
elif key_so_far == entire_key:
if isinstance(source_data, list): # type: ignore[unreachable]
source_data = [{alias: doc} for doc in source_data] # type: ignore[unreachable]
Expand Down
2 changes: 1 addition & 1 deletion py_partiql_parser/_internal/insert_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def parse(self, query: str) -> Tuple[str, Dict[str, Any]]:
if section == "SECTION_VALUE":
assert current_phrase.upper() in ["VALUE"]
tokenizer.skip_white_space()
attr = JsonParser().parse(tokenizer.give_remaining())
attr = next(JsonParser().parse(tokenizer.give_remaining()))
for key, value in attr.items():
attr[key] = serializer.serialize(value)
if section == "TABLE_NAME":
Expand Down
52 changes: 25 additions & 27 deletions py_partiql_parser/_internal/json_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,17 +22,30 @@ def parse(
) -> Any:
if not (original.startswith("{") or original.startswith("[")):
# Doesn't look like JSON - let's return as a variable
return original if original.isnumeric() else Variable(original)
yield original if original.isnumeric() else Variable(original)
tokenizer = tokenizer or ClauseTokenizer(original)
while tokenizer.current() is not None:
result = self._parse(original, tokenizer, only_parse_initial)
if result is not None:
yield result

def _parse(
self,
original: str,
tokenizer: ClauseTokenizer,
only_parse_initial: bool = False,
) -> Any:
section: Optional[str] = None # DICT_KEY | KEY_TO_VALUE | DICT_VAL | OBJECT_END
dict_key = ""
current_phrase = ""
result = CaseInsensitiveDict()
tokenizer = tokenizer or ClauseTokenizer(original)
level = 0
while True:
c = tokenizer.next()
if not c:
break
return None
elif c == "[" and (not section or section == "KEY_TO_VALUE"):
level += 1
# Start of a list
if not section:
return self._parse_list(original, tokenizer)
Expand All @@ -41,6 +54,8 @@ def parse(
section = None
current_phrase = ""
elif c in ["{", ","] and (not section or section == "OBJECT_END"):
if c == "{":
level += 1
# Start of a key
section = "DICT_KEY"
tokenizer.skip_until(ACCEPTED_QUOTES)
Expand All @@ -52,9 +67,10 @@ def parse(
section = "KEY_TO_VALUE"
current_phrase = ""
elif c in ["{"] and section == "KEY_TO_VALUE":
level += 1
# Start of a value with a new dictionary
tokenizer.revert() # Ensure we start the new parser with the initial {
result[dict_key] = self.parse(original, tokenizer)
result[dict_key] = self._parse(original, tokenizer)
section = None
current_phrase = ""
elif c in ACCEPTED_QUOTES and section == "KEY_TO_VALUE":
Expand All @@ -67,6 +83,7 @@ def parse(
section = None
current_phrase = ""
elif c in ["}"] and section in ["VAR_VALUE", "INT_VALUE"]:
level -= 1
# End of a variable/number
if section == "INT_VALUE":
result[dict_key] = int(current_phrase)
Expand All @@ -90,29 +107,11 @@ def parse(
tokenizer.revert()
section = None
current_phrase = ""
elif section in ["OBJECT_END"]:
next_documents = self.parse(original, tokenizer)
if next_documents == {}:
return result
elif isinstance(next_documents, list):
return [result] + next_documents
else:
return [result, next_documents]
elif c == "}" and section is None:
section = "OBJECT_END"
# We know whether we are at the end of an object at this point
# But we don't know whether this is:
# - end of the root object
# - end of a nested object
# - inbetween multiple objects (separated by new-line)
tokenizer.skip_white_space()
if tokenizer.current() == "{":
# we're inbetween multiple objects - continue parsing
tokenizer.revert()
pass
level -= 1
if level == 0:
return result
else:
# we're at the end of the root object - next char is probably None. Break and return to the user
# we're at the end of a nested object - next char is probably }. Break and let the parent processor takeover
break
elif c in [" ", NEW_LINE] and section not in ["DICT_KEY", "DICT_VAL"]:
pass
Expand All @@ -126,7 +125,6 @@ def parse(
section = "VAR_VALUE"
if section in ["DICT_KEY", "DICT_VAL", "INT_VALUE", "VAR_VALUE"]:
current_phrase += c

return result

def _parse_list(self, original: str, tokenizer: ClauseTokenizer) -> Any:
Expand All @@ -139,7 +137,7 @@ def _parse_list(self, original: str, tokenizer: ClauseTokenizer) -> Any:
break
if c == "{":
tokenizer.revert() # Ensure we start the new parser with the initial {
result.append(self.parse(original, tokenizer, only_parse_initial=True))
result.append(self._parse(original, tokenizer, only_parse_initial=True))
if tokenizer.current() == "]":
break
tokenizer.skip_until([","])
Expand Down
69 changes: 46 additions & 23 deletions tests/test_json_parser.py
Original file line number Diff line number Diff line change
@@ -1,83 +1,96 @@
import json
import pytest
from typing import Any
from uuid import uuid4
from py_partiql_parser._internal.json_parser import JsonParser, Variable


def test_static_value() -> None:
assert JsonParser().parse("a") == Variable("a")
assert next(JsonParser().parse("a")) == Variable("a")


def test_dict() -> None:
assert JsonParser().parse(json.dumps({"a": "b"})) == {"a": "b"}
assert JsonParser().parse("{'a': 'b'}") == {"a": "b"}
assert JsonParser().parse('{"a": "b"}') == {"a": "b"}
assert next(JsonParser().parse(json.dumps({"a": "b"}))) == {"a": "b"}
assert next(JsonParser().parse("{'a': 'b'}")) == {"a": "b"}
assert next(JsonParser().parse('{"a": "b"}')) == {"a": "b"}


def test_dict_with_spaces_in_keys_and_values() -> None:
assert JsonParser().parse(json.dumps({"a sth": "b sth"})) == {"a sth": "b sth"}
assert next(JsonParser().parse(json.dumps({"a sth": "b sth"}))) == {
"a sth": "b sth"
}


def test_dict_with_multiple_entries() -> None:
assert JsonParser().parse(json.dumps({"a": "b", "c": "d"})) == {"a": "b", "c": "d"}
assert next(JsonParser().parse(json.dumps({"a": "b", "c": "d"}))) == {
"a": "b",
"c": "d",
}


def test_dict_with_nested_entries() -> None:
original = {"a": {"b1": {"b1.1": "b1.2"}}, "c": "d"}
assert JsonParser().parse(json.dumps(original)) == original
assert next(JsonParser().parse(json.dumps(original))) == original


def test_dict_with_list() -> None:
assert JsonParser().parse(json.dumps({"a": ["b1", "b2"], "c": "d"})) == {
assert next(JsonParser().parse(json.dumps({"a": ["b1", "b2"], "c": "d"}))) == {
"a": ["b1", "b2"],
"c": "d",
}


def test_list() -> None:
assert JsonParser().parse(json.dumps(["a", "b", "asdfasdf"])) == [
assert next(JsonParser().parse(json.dumps(["a", "b", "asdfasdf"]))) == [
"a",
"b",
"asdfasdf",
]


def test_list_with_only_numbers() -> None:
assert JsonParser().parse(json.dumps([1, 1234, 12341234])) == [1, 1234, 12341234]
assert next(JsonParser().parse(json.dumps([1, 1234, 12341234]))) == [
1,
1234,
12341234,
]


def test_list_with_numbers_and_strings() -> None:
assert JsonParser().parse(json.dumps(["x", 1324, "y"])) == ["x", 1324, "y"]
assert next(JsonParser().parse(json.dumps(["x", 1324, "y"]))) == ["x", 1324, "y"]


def test_list_with_variables() -> None:
assert JsonParser().parse("[v.a, v.b]") == [Variable("v.a"), Variable("v.b")]
assert next(JsonParser().parse("[v.a, v.b]")) == [Variable("v.a"), Variable("v.b")]


def test_dict_with_key_containing_a_special_char() -> None:
assert JsonParser().parse(json.dumps({"a:a": "b"})) == {"a:a": "b"}
assert next(JsonParser().parse(json.dumps({"a:a": "b"}))) == {"a:a": "b"}


def test_dict_with_value_containing_a_special_char() -> None:
assert JsonParser().parse(json.dumps({"a": "b:b"})) == {"a": "b:b"}
assert next(JsonParser().parse(json.dumps({"a": "b:b"}))) == {"a": "b:b"}


def test_dict_containing_a_number() -> None:
original = "[{'a':'legit', 'b':1}, {'a':400, 'b':2}]"
assert JsonParser().parse(original) == [{"a": "legit", "b": 1}, {"a": 400, "b": 2}]
@pytest.mark.parametrize(
"original",
[[{"a": "legit", "b": 1}, {"a": 400, "b": 2}], {"a": "legit", "b": {"nr": 25}}],
)
def test_dict_containing_a_number(original: str) -> None:
assert next(JsonParser().parse(json.dumps(original))) == original


def test_dict_containing_a_variable() -> None:
original = "[{'a':'legit', 'b':1}, {'a':qwer, 'b':'2'}]"
assert JsonParser().parse(original) == [
assert next(JsonParser().parse(original)) == [
{"a": "legit", "b": 1},
{"a": Variable("qwer"), "b": "2"},
]


def test_unusual_quotes() -> None:
original = "[{’a’:1, ’b’:true}, {’a’:2, ’b’:null}, {’a’:3}]"
assert JsonParser().parse(original) == [
assert next(JsonParser().parse(original)) == [
{"a": 1, "b": True},
{"a": 2, "b": Variable(None)},
{"a": 3},
Expand All @@ -96,7 +109,7 @@ def test_parse_multiple_objects() -> None:
}
"""
assert JsonParser().parse(multi_object_string) == [
assert list(JsonParser().parse(multi_object_string)) == [
{"a1": "v1", "a1": "v2"},
{"a2": "w1", "a2": "w2"},
{"a3": "z"},
Expand All @@ -112,15 +125,25 @@ def test_parse_multiple_objects() -> None:
],
)
def test_list_and_string_are_siblings(source: Any) -> None: # type: ignore[misc]
assert JsonParser().parse(json.dumps(source)) == source
assert next(JsonParser().parse(json.dumps(source))) == source


def test_bool_parser() -> None:
assert JsonParser().parse(json.dumps({"sth": False})) == {"sth": False}
assert next(JsonParser().parse(json.dumps({"sth": False}))) == {"sth": False}


def test_multiline_bool_parser() -> None:
obj1 = {"sth": False}
obj2 = {"k1": "v1"}
combined = json.dumps(obj1) + "\n" + json.dumps(obj2)
assert JsonParser().parse(combined) == [obj1, obj2]
assert list(JsonParser().parse(combined)) == [obj1, obj2]


@pytest.mark.parametrize("nr_of_docs", [1, 25, 2500])
def test_large_object(nr_of_docs: int) -> None:
data = "".join(
[json.dumps({"pk": f"pk{i}", "data": str(uuid4())}) for i in range(nr_of_docs)]
)

res = list(JsonParser().parse(data))
assert len(res) == nr_of_docs

0 comments on commit 9b48238

Please sign in to comment.