From df0c9842d674cd3677685d1ad4b24c2b49839d0b Mon Sep 17 00:00:00 2001 From: Seperman Date: Sun, 5 Feb 2023 23:12:03 -0800 Subject: [PATCH] Making orjson optional. Adding PrefixOrSuffixOperator --- README.md | 4 ++++ deepdiff/commands.py | 13 +++++++++- deepdiff/operator.py | 11 +++++++++ deepdiff/serialization.py | 24 ++++++++++++------- docs/custom.rst | 50 +++++++++++++++++++++++++++++++++++---- docs/index.rst | 4 ++++ docs/optimizations.rst | 9 +++++++ requirements-dev-3.7.txt | 2 +- requirements-dev.txt | 3 ++- requirements-optimize.txt | 1 + requirements.txt | 1 - setup.py | 3 +++ tests/test_command.py | 16 ++++++------- tests/test_operators.py | 25 +++++++++++++++++++- 14 files changed, 139 insertions(+), 27 deletions(-) create mode 100644 requirements-optimize.txt diff --git a/README.md b/README.md index b2301818..620804d9 100644 --- a/README.md +++ b/README.md @@ -48,6 +48,10 @@ If you want to use DeepDiff from commandline: `pip install "deepdiff[cli]"` +If you want to improve the performance of DeepDiff with certain processes such as json serialization: + +`pip install "deepdiff[optimize]"` + ### Importing ```python diff --git a/deepdiff/commands.py b/deepdiff/commands.py index 86daee40..72629632 100644 --- a/deepdiff/commands.py +++ b/deepdiff/commands.py @@ -11,6 +11,11 @@ from deepdiff import Delta, DeepSearch, extract as deep_extract from deepdiff.serialization import load_path_content, save_content_to_path +try: + import orjson +except ImportError: + orjson = None + @click.group() def cli(): @@ -105,7 +110,13 @@ def diff( # printing into stdout sys.stdout.buffer.write(delta.dumps()) else: - pprint(diff, indent=2) + try: + if orjson: + print(diff.to_json(option=orjson.OPT_INDENT_2)) + else: + print(diff.to_json(indent=2)) + except Exception: + pprint(diff, indent=2) @cli.command() diff --git a/deepdiff/operator.py b/deepdiff/operator.py index 058c5c81..b7e2596f 100644 --- a/deepdiff/operator.py +++ b/deepdiff/operator.py @@ -25,3 +25,14 @@ def 
match(self, level) -> bool: def give_up_diffing(self, level, diff_instance) -> bool: raise NotImplementedError('Please implement the diff function.') + + +class PrefixOrSuffixOperator: + + def match(self, level) -> bool: + return level.t1 and level.t2 and isinstance(level.t1, str) and isinstance(level.t2, str) + + def give_up_diffing(self, level, diff_instance) -> bool: + t1 = level.t1 + t2 = level.t2 + return t1.startswith(t2) or t2.startswith(t1) diff --git a/deepdiff/serialization.py b/deepdiff/serialization.py index 926298d2..8a859e53 100644 --- a/deepdiff/serialization.py +++ b/deepdiff/serialization.py @@ -3,7 +3,6 @@ import io import os import json -import orjson import uuid import logging import re # NOQA @@ -26,6 +25,11 @@ except ImportError: # pragma: no cover. import csv clevercsv = None # pragma: no cover. +try: + import orjson +except ImportError: # pragma: no cover. + orjson = None + from copy import deepcopy from functools import partial from collections.abc import Mapping @@ -556,15 +560,17 @@ def object_hook(self, obj): def json_dumps(item, default_mapping=None, **kwargs): """ Dump json with extra details that are not normally json serializable - - Note: I tried to replace json with orjson for its speed. It does work - but the output it makes is a byte object and Postgres couldn't directly use it without - encoding to str. So I switched back to json. 
""" - return orjson.dumps( - item, - default=json_convertor_default(default_mapping=default_mapping), - **kwargs).decode(encoding='utf-8') + if orjson: + return orjson.dumps( + item, + default=json_convertor_default(default_mapping=default_mapping), + **kwargs).decode(encoding='utf-8') + else: + return json.dumps( + item, + default=json_convertor_default(default_mapping=default_mapping), + **kwargs) json_loads = partial(json.loads, cls=JSONDecoder) diff --git a/docs/custom.rst b/docs/custom.rst index e371fcdb..586270c4 100644 --- a/docs/custom.rst +++ b/docs/custom.rst @@ -128,21 +128,61 @@ For example you could use the level object to further determine if the 2 objects Custom Operators ---------------- -Whether two objects are different or not are largely depend on the context. For example, apple and banana are the same +Whether two objects are different or not largely depends on the context. For example, apples and bananas are the same if you are considering whether they are fruits or not. In that case, you can pass a *custom_operators* for the job. -In fact, custom operators give you a lot of power. In the following examples we explore use cases from making DeepDiff -report the L2 Distance of items, to only include certain paths in diffing all the way to making DeepDiff stop diffing -as soon as the first diff is reported. +Custom operators give you a lot of power. In the following examples, we explore various use cases such as: + +- Making DeepDiff report the L2 Distance of items +- Only include specific paths in diffing +- Making DeepDiff stop diffing once we find the first diff. + +You can use one of the predefined custom operators that come with DeepDiff. Or you can define one yourself. + + +Built-In Custom Operators + + +PrefixOrSuffixOperator +...................... + + +This operator will skip strings that are suffix or prefix of each other. 
+ +For example, when this operator is used, the two strings "joe" and "joe's car" will not be reported as different. + + >>> from deepdiff import DeepDiff + >>> from deepdiff.operator import PrefixOrSuffixOperator + >>> t1 = { + ... "key1": ["foo", "bar's food", "jack", "joe"] + ... } + >>> t2 = { + ... "key1": ["foo", "bar", "jill", "joe'car"] + ... } + >>> + >>> DeepDiff(t1, t2) + {'values_changed': {"root['key1'][1]": {'new_value': 'bar', 'old_value': "bar's food"}, "root['key1'][2]": {'new_value': 'jill', 'old_value': 'jack'}, "root['key1'][3]": {'new_value': "joe'car", 'old_value': 'joe'}}} + >>> DeepDiff(t1, t2, custom_operators=[ + ... PrefixOrSuffixOperator() + ... ]) + >>> + {'values_changed': {"root['key1'][2]": {'new_value': 'jill', 'old_value': 'jack'}}} + + + + +Define A Custom Operator +------------------------ + To define an custom operator, you just need to inherit a *BaseOperator* and * implement a give_up_diffing method * give_up_diffing(level: DiffLevel, diff_instance: DeepDiff) -> boolean - If it returns True, then we will give up diffing the 2 objects. + If it returns True, then we will give up diffing the two objects. You may or may not use the diff_instance.custom_report_result within this function to report any diff. If you decide not to report anything, and this function returns True, then the objects are basically skipped in the results. 
diff --git a/docs/index.rst b/docs/index.rst index 331936c8..b2428406 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -76,6 +76,10 @@ If you want to use DeepDiff from commandline:: pip install "deepdiff[cli]" +If you want to improve the performance of DeepDiff with certain processes such as json serialization:: + + pip install "deepdiff[optimize]" + Read about DeepDiff optimizations at :ref:`optimizations_label` Importing diff --git a/docs/optimizations.rst b/docs/optimizations.rst index 9be272e5..273613d6 100644 --- a/docs/optimizations.rst +++ b/docs/optimizations.rst @@ -8,6 +8,15 @@ Optimizations If you are dealing with large nested objects and ignore_order=True, chances are DeepDiff takes a while to calculate the diff. Here are some tips that may help you with optimizations and progress report. +Optimized Libraries +------------------- + +If you dump DeepDiff or Delta objects as json, you can improve the performance by installing orjson. +DeepDiff will automatically use orjson instead of Python's built-in json library to do json serialization. 
+ + pip install "deepdiff[optimize]" + + Max Passes ---------- diff --git a/requirements-dev-3.7.txt b/requirements-dev-3.7.txt index e46e73b7..83c448fd 100644 --- a/requirements-dev-3.7.txt +++ b/requirements-dev-3.7.txt @@ -1,4 +1,3 @@ -wheel==0.38.1 -r requirements.txt -r requirements-cli.txt bump2version==1.0.1 @@ -8,3 +7,4 @@ numpy==1.21.6 pytest==7.1.2 python-dotenv==0.20.0 python-dateutil==2.8.2 +wheel==0.38.1 diff --git a/requirements-dev.txt b/requirements-dev.txt index 51ad3962..728d16ab 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,4 +1,3 @@ -wheel==0.38.1 -r requirements.txt -r requirements-cli.txt bump2version==1.0.1 @@ -14,3 +13,5 @@ Sphinx==5.3.0 sphinx-sitemap==2.2.1 flake8==6.0.0 python-dateutil==2.8.2 +orjson==3.8.3 +wheel==0.38.1 diff --git a/requirements-optimize.txt b/requirements-optimize.txt new file mode 100644 index 00000000..b3fe036f --- /dev/null +++ b/requirements-optimize.txt @@ -0,0 +1 @@ +orjson diff --git a/requirements.txt b/requirements.txt index 82d09d8d..c8de6a12 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1 @@ ordered-set>=4.0.2,<4.2.0 -orjson diff --git a/setup.py b/setup.py index 0bb80331..4953da7c 100755 --- a/setup.py +++ b/setup.py @@ -21,6 +21,7 @@ def get_reqs(filename): reqs = get_reqs("requirements.txt") cli_reqs = get_reqs("requirements-cli.txt") +optimize_reqs = get_reqs("requirements-optimize.txt") with open('README.md') as file: long_description = file.read() @@ -45,6 +46,7 @@ def get_reqs(filename): python_requires='>=3.7', extras_require={ "cli": cli_reqs, + "optimize": optimize_reqs, }, classifiers=[ "Intended Audience :: Developers", @@ -54,6 +56,7 @@ def get_reqs(filename): "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", "Programming Language :: Python :: Implementation :: PyPy", "Development Status :: 5 - Production/Stable", "License :: OSI Approved :: 
MIT License" diff --git a/tests/test_command.py b/tests/test_command.py index 49a706c1..894b1ac1 100644 --- a/tests/test_command.py +++ b/tests/test_command.py @@ -11,14 +11,14 @@ class TestCommands: @pytest.mark.parametrize('t1, t2, expected_in_stdout, expected_exit_code', [ - ('t1.json', 't2.json', "'dictionary_item_added\': [root[0]", 0), + ('t1.json', 't2.json', '"dictionary_item_added": [\n "root[0]', 0), ('t1_corrupt.json', 't2.json', "Expecting property name enclosed in double quotes", 1), - ('t1.json', 't2_json.csv', "'old_value\': \'value2\'", 0), - ('t2_json.csv', 't1.json', "'old_value\': \'value3\'", 0), - ('t1.csv', 't2.csv', "\'new_value\': \'James\'", 0), + ('t1.json', 't2_json.csv', '"old_value": "value2"', 0), + ('t2_json.csv', 't1.json', '"old_value": "value3"', 0), + ('t1.csv', 't2.csv', '"new_value": "James"', 0), ('t1.toml', 't2.toml', "10.0.0.2", 0), - ('t1.pickle', 't2.pickle', "'new_value': 5, 'old_value': 1", 0), - ('t1.yaml', 't2.yaml', "'new_value': 61, 'old_value': 65", 0), + ('t1.pickle', 't2.pickle', '"new_value": 5,\n "old_value": 1', 0), + ('t1.yaml', 't2.yaml', '"new_value": 61,\n "old_value": 65', 0), ]) def test_diff_command(self, t1, t2, expected_in_stdout, expected_exit_code): t1 = os.path.join(FIXTURES_DIR, t1) @@ -74,7 +74,7 @@ def test_command_group_by(self): diffed = runner.invoke(diff, [t1, t2, '--group-by', 'id']) assert 0 == diffed.exit_code assert 'values_changed' in diffed.output - assert '\'new_value\': \'Chicken\'' in diffed.output + assert '"new_value": "Chicken"' in diffed.output def test_command_math_epsilon(self): t1 = os.path.join(FIXTURES_DIR, 'd_t1.yaml') @@ -86,7 +86,7 @@ def test_command_math_epsilon(self): diffed2 = runner.invoke(diff, [t1, t2, '--math-epsilon', '0.001']) assert 0 == diffed2.exit_code - assert "{'values_changed': {'root[2][2]': {'new_value': 0.289, 'old_value': 0.288}}}\n" == diffed2.output + assert '{\n "values_changed": {\n "root[2][2]": {\n "new_value": 0.289,\n "old_value": 0.288\n }\n 
}\n}\n' == diffed2.output def test_command_grep(self): path = os.path.join(FIXTURES_DIR, 'd_t1.yaml') diff --git a/tests/test_operators.py b/tests/test_operators.py index c3d28b21..7e0baf6e 100644 --- a/tests/test_operators.py +++ b/tests/test_operators.py @@ -2,7 +2,7 @@ from typing import List from deepdiff import DeepDiff -from deepdiff.operator import BaseOperator +from deepdiff.operator import BaseOperator, PrefixOrSuffixOperator class TestOperators: @@ -217,3 +217,26 @@ def give_up_diffing(self, level, diff_instance) -> bool: expected = {'values_changed': {'root[0][1]': {'new_value': 3, 'old_value': 2}}} assert expected == ddiff + + def test_prefix_or_suffix_diff(self): + + t1 = { + "key1": ["foo", "bar's food", "jack", "joe"] + } + t2 = { + "key1": ["foo", "bar", "jill", "joe'car"] + } + + ddiff = DeepDiff(t1, t2, custom_operators=[ + PrefixOrSuffixOperator() + ]) + + expected = {'values_changed': {"root['key1'][2]": {'new_value': 'jill', 'old_value': 'jack'}}} + assert expected == ddiff + + ddiff2 = DeepDiff(t1, t2, ignore_order=True, custom_operators=[ + PrefixOrSuffixOperator() + ]) + + expected2 = {'values_changed': {"root['key1'][2]": {'new_value': 'jill', 'old_value': 'jack'}}} + assert expected2 == ddiff2