From f59fec86e6e48019d1be4bbea6849786524d8ec3 Mon Sep 17 00:00:00 2001 From: Lennart Regebro Date: Mon, 3 Apr 2023 15:41:59 +0200 Subject: [PATCH] Added an experimental --best-match method (#105) * Added an experimental --best-match method * Preparing for beta release * Preparing release 2.6b1 * Back to development: 2.6 --- CHANGES.rst | 9 +++++++ setup.cfg | 2 +- tests/test_diff.py | 60 ++++++++++++++++++++++++++++++++++++++++++---- xmldiff/diff.py | 37 +++++++++++++++++++++++++--- xmldiff/main.py | 10 +++++++- 5 files changed, 108 insertions(+), 10 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index e3470fc..58227fe 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -7,6 +7,15 @@ Changes - Nothing changed yet. +2.6b1 (2023-01-12) +------------------ + +- Added an experimental --best-match method that is slower, but generates + smaller diffs when you have many nodes that are similar. + +- The -F argument now also affects the --fast-match stage. + + 2.5 (2023-01-11) ---------------- diff --git a/setup.cfg b/setup.cfg index 338c98a..05b2db2 100644 --- a/setup.cfg +++ b/setup.cfg @@ -4,7 +4,7 @@ version = 2.6.dev0 description = Creates diffs of XML files long_description = file: README.rst, CHANGES.rst classifiers = - Development Status :: 5 - Production/Stable + Development Status :: 4 - Beta Intended Audience :: Developers Intended Audience :: End Users/Desktop Topic :: Text Processing :: Markup :: XML diff --git a/tests/test_diff.py b/tests/test_diff.py index 8f5fa34..2947eb9 100644 --- a/tests/test_diff.py +++ b/tests/test_diff.py @@ -821,11 +821,11 @@ def test_entirely_different(self): ) -class FastMatchTests(unittest.TestCase): - def _match(self, left, right, fast_match): +class BestFastMatchTests(unittest.TestCase): + def _match(self, left, right, fast_match=False, best_match=False): left_tree = etree.fromstring(left) right_tree = etree.fromstring(right) - differ = Differ(fast_match=fast_match) + differ = Differ(fast_match=fast_match, best_match=best_match) 
differ.set_trees(left_tree, right_tree) matches = differ.match() lpath = differ.left.getroottree().getpath @@ -860,9 +860,11 @@ def test_move_paragraph(self): """ # Same matches as the non-fast match test, but the matches are # a different order. - slow_result = sorted(self._match(left, right, False)) - fast_result = sorted(self._match(left, right, True)) + slow_result = sorted(self._match(left, right)) + fast_result = sorted(self._match(left, right, fast_match=True)) + best_result = sorted(self._match(left, right, best_match=True)) self.assertEqual(slow_result, fast_result) + self.assertEqual(slow_result, best_result) def test_move_children(self): # Here the paragraphs are all so similar that that each paragraph @@ -917,6 +919,54 @@ def test_move_children(self): ], ) + # Best should be as good as slow (but slower) + best_result = sorted(self._match(left, right, best_match=True)) + self.assertEqual(best_result, slow_result) + + def test_delete_node(self): + # If you have a list of similar nodes, and delete one, that + # confuses both the standard and the fast algorithm: + left = """ + + + + + + +""" + right = """ + + + + + +""" + + slow_result = sorted(self._match(left, right)) + fast_result = sorted(self._match(left, right, fast_match=True)) + best_result = sorted(self._match(left, right, best_match=True)) + self.assertEqual( + slow_result, + [ + ("/root", "/root"), + ("/root/node[1]", "/root/node[1]"), + ("/root/node[2]", "/root/node[2]"), + ("/root/node[3]", "/root/node[3]"), + ("/root/node[4]", "/root/node[4]"), + ], + ) + self.assertEqual(fast_result, slow_result) + self.assertEqual( + best_result, + [ + ("/root", "/root"), + ("/root/node[1]", "/root/node[1]"), + ("/root/node[2]", "/root/node[2]"), + ("/root/node[4]", "/root/node[3]"), + ("/root/node[5]", "/root/node[4]"), + ], + ) + class UpdateNodeTests(unittest.TestCase): """Testing only the update phase of the diffing""" diff --git a/xmldiff/diff.py b/xmldiff/diff.py index 589da31..004ce5c 100644 --- 
a/xmldiff/diff.py +++ b/xmldiff/diff.py @@ -12,6 +12,7 @@ def __init__( uniqueattrs=None, ratio_mode="fast", fast_match=False, + best_match=False, ignored_attrs=[], ): # The minimum similarity between two nodes to consider them equal @@ -25,6 +26,7 @@ def __init__( uniqueattrs = ["{http://www.w3.org/XML/1998/namespace}id"] self.uniqueattrs = uniqueattrs self.fast_match = fast_match + self.best_match = best_match # Avoid recreating this for every node self._sequencematcher = SequenceMatcher() @@ -120,7 +122,7 @@ def match(self, left=None, right=None): # First find matches with longest_common_subsequence: matches = list( utils.longest_common_subsequence( - lnodes, rnodes, lambda x, y: self.node_ratio(x, y) >= 0.5 + lnodes, rnodes, lambda x, y: self.node_ratio(x, y) >= self.F ) ) @@ -130,8 +132,37 @@ def match(self, left=None, right=None): # Then remove the nodes (needs to be done backwards): for left_match, right_match in reversed(matches): - lnode = lnodes.pop(left_match) - rnode = rnodes.pop(right_match) + lnodes.pop(left_match) + rnodes.pop(right_match) + + elif self.best_match: + unmatched_lnodes = [] + + # First find all nodes that match perfectly + for lnode in lnodes: + max_match = 0 + match_node = None + + for rnode in rnodes: + match = self.node_ratio(lnode, rnode) + if match == 1.0: + self.append_match(lnode, rnode, 1.0) + rnodes.remove(rnode) + break + + if match > max_match: + match_node = rnode + max_match = match + else: + unmatched_lnodes.append((lnode, match_node, max_match)) + # unmatched_lnodes.append(lnode) + + lnodes = [] + for lnode, rnode, max_match in unmatched_lnodes: + if max_match >= self.F and rnode in rnodes: + self.append_match(lnode, rnode, max_match) + else: + lnodes.append(lnode) for lnode in lnodes: max_match = 0 diff --git a/xmldiff/main.py b/xmldiff/main.py index 408c957..e5cb766 100644 --- a/xmldiff/main.py +++ b/xmldiff/main.py @@ -115,9 +115,15 @@ def make_diff_parser(): choices={"accurate", "fast", "faster"}, help="Choose the 
node comparison optimization.", ) - parser.add_argument( + match_group = parser.add_mutually_exclusive_group() + match_group.add_argument( "--fast-match", action="store_true", help="A faster, less optimal match run." ) + match_group.add_argument( + "--best-match", + action="store_true", + help="A slower, two-stage match run that may result in smaller diffs. (Experimental)", + ) parser.add_argument( "--ignored-attributes", type=str, @@ -161,8 +167,10 @@ def diff_command(args=None): "ratio_mode": args.ratio_mode, "F": args.F, "fast_match": args.fast_match, + "best_match": args.best_match, "uniqueattrs": _parse_uniqueattrs(args.unique_attributes), } + result = diff_files( args.file1, args.file2, diff_options=diff_options, formatter=formatter )