diff --git a/CHANGES.rst b/CHANGES.rst
index e3470fc..58227fe 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -7,6 +7,15 @@ Changes
- Nothing changed yet.
+2.6b1 (2023-01-12)
+------------------
+
+- Added an experimental --best-match method that is slower, but generates
+ smaller diffs when you have many nodes that are similar.
+
+- The -F argument now also affects the --fast-match stage.
+
+
2.5 (2023-01-11)
----------------
diff --git a/setup.cfg b/setup.cfg
index 338c98a..05b2db2 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -4,7 +4,7 @@ version = 2.6.dev0
description = Creates diffs of XML files
long_description = file: README.rst, CHANGES.rst
classifiers =
- Development Status :: 5 - Production/Stable
+ Development Status :: 4 - Beta
Intended Audience :: Developers
Intended Audience :: End Users/Desktop
Topic :: Text Processing :: Markup :: XML
diff --git a/tests/test_diff.py b/tests/test_diff.py
index 8f5fa34..2947eb9 100644
--- a/tests/test_diff.py
+++ b/tests/test_diff.py
@@ -821,11 +821,11 @@ def test_entirely_different(self):
)
-class FastMatchTests(unittest.TestCase):
- def _match(self, left, right, fast_match):
+class BestFastMatchTests(unittest.TestCase):
+ def _match(self, left, right, fast_match=False, best_match=False):
left_tree = etree.fromstring(left)
right_tree = etree.fromstring(right)
- differ = Differ(fast_match=fast_match)
+ differ = Differ(fast_match=fast_match, best_match=best_match)
differ.set_trees(left_tree, right_tree)
matches = differ.match()
lpath = differ.left.getroottree().getpath
@@ -860,9 +860,11 @@ def test_move_paragraph(self):
"""
# Same matches as the non-fast match test, but the matches are
# a different order.
- slow_result = sorted(self._match(left, right, False))
- fast_result = sorted(self._match(left, right, True))
+ slow_result = sorted(self._match(left, right))
+ fast_result = sorted(self._match(left, right, fast_match=True))
+ best_result = sorted(self._match(left, right, best_match=True))
self.assertEqual(slow_result, fast_result)
+ self.assertEqual(slow_result, best_result)
def test_move_children(self):
# Here the paragraphs are all so similar that that each paragraph
@@ -917,6 +919,54 @@ def test_move_children(self):
],
)
+ # Best should be as good as slow (but slower)
+ best_result = sorted(self._match(left, right, best_match=True))
+ self.assertEqual(best_result, slow_result)
+
+ def test_delete_node(self):
+ # If you have a list of similar nodes, and delete one, that
+ # confuses both the standard and the fast algorithm:
+ left = """
+
+
+
+
+
+
+"""
+ right = """
+
+
+
+
+
+"""
+
+ slow_result = sorted(self._match(left, right))
+ fast_result = sorted(self._match(left, right, fast_match=True))
+ best_result = sorted(self._match(left, right, best_match=True))
+ self.assertEqual(
+ slow_result,
+ [
+ ("/root", "/root"),
+ ("/root/node[1]", "/root/node[1]"),
+ ("/root/node[2]", "/root/node[2]"),
+ ("/root/node[3]", "/root/node[3]"),
+ ("/root/node[4]", "/root/node[4]"),
+ ],
+ )
+ self.assertEqual(fast_result, slow_result)
+ self.assertEqual(
+ best_result,
+ [
+ ("/root", "/root"),
+ ("/root/node[1]", "/root/node[1]"),
+ ("/root/node[2]", "/root/node[2]"),
+ ("/root/node[4]", "/root/node[3]"),
+ ("/root/node[5]", "/root/node[4]"),
+ ],
+ )
+
class UpdateNodeTests(unittest.TestCase):
"""Testing only the update phase of the diffing"""
diff --git a/xmldiff/diff.py b/xmldiff/diff.py
index 589da31..004ce5c 100644
--- a/xmldiff/diff.py
+++ b/xmldiff/diff.py
@@ -12,6 +12,7 @@ def __init__(
uniqueattrs=None,
ratio_mode="fast",
fast_match=False,
+ best_match=False,
ignored_attrs=[],
):
# The minimum similarity between two nodes to consider them equal
@@ -25,6 +26,7 @@ def __init__(
uniqueattrs = ["{http://www.w3.org/XML/1998/namespace}id"]
self.uniqueattrs = uniqueattrs
self.fast_match = fast_match
+ self.best_match = best_match
# Avoid recreating this for every node
self._sequencematcher = SequenceMatcher()
@@ -120,7 +122,7 @@ def match(self, left=None, right=None):
# First find matches with longest_common_subsequence:
matches = list(
utils.longest_common_subsequence(
- lnodes, rnodes, lambda x, y: self.node_ratio(x, y) >= 0.5
+ lnodes, rnodes, lambda x, y: self.node_ratio(x, y) >= self.F
)
)
@@ -130,8 +132,37 @@ def match(self, left=None, right=None):
# Then remove the nodes (needs to be done backwards):
for left_match, right_match in reversed(matches):
- lnode = lnodes.pop(left_match)
- rnode = rnodes.pop(right_match)
+ lnodes.pop(left_match)
+ rnodes.pop(right_match)
+
+ elif self.best_match:
+ unmatched_lnodes = []
+
+ # First find all nodes that match perfectly
+ for lnode in lnodes:
+ max_match = 0
+ match_node = None
+
+ for rnode in rnodes:
+ match = self.node_ratio(lnode, rnode)
+ if match == 1.0:
+ self.append_match(lnode, rnode, 1.0)
+ rnodes.remove(rnode)
+ break
+
+ if match > max_match:
+ match_node = rnode
+ max_match = match
+ else:
+ unmatched_lnodes.append((lnode, match_node, max_match))
+ # Keep the best imperfect candidate and its ratio for the second pass.
+
+ lnodes = []
+ for lnode, rnode, max_match in unmatched_lnodes:
+ if max_match >= self.F and rnode in rnodes:
+ self.append_match(lnode, rnode, max_match)
+ else:
+ lnodes.append(lnode)
for lnode in lnodes:
max_match = 0
diff --git a/xmldiff/main.py b/xmldiff/main.py
index 408c957..e5cb766 100644
--- a/xmldiff/main.py
+++ b/xmldiff/main.py
@@ -115,9 +115,15 @@ def make_diff_parser():
choices={"accurate", "fast", "faster"},
help="Choose the node comparison optimization.",
)
- parser.add_argument(
+ match_group = parser.add_mutually_exclusive_group()
+ match_group.add_argument(
"--fast-match", action="store_true", help="A faster, less optimal match run."
)
+ match_group.add_argument(
+ "--best-match",
+ action="store_true",
+ help="A slower, two-stage match run that may result in smaller diffs. (Experimental)",
+ )
parser.add_argument(
"--ignored-attributes",
type=str,
@@ -161,8 +167,10 @@ def diff_command(args=None):
"ratio_mode": args.ratio_mode,
"F": args.F,
"fast_match": args.fast_match,
+ "best_match": args.best_match,
"uniqueattrs": _parse_uniqueattrs(args.unique_attributes),
}
+
result = diff_files(
args.file1, args.file2, diff_options=diff_options, formatter=formatter
)