From f59fec86e6e48019d1be4bbea6849786524d8ec3 Mon Sep 17 00:00:00 2001
From: Lennart Regebro <regebro@gmail.com>
Date: Mon, 3 Apr 2023 15:41:59 +0200
Subject: [PATCH] Added an experimental --best-match method (#105)

* Added an experimental --best-match method

* Preparing for beta release

* Preparing release 2.6b1

* Back to development: 2.6
---
 CHANGES.rst        |  9 +++++++
 setup.cfg          |  2 +-
 tests/test_diff.py | 60 ++++++++++++++++++++++++++++++++++++++++++----
 xmldiff/diff.py    | 37 +++++++++++++++++++++++++---
 xmldiff/main.py    | 10 +++++++-
 5 files changed, 108 insertions(+), 10 deletions(-)
diff --git a/CHANGES.rst b/CHANGES.rst
index e3470fc..58227fe 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -7,6 +7,15 @@ Changes
 - Nothing changed yet.
 
 
+2.6b1 (2023-01-12)
+------------------
+
+- Added an experimental --best-match method that is slower, but generates
+  smaller diffs when you have many nodes that are similar.
+
+- The -F argument now also affects the --fast-match stage.
+
+
 2.5 (2023-01-11)
 ----------------
 
diff --git a/setup.cfg b/setup.cfg
index 338c98a..05b2db2 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -4,7 +4,7 @@ version = 2.6.dev0
 description = Creates diffs of XML files
 long_description = file: README.rst, CHANGES.rst
 classifiers =
-    Development Status :: 5 - Production/Stable
+    Development Status :: 4 - Beta
     Intended Audience :: Developers
     Intended Audience :: End Users/Desktop
     Topic :: Text Processing :: Markup :: XML
diff --git a/tests/test_diff.py b/tests/test_diff.py
index 8f5fa34..2947eb9 100644
--- a/tests/test_diff.py
+++ b/tests/test_diff.py
@@ -821,11 +821,11 @@ def test_entirely_different(self):
         )
 
 
-class FastMatchTests(unittest.TestCase):
-    def _match(self, left, right, fast_match):
+class BestFastMatchTests(unittest.TestCase):
+    def _match(self, left, right, fast_match=False, best_match=False):
         left_tree = etree.fromstring(left)
         right_tree = etree.fromstring(right)
-        differ = Differ(fast_match=fast_match)
+        differ = Differ(fast_match=fast_match, best_match=best_match)
         differ.set_trees(left_tree, right_tree)
         matches = differ.match()
         lpath = differ.left.getroottree().getpath
@@ -860,9 +860,11 @@ def test_move_paragraph(self):
 """
         # Same matches as the non-fast match test, but the matches are
         # a different order.
-        slow_result = sorted(self._match(left, right, False))
-        fast_result = sorted(self._match(left, right, True))
+        slow_result = sorted(self._match(left, right))
+        fast_result = sorted(self._match(left, right, fast_match=True))
+        best_result = sorted(self._match(left, right, best_match=True))
         self.assertEqual(slow_result, fast_result)
+        self.assertEqual(slow_result, best_result)
 
     def test_move_children(self):
         # Here the paragraphs are all so similar that that each paragraph
@@ -917,6 +919,54 @@ def test_move_children(self):
             ],
         )
 
+        # Best should be as good as slow (but slower)
+        best_result = sorted(self._match(left, right, best_match=True))
+        self.assertEqual(best_result, slow_result)
+
+    def test_delete_node(self):
+        # If you have a list of similar nodes, and delete one, that
+        # confuses both the standard and the fast algorithm:
+        left = """<root>
+<node id="1"/>
+<node id="2"/>
+<node id="3"/>
+<node id="4"/>
+<node id="5"/>
+</root>
+"""
+        right = """<root>
+<node id="1"/>
+<node id="2"/>
+<node id="4"/>
+<node id="5"/>
+</root>
+"""
+
+        slow_result = sorted(self._match(left, right))
+        fast_result = sorted(self._match(left, right, fast_match=True))
+        best_result = sorted(self._match(left, right, best_match=True))
+        self.assertEqual(
+            slow_result,
+            [
+                ("/root", "/root"),
+                ("/root/node[1]", "/root/node[1]"),
+                ("/root/node[2]", "/root/node[2]"),
+                ("/root/node[3]", "/root/node[3]"),
+                ("/root/node[4]", "/root/node[4]"),
+            ],
+        )
+        self.assertEqual(fast_result, slow_result)
+        self.assertEqual(
+            best_result,
+            [
+                ("/root", "/root"),
+                ("/root/node[1]", "/root/node[1]"),
+                ("/root/node[2]", "/root/node[2]"),
+                ("/root/node[4]", "/root/node[3]"),
+                ("/root/node[5]", "/root/node[4]"),
+            ],
+        )
+
 
 class UpdateNodeTests(unittest.TestCase):
     """Testing only the update phase of the diffing"""
diff --git a/xmldiff/diff.py b/xmldiff/diff.py
index 589da31..004ce5c 100644
--- a/xmldiff/diff.py
+++ b/xmldiff/diff.py
@@ -12,6 +12,7 @@ def __init__(
         uniqueattrs=None,
         ratio_mode="fast",
         fast_match=False,
+        best_match=False,
         ignored_attrs=[],
     ):
         # The minimum similarity between two nodes to consider them equal
@@ -25,6 +26,7 @@ def __init__(
             uniqueattrs = ["{http://www.w3.org/XML/1998/namespace}id"]
         self.uniqueattrs = uniqueattrs
         self.fast_match = fast_match
+        self.best_match = best_match
 
         # Avoid recreating this for every node
         self._sequencematcher = SequenceMatcher()
@@ -120,7 +122,7 @@ def match(self, left=None, right=None):
             # First find matches with longest_common_subsequence:
             matches = list(
                 utils.longest_common_subsequence(
-                    lnodes, rnodes, lambda x, y: self.node_ratio(x, y) >= 0.5
+                    lnodes, rnodes, lambda x, y: self.node_ratio(x, y) >= self.F
                 )
             )
 
@@ -130,8 +132,37 @@ def match(self, left=None, right=None):
 
             # Then remove the nodes (needs to be done backwards):
             for left_match, right_match in reversed(matches):
-                lnode = lnodes.pop(left_match)
-                rnode = rnodes.pop(right_match)
+                lnodes.pop(left_match)
+                rnodes.pop(right_match)
+
+        elif self.best_match:
+            unmatched_lnodes = []
+
+            # First find all nodes that match perfectly
+            for lnode in lnodes:
+                max_match = 0
+                match_node = None
+
+                for rnode in rnodes:
+                    match = self.node_ratio(lnode, rnode)
+                    if match == 1.0:
+                        self.append_match(lnode, rnode, 1.0)
+                        rnodes.remove(rnode)
+                        break
+
+                    if match > max_match:
+                        match_node = rnode
+                        max_match = match
+                else:
+                    unmatched_lnodes.append((lnode, match_node, max_match))
+                    # The best candidate is retried in the second pass below.
+
+            lnodes = []
+            for lnode, rnode, max_match in unmatched_lnodes:
+                if max_match >= self.F and rnode in rnodes:
+                    self.append_match(lnode, rnode, max_match)
+                else:
+                    lnodes.append(lnode)
 
         for lnode in lnodes:
             max_match = 0
diff --git a/xmldiff/main.py b/xmldiff/main.py
index 408c957..e5cb766 100644
--- a/xmldiff/main.py
+++ b/xmldiff/main.py
@@ -115,9 +115,15 @@ def make_diff_parser():
         choices={"accurate", "fast", "faster"},
         help="Choose the node comparison optimization.",
     )
-    parser.add_argument(
+    match_group = parser.add_mutually_exclusive_group()
+    match_group.add_argument(
         "--fast-match", action="store_true", help="A faster, less optimal match run."
     )
+    match_group.add_argument(
+        "--best-match",
+        action="store_true",
+        help="A slower, two-stage match run that may result in smaller diffs. (Experimental)",
+    )
     parser.add_argument(
         "--ignored-attributes",
         type=str,
@@ -161,8 +167,10 @@ def diff_command(args=None):
         "ratio_mode": args.ratio_mode,
         "F": args.F,
         "fast_match": args.fast_match,
+        "best_match": args.best_match,
         "uniqueattrs": _parse_uniqueattrs(args.unique_attributes),
     }
+
     result = diff_files(
         args.file1, args.file2, diff_options=diff_options, formatter=formatter
     )