Skip to content

Commit

Permalink
Added an experimental --best-match method (#105)
Browse files Browse the repository at this point in the history
* Added an experimental --best-match method

* Preparing for beta release

* Preparing release 2.6b1

* Back to development: 2.6
  • Loading branch information
regebro authored Apr 3, 2023
1 parent 27dc2be commit f59fec8
Show file tree
Hide file tree
Showing 5 changed files with 108 additions and 10 deletions.
9 changes: 9 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,15 @@ Changes
- Nothing changed yet.


2.6b1 (2023-01-12)
------------------

- Added an experimental --best-match method that is slower, but generates
smaller diffs when you have many nodes that are similar.

- The -F argument now also affects the --fast-match stage.


2.5 (2023-01-11)
----------------

Expand Down
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ version = 2.6.dev0
description = Creates diffs of XML files
long_description = file: README.rst, CHANGES.rst
classifiers =
Development Status :: 5 - Production/Stable
Development Status :: 4 - Beta
Intended Audience :: Developers
Intended Audience :: End Users/Desktop
Topic :: Text Processing :: Markup :: XML
Expand Down
60 changes: 55 additions & 5 deletions tests/test_diff.py
Original file line number Diff line number Diff line change
Expand Up @@ -821,11 +821,11 @@ def test_entirely_different(self):
)


class FastMatchTests(unittest.TestCase):
def _match(self, left, right, fast_match):
class BestFastMatchTests(unittest.TestCase):
def _match(self, left, right, fast_match=False, best_match=False):
left_tree = etree.fromstring(left)
right_tree = etree.fromstring(right)
differ = Differ(fast_match=fast_match)
differ = Differ(fast_match=fast_match, best_match=best_match)
differ.set_trees(left_tree, right_tree)
matches = differ.match()
lpath = differ.left.getroottree().getpath
Expand Down Expand Up @@ -860,9 +860,11 @@ def test_move_paragraph(self):
"""
# Same matches as the non-fast match test, but the matches are
# in a different order.
slow_result = sorted(self._match(left, right, False))
fast_result = sorted(self._match(left, right, True))
slow_result = sorted(self._match(left, right))
fast_result = sorted(self._match(left, right, fast_match=True))
best_result = sorted(self._match(left, right, best_match=True))
self.assertEqual(slow_result, fast_result)
self.assertEqual(slow_result, best_result)

def test_move_children(self):
# Here the paragraphs are all so similar that each paragraph
Expand Down Expand Up @@ -917,6 +919,54 @@ def test_move_children(self):
],
)

# The best match should be as good as the slow match (though it is even slower)
best_result = sorted(self._match(left, right, best_match=True))
self.assertEqual(best_result, slow_result)

def test_delete_node(self):
# If you have a list of similar nodes, and delete one, that
# confuses both the standard and the fast algorithm:
left = """<root>
<node id="1"/>
<node id="2"/>
<node id="3"/>
<node id="4"/>
<node id="5"/>
</root>
"""
right = """<root>
<node id="1"/>
<node id="2"/>
<node id="4"/>
<node id="5"/>
</root>
"""

slow_result = sorted(self._match(left, right))
fast_result = sorted(self._match(left, right, fast_match=True))
best_result = sorted(self._match(left, right, best_match=True))
self.assertEqual(
slow_result,
[
("/root", "/root"),
("/root/node[1]", "/root/node[1]"),
("/root/node[2]", "/root/node[2]"),
("/root/node[3]", "/root/node[3]"),
("/root/node[4]", "/root/node[4]"),
],
)
self.assertEqual(fast_result, slow_result)
self.assertEqual(
best_result,
[
("/root", "/root"),
("/root/node[1]", "/root/node[1]"),
("/root/node[2]", "/root/node[2]"),
("/root/node[4]", "/root/node[3]"),
("/root/node[5]", "/root/node[4]"),
],
)


class UpdateNodeTests(unittest.TestCase):
"""Testing only the update phase of the diffing"""
Expand Down
37 changes: 34 additions & 3 deletions xmldiff/diff.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ def __init__(
uniqueattrs=None,
ratio_mode="fast",
fast_match=False,
best_match=False,
ignored_attrs=[],
):
# The minimum similarity between two nodes to consider them equal
Expand All @@ -25,6 +26,7 @@ def __init__(
uniqueattrs = ["{http://www.w3.org/XML/1998/namespace}id"]
self.uniqueattrs = uniqueattrs
self.fast_match = fast_match
self.best_match = best_match

# Avoid recreating this for every node
self._sequencematcher = SequenceMatcher()
Expand Down Expand Up @@ -120,7 +122,7 @@ def match(self, left=None, right=None):
# First find matches with longest_common_subsequence:
matches = list(
utils.longest_common_subsequence(
lnodes, rnodes, lambda x, y: self.node_ratio(x, y) >= 0.5
lnodes, rnodes, lambda x, y: self.node_ratio(x, y) >= self.F
)
)

Expand All @@ -130,8 +132,37 @@ def match(self, left=None, right=None):

# Then remove the nodes (needs to be done backwards):
for left_match, right_match in reversed(matches):
lnode = lnodes.pop(left_match)
rnode = rnodes.pop(right_match)
lnodes.pop(left_match)
rnodes.pop(right_match)

elif self.best_match:
unmatched_lnodes = []

# First find all nodes that match perfectly
for lnode in lnodes:
max_match = 0
match_node = None

for rnode in rnodes:
match = self.node_ratio(lnode, rnode)
if match == 1.0:
self.append_match(lnode, rnode, 1.0)
rnodes.remove(rnode)
break

if match > max_match:
match_node = rnode
max_match = match
else:
unmatched_lnodes.append((lnode, match_node, max_match))
# unmatched_lnodes.append(lnode)

lnodes = []
for lnode, rnode, max_match in unmatched_lnodes:
if max_match >= self.F and rnode in rnodes:
self.append_match(lnode, rnode, max_match)
else:
lnodes.append(lnode)

for lnode in lnodes:
max_match = 0
Expand Down
10 changes: 9 additions & 1 deletion xmldiff/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,9 +115,15 @@ def make_diff_parser():
choices={"accurate", "fast", "faster"},
help="Choose the node comparison optimization.",
)
parser.add_argument(
match_group = parser.add_mutually_exclusive_group()
match_group.add_argument(
"--fast-match", action="store_true", help="A faster, less optimal match run."
)
match_group.add_argument(
"--best-match",
action="store_true",
help="A slower, two-stage match run that may result in smaller diffs. (Experimental)",
)
parser.add_argument(
"--ignored-attributes",
type=str,
Expand Down Expand Up @@ -161,8 +167,10 @@ def diff_command(args=None):
"ratio_mode": args.ratio_mode,
"F": args.F,
"fast_match": args.fast_match,
"best_match": args.best_match,
"uniqueattrs": _parse_uniqueattrs(args.unique_attributes),
}

result = diff_files(
args.file1, args.file2, diff_options=diff_options, formatter=formatter
)
Expand Down

0 comments on commit f59fec8

Please sign in to comment.