From 422528ba57340d25c00ee8f90a39ecfda98106e4 Mon Sep 17 00:00:00 2001 From: Lennart Regebro Date: Mon, 3 Apr 2023 18:30:41 +0200 Subject: [PATCH] Better namespace handling (#107) Adds support for adding and removing namespaces. Changing URIs does not work, and will likely require big changes, because of lxml's namespace handling. --- CHANGES.rst | 8 +++-- docs/source/advanced.rst | 14 ++++++--- docs/source/api.rst | 36 +++++++++++++++++++++ docs/source/conf.py | 4 +-- tests/test_data/all_actions.expected.xml | 6 ++-- tests/test_data/all_actions.left.xml | 4 ++- tests/test_data/all_actions.right.xml | 6 ++-- tests/test_formatting.py | 6 ++-- xmldiff/actions.py | 3 ++ xmldiff/diff.py | 21 +++++++++++++ xmldiff/formatting.py | 40 +++++++++++++++++++++--- xmldiff/patch.py | 40 ++++++++++++++++++------ 12 files changed, 158 insertions(+), 30 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 58227fe..af7acd0 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -4,12 +4,16 @@ Changes 2.6 (unreleased) ---------------- -- Nothing changed yet. - +- Added `InsertNamespace` and `DeleteNamespace` actions for better handling + of changing namespaces. Should improve any "Unknown namespace prefix" + errors. Changing the URI of a namespace prefix is not supported, and will + raise an error. 2.6b1 (2023-01-12) ------------------ +- Used geometric mean for the node_ratio, for better handling of simple nodes. + - Added an experimental --best-match method that is slower, but generate smaller diffs when you have many nodes that are similar. diff --git a/docs/source/advanced.rst b/docs/source/advanced.rst index 301059b..d9223ba 100644 --- a/docs/source/advanced.rst +++ b/docs/source/advanced.rst @@ -43,11 +43,13 @@ especially in the case where formatting is added: :options: -ELLIPSIS, +NORMALIZE_WHITESPACE >>> left = '

My Fine Content

' - >>> right = '

My Fine Content

' + >>> right = '

My Fine Content

' >>> result = main.diff_texts(left, right, formatter=formatter) >>> print(result) -

My Fine Content

+

+ My Fine Content +

My Fine Content

@@ -66,7 +68,9 @@ The XMLFormatter supports a better handling of text with the ``text_tags`` and ` >>> result = main.diff_texts(left, right, formatter=formatter) >>> print(result) -

My Fine Content

+

+ My Fine Content +

This gives a result that flags the ```` tag as new formatting. @@ -134,7 +138,9 @@ Now use that formatter in the diffing: >>> result = main.diff_texts(left, right, formatter=formatter) >>> print(result) -

My Fine Content

+

+ My Fine Content +

You can then add into your CSS files classes that make inserted text green, diff --git a/docs/source/api.rst b/docs/source/api.rst index f38538a..1af252a 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -448,6 +448,42 @@ Example: [InsertComment(target='/document[1]', position=0, text=' A comment ')] +``InsertNamespace(prefix, uri)`` +................................ + +Adds a new namespace to the XML document. You need to have this before +adding a node that uses a namespace that is not in the original XML tree. + +Example: + +.. doctest:: + :options: -ELLIPSIS, +NORMALIZE_WHITESPACE + + >>> left = '' + >>> right = '' + >>> main.diff_texts(left, right) + [InsertNamespace(prefix='new', uri='http://theuri')] + + +``DeleteNamespace(prefix)`` +................................ + +Removes a namespace from the XML document. Strictly speaking, you don't need +to handle this, since nothing will break if there is an unused namespace, +but `xmldiff` will return this action. + +Example: + +.. doctest:: + :options: -ELLIPSIS, +NORMALIZE_WHITESPACE + + >>> left = '' + >>> right = '' + >>> main.diff_texts(left, right) + [DeleteNamespace(prefix='new')] + + + The patching API ---------------- diff --git a/docs/source/conf.py b/docs/source/conf.py index da38069..b28292a 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -30,7 +30,7 @@ extensions = [ "sphinx.ext.doctest", "sphinx.ext.coverage", - "sphinxarg.ext", + # "sphinxarg.ext", ] # Add any paths that contain templates here, relative to this directory. @@ -66,7 +66,7 @@ # # This is also used if you do content translation via gettext catalogs. # Usually you set "language" from the command-line for these cases. 
-language = None +language = "en" # There are two options for replacing |today|: either, you set today to some # non-false value, then it is used: diff --git a/tests/test_data/all_actions.expected.xml b/tests/test_data/all_actions.expected.xml index 94961e5..0b4dc41 100644 --- a/tests/test_data/all_actions.expected.xml +++ b/tests/test_data/all_actions.expected.xml @@ -1,4 +1,4 @@ - + A bit of contained textModifiedThis is outside a tagNew tail content Here we have some text. @@ -8,7 +8,7 @@ Here we have some text. - + My last tag - + diff --git a/tests/test_data/all_actions.left.xml b/tests/test_data/all_actions.left.xml index 36b8ab8..9061ffe 100644 --- a/tests/test_data/all_actions.left.xml +++ b/tests/test_data/all_actions.left.xml @@ -1,4 +1,5 @@ - + A bit of contained text @@ -12,5 +13,6 @@ My last tag + diff --git a/tests/test_data/all_actions.right.xml b/tests/test_data/all_actions.right.xml index 406d3d5..ce9427e 100644 --- a/tests/test_data/all_actions.right.xml +++ b/tests/test_data/all_actions.right.xml @@ -1,4 +1,5 @@ - + Modified @@ -10,5 +11,6 @@ Here we have some text. 
- + + diff --git a/tests/test_formatting.py b/tests/test_formatting.py index d4ac0e3..76f0830 100644 --- a/tests/test_formatting.py +++ b/tests/test_formatting.py @@ -488,6 +488,8 @@ def test_all_actions(self): formatter = formatting.XmlDiffFormatter() result = main.diff_files(lfile, rfile, formatter=formatter) expected = ( + "[insert-namespace, space, http://namespaces.shoobx.com/outerspace]\n" + "[delete-namespace, name]\n" "[move-after, /document/node[2], /document/tag[1]]\n" "[insert-comment, /document[1], 0, Insert a new comment ]\n" '[update, /document/node[1]/@name, "was updated"]\n' @@ -505,8 +507,8 @@ def test_all_actions(self): '[update, /document/node[1]/text()[2], "\\n ' 'New tail content\\n "]\n' "[rename, /document/node[2], nod]\n" - "[insert-after, /document/tail[1], \n" - "]\n" + "[rename, /document/name:space[1], {http://namespaces.shoobx.com/outerspace}name]\n" + '[update, /document/space:name[1]/text()[2], "\\n "]\n' "[remove, /document/tail[1]]" ) self.assertEqual(result, expected) diff --git a/xmldiff/actions.py b/xmldiff/actions.py index 926539c..7a6fd3a 100644 --- a/xmldiff/actions.py +++ b/xmldiff/actions.py @@ -15,3 +15,6 @@ RenameAttrib = namedtuple("RenameAttrib", "node oldname newname") InsertComment = namedtuple("InsertComment", "target position text") + +InsertNamespace = namedtuple("InsertNamespace", "prefix uri") +DeleteNamespace = namedtuple("DeleteNamespace", "prefix") diff --git a/xmldiff/diff.py b/xmldiff/diff.py index 004ce5c..99594a9 100644 --- a/xmldiff/diff.py +++ b/xmldiff/diff.py @@ -426,6 +426,27 @@ def diff(self, left=None, right=None): if not self._matches: self.match(left, right) + # First, deal with namespaces: + rnsmap = self.right.nsmap + lnsmap = self.left.nsmap + for k, v in rnsmap.items(): + # Make sure it's registered: + if k is not None: + etree.register_namespace(k, v) + if k not in lnsmap: + yield actions.InsertNamespace(k, v) + elif lnsmap[k] != v: + raise RuntimeError( + "Sorry, we do not support changing 
the URI of namespaces in xmldiff" + ) + + for k, v in lnsmap.items(): + # Make sure it's registered: + if k is not None: + etree.register_namespace(k, v) + if k not in rnsmap: + yield actions.DeleteNamespace(k) + # The paper talks about the five phases, and then does four of them # in one phase, in a different order that described. This # implementation in turn differs in order yet again. diff --git a/xmldiff/formatting.py b/xmldiff/formatting.py index 9cdc967..06086fd 100644 --- a/xmldiff/formatting.py +++ b/xmldiff/formatting.py @@ -335,6 +335,7 @@ def format(self, diff, orig_tree, differ=None): else: root = result + self._nsmap = [(DIFF_PREFIX, DIFF_NS)] etree.register_namespace(DIFF_PREFIX, DIFF_NS) for action in diff: @@ -342,7 +343,7 @@ def format(self, diff, orig_tree, differ=None): self.finalize(root) - etree.cleanup_namespaces(result, top_nsmap={DIFF_PREFIX: DIFF_NS}) + etree.cleanup_namespaces(result, top_nsmap=dict(self._nsmap)) return self.render(result) def render(self, result): @@ -369,6 +370,11 @@ def _xpath(self, node, xpath): # one and exactly one element is found. This is to protect against # formatting a diff on the wrong tree, or against using ambiguous # edit script xpaths. 
+ + # First, make a namespace map that uses the left tree's URI's: + nsmap = dict(self._nsmap) + nsmap.update(node.nsmap) + if xpath[0] == "/": root = True xpath = xpath[1:] @@ -393,11 +399,10 @@ def _xpath(self, node, xpath): path = "/" + path matches = [] - for match in node.xpath(path, namespaces=node.nsmap): + for match in node.xpath(path, namespaces=nsmap): # Skip nodes that have been deleted if DELETE_NAME not in match.attrib: matches.append(match) - if index >= len(matches): raise ValueError( "xpath {}[{}] not found at {}.".format( @@ -632,6 +637,14 @@ def _handle_UpdateTextAfter(self, action, tree): return node + def _handle_InsertNamespace(self, action, tree): + # There is no way to mark this so it's visible, so we'll just update the tree + self._nsmap.append((action.prefix, action.uri)) + + def _handle_DeleteNamespace(self, action, tree): + # This will be handled by the namespace cleanup + pass + # There is no InsertComment handler, as this formatter removes all comments @@ -702,6 +715,19 @@ def _handle_InsertComment(self, action): json.dumps(action.text), ) + def _handle_InsertNamespace(self, action): + return ( + "insert-namespace", + action.prefix, + action.uri, + ) + + def _handle_DeleteNamespace(self, action): + return ( + "delete-namespace", + action.prefix, + ) + class XmlDiffFormatter(BaseFormatter): """A formatter for an output trying to be xmldiff 0.6 compatible""" @@ -792,4 +818,10 @@ def _handle_RenameNode(self, action, orig_tree): yield "rename", action.node, action.tag def _handle_InsertComment(self, action, orig_tree): - yield ("insert-comment", action.target, str(action.position), action.text) + yield "insert-comment", action.target, str(action.position), action.text + + def _handle_InsertNamespace(self, action, orig_tree): + yield "insert-namespace", action.prefix, action.uri + + def _handle_DeleteNamespace(self, action, orig_tree): + yield "delete-namespace", action.prefix diff --git a/xmldiff/patch.py b/xmldiff/patch.py index 
8c6d128..d052db3 100644 --- a/xmldiff/patch.py +++ b/xmldiff/patch.py @@ -6,10 +6,17 @@ class Patcher: + @property + def nsmap(self): + return getattr(self, "_nsmap", {}) + def patch(self, actions, tree): if isinstance(tree, etree._ElementTree): tree = tree.getroot() + # Save the namespace: + self._nsmap = tree.nsmap + # Copy the tree so we don't modify the original result = deepcopy(tree) @@ -24,46 +31,46 @@ def handle_action(self, action, tree): method(action, tree) def _handle_DeleteNode(self, action, tree): - node = tree.xpath(action.node, namespaces=tree.nsmap)[0] + node = tree.xpath(action.node, namespaces=self.nsmap)[0] node.getparent().remove(node) def _handle_InsertNode(self, action, tree): - target = tree.xpath(action.target, namespaces=tree.nsmap)[0] + target = tree.xpath(action.target, namespaces=self.nsmap)[0] node = target.makeelement(action.tag) target.insert(action.position, node) def _handle_RenameNode(self, action, tree): - tree.xpath(action.node, namespaces=tree.nsmap)[0].tag = action.tag + tree.xpath(action.node, namespaces=self.nsmap)[0].tag = action.tag def _handle_MoveNode(self, action, tree): - node = tree.xpath(action.node, namespaces=tree.nsmap)[0] + node = tree.xpath(action.node, namespaces=self.nsmap)[0] node.getparent().remove(node) target = tree.xpath(action.target)[0] target.insert(action.position, node) def _handle_UpdateTextIn(self, action, tree): - tree.xpath(action.node, namespaces=tree.nsmap)[0].text = action.text + tree.xpath(action.node, namespaces=self.nsmap)[0].text = action.text def _handle_UpdateTextAfter(self, action, tree): - tree.xpath(action.node, namespaces=tree.nsmap)[0].tail = action.text + tree.xpath(action.node, namespaces=self.nsmap)[0].tail = action.text def _handle_UpdateAttrib(self, action, tree): - node = tree.xpath(action.node, namespaces=tree.nsmap)[0] + node = tree.xpath(action.node, namespaces=self.nsmap)[0] # This should not be used to insert new attributes. 
assert action.name in node.attrib node.attrib[action.name] = action.value def _handle_DeleteAttrib(self, action, tree): - del tree.xpath(action.node, namespaces=tree.nsmap)[0].attrib[action.name] + del tree.xpath(action.node, namespaces=self.nsmap)[0].attrib[action.name] def _handle_InsertAttrib(self, action, tree): - node = tree.xpath(action.node, namespaces=tree.nsmap)[0] + node = tree.xpath(action.node, namespaces=self.nsmap)[0] # This should not be used to update existing attributes. assert action.name not in node.attrib node.attrib[action.name] = action.value def _handle_RenameAttrib(self, action, tree): - node = tree.xpath(action.node, namespaces=tree.nsmap)[0] + node = tree.xpath(action.node, namespaces=self.nsmap)[0] assert action.oldname in node.attrib assert action.newname not in node.attrib node.attrib[action.newname] = node.attrib[action.oldname] @@ -73,6 +80,13 @@ def _handle_InsertComment(self, action, tree): target = tree.xpath(action.target)[0] target.insert(action.position, etree.Comment(action.text)) + def _handle_InsertNamespace(self, action, tree): + self.nsmap[action.prefix] = action.uri + + def _handle_DeleteNamespace(self, action, tree): + # Nothing needs to be done, it will be handled by cleanup + pass + class DiffParser: """Makes a text diff into a list of actions""" @@ -142,3 +156,9 @@ def _handle_rename_attribute(self, node, oldname, newname): def _handle_insert_comment(self, target, position, text): return actions.InsertComment(target, int(position), loads(text)) + + def _handle_insert_namespace(self, prefix, uri): + return actions.InsertNamespace(prefix, uri) + + def _handle_delete_namespace(self, prefix): + return actions.DeleteNamespace(prefix)