From e9084c0e895f671466771b21b69977bb808e682c Mon Sep 17 00:00:00 2001 From: Lennart Regebro Date: Thu, 6 Sep 2018 14:40:38 +0200 Subject: [PATCH] Moving to xmldiff 2.0 --- .coveragerc | 3 +- .gitignore | 25 +- .travis.yml | 31 +- AUTHORS.rst | 4 - CHANGES.rst | 248 +-- LICENSE.txt | 184 +- MANIFEST.in | 17 +- Makefile | 17 + README.rst | 87 +- README.txt | 3 + TODO.rst | 34 - TODO.txt | 39 + doc/API.txt | 104 - doc/HELP.txt | 213 -- doc/makefile | 16 - docs/Makefile | 216 ++ docs/make.bat | 263 +++ docs/source/api.rst | 307 +++ docs/source/commandline.rst | 114 + docs/source/conf.py | 287 +++ docs/source/contributing.rst | 120 ++ docs/source/index.rst | 30 + docs/source/installation.rst | 20 + extensions/maplookup.c | 413 ---- setup.cfg | 2 + setup.py | 129 +- src/xmldiff/__init__.py | 17 - src/xmldiff/difflib.py | 162 -- src/xmldiff/fmes.py | 439 ---- src/xmldiff/format.py | 95 - src/xmldiff/input.py | 63 - src/xmldiff/main.py | 153 -- src/xmldiff/misc.py | 161 -- src/xmldiff/objects.py | 287 --- src/xmldiff/parser.py | 225 -- tests/README | 15 - tests/__init__.py | 2 +- tests/conftest.py | 43 - tests/data/broken/broken.xml | 12 - tests/data/dir1/changing.xml | 5 - tests/data/dir1/dir_inboth/changing.xml | 5 - tests/data/dir1/dir_inboth/inbothdir.xml | 1 - tests/data/dir1/dir_inboth/onlyindir1.xml | 1 - tests/data/dir1/inbothdir.xml | 1 - tests/data/dir1/onlyindir1.xml | 1 - tests/data/dir2/changing.xml | 2 - tests/data/dir2/dir_inboth/changing.xml | 2 - tests/data/dir2/dir_inboth/inbothdir.xml | 1 - tests/data/dir2/dir_inboth/onlyindir2.xml | 1 - tests/data/dir2/dir_only2/.empty | 0 tests/data/dir2/inbothdir.xml | 1 - tests/data/dir2/onlyindir2.xml | 1 - tests/data/parse/1.xml | 39 - tests/data/parse/default_ns.xml | 10 - tests/data/parse/html.html | 10 - tests/data/parse/iso.xml | 6 - tests/data/parse/simple_ns.xml | 10 - tests/data/parse/tal_ns.xml | 10 - tests/data/parse/utf16.xml | Bin 202 -> 0 bytes tests/data/parse/utf8.xml | 6 - tests/data/test00_1.xml | 12 - tests/data/test00_2.xml | 17 - tests/data/test00_result | 23 - tests/data/test01_1.xml | 5 - tests/data/test01_2.xml | 2 - tests/data/test01_result | 4 - tests/data/test02_1.xml | 1 - tests/data/test02_2.xml | 2 - tests/data/test02_result | 8 - tests/data/test03_1.xml | 1 - tests/data/test03_2.xml | 3 - tests/data/test03_result | 13 - tests/data/test04_1.xml | 15 - tests/data/test04_2.xml | 24 - tests/data/test04_result | 43 - tests/data/test05_1.xml | 28 - tests/data/test05_2.xml | 29 - tests/data/test05_result | 15 - tests/data/test06_1.xml | 2 - tests/data/test06_2.xml | 1 - tests/data/test06_result | 6 - tests/data/test07_1.xml | 1 - tests/data/test07_2.xml | 1 - tests/data/test07_result | 2 - tests/data/test08_1.xml | 39 - tests/data/test08_2.xml | 39 - tests/data/test08_result | 66 - tests/data/test09_ns_1.xml | 12 - tests/data/test09_ns_2.xml | 12 - tests/data/test09_ns_result | 24 - tests/data/test10_ns_1.xml | 39 - tests/data/test10_ns_2.xml | 39 - tests/data/test10_ns_result | 68 - tests/data/test11_ns_1.xml | 6 - tests/data/test11_ns_2.xml | 6 - tests/data/test11_ns_result | 1 - .../complex-text-update.expected.rml | 5 + tests/test_data/complex-text-update.left.rml | 12 + tests/test_data/complex-text-update.right.rml | 12 + tests/test_data/insert-node.expected.rml | 7 + tests/test_data/insert-node.left.rml | 4 + tests/test_data/insert-node.right.rml | 7 + .../no-text-substitutions.expected.rml | 7 + .../test_data/no-text-substitutions.left.rml | 4 + .../test_data/no-text-substitutions.right.rml | 5 + 
tests/test_data/rmldoc.expected.rml | 300 +++ tests/test_data/rmldoc.expected.xml | 498 +++++ tests/test_data/rmldoc.left.rml | 508 +++++ tests/test_data/rmldoc.left.xml | 508 +++++ tests/test_data/rmldoc.right.rml | 519 +++++ tests/test_data/rmldoc.right.xml | 519 +++++ tests/test_diff.py | 1067 +++++++++ tests/test_difflib.py | 127 -- tests/test_formatting.py | 375 ++++ tests/test_main.py | 130 ++ tests/test_parser.py | 325 --- tests/test_performance.py | 33 - tests/test_regrtest.py | 164 -- tests/test_utils.py | 113 + tests/testing.py | 37 + tox.ini | 15 - .../dir_only1/.empty => xmldiff/__init__.py | 0 xmldiff/_diff_match_patch_py2.py | 1919 +++++++++++++++++ xmldiff/_diff_match_patch_py3.py | 1907 ++++++++++++++++ xmldiff/diff.py | 399 ++++ xmldiff/diff_match_patch.py | 5 + xmldiff/formatting.py | 621 ++++++ xmldiff/main.py | 85 + xmldiff/utils.py | 121 ++ 129 files changed, 11271 insertions(+), 4409 deletions(-) delete mode 100644 AUTHORS.rst create mode 100644 Makefile create mode 100644 README.txt delete mode 100644 TODO.rst create mode 100644 TODO.txt delete mode 100644 doc/API.txt delete mode 100644 doc/HELP.txt delete mode 100644 doc/makefile create mode 100644 docs/Makefile create mode 100644 docs/make.bat create mode 100644 docs/source/api.rst create mode 100644 docs/source/commandline.rst create mode 100644 docs/source/conf.py create mode 100644 docs/source/contributing.rst create mode 100644 docs/source/index.rst create mode 100644 docs/source/installation.rst delete mode 100644 extensions/maplookup.c create mode 100644 setup.cfg delete mode 100644 src/xmldiff/__init__.py delete mode 100644 src/xmldiff/difflib.py delete mode 100644 src/xmldiff/fmes.py delete mode 100644 src/xmldiff/format.py delete mode 100644 src/xmldiff/input.py delete mode 100644 src/xmldiff/main.py delete mode 100644 src/xmldiff/misc.py delete mode 100644 src/xmldiff/objects.py delete mode 100644 src/xmldiff/parser.py delete mode 100644 tests/README delete mode 100644 tests/conftest.py delete mode 100644 tests/data/broken/broken.xml delete mode 100644 tests/data/dir1/changing.xml delete mode 100644 tests/data/dir1/dir_inboth/changing.xml delete mode 100644 tests/data/dir1/dir_inboth/inbothdir.xml delete mode 100644 tests/data/dir1/dir_inboth/onlyindir1.xml delete mode 100644 tests/data/dir1/inbothdir.xml delete mode 100644 tests/data/dir1/onlyindir1.xml delete mode 100644 tests/data/dir2/changing.xml delete mode 100644 tests/data/dir2/dir_inboth/changing.xml delete mode 100644 tests/data/dir2/dir_inboth/inbothdir.xml delete mode 100644 tests/data/dir2/dir_inboth/onlyindir2.xml delete mode 100644 tests/data/dir2/dir_only2/.empty delete mode 100644 tests/data/dir2/inbothdir.xml delete mode 100644 tests/data/dir2/onlyindir2.xml delete mode 100644 tests/data/parse/1.xml delete mode 100644 tests/data/parse/default_ns.xml delete mode 100644 tests/data/parse/html.html delete mode 100644 tests/data/parse/iso.xml delete mode 100644 tests/data/parse/simple_ns.xml delete mode 100644 tests/data/parse/tal_ns.xml delete mode 100644 tests/data/parse/utf16.xml delete mode 100644 tests/data/parse/utf8.xml delete mode 100644 tests/data/test00_1.xml delete mode 100644 tests/data/test00_2.xml delete mode 100644 tests/data/test00_result delete mode 100644 tests/data/test01_1.xml delete mode 100644 tests/data/test01_2.xml delete mode 100644 tests/data/test01_result delete mode 100644 tests/data/test02_1.xml delete mode 100644 tests/data/test02_2.xml delete mode 100644 tests/data/test02_result delete mode 100644 
tests/data/test03_1.xml delete mode 100644 tests/data/test03_2.xml delete mode 100644 tests/data/test03_result delete mode 100644 tests/data/test04_1.xml delete mode 100644 tests/data/test04_2.xml delete mode 100644 tests/data/test04_result delete mode 100644 tests/data/test05_1.xml delete mode 100644 tests/data/test05_2.xml delete mode 100644 tests/data/test05_result delete mode 100644 tests/data/test06_1.xml delete mode 100644 tests/data/test06_2.xml delete mode 100644 tests/data/test06_result delete mode 100644 tests/data/test07_1.xml delete mode 100644 tests/data/test07_2.xml delete mode 100644 tests/data/test07_result delete mode 100644 tests/data/test08_1.xml delete mode 100644 tests/data/test08_2.xml delete mode 100644 tests/data/test08_result delete mode 100644 tests/data/test09_ns_1.xml delete mode 100644 tests/data/test09_ns_2.xml delete mode 100644 tests/data/test09_ns_result delete mode 100644 tests/data/test10_ns_1.xml delete mode 100644 tests/data/test10_ns_2.xml delete mode 100644 tests/data/test10_ns_result delete mode 100644 tests/data/test11_ns_1.xml delete mode 100644 tests/data/test11_ns_2.xml delete mode 100644 tests/data/test11_ns_result create mode 100644 tests/test_data/complex-text-update.expected.rml create mode 100644 tests/test_data/complex-text-update.left.rml create mode 100644 tests/test_data/complex-text-update.right.rml create mode 100644 tests/test_data/insert-node.expected.rml create mode 100644 tests/test_data/insert-node.left.rml create mode 100644 tests/test_data/insert-node.right.rml create mode 100644 tests/test_data/no-text-substitutions.expected.rml create mode 100644 tests/test_data/no-text-substitutions.left.rml create mode 100644 tests/test_data/no-text-substitutions.right.rml create mode 100644 tests/test_data/rmldoc.expected.rml create mode 100644 tests/test_data/rmldoc.expected.xml create mode 100644 tests/test_data/rmldoc.left.rml create mode 100644 tests/test_data/rmldoc.left.xml create mode 100644 tests/test_data/rmldoc.right.rml create mode 100644 tests/test_data/rmldoc.right.xml create mode 100644 tests/test_diff.py delete mode 100644 tests/test_difflib.py create mode 100644 tests/test_formatting.py create mode 100644 tests/test_main.py delete mode 100644 tests/test_parser.py delete mode 100644 tests/test_performance.py delete mode 100644 tests/test_regrtest.py create mode 100644 tests/test_utils.py create mode 100644 tests/testing.py delete mode 100644 tox.ini rename tests/data/dir1/dir_only1/.empty => xmldiff/__init__.py (100%) create mode 100644 xmldiff/_diff_match_patch_py2.py create mode 100644 xmldiff/_diff_match_patch_py3.py create mode 100644 xmldiff/diff.py create mode 100644 xmldiff/diff_match_patch.py create mode 100644 xmldiff/formatting.py create mode 100644 xmldiff/main.py create mode 100644 xmldiff/utils.py diff --git a/.coveragerc b/.coveragerc index 086990b..fd0079b 100644 --- a/.coveragerc +++ b/.coveragerc @@ -1,3 +1,4 @@ [run] source = xmldiff -omit = */test* +omit = tests* + xmldiff/*diff_match_patch*.py diff --git a/.gitignore b/.gitignore index c4b3710..9c28604 100644 --- a/.gitignore +++ b/.gitignore @@ -1,23 +1,10 @@ +*.pyc +__pycache__ +*.bak +*.egg-info +*.wp? .eggs -.pytest_cache/ -.cache .tox -.venv/ - -tags -TAGS -ID -bin -eggs -parts -tree -develop-eggs -src/xmldiff.egg-info/ -.installed.cfg -tmp -*.so -*.py? 
.coverage htmlcov -coverage -coverage.xml +docs/build diff --git a/.travis.yml b/.travis.yml index 754a074..f65d710 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,17 +1,28 @@ language: python -sudo: false + +# Travis doesn't support 3.7 by default, but if we run it on xenial it works matrix: - include: - - python: 2.7 - env: TOXENV=py27 - - python: 3.6 - env: TOXENV=py36 + fast_finish: true + include: + - python: 2.7 + - python: 3.5 + - python: 3.6 + - python: 3.7 + sudo: required + dist: xenial + - python: pypy + - python: pypy3 + install: - - pip install tox-travis coveralls + - pip install . coverage coveralls flake8 sphinx sphinx-argparse + script: - - tox -notifications: - email: false + - make flake + - make coverage + - cd docs; make doctest; make html after_success: - coveralls + +cache: + pip: true diff --git a/AUTHORS.rst b/AUTHORS.rst deleted file mode 100644 index da4912b..0000000 --- a/AUTHORS.rst +++ /dev/null @@ -1,4 +0,0 @@ -Nicolas Chauvat -Adam Groszer -Stephan Richter -Sylvain Thenault diff --git a/CHANGES.rst b/CHANGES.rst index 0dbd6cc..a7b7146 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,250 +1,30 @@ -CHANGES +Changes ======= -1.1.2 (unreleased) +2.0b2 (unreleased) ------------------ -- When I fixed the xpath namespace handling I also changed the tag names to - an xpath syntax. This was unhelpful, so I changed that back. To solve this - I have had to extend the return format from the parser and ass a N_NSPREFIX - that contains the prefix. This is used by the differ to return correct - xpaths without changing the tags. +- Documentation -- The AbstractFormatter.init() method made exactly zero sense and made it - practically impossible to set the output stream to anything but sys.stdout. - Fixed and deprecated. +- The diff formatter now handles the --keep-whitespace argument +- Added a ``--version`` argument -1.1.1 (2018-06-20) ------------------- - -- When moving attributes the secondary rename action xpath was - '//LogilabXmldiffTmpAttr' which is a tag specification. - Changed this to '//@LogilabXmldiffTmpAttr', so an attribute - is specified. - - -1.1.0 (2018-06-15) ------------------- - -- When using namespaces, the returned xpaths are now in the form ns_prefix:tag - instead of the earlier {ns_uri}tag, which isn't correct xpaths. -1.0.0 (2018-04-13) +2.0b1 (2018-09-03) ------------------ -- pep8 cleanup, added flake8 checking to tox - - -1.0.0a6 (2018-04-12) -------------------- - -- Removed encoding, because python does unicode just fine - -- Switched on namespace handling for XML input - - -1.0.0a5 (2018-04-11) -------------------- - -- Brownbag release to make up for bad previous ones. -1.0.0a2 (2018-04-11) -------------------- +- A complete, bottom-up, pure-Python rewrite -- Temporary disabling of encoding text (hopefully permanent). +- New easy API -- Reverted bug fix: Do not remove newlines from text while parsing - the XML. +- New output formats: + - A list of actions (similar but not compatible with the old format) -1.0.0a1 (2018-04-10) + - XML with changes marked through tags and attributes -------------------- +- Bug: Fix a off-by-one issue with `insert-after` action. - -- Bug: Do not rename children on text node updates. - -- Bug: Text moves were not recorded as part of the fmes edit script. - -- Remove only partially implemented xmlrev script. - -- Removed support for xupdate, which never became a standard. - -- Removed deprecated ezs optional algorithm. - -- Removed support for Debian and RedHat packaging.
- -- Removed Windows support - -- LOTS of package cleanup (setup.py, MANIFEST, proper console script, etc) - -- tests moved to py.test and cleaned, added tox, travis, coverage support - - -0.6.10 (2010-08-27) -------------------- - -- apply Daiki Ueno patch: fails when comparing minimal trees on i386 - - -0.6.9 (2009-04-02) ------------------- - -- Fixed xmldiff-xmlrev compilation error - - -0.6.8 (2006-06-15) ------------------- - -- Fixed 64bit cleanness issues - - -0.6.7 (2005-05-04) ------------------- - -- WARNING: xmldiff is no longer a logilab subpackage. Users may have to - manually remove the old logilab/xmldiff directory. - -- fixed debian bug #275750, also reported by Christopher R Newman on the - xml-projects mailing list - -- fixed --profile option, wrap function from maplookup when profiling so that - they appear in the profile information - -- fixed setup.py to ignore the xmlrev shell script under windows platforms - -- small improvements (remove recursion in object.py, minor enhancement in - mydifflib.py, rewrite of lcs4 in C) - - -0.6.6 (2004-12-23) ------------------- - -- Applied patch by Bastian Kleineidam which - - - corrects the typo in ML_DIR - - - fixes the TMPFILE_XSLT/TMPFILE_XSL typo - - - makes sure the files are XML or SGML files, else prints an error - - - adds various missing quotes around filenames which could have - spaces or begin with a hyphen - - - fixes typos in the usage() function - - Thanks a lot, Bastian. - -- Fixed some problems in the xmlrev.xslt stylesheet - -- Fixed problems in xmlrev caused by the exit status of xmldiff when - successful - -- Added a man page for xmldiff and xmlrev - - -0.6.5 (2004-09-02) ------------------- - -- xmlrev bugfixes - -- Fixed packaging problems (missing xsl stylesheets and MANIFEST file) - - -0.6.4 (2003-10-02) ------------------- - -- fix recursive mode - -- rewrite regression test, add test for the recursive mode - -- add --help option to xlmrev - -- packaging fixes - -- turn API.txt and HELP.txt to correct ReST - - -0.6.3 (2002-11-06) ------------------- - -- fix wrong xpath for attributes - -- fix bug with temporary duplicate attribute node - -- fix for xupdate - -- fix ext_pes option bug - -- update changelog to new format - - -0.6.2 (2002-09-23) ------------------- - -- return number of differences on command line - -- reintroduce misc.list_print which caused recursive mode - to fail - -- use psyco if available (http://psyco.sf.net) - -- little changes in C extension - - -0.6.1 (2002-08-29) ------------------- - -- fix packaging problems - - -0.6.0 (2002-08-23) ------------------- - -- change of the internal representation - -- remove support for the EZS algorithm (no more maintened - for the moment) - -- add command line options to parse html and to control - entities inclusion and output encoding - -- fixing coalescing text nodes bug - -- many other bugs fixes - -- great speed improvement - - -0.5.3 (2002-01-31) ------------------- - -- add __init__.py in "logilab" directory - - -0.5.2 (2001-10-29) ------------------- - -- bug fixes in xupdate formatting and in the dom interface. 
- - -0.5.1 (2001-09-07) ------------------- - -- Fast Match / Edit Scritp algorithm, now fully usable - -- fixes Unicode problem - - -0.2.1 (2001-08-10) ------------------- - -- bug fixes, optimizations for ezs algorithm - - -0.1.1 (2001-08-04) ------------------- + - RML aware XML where tags containing text are semantically diffed, useful + for human output such as converting to HTML or PDF -- original revision +- 100% test coverage diff --git a/LICENSE.txt b/LICENSE.txt index 0a04128..0f39b38 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1,165 +1,19 @@ - GNU LESSER GENERAL PUBLIC LICENSE - Version 3, 29 June 2007 - - Copyright (C) 2007 Free Software Foundation, Inc. - Everyone is permitted to copy and distribute verbatim copies - of this license document, but changing it is not allowed. - - - This version of the GNU Lesser General Public License incorporates -the terms and conditions of version 3 of the GNU General Public -License, supplemented by the additional permissions listed below. - - 0. Additional Definitions. - - As used herein, "this License" refers to version 3 of the GNU Lesser -General Public License, and the "GNU GPL" refers to version 3 of the GNU -General Public License. - - "The Library" refers to a covered work governed by this License, -other than an Application or a Combined Work as defined below. - - An "Application" is any work that makes use of an interface provided -by the Library, but which is not otherwise based on the Library. -Defining a subclass of a class defined by the Library is deemed a mode -of using an interface provided by the Library. - - A "Combined Work" is a work produced by combining or linking an -Application with the Library. The particular version of the Library -with which the Combined Work was made is also called the "Linked -Version". - - The "Minimal Corresponding Source" for a Combined Work means the -Corresponding Source for the Combined Work, excluding any source code -for portions of the Combined Work that, considered in isolation, are -based on the Application, and not on the Linked Version. - - The "Corresponding Application Code" for a Combined Work means the -object code and/or source code for the Application, including any data -and utility programs needed for reproducing the Combined Work from the -Application, but excluding the System Libraries of the Combined Work. - - 1. Exception to Section 3 of the GNU GPL. - - You may convey a covered work under sections 3 and 4 of this License -without being bound by section 3 of the GNU GPL. - - 2. Conveying Modified Versions. - - If you modify a copy of the Library, and, in your modifications, a -facility refers to a function or data to be supplied by an Application -that uses the facility (other than as an argument passed when the -facility is invoked), then you may convey a copy of the modified -version: - - a) under this License, provided that you make a good faith effort to - ensure that, in the event an Application does not supply the - function or data, the facility still operates, and performs - whatever part of its purpose remains meaningful, or - - b) under the GNU GPL, with none of the additional permissions of - this License applicable to that copy. - - 3. Object Code Incorporating Material from Library Header Files. - - The object code form of an Application may incorporate material from -a header file that is part of the Library. 
You may convey such object -code under terms of your choice, provided that, if the incorporated -material is not limited to numerical parameters, data structure -layouts and accessors, or small macros, inline functions and templates -(ten or fewer lines in length), you do both of the following: - - a) Give prominent notice with each copy of the object code that the - Library is used in it and that the Library and its use are - covered by this License. - - b) Accompany the object code with a copy of the GNU GPL and this license - document. - - 4. Combined Works. - - You may convey a Combined Work under terms of your choice that, -taken together, effectively do not restrict modification of the -portions of the Library contained in the Combined Work and reverse -engineering for debugging such modifications, if you also do each of -the following: - - a) Give prominent notice with each copy of the Combined Work that - the Library is used in it and that the Library and its use are - covered by this License. - - b) Accompany the Combined Work with a copy of the GNU GPL and this license - document. - - c) For a Combined Work that displays copyright notices during - execution, include the copyright notice for the Library among - these notices, as well as a reference directing the user to the - copies of the GNU GPL and this license document. - - d) Do one of the following: - - 0) Convey the Minimal Corresponding Source under the terms of this - License, and the Corresponding Application Code in a form - suitable for, and under terms that permit, the user to - recombine or relink the Application with a modified version of - the Linked Version to produce a modified Combined Work, in the - manner specified by section 6 of the GNU GPL for conveying - Corresponding Source. - - 1) Use a suitable shared library mechanism for linking with the - Library. A suitable mechanism is one that (a) uses at run time - a copy of the Library already present on the user's computer - system, and (b) will operate properly with a modified version - of the Library that is interface-compatible with the Linked - Version. - - e) Provide Installation Information, but only if you would otherwise - be required to provide such information under section 6 of the - GNU GPL, and only to the extent that such information is - necessary to install and execute a modified version of the - Combined Work produced by recombining or relinking the - Application with a modified version of the Linked Version. (If - you use option 4d0, the Installation Information must accompany - the Minimal Corresponding Source and Corresponding Application - Code. If you use option 4d1, you must provide the Installation - Information in the manner specified by section 6 of the GNU GPL - for conveying Corresponding Source.) - - 5. Combined Libraries. - - You may place library facilities that are a work based on the -Library side by side in a single library together with other library -facilities that are not Applications and are not covered by this -License, and convey such a combined library under terms of your -choice, if you do both of the following: - - a) Accompany the combined library with a copy of the same work based - on the Library, uncombined with any other library facilities, - conveyed under the terms of this License. - - b) Give prominent notice with the combined library that part of it - is a work based on the Library, and explaining where to find the - accompanying uncombined form of the same work. - - 6. 
Revised Versions of the GNU Lesser General Public License. - - The Free Software Foundation may publish revised and/or new versions -of the GNU Lesser General Public License from time to time. Such new -versions will be similar in spirit to the present version, but may -differ in detail to address new problems or concerns. - - Each version is given a distinguishing version number. If the -Library as you received it specifies that a certain numbered version -of the GNU Lesser General Public License "or any later version" -applies to it, you have the option of following the terms and -conditions either of that published version or of any later version -published by the Free Software Foundation. If the Library as you -received it does not specify a version number of the GNU Lesser -General Public License, you may choose any version of the GNU Lesser -General Public License ever published by the Free Software Foundation. - - If the Library as you received it specifies that a proxy can decide -whether future versions of the GNU Lesser General Public License shall -apply, that proxy's public statement of acceptance of any version is -permanent authorization for you to choose that version for the -Library. +Copyright (c) 2018 Xmldiff Contributors + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/MANIFEST.in b/MANIFEST.in index b571bed..b4d40ec 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,13 +1,8 @@ -recursive-include doc * -recursive-include extensions * -recursive-include src * -recursive-include tests * - include *.rst -include *.py +include *.txt include *.yml -include tox.ini - -exclude .coverage -global-exclude *.pyc -global-exclude *.so +include .coveragerc +include Makefile +recursive-include tests *.py +recursive-include tests *.xml +recursive-include tests *.rml diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..e5dddab --- /dev/null +++ b/Makefile @@ -0,0 +1,17 @@ +root_dir := $(shell dirname $(realpath $(lastword $(MAKEFILE_LIST)))) +dfm_source_2 := "https://raw.githubusercontent.com/google/diff-match-patch/master/python2/diff_match_patch.py" +dfm_source_3 := "https://raw.githubusercontent.com/google/diff-match-patch/master/python3/diff_match_patch.py" + +update-diff-match-patch: + wget $(dfm_source_2) -O $(root_dir)/xmldiff/_diff_match_patch_py2.py + wget $(dfm_source_3) -O $(root_dir)/xmldiff/_diff_match_patch_py3.py + +flake: + flake8 tests xmldiff --exclude *diff_match_patch*.py + +coverage: + coverage run setup.py test + coverage html + +test: + python setup.py test diff --git a/README.rst b/README.rst index 9e9e39c..c7638e1 100644 --- a/README.rst +++ b/README.rst @@ -1,62 +1,73 @@ -======== -XML Diff +xmldiff ======== -.. image:: https://travis-ci.org/Shoobx/xmldiff.png?branch=master - :target: https://travis-ci.org/Shoobx/xmldiff +.. image:: https://travis-ci.org/regebro/xmldiff2.svg?branch=master -.. image:: https://coveralls.io/repos/github/Shoobx/xmldiff/badge.svg?branch=master - :target: https://coveralls.io/github/Shoobx/xmldiff?branch=master +.. image:: https://coveralls.io/repos/github/regebro/xmldiff2/badge.svg -.. image:: https://img.shields.io/pypi/v/xmldiff.svg - :target: https://pypi.python.org/pypi/xmldiff +``xmldiff`` is a library and a command line utility for making diffs out of XML. +This may seem like something that doesn't need a dedicated utility, +but change detection in hierarchical data is very different from change detection in flat data. +XML-type formats are not only used for computer-readable data; +they are also often used as a format for hierarchical data that can be rendered into human-readable formats. +A traditional diff on such a format would tell you the differences line by line, +but this would not be readable by a human. +This library provides tools to make human-readable diffs in those situations. -.. image:: https://img.shields.io/pypi/pyversions/xmldiff.svg - :target: https://pypi.python.org/pypi/xmldiff/ .. image:: https://api.codeclimate.com/v1/badges/b5a94d8f61fdff1e3214/maintainability - :target: https://codeclimate.com/github/Shoobx/xmldiff/maintainability - :alt: Maintainability +Quick usage +----------- -Xmldiff is a utility for extracting differences between two xml files. It -returns a set of primitives to apply on source tree to obtain the destination -tree. +``xmldiff`` is both a command line tool and a Python library. +To use it from the commandline, just run ``xmldiff`` with two input files:: -The implementation is based on `Change detection in hierarchically structured -information`, by S. Chawathe, A. Rajaraman, H. Garcia-Molina and J.
Widom, -Stanford University, 1996 + $ xmldiff file1.xml file2.xml -Installation ------------- +As a library:: + + from lxml import etree + from xmldiff import main, formatting -To install the latest release: + diff = main.diff_files('file1.xml', 'file2.xml', + formatter=formatting.XMLFormatter()) -.. code:: bash + print(diff) - pip install xmldiff +There is also a method ``diff_trees()`` that takes two lxml trees, +and a method ``diff_texts()`` that takes strings containing XML. -To install the development version: +Changes from ``xmldiff`` 1.x +----------------------------- -.. code:: bash + * A complete, ground-up, pure-Python rewrite - git clone https://github.com/Shoobx/xmldiff.git - cd xmldiff - virtualenv ./.venv - ./.venv/bin/python setup.py install + * Easier to maintain, the code is less complex and more Pythonic, + and uses more custom classes instead of just nesting lists and dicts. -Then to compare two given XML files: + * Fixes the problems with certain large files and solves the memory leaks. -.. code:: bash + * A nice, easy-to-use Python API for using it as a library. - ./.venv/bin/xmldiff 1.xml 2.xml + * Adds support for showing the diffs in different formats, + mainly one where differences are marked up in the XML, + useful for making human-readable diffs. + * These formats can show text differences in a semantically meaningful way. -Running tests ------------- + * 2.0 is currently significantly slower than ``xmldiff`` 1.x, + but this may change in the future. + Currently we make no effort to make ``xmldiff`` 2.0 fast; + we concentrate on making it correct and usable. + + +Contributors ------------ -To run the test suite for all python versions: + * Lennart Regebro, lregebro@shoobx.com (main author) -.. code:: bash + * Stephan Richter, srichter@shoobx.com - tox +The diff algorithm is based on "`Change Detection in Hierarchically Structured Information `_", +and the text diffing uses Google's ``diff_match_patch`` algorithm. diff --git a/README.txt b/README.txt new file mode 100644 index 0000000..de1f3e2 --- /dev/null +++ b/README.txt @@ -0,0 +1,3 @@ +See README.rst for general information +See LICENSE.txt for Licensing information. +See docs/source/contributing.rst for development information diff --git a/TODO.rst b/TODO.rst deleted file mode 100644 index e6474aa..0000000 --- a/TODO.rst +++ /dev/null @@ -1,34 +0,0 @@ -TO DOs -====== - -Version 1.0 To Dos ------------------- - -- Code cleanup: - * PEP8, especially docstrings - -- Modernize package: - * lxml-based SAX parser. - * Allow code to be run without C extensions - (See Zope packages with PyPy support for help.) - need to reimplement a bunch of C only code in python - also, the C code gets variables "passed" with `fmes_init` - -- Systematic test setup. - * Use Shoobx approach for XML-processing-based testing. - -- Bonus: Make some of the variables more readable and rem - - -Old TODO List for xmldiff ------------------------- - -_ report namespaces declaration ! -_ support Processing Instruction nodes, CDATA -_ support for XML namespaces -_ option for case insensitive -_ data/document modes ?
-_ translate HELP.txt and API.txt to docbook -_ update ezs to make it work with the new internal representation -_ optimizations: - use tuple instead of list when it's possible diff --git a/TODO.txt b/TODO.txt new file mode 100644 index 0000000..2eb1c41 --- /dev/null +++ b/TODO.txt @@ -0,0 +1,39 @@ +TODO +==== + +First Alpha: +-------------- + +* 100% coverage, travis, coveralls, pep8 - Done + +* Actions should be named tuple objects, they're nicer than plain tuples. - Done + +* Explicit namespace tests. Since we use lxml all the time this already works + as if by magic, but we need to test for it. - Done + +* Support for making a diffed XML with diff: tags. - Done + +First Beta: +----------- + +* Command line utility - Done + +* Support for making a pretty text diff - Done + +First Final: +------------ + +* Documentation + + +Future releases: +---------------- + +* An xmlpatch2 utility/command that can apply the diffs. + +* Support for the xmldiff diff format. + +* Maybe a diff format that looks like a text diff, but understands XML + structure and ignores ignorable whitespace? But that is also doable by + just reformatting the XML and pretty printing it and then using a text + diff, so maybe that's pointless. diff --git a/doc/API.txt b/doc/API.txt deleted file mode 100644 index 0608ee0..0000000 --- a/doc/API.txt +++ /dev/null @@ -1,104 +0,0 @@ -XmlDiff API -=========== - -:Author: Sylvain Thénault -:Organization: Logilab -:Version: $Revision: 1.2 $ -:Date: $Date: 2003-10-02 10:38:21 $ - -.. contents:: - -To use this package as a librarie, you need the provided python's -modules described below. - - -mydifflib.py ------------- -provides functions for Longest Common Subsequence calculation. - -lcs2(X, Y, equal): - apply the greedy lcs/ses algorithm between X and Y sequence - (should be any Python's sequence) - equal is a function to compare X and Y which must return 0 (or - a Python false value) if X and Y are different, 1 (or Python - true value) if they are identical - return a list of matched pairs in tuples - -lcsl(X, Y, equal): - same as above but return the length of the lcs - -quick_ratio(a,b): - optimized version of the standard difflib.py quick_ratio - (without junk and class) - return an upper bound on ratio() relatively quickly. - - -input.py -------- -provides functions for converting DOM tree or xml file in order to -process it with xmldiff functions. - -tree_from_stream(stream, norm_sp=1, ext_ges=0, ext_pes=0, include_comment=1, encoding='UTF-8'): - create and return internal tree from xml stream (open file or - IOString) - if norm_sp = 1, normalize space and new line - if ext_ges = 1, include all external general (text) entities. - if ext_pes = 1, include all external parameter entities, including the external DTD subset. - if include_comment = 1, include comment nodes - encoding specify the encoding to use - -tree_from_dom(root): - create and return internal tree from DOM subtree - - -fmes.py ------- -Fast match/ Edit script algorithm (not sure to obtain the minimum edit -cost, but accept big documents). - -Warning, the process(oldtree, newtree) function has a side effect: -after call it, oldtree == newtree.
- -class FmesCorrector(self, formatter, f=0.6, t=0.5): - class which contains the fmes algorithm - formatter is a class instance which handle the edit script - formatting (see format.py) - f and t are algorithm parameter, 0 < f < 1 and 0.5 < t < 1 - in xmldiff, f = 0.59 and t = 0.5 - -FmesCorrector.process_trees(self, tree1, tree2): - launch diff between internal tree tree1 (old xmltree) and - tree2 (new xml tree) - return an actions list - - -format.py ---------- -provides classes for converting xmldiff algorithms output to DOM -tree or printing it in native format. The -formatter interface is the following : - -class AbstractFormatter: - abstract class designed to be overrinden by concrete - formatters - -AbstractFormatter.init(self): - method called before the begining of the tree 2 tree - correction - -AbstractFormatter.add_action(self, action): - method called when an action is added to the edit script - -AbstractFormatter.format_action(self, action): - method called by end() to format each action in the edit - script - at least this method should be overriden - -AbstractFormatter.end(self): - method called at the end of the tree 2 tree correction - - -the concrete class is InternalPrinter - - -See xmldiff.py for an use example. diff --git a/doc/HELP.txt b/doc/HELP.txt deleted file mode 100644 index def4f10..0000000 --- a/doc/HELP.txt +++ /dev/null @@ -1,213 +0,0 @@ -XmlDiff TUTORIAL -================ - -:Author: Sylvain Th�nault -:Organization: Logilab -:Version: $Revision: 1.4 $ -:Date: $Date: 2003-10-08 09:34:12 $ - -.. contents:: - -Synopsis --------- -:: - - xmldiff [Options] from_file to_file - xmldiff [Options] [-r] from_directory to_directory - - Options: - -h, --help - display this help message and exit. - -V, --version - display version number and exit - -H, --html - input files are HTML instead of XML - -r, --recursive - when comparing directories, recursively compare any - subdirectories found. - -e encoding, --encoding=encoding - specify the encoding to use for output. Default is UTF-8 - -n, --not-normalize-spaces - do not normalize spaces and new lines in text and comment nodes. - -c, --exclude-comments - do not process comment nodes - -g, --ext-ges - include all external general (text) entities. - -p, --ext-pes - include all external parameter entities, including the external DTD - subset. - - --profile=file - display an execution profile (run slower with this option), - profile saved to file (binarie form). - - - -Detailed example ----------------- - -if you process two files file1 and file2 which respectively contain: :: - - - - - - almaster@logilab.org - - - - - - - - -and - -:: - - - - - - syt@logilab.org - - - - - - - - - - hoye! - - - - -executing *xmldiff file1 file2* will give the following result: :: - - rename_node, /memory[1]/mailbox[1], box] - [insert-after, /memory[1]/junkbuster-method[1], - - - - ] - [insert-after, /memory[1]/spoken-languages[1], - - - hoye! - - ] - [update, /memory[1]/email_addr[1]/text()[1], syt@logilab.org] - [rename_node, /memory[1]/junkbuster-method[1]@value, val] - [append-first, /memory[1]/junkbuster-method[1], - - ] - [move-first, /memory[1]/spoken-languages[2]/language[2], /memory[1]/spoken-languages[1]] - [update, /memory[1]/server-socket[2]@port, 7797] - [remove, /memory[1]/spoken-languages[2]] - - - -This give you a list of primitives to apply on file1 to obtain file2 -(you should obtain file2 after the execution of all this script!). See -[4] and [5] for more information. 
-The script above tell you the 9 actions to apply on file1: - -* insert after the node /memory/spoken-languages[0] the below xml subtree:: - - - - hoye! - - -* rename node /memory/mailbox[0] to "box" - -* append a node to the node /memory[0]/junkbuster-method[0] - -* append an attribute named "new" with value "new attribute" to the - node /memory/spoken-languages[0] - -* update attribute /memory/server-socket[1]@port value to "7797" - -* update text /memory/email_addr/text()[0] to "syt@logilab.org" - -* rename attribute /memory/junkbuster-method[0]@value to "val" - -* move the attributes "code" and "name" from - /memory[0]/spoken-languages[0]/language[1] to - /memory[0]/spoken-languages[0]/language[0] - and rename them to LogilabXmldiffTmpAttr:code and - LogilabXmldiffTmpAttr:name - -* move the attributes "code" and "name" from - /memory[0]/spoken-languages[0]/language[0] to - /memory[0]/spoken-languages[0]/language[1] - and rename them to LogilabXmldiffTmpAttr:code and - LogilabXmldiffTmpAttr:name - -* remove node /memory/spoken-languages/language[2] - -* rename attributes LogilabXmldiffTmpAttr:code and - LogilabXmldiffTmpAttr:name of /memory/spoken-languages/language[0] - to name and code - -* rename attributes LogilabXmldiffTmpAttr:code and - LogilabXmldiffTmpAttr:name of /memory/spoken-languages/language[1] - to name and code - -Note all xpath are relative to the file1 with previous steps applied. - - -Warnings --------- - -* This version of xmldiff doesn't process the DTD, CDATA and - PROCESSING INSTRUCTIONS nodes, so if there is a difference between two - document in one of those nodes, xmldiff won't see it. - -* Furthermore, xml namespaces are disabled: - and - - are seen as different nodes - -* Comparing document bigger than 200Ko can take a few minutes (during - tests, it took at about 25 seconds to diff two versions of a 130Ko - document on a Celeron 533 box with 256Mo RAM) - -* The execution time is scaled to the number of differences between - the documents to compare - -* Finally, a few assumptions have been made to obtain the faster - algorithm: - - - there is an ordering <_l on the labels in the shema such that a node - with a label l1 can appear as the descendent of a node with a label l2 - only if l1 <_l l2 - - - for any leaf x from T1, there is at most one leaf y from T2 which - can be mapped with x (internally, 2 node may be mapped together if - their lcs (longest common subsequence) ratio is greater than 0.6) - - -References ----------- - -1. "Change detection in hierarchically structured information" - by S. Chawathe, A. Rajaraman, H. Garcia-Molina, J. Widom - Stanford University, 1996 - The Fast Match / Edit Script algorithm (fmes), used by default - -2. http://www.w3.org/TR/2000/REC-xml-20001006 - XML 1.0 W3C recommendation - -3. http://www.w3.org/TR/xpath - XML path language 1.0 W3C recommendation - - -Feedback --------- - -xmldiff discussion should take place on the xml-logilab mailing list. -Please check http://lists.logilab.org/mailman/listinfo/xml-projects for -information on subscribing and the mailing list archives. diff --git a/doc/makefile b/doc/makefile deleted file mode 100644 index 538aafa..0000000 --- a/doc/makefile +++ /dev/null @@ -1,16 +0,0 @@ -MKHTML=mkdoc -MKHTML_OPT=--doctype article --param toc.section.depth=1 --target html --stylesheet single-file - -SRC=. 
- - -all: HELP.html API.html - -HELP.html: ${SRC}/HELP.txt - ${MKHTML} ${MKHTML_OPT} ${SRC}/HELP.txt - -API.html: ${SRC}/API.txt - ${MKHTML} ${MKHTML_OPT} ${SRC}/API.txt - -clean: - rm -f *.html diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..d38b8e9 --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,216 @@ +# Makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +PAPER = +BUILDDIR = build + +# User-friendly check for sphinx-build +ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) +$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) +endif + +# Internal variables. +PAPEROPT_a4 = -D latex_paper_size=a4 +PAPEROPT_letter = -D latex_paper_size=letter +ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source +# the i18n builder cannot share the environment and doctrees with the others +I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source + +.PHONY: help +help: + @echo "Please use \`make ' where is one of" + @echo " html to make standalone HTML files" + @echo " dirhtml to make HTML files named index.html in directories" + @echo " singlehtml to make a single large HTML file" + @echo " pickle to make pickle files" + @echo " json to make JSON files" + @echo " htmlhelp to make HTML files and a HTML help project" + @echo " qthelp to make HTML files and a qthelp project" + @echo " applehelp to make an Apple Help Book" + @echo " devhelp to make HTML files and a Devhelp project" + @echo " epub to make an epub" + @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" + @echo " latexpdf to make LaTeX files and run them through pdflatex" + @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" + @echo " text to make text files" + @echo " man to make manual pages" + @echo " texinfo to make Texinfo files" + @echo " info to make Texinfo files and run them through makeinfo" + @echo " gettext to make PO message catalogs" + @echo " changes to make an overview of all changed/added/deprecated items" + @echo " xml to make Docutils-native XML files" + @echo " pseudoxml to make pseudoxml-XML files for display purposes" + @echo " linkcheck to check all external links for integrity" + @echo " doctest to run all doctests embedded in the documentation (if enabled)" + @echo " coverage to run coverage check of the documentation (if enabled)" + +.PHONY: clean +clean: + rm -rf $(BUILDDIR)/* + +.PHONY: html +html: + $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." + +.PHONY: dirhtml +dirhtml: + $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." + +.PHONY: singlehtml +singlehtml: + $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml + @echo + @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." + +.PHONY: pickle +pickle: + $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle + @echo + @echo "Build finished; now you can process the pickle files." 
+ +.PHONY: json +json: + $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json + @echo + @echo "Build finished; now you can process the JSON files." + +.PHONY: htmlhelp +htmlhelp: + $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp + @echo + @echo "Build finished; now you can run HTML Help Workshop with the" \ + ".hhp project file in $(BUILDDIR)/htmlhelp." + +.PHONY: qthelp +qthelp: + $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp + @echo + @echo "Build finished; now you can run "qcollectiongenerator" with the" \ + ".qhcp project file in $(BUILDDIR)/qthelp, like this:" + @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/xmldiff.qhcp" + @echo "To view the help file:" + @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/xmldiff.qhc" + +.PHONY: applehelp +applehelp: + $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp + @echo + @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." + @echo "N.B. You won't be able to view it unless you put it in" \ + "~/Library/Documentation/Help or install it in your application" \ + "bundle." + +.PHONY: devhelp +devhelp: + $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp + @echo + @echo "Build finished." + @echo "To view the help file:" + @echo "# mkdir -p $$HOME/.local/share/devhelp/xmldiff" + @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/xmldiff" + @echo "# devhelp" + +.PHONY: epub +epub: + $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub + @echo + @echo "Build finished. The epub file is in $(BUILDDIR)/epub." + +.PHONY: latex +latex: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo + @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." + @echo "Run \`make' in that directory to run these through (pdf)latex" \ + "(use \`make latexpdf' here to do that automatically)." + +.PHONY: latexpdf +latexpdf: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through pdflatex..." + $(MAKE) -C $(BUILDDIR)/latex all-pdf + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +.PHONY: latexpdfja +latexpdfja: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through platex and dvipdfmx..." + $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +.PHONY: text +text: + $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text + @echo + @echo "Build finished. The text files are in $(BUILDDIR)/text." + +.PHONY: man +man: + $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man + @echo + @echo "Build finished. The manual pages are in $(BUILDDIR)/man." + +.PHONY: texinfo +texinfo: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo + @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." + @echo "Run \`make' in that directory to run these through makeinfo" \ + "(use \`make info' here to do that automatically)." + +.PHONY: info +info: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo "Running Texinfo files through makeinfo..." + make -C $(BUILDDIR)/texinfo info + @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." + +.PHONY: gettext +gettext: + $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale + @echo + @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 
+ +.PHONY: changes +changes: + $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes + @echo + @echo "The overview file is in $(BUILDDIR)/changes." + +.PHONY: linkcheck +linkcheck: + $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck + @echo + @echo "Link check complete; look for any errors in the above output " \ + "or in $(BUILDDIR)/linkcheck/output.txt." + +.PHONY: doctest +doctest: + $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest + @echo "Testing of doctests in the sources finished, look at the " \ + "results in $(BUILDDIR)/doctest/output.txt." + +.PHONY: coverage +coverage: + $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage + @echo "Testing of coverage in the sources finished, look at the " \ + "results in $(BUILDDIR)/coverage/python.txt." + +.PHONY: xml +xml: + $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml + @echo + @echo "Build finished. The XML files are in $(BUILDDIR)/xml." + +.PHONY: pseudoxml +pseudoxml: + $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml + @echo + @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000..a2a5746 --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,263 @@ +@ECHO OFF + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set BUILDDIR=build +set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% source +set I18NSPHINXOPTS=%SPHINXOPTS% source +if NOT "%PAPER%" == "" ( + set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% + set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% +) + +if "%1" == "" goto help + +if "%1" == "help" ( + :help + echo.Please use `make ^` where ^ is one of + echo. html to make standalone HTML files + echo. dirhtml to make HTML files named index.html in directories + echo. singlehtml to make a single large HTML file + echo. pickle to make pickle files + echo. json to make JSON files + echo. htmlhelp to make HTML files and a HTML help project + echo. qthelp to make HTML files and a qthelp project + echo. devhelp to make HTML files and a Devhelp project + echo. epub to make an epub + echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter + echo. text to make text files + echo. man to make manual pages + echo. texinfo to make Texinfo files + echo. gettext to make PO message catalogs + echo. changes to make an overview over all changed/added/deprecated items + echo. xml to make Docutils-native XML files + echo. pseudoxml to make pseudoxml-XML files for display purposes + echo. linkcheck to check all external links for integrity + echo. doctest to run all doctests embedded in the documentation if enabled + echo. coverage to run coverage check of the documentation if enabled + goto end +) + +if "%1" == "clean" ( + for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i + del /q /s %BUILDDIR%\* + goto end +) + + +REM Check if sphinx-build is available and fallback to Python version if any +%SPHINXBUILD% 1>NUL 2>NUL +if errorlevel 9009 goto sphinx_python +goto sphinx_ok + +:sphinx_python + +set SPHINXBUILD=python -m sphinx.__init__ +%SPHINXBUILD% 2> nul +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. 
+ echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +:sphinx_ok + + +if "%1" == "html" ( + %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/html. + goto end +) + +if "%1" == "dirhtml" ( + %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. + goto end +) + +if "%1" == "singlehtml" ( + %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. + goto end +) + +if "%1" == "pickle" ( + %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can process the pickle files. + goto end +) + +if "%1" == "json" ( + %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can process the JSON files. + goto end +) + +if "%1" == "htmlhelp" ( + %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can run HTML Help Workshop with the ^ +.hhp project file in %BUILDDIR%/htmlhelp. + goto end +) + +if "%1" == "qthelp" ( + %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can run "qcollectiongenerator" with the ^ +.qhcp project file in %BUILDDIR%/qthelp, like this: + echo.^> qcollectiongenerator %BUILDDIR%\qthelp\xmldiff.qhcp + echo.To view the help file: + echo.^> assistant -collectionFile %BUILDDIR%\qthelp\xmldiff.ghc + goto end +) + +if "%1" == "devhelp" ( + %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. + goto end +) + +if "%1" == "epub" ( + %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The epub file is in %BUILDDIR%/epub. + goto end +) + +if "%1" == "latex" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. + goto end +) + +if "%1" == "latexpdf" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex + cd %BUILDDIR%/latex + make all-pdf + cd %~dp0 + echo. + echo.Build finished; the PDF files are in %BUILDDIR%/latex. + goto end +) + +if "%1" == "latexpdfja" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex + cd %BUILDDIR%/latex + make all-pdf-ja + cd %~dp0 + echo. + echo.Build finished; the PDF files are in %BUILDDIR%/latex. + goto end +) + +if "%1" == "text" ( + %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The text files are in %BUILDDIR%/text. + goto end +) + +if "%1" == "man" ( + %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The manual pages are in %BUILDDIR%/man. + goto end +) + +if "%1" == "texinfo" ( + %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. + goto end +) + +if "%1" == "gettext" ( + %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 
+ goto end ) + +if "%1" == "changes" ( + %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes + if errorlevel 1 exit /b 1 + echo. + echo.The overview file is in %BUILDDIR%/changes. + goto end ) + +if "%1" == "linkcheck" ( + %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck + if errorlevel 1 exit /b 1 + echo. + echo.Link check complete; look for any errors in the above output ^ +or in %BUILDDIR%/linkcheck/output.txt. + goto end ) + +if "%1" == "doctest" ( + %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest + if errorlevel 1 exit /b 1 + echo. + echo.Testing of doctests in the sources finished, look at the ^ +results in %BUILDDIR%/doctest/output.txt. + goto end ) + +if "%1" == "coverage" ( + %SPHINXBUILD% -b coverage %ALLSPHINXOPTS% %BUILDDIR%/coverage + if errorlevel 1 exit /b 1 + echo. + echo.Testing of coverage in the sources finished, look at the ^ +results in %BUILDDIR%/coverage/python.txt. + goto end ) + +if "%1" == "xml" ( + %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The XML files are in %BUILDDIR%/xml. + goto end ) + +if "%1" == "pseudoxml" ( + %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. + goto end ) + +:end diff --git a/docs/source/api.rst b/docs/source/api.rst new file mode 100644 index 0000000..f0726dd --- /dev/null +++ b/docs/source/api.rst @@ -0,0 +1,307 @@ +Python API +========== + +Main API +-------- + +Using ``xmldiff`` from Python is very easy: +you just import and call one of the three main API methods. + +.. doctest:: + :options: -ELLIPSIS, +NORMALIZE_WHITESPACE + + >>> from xmldiff import main + >>> main.diff_files("../tests/test_data/insert-node.left.rml", + ... "../tests/test_data/insert-node.right.rml") + [UpdateTextIn(node='/document/story[1]', text=None), + InsertNode(target='/document/story[1]', tag='h1', position=0), + UpdateTextIn(node='/document/story/h1[1]', text='Inserted '), + InsertNode(target='/document/story/h1[1]', tag='i', position=0), + UpdateTextIn(node='/document/story/h1/i[1]', text='Node')] + +Which one you choose depends on whether the XML is contained in files, +text strings or ``lxml`` trees. + +* ``xmldiff.main.diff_files()`` takes as input paths to files, or file streams. + +* ``xmldiff.main.diff_texts()`` takes as input Unicode strings. + +* ``xmldiff.main.diff_trees()`` takes as input lxml trees. + + +The arguments to these functions are the same: + +Parameters .......... + +``left``: + The "left", "old" or "from" XML. + The diff will show the changes to transform this XML to the "right" XML. + +``right``: + The "right", "new" or "target" XML. + +``F``: float + A value between 0 and 1 that determines how similar two XML nodes must be for them to be matched as the same node in both trees. Defaults to 0.5. + +``uniqueattrs``: list of strings + A list of XML node attributes that will uniquely identify a node. + Defaults to ``['{http://www.w3.org/XML/1998/namespace}id']``. + See `Unique Attributes`_. + +``formatter``: instance + The formatter to use, see `Using Formatters`_. + If no formatter is specified the function will return a list of edit actions, + see `The Edit Script`_. + + +Result ...... + +If no formatter is specified the diff functions will return a list of actions. +Such a list is called an edit script and contains all changes needed to transform the "left" XML into the "right" XML.
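+To make the parameters concrete, here is a sketch of a call that spells
+out the documented defaults (it is not a doctest, since the edit script
+returned depends entirely on the input; ``old.xml`` and ``new.xml`` are
+placeholder file names)::
+
+    from xmldiff import main
+
+    # No formatter given, so the result is an edit script,
+    # i.e. a list of edit actions.
+    edit_script = main.diff_files(
+        'old.xml', 'new.xml',
+        F=0.5,  # the default node similarity threshold
+        uniqueattrs=['{http://www.w3.org/XML/1998/namespace}id'])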
+
+If a formatter is specified, that formatter determines the result.
+The included formatters, ``diff``, ``xml``, and ``rml``, all return a Unicode string.
+
+
+Unique Attributes
+-----------------
+
+The ``uniqueattrs`` argument is a list of strings specifying attributes that uniquely identify a node in the document.
+This is used by the differ when trying to match nodes.
+If one node in the left tree has this attribute,
+the node in the right tree with the same value for that attribute will match,
+regardless of other attributes, child nodes or text content.
+
+The default is ``['{http://www.w3.org/XML/1998/namespace}id']``,
+which is the ``xml:id`` attribute.
+But if your document has other unique identifiers,
+you can pass them in instead.
+If you for some reason do not want the differ to look at the ``xml:id`` attribute,
+pass in an empty list.
+
+
+Using Formatters
+----------------
+
+By default the diff functions will return an edit script,
+but if you pass in a formatter the result will be whatever that formatter returns.
+
+The three included formatters, ``diff``, ``xml`` and ``rml``,
+all return Unicode strings.
+The ``diff`` formatter will return a string with the edit script printed out,
+one action per line.
+Each line is enclosed in brackets and consists of a string describing the action,
+followed by the action's arguments.
+This is the output format of xmldiff 1.x;
+however, the actions and arguments are not the same,
+so the output is not compatible.
+
+.. doctest::
+   :options: -ELLIPSIS, +NORMALIZE_WHITESPACE
+
+    >>> from xmldiff import formatting
+    >>> formatter = formatting.DiffFormatter()
+    >>> print(main.diff_files("../tests/test_data/insert-node.left.rml",
+    ...                       "../tests/test_data/insert-node.right.rml",
+    ...                       formatter=formatter))
+    [update-text, /document/story[1], null]
+    [insert, /document/story[1], h1, 0]
+    [update-text, /document/story/h1[1], "Inserted "]
+    [insert, /document/story/h1[1], i, 0]
+    [update-text, /document/story/h1/i[1], "Node"]
+
+
+The other two formatters return XML with tags describing the changes.
+These formats are designed so that they can easily be transformed into something that will render nicely,
+for example with XSLT replacing the tags with the format you need.
+
+.. doctest::
+   :options: -ELLIPSIS, +NORMALIZE_WHITESPACE
+
+    >>> from xmldiff import formatting
+    >>> formatter = formatting.RMLFormatter()
+    >>> print(main.diff_files("../tests/test_data/insert-node.left.rml",
+    ...                       "../tests/test_data/insert-node.right.rml",
+    ...                       formatter=formatter))
+    <document xmlns:diff="http://namespaces.shoobx.com/diff">
+      <story id="story">
+        <h1 diff:insert="">Inserted <i diff:insert="">Node</i></h1>
+      </story>
+    </document>
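+
+The ``xml`` formatter is used the same way;
+a minimal sketch, reusing the test files from the examples above and assuming
+``formatting.XMLFormatter`` as the class behind the ``xml`` format:
+
+.. code-block:: python
+
+    from xmldiff import formatting, main
+
+    # The generic XML output formatter.
+    formatter = formatting.XMLFormatter()
+    result = main.diff_files("../tests/test_data/insert-node.left.rml",
+                             "../tests/test_data/insert-node.right.rml",
+                             formatter=formatter)
+    print(result)  # a Unicode string with diff markup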
+
+
+The Edit Script
+---------------
+
+The default result of the diffing methods is to return an edit script,
+which is a list of Python objects called edit actions.
+Those actions tell you how to turn the "left" tree into the "right" tree.
+
+``xmldiff`` has nine different actions.
+These specify one or two nodes in the XML tree,
+called ``node`` or ``target``.
+They are specified with an XPath expression that uniquely identifies the node.
+
+The other arguments vary depending on the action.
+
+
+``InsertNode(target, tag, position)``
+......................................
+
+The ``InsertNode`` action means that the node specified in ``target`` needs a new subnode.
+``tag`` specifies which tag the new node should have.
+The ``position`` argument specifies which position the new node should have;
+``0`` means that the new node will be inserted as the first child of the target.
+
+Example:
+
+.. doctest::
+   :options: -ELLIPSIS, +NORMALIZE_WHITESPACE
+
+    >>> left = '<document><node>Content</node></document>'
+    >>> right = '<document><node>Content</node><newnode/></document>'
+    >>> main.diff_texts(left, right)
+    [InsertNode(target='/document[1]', tag='newnode', position=1)]
+
+
+``DeleteNode(node)``
+....................
+
+The ``DeleteNode`` action means that the node specified in ``node`` should be deleted.
+
+Example:
+
+.. doctest::
+   :options: -ELLIPSIS, +NORMALIZE_WHITESPACE
+
+    >>> left = '<document><node>Content</node></document>'
+    >>> right = '<document></document>'
+    >>> main.diff_texts(left, right)
+    [DeleteNode(node='/document/node[1]')]
+
+
+``MoveNode(node, target, position)``
+....................................
+
+The ``MoveNode`` action means that the node specified in ``node`` should be moved to be a child of the target node.
+The ``position`` argument specifies which position it should have;
+``0`` means that the node will be inserted as the first child of the target.
+
+Example:
+
+.. doctest::
+   :options: -ELLIPSIS, +NORMALIZE_WHITESPACE
+
+    >>> left = '<document><node>Content</node><newnode/></document>'
+    >>> right = '<document><newnode/><node>Content</node></document>'
+    >>> main.diff_texts(left, right)
+    [MoveNode(node='/document/node[1]', target='/document[1]',
+              position=1)]
+
+
+``InsertAttrib(node, name, value)``
+....................................
+
+The ``InsertAttrib`` action means that the node specified in ``node`` should get a new attribute.
+The ``name`` and ``value`` arguments specify the name and value of that attribute.
+
+Example:
+
+.. doctest::
+   :options: -ELLIPSIS, +NORMALIZE_WHITESPACE
+
+    >>> left = '<document/>'
+    >>> right = '<document newattr="newvalue"/>'
+    >>> main.diff_texts(left, right)
+    [InsertAttrib(node='/document[1]', name='newattr',
+                  value='newvalue')]
+
+
+``DeleteAttrib(node, name)``
+............................
+
+The ``DeleteAttrib`` action means that an attribute of the node specified in ``node`` should be deleted.
+The ``name`` argument specifies which attribute.
+
+Example:
+
+.. doctest::
+   :options: -ELLIPSIS, +NORMALIZE_WHITESPACE
+
+    >>> left = '<document newattr="newvalue"/>'
+    >>> right = '<document/>'
+    >>> main.diff_texts(left, right)
+    [DeleteAttrib(node='/document[1]', name='newattr')]
+
+
+``RenameAttrib(node, oldname, newname)``
+.........................................
+
+The ``RenameAttrib`` action means that an attribute of the node specified in ``node`` should be renamed.
+The ``oldname`` and ``newname`` arguments specify which attribute and its new name.
+
+Example:
+
+.. doctest::
+   :options: -ELLIPSIS, +NORMALIZE_WHITESPACE
+
+    >>> left = '<document attrib="value"/>'
+    >>> right = '<document newattrib="value"/>'
+    >>> main.diff_texts(left, right)
+    [RenameAttrib(node='/document[1]', oldname='attrib',
+                  newname='newattrib')]
+
+
+``UpdateAttrib(node, name, value)``
+....................................
+
+The ``UpdateAttrib`` action means that an attribute of the node specified in ``node`` should get a new value.
+The ``name`` and ``value`` arguments specify which attribute and its new value.
+
+Example:
+
+.. doctest::
+   :options: -ELLIPSIS, +NORMALIZE_WHITESPACE
+
+    >>> left = '<document attrib="value"/>'
+    >>> right = '<document attrib="newvalue"/>'
+    >>> main.diff_texts(left, right)
+    [UpdateAttrib(node='/document[1]', name='attrib', value='newvalue')]
+
+
+``UpdateTextIn(node, text)``
+............................
+
+The ``UpdateTextIn`` action means that the text content of the node specified in ``node`` should get a new value.
+The ``text`` argument specifies the new value of that text.
+
+Example:
+
+.. doctest::
+   :options: -ELLIPSIS, +NORMALIZE_WHITESPACE
+
+    >>> left = '<document><node>Content</node></document>'
+    >>> right = '<document><node>New Content</node></document>'
+    >>> main.diff_texts(left, right)
+    [UpdateTextIn(node='/document/node[1]', text='New Content')]
+
+
+``UpdateTextAfter(node, text)``
+...............................
+
+The ``UpdateTextAfter`` action means that the text that trails the node specified in ``node`` should get a new value.
+The ``text`` argument specifies the new value of that text.
+
+Example:
+
+.. doctest::
+   :options: -ELLIPSIS, +NORMALIZE_WHITESPACE
+
+    >>> left = '<document><node>Content</node></document>'
+    >>> right = '<document><node>Content</node>Trailing text</document>'
+    >>> main.diff_texts(left, right)
+    [UpdateTextAfter(node='/document/node[1]', text='Trailing text')]
diff --git a/docs/source/commandline.rst b/docs/source/commandline.rst
new file mode 100644
index 0000000..7a4e460
--- /dev/null
+++ b/docs/source/commandline.rst
@@ -0,0 +1,114 @@
+Command line usage
+==================
+
+``xmldiff`` is both a command line tool and a Python library.
+To use it from the command line, just run ``xmldiff`` with two input files:
+
+.. code-block:: bash
+
+    $ xmldiff file1.xml file2.xml
+
+There are a few extra options to modify the output,
+but be aware that not all of the combinations are meaningful,
+so don't be surprised if you add one and nothing happens.
+
+
+Options
+-------
+
+.. argparse::
+   :module: xmldiff.main
+   :func: make_parser
+   :prog: xmldiff
+   :nodescription:
+
+Formatters
+----------
+
+You can select different output formats with ``xmldiff``,
+but beware that some formatters may assume certain things about the type of XML.
+
+The two formatters named ``diff`` and ``xml`` are generic and will work for any type of XML,
+but may not give you a useful output.
+If you are using ``xmldiff`` as a library,
+you can create your own formatters that are suited to your particular usage of XML.
+
+Whitespace handling
+-------------------
+
+Formatters are also responsible for whitespace handling,
+both in parsing and in output.
+
+By default ``xmldiff`` will strip all whitespace that is between tags,
+as opposed to inside tags.
+That whitespace isn't a part of any data and can be ignored.
+So this XML structure:
+
+.. code-block:: xml
+
+    <document>
+        <node>Some text</node>
+    </document>
+
+Will be seen as the same document as this:
+
+.. code-block:: xml
+
+    <document><node>Some text</node></document>
+
+Because the whitespace is between the tags.
+However, this structure is different,
+since the whitespace there occurs inside a tag:
+
+.. code-block:: xml
+
+    <document><node>Some text </node></document>
+
+In some XML formats, whitespace inside some tags is also not significant.
+The ``rml`` formatter is an example of this.
+It's a format that can be used to generate documents,
+and has a ``<para>`` tag for formatted text,
+similar to HTML's ``<p>`` tag.
+The ``rml`` formatter is aware of this,
+and will by default normalize whitespace inside these tags before comparing them,
+effectively collapsing any whitespace inside those tags to a single space.
+This is so that when diffing two versions of RML files you will not see changes that would not be visible in the final document.
+
+Both of these types of whitespace can be preserved with the ``--keep-whitespace`` argument.
+The third case of whitespace,
+whitespace that occurs inside tags that are *not* known to be formatted text tags,
+will always be preserved.
+Neither the ``diff`` nor the ``xml`` formatter knows of any text formatting,
+and they will therefore always preserve all whitespace inside tags.
+
+
+Pretty printing
+---------------
+
+The term "pretty printing" refers to making an output a bit more human readable by structuring it with whitespace.
+In the case of XML this means inserting ignorable whitespace into the XML:
+yes, the same in-between whitespace that is ignored by ``xmldiff`` when detecting changes between two files.
+
+``xmldiff``'s ``xml`` and ``rml`` formatters understand the ``--pretty-print`` argument and will insert whitespace to make the output more readable.
+
+For example, an XML output that would normally look like this:
+
+.. code-block:: xml
+
+    <document><node>Some content</node><para>This is some
+    <i>simple</i> text with formatting.</para></document>
+
+Will with the ``--pretty-print`` argument look like this:
+
+.. code-block:: xml
+
+    <document>
+      <node>Some content</node>
+      <para>
+        This is some <i>simple</i> text with formatting.
+      </para>
+    </document>
+
+This means you can actually use ``xmldiff`` to reformat XML, by using the
+``xml`` formatter and passing in the same XML file twice::
+
+    $ xmldiff -f xml -p uglyfile.xml uglyfile.xml
+
+However, if you keep whitespace with ``--keep-whitespace`` or ``-w``,
+no reformatting will be done.
diff --git a/docs/source/conf.py b/docs/source/conf.py
new file mode 100644
index 0000000..df06512
--- /dev/null
+++ b/docs/source/conf.py
@@ -0,0 +1,287 @@
+# -*- coding: utf-8 -*-
+#
+# xmldiff documentation build configuration file, created by
+# sphinx-quickstart on Tue Sep  4 12:07:12 2018.
+#
+# This file is execfile()d with the current directory set to its
+# containing dir.
+#
+# Note that not all possible configuration values are present in this
+# autogenerated file.
+#
+# All configuration values have a default; values that are commented out
+# serve to show the default.
+
+import sys
+import os
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#sys.path.insert(0, os.path.abspath('.'))
+
+# -- General configuration ------------------------------------------------
+
+# If your documentation needs a minimal Sphinx version, state it here.
+#needs_sphinx = '1.0'
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = [
+    'sphinx.ext.doctest',
+    'sphinx.ext.coverage',
+    'sphinxarg.ext',
+]
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# The suffix(es) of source filenames.
+# You can specify multiple suffix as a list of string:
+# source_suffix = ['.rst', '.md']
+source_suffix = '.rst'
+
+# The encoding of source files.
+#source_encoding = 'utf-8-sig'
+
+# The master toctree document.
+master_doc = 'index'
+
+# General information about the project.
+project = u'xmldiff' +copyright = u'2018, Lennart Regebro' +author = u'Lennart Regebro' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. +version = u'2.0' +# The full version, including alpha/beta/rc tags. +release = u'2.0' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = None + +# There are two options for replacing |today|: either, you set today to some +# non-false value, then it is used: +#today = '' +# Else, today_fmt is used as the format for a strftime call. +#today_fmt = '%B %d, %Y' + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +exclude_patterns = [] + +# The reST default role (used for this markup: `text`) to use for all +# documents. +#default_role = None + +# If true, '()' will be appended to :func: etc. cross-reference text. +#add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +#add_module_names = True + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. They are ignored by default. +#show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# A list of ignored prefixes for module index sorting. +#modindex_common_prefix = [] + +# If true, keep warnings as "system message" paragraphs in the built documents. +#keep_warnings = False + +# If true, `todo` and `todoList` produce output, else they produce nothing. +todo_include_todos = False + + +# -- Options for HTML output ---------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +html_theme = 'alabaster' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +#html_theme_options = {} + +# Add any paths that contain custom themes here, relative to this directory. +#html_theme_path = [] + +# The name for this set of Sphinx documents. If None, it defaults to +# " v documentation". +#html_title = None + +# A shorter title for the navigation bar. Default is the same as html_title. +#html_short_title = None + +# The name of an image file (relative to this directory) to place at the top +# of the sidebar. +#html_logo = None + +# The name of an image file (relative to this directory) to use as a favicon of +# the docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 +# pixels large. +#html_favicon = None + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +# Add any extra paths that contain custom files (such as robots.txt or +# .htaccess) here, relative to this directory. These files are copied +# directly to the root of the documentation. 
+#html_extra_path = [] + +# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, +# using the given strftime format. +#html_last_updated_fmt = '%b %d, %Y' + +# If true, SmartyPants will be used to convert quotes and dashes to +# typographically correct entities. +#html_use_smartypants = True + +# Custom sidebar templates, maps document names to template names. +#html_sidebars = {} + +# Additional templates that should be rendered to pages, maps page names to +# template names. +#html_additional_pages = {} + +# If false, no module index is generated. +#html_domain_indices = True + +# If false, no index is generated. +#html_use_index = True + +# If true, the index is split into individual pages for each letter. +#html_split_index = False + +# If true, links to the reST sources are added to the pages. +#html_show_sourcelink = True + +# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. +#html_show_sphinx = True + +# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. +#html_show_copyright = True + +# If true, an OpenSearch description file will be output, and all pages will +# contain a tag referring to it. The value of this option must be the +# base URL from which the finished HTML is served. +#html_use_opensearch = '' + +# This is the file name suffix for HTML files (e.g. ".xhtml"). +#html_file_suffix = None + +# Language to be used for generating the HTML full-text search index. +# Sphinx supports the following languages: +# 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja' +# 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr' +#html_search_language = 'en' + +# A dictionary with options for the search language support, empty by default. +# Now only 'ja' uses this config value +#html_search_options = {'type': 'default'} + +# The name of a javascript file (relative to the configuration directory) that +# implements a search results scorer. If empty, the default will be used. +#html_search_scorer = 'scorer.js' + +# Output file base name for HTML help builder. +htmlhelp_basename = 'xmldiffdoc' + +# -- Options for LaTeX output --------------------------------------------- + +latex_elements = { +# The paper size ('letterpaper' or 'a4paper'). +#'papersize': 'letterpaper', + +# The font size ('10pt', '11pt' or '12pt'). +#'pointsize': '10pt', + +# Additional stuff for the LaTeX preamble. +#'preamble': '', + +# Latex figure (float) alignment +#'figure_align': 'htbp', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + (master_doc, 'xmldiff.tex', u'xmldiff Documentation', + u'Lennart Regebro', 'manual'), +] + +# The name of an image file (relative to this directory) to place at the top of +# the title page. +#latex_logo = None + +# For "manual" documents, if this is true, then toplevel headings are parts, +# not chapters. +#latex_use_parts = False + +# If true, show page references after internal links. +#latex_show_pagerefs = False + +# If true, show URL addresses after external links. +#latex_show_urls = False + +# Documents to append as an appendix to all manuals. +#latex_appendices = [] + +# If false, no module index is generated. +#latex_domain_indices = True + + +# -- Options for manual page output --------------------------------------- + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). 
+man_pages = [
+    (master_doc, 'xmldiff', u'xmldiff Documentation',
+     [author], 1)
+]
+
+# If true, show URL addresses after external links.
+#man_show_urls = False
+
+
+# -- Options for Texinfo output -------------------------------------------
+
+# Grouping the document tree into Texinfo files. List of tuples
+# (source start file, target name, title, author,
+#  dir menu entry, description, category)
+texinfo_documents = [
+    (master_doc, 'xmldiff', u'xmldiff Documentation',
+     author, 'xmldiff', 'One line description of project.',
+     'Miscellaneous'),
+]
+
+# Documents to append as an appendix to all manuals.
+#texinfo_appendices = []
+
+# If false, no module index is generated.
+#texinfo_domain_indices = True
+
+# How to display URL addresses: 'footnote', 'no', or 'inline'.
+#texinfo_show_urls = 'footnote'
+
+# If true, do not generate a @detailmenu in the "Top" node's menu.
+#texinfo_no_detailmenu = False
diff --git a/docs/source/contributing.rst b/docs/source/contributing.rst
new file mode 100644
index 0000000..07f0309
--- /dev/null
+++ b/docs/source/contributing.rst
@@ -0,0 +1,120 @@
+Contributing to ``xmldiff``
+===========================
+
+``xmldiff`` welcomes your help. Replies and responses may be slow, but don't
+despair, we will get to you, we will answer your questions and we will
+review your pull requests, but nobody has "Maintain ``xmldiff``" as their job
+description, so it may take a long time. That's open source.
+
+There are some extremely complex issues deep down in ``xmldiff``, but don't
+let that scare you away, there are easy things to do as well.
+
+
+Setting up a dev environment
+----------------------------
+
+To set up a development environment you need a GitHub account, git, and
+of course Python with pip installed. You also should have the Python tools
+``coverage`` and ``flake8`` installed::
+
+    pip install coverage flake8
+
+Then you need to clone the repository, and install its dependencies::
+
+    git clone git@github.com:regebro/xmldiff2.git
+    cd xmldiff2
+    pip install -e .
+
+You should now be able to test your setup by running a few ``make`` commands::
+
+    make test
+    make flake
+
+These should both pass with no errors, and then you are set!
+
+
+Testing
+-------
+
+``xmldiff``'s tests are written using ``unittest`` and are discoverable by most test runners.
+There is also a ``test`` target in the Makefile.
+The following test runners/commands are known to work:
+
+  * ``make test``
+
+  * ``python setup.py test``
+
+  * ``nosetests``
+
+  * ``pytest``
+
+There is no support for ``tox`` to run the tests under different Python versions.
+This is because Travis will run all supported versions on pull requests in any case,
+and having yet another list of supported Python versions to maintain seems unnecessary.
+You can either create your own tox.ini file,
+or you can install ```spiny`` <https://pypi.org/project/spiny/>`_,
+which doesn't require any extra configuration in the normal case,
+and will run the tests on all versions that are defined as supported in ``setup.py``.
+
+
+Pull requests
+-------------
+
+Even if you have write permissions to the repository we discourage pushing changes to master.
+Make a branch and a pull request, and we'll merge that.
+
+Your pull requests should:
+
+  * Add a test that fails before the change is made
+    (a minimal sketch of such a test follows this list)
+
+  * Keep test coverage at 100%
+
+  * Include a description of the change in ``CHANGES.txt``
+
+  * Add yourself to the contributors list in ``README.txt`` if you aren't already there.
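+
+A minimal sketch of such a test, using ``unittest``
+(the test class and the expected result here are hypothetical examples,
+not part of the actual test suite):
+
+.. code-block:: python
+
+    import unittest
+    from xmldiff import main
+
+    class TestMyChange(unittest.TestCase):
+
+        def test_insert_attribute(self):
+            left = '<document/>'
+            right = '<document newattr="newvalue"/>'
+            # diff_texts() returns the edit script as a list of actions;
+            # adding one attribute should produce a single action.
+            result = main.diff_texts(left, right)
+            self.assertEqual(len(result), 1)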
+
+
+Code quality and conventions
+----------------------------
+
+``xmldiff`` aims to have 100% test coverage.
+You can run a coverage report with ``$ make coverage``.
+This will generate an HTML coverage report in ``htmlcov/index.html``.
+
+We run flake8 as a part of all Travis test runs;
+the correct way to run it is ``$ make flake``,
+as this includes only the files that should be covered.
+
+
+Documentation
+-------------
+
+The documentation is written with ``sphinx``.
+It, and any other files using the reStructuredText format,
+such as READMEs etc,
+use a one-line-per-sub-sentence structure.
+This is so that adding one word to a paragraph will not cause several lines of changes,
+as that would make any pull request harder to read.
+
+That means that every sentence and most commas should be followed by a new line,
+except in cases where this obviously does not make sense.
+As a result of this there is no limit on line length,
+but if a line becomes very long you might consider rewriting it to make it more understandable.
+
+You generate the documentation with a make command::
+
+    cd docs
+    make html
+
+We will be using (but aren't yet) `Read the Docs <https://readthedocs.org/>`_ to host the documentation.
+
+
+Implementation details
+----------------------
+
+``xmldiff`` is based on `"Change Detection in Hierarchically Structured Information" <http://ilpubs.stanford.edu:8090/115/>`_
+by Sudarshan S. Chawathe, Anand Rajaraman, Hector Garcia-Molina, and Jennifer Widom, 1995.
+It's not necessary to read and understand that paper in all its details to help with ``xmldiff``,
+but if you want to improve the actual diffing algorithm it is certainly helpful.
+
+I hope to extend this section with an overview of how this library does its thing.
diff --git a/docs/source/index.rst b/docs/source/index.rst
new file mode 100644
index 0000000..23b1d55
--- /dev/null
+++ b/docs/source/index.rst
@@ -0,0 +1,30 @@
+xmldiff
+=======
+
+``xmldiff`` is a library and a command line utility for making diffs out of XML.
+This may seem like something that doesn't need a dedicated utility,
+but change detection in hierarchical data is very different from change detection in flat data.
+XML-type formats are also not only used for computer-readable data;
+they are also often used as formats for hierarchical data that can be rendered into human-readable form.
+A traditional diff on such a format would tell you the differences line by line,
+but this would not be readable by a human.
+This library provides tools to make human readable diffs in those situations.
+
+Contents:
+
+.. toctree::
+   :maxdepth: 2
+
+   installation
+   commandline
+   api
+   contributing
+
+
+Indices and tables
+==================
+
+* :ref:`genindex`
+* :ref:`modindex`
+* :ref:`search`
+
diff --git a/docs/source/installation.rst b/docs/source/installation.rst
new file mode 100644
index 0000000..8b423c9
--- /dev/null
+++ b/docs/source/installation.rst
@@ -0,0 +1,20 @@
+Installation
+============
+
+``xmldiff`` is a standard Python package and can be installed in all the ways Python packages normally can be installed.
+The most common way is to use ``pip``::
+
+    pip install xmldiff
+
+You can also download the latest version from `The Cheeseshop a.k.a. PyPI <https://pypi.org/project/xmldiff/>`_,
+unpack it with your favourite unpacking tool and then run::
+
+    python setup.py install
+
+That's it, ``xmldiff`` should now be available for you to use.
+
+Several Unix distributions also include xmldiff so you can install it with your distribution's package manager.
+Be aware that currently most distribute an earlier version, +typically 0.6.10, which is very different from 2.x, +which this documentation is written for. +You can check this by running ``xmldiff --version``. diff --git a/extensions/maplookup.c b/extensions/maplookup.c deleted file mode 100644 index 6d865b5..0000000 --- a/extensions/maplookup.c +++ /dev/null @@ -1,413 +0,0 @@ -#include "Python.h" -#include -#include - -char * __revision__ = "$Id: maplookup.c,v 1.12 2005-06-29 06:49:12 alf Exp $"; - -#if PY_MAJOR_VERSION >= 3 -#define INT_AS_LONG(x) PyLong_AS_LONG(x) -#else -#define INT_AS_LONG(x) PyInt_AS_LONG(x) -#endif - - -/* PYTHON EQUIVALENCES - # def _has_couple(couple, mapping): - # for a,b in mapping: - # if b is couple[1] and a is couple[0]: - # return TRUE - # return FALSE - - # def _partner(index, node, mapping): - # for i in mapping: - # if i[index] is node: - # return i[1-index] - # return None - - # def fmes_node_equal(self, n1, n2): - # """ function to compare subtree during mapping """ - # hk1, hk2 = self._d1.has_key, self._d2.has_key - # mapping = self._mapping - # # factor 2.5 for tree expansion compensation - # length = 0 - # i = 0 - # for a,b in mapping: - # i += 1 - # if hk1((id(n1), id(a))): - # if hk2((id(n2), id(b))): - # length += 1 - ## length = len([a for a,b in mapping - ## if hk1((id(n1), id(a))) and hk2((id(n2), id(b)))]) - # fact = 2.5*length/float(max(n1[N_ISSUE], n2[N_ISSUE])) - # if fact >= self.T: - # return TRUE - # return FALSE -*/ - - -/******************* functions specific to the fmes algorithm *****************/ -static short N_ISSUE = 5 ; - -/* function to init objects for the next functions - * - * arguments are (*mapping, *cache_dict1, *cache_dict2, T) - */ -static PyObject *_mapping, *_dict1, *_dict2 ; -static double _T_treshold ; - -static void free_dicts(void) -{ - Py_XDECREF(_dict1) ; - _dict1 = NULL ; - Py_XDECREF(_dict2) ; - _dict2 = NULL ; -} - -static void free_global(void) -{ - Py_XDECREF(_mapping) ; - _mapping = NULL ; - free_dicts() ; -} - - -static PyObject *fmes_init(PyObject *self, PyObject *args) -{ - free_global() ; - if (!PyArg_ParseTuple(args, "OOOd", &_mapping, &_dict1, &_dict2, &_T_treshold)) - return NULL ; - Py_INCREF(_mapping) ; - Py_INCREF(_dict1) ; - Py_INCREF(_dict2) ; - Py_INCREF(Py_None) ; - return Py_None ; -} - -static PyObject *fmes_end(PyObject *self, PyObject *args) -{ - free_global() ; - Py_INCREF(Py_None) ; - return Py_None ; -} - -static PyObject *match_end(PyObject *self, PyObject *args) -{ - free_dicts() ; - Py_INCREF(Py_None) ; - return Py_None ; -} - - -/* look in mapping's couples for an occurence of couple - * return 1 if found, None either - */ -static PyObject *has_couple(PyObject *self, PyObject *args) -{ - PyObject *object1, *object2, *couple; - int i; - if (!PyArg_ParseTuple(args, "OO", &object1, &object2)) - return NULL; - for (i=0; i= _T_treshold) - { - return Py_BuildValue("i", 1) ; - } - else - { - Py_INCREF(Py_None); - return Py_None; - } -} - - -static int is_equal( PyObject* equal, PyObject* X, int x, PyObject* Y, int y ) -{ - PyObject *ob1, *ob2, *args, *res; - ob1 = PySequence_GetItem( X, x ); - ob2 = PySequence_GetItem( Y, y ); - args = Py_BuildValue( "NN", ob1, ob2 ); - res = PyObject_CallObject( equal, args ); - Py_DECREF( args ); - return PyObject_IsTrue( res ); -} - -static int get_v( int* V, int d, int k ) -{ - /* accessor function for V[d][K] which is stored as a triangle matrix */ - return V[d+k+1]; -} - -static void set_v( int* V, int d, int k, int v ) -{ - /* accessor function for 
V[d][K] which is stored as a triangle matrix */ - V[d+k+1] = v; -} - -static PyObject* lcs2( PyObject* self, PyObject* args ) -{ - PyObject *X, *Y, *equal, *result, *ox, *oy; - int N, M, max, D, k, x, y, nmax; - int **V, *res, *resp; - int xs, idx, v_up, v_down; - - if (!PyArg_ParseTuple( args, "OOO", &X,&Y,&equal )) - return NULL; - if (!PySequence_Check(X) || !PySequence_Check(Y)) { - PyErr_SetString( PyExc_TypeError, "First two args must be sequences" ); - return NULL; - } - N = PySequence_Length(X); - M = PySequence_Length(Y); - max = N + M; - V = (int**)malloc( (2*max+1)*sizeof(int*) ); - memset( V, 0, (2*max+1)*sizeof(int*) ); - V[0] = (int*)malloc( 3*sizeof(int) ); - memset( V[0], 0, 3*sizeof(int) ); - nmax = (N>M ? N : M ); - res = (int*)malloc( nmax*sizeof(int)*2 ); - - for(D=0;D<=max;++D) { - V[D+1] = (int*)malloc( (2*D+5)*sizeof(int) ); - memset( V[D+1], 0, (2*D+5)*sizeof(int) ); - for(k=-D;k<=D;k+=2) { - v_up = get_v( V[D], D, k+1 ); - v_down = get_v( V[D], D, k-1 ); - if ( k==-D || ( k != D && v_down < v_up) ) { - x = v_up; - } else { - x = v_down + 1; - } - y = x - k; - while ( x < N && y < M && is_equal( equal, X, x, Y, y ) ) { - x += 1; - y += 1; - /*common = 0;*/ - } - set_v(V[D+1],D+1,k,x); - if ( x >= N && y >= M ) { - /* build the maximal path */ - k = N-M; - resp = res; - x = N; - y = M; - while(D>=0) { - v_up = get_v( V[D], D, k+1 ); - v_down = get_v( V[D], D, k-1 ); - if ( k==-D || ( k != D && v_down < v_up) ) { - xs = v_up; - k = k + 1; - } else { - xs = v_down + 1; - k = k - 1; - } - while( x>xs ) - { - *resp++ = --x; - *resp++ = --y; - } - x = get_v(V[D],D,k); - y = x - k; - D -= 1; - } - /* now builds the python list from res */ - result = PyList_New( (resp-res)/2 ); - idx = 0; - resp-=2; - while(resp>=res) { - ox = PySequence_GetItem( X, resp[0] ); - oy = PySequence_GetItem( Y, resp[1] ); - PyList_SetItem( result, idx, Py_BuildValue("NN",ox,oy) ); - resp-=2; - idx+=1; - } - for(idx=0;idx= 3 -#define GETSTATE(m) ((struct module_state*)PyModule_GetState(m)) -#else -#define GETSTATE(m) (&_state) -static struct module_state _state; -#endif - -static PyMethodDef maplookup_methods[] = { - {"has_couple", has_couple, METH_VARARGS}, - {"partner", partner, METH_VARARGS}, - {"fmes_init", fmes_init, METH_VARARGS}, - {"fmes_end", fmes_end, METH_VARARGS}, - {"match_end", match_end, METH_VARARGS}, - {"fmes_node_equal", fmes_node_equal, METH_VARARGS}, - {"lcs2", lcs2, METH_VARARGS}, - {NULL, NULL} /* Sentinel */ -}; - -#if PY_MAJOR_VERSION >= 3 - -static int maplookup_traverse(PyObject *m, visitproc visit, void *arg) { - Py_VISIT(GETSTATE(m)->error); - return 0; -} - -static int maplookup_clear(PyObject *m) { - Py_CLEAR(GETSTATE(m)->error); - return 0; -} - - -static struct PyModuleDef moduledef = { - PyModuleDef_HEAD_INIT, - "maplookup", - NULL, - sizeof(struct module_state), - maplookup_methods, - NULL, - maplookup_traverse, - maplookup_clear, - NULL -}; - -#define INITERROR return NULL - -PyMODINIT_FUNC -PyInit_maplookup(void) - -#else -#define INITERROR return - -void -initmaplookup(void) -#endif -{ -#if PY_MAJOR_VERSION >= 3 - PyObject *module = PyModule_Create(&moduledef); -#else - PyObject *module = Py_InitModule("maplookup", maplookup_methods); -#endif - - if (module == NULL) - INITERROR; - struct module_state *st = GETSTATE(module); - - st->error = PyErr_NewException("maplookup.Error", NULL, NULL); - if (st->error == NULL) { - Py_DECREF(module); - INITERROR; - } - -#if PY_MAJOR_VERSION >= 3 - return module; -#endif -} diff --git a/setup.cfg b/setup.cfg new file mode 
100644 index 0000000..3c6e79c --- /dev/null +++ b/setup.cfg @@ -0,0 +1,2 @@ +[bdist_wheel] +universal=1 diff --git a/setup.py b/setup.py index 5ccd6ce..65f22b5 100644 --- a/setup.py +++ b/setup.py @@ -1,95 +1,48 @@ -# Copyright (c) 2000-2010 LOGILAB S.A. (Paris, FRANCE). -# http://www.logilab.fr/ -- mailto:contact@logilab.fr -# Copyright (c) 2018 Shoobx.com. -# https://www.shoobx.com/ -- mailto:dev@shoobx.com -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . -"""Setup -""" -import os +from io import open from setuptools import setup, find_packages +version = '2.0b2.dev0' -def read(*rnames): - with open(os.path.join(os.path.dirname(__file__), *rnames), 'rb') as f: - return f.read().decode('utf-8') +with open('README.rst', 'rt', encoding='utf8') as readme: + description = readme.read() +with open('CHANGES.rst', 'rt', encoding='utf8') as changes: + history = changes.read() -TESTS_REQUIRE = [ - 'coverage', - 'lxml', - 'mock', - 'pytest', - 'pytest-cov', - 'flake8', - ] -try: - from distutils.core import Extension - ext_modules = [Extension('xmldiff.maplookup', - ['extensions/maplookup.c'])] -except: - ext_modules = [] - - -setup( - name='xmldiff', - version='1.1.2.dev0', - author="Logilab and Shoobx Team", - author_email="dev@shoobx.com", - url='https://github.com/Shoobx/xmldiff', - description=('Tree 2 tree correction between xml documents. ' - 'Extract differences between two xml files. 
' - 'It returns a set of primitives to apply on source tree ' - 'to obtain the destination tree.'), - long_description=( - read('README.rst') + - '\n\n' + - read('CHANGES.rst') - ), - license='LGPL', - keywords=['xml', 'diff', 'xmldiff', 'tree 2 tree'], - classifiers=[ - 'Development Status :: 3 - Alpha', - 'Intended Audience :: Developers', - 'Programming Language :: Python', - 'Programming Language :: Python :: 2', - 'Programming Language :: Python :: 2.7', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.5', - 'Programming Language :: Python :: 3.6', - 'Framework :: ZODB', - 'License :: OSI Approved :: GNU Library or Lesser General Public License (LGPL)', - 'Natural Language :: English', - 'Operating System :: OS Independent'], - packages=find_packages('src'), - package_dir={'': 'src'}, - extras_require=dict( - test=TESTS_REQUIRE, - ), - install_requires=[ - 'future', - 'lxml', - 'six', - 'setuptools', - ], - include_package_data=True, - zip_safe=False, - entry_points=''' - [console_scripts] - xmldiff = xmldiff.main:run - ''', - ext_modules=ext_modules, - tests_require=TESTS_REQUIRE, +setup(name='xmldiff', + version=version, + description="Creates diffs of XML files", + long_description=description + '\n' + history, + # Get strings from http://pypi.python.org/pypi?%3Aaction=list_classifiers + classifiers=['Development Status :: 3 - Alpha', + 'Topic :: Text Processing :: Markup :: XML', + 'Operating System :: OS Independent', + 'Programming Language :: Python :: 2', + 'Programming Language :: Python :: 2.7', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.5', + 'Programming Language :: Python :: 3.6', + 'Programming Language :: Python :: 3.7', + 'License :: OSI Approved :: MIT License', + ], + keywords='xml html diff', + author='Lennart Regebro', + author_email='lregebro@shoobx.com', + url='https://github.com/regebro/xmldiff2', + license='MIT', + packages=find_packages(exclude=['doc', 'tests']), + include_package_data=True, + zip_safe=False, + install_requires=[ + 'setuptools', + 'lxml>=3.1.0', + 'six', + ], + test_suite='tests', + entry_points={ + 'console_scripts': [ + 'xmldiff = xmldiff.main:run', + ], + }, ) diff --git a/src/xmldiff/__init__.py b/src/xmldiff/__init__.py deleted file mode 100644 index 16bc1e3..0000000 --- a/src/xmldiff/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright (c) 2000-2010 LOGILAB S.A. (Paris, FRANCE). -# http://www.logilab.fr/ -- mailto:contact@logilab.fr -# Copyright (c) 2018 Shoobx.com. -# https://www.shoobx.com/ -- mailto:dev@shoobx.com -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . diff --git a/src/xmldiff/difflib.py b/src/xmldiff/difflib.py deleted file mode 100644 index b2723a6..0000000 --- a/src/xmldiff/difflib.py +++ /dev/null @@ -1,162 +0,0 @@ -# Copyright (c) 2000-2010 LOGILAB S.A. (Paris, FRANCE). -# http://www.logilab.fr/ -- mailto:contact@logilab.fr -# Copyright (c) 2018 Shoobx.com. 
-# https://www.shoobx.com/ -- mailto:dev@shoobx.com -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . -""" -longest common subsequence algorithm - -the algorithm is describe in "An O(ND) Difference Algorithm and its Variation" -by Eugene W. MYERS - -As opposed to the algorithm in difflib.py, this one doesn't require hashable -elements -""" - -from six.moves import range - - -def lcs2(X, Y, equal): - """ - apply the greedy lcs/ses algorithm between X and Y sequence - (should be any Python's sequence) - equal is a function to compare X and Y which must return 0 if - X and Y are different, 1 if they are identical - return a list of matched pairs in tuplesthe greedy lcs/ses algorithm - """ - N, M = len(X), len(Y) - if not X or not Y: - return [] - max = N + M - v = [0 for i in range(2 * max + 1)] - common = [[] for i in range(2 * max + 1)] - for D in range(max + 1): - for k in range(-D, D + 1, 2): - if k == -D or k != D and v[k - 1] < v[k + 1]: - x = v[k + 1] - common[k] = common[k + 1][:] - else: - x = v[k - 1] + 1 - common[k] = common[k - 1][:] - - y = x - k - while x < N and y < M and equal(X[x], Y[y]): - common[k].append((x, y)) - x += 1 - y += 1 - - v[k] = x - if x >= N and y >= M: - return [(X[xl], Y[yl]) for xl, yl in common[k]] - - -def lcs4(X, Y, equal): - """ - apply the greedy lcs/ses algorithm between X and Y sequence - (should be any Python's sequence) - equal is a function to compare X and Y which must return 0 if - X and Y are different, 1 if they are identical - return a list of matched pairs in tuplesthe greedy lcs/ses algorithm - """ - N, M = len(X), len(Y) - if not X or not Y: - return [] - max = N + M - v = [0 for i in range(2 * max + 1)] - vl = [v] - for D in range(max + 1): - for k in range(-D, D + 1, 2): - if k == -D or k != D and v[k - 1] < v[k + 1]: - x = v[k + 1] - else: - x = v[k - 1] + 1 - - y = x - k - while x < N and y < M and equal(X[x], Y[y]): - x += 1 - y += 1 - - v[k] = x - if x >= N and y >= M: - # reconstruction du chemin - vl.append(v) - path = [] - k = N - M - - while vl: - oldv = vl.pop(-1) - if k == -D or k != D and oldv[k - 1] < oldv[k + 1]: - xs = oldv[k + 1] - k = k + 1 - else: - xs = oldv[k - 1] + 1 - k = k - 1 - # print "-> x=%d y=%d v=%r ok=%d k=%d xs=%d D=%d" % ( - # x,y,oldv,oldk,k,xs,D) - while x > xs: - x -= 1 - y -= 1 - # print "(%d,%d)" % (x,y) - path.append((X[x], Y[y])) - D -= 1 - x = oldv[k] - y = x - k - # print "<- x=%d y=%d v=%r ok=%d k=%d xs=%d D=%d" % ( - # x,y,oldv,oldk,k,xs,D) - # print x,y,deltax,deltay,oldv, oldk, k - path.reverse() - return path - vl.append(v[:]) - - -# save the reference for tests -lcs2_python = lcs2 -have_c_extension = False - -try: - import xmldiff.maplookup - lcs2 = xmldiff.maplookup.lcs2 - have_c_extension = True -except ImportError: # pragma: no cover - pass - - -def quick_ratio(a, b): - """ - optimized version of the standard difflib.py quick_ration - (without junk and class) - Return an upper bound on ratio() relatively quickly. 
- """ - # viewing a and b as multisets, set matches to the cardinality - # of their intersection; this counts the number of matches - # without regard to order, so is clearly an upper bound - if not a and not b: - return 1 - fullbcount = {} - for elt in b: - fullbcount[elt] = fullbcount.get(elt, 0) + 1 - # avail[x] is the number of times x appears in 'b' less the - # number of times we've seen it in 'a' so far ... kinda - avail = {} - matches = 0 - for elt in a: - if elt in avail: - numb = avail[elt] - else: - numb = fullbcount.get(elt, 0) - avail[elt] = numb - 1 - if numb > 0: - matches = matches + 1 - return 2.0 * matches / (len(a) + len(b)) diff --git a/src/xmldiff/fmes.py b/src/xmldiff/fmes.py deleted file mode 100644 index 292bed5..0000000 --- a/src/xmldiff/fmes.py +++ /dev/null @@ -1,439 +0,0 @@ -# Copyright (c) 2000-2010 LOGILAB S.A. (Paris, FRANCE). -# http://www.logilab.fr/ -- mailto:contact@logilab.fr -# Copyright (c) 2018 Shoobx.com. -# https://www.shoobx.com/ -- mailto:dev@shoobx.com -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . -""" - this file provides the fast match / edit script (fmes) tree to tree correction - algorithm as described in "Change detection in hierarchically structured - information" by S. Chawathe, A. Rajaraman, H. Garcia-Molina and J. Widom - ([CRGMW95]) -""" - -from xmldiff.objects import ( - NT_ROOT, NT_NODE, NT_ATTN, NT_ATTV, NT_TEXT, N_TYPE, N_NAME, N_VALUE, - N_CHILDS, N_PARENT, NSIZE) -from xmldiff.objects import ( - get_labels, get_ancestors, make_bfo_list, insert_node, delete_node, - rename_node, get_pos, f_xpath, nb_attrs) -from xmldiff.difflib import lcs2, quick_ratio -from xmldiff.misc import intersection, in_ref, index_ref - -# c extensions -from xmldiff.maplookup import ( - has_couple, partner, fmes_init, fmes_node_equal, match_end, fmes_end) - -# node's attributes for fmes algorithm -N_INORDER = NSIZE -N_MAPPED = N_INORDER + 1 - - -def _init_tree(tree, map_attr=None): - """ recursively append N_INORDER attribute to tree - optionnaly add the N_MAPPED attribute (for node from tree 1) - """ - tree.append(False) - if map_attr is not None: - tree.append(False) - for child in tree[N_CHILDS]: - _init_tree(child, map_attr) - - -## FMES TREE 2 TREE ALGORITHM ################################################# -class FmesCorrector: - """ - Fast Match / Edit Script implementation - - See [CRGMW95] for reference. 
- """ - - def __init__(self, formatter, f=0.6, t=0.5): # f=0,59 - # algorithm parameters - if f > 1 or f < 0 or t > 1 or t < 0.5: - raise Exception('Invalid parameters: 1 > f > 0 and 1 > t > 0.5') - self.F = f - self.T = t - self._formatter = formatter - - def process_trees(self, tree1, tree2): - """ - Process the two trees - """ - # add needed attribute (INORDER) - _init_tree(tree1, map_attr=1) - _init_tree(tree2) - # print '**** TREE 2' - # print node_repr(tree2) - # print '**** TREE 1' - # print node_repr(tree1) - # attributes initialisation - self._mapping = [] # empty mapping - self.add_action = self._formatter.add_action - self._d1, self._d2 = {}, {} - # give references to the C extensions specific to fmes - fmes_init(self._mapping, self._d1, self._d2, self.T) - self._dict = {} - self._tmp_attrs_dict = {} - self._pending = [] - # step 0: mapping - self._fast_match(tree1, tree2) - # free matching variables - match_end() - del self._d1 - del self._d2 - # step 1: breadth first search tree2 - self._fmes_step1(tree2, tree1) - # step 2: post order traversal tree1 - self._fmes_step2(tree1, tree2) - # step 3: rename tmp attributes - for tmp_name, real_name in self._tmp_attrs_dict.items(): - self.add_action(['rename', '//@%s' % tmp_name, real_name]) - # free mapping ref in C extensions - fmes_end() - self._formatter.end() - - ## Private functions ###################################################### - def _fast_match(self, tree1, tree2): - """ the fast match algorithm - try to resolve the 'good matching problem' """ - labl1, labl2 = {}, {} - leaf_labl1, leaf_labl2 = {}, {} - # chain all nodes with a given label l in tree T together - get_labels(tree1, labl1, leaf_labl1) - get_labels(tree2, labl2, leaf_labl2) - # do the matching job - self._match(leaf_labl1, leaf_labl2, self._l_equal) - # remove roots ('/') from labels - del labl1['R'] - del labl2['R'] - # append roots to mapping - self._mapping.append((tree1, tree2)) - # mark node as mapped - tree1[N_MAPPED] = True - self._match(labl1, labl2, fmes_node_equal) # self._n_equal - - def _match(self, lab_l1, lab_l2, equal): - """do the actual matching""" - d1, d2 = self._d1, self._d2 - # for each leaf label in both tree1 and tree2 - # sort list to avoid differences between python version - labls = sorted(intersection(lab_l1.keys(), lab_l2.keys())) - for label in labls: - s1 = lab_l1[label] - s2 = lab_l2[label] - # compute the longest common subsequence - common = lcs2(s1, s2, equal) - # for each pair of nodes (x,y) in the lcs - for x, y in common: - # add (x,y) to the mapping - self._mapping.append((x, y)) - # mark node from tree 1 as mapped - x[N_MAPPED] = True - # fill the mapping cache - for n in get_ancestors(x, []): - d1[(id(n), id(x))] = 1 - for n in get_ancestors(y, []): - d2[(id(n), id(y))] = 1 - - def _fmes_step1(self, tree2, tree1): - """ first step of the edit script algorithm - combines the update, insert, align and move phases - """ - # x the current node in the breadth-first order traversal - for x in make_bfo_list(tree2): - y = x[N_PARENT] - z = partner(1, y) - w = partner(1, x) - # insert - if not w: - todo = 1 - # avoid to add existing attribute node - if x[N_TYPE] == NT_ATTN: - for w in z[N_CHILDS]: - if w[N_TYPE] != NT_ATTN: - break - elif w[N_VALUE] == x[N_VALUE]: - # FIXME: what if w or w[N_CHILDS][0] yet mapped ?? 
- if not w[N_MAPPED]: - todo = None - w[N_MAPPED] = True - self._mapping.append((w, x)) - # print 'delete 1' - # if not w[N_CHILDS][0]: - delete_node(w[N_CHILDS][0]) - break - - if todo is not None: - x[N_INORDER] = True - k = self._find_pos(x) - # w = copy(x) - w = x[:] - w[N_CHILDS] = [] - w.append(True) # <-> w[N_MAPPED] = True - self._mapping.append((w, x)) - # avoid coalescing two text nodes - if w[N_TYPE] == NT_TEXT: - k = self._before_insert_text(z, w, k) - # real insert on tree 1 - insert_node(z, w, k) - # make actions on subtree - self._dict[id(w)] = ww = w[:] - ww[N_CHILDS] = [] - # preformat action - if id(z) not in self._dict: - if w[N_TYPE] == NT_ATTV: - action = ['update', f_xpath(z), w[N_VALUE]] - elif w[N_TYPE] == NT_ATTN: - action = ['append', f_xpath(z), ww] - elif z[N_TYPE] == NT_ROOT: - action = ['append-first', '/', ww] - else: - k = get_pos(w) - if k <= nb_attrs(z): - action = ['append-first', - f_xpath(z), ww] - else: - action = ['insert-after', - f_xpath(z[N_CHILDS][k - 1]), ww] - self.add_action(action) - else: - insert_node(self._dict[id(z)], ww, k) - elif x[N_NAME] != '/': - v = w[N_PARENT] - # update - if w[N_VALUE] != x[N_VALUE]: - needs_rename = True - # format action - if w[N_TYPE] == NT_NODE: - self.add_action(['rename', f_xpath(w), x[N_VALUE]]) - elif w[N_TYPE] == NT_ATTN: - attr_name = self._before_attribute(w[N_PARENT], w, - x[N_VALUE]) - self.add_action(['rename', f_xpath(w), attr_name]) - x[N_NAME] = '@%sName' % attr_name - x[N_VALUE] = attr_name - else: - self.add_action(['update', f_xpath(w), x[N_VALUE]]) - # We are simply replacing the main text node, so no - # need to rename. - needs_rename = False - # real update on t1 - w[N_VALUE] = x[N_VALUE] - # this is necessary for xpath, but do not rename on simple - # text update. - if needs_rename: - rename_node(w, x[N_NAME]) - # move x if parents not mapped together - if not has_couple(v, y): - x[N_INORDER] = True - k = self._find_pos(x) - self._make_move(w, z, k) - # align children - self._align_children(w, x) - # print 'after', node_repr(tree1) - - def _fmes_step2(self, tree1, tree2): - """ the delete_node phase of the edit script algorithm - - instead of the standard algorithm, walk on tree1 in pre order and - add a remove action on node not marked as mapped. 
- Avoiding recursion on these node allow to extract remove on subtree - instead of leaf - - do not use next_sibling for performance issue - """ - stack = [] - i = 0 - node = tree1 - while node is not None: - if not node[N_MAPPED]: - if node[N_PARENT] and len(node[N_PARENT][N_CHILDS]) > i + 1: - next_node = node[N_PARENT][N_CHILDS][i + 1] - # if next node is a text node to remove, switch actions - if next_node[N_TYPE] == NT_TEXT and \ - not next_node[N_MAPPED]: - self.add_action(['remove', f_xpath(next_node)]) - delete_node(next_node) - try: - next_node = node[N_PARENT][N_CHILDS][i + 1] - except IndexError: - next_node = None - else: - next_node = None - self.add_action(['remove', f_xpath(node)]) - delete_node(node) - node = next_node - elif node[N_CHILDS]: - # push next sibbling on the stack - if node[N_PARENT] and len(node[N_PARENT][N_CHILDS]) > i + 1: - stack.append((node[N_PARENT][N_CHILDS][i + 1], i + 1)) - node = node[N_CHILDS][0] - i = 0 - elif node[N_PARENT] and len(node[N_PARENT][N_CHILDS]) > i + 1: - i += 1 - node = node[N_PARENT][N_CHILDS][i] # next_sibling(node) - else: - node = None - if node is None and stack: - node, i = stack.pop() - - def _align_children(self, w, x): - """ align children to correct misaligned nodes - """ - # mark all children of w an d as "out of order" - self._childs_out_of_order(w) - self._childs_out_of_order(x) - # s1: children of w whose partner is children of x - s1 = [n for n in w[N_CHILDS] if in_ref(x[N_CHILDS], partner(0, n))] - # s2: children of x whose partners are children of w - s2 = [n for n in x[N_CHILDS] if in_ref(w[N_CHILDS], partner(1, n))] - # compute the longest common subsequence - s = lcs2(s1, s2, has_couple) - # mark each (a,b) from lcs in order - for a, b in s: - a[N_INORDER] = b[N_INORDER] = True - s1.pop(index_ref(s1, a)) - # s: a E T1, b E T2, (a,b) E M, (a;b) not E s - for a in s1: - b = partner(0, a) - # mark a and b in order - a[N_INORDER] = b[N_INORDER] = True - k = self._find_pos(b) - self._make_move(a, w, k) - - def _find_pos(self, x): - """ find the position of a node in the destination tree (tree2) - - do not use previous_sibling for performance issue - """ - y = x[N_PARENT] - # if x is the leftmost child of y in order, return 1 - for v in y[N_CHILDS]: - if v[N_INORDER]: - if v is x: - # return 0 instead of 1 here since the first element of a - # list have index 0 - return 0 - break - # looking for rightmost left sibling of y INORDER - i = get_pos(x) - 1 - while i >= 0: - v = y[N_CHILDS][i] - if v[N_INORDER]: - break - i -= 1 - u = partner(1, v) - if u is not None: - return get_pos(u) + 1 - - def _make_move(self, n1, n2, k): - # avoid coalescing two text nodes - act_node = self._before_delete_node(n1) - if act_node is not None and act_node[0] is n2 and act_node[1] < k: - k += 1 - if n1[N_TYPE] == NT_TEXT: - k = self._before_insert_text(n2, n1, k) - if k <= nb_attrs(n2): - self.add_action(['move-first', n1, n2]) - else: - self.add_action(['move-after', n1, n2[N_CHILDS][k - 1]]) - elif n1[N_TYPE] == NT_ATTN: - # avoid to move an attribute node from a place to another on - # the same node - if not n1[N_PARENT] is n2: - old_name = n1[N_VALUE] - new_name = self._before_attribute(n2, n1) - if new_name != old_name: - self.add_action(['remove', f_xpath(n1)]) - n1[N_NAME] = '@%sName' % new_name - n1[N_VALUE] = new_name - self.add_action(['append', f_xpath(n2), n1]) - else: - self.add_action(['move-first', n1, n2]) - elif k <= nb_attrs(n2): - self.add_action(['move-first', n1, n2]) - else: - self.add_action(['move-after', n1, 
n2[N_CHILDS][k - 1]]) - # real move - delete_node(n1) - insert_node(n2, n1, k) - - def _before_attribute(self, parent_node, attr_node, new_name=None): - attr_name = new_name or attr_node[N_VALUE] - for w in parent_node[N_CHILDS]: - if w[N_TYPE] != NT_ATTN: - break - if w[N_VALUE] == attr_name: - new_name = 'LogilabXmldiffTmpAttr%s' % attr_name.replace(':', - '_') - self._tmp_attrs_dict[new_name] = attr_name - return new_name - return attr_name - - FAKE_TAG = [NT_NODE, 'LogilabXMLDIFFFAKETag', 'LogilabXMLDIFFFAKETag', - [], None, 0, 0, None, True, False] - - def _before_insert_text(self, parent, new_text, k): - """ check if a text node that will be remove has two sibbling text - nodes to avoid coalescing two text nodes - """ - if k > 1: - if parent[N_CHILDS][k - 1][N_TYPE] == NT_TEXT: - tag = self.FAKE_TAG[:] - self.add_action(['insert-after', - f_xpath(parent[N_CHILDS][k - 1]), tag]) - insert_node(parent, tag, k) - return k + 1 - if k < len(parent[N_CHILDS]): - if parent[N_CHILDS][k][N_TYPE] == NT_TEXT: - tag = self.FAKE_TAG[:] - if k <= nb_attrs(parent): - self.add_action(['append-first', f_xpath(parent), tag]) - else: - self.add_action(['insert-after', - f_xpath(parent[N_CHILDS][k - 1]), tag]) - insert_node(parent, tag, k) - return k - - def _before_delete_node(self, node): - """ check if a text node will be inserted with a sibbling text node to - avoid coalescing two text nodes - """ - k = get_pos(node) - parent = node[N_PARENT] - if k >= 1 and k + 1 < len(parent[N_CHILDS]): - if parent[N_CHILDS][k - 1][N_TYPE] == NT_TEXT and \ - parent[N_CHILDS][k + 1][N_TYPE] == NT_TEXT: - tag = self.FAKE_TAG[:] - self.add_action(['insert-after', - f_xpath(parent[N_CHILDS][k - 1]), tag]) - insert_node(parent, tag, k) - return parent, k - return None - - def _childs_out_of_order(self, subtree): - """ initialisation function : tag all the subtree as unordered """ - for child in subtree[N_CHILDS]: - child[N_INORDER] = False - self._childs_out_of_order(child) - - def _l_equal(self, n1, n2): - """ function to compare leafs during mapping """ - ratio = quick_ratio(n1[N_VALUE], n2[N_VALUE]) - if ratio > self.F: - # print 'MATCH (%s): %s / %s' %(ratio, n1[N_VALUE],n2[N_VALUE]) - return True - # print 'UNMATCH (%s): %s / %s' %(ratio, n1[N_VALUE],n2[N_VALUE]) - return False diff --git a/src/xmldiff/format.py b/src/xmldiff/format.py deleted file mode 100644 index 23bb6e1..0000000 --- a/src/xmldiff/format.py +++ /dev/null @@ -1,95 +0,0 @@ -# Copyright (c) 2000-2010 LOGILAB S.A. (Paris, FRANCE). -# http://www.logilab.fr/ -- mailto:contact@logilab.fr -# Copyright (c) 2018 Shoobx.com. -# https://www.shoobx.com/ -- mailto:dev@shoobx.com -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . 
-""" -this module provides classes to format the native tree2tree output -""" - -import logging -import sys -try: - from xml.dom import EMPTY_NAMESPACE as NO_NS -except ImportError: # pragma: no cover - NO_NS = None -from xmldiff.objects import A_N1, A_N2, A_DESC, xml_print, f_xpath - - -## Formatter interface ######################################################## -class AbstractFormatter(object): - """ - Formatter interface - """ - def __init__(self, stream=None): - self.edit_s = [] - self._stream = stream - - def init(self, stream=None): - """ method called before the begining of the tree 2 tree correction """ - logging.warning("The init() method of Formatters is deprecated. Set the " - "stream with __init__() instead.") - if self._stream is None and stream is not None: - self._stream = stream - - def add_action(self, action): - """ method called when an action is added to the edit script """ - self.edit_s.append(action) - - def format_action(self, action): - """ method called by end() to format each action in the edit script - at least this method should be overridden - """ - raise NotImplementedError() # pragma: no cover - - def end(self): - """ method called at the end of the tree 2 tree correction """ - for action in self.edit_s: - self.format_action(action) - - -## Internal Formatter ######################################################### -class InternalPrinter(AbstractFormatter): - """ print actions in the internal format """ - - def add_action(self, action): - """ - See AbstractFormatter interface - """ - if len(action) > 2 and isinstance(action[A_N2], list): - if isinstance(action[A_N1], list): - # swap or move node - action[A_N1] = f_xpath(action[A_N1]) - action[A_N2] = f_xpath(action[A_N2]) - super(InternalPrinter, self).add_action(action) - - def format_action(self, action): - """ - See AbstractFormatter interface - """ - stream = self._stream - if stream is None: - stream = sys.stdout - - if len(action) > 2 and isinstance(action[A_N2], list): - stream.write('[%s, %s,\n' % (action[A_DESC], action[A_N1])) - xml_print(action[A_N2], stream=stream) - stream.write("]\n") - elif len(action) > 2: - stream.write('[%s, %s, %s]\n' % (action[A_DESC], - action[A_N1], - action[A_N2])) - else: - stream.write('[%s, %s]\n' % (action[A_DESC], action[A_N1])) diff --git a/src/xmldiff/input.py b/src/xmldiff/input.py deleted file mode 100644 index 89af047..0000000 --- a/src/xmldiff/input.py +++ /dev/null @@ -1,63 +0,0 @@ -# Copyright (c) 2000-2010 LOGILAB S.A. (Paris, FRANCE). -# http://www.logilab.fr/ -- mailto:contact@logilab.fr -# Copyright (c) 2018 Shoobx.com. -# https://www.shoobx.com/ -- mailto:dev@shoobx.com -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . -""" Provides functions for converting DOM tree or xml file in order to process -it with xmldiff functions. 
""" - -from xml.sax import make_parser, SAXNotRecognizedException -from xml.sax.handler import feature_namespaces, feature_external_ges -from xml.sax.handler import feature_external_pes, property_lexical_handler - -from xmldiff.parser import SaxHandler - - -def tree_from_stream(stream, - norm_sp=1, ext_ges=0, ext_pes=0, include_comment=1, - html=0): - """ - create internal tree from xml stream (open file or StringIO) - if norm_sp = 1, normalize space and new line - """ - handler = SaxHandler(norm_sp, include_comment) - if html: - parser = make_parser(["xml.sax.drivers2.drv_sgmlop_html"]) - else: - parser = make_parser() - # do perform Namespace processing - parser.setFeature(feature_namespaces, 1) - # do not include any external entities - try: - parser.setFeature(feature_external_ges, ext_ges) - except SAXNotRecognizedException: # pragma: no cover - print('Unable to set feature external ges') - try: - parser.setFeature(feature_external_pes, ext_pes) - except SAXNotRecognizedException: # pragma: no cover - print('Unable to set feature external pes') - - # add lexical handler for comments, entities, dtd and cdata - parser.setProperty(property_lexical_handler, handler) - parser.setContentHandler(handler) - parser.parse(stream) - return handler.get_tree() - - -def tree_from_lxml(tree, norm_sp=True, include_comment=True): - handler = SaxHandler(norm_sp, include_comment) - import lxml.sax - lxml.sax.saxify(tree, handler) - return handler.get_tree() diff --git a/src/xmldiff/main.py b/src/xmldiff/main.py deleted file mode 100644 index 1a38744..0000000 --- a/src/xmldiff/main.py +++ /dev/null @@ -1,153 +0,0 @@ -# Copyright (c) 2000-2010 LOGILAB S.A. (Paris, FRANCE). -# http://www.logilab.fr/ -- mailto:contact@logilab.fr -# Copyright (c) 2018 Shoobx.com. -# https://www.shoobx.com/ -- mailto:dev@shoobx.com -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . - -import argparse -import logging -import sys -import os -import pkg_resources - -from xmldiff.fmes import FmesCorrector -from xmldiff.format import InternalPrinter -from xmldiff.input import tree_from_stream -from xmldiff.misc import process_dirs, list_print -from xmldiff.objects import node_repr, N_ISSUE -from xml.sax import SAXParseException - -logging.basicConfig() - -def process_files(file1, file2, norm_sp, verbose, - ext_ges, ext_pes, include_comment, - html): - """ - Computes the diff between two files. 
- """ - trees = [] - for fname in (file1, file2): - with open(fname, 'r') as fhandle: - try: - tree = tree_from_stream(fhandle, norm_sp, ext_ges, - ext_pes, include_comment, html) - except SAXParseException as err: - print(err) - return -1 - trees.append(tree) - - if verbose: - print('Source tree:\n%s' % node_repr(trees[0])) - print('Destination tree:\n%s' % node_repr(trees[1])) - print('Source tree has %d nodes' % trees[0][N_ISSUE]) - print('Destination tree has %d nodes' % trees[1][N_ISSUE]) - - # output formatter - formatter = InternalPrinter() - # choose and apply tree to tree algorithm - strategy = FmesCorrector(formatter) - strategy.process_trees(*trees) - return len(formatter.edit_s) - - -def parse_args(argv): - package = pkg_resources.get_distribution("xmldiff") - - parser = argparse.ArgumentParser( - description=('Tree 2 tree correction between xml documents. ' - 'Extract differences between two xml files. ' - 'It returns a set of primitives to apply on source tree ' - 'to obtain the destination tree.')) - parser.add_argument('-V', '--version', action='version', - version=package.version) - parser.add_argument('-H', '--html', action='store_true', default=False, - help=('input files are HTML instead of XML.')) - parser.add_argument('-r', '--recursive', action='store_true', - default=False, - help=('when comparing directories, recursively ' - 'compare any subdirectories found.')) - parser.add_argument('-v', '--verbose', action='store_true', default=False) - parser.add_argument('-n', '--not-normalize-spaces', action='store_true', - default=False, - help=('do not normalize spaces and new lines in text ' - 'and comment nodes.')) - parser.add_argument('-c', '--exclude-comments', action='store_true', - default=False, - help=('do not process comment nodes.')) - parser.add_argument('-g', '--ext-ges', action='store_true', default=False, - help=('include all external general (text) entities.')) - parser.add_argument('-p', '--ext-pes', action='store_true', default=False, - help=('include all external parameter entities, ' - 'including the external DTD subset.')) - - parser.add_argument('from_file_or_dir', - help=('in')) - parser.add_argument('to_file_or_dir', - help=('out')) - - args = parser.parse_args(argv) - return args - - -def run(argv=None): - """ - Main. 
To be called with list of command-line arguments (if provided, - args should not contain the executable as first item) - """ - if argv is None: - argv = sys.argv[1:] - - args = parse_args(argv) - - fpath1 = args.from_file_or_dir - fpath2 = args.to_file_or_dir - normalize_spaces = not args.not_normalize_spaces - include_comments = not args.exclude_comments - exit_status = 0 - # if args are directory - if os.path.isdir(fpath1) and os.path.isdir(fpath2): - common, deleted, added = process_dirs(fpath1, fpath2, args.recursive) - - list_print(deleted[0], 'FILE:', 'deleted') - list_print(deleted[1], 'DIRECTORY:', 'deleted') - list_print(added[0], 'FILE:', 'added') - list_print(added[1], 'DIRECTORY:', 'added') - exit_status += sum((len(deleted[0]), len(deleted[1]), - len(added[0]), len(added[1]))) - for filename in common[0]: - print('-' * 80) - print('FILE: %s' % filename) - diffs = process_files( - os.path.join(fpath1, filename), - os.path.join(fpath2, filename), - normalize_spaces, args.verbose, - args.ext_ges, args.ext_pes, include_comments, args.html) - if diffs: - exit_status += diffs - # if args are files - elif os.path.isfile(fpath1) and os.path.isfile(fpath2): - exit_status = process_files( - fpath1, fpath2, - normalize_spaces, args.verbose, - args.ext_ges, args.ext_pes, include_comments, args.html) - else: - exit_status = -1 - print('%s and %s are not comparable, or not directory ' - 'nor regular files' % (fpath1, fpath2)) - sys.exit(exit_status) - - -if __name__ == '__main__': - run() diff --git a/src/xmldiff/misc.py b/src/xmldiff/misc.py deleted file mode 100644 index 9ded4e2..0000000 --- a/src/xmldiff/misc.py +++ /dev/null @@ -1,161 +0,0 @@ -# Copyright (c) 2000-2010 LOGILAB S.A. (Paris, FRANCE). -# http://www.logilab.fr/ -- mailto:contact@logilab.fr -# Copyright (c) 2018 Shoobx.com. -# https://www.shoobx.com/ -- mailto:dev@shoobx.com -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . 
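To make the 1.x entry point deleted above concrete, here is a hedged usage sketch (file names are hypothetical): run() took the argument list without the executable name, accepted two files or two directories, and exited with the number of edit-script actions found.

    from xmldiff.main import run  # the 1.x module removed above

    # Compare two XML files and print the edit script in the
    # internal format; the sys.exit() status is the action count,
    # so 0 means the documents were considered identical.
    run(['--verbose', 'before.xml', 'after.xml'])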
-""" -miscellaneous functions -""" - -import os - - -def process_dirs(dir1, dir2, recursive): - """ - function which return common, added, deleted file from dir1 to dir2 - and remove initial directory from paths - """ - dir1 = normalize_dir(dir1) - dir2 = normalize_dir(dir2) - common, deleted, added = _process_dirs(dir1, dir2, recursive) - # remove prefix - deleted[0] = list(map(_remove_prefix(len(dir1)), deleted[0])) - deleted[1] = list(map(_remove_prefix(len(dir1)), deleted[1])) - added[0] = list(map(_remove_prefix(len(dir2)), added[0])) - added[1] = list(map(_remove_prefix(len(dir2)), added[1])) - common[0] = list(map(_remove_prefix(len(dir1)), common[0])) - common[1] = list(map(_remove_prefix(len(dir1)), common[1])) - return common, deleted, added - - -def divide_files(dir): - """ return a list with subdir of dir and another one with files """ - import os - dirs = [] - regs = [] - for filename in sorted(os.listdir(dir)): - if os.path.isfile(os.path.join(dir, filename)): - regs.append(filename) - elif os.path.isdir(os.path.join(dir, filename)): - dirs.append(filename) - return dirs, regs - - -def extract(list1, list2): - """ extract common, added, deleted item from list1 to list2 """ - common, deleted, added = [], [], [] - for item in list1: - if item in list2: - common.append(item) - list2.remove(item) - else: - deleted.append(item) - for item in list2: - if item not in list1: - added.append(item) - return common, deleted, added - - -def intersection(list1, list2): - """ return common items in list1 and list2 """ - tmp = {} - result = [] - for i in list1: - tmp[i] = 1 - for i in list2: - if i in tmp: - result.append(i) - return result - - -def in_ref(list, item): - """ return true if list contains a reference on item """ - for it in list: - if it is item: - return True - return False - - -def index_ref(list, item): - """ - return the index of item in list by reference comparison - raise Exception if not found - """ - index = 0 - for it in list: - if it is item: - return index - index += 1 - raise Exception('No item ' + item) - - -def list_print(list, s1='', s2=''): - for item in list: - if item: - print('-' * 80) - print("%s %s %s" % (s1, item, s2)) - - -def normalize_dir(directory): - """remove trailing path separator from the directory name - """ - while directory[-1] == os.sep: - directory = directory[:-1] - return directory - - -def _process_dirs(dir1, dir2, recursive): - """ - function which return common, added, deleted file from dir1 to dir2 - if recursive, enter in subdirectory - """ - # /!\ sym links /!\# - - # extract common files and directory - d_list1, f_list1 = divide_files(dir1) - d_list2, f_list2 = divide_files(dir2) - common, deleted, added = [[], []], [[], []], [[], []] - common[0], deleted[0], added[0] = extract(f_list1, f_list2) - common[1], deleted[1], added[1] = extract(d_list1, d_list2) - # add prefix - deleted[0] = list(map(_add_prefix(dir1), deleted[0])) - deleted[1] = list(map(_add_prefix(dir1), deleted[1])) - added[0] = list(map(_add_prefix(dir2), added[0])) - added[1] = list(map(_add_prefix(dir2), added[1])) - common[0] = list(map(_add_prefix(dir1), common[0])) - if recursive: - # for all common subdirs - for dir in common[1]: - # recursion - comm, dele, adde = _process_dirs(os.path.join(dir1, dir), - os.path.join(dir2, dir), - recursive) - # add subdir items - deleted[0].extend(dele[0]) - deleted[1].extend(dele[1]) - added[0].extend(adde[0]) - added[1].extend(adde[1]) - common[0].extend(comm[0]) - return common, deleted, added - - -def 
_remove_prefix(prfx_size): - """ return a function to add remove with map() """ - return lambda s, len=prfx_size: s[len + 1:] - - -def _add_prefix(prefix): - """ return a function to add prefix with map() """ - return lambda s, prfx=prefix, join=os.path.join: join(prfx, s) diff --git a/src/xmldiff/objects.py b/src/xmldiff/objects.py deleted file mode 100644 index 35384e2..0000000 --- a/src/xmldiff/objects.py +++ /dev/null @@ -1,287 +0,0 @@ -# Copyright (c) 2000-2010 LOGILAB S.A. (Paris, FRANCE). -# http://www.logilab.fr/ -- mailto:contact@logilab.fr -# Copyright (c) 2018 Shoobx.com. -# https://www.shoobx.com/ -- mailto:dev@shoobx.com -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . -""" -provides constantes for using node and action (list) and some functions -for these objects use - - /!\ /!\ do not call index, remove or compare two node with == since a - node is a recursive list -""" - -import sys - -################ ACTIONS ###################################################### - -A_DESC = 0 # string describes the action -A_N1 = 1 # node on which the action applies -A_N2 = 2 # optional second action argument, maybe node or value - - -################## NODES CONSTANTES ########################################### - -N_TYPE = 0 # node's type -N_NAME = 1 # node's label (to process xpath) -N_VALUE = 2 # node's value -N_CHILDS = 3 # nodes's childs list -N_PARENT = 4 # node's parent -N_ISSUE = 5 # node's total issue number -N_XNUM = 6 # to compute node's xpath -N_NSPREFIX = 7 # node's namespace prefix (if any) -NSIZE = 8 # number of items in a list which represent a node - -# NODE TYPES -# NT_SYST = 0 # SYSTEM node (added by parser) /!\ deprecated -NT_NODE = 1 # ELEMENT node -NT_ATTN = 2 # ATTRIBUTE NAME node -NT_ATTV = 3 # ATTRIBUTE VALUE node -NT_TEXT = 4 # TEXT node -NT_COMM = 5 # COMMENT node -NT_ROOT = 6 # root node -NODES_TYPES = ('NT', 'NN', 'AN', 'AV', 'T', 'C', 'R') # for printing - - -################## OPERATIONS EDITING NODES ################################### - -def link_node(parent, child): - """ link child to his parent """ - if child: - parent[N_CHILDS].append(child) - child[N_PARENT] = parent - - -def insert_node(node, new, pos): - """ insert child new on node at position pos (integer) """ - node[N_CHILDS].insert(pos, new) - new[N_PARENT] = node - i, j = 0, 1 - while i < len(node[N_CHILDS]): - n = node[N_CHILDS][i] - if n[N_NAME] == new[N_NAME] and n[N_TYPE] == new[N_TYPE]: - n[N_XNUM] = j - j += 1 - i += 1 - - -def delete_node(node): - """ delete a node from its tree """ - siblings = node[N_PARENT][N_CHILDS] - i = get_pos(node) - siblings.pop(i) - node[N_PARENT] = None - while i < len(siblings): - n = siblings[i] - if n[N_NAME] == node[N_NAME] and n[N_TYPE] == node[N_TYPE]: - n[N_XNUM] -= 1 - i += 1 - - -def rename_node(node, new_name): - """ rename a node - this is necessary for xpath - """ - siblings = node[N_PARENT][N_CHILDS] - pos = get_pos(node) - xnum = 1 - for i in range(len(siblings)): - n = siblings[i] - if i 
< pos: - if n[N_NAME] == new_name and n[N_TYPE] == node[N_TYPE]: - xnum += 1 - elif i != pos: - if n[N_NAME] == node[N_NAME] and n[N_TYPE] == node[N_TYPE]: - n[N_XNUM] -= 1 - elif n[N_NAME] == new_name and n[N_TYPE] == node[N_TYPE]: - n[N_XNUM] += 1 - node[N_NAME] = new_name - node[N_XNUM] = xnum - - -################## OPERATIONS FORMATING NODES ################################# - -def caract(node): - """ return a string which represent the node """ - return '%s:%s (%s) %s %s' % (NODES_TYPES[node[N_TYPE]], node[N_VALUE], - f_xpath(node), id(node), node[N_ISSUE]) - - -def f_xpath(node, x=''): - """ compute node's xpath """ - name = node[N_NAME] - if '{' in name: - # We have a namespace - pre, rest = name.split('{', 1) - uri, local_name = rest.split('}', 1) - prefix = node[N_NSPREFIX] - if prefix is None: - # Default namespace - name = pre + local_name - else: - name = '%s%s:%s' % (pre, prefix, local_name) - - if name != '/': - if node[N_TYPE] == NT_ATTN: - return f_xpath(node[N_PARENT], - '/%s' % name[:len(name) - 4]) - if node[N_TYPE] == NT_ATTV: - return f_xpath(node[N_PARENT]) # [N_PARENT], '/%s'%name) - return f_xpath(node[N_PARENT], '/%s[%d]%s' % ( - name, node[N_XNUM], x)) - elif not x: - return '/' - return x - - -def node_repr(node): - """ return a string which represents the given node """ - s = '%s\n' % caract(node) - for child in node[N_CHILDS]: - s = '%s%s' % (s, _indent(child, ' ')) - return s - - -def _indent(node, indent_str): - s = '%s\-%s\n' % (indent_str, caract(node)) - if next_sibling(node) is not None: - indent_str = '%s| ' % indent_str - else: - indent_str = '%s ' % indent_str - for child in node[N_CHILDS]: - s = '%s%s' % (s, _indent(child, indent_str)) - return s - - -def xml_print(node, indent='', stream=None): - """ - recursive function which write the node in an xml form without the added - nodes - """ - if stream is None: - stream = sys.stdout - _xml_print_internal_format(node, indent, stream) - - -def _xml_print_internal_format(node, indent, stream): - if node[N_TYPE] == NT_NODE: - attrs_s = '' - i = 0 - while i < len(node[N_CHILDS]): - n = node[N_CHILDS][i] - if n[N_TYPE] == NT_ATTN: - i += 1 - attrs_s = '%s %s="%s"' % (attrs_s, n[N_VALUE], - n[N_CHILDS][0][N_VALUE]) - else: - break - if len(node[N_CHILDS]) > i: - stream.write('%s<%s%s>\n' % (indent, node[N_VALUE], attrs_s)) - for _curr_node in node[N_CHILDS][i:]: - _xml_print_internal_format( - _curr_node, indent + ' ', stream=stream) - stream.write('%s\n' % (indent, node[N_VALUE])) - else: - stream.write('%s<%s%s/>\n' % (indent, node[N_VALUE], attrs_s)) - elif node[N_TYPE] == NT_ATTN: - stream.write('%s<@%s>\n' % (indent, node[N_VALUE])) - stream.write(node[N_CHILDS][0][N_VALUE] + '\n') - stream.write('%s\n' % (indent, node[N_VALUE])) - elif node[N_TYPE] == NT_COMM: - stream.write('%s\n' % (indent, node[N_VALUE])) - elif node[N_TYPE] == NT_TEXT: - stream.write(node[N_VALUE] + '\n') - else: - stream.write('unknown node type', str(node[N_TYPE])) - - -################## OPERATIONS GIVING INFOS ON NODES ########################### -def get_pos(node): - """ return the index of a node in its parent's children list - - /!\ /!\ do not call index, remove or compare two node with == since a - node is a recursive list - """ - try: - childs = node[N_PARENT][N_CHILDS] - for i, child in enumerate(childs): - if child is node: - return i - except TypeError: - return -1 - except ValueError: - return -1 - - -def nb_attrs(node): - """ return the number of attributes of the given node """ - for i, child in 
enumerate(node[N_CHILDS]): - if child[N_TYPE] != NT_ATTN: - break - else: - try: - i += 1 - except UnboundLocalError: - i = 0 - return i - - -################## MISCELLANEOUS OPERATIONS ON NODES ########################## -def next_sibling(node): - """ return the node's right sibling """ - if node[N_PARENT] is None: - return None - myindex = get_pos(node) - if len(node[N_PARENT][N_CHILDS]) > myindex + 1: - return node[N_PARENT][N_CHILDS][myindex + 1] - return None - - -def get_ancestors(node, l): - """ append to l all the ancestors from node """ - while node[N_PARENT]: - l.append(node) - node = node[N_PARENT] - return l - - -def get_labels(tree, labels, leaf_labels): - """ - Chain all nodes with a given label l in tree T together, from left to - right, by filling dictionnaries labels and leaf_labels (for leaf nodes). - - Label are keys pointing to a list of nodes with this type. - Node x occurs after y in the list if x appears before y in the in-order - traversal of T. - /!\ /!\ - since this isn't binary tree, post order traversal (?) - """ - if tree and tree[N_CHILDS]: - for node in tree[N_CHILDS]: - get_labels(node, labels, leaf_labels) - labels.setdefault(NODES_TYPES[tree[N_TYPE]], []).append(tree) - elif tree: - leaf_labels.setdefault(NODES_TYPES[tree[N_TYPE]], []).append(tree) - - -def make_bfo_list(tree): - """ create a list with tree nodes in breadth first order """ - queue = [tree] - lst = [tree] - while queue: - node = queue.pop(0) - lst.extend(node[N_CHILDS]) - queue.extend([n for n in node[N_CHILDS] if n[N_CHILDS]]) - return lst diff --git a/src/xmldiff/parser.py b/src/xmldiff/parser.py deleted file mode 100644 index 47325ec..0000000 --- a/src/xmldiff/parser.py +++ /dev/null @@ -1,225 +0,0 @@ -# Copyright (c) 2000-2010 LOGILAB S.A. (Paris, FRANCE). -# http://www.logilab.fr/ -- mailto:contact@logilab.fr -# Copyright (c) 2018 Shoobx.com. -# https://www.shoobx.com/ -- mailto:dev@shoobx.com -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . 
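objects.py above defines the whole 1.x data model, so a concrete sketch may help: every node is a plain Python list whose slots are addressed by the constants just deleted (the values here are illustrative).

    from xmldiff.objects import NT_TEXT  # the 1.x module removed above

    # A text node in the list representation, as built by the parser:
    text_node = [
        NT_TEXT,   # N_TYPE: node type tag
        'text()',  # N_NAME: label used when computing the xpath
        'hello',   # N_VALUE: the text content itself
        [],        # N_CHILDS: child list, empty for a leaf
        None,      # N_PARENT: filled in by link_node()
        0,         # N_ISSUE: descendant count, 0 for a leaf
        1,         # N_XNUM: position among same-typed, same-named siblings
        None,      # N_NSPREFIX: namespace prefix, if any
    ]

This layout is also why the comments warn never to compare nodes with ==: each comparison would recurse through the entire subtree stored in N_CHILDS.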
-""" -This file contains a parser to transform xml document into an internal -tree in order to avoid adding new primitives with tree transformation - -This operation represent all the document in a tree without attributes on -nodes nor text nodes, only nodes with a name and a child list - -(the tree is composed by elements of type Node, defined below) -""" - -from xmldiff.objects import NT_ROOT, NT_NODE, NT_ATTN, NT_ATTV, \ - NT_TEXT, NT_COMM, N_TYPE, N_ISSUE, N_CHILDS, N_VALUE, link_node -from xml.sax import ContentHandler - - -def _inc_xpath(h, xpath): - try: - h[xpath] = h[xpath] + 1 - except KeyError: - h[xpath] = 1 - - -class SaxHandler(ContentHandler): - """ - Sax handler to transform xml doc into basic tree - """ - - def __init__(self, normalize_space, include_comment): - self._p_stack = [[NT_ROOT, '/', '', [], None, 0, 0, None]] - self._norm_sp = normalize_space or None - self._incl_comm = include_comment or None - self._xpath = '' - self._h = {} - self._n_elmt = 0 - self._ns_mapping = {None: [None]} - self._new_mappings = {} - self._default_ns = None - - def startPrefixMapping(self, prefix, uri): - self._new_mappings[prefix] = uri - try: - self._ns_mapping[prefix].append(uri) - except KeyError: - self._ns_mapping[prefix] = [uri] - if prefix is None: - self._default_ns = uri - - def endPrefixMapping(self, prefix): - ns_uri_list = self._ns_mapping[prefix] - if prefix is None: - self._default_ns = ns_uri_list[-1] - ns_uri_list.pop() - - def _buildTag(self, ns_name_tuple): - ns_uri, local_name = ns_name_tuple - if ns_uri: - el_tag = "{%s}%s" % ns_name_tuple - else: - el_tag = local_name - return el_tag - - def _getPrefix(self, ns_uri): - if not ns_uri: - return None - for (prefix, uri) in self._ns_mapping.items(): - if ns_uri in uri: - return prefix - if ns_uri == 'http://www.w3.org/XML/1998/namespace': - # It's the xml: namespace, undeclared. 
- return 'xml' - raise ValueError("No prefix found for namespace URI %s" % ns_uri) - - # Don't know if I need this - def _buildXPath(self, ns_name_tuple): - ns_uri, local_name = ns_name_tuple - if ns_uri: - prefix = self._getPrefix(ns_uri) - return '%s:%s' % (prefix, local_name) - return local_name - - ## method of the ContentHandler interface ################################# - def startElement(self, name, attrs): - self.startElementNS((None, name), None, attrs) - - def startElementNS(self, name, qname, attrs): - tagName = self._buildTag(name) - prefix = self._getPrefix(name[0]) - - # process xpath - self._xpath = "%s%s%s" % (self._xpath, '/', name) - _inc_xpath(self._h, self._xpath) - # nodes construction for element - node = [NT_NODE, tagName, tagName, [], None, self._n_elmt + 1, - self._h[self._xpath], prefix] - self._n_elmt += 1 - self._xpath = "%s%s%s%s" % ( - self._xpath, '[', self._h[self._xpath], ']') - # nodes construction for element's attributes - # sort attributes to avoid further moves - for key, value in sorted(attrs.items()): - self._n_elmt += 2 - attrName = self._buildTag(key) - prefix = self._getPrefix(key[0]) - attr_node = [NT_ATTN, '@%sName' % attrName, attrName, [], None, - 1, 0, prefix] - link_node(node, attr_node) - link_node(attr_node, [NT_ATTV, '@%s' % attrName, value, - [], None, 0, 0, prefix]) - - link_node(self._p_stack[-1], node) - # set current element on the top of the father stack - self._p_stack.append(node) - - def endElementNS(self, ns_name, qname): - self.endElement(self._buildTag(ns_name)) - - def endElement(self, name): - # process xpath - size = len(self._xpath) - for i in range(size): - size = size - 1 - if self._xpath[-i - 1] == '/': - break - self._xpath = self._xpath[:size] - self._p_stack[-1][N_ISSUE] = self._n_elmt - self._p_stack[-1][N_ISSUE] - # remove last element from stack - self._p_stack.pop() - - def characters(self, ch): - if self._norm_sp is not None: - ch = ' '.join(ch.split()) - if len(ch) > 0 and ch != "\n" and ch != ' ': - parent = self._p_stack[-1] - # if sibling text nodes - if parent[N_CHILDS] and parent[N_CHILDS][-1][N_TYPE] == NT_TEXT: - n = parent[N_CHILDS][-1] - n[N_VALUE] = n[N_VALUE] + ch - else: - self._n_elmt += 1 - xpath = '%s/text()' % self._xpath - _inc_xpath(self._h, xpath) - # nodes construction for text - node = [NT_TEXT, 'text()', ch, [], None, 0, - self._h[xpath], None] - link_node(parent, node) - - ## method of the LexicalHandler interface ################################# - def comment(self, content): - if self._incl_comm is None: - return - if self._norm_sp is not None: - content = ' '.join(content.split()) - if len(content) > 0: - self._n_elmt += 1 - xpath = '%s/comment()' % self._xpath - _inc_xpath(self._h, xpath) - # nodes construction for comment - node = [NT_COMM, 'comment()', content, [], None, - 0, self._h[xpath], None] - link_node(self._p_stack[-1], node) - - # methods from xml.sax.saxlib.LexicalHandler (avoid dependency on pyxml) - def startDTD(self, name, public_id, system_id): - """Report the start of the DTD declarations, if the document - has an associated DTD. - - A startEntity event will be reported before declaration events - from the external DTD subset are reported, and this can be - used to infer from which subset DTD declarations derive. 
- - name is the name of the document element type, public_id the - public identifier of the DTD (or None if none were supplied) - and system_id the system identfier of the external subset (or - None if none were supplied).""" - - def endDTD(self): - "Signals the end of DTD declarations." - - def startEntity(self, name): - """Report the beginning of an entity. - - The start and end of the document entity is not reported. The - start and end of the external DTD subset is reported with the - pseudo-name '[dtd]'. - - Skipped entities will be reported through the skippedEntity - event of the ContentHandler rather than through this event. - - name is the name of the entity. If it is a parameter entity, - the name will begin with '%'.""" - - def endEntity(self, name): - """Reports the end of an entity. name is the name of the - entity, and follows the same conventions as for - startEntity.""" - - def startCDATA(self): - """Reports the beginning of a CDATA marked section. - - The contents of the CDATA marked section will be reported - through the characters event.""" - - def endCDATA(self): - "Reports the end of a CDATA marked section." - - def get_tree(self): - self._p_stack[0][N_ISSUE] = self._n_elmt - return self._p_stack[0] diff --git a/tests/README b/tests/README deleted file mode 100644 index f6537af..0000000 --- a/tests/README +++ /dev/null @@ -1,15 +0,0 @@ -Data files can be added in the data/ subdirectory, with the following -naming conventions: - - * files belonging to the same test share the first 6 characters of their name - * the 7th character is an underscore - * original file name ends with 1.xml - * modified file name ends with 2.xml - * result file names end with _result, and can contain command line -options to be passed to xmldiff separated with underscores - -Example: - -test02_1.xml -test02_2.xml -test02_result diff --git a/tests/__init__.py b/tests/__init__.py index 792d600..0ce5d01 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1 +1 @@ -# +# Make the tests a module, so we they are discoverable diff --git a/tests/conftest.py b/tests/conftest.py deleted file mode 100644 index 76d9991..0000000 --- a/tests/conftest.py +++ /dev/null @@ -1,43 +0,0 @@ -# Copyright (c) 2000-2010 LOGILAB S.A. (Paris, FRANCE). -# http://www.logilab.fr/ -- mailto:contact@logilab.fr -# Copyright (c) 2018 Shoobx.com. -# https://www.shoobx.com/ -- mailto:dev@shoobx.com -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . - -import pytest -import xmldiff.difflib - - -if xmldiff.difflib.have_c_extension: - lcs2_type_params = ['force_python', 'c'] -else: - lcs2_type_params = ['just_python'] - - -# XXX: there are more functions defined in maplookup.c -# which are NOT implemented in python! 
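The lcs2_type fixture defined just below swaps xmldiff.difflib.lcs2 between the C extension and the pure-Python fallback, so each test module runs against both. A hedged sketch of a test consuming it (the test body is hypothetical, but lcs2's call convention matches its use in fmes.py above):

    # Runs once per lcs2_type parameter: C extension and forced Python.
    def test_lcs2_basic(lcs2_type):
        from xmldiff.difflib import lcs2
        pairs = lcs2(list('abcde'), list('bde'), lambda a, b: a == b)
        assert [a for a, _ in pairs] == list('bde')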
- - -@pytest.fixture(params=lcs2_type_params, scope='module') -def lcs2_type(request): - save_lcs2 = xmldiff.difflib.lcs2 - if request.param == 'force_python': - xmldiff.difflib.lcs2 = xmldiff.difflib.lcs2_python - elif request.param == 'c': - pass - elif request.param == 'just_python': - pass - yield request.param - xmldiff.difflib.lcs2 = save_lcs2 diff --git a/tests/data/broken/broken.xml b/tests/data/broken/broken.xml deleted file mode 100644 index 33af9d9..0000000 --- a/tests/data/broken/broken.xml +++ /dev/null @@ -1,12 +0,0 @@ - - - - - almaster@logilab.org - - - - - - - diff --git a/tests/data/dir1/changing.xml b/tests/data/dir1/changing.xml deleted file mode 100644 index 8f5c795..0000000 --- a/tests/data/dir1/changing.xml +++ /dev/null @@ -1,5 +0,0 @@ - - - - - diff --git a/tests/data/dir1/dir_inboth/changing.xml b/tests/data/dir1/dir_inboth/changing.xml deleted file mode 100644 index 8f5c795..0000000 --- a/tests/data/dir1/dir_inboth/changing.xml +++ /dev/null @@ -1,5 +0,0 @@ - - - - - diff --git a/tests/data/dir1/dir_inboth/inbothdir.xml b/tests/data/dir1/dir_inboth/inbothdir.xml deleted file mode 100644 index f3f286e..0000000 --- a/tests/data/dir1/dir_inboth/inbothdir.xml +++ /dev/null @@ -1 +0,0 @@ - diff --git a/tests/data/dir1/dir_inboth/onlyindir1.xml b/tests/data/dir1/dir_inboth/onlyindir1.xml deleted file mode 100644 index f3f286e..0000000 --- a/tests/data/dir1/dir_inboth/onlyindir1.xml +++ /dev/null @@ -1 +0,0 @@ - diff --git a/tests/data/dir1/inbothdir.xml b/tests/data/dir1/inbothdir.xml deleted file mode 100644 index f3f286e..0000000 --- a/tests/data/dir1/inbothdir.xml +++ /dev/null @@ -1 +0,0 @@ - diff --git a/tests/data/dir1/onlyindir1.xml b/tests/data/dir1/onlyindir1.xml deleted file mode 100644 index f3f286e..0000000 --- a/tests/data/dir1/onlyindir1.xml +++ /dev/null @@ -1 +0,0 @@ - diff --git a/tests/data/dir2/changing.xml b/tests/data/dir2/changing.xml deleted file mode 100644 index b9ceb4b..0000000 --- a/tests/data/dir2/changing.xml +++ /dev/null @@ -1,2 +0,0 @@ - - diff --git a/tests/data/dir2/dir_inboth/changing.xml b/tests/data/dir2/dir_inboth/changing.xml deleted file mode 100644 index b9ceb4b..0000000 --- a/tests/data/dir2/dir_inboth/changing.xml +++ /dev/null @@ -1,2 +0,0 @@ - - diff --git a/tests/data/dir2/dir_inboth/inbothdir.xml b/tests/data/dir2/dir_inboth/inbothdir.xml deleted file mode 100644 index f3f286e..0000000 --- a/tests/data/dir2/dir_inboth/inbothdir.xml +++ /dev/null @@ -1 +0,0 @@ - diff --git a/tests/data/dir2/dir_inboth/onlyindir2.xml b/tests/data/dir2/dir_inboth/onlyindir2.xml deleted file mode 100644 index f3f286e..0000000 --- a/tests/data/dir2/dir_inboth/onlyindir2.xml +++ /dev/null @@ -1 +0,0 @@ - diff --git a/tests/data/dir2/dir_only2/.empty b/tests/data/dir2/dir_only2/.empty deleted file mode 100644 index e69de29..0000000 diff --git a/tests/data/dir2/inbothdir.xml b/tests/data/dir2/inbothdir.xml deleted file mode 100644 index f3f286e..0000000 --- a/tests/data/dir2/inbothdir.xml +++ /dev/null @@ -1 +0,0 @@ - diff --git a/tests/data/dir2/onlyindir2.xml b/tests/data/dir2/onlyindir2.xml deleted file mode 100644 index f3f286e..0000000 --- a/tests/data/dir2/onlyindir2.xml +++ /dev/null @@ -1 +0,0 @@ - diff --git a/tests/data/parse/1.xml b/tests/data/parse/1.xml deleted file mode 100644 index 3257495..0000000 --- a/tests/data/parse/1.xml +++ /dev/null @@ -1,39 +0,0 @@ - - - - 1 - 2 - 3 - - - 1 - 2 - 3 - - - 1 - 2 - 3 - - - This is the fourth sentence. - - - This is the fifth sentence. - - - This is the sixth sentence. 
- - - This is the seventh sentence. - - - This is now the Eighth sentence. - - - This is now the Ninth sentence. - - - This is now the Tenth sentence. - - diff --git a/tests/data/parse/default_ns.xml b/tests/data/parse/default_ns.xml deleted file mode 100644 index ecf39f2..0000000 --- a/tests/data/parse/default_ns.xml +++ /dev/null @@ -1,10 +0,0 @@ -

- - S001 - Sales - - - - - -
diff --git a/tests/data/parse/html.html b/tests/data/parse/html.html deleted file mode 100644 index c87692f..0000000 --- a/tests/data/parse/html.html +++ /dev/null @@ -1,10 +0,0 @@ - - - - -

-<h1>My First Heading</h1>
-
-<p>My first paragraph.</p>

- - - diff --git a/tests/data/parse/iso.xml b/tests/data/parse/iso.xml deleted file mode 100644 index 37b3ee2..0000000 --- a/tests/data/parse/iso.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - �� - - diff --git a/tests/data/parse/simple_ns.xml b/tests/data/parse/simple_ns.xml deleted file mode 100644 index ce9978f..0000000 --- a/tests/data/parse/simple_ns.xml +++ /dev/null @@ -1,10 +0,0 @@ - - - S001 - Sales - - - - - - diff --git a/tests/data/parse/tal_ns.xml b/tests/data/parse/tal_ns.xml deleted file mode 100644 index 830149d..0000000 --- a/tests/data/parse/tal_ns.xml +++ /dev/null @@ -1,10 +0,0 @@ - -

- - outer variable x, first appearance - - inner variable x - - outer variable x, second appearance - -

diff --git a/tests/data/parse/utf16.xml b/tests/data/parse/utf16.xml deleted file mode 100644 index e7b7bcca1d385699212c7f785b87a6c50d03aa47..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 202 zcmYj~Jr06E5Jq1@rKi}?E7(}A(%IVR33NkDU@=0B9?4@-dI2j6zI79{n3;S(d1EPf zP*Y*JbAlYjp~?%(3#E7@a2Q*`H(NKDjCn e><@!>?1a`{Tqpge=apEp$%&L~BF8^J9Y2lDM<6Kx diff --git a/tests/data/parse/utf8.xml b/tests/data/parse/utf8.xml deleted file mode 100644 index 33de8d9..0000000 --- a/tests/data/parse/utf8.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - éáΩ - - diff --git a/tests/data/test00_1.xml b/tests/data/test00_1.xml deleted file mode 100644 index 88997db..0000000 --- a/tests/data/test00_1.xml +++ /dev/null @@ -1,12 +0,0 @@ - - - - - almaster@logilab.org - - - - - - - \ No newline at end of file diff --git a/tests/data/test00_2.xml b/tests/data/test00_2.xml deleted file mode 100644 index 57b576c..0000000 --- a/tests/data/test00_2.xml +++ /dev/null @@ -1,17 +0,0 @@ - - - - - syt@logilab.org - - - - - - - - - - hoye! - - \ No newline at end of file diff --git a/tests/data/test00_result b/tests/data/test00_result deleted file mode 100644 index 0e4adb6..0000000 --- a/tests/data/test00_result +++ /dev/null @@ -1,23 +0,0 @@ -[rename, /memory[1]/mailbox[1], box] -[insert-after, /memory[1]/spoken-languages[1], - - -hoye! - -] -[update, /memory[1]/email_addr[1]/text()[1], syt@logilab.org] -[rename, /memory[1]/junkbuster-method[1]/@value, val] -[append-first, /memory[1]/junkbuster-method[1], - -] -[append, /memory[1]/spoken-languages[1], -<@new> -new attribute - -] -[insert-after, /memory[1]/spoken-languages[1]/language[2], - -] -[update, /memory[1]/server-socket[2]/@port, 7797] -[remove, /memory[1]/spoken-languages[1]/language[1]] -[remove, /memory[1]/spoken-languages[1]/language[3]] diff --git a/tests/data/test01_1.xml b/tests/data/test01_1.xml deleted file mode 100644 index 8f5c795..0000000 --- a/tests/data/test01_1.xml +++ /dev/null @@ -1,5 +0,0 @@ - - - - - diff --git a/tests/data/test01_2.xml b/tests/data/test01_2.xml deleted file mode 100644 index b9ceb4b..0000000 --- a/tests/data/test01_2.xml +++ /dev/null @@ -1,2 +0,0 @@ - - diff --git a/tests/data/test01_result b/tests/data/test01_result deleted file mode 100644 index 2bd5668..0000000 --- a/tests/data/test01_result +++ /dev/null @@ -1,4 +0,0 @@ -[append-first, /, - -] -[remove, /oopoyy[1]] diff --git a/tests/data/test02_1.xml b/tests/data/test02_1.xml deleted file mode 100644 index f819301..0000000 --- a/tests/data/test02_1.xml +++ /dev/null @@ -1 +0,0 @@ -moretext diff --git a/tests/data/test02_2.xml b/tests/data/test02_2.xml deleted file mode 100644 index 522c979..0000000 --- a/tests/data/test02_2.xml +++ /dev/null @@ -1,2 +0,0 @@ -iimoretext - diff --git a/tests/data/test02_result b/tests/data/test02_result deleted file mode 100644 index 37725b2..0000000 --- a/tests/data/test02_result +++ /dev/null @@ -1,8 +0,0 @@ -[append-first, /, - -ii - -] -[rename, /a[2], b] -[move-after, /b[1], /a[1]/text()[1]] -[remove, /a[1]/b[1]/b[1]] diff --git a/tests/data/test03_1.xml b/tests/data/test03_1.xml deleted file mode 100644 index 171bb7e..0000000 --- a/tests/data/test03_1.xml +++ /dev/null @@ -1 +0,0 @@ -moretexthehe diff --git a/tests/data/test03_2.xml b/tests/data/test03_2.xml deleted file mode 100644 index 743c635..0000000 --- a/tests/data/test03_2.xml +++ /dev/null @@ -1,3 +0,0 @@ -iihehe -moretext - diff --git a/tests/data/test03_result b/tests/data/test03_result deleted file mode 100644 index 55a2d19..0000000 --- 
a/tests/data/test03_result +++ /dev/null @@ -1,13 +0,0 @@ -[append-first, /, - - -moretext - - -] -[rename, /a[2], branch] -[move-first, /branch[1], /a[1]] -[append-first, /a[1]/branch[1], -ii -] -[remove, /a[1]/branch[1]/b[1]] diff --git a/tests/data/test04_1.xml b/tests/data/test04_1.xml deleted file mode 100644 index 017c4f1..0000000 --- a/tests/data/test04_1.xml +++ /dev/null @@ -1,15 +0,0 @@ - - - - - - almastlogilab.org - - - - - - - - - diff --git a/tests/data/test04_2.xml b/tests/data/test04_2.xml deleted file mode 100644 index 1fda8fa..0000000 --- a/tests/data/test04_2.xml +++ /dev/null @@ -1,24 +0,0 @@ - - - -and new text - - - - - syt@logilab.org -insert test - - - - - - - - - - - hoye! - -and some new text - diff --git a/tests/data/test04_result b/tests/data/test04_result deleted file mode 100644 index 25fa769..0000000 --- a/tests/data/test04_result +++ /dev/null @@ -1,43 +0,0 @@ -[move-first, /memory[1]/comment()[1], /memory[1]] -[update, /memory[1]/comment()[1], new comment] -[insert-after, /memory[1]/comment()[1], -and new text -] -[rename, /memory[1]/mailbox[1], box] -[insert-after, /memory[1]/box[1], - -] -[insert-after, /memory[1]/email_addr[1], -insert test -] -[insert-after, /memory[1]/junkbuster-method[1], - -] -[insert-after, /memory[1]/spoken-languages[1], - - -hoye! - -] -[insert-after, /memory[1]/test[1], -and some new text -] -[rename, /memory[1]/box[1]/@path, pathe] -[update, /memory[1]/email_addr[1]/text()[1], syt@logilab.org] -[rename, /memory[1]/junkbuster-method[1]/@value, val] -[append-first, /memory[1]/junkbuster-method[1], - -] -[append, /memory[1]/spoken-languages[1], -<@new> -new attribute - -] -[insert-after, /memory[1]/spoken-languages[1]/language[2], - -] -[update, /memory[1]/server-socket[1]/@port, 7rm -rf tm776] -[update, /memory[1]/server-socket[2]/@port, 7797] -[remove, /memory[1]/@attr] -[remove, /memory[1]/spoken-languages[1]/language[1]] -[remove, /memory[1]/spoken-languages[1]/language[3]] diff --git a/tests/data/test05_1.xml b/tests/data/test05_1.xml deleted file mode 100644 index 1646d71..0000000 --- a/tests/data/test05_1.xml +++ /dev/null @@ -1,28 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/tests/data/test05_2.xml b/tests/data/test05_2.xml deleted file mode 100644 index de9def1..0000000 --- a/tests/data/test05_2.xml +++ /dev/null @@ -1,29 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/tests/data/test05_result b/tests/data/test05_result deleted file mode 100644 index 9c0bcbf..0000000 --- a/tests/data/test05_result +++ /dev/null @@ -1,15 +0,0 @@ -[insert-after, /bean[1]/add[1]/bean[1]/property[2], - - - - - - - -] -[move-first, /bean[1]/add[1]/bean[1]/add[2]/bean[1]/@class, /bean[1]/add[1]/bean[1]/add[1]/bean[1]] -[move-after, /bean[1]/add[1]/bean[1]/add[2]/bean[1]/event-binding[1], /bean[1]/add[1]/bean[1]/add[1]/bean[1]/property[3]] -[update, /bean[1]/add[1]/bean[1]/add[1]/bean[1]/@class, java.awt.Scrollbar] -[update, /bean[1]/add[1]/bean[1]/add[1]/bean[1]/event-binding[1]/@name, adjustment] -[update, /bean[1]/add[1]/bean[1]/add[1]/bean[1]/event-binding[1]/@targetObject, adjustmenthandler] -[remove, /bean[1]/add[1]/bean[1]/add[2]] diff --git a/tests/data/test06_1.xml b/tests/data/test06_1.xml deleted file mode 100644 index dc37d4b..0000000 --- a/tests/data/test06_1.xml +++ /dev/null @@ -1,2 +0,0 @@ -iimoretext - diff --git a/tests/data/test06_2.xml b/tests/data/test06_2.xml deleted file mode 100644 index f819301..0000000 --- a/tests/data/test06_2.xml +++ /dev/null @@ -1 +0,0 @@ 
-moretext diff --git a/tests/data/test06_result b/tests/data/test06_result deleted file mode 100644 index a42e6da..0000000 --- a/tests/data/test06_result +++ /dev/null @@ -1,6 +0,0 @@ -[insert-after, /a[1]/text()[1], - -] -[move-after, /a[1]/b[1], /a[1]/text()[2]] -[remove, /a[1]/text()[1]] -[remove, /a[1]/LogilabXMLDIFFFAKETag[1]] diff --git a/tests/data/test07_1.xml b/tests/data/test07_1.xml deleted file mode 100644 index a4b50cc..0000000 --- a/tests/data/test07_1.xml +++ /dev/null @@ -1 +0,0 @@ -texteautre texte diff --git a/tests/data/test07_2.xml b/tests/data/test07_2.xml deleted file mode 100644 index 25f8466..0000000 --- a/tests/data/test07_2.xml +++ /dev/null @@ -1 +0,0 @@ -texte diff --git a/tests/data/test07_result b/tests/data/test07_result deleted file mode 100644 index 36e78ab..0000000 --- a/tests/data/test07_result +++ /dev/null @@ -1,2 +0,0 @@ -[remove, /a[1]/text()[2]] -[remove, /a[1]/b[1]] diff --git a/tests/data/test08_1.xml b/tests/data/test08_1.xml deleted file mode 100644 index 3257495..0000000 --- a/tests/data/test08_1.xml +++ /dev/null @@ -1,39 +0,0 @@ - - - - 1 - 2 - 3 - - - 1 - 2 - 3 - - - 1 - 2 - 3 - - - This is the fourth sentence. - - - This is the fifth sentence. - - - This is the sixth sentence. - - - This is the seventh sentence. - - - This is now the Eighth sentence. - - - This is now the Ninth sentence. - - - This is now the Tenth sentence. - - diff --git a/tests/data/test08_2.xml b/tests/data/test08_2.xml deleted file mode 100644 index 84b0450..0000000 --- a/tests/data/test08_2.xml +++ /dev/null @@ -1,39 +0,0 @@ - - - - 1 - 2 - 2.1 - 3 - - - 1 - 3 - - - 2 - 1 - 3 - - - This WAS the fourth sentence. - - - This is the fifth sentence. - - - This is the and improved sixth sentence. - - - This is the seventh sentence. - - - This is the Eighth sentence. - - - This is (changed) the Ninth sentence. - - - This is now the Tenth sentence. - - diff --git a/tests/data/test08_result b/tests/data/test08_result deleted file mode 100644 index 56d1ba4..0000000 --- a/tests/data/test08_result +++ /dev/null @@ -1,66 +0,0 @@ -[insert-after, /Tests[1]/Test[8], - - -This is the seventh sentence. - - -] -[insert-after, /Tests[1]/Test[9], - - -This is the Eighth sentence. - - -] -[insert-after, /Tests[1]/Test[1]/One[2], - -2.1 - -] -[insert-after, /Tests[1]/Test[3]/Three[2], - -1 - -] -[remove, /Tests[1]/Test[5]/@type] -[append, /Tests[1]/Test[7], -<@LogilabXmldiffTmpAttrtype> -Insert mixed element - -] -[rename, /Tests[1]/Test[7]/Seven[1], Five] -[remove, /Tests[1]/Test[6]/@type] -[append, /Tests[1]/Test[8], -<@LogilabXmldiffTmpAttrtype> -Insert mixed element with text - -] -[rename, /Tests[1]/Test[8]/Eight[1], Six] -[move-first, /Tests[1]/Test[7]/@type, /Tests[1]/Test[9]] -[move-first, /Tests[1]/Test[8]/@type, /Tests[1]/Test[10]] -[update, /Tests[1]/Test[4]/Four[1]/text()[1], This WAS the fourth sentence.] -[update, /Tests[1]/Test[7]/Five[1]/text()[1], This is the] -[insert-after, /Tests[1]/Test[7]/Five[1]/text()[1], - -] -[update, /Tests[1]/Test[7]/Five[1]/text()[2], fifth sentence.] -[update, /Tests[1]/Test[8]/Six[1]/text()[1], This is the] -[insert-after, /Tests[1]/Test[8]/Six[1]/text()[1], - -and improved - -] -[update, /Tests[1]/Test[8]/Six[1]/text()[2], sixth sentence.] 
-[insert-after, /Tests[1]/Test[11]/Nine[1]/text()[1], - -(changed) - -] -[remove, /Tests[1]/Test[2]/Two[2]] -[remove, /Tests[1]/Test[3]/Three[1]] -[remove, /Tests[1]/Test[5]] -[remove, /Tests[1]/Test[5]] -[remove, /Tests[1]/Test[5]/Five[1]/b[1]] -[remove, /Tests[1]/Test[6]/Six[1]/b[1]] -[remove, /Tests[1]/Test[9]/Nine[1]/b[2]] -[rename, //@LogilabXmldiffTmpAttrtype, type] diff --git a/tests/data/test09_ns_1.xml b/tests/data/test09_ns_1.xml deleted file mode 100644 index 084487a..0000000 --- a/tests/data/test09_ns_1.xml +++ /dev/null @@ -1,12 +0,0 @@ - - - - This is now the Ninth sentence. - - - now the Ninth sentence. - - - This is now - - diff --git a/tests/data/test09_ns_2.xml b/tests/data/test09_ns_2.xml deleted file mode 100644 index 249beaa..0000000 --- a/tests/data/test09_ns_2.xml +++ /dev/null @@ -1,12 +0,0 @@ - - - - This is (changed) the Ninth sentence. - - - (changed) the Ninth sentence. - - - This is (changed) - - diff --git a/tests/data/test09_ns_result b/tests/data/test09_ns_result deleted file mode 100644 index 2969018..0000000 --- a/tests/data/test09_ns_result +++ /dev/null @@ -1,24 +0,0 @@ -[append-first, /Tests[1]/Test[1]/Nine[1]/b[1], - -] -[append-first, /Tests[1]/Test[1]/Nine[1]/b[1], -(changed) -] -[append-first, /Tests[1]/Test[2]/Nine[1]/b[1], - -] -[append-first, /Tests[1]/Test[2]/Nine[1]/b[1], -(changed) -] -[append-first, /Tests[1]/Test[3]/Nine[1]/b[1], - -] -[append-first, /Tests[1]/Test[3]/Nine[1]/b[1], -(changed) -] -[remove, /Tests[1]/Test[1]/Nine[1]/b[1]/text()[2]] -[remove, /Tests[1]/Test[1]/Nine[1]/b[1]/LogilabXMLDIFFFAKETag[1]] -[remove, /Tests[1]/Test[2]/Nine[1]/b[1]/text()[2]] -[remove, /Tests[1]/Test[2]/Nine[1]/b[1]/LogilabXMLDIFFFAKETag[1]] -[remove, /Tests[1]/Test[3]/Nine[1]/b[1]/text()[2]] -[remove, /Tests[1]/Test[3]/Nine[1]/b[1]/LogilabXMLDIFFFAKETag[1]] diff --git a/tests/data/test10_ns_1.xml b/tests/data/test10_ns_1.xml deleted file mode 100644 index 4720824..0000000 --- a/tests/data/test10_ns_1.xml +++ /dev/null @@ -1,39 +0,0 @@ - - - - 1 - 2 - 3 - - - 1 - 2 - 3 - - - 1 - 2 - 3 - - - This is the fourth sentence. - - - This is the fifth sentence. - - - This is the sixth sentence. - - - This is the seventh sentence. - - - This is now the Eighth sentence. - - - This is now the Ninth sentence. - - - This is now the Tenth sentence. - - diff --git a/tests/data/test10_ns_2.xml b/tests/data/test10_ns_2.xml deleted file mode 100644 index 5ba7ab2..0000000 --- a/tests/data/test10_ns_2.xml +++ /dev/null @@ -1,39 +0,0 @@ - - - - 1 - 2 - 2.1 - 3 - - - 1 - 3 - - - 2 - 1 - 3 - - - This WAS the fourth sentence. - - - This is the fifth sentence. - - - This is the and improved sixth sentence. - - - This is the seventh sentence. - - - This is the Eighth sentence. - - - This is (changed) the Ninth sentence. - - - This is now the Tenth sentence. - - diff --git a/tests/data/test10_ns_result b/tests/data/test10_ns_result deleted file mode 100644 index e994e6d..0000000 --- a/tests/data/test10_ns_result +++ /dev/null @@ -1,68 +0,0 @@ -[insert-after, /Tests[1]/Test[7], - - -This is the seventh sentence. - - -] -[insert-after, /Tests[1]/Test[8], - - -This is the Eighth sentence. 
- - -] -[insert-after, /Tests[1]/Test[1]/One[2], - -2.1 - -] -[insert-after, /Tests[1]/Test[2]/Three[2], - -1 - -] -[remove, /Tests[1]/Test[4]/@type] -[append, /Tests[1]/Test[6], -<@LogilabXmldiffTmpAttrtype> -Insert mixed element - -] -[rename, /Tests[1]/Test[6]/Seven[1], Five] -[remove, /Tests[1]/Test[5]/@type] -[append, /Tests[1]/Test[7], -<@LogilabXmldiffTmpAttrtype> -Insert mixed element with text - -] -[rename, /Tests[1]/Test[7]/Eight[1], Six] -[move-first, /Tests[1]/Test[6]/@type, /Tests[1]/Test[8]] -[move-first, /Tests[1]/Test[7]/@type, /Tests[1]/Test[9]] -[update, /Tests[1]/Test[3]/tns:Four[1]/text()[1], This WAS the fourth sentence.] -[update, /Tests[1]/Test[6]/Five[1]/text()[1], This is the] -[insert-after, /Tests[1]/Test[6]/Five[1]/text()[1], - -] -[update, /Tests[1]/Test[6]/Five[1]/text()[2], fifth sentence.] -[update, /Tests[1]/Test[7]/Six[1]/text()[1], This is the] -[insert-after, /Tests[1]/Test[7]/Six[1]/text()[1], - -and improved - -] -[update, /Tests[1]/Test[7]/Six[1]/text()[2], sixth sentence.] -[append-first, /Tests[1]/Test[10]/Nine[1]/b[1], - -] -[append-first, /Tests[1]/Test[10]/Nine[1]/b[1], -(changed) -] -[remove, /Tests[1]/tns:Test[1]/Two[2]] -[remove, /Tests[1]/Test[2]/Three[1]] -[remove, /Tests[1]/Test[4]] -[remove, /Tests[1]/Test[4]] -[remove, /Tests[1]/Test[4]/Five[1]/b[1]] -[remove, /Tests[1]/Test[5]/Six[1]/b[1]] -[remove, /Tests[1]/Test[8]/Nine[1]/b[1]/text()[2]] -[remove, /Tests[1]/Test[8]/Nine[1]/b[1]/LogilabXMLDIFFFAKETag[1]] -[rename, //@LogilabXmldiffTmpAttrtype, type] diff --git a/tests/data/test11_ns_1.xml b/tests/data/test11_ns_1.xml deleted file mode 100644 index 24fc1d2..0000000 --- a/tests/data/test11_ns_1.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - This is now the Ninth sentence. - - diff --git a/tests/data/test11_ns_2.xml b/tests/data/test11_ns_2.xml deleted file mode 100644 index b6c0d4f..0000000 --- a/tests/data/test11_ns_2.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - This is now the Ninth sentence. - - diff --git a/tests/data/test11_ns_result b/tests/data/test11_ns_result deleted file mode 100644 index 8af773c..0000000 --- a/tests/data/test11_ns_result +++ /dev/null @@ -1 +0,0 @@ -[update, /Tests[1]/Test[1]/Nine[1]/b[1]/@tns:name, NineNameChanged] diff --git a/tests/test_data/complex-text-update.expected.rml b/tests/test_data/complex-text-update.expected.rml new file mode 100644 index 0000000..d597ae1 --- /dev/null +++ b/tests/test_data/complex-text-update.expected.rml @@ -0,0 +1,5 @@ + + + Let's see. This is some simple text demonstrating the features of the human text differ. This feature attempts to make changelog readable for humans. The human text differ uses sentences as its first order matching. Let's see. + + diff --git a/tests/test_data/complex-text-update.left.rml b/tests/test_data/complex-text-update.left.rml new file mode 100644 index 0000000..7eaadd1 --- /dev/null +++ b/tests/test_data/complex-text-update.left.rml @@ -0,0 +1,12 @@ + + + + + This is some simple text demonstrating the features of the human text + differ. This feature attempts to make changelog readable for + humans. The human text differ uses sentences as its first order + matching. Let's see. + + + + diff --git a/tests/test_data/complex-text-update.right.rml b/tests/test_data/complex-text-update.right.rml new file mode 100644 index 0000000..60e8133 --- /dev/null +++ b/tests/test_data/complex-text-update.right.rml @@ -0,0 +1,12 @@ + + + + + Let's see. This is some simple text demonstrating the features of the + human text differ. 
This feature attempts to make changelog + readable for humans. The human text differ uses sentences as its + first order matching. + + + + diff --git a/tests/test_data/insert-node.expected.rml b/tests/test_data/insert-node.expected.rml new file mode 100644 index 0000000..14f0f6c --- /dev/null +++ b/tests/test_data/insert-node.expected.rml @@ -0,0 +1,7 @@ + + +

+ Inserted Node +

+
+
diff --git a/tests/test_data/insert-node.left.rml b/tests/test_data/insert-node.left.rml new file mode 100644 index 0000000..399b568 --- /dev/null +++ b/tests/test_data/insert-node.left.rml @@ -0,0 +1,4 @@ + + + + diff --git a/tests/test_data/insert-node.right.rml b/tests/test_data/insert-node.right.rml new file mode 100644 index 0000000..4517c99 --- /dev/null +++ b/tests/test_data/insert-node.right.rml @@ -0,0 +1,7 @@ + + + +

Inserted Node

+ +
+
diff --git a/tests/test_data/no-text-substitutions.expected.rml b/tests/test_data/no-text-substitutions.expected.rml new file mode 100644 index 0000000..aa7eccc --- /dev/null +++ b/tests/test_data/no-text-substitutions.expected.rml @@ -0,0 +1,7 @@ + + + + Simple text + + + diff --git a/tests/test_data/no-text-substitutions.left.rml b/tests/test_data/no-text-substitutions.left.rml new file mode 100644 index 0000000..399b568 --- /dev/null +++ b/tests/test_data/no-text-substitutions.left.rml @@ -0,0 +1,4 @@ + + + + diff --git a/tests/test_data/no-text-substitutions.right.rml b/tests/test_data/no-text-substitutions.right.rml new file mode 100644 index 0000000..74920dd --- /dev/null +++ b/tests/test_data/no-text-substitutions.right.rml @@ -0,0 +1,5 @@ + + + Simple text + + diff --git a/tests/test_data/rmldoc.expected.rml b/tests/test_data/rmldoc.expected.rml new file mode 100644 index 0000000..9632cdf --- /dev/null +++ b/tests/test_data/rmldoc.expected.rml @@ -0,0 +1,300 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tests/test_data/rmldoc.expected.xml b/tests/test_data/rmldoc.expected.xml new file mode 100644 index 0000000..02888ab --- /dev/null +++ b/tests/test_data/rmldoc.expected.xml @@ -0,0 +1,498 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/tests/test_data/rmldoc.left.rml b/tests/test_data/rmldoc.left.rml new file mode 100644 index 0000000..3e191b5 --- /dev/null +++ b/tests/test_data/rmldoc.left.rml @@ -0,0 +1,508 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/tests/test_data/rmldoc.left.xml b/tests/test_data/rmldoc.left.xml new file mode 100644 index 0000000..3e191b5 --- /dev/null +++ b/tests/test_data/rmldoc.left.xml @@ -0,0 +1,508 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/tests/test_data/rmldoc.right.rml b/tests/test_data/rmldoc.right.rml new file mode 100644 index 0000000..5932065 --- /dev/null +++ b/tests/test_data/rmldoc.right.rml @@ -0,0 +1,519 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/tests/test_data/rmldoc.right.xml b/tests/test_data/rmldoc.right.xml new file mode 100644 index 0000000..5932065 --- /dev/null +++ b/tests/test_data/rmldoc.right.xml @@ -0,0 +1,519 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/tests/test_diff.py b/tests/test_diff.py new file mode 100644 index 0000000..f61a27a --- /dev/null +++ b/tests/test_diff.py @@ -0,0 +1,1067 @@ +import os +import unittest + +from io import open +from lxml import etree +from xmldiff import utils +from xmldiff.diff import (Differ, UpdateTextIn, InsertNode, MoveNode, + DeleteNode, UpdateAttrib, InsertAttrib, RenameAttrib, + DeleteAttrib, UpdateTextAfter) + + +class APITests(unittest.TestCase): + left = u"

Text

More

" + right = u"

Tokst

More

" + lefttree = etree.fromstring(left) + righttree = etree.fromstring(right) + differ = Differ() + + def test_set_trees(self): + # Passing in just one parameter causes an error: + with self.assertRaises(TypeError): + self.differ.set_trees(self.lefttree, None) + + # Passing in something that isn't iterable also cause errors... + with self.assertRaises(TypeError): + self.differ.set_trees(object(), self.righttree) + + # This is the way: + self.differ.set_trees(self.lefttree, self.righttree) + + def test_match(self): + # Passing in just one parameter causes an error: + with self.assertRaises(TypeError): + self.differ.match(self.lefttree, None) + + # Passing in something that isn't iterable also cause errors... + with self.assertRaises(TypeError): + self.differ.match(object(), self.righttree) + + # This is the way: + res1 = self.differ.match(self.lefttree, self.righttree) + lpath = self.differ.left.getroottree().getpath + rpath = self.differ.right.getroottree().getpath + res1x = [(lpath(x[0]), rpath(x[1]), x[2]) for x in res1] + + # Or, you can use set_trees: + self.differ.set_trees(self.lefttree, self.righttree) + res2 = self.differ.match() + lpath = self.differ.left.getroottree().getpath + rpath = self.differ.right.getroottree().getpath + res2x = [(lpath(x[0]), rpath(x[1]), x[2]) for x in res2] + + # The match sequences should be the same, of course: + self.assertEqual(res1x, res2x) + # But importantly, they are not the same object, meaning the + # matching was redone. + self.assertIsNot(res1, res2) + # However, if we call match() a second time without setting + # new sequences, we'll get a cached result: + self.assertIs(self.differ.match(), res2) + + def test_diff(self): + # Passing in just one parameter causes an error: + with self.assertRaises(TypeError): + list(self.differ.diff(self.lefttree, None)) + + # Passing in something that isn't iterable also cause errors... + with self.assertRaises(TypeError): + list(self.differ.diff(object(), self.righttree)) + + # This is the way: + res1 = list(self.differ.diff(self.lefttree, self.righttree)) + + # Or, you can use set_trees() or match() + # We need to reparse self.lefttree, since after the diffing they + # are equal. + self.lefttree = etree.fromstring(self.left) + self.differ.set_trees(self.lefttree, self.righttree) + res2 = list(self.differ.diff()) + + # The match sequences should be the same, of course: + self.assertEqual(res1, res2) + # But importantly, they are not the same object, meaning the + # matching was redone. + self.assertIsNot(res1, res2) + # There is no caching of diff(), so running it again means another + # diffing. + self.assertIsNot(list(self.differ.diff()), res2) + + +class NodeRatioTests(unittest.TestCase): + + def test_compare_equal(self): + xml = u""" + +
+ First paragraph +
+
+ Last paragraph +
+
+
+"""
+        tree = etree.fromstring(xml)
+        differ = Differ()
+        differ.set_trees(tree, tree)
+        differ.match()
+
+        # Every node in these trees should get a 1.0 leaf_ratio,
+        # and if it has children, a 1.0 child_ratio, otherwise None
+        for left, right in zip(utils.post_order_traverse(differ.left),
+                               utils.post_order_traverse(differ.right)):
+            self.assertEqual(differ.leaf_ratio(left, right), 1.0)
+            if left.getchildren():
+                self.assertEqual(differ.child_ratio(left, right), 1.0)
+            else:
+                self.assertIsNone(differ.child_ratio(left, right))
+
+    def test_compare_different_leafs(self):
+        left = u"""
+
+
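
The two similarity measures asserted in that loop can also be probed one pair of nodes at a time; a rough sketch (toy trees, and note that child_ratio() is only meaningful once match() has run, as above):

    from lxml import etree
    from xmldiff.diff import Differ

    differ = Differ()
    differ.set_trees(etree.fromstring('<a><b>same text</b></a>'),
                     etree.fromstring('<a><b>same text</b></a>'))
    differ.match()

    left_b = differ.left[0]    # the <b> element of the left tree
    right_b = differ.right[0]  # the <b> element of the right tree
    differ.leaf_ratio(left_b, right_b)   # 1.0: identical text and attributes
    differ.child_ratio(left_b, right_b)  # None: no children to compare
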
+ This doesn't match at all +
+
+ First paragraph +
+
+ Last paragraph +
+
+
+""" + + right = u""" + +
+ It's completely different +
+
+ Another paragraph +
+
+ Last paragraph +
+
+
+"""
+
+        lefttree = etree.fromstring(left)
+        righttree = etree.fromstring(right)
+        differ = Differ()
+
+        # Make some choice comparisons here
+        # These nodes are exactly the same
+        left = lefttree.xpath('/document/story/section[3]/para')[0]
+        right = righttree.xpath('/document/story/section[3]/para')[0]
+
+        self.assertEqual(differ.leaf_ratio(left, right), 1.0)
+
+        # These nodes have slightly different text, but no children
+        left = lefttree.xpath('/document/story/section[2]/para')[0]
+        right = righttree.xpath('/document/story/section[2]/para')[0]
+
+        self.assertAlmostEqual(differ.leaf_ratio(left, right),
+                               0.6875)
+
+        # These nodes should not be very similar
+        left = lefttree.xpath('/document/story/section[1]/para')[0]
+        right = righttree.xpath('/document/story/section[1]/para')[0]
+        self.assertAlmostEqual(differ.leaf_ratio(left, right),
+                               0.24)
+
+    def test_compare_different_nodes(self):
+        left = u"""
+
+
+ First paragraph + Second paragraph +
+
+ Third paragraph +
+
+ Last paragraph +
+
+
+""" + + right = u""" + +
+ First paragraph +
+
+ Second paragraph + Third paragraph +
+
+ Last paragraph +
+
+
+"""
+
+        differ = Differ()
+        differ.set_trees(etree.fromstring(left), etree.fromstring(right))
+        differ.match()
+
+        # Make some choice comparisons here. leaf_ratio will always be 1.0,
+        # as these leaves have the same attributes and no text, even though
+        # the attributes may be in a different order.
+        left = differ.left.xpath('/document/story/section[1]')[0]
+        right = differ.right.xpath('/document/story/section[1]')[0]
+
+        self.assertEqual(differ.leaf_ratio(left, right), 1.0)
+        # Only one of two matches:
+        self.assertEqual(differ.child_ratio(left, right), 0.5)
+
+        left = differ.left.xpath('/document/story/section[2]')[0]
+        right = differ.right.xpath('/document/story/section[2]')[0]
+
+        self.assertEqual(differ.leaf_ratio(left, right), 1.0)
+        # Only one of two matches:
+        self.assertEqual(differ.child_ratio(left, right), 0.5)
+
+        # These nodes are fully similar; both ratios are 1.0
+        left = differ.left.xpath('/document/story/section[3]')[0]
+        right = differ.right.xpath('/document/story/section[3]')[0]
+        self.assertEqual(differ.leaf_ratio(left, right), 1.0)
+        self.assertEqual(differ.child_ratio(left, right), 1.0)
+
+    def test_compare_with_xmlid(self):
+        left = u"""
+
+
+ First paragraph + This is the second paragraph +
+
+ Det tredje stycket +
+
+ Last paragraph +
+
+
+"""
+
+        differ = Differ()
+        differ.set_trees(etree.fromstring(left), etree.fromstring(right))
+        differ.match()
+
+        # Make some choice comparisons here.
+
+        left = differ.left.xpath('/document/story/section[1]')[0]
+        right = differ.right.xpath('/document/story/section[1]')[0]
+
+        # These have different xml:ids
+        self.assertEqual(differ.leaf_ratio(left, right), 0)
+        # And one out of two children in common
+        self.assertEqual(differ.child_ratio(left, right), 0.5)
+
+        # Here are the ones with the same xml:id:
+        left = differ.left.xpath('/document/story/section[1]')[0]
+        right = differ.right.xpath('/document/story/section[2]')[0]
+
+        self.assertEqual(differ.leaf_ratio(left, right), 1.0)
+        # And one out of two children in common
+        self.assertEqual(differ.child_ratio(left, right), 0.5)
+
+        # The last ones are completely similar, but only one
+        # has an xml:id, so they do not match.
+        left = differ.left.xpath('/document/story/section[3]')[0]
+        right = differ.right.xpath('/document/story/section[3]')[0]
+        self.assertEqual(differ.leaf_ratio(left, right), 0)
+        self.assertEqual(differ.child_ratio(left, right), 1.0)
+
+
+class MatchTests(unittest.TestCase):
+
+    def _match(self, left, right):
+        left_tree = etree.fromstring(left)
+        right_tree = etree.fromstring(right)
+        differ = Differ()
+        differ.set_trees(left_tree, right_tree)
+        matches = differ.match()
+        lpath = differ.left.getroottree().getpath
+        rpath = differ.right.getroottree().getpath
+        return [(lpath(item[0]), rpath(item[1])) for item in matches]
+
+    def test_same_tree(self):
+        xml = u"""
+
+
+ First paragraph +
+
+ This is the second + Det tredje stycket +
+
+ Last paragraph +
+
+
+""" + + differ = Differ() + differ.set_trees(etree.fromstring(left), etree.fromstring(right)) + differ.match() + + # Make some choice comparisons here. + + left = differ.left.xpath('/document/story/section[1]')[0] + right = differ.right.xpath('/document/story/section[1]')[0] + + # These have different id's + self.assertEqual(differ.leaf_ratio(left, right), 0) + # And one out of two children in common + self.assertEqual(differ.child_ratio(left, right), 0.5) + + # Here's the ones with the same id: + left = differ.left.xpath('/document/story/section[1]')[0] + right = differ.right.xpath('/document/story/section[2]')[0] + + self.assertEqual(differ.leaf_ratio(left, right), 1.0) + # And one out of two children in common + self.assertEqual(differ.child_ratio(left, right), 0.5) + + # The last ones are completely similar, but only one + # has an xml:id, so they do not match. + left = differ.left.xpath('/document/story/section[3]')[0] + right = differ.right.xpath('/document/story/section[3]')[0] + self.assertEqual(differ.leaf_ratio(left, right), 0) + self.assertEqual(differ.child_ratio(left, right), 1.0) + + +class MatchTests(unittest.TestCase): + + def _match(self, left, right): + left_tree = etree.fromstring(left) + right_tree = etree.fromstring(right) + differ = Differ() + differ.set_trees(left_tree, right_tree) + matches = differ.match() + lpath = differ.left.getroottree().getpath + rpath = differ.right.getroottree().getpath + return [(lpath(item[0]), rpath(item[1])) for item in matches] + + def test_same_tree(self): + xml = u""" + +
+ First paragraph +
+
+ Last paragraph +
+
+
+"""
+        result = self._match(xml, xml)
+        nodes = list(utils.post_order_traverse(etree.fromstring(xml)))
+        # Everything matches
+        self.assertEqual(len(result), len(nodes))
+
+    def test_no_xml_id_match(self):
+        # Here we insert a section first; because the sections contain
+        # numbering, it would be easy to match section 1 in the left with
+        # section 2 in the right, though it should be detected as an insert.
+
+        # If the number of similar attributes is small, this works fine:
+        # the differing content of the ref="3" section means it's detected
+        # to be an insert.
+        left = u"""
+
+
+ First paragraph +
+
+ Last paragraph +
+
+
+ """ + + # We even detect that the first section is an insert without + # xmlid, but that's less reliable. + right = u""" + +
+ New paragraph +
+
+ First paragraph +
+
+ Last paragraph +
+
+
+ """ + + result = self._match(left, right) + self.assertEqual(result, [ + ('/document/story/section[1]/para', + '/document/story/section[2]/para'), + ('/document/story/section[1]', + '/document/story/section[2]'), + ('/document/story/section[2]/para', + '/document/story/section[3]/para'), + ('/document/story/section[2]', + '/document/story/section[3]'), + ('/document/story', + '/document/story'), + ('/document', + '/document') + ]) + + def test_with_xmlid(self): + # This first section contains attributes that are similar (and longer + # than the content text. That would trick the matcher into matching + # the oldfirst and the newfirst section to match, except that we + # this time also have xml:id's, and they trump everything else! + left = u""" + +
+ First paragraph +
+
+ Second paragraph +
+
+ Last paragraph +
+
+
+""" + + # We even detect that the first section is an insert without + # xmlid, but that's less reliable. + right = u""" + +
+ New paragraph +
+
+ First paragraph +
+
+ Second paragraph +
+
+ Last paragraph +
+
+
+""" + + result = self._match(left, right) + self.assertEqual(result, [ + ('/document/story/section[1]/para', + '/document/story/section[2]/para'), + ('/document/story/section[1]', + '/document/story/section[2]'), + ('/document/story/section[2]/para', + '/document/story/section[3]/para'), + ('/document/story/section[2]', + '/document/story/section[3]'), + ('/document/story/section[3]/para', + '/document/story/section[4]/para'), + ('/document/story/section[3]', + '/document/story/section[4]'), + ('/document/story', + '/document/story'), + ('/document', + '/document') + ]) + + def test_change_attribs(self): + + left = u""" + +
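
The xml:id behaviour that test_no_xml_id_match and test_with_xmlid rely on is easy to demonstrate in isolation. A hedged sketch with made-up documents, showing a shared id forcing a match even though the text disagrees:

    from lxml import etree
    from xmldiff.diff import Differ

    left = etree.fromstring(
        '<doc><p xml:id="a">Completely different text</p></doc>')
    right = etree.fromstring(
        '<doc><p xml:id="a">Nothing in common here</p></doc>')

    differ = Differ()
    differ.set_trees(left, right)
    # The shared xml:id makes the two paragraphs match, so the edit
    # script is a single text update rather than a delete plus insert.
    actions = list(differ.diff())
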
+ First +
+
+ Last +
+
+
+""" + + right = u""" + +
+ First +
+
+ Last +
+
+
+"""
+        # Everything matches directly, which means the attribute changes
+        # should become updates, which makes sense.
+        result = self._match(left, right)
+        self.assertEqual(result, [
+            ('/document/story/section[1]/para',
+             '/document/story/section[1]/para'),
+            ('/document/story/section[1]',
+             '/document/story/section[1]'),
+            ('/document/story/section[2]/para',
+             '/document/story/section[2]/para'),
+            ('/document/story/section[2]',
+             '/document/story/section[2]'),
+            ('/document/story',
+             '/document/story'),
+            ('/document',
+             '/document')
+        ])
+
+    def test_move_paragraph(self):
+        left = u"""
+
+
+ First paragraph + Second paragraph +
+
+ Last paragraph +
+
+
+""" + + right = u""" + +
+ First paragraph +
+
+ Second paragraph + Last paragraph +
+
+
+""" + result = self._match(left, right) + self.assertEqual(result, [ + ('/document/story/section[1]/para[1]', + '/document/story/section[1]/para'), + ('/document/story/section[1]/para[2]', + '/document/story/section[2]/para[1]'), + ('/document/story/section[1]', '/document/story/section[1]'), + ('/document/story/section[2]/para', + '/document/story/section[2]/para[2]'), + ('/document/story/section[2]', '/document/story/section[2]'), + ('/document/story', '/document/story'), + ('/document', '/document') + ]) + + def test_match_complex_text(self): + left = """ + Consultant shall not indemnify and hold Company, its + affiliates and their respective directors, + officers, agents and employees harmless from and + against all claims, demands, losses, damages and + judgments, including court costs and attorneys' + fees, arising out of or based upon (a) any claim + that the Services provided hereunder or, any + related Intellectual Property Rights or the + exercise of any rights in or to any Company-Related + Development or Pre-Existing Development or related + Intellectual Property Rights infringe on, + constitute a misappropriation of the subject matter + of, or otherwise violate any patent, copyright, + trade secret, trademark or other proprietary right + of any person or breaches any person's contractual + rights; This is strange, but true. + """ + + right = """ + + Consultant shall not indemnify and hold + Company, its affiliates and their respective + directors, officers, agents and employees harmless + from and against all claims, demands, losses, + excluding court costs and attorneys' fees, arising + out of or based upon (a) any claim that the + Services provided hereunder or, any related + Intellectual Property Rights or the exercise of any + rights in or to any Company-Related Development or + Pre-Existing Development or related Intellectual + Property Rights infringe on, constitute a + misappropriation of the subject matter of, or + otherwise violate any patent, copyright, trade + secret, trademark or other proprietary right of any + person or breaches any person's contractual rights; + This is very strange, but true. + + """ + + result = self._match(left, right) + self.assertEqual(result, [ + ('/wrap/para/b', '/wrap/para/b'), + ('/wrap/para', '/wrap/para'), + ('/wrap', '/wrap') + ]) + + def test_match_insert_node(self): + left = u''' + + + + +''' + right = u''' + + +

Inserted Node

+ +
+
''' + result = self._match(left, right) + self.assertEqual(result, [ + ('/document/story', '/document/story'), + ('/document', '/document'), + ]) + + def test_entirely_different(self): + left = u''' + + + + +''' + right = u''' +

Inserted Node

+
''' + result = self._match(left, right) + self.assertEqual(result, [ + ('/document', '/document'), + ]) + + +class UpdateNodeTests(unittest.TestCase): + """Testing only the update phase of the diffing""" + + def _match(self, left, right): + left_tree = etree.fromstring(left) + right_tree = etree.fromstring(right) + differ = Differ() + differ.set_trees(left_tree, right_tree) + matches = differ.match() + steps = [] + for left, right, m in matches: + steps.extend(differ.update_node_attr(left, right)) + steps.extend(differ.update_node_text(left, right)) + + return steps + + def test_same_tree(self): + xml = u""" + +
+ First paragraph +
+
+ Last paragraph +
+
+
+""" + result = self._match(xml, xml) + # Everything matches + self.assertEqual(result, []) + + def test_attribute_changes(self): + left = u"""The contained textAnd a tail!""" + + right = u"""The new textAlso a tail!""" + + result = self._match(left, right) + + self.assertEqual( + result, + [ + UpdateAttrib('/root/node[1]', 'attr2', 'uhhuh'), + RenameAttrib('/root/node[1]', 'attr1', 'attr4'), + InsertAttrib('/root/node[1]', 'attr5', 'new'), + DeleteAttrib('/root/node[1]', 'attr0'), + UpdateTextIn('/root/node[1]', 'The new text'), + UpdateTextAfter('/root/node[1]', 'Also a tail!'), + ] + ) + + +class AlignChildrenTests(unittest.TestCase): + """Testing only the align phase of the diffing""" + + def _align(self, left, right): + left_tree = etree.fromstring(left) + right_tree = etree.fromstring(right) + differ = Differ() + differ.set_trees(left_tree, right_tree) + matches = differ.match() + steps = [] + for left, right, m in matches: + steps.extend(differ.align_children(left, right)) + return steps + + def test_same_tree(self): + xml = u""" + +
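
The actions compared in test_attribute_changes behave like namedtuples, which is what makes list equality assertions possible; the field names here are an assumption, suggested by the keyword reprs used elsewhere in this file (e.g. DeleteAttrib(node=..., name=...)). A tiny sketch:

    from xmldiff.diff import UpdateAttrib

    action = UpdateAttrib('/root/node[1]', 'attr2', 'uhhuh')
    action.node                                                # '/root/node[1]'
    action == UpdateAttrib('/root/node[1]', 'attr2', 'uhhuh')  # True
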
+ First paragraph +
+
+ Last paragraph +
+
+
+""" + result = self._align(xml, xml) + # Everything matches + self.assertEqual(result, []) + + def test_move_paragraph(self): + left = u""" + +
+ First paragraph + Second paragraph +
+
+ Last paragraph +
+
+
+""" + + right = u""" + +
+ First paragraph +
+
+ Second paragraph + Last paragraph +
+
+
+""" + result = self._align(left, right) + # Everything matches + self.assertEqual(result, []) + + def test_move_children(self): + left = u""" + +
+ First paragraph + Second paragraph + Last paragraph +
+
+
+""" + + right = u""" + +
+ Second paragraph + Last paragraph + First paragraph +
+
+
+"""
+        result = self._align(left, right)
+        self.assertEqual(result,
+                         [MoveNode('/document/story/section/para[1]',
+                                   '/document/story/section[1]', 2)])
+
+
+class DiffTests(unittest.TestCase):
+    """Testing the whole diffing process, not just one phase"""
+
+    def _diff(self, left, right):
+        parser = etree.XMLParser(remove_blank_text=True)
+        left_tree = etree.fromstring(left, parser)
+        right_tree = etree.fromstring(right, parser)
+        differ = Differ()
+        differ.set_trees(left_tree, right_tree)
+        editscript = list(differ.diff())
+        return editscript
+
+    def test_process(self):
+        left = u"""
+
+
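
The _diff() fixture above is the whole diffing pipeline in miniature; a condensed sketch of the same flow as a standalone helper (names are hypothetical):

    from lxml import etree
    from xmldiff.diff import Differ

    def edit_script(left_xml, right_xml):
        # Strip whitespace-only text nodes, as the fixture does, so
        # that indentation does not show up as spurious text updates.
        parser = etree.XMLParser(remove_blank_text=True)
        differ = Differ()
        differ.set_trees(etree.fromstring(left_xml, parser),
                         etree.fromstring(right_xml, parser))
        return list(differ.diff())
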
+ First paragraph + Second paragraph + Third paragraph +
+ + Delete it + +
+
+""" + + right = u""" + +
+ First paragraph + Second paragraph +
+
+ Third paragraph + Fourth paragraph +
+
+
+""" + result = self._diff(left, right) + self.assertEqual( + result, + [ + InsertNode('/document/story[1]', 'section', 1), + InsertAttrib('/document/story/section[2]', 'ref', '4'), + InsertAttrib('/document/story/section[2]', 'single-ref', '4'), + MoveNode('/document/story/section[1]/para[3]', + '/document/story/section[2]', 0), + InsertNode('/document/story/section[2]', 'para', 0), + UpdateTextIn('/document/story/section[2]/para[1]', + 'Fourth paragraph'), + DeleteNode('/document/story/deleteme/para[1]'), + DeleteNode('/document/story/deleteme[1]'), + ] + ) + + def test_needs_align(self): + left = "
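
Edit scripts like the one asserted in test_process are meant for programs; for human consumption the same script can be rendered through a formatter. A hedged sketch, assuming diff_texts() accepts the same formatter keyword that diff_files() is shown to accept in test_main.py below:

    from xmldiff import formatting, main

    print(main.diff_texts(u'<doc><p>old</p></doc>',
                          u'<doc><p>new</p></doc>',
                          formatter=formatting.DiffFormatter()))
    # Prints the compact text form used by the command line tool,
    # roughly: [update-text, /doc/p, "new"]
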

1

2

3

4

" + right = "

2

4

1

3

" + result = self._diff(left, right) + self.assertEqual( + result, + [ + MoveNode('/root/n[1]', '/root[1]', 1), + MoveNode('/root/n[2]/p[2]', '/root/n[1]', 0), + ] + ) + + def test_no_root_match(self): + left = '

1

2

3

'\ + '

4

' + right = '

2

4

1

3

' + result = self._diff(left, right) + self.assertEqual( + result, + [ + DeleteAttrib(node='/root[1]', name='attr'), + MoveNode('/root/n[1]', '/root[1]', 1), + MoveNode('/root/n[2]/p[2]', '/root/n[1]', 0), + ] + ) + + def test_rmldoc(self): + here = os.path.split(__file__)[0] + lfile = os.path.join(here, 'test_data', 'rmldoc.left.xml') + rfile = os.path.join(here, 'test_data', 'rmldoc.right.xml') + with open(lfile, 'rt', encoding='utf8') as infile: + left = infile.read() + with open(rfile, 'rt', encoding='utf8') as infile: + right = infile.read() + + result = self._diff(left, right) + self.assertEqual( + result, + [ + InsertNode( + '/document/story[1]', + '{http://namespaces.shoobx.com/application}section', + 4), + InsertAttrib( + '/document/story/app:section[4]', 'hidden', 'false'), + InsertAttrib( + '/document/story/app:section[4]', 'name', 'sign'), + InsertAttrib( + '/document/story/app:section[4]', 'ref', '3'), + InsertAttrib( + '/document/story/app:section[4]', 'removed', 'false'), + InsertAttrib( + '/document/story/app:section[4]', 'single-ref', '3'), + InsertAttrib( + '/document/story/app:section[4]', 'title', 'Signing Bonus'), + UpdateAttrib('/document/story/app:section[5]', 'ref', '4'), + UpdateAttrib( + '/document/story/app:section[5]', 'single-ref', '4'), + UpdateAttrib('/document/story/app:section[6]', 'ref', '5'), + UpdateAttrib( + '/document/story/app:section[6]', 'single-ref', '5'), + UpdateAttrib('/document/story/app:section[7]', 'ref', '6'), + UpdateAttrib( + '/document/story/app:section[7]', 'single-ref', '6'), + UpdateAttrib('/document/story/app:section[8]', 'ref', '7'), + UpdateAttrib( + '/document/story/app:section[8]', 'single-ref', '7'), + UpdateAttrib('/document/story/app:section[9]', 'ref', '8'), + UpdateAttrib( + '/document/story/app:section[9]', 'single-ref', '8'), + UpdateAttrib('/document/story/app:section[10]', 'ref', '9'), + UpdateAttrib( + '/document/story/app:section[10]', 'single-ref', '9'), + UpdateAttrib('/document/story/app:section[11]', 'ref', '10'), + UpdateAttrib( + '/document/story/app:section[11]', 'single-ref', '10'), + UpdateAttrib('/document/story/app:section[12]', 'ref', '11'), + UpdateAttrib( + '/document/story/app:section[12]', 'single-ref', '11'), + UpdateAttrib('/document/story/app:section[14]', 'ref', '12'), + UpdateAttrib( + '/document/story/app:section[14]', 'single-ref', '12'), + UpdateTextIn( + '/document/story/app:section[1]/para[2]/' + 'app:placeholder[1]', + 'Second Name'), + InsertNode( + '/document/story/app:section[4]', + '{http://namespaces.shoobx.com/application}term', + 0), + InsertAttrib( + '/document/story/app:section[4]/app:term[1]', 'name', + 'sign_bonus'), + InsertAttrib( + '/document/story/app:section[4]/app:term[1]', 'set', 'ol'), + InsertNode('/document/story/app:section[4]', 'para', 1), + InsertNode( + '/document/story/app:section[4]/para[1]', + '{http://namespaces.shoobx.com/application}ref', + 0), + InsertAttrib( + '/document/story/app:section[4]/para/app:ref[1]', 'name', + 'sign'), + InsertAttrib( + '/document/story/app:section[4]/para/app:ref[1]', + '{http://namespaces.shoobx.com/preview}body', + ''), + UpdateTextIn( + '/document/story/app:section[4]/para/app:ref[1]', '3'), + UpdateTextAfter( + '/document/story/app:section[4]/para/app:ref[1]', '. 
'), + InsertNode('/document/story/app:section[4]/para[1]', 'u', 1), + UpdateTextAfter( + '/document/story/app:section[4]/para/u[1]', + '.\n You will also be paid a '), + InsertNode( + '/document/story/app:section[4]/para[1]', + '{http://namespaces.shoobx.com/application}placeholder', + 2), + InsertAttrib( + '/document/story/app:section[4]/para/app:placeholder[1]', + 'field', + 'ol.sign_bonus_include_amt'), + InsertAttrib( + '/document/story/app:section[4]/para/app:placeholder[1]', + 'missing', + 'Signing Bonus Amount'), + UpdateTextAfter( + '/document/story/app:section[4]/para/app:placeholder[1]', + ' signing\n bonus, which will be paid on the ' + 'next regularly scheduled pay date\n after ' + 'you start employment with the Company.\n \n' + ' ' + ), + InsertNode('/document/story/app:section[4]/para/u[1]', 'b', 0), + UpdateTextIn( + '/document/story/app:section[4]/para/u/b[1]', + 'Signing Bonus'), + UpdateTextIn( + '/document/story/app:section[5]/para/app:ref[1]', + '4'), + UpdateTextIn( + '/document/story/app:section[6]/para/app:ref[1]', + '5'), + UpdateTextIn( + '/document/story/app:section[7]/para/app:ref[1]', + '6'), + UpdateTextIn( + '/document/story/app:section[8]/para/app:ref[1]', + '7'), + UpdateTextIn( + '/document/story/app:section[9]/para/app:ref[1]', + '8'), + UpdateTextIn( + '/document/story/app:section[10]/para/app:ref[1]', + '9'), + UpdateTextIn( + '/document/story/app:section[11]/para/app:ref[1]', + '10'), + UpdateTextIn( + '/document/story/app:section[12]/para/app:ref[1]', + '11') + ] + ) + + def test_namespace(self): + # Test changing nodes and attributes with namespaces + left = u""" + + + Lorem ipsum dolor sit amet, + consectetur adipiscing elit. Pellentesque feugiat metus quam. + Suspendisse potenti. Vestibulum quis ornare felis, + ac elementum sem. + Second paragraph + Third paragraph + + Paragraph to tweak the matching of the section node + + + By making many matching children + + + Until the node matches properly. + + + + +""" + + right = u""" + + + Lorem ipsum dolor sit amet, + consectetur adipiscing elit. Pellentesque feugiat metus quam. + Suspendisse potenti. Vestibulum quis ornare felis, + ac elementum sem. + Second paragraph + Third paragraph + + Paragraph to tweak the matching of the section node + + + By making many matching children + + + Until the node matches properly. + + + + +""" + result = self._diff(left, right) + self.assertEqual( + result, + [ + InsertNode( + '/document/story/app:section[1]', + '{someuri}para', + 0), + UpdateTextIn( + '/document/story/app:section/app:para[1]', + 'Lorem ipsum dolor sit amet,\n consectetur ' + 'adipiscing elit. Pellentesque feugiat metus quam.\n' + ' Suspendisse potenti. Vestibulum quis ' + 'ornare felis,\n ac elementum sem.'), + InsertAttrib('/document/story/app:section/app:para[3]', + '{someuri}attrib', 'value'), + DeleteNode('/document/story/app:section/foo:para[1]'), + ] + ) + + def test_multiple_tag_deletes(self): + left = u""" + + +
    +
  • One
  • +
  • Two
  • +
  • Three
  • +
+ +
+
""" + + right = u""" + + +""" + + result = self._diff(left, right) + self.assertEqual( + result, + [UpdateTextIn(node='/document/story[1]', text='\n '), + DeleteNode(node='/document/story/ul/li[3]'), + DeleteNode(node='/document/story/ul/li[2]'), + DeleteNode(node='/document/story/ul/li[1]'), + DeleteNode(node='/document/story/ul[1]'), + ] + ) diff --git a/tests/test_difflib.py b/tests/test_difflib.py deleted file mode 100644 index f4402d8..0000000 --- a/tests/test_difflib.py +++ /dev/null @@ -1,127 +0,0 @@ -# Copyright (c) 2000-2010 LOGILAB S.A. (Paris, FRANCE). -# http://www.logilab.fr/ -- mailto:contact@logilab.fr -# Copyright (c) 2018 Shoobx.com. -# https://www.shoobx.com/ -- mailto:dev@shoobx.com -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . -import random -import xmldiff.difflib - - -def _cmp(a, b): - return a == b - - -def lcsl(X, Y, equal): - """return the length of the result sent by lcs2""" - return len(xmldiff.difflib.lcs2(X, Y, equal)) - - -def help_test(seq1, seq2, res): - seq = xmldiff.difflib.lcs2(seq1, seq2, _cmp) - assert seq == list(zip(res, res)) - - -def test_lcs_1(lcs2_type): - help_test("abcdefghijkl", "bcdeghijk", "bcdeghijk") - - -def test_lcs_2(lcs2_type): - help_test("abdefghijkl", "bcdeghijk", "bdeghijk") - - -def test_lcs_3(lcs2_type): - help_test("abdefghijkl", "bxcydzewgzhijk", "bdeghijk") - - -def test_lcs_4(lcs2_type): - help_test("abdefghijkl", "zzzbcdeghijk", "bdeghijk") - - -def test_lcs_5(lcs2_type): - help_test("", "", []) - - -def test_lcs_6(): - seq = xmldiff.difflib.lcs4("", "", _cmp) - assert seq == [] - - -def test_quick_ratio(): - seq = xmldiff.difflib.quick_ratio("", "") - assert seq == 1 - - -# def test_time_lcs2(lcs2=lcs2): -# import time -# t = time.clock() -# quick_ratio('abcdefghijklmnopqrst'*100, 'abcdefghijklmnopqrst'*100) -# print 'quick ratio :', time.clock()-t -# lcs2('abcdefghijklmnopqrst'*100, 'abcdefghijklmnopqrst'*100, -# lambda x, y: x == y) -# print 'lcs2 : ', time.clock()-t -# quick_ratio('abcdefghijklmno'*100, 'zyxwvutsrqp'*100) -# print 'quick ratio :', time.clock()-t -# lcs2('abcdefghijklmno'*100, 'zyxwvutsrqp'*100, lambda x, y: x == y) -# print 'lcs2 : ', time.clock()-t -# quick_ratio('abcdefghijklmnopqrst'*100, 'abcdefghijklmnopqrst'*100) -# print 'quick ratio :', time.clock()-t -# lcs2('abcdefghijklmnopqrst'*100, 'abcdefghijklmnopqrst'*100, -# lambda x, y: x == y) -# print 'lcs2 : ', time.clock()-t -# quick_ratio('abcdefghijklmno'*100, 'zyxwvutsrqp'*100) -# print 'quick ratio :', time.clock()-t -# lcs2('abcdefghijklmno'*100, 'zyxwvutsrqp'*100, lambda x, y: x == y) -# print 'lcs2 : ', time.clock()-t - - -# def test_main_lcs2(lcs2=lcs2): -# print "abcde - bydc" -# print lcsl('abcde', 'bydc', lambda x, y: x == y) -# for a in lcs2('abcde', 'bydc', lambda x, y: x == y): -# print a -# print "abacdge - bcdg" -# print lcsl('abacdge', 'bcdg', lambda x, y: x == y) -# for a in lcs2('abacdge', 'bcdg', lambda x, y: x == y): -# print a - - -def randstr(lmin, lmax, alphabet): - 
L = random.randint(lmin, lmax) - S = [] - N = len(alphabet) - 1 - for i in range(L): - S.append(alphabet[random.randint(0, N)]) - return "".join(S) - - -def test_random_string(lcs2_type): - """Generate random test sequences and compare lcs2, lcs3, lcs4""" - import xmldiff.maplookup - lcsm = xmldiff.maplookup.lcs2 - - _alpha = "abcdefghijklmnopqrstuvwxyz" - for i in range(100): - S1 = randstr(2, 5, _alpha) - S2 = randstr(2, 5, _alpha) - # print S1, S2 - R1 = xmldiff.difflib.lcs2(S1, S2, _cmp) - # print "lcs2:", "".join([x[0] for x in R1]) - R2 = xmldiff.difflib.lcs4(S1, S2, _cmp) - # print "lcs4", "".join([x[0] for x in R2]) - R3 = lcsm(S1, S2, _cmp) - # print "lcsm", "".join([x[0] for x in R3]) - # print - assert R1 == R2, (S1, S2) - assert R1 == R3, (S1, S2) diff --git a/tests/test_formatting.py b/tests/test_formatting.py new file mode 100644 index 0000000..06fab4e --- /dev/null +++ b/tests/test_formatting.py @@ -0,0 +1,375 @@ +# -*- coding: UTF-8 -*- +import os +import unittest + +from lxml import etree +from xmldiff import diff, formatting, main + +from .testing import generate_filebased_cases + +START = u'' + + +class PlaceholderMakerTests(unittest.TestCase): + + def test_get_placeholder(self): + replacer = formatting.PlaceholderMaker() + # Get a placeholder: + ph = replacer.get_placeholder( + etree.Element('tag'), formatting.T_OPEN, None) + self.assertEqual(ph, u'\U000f0005') + # Do it again: + ph = replacer.get_placeholder( + etree.Element('tag'), formatting.T_OPEN, None) + self.assertEqual(ph, u'\U000f0005') + # Get another one + ph = replacer.get_placeholder( + etree.Element('tag'), formatting.T_CLOSE, ph) + self.assertEqual(ph, u'\U000f0006') + + def test_do_element(self): + replacer = formatting.PlaceholderMaker(['p'], ['b']) + + # Formatting tags get replaced, and the content remains + text = u'

This is a tag with formatted text.

' + element = etree.fromstring(text) + replacer.do_element(element) + + self.assertEqual( + etree.tounicode(element), + u'

This is a tag with \U000f0006formatted\U000f0005 text.

') + + replacer.undo_element(element) + self.assertEqual(etree.tounicode(element), text) + + # Non formatting tags get replaced with content + text = u'

This is a tag with formatted text.

' + element = etree.fromstring(text) + replacer.do_element(element) + result = etree.tounicode(element) + self.assertEqual( + result, + u'

This is a tag with \U000f0007 text.

') + + # Single formatting tags still get two placeholders. + text = u'

This is a with text.

' + element = etree.fromstring(text) + replacer.do_element(element) + result = etree.tounicode(element) + self.assertEqual( + result, + u'

This is a \U000f0009\U000f0008 with \U000f000a text.

') + + def test_do_undo_element(self): + replacer = formatting.PlaceholderMaker(['p'], ['b']) + + # Formatting tags get replaced, and the content remains + text = u'

This a tag with formatted text.

' + element = etree.fromstring(text) + replacer.do_element(element) + + self.assertEqual( + element.text, + u'This \U000f0005 a \U000f0006 with \U000f0008formatted' + u'\U000f0007 text.') + + replacer.undo_element(element) + result = etree.tounicode(element) + self.assertEqual(result, text) + + def test_do_undo_element_double_format(self): + replacer = formatting.PlaceholderMaker(['p'], ['b', 'u']) + + # Formatting tags get replaced, and the content remains + text = u'

This is doubly formatted text.

' + element = etree.fromstring(text) + replacer.do_element(element) + + self.assertEqual( + element.text, + u'This is \U000f0006doubly \U000f0008formatted\U000f0007' + u'\U000f0005 text.') + + replacer.undo_element(element) + result = etree.tounicode(element) + self.assertEqual(result, text) + + def test_rml_bug(self): + etree.register_namespace(formatting.DIFF_PREFIX, formatting.DIFF_NS) + before_diff = u""" +
+ + 4. + At Will Employment + .\u201cText\u201d + +
+
""" + tree = etree.fromstring(before_diff) + replacer = formatting.PlaceholderMaker( + text_tags=('para',), formatting_tags=('b', 'u', 'i',)) + replacer.do_tree(tree) + after_diff = u""" +
+ + \U000f0005. + \U000f0007\U000f0009At Will Employment\U000f0008\U000f0006 + .\u201cNew Text\u201d + +
+
""" + + # The diff formatting will find some text to insert. + delete_attrib = u'{%s}delete-format' % formatting.DIFF_NS + replacer.placeholder2tag[u'\U000f0006' + ].element.attrib[delete_attrib] = '' + replacer.placeholder2tag[u'\U000f0007' + ].element.attrib[delete_attrib] = '' + tree = etree.fromstring(after_diff) + replacer.undo_tree(tree) + result = etree.tounicode(tree) + expected = u""" +
+ + 4. + At Will Employment + .\u201cNew Text\u201d + +
+
""" + self.assertEqual(result, expected) + + +class XMLFormatTests(unittest.TestCase): + + def _format_test(self, left, action, expected): + formatter = formatting.XMLFormatter(pretty_print=False) + result = formatter.format([action], etree.fromstring(left)) + self.assertEqual(result, expected) + + def test_incorrect_xpaths(self): + left = u'Text' + expected = START + u' diff:delete-attr="a">Text' + END + + with self.assertRaises(ValueError): + action = diff.DeleteAttrib('/document/node', 'a') + self._format_test(left, action, expected) + + with self.assertRaises(ValueError): + action = diff.DeleteAttrib('/document/ummagumma', 'a') + self._format_test(left, action, expected) + + def test_del_attr(self): + left = u'Text' + action = diff.DeleteAttrib('/document/node', 'a') + expected = START + u' diff:delete-attr="a">Text' + END + + self._format_test(left, action, expected) + + def test_del_node(self): + left = u'Text' + action = diff.DeleteNode('/document/node') + expected = START + u' attr="val" diff:delete="">Text' + END + + self._format_test(left, action, expected) + + def test_del_text(self): + left = u'Text' + action = diff.UpdateTextIn('/document/node', None) + expected = START + u' attr="val">Text' + END + + self._format_test(left, action, expected) + + def test_insert_attr(self): + left = u'We need more text' + action = diff.InsertAttrib('/document/node', 'attr', 'val') + expected = START + u' attr="val" diff:add-attr="attr">'\ + u'We need more text' + END + + self._format_test(left, action, expected) + + def test_insert_node(self): + left = u'' + action = diff.InsertNode('/document', 'node', 0) + expected = START + u' diff:insert=""/>
' + + self._format_test(left, action, expected) + + def test_move_attr(self): + # The library currently only uses move attr for when attributes are + # renamed: + left = u'Text' + action = diff.RenameAttrib('/document/node', 'attr', 'bottr') + expected = START + u' bottr="val" diff:rename-attr="attr:bottr"'\ + u'>Text' + END + + self._format_test(left, action, expected) + + def test_move_node(self): + # Move 1 down + left = u'' + action = diff.MoveNode('/document/node[1]', '/document', 1) + expected = START + u' id="1" diff:delete=""/>' + + self._format_test(left, action, expected) + + # Move 2 up (same result, different diff) + left = u'' + action = diff.MoveNode('/document/node[2]', '/document', 0) + expected = START + u' id="2" diff:insert=""/>' + + self._format_test(left, action, expected) + + def test_update_attr(self): + left = u'' + action = diff.UpdateAttrib('/document/node', 'attr', 'newval') + expected = START + u' attr="newval" diff:update-attr="attr:val"/>'\ + u'' + + self._format_test(left, action, expected) + + def test_update_text_in(self): + left = u'' + action = diff.UpdateTextIn('/document/node', 'Text') + expected = START + u' attr="val">Text' + END + + self._format_test(left, action, expected) + + left = u'This is a bit of text, right' + END + action = diff.UpdateTextIn('/document/node', + 'Also a bit of text, rick') + expected = START + u'>This is'\ + u'Also a bit of text, right'\ + u'ck' + END + + self._format_test(left, action, expected) + + def test_update_text_after_1(self): + left = u'' + action = diff.UpdateTextAfter('/document/node[1]', 'Text') + expected = START + u'/>Text'\ + u'' + + self._format_test(left, action, expected) + + def test_update_text_after_2(self): + left = u'This is a bit of text, right' + action = diff.UpdateTextAfter('/document/node', + 'Also a bit of text, rick') + expected = START + u'/>This is'\ + u'Also a bit of text, ri'\ + u'ghtck' + + self._format_test(left, action, expected) + + +class DiffFormatTests(unittest.TestCase): + + def _format_test(self, action, expected): + formatter = formatting.DiffFormatter() + result = formatter.format([action], None) + self.assertEqual(result, expected) + + def test_del_attr(self): + action = diff.DeleteAttrib('/document/node', 'a') + expected = '[delete-attribute, /document/node, a]' + self._format_test(action, expected) + + def test_del_node(self): + action = diff.DeleteNode('/document/node') + expected = '[delete, /document/node]' + self._format_test(action, expected) + + def test_del_text(self): + action = diff.UpdateTextIn('/document/node', None) + expected = '[update-text, /document/node, null]' + self._format_test(action, expected) + + def test_insert_attr(self): + action = diff.InsertAttrib('/document/node', 'attr', 'val') + expected = '[insert-attribute, /document/node, attr, "val"]' + self._format_test(action, expected) + + def test_insert_node(self): + action = diff.InsertNode('/document', 'node', 0) + expected = '[insert, /document, node, 0]' + self._format_test(action, expected) + + def test_rename_attr(self): + action = diff.RenameAttrib('/document/node', 'attr', 'bottr') + expected = '[move-attribute, /document/node, attr, bottr]' + self._format_test(action, expected) + + def test_move_node(self): + # Move 1 down + action = diff.MoveNode('/document/node[1]', '/document', 1) + expected = '[move, /document/node[1], /document, 1]' + self._format_test(action, expected) + + # Move 2 up (same result, different diff) + action = diff.MoveNode('/document/node[2]', '/document', 0) + expected = 
'[move, /document/node[2], /document, 0]' + + self._format_test(action, expected) + + def test_update_attr(self): + action = diff.UpdateAttrib('/document/node', 'attr', 'newval') + expected = '[update-attribute, /document/node, attr, "newval"]' + self._format_test(action, expected) + + def test_update_text_in(self): + action = diff.UpdateTextIn('/document/node', 'Text') + expected = '[update-text, /document/node, "Text"]' + self._format_test(action, expected) + + action = diff.UpdateTextIn('/document/node', + 'Also a bit of text, "rick"') + expected = '[update-text, /document/node, '\ + u'"Also a bit of text, \\"rick\\""]' + self._format_test(action, expected) + + def test_update_text_after_1(self): + action = diff.UpdateTextAfter('/document/node[1]', 'Text') + expected = '[update-text-after, /document/node[1], "Text"]' + self._format_test(action, expected) + + def test_update_text_after_2(self): + action = diff.UpdateTextAfter('/document/node', + 'Also a bit of text, rick') + expected = '[update-text-after, /document/node, '\ + u'"Also a bit of text, rick"]' + self._format_test(action, expected) + + +class FormatterFileTests(unittest.TestCase): + + formatter = None # Override this + maxDiff = None + + def process(self, left, right): + return main.diff_files(left, right, formatter=self.formatter) + + +class XMLFormatterFileTests(FormatterFileTests): + + # The XMLFormatter has no text or formatting tags, so + formatter = formatting.XMLFormatter(pretty_print=False, + normalize=formatting.WS_TEXT) + + +class RMLFormatterFileTests(FormatterFileTests): + + # We use the RMLFormatter for the placeholder tests + formatter = formatting.RMLFormatter() + + +# Add tests that use no placeholder replacement (ie plain XML) +data_dir = os.path.join(os.path.dirname(__file__), 'test_data') +generate_filebased_cases(data_dir, XMLFormatterFileTests) + +# Add tests that use placeholder replacement (ie RML) +data_dir = os.path.join(os.path.dirname(__file__), 'test_data') +generate_filebased_cases(data_dir, RMLFormatterFileTests, suffix='rml') diff --git a/tests/test_main.py b/tests/test_main.py new file mode 100644 index 0000000..5d3c4ea --- /dev/null +++ b/tests/test_main.py @@ -0,0 +1,130 @@ +import os +import six +import sys +import unittest + +from lxml import etree +from xmldiff import main, formatting + +CURDIR = os.path.split(__file__)[0] +LEFT_FILE = os.path.join(CURDIR, 'test_data', 'rmldoc.left.xml') +RIGHT_FILE = os.path.join(CURDIR, 'test_data', 'rmldoc.right.xml') +EXPECTED_FILE = os.path.join(CURDIR, 'test_data', 'rmldoc.expected.xml') + + +class MainAPITests(unittest.TestCase): + + def test_api_diff_files(self): + # diff_files can take filenames + result1 = main.diff_files(LEFT_FILE, RIGHT_FILE) + + # Or open file streams: + with open(LEFT_FILE, 'rb') as linfile: + with open(RIGHT_FILE, 'rb') as rinfile: + result2 = main.diff_files(linfile, rinfile) + + self.assertEqual(result1, result2) + + # Give something else, and it fails: + with self.assertRaises(IOError): + main.diff_files('', '') + + def test_api_diff_texts(self): + # diff_text can take bytes + with open(LEFT_FILE, 'rb') as linfile: + with open(RIGHT_FILE, 'rb') as rinfile: + left = linfile.read() + right = rinfile.read() + result1 = main.diff_texts(left, right) + + # And unicode + result2 = main.diff_texts(left.decode('utf8'), + right.decode('utf8')) + + self.assertEqual(result1, result2) + + with open(LEFT_FILE, 'rb') as infile: + with open(RIGHT_FILE, 'rb') as infile: + # Give something else, and it fails: + with 
self.assertRaises(ValueError):
+                    main.diff_texts(infile, infile)
+
+    def test_api_diff_trees(self):
+        # diff_trees can take ElementTrees
+        left = etree.parse(LEFT_FILE)
+        right = etree.parse(RIGHT_FILE)
+        result1 = main.diff_trees(left, right)
+
+        # And Elements
+        result2 = main.diff_trees(left.getroot(), right.getroot())
+        self.assertEqual(result1, result2)
+
+        # Give something else, and it fails:
+        with self.assertRaises(TypeError):
+            main.diff_trees(LEFT_FILE, RIGHT_FILE)
+
+    def test_api_diff_files_with_formatter(self):
+        formatter = formatting.XMLFormatter()
+        # diff_files can take filenames
+        result = main.diff_files(LEFT_FILE, RIGHT_FILE, formatter=formatter)
+        # This formatter will insert a diff namespace:
+        self.assertIn('xmlns:diff="http://namespaces.shoobx.com/diff"', result)
+
+
+class MainCLITests(unittest.TestCase):
+
+    def call_run(self, args):
+        output = six.StringIO()
+        errors = six.StringIO()
+
+        stdout = sys.stdout
+        stderr = sys.stderr
+
+        try:
+            sys.stdout = output
+            sys.stderr = errors
+
+            main.run(args)
+        finally:
+            sys.stdout = stdout
+            sys.stderr = stderr
+
+        return output.getvalue(), errors.getvalue()
+
+    def test_cli_no_args(self):
+        with self.assertRaises(SystemExit):
+            stdout, stderr = self.call_run([])
+
+    def test_cli_simple(self):
+        curdir = os.path.dirname(__file__)
+        filepath = os.path.join(curdir, 'test_data')
+        file1 = os.path.join(filepath, 'insert-node.left.rml')
+        file2 = os.path.join(filepath, 'insert-node.right.rml')
+
+        output, errors = self.call_run([file1, file2])
+        self.assertEqual(len(output.splitlines()), 5)
+        # This should default to the diff formatter:
+        self.assertEqual(output[0], '[')
+
+    def test_cli_args(self):
+        curdir = os.path.dirname(__file__)
+        filepath = os.path.join(curdir, 'test_data')
+        file1 = os.path.join(filepath, 'insert-node.left.rml')
+        file2 = os.path.join(filepath, 'insert-node.right.rml')
+
+        # Select a formatter:
+        output, errors = self.call_run([file1, file2, '--formatter', 'xml'])
+        # It gives a very compact output
+        self.assertEqual(len(output.splitlines()), 1)
+        # Now it's XML
+        self.assertEqual(output[0], '<')
+
+        # Not stripping the whitespace keeps the formatting from the source:
+        output, errors = self.call_run([file1, file2, '--keep-whitespace',
+                                        '--formatter', 'xml'])
+        self.assertEqual(len(output.splitlines()), 7)
+
+        # And stripping and pretty printing gives a longer readable output
+        output, errors = self.call_run([file1, file2, '--pretty-print',
+                                        '--formatter', 'xml'])
+        self.assertEqual(len(output.splitlines()), 11)
diff --git a/tests/test_parser.py b/tests/test_parser.py
deleted file mode 100644
index 2a28fb7..0000000
--- a/tests/test_parser.py
+++ /dev/null
@@ -1,325 +0,0 @@
-# Copyright (c) 2000-2010 LOGILAB S.A. (Paris, FRANCE).
-# http://www.logilab.fr/ -- mailto:contact@logilab.fr
-# Copyright (c) 2018 Shoobx.com.
-# https://www.shoobx.com/ -- mailto:dev@shoobx.com
-#
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program. If not, see <http://www.gnu.org/licenses/>.
-""" unit tests for xmldiff.
-""" - -import os -import lxml.etree -import mock -import six - -from xmldiff.input import tree_from_stream -from xmldiff.input import tree_from_lxml - -from xmldiff.objects import N_VALUE, N_CHILDS, N_PARENT, N_NSPREFIX - - -HERE = os.path.dirname(__file__) - - -def _nuke_parent(tree): - # having the parent node is cool, but causes all sort of problems - # with asserts and comparison... get rid of it - tree[N_PARENT] = None - for child in tree[N_CHILDS]: - _nuke_parent(child) - - -def test_tree_from_stream_simple(): - stream = six.StringIO(""" - - - - - - - - - - - - - """) - tree = tree_from_stream(stream) - expected = [ - 6, - '/', - '', - [[1, - u'a', - u'a', - [[1, u'b', u'b', [], mock.ANY, 0, 1, None], - [1, u'c', u'c', [], mock.ANY, 0, 1, None], - [1, - u'd', - u'd', - [[1, - u'e', - u'e', - [[1, - u'h', - u'h', - [], - mock.ANY, - 0, - 1, - None]], - mock.ANY, - 1, - 1, - None], - [1, - u'f', - u'f', - [], - mock.ANY, - 0, - 1, - None]], - mock.ANY, - 3, - 1, - None]], - mock.ANY, - 6, - 1, - None]], - None, - 7, - 0, - None] - assert tree == expected - - -def test_tree_from_stream(): - fname = os.path.join(HERE, 'data', 'parse', '1.xml') - with open(fname, 'r') as fhandle: - tree = tree_from_stream(fhandle) - # lets not dump the whole tree - assert len(tree[N_CHILDS]) == 1 - - -def test_tree_from_stream_utf8(): - fname = os.path.join(HERE, 'data', 'parse', 'utf8.xml') - with open(fname, 'rb') as fhandle: - tree = tree_from_stream(fhandle) - type_node = tree[N_CHILDS][0][N_CHILDS][0][N_CHILDS][0][N_CHILDS][0] - text_node = tree[N_CHILDS][0][N_CHILDS][0][N_CHILDS][1][N_CHILDS][0] - assert type_node[N_VALUE] == u'\xf6\xfc' - assert text_node[N_VALUE] == u'\xe9\xe1\u03a9' - - -def test_tree_from_stream_utf16(): - fname = os.path.join(HERE, 'data', 'parse', 'utf16.xml') - with open(fname, 'rb') as fhandle: - tree = tree_from_stream(fhandle) - type_node = tree[N_CHILDS][0][N_CHILDS][0][N_CHILDS][0][N_CHILDS][0] - text_node = tree[N_CHILDS][0][N_CHILDS][0][N_CHILDS][1][N_CHILDS][0] - assert type_node[N_VALUE] == u'\xf6\xfc' - assert text_node[N_VALUE] == u'\xe9\xe1\u03a9' - - -def test_tree_from_stream_iso(): - fname = os.path.join(HERE, 'data', 'parse', 'iso.xml') - with open(fname, 'rb') as fhandle: - tree = tree_from_stream(fhandle) - type_node = tree[N_CHILDS][0][N_CHILDS][0][N_CHILDS][0][N_CHILDS][0] - text_node = tree[N_CHILDS][0][N_CHILDS][0][N_CHILDS][1][N_CHILDS][0] - assert type_node[N_VALUE] == u'\xf6\xfc' - assert text_node[N_VALUE] == u'\xe9\xe1' - - -def test_tree_from_stream_with_namespace(): - fname = os.path.join(HERE, 'data', 'parse', 'simple_ns.xml') - with open(fname, 'r') as fhandle: - tree = tree_from_stream(fhandle) - - _nuke_parent(tree) - - expected = [ - 6, - '/', - '', - [[1, - u'{urn:corp:sec}section', - u'{urn:corp:sec}section', - [[1, - u'{urn:corp:sec}sectionInfo', - u'{urn:corp:sec}sectionInfo', - [[1, - u'{urn:corp:sec}secID', - u'{urn:corp:sec}secID', - [[4, 'text()', u'S001', [], None, 0, 1, None]], - None, - 1, - 1, - 'sec'], - [1, - u'{urn:corp:sec}name', - u'{urn:corp:sec}name', - [[4, 'text()', u'Sales', [], None, 0, 1, None]], - None, - 1, - 1, - 'sec']], - None, - 4, - 1, - 'sec'], - [1, - u'{urn:corp:sec}sectionInfo', - u'{urn:corp:sec}sectionInfo', - [[2, - u'@nameName', - u'name', - [[3, u'@name', u'Development', [], None, 0, 0, None]], - None, - 1, - 0, - None], - [2, - u'@secIDName', - u'secID', - [[3, u'@secID', u'S002', [], None, 0, 0, None]], - None, - 1, - 0, - None]], - None, - 4, - 2, - 'sec'], - [1, - u'{urn:corp:sec}sectionInfo', - 
u'{urn:corp:sec}sectionInfo', - [[2, - u'@{urn:corp:sec}nameName', - u'{urn:corp:sec}name', - [[3, u'@{urn:corp:sec}name', u'Gardening', [], None, 0, 0, 'sec']], - None, - 1, - 0, - 'sec'], - [2, - u'@{urn:corp:sec}secIDName', - u'{urn:corp:sec}secID', - [[3, u'@{urn:corp:sec}secID', u'S003', [], None, 0, 0, 'sec']], - None, - 1, - 0, - 'sec']], - None, - 4, - 3, - 'sec']], - None, - 15, - 1, - 'sec']], - None, - 16, - 0, - None] - - assert tree == expected - - -def test_tree_from_lxml(): - fname = os.path.join(HERE, 'data', 'parse', '1.xml') - xml = lxml.etree.parse(fname) - tree = tree_from_lxml(xml) - assert len(tree[N_CHILDS]) == 1 - - fname = os.path.join(HERE, 'data', 'parse', '1.xml') - with open(fname, 'r') as fhandle: - tree_stream = tree_from_stream(fhandle) - - _nuke_parent(tree) - _nuke_parent(tree_stream) - - assert tree == tree_stream - - -# In lxml, up to and including version 4.2.1, the namespace prefixes -# will be replaced by auto-generated namespace prefixes, ns00, ns01, etc -# If we encounter an "ns00:"" prefix, replace it. -# This code can be removed once we no longer need to run the tests with -# lxml 4.2.1 or earlier. -# This is only to fix this test, using xmldiff with these versions of -# lxml will still work, but the prefixes will be wrong. -def fix_lxml_421_tree(t, prefix): - if t[N_NSPREFIX] == 'ns00': - t[N_NSPREFIX] = prefix - for subtree in t[3]: - fix_lxml_421_tree(subtree, prefix) - - -def test_tree_from_lxml_with_namespace(): - fname = os.path.join(HERE, 'data', 'parse', 'simple_ns.xml') - xml = lxml.etree.parse(fname) - tree = tree_from_lxml(xml) - - with open(fname, 'r') as fhandle: - tree_stream = tree_from_stream(fhandle) - - _nuke_parent(tree) - _nuke_parent(tree_stream) - - # lxml <= 4.2.1 - fix_lxml_421_tree(tree, 'sec') - - assert tree == tree_stream - - fname = os.path.join(HERE, 'data', 'parse', 'tal_ns.xml') - xml = lxml.etree.parse(fname) - tree = tree_from_lxml(xml) - - with open(fname, 'r') as fhandle: - tree_stream = tree_from_stream(fhandle) - - _nuke_parent(tree) - _nuke_parent(tree_stream) - - # lxml <= 4.2.1 - fix_lxml_421_tree(tree, 'z') - - assert tree == tree_stream - - -def test_tree_from_lxml_with_default_namespace(): - fname = os.path.join(HERE, 'data', 'parse', 'default_ns.xml') - xml = lxml.etree.parse(fname) - tree = tree_from_lxml(xml) - - with open(fname, 'r') as fhandle: - tree_stream = tree_from_stream(fhandle) - - _nuke_parent(tree) - _nuke_parent(tree_stream) - - fix_lxml_421_tree(tree, None) - - assert tree == tree_stream - - -def test_parse_html(): - fname = os.path.join(HERE, 'data', 'parse', 'html.html') - with open(fname, 'r') as fhandle: - tree = tree_from_stream(fhandle, html=True) - # lets not dump the whole tree - assert len(tree[N_CHILDS]) == 1 diff --git a/tests/test_performance.py b/tests/test_performance.py deleted file mode 100644 index 40c7114..0000000 --- a/tests/test_performance.py +++ /dev/null @@ -1,33 +0,0 @@ -import glob -import lxml.etree -import os -import unittest - -from xmldiff.input import tree_from_lxml -from xmldiff.fmes import FmesCorrector -from xmldiff.format import InternalPrinter -from six import StringIO - - -class PerformanceTest(unittest.TestCase): - # This tests don't fail, they just run the diff loads of times - # so you can get a rough measurement of how long it takes. - # It's disabled by default (prefixed with "no_"). 
- - def no_test_performance(self): - HERE = os.path.dirname(__file__) - left_files = glob.glob(os.path.join(HERE, 'data', '*_1.xml')) - right_files = glob.glob(os.path.join(HERE, 'data', '*_2.xml')) - - for left, right in zip(sorted(left_files), sorted(right_files)): - with open(left, 'rb') as leftfile, open(right, 'rb') as rightfile: - lefttree = tree_from_lxml(lxml.etree.parse(leftfile)) - righttree = tree_from_lxml(lxml.etree.parse(rightfile)) - - stream = StringIO() - formatter = InternalPrinter(stream=stream) - # Prioritized xmlid, and increase f to 0.7, to get better matches. - strategy = FmesCorrector(formatter, f=0.7) - - for i in range(1000): - strategy.process_trees(lefttree, righttree) diff --git a/tests/test_regrtest.py b/tests/test_regrtest.py deleted file mode 100644 index 19fa74b..0000000 --- a/tests/test_regrtest.py +++ /dev/null @@ -1,164 +0,0 @@ -# Copyright (c) 2000-2010 LOGILAB S.A. (Paris, FRANCE). -# http://www.logilab.fr/ -- mailto:contact@logilab.fr -# Copyright (c) 2018 Shoobx.com. -# https://www.shoobx.com/ -- mailto:dev@shoobx.com -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . -""" -xmldiff non regression test -""" -from os.path import join, basename, dirname -import re -import sys -import six -import pytest -import glob - -from xmldiff import main - -HERE = dirname(__file__) - - -def get_output(options): - backup = sys.stdout - - # capture stdout - sys.stdout = out = six.StringIO() - try: - main.run(options) - except SystemExit: - pass - finally: - sys.stdout = backup - output = out.getvalue().strip() - out.close() - - return output - - -def test_recursive(): - options = ['-r', join(HERE, 'data', 'dir1'), join(HERE, 'data', 'dir2')] - expected = """-------------------------------------------------------------------------------- -FILE: onlyindir1.xml deleted --------------------------------------------------------------------------------- -FILE: dir_inboth/onlyindir1.xml deleted --------------------------------------------------------------------------------- -DIRECTORY: dir_only1 deleted --------------------------------------------------------------------------------- -FILE: onlyindir2.xml added --------------------------------------------------------------------------------- -FILE: dir_inboth/onlyindir2.xml added --------------------------------------------------------------------------------- -DIRECTORY: dir_only2 added --------------------------------------------------------------------------------- -FILE: changing.xml -[append-first, /, - -] -[remove, /oopoyy[1]] --------------------------------------------------------------------------------- -FILE: inbothdir.xml --------------------------------------------------------------------------------- -FILE: dir_inboth/changing.xml -[append-first, /, - -] -[remove, /oopoyy[1]] --------------------------------------------------------------------------------- -FILE: dir_inboth/inbothdir.xml""" - data = get_output(options) - assert data == expected, '%s:\n%r 
!= %r' % (options, data, expected) - - -def test_broken(): - options = ['-r', join(HERE, 'data', 'broken', 'broken.xml'), - join(HERE, 'data', 'broken', 'broken.xml')] - expected = "xmldiff/tests/data/broken/broken.xml:11:4: mismatched tag" - data = get_output(options) - assert expected in data - - -def test_verbose(): - options = ['--verbose', join(HERE, 'data', 'test01_1.xml'), - join(HERE, 'data', 'test01_2.xml')] - expected = """Source tree: -R: (/) node-id 2 - \-NN:oopoyy (/oopoyy[1]) node-id 1 - \-NN:function (/oopoyy[1]/function[1]) node-id 0 - -Destination tree: -R: (/) node-id 1 - \-NN:gap (/gap[1]) node-id 0 - -Source tree has 2 nodes -Destination tree has 1 nodes -[append-first, /, - -] -[remove, /oopoyy[1]]""" - data = get_output(options) - data = re.sub(r"\d{10,20}", "node-id", data) - assert expected in data - - -def test_wrong_combo(): - options = ['-r', join(HERE, 'data', 'dir1'), join(HERE, 'data', 'test00_1.xml')] - expected = "are not comparable, or not directory nor regular files" - data = get_output(options) - assert expected in data - - -def make_tests(): - """generate tests classes from test info - - return the list of generated test classes - """ - tests_files = glob.glob(join(HERE, 'data', '*.xml')) + \ - glob.glob(join(HERE, 'data', '*_result')) - tests = {} - # regroup test files - for filename in tests_files: - base = basename(filename) - name = base[:6] - filetype = base[-5:] - if filetype == '1.xml': - tests.setdefault(name, {})['old'] = filename - elif filetype == '2.xml': - tests.setdefault(name, {})['new'] = filename - else: - tests.setdefault(name, {})['result'] = filename - - for t_dict in tests.values(): - # quick check whether input, output, result is there - t_dict['old'] - t_dict['new'] - t_dict['result'] - - return sorted(tests.values(), key=lambda td: td['old']) - - -@pytest.fixture(params=make_tests()) -def fnames(request): - return request.param - - -def test_known(fnames, lcs2_type): - old = fnames['old'] - new = fnames['new'] - res_file = fnames['result'] - with open(res_file) as f: - expected = f.read().strip() - options = [old, new] - data = get_output(options) - assert data == expected, '%s:\n%r != %r' % (options, data, expected) diff --git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 0000000..14b94c8 --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,113 @@ +import unittest + +from lxml import etree +from xmldiff import utils + + +class TraverseTests(unittest.TestCase): + + def test_post_order(self): + xml = u''' + +
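
The helpers exercised below are ordinary functions and can be used on their own. A short sketch with toy inputs, showing the traversal orders and the index pairs produced by the LCS helper:

    from lxml import etree
    from xmldiff import utils

    root = etree.fromstring('<a><b/><c/></a>')
    [e.tag for e in utils.post_order_traverse(root)]     # ['b', 'c', 'a']
    [e.tag for e in utils.breadth_first_traverse(root)]  # ['a', 'b', 'c']

    # longest_common_subsequence yields pairs of indexes into the two
    # sequences; here the pairs pick out 'A', 'C' and 'D' on each side.
    pairs = list(utils.longest_common_subsequence('ABCDEF', 'ACDQRB'))
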
+<document>
+  <story>
+    <section>
+      <para>First paragraph</para>
+    </section>
+    <section>
+      <para>Last paragraph</para>
+    </section>
+  </story>
+</document>
+'''
+        root = etree.fromstring(xml)
+        tree = root.getroottree()
+        res = [tree.getpath(x) for x in utils.post_order_traverse(root)]
+        self.assertEqual(res, ['/document/story/section[1]/para',
+                               '/document/story/section[1]',
+                               '/document/story/section[2]/para',
+                               '/document/story/section[2]',
+                               '/document/story',
+                               '/document'])
+
+    def test_reverse_post_order(self):
+        xml = u'''<document>
+  <story>
+    <section>
+      <para>First paragraph</para>
+    </section>
+    <section>
+      <para>Last paragraph</para>
+    </section>
+  </story>
+</document>
+'''
+        root = etree.fromstring(xml)
+        tree = root.getroottree()
+        res = [tree.getpath(x) for x in
+               utils.reverse_post_order_traverse(root)]
+        self.assertEqual(res, ['/document/story/section[2]/para',
+                               '/document/story/section[2]',
+                               '/document/story/section[1]/para',
+                               '/document/story/section[1]',
+                               '/document/story',
+                               '/document'])
+
+    def test_breadth_first(self):
+        xml = u'''<document>
+  <story>
+    <section>
+      <para>First paragraph</para>
+    </section>
+    <section>
+      <para>Last paragraph</para>
+    </section>
+  </story>
+</document>
+''' + root = etree.fromstring(xml) + tree = root.getroottree() + res = [tree.getpath(x) for x in utils.breadth_first_traverse(root)] + self.assertEqual(res, ['/document', + '/document/story', + '/document/story/section[1]', + '/document/story/section[2]', + '/document/story/section[1]/para', + '/document/story/section[2]/para', + ]) + + +class LongestCommonSubsequenceTests(unittest.TestCase): + + def _diff(self, left, right, result): + res = [] + for x, y in utils.longest_common_subsequence(left, right): + self.assertEqual(left[x], right[y]) + res.append(left[x]) + + self.assertEqual(''.join(res), result) + + def test_lcs(self): + + self._diff('ABCDEF', 'ABCDEF', 'ABCDEF') + + self._diff('ABCDEF', 'GHIJKL', '') + + self._diff('ABCDEF', 'ACDQRB', 'ACD') + + self._diff('CXCDEFX', 'CDEFX', 'CDEFX') + + self._diff('HUMAN', 'CHIMPANZEE', 'HMAN') + + self._diff('ABCDEF', 'A', 'A') + + self._diff('123AAAAAAAAA', '123BBBBBBBBB', '123') + + self._diff('AAAAAAAAA123', 'BBBBBBBBB123', '123') + + self._diff('ABCDE1', '1FGHIJK', '1') + + # There are several correct options here, make sure that doesn't + # confuse it, we want just one, and don't care which. + self._diff('HORSEBACK', 'SNOWFLAKE', 'SAK') + + # Empty sequences: + self._diff('', '', '') diff --git a/tests/testing.py b/tests/testing.py new file mode 100644 index 0000000..78c726f --- /dev/null +++ b/tests/testing.py @@ -0,0 +1,37 @@ +import os + +from io import open + + +def make_case_function(left_filename): + right_filename = left_filename.replace('.left.', '.right.') + expected_filename = left_filename.replace('.left.', '.expected.') + + def test(self): + with open(expected_filename, 'rt', encoding='utf8') as input_file: + expected_xml = input_file.read() + + try: + result_xml = self.process(left_filename, right_filename) + except Exception as err: + if u'.err' not in left_filename: + raise + result_xml = u'%s: %s' % (err.__class__.__name__, err) + + self.assertEqual(expected_xml.strip(), result_xml.strip()) + + return test + + +def generate_filebased_cases(data_dir, test_class, suffix='xml', ignore=()): + for left_filename in os.listdir(data_dir): + if not left_filename.endswith('.left.' + suffix): + continue + if left_filename in ignore: + continue + + left_filename = os.path.join(data_dir, left_filename) + test_function = make_case_function(left_filename) + function_name = os.path.split(left_filename)[-1].replace('.', '-') + test_name = 'test_' + function_name + setattr(test_class, test_name, test_function) diff --git a/tox.ini b/tox.ini deleted file mode 100644 index 221ac76..0000000 --- a/tox.ini +++ /dev/null @@ -1,15 +0,0 @@ -[tox] -envlist = py27, py36 - -[testenv] -passenv = TRAVIS TRAVIS_JOB_ID TRAVIS_BRANCH -commands = - flake8 ./src - flake8 ./tests - pytest \ - -rw --cov=xmldiff --cov-report=term-missing --cov-report=html \ - -s --tb=native -vv -deps = .[test] - -[flake8] -ignore = E501, E266 diff --git a/tests/data/dir1/dir_only1/.empty b/xmldiff/__init__.py similarity index 100% rename from tests/data/dir1/dir_only1/.empty rename to xmldiff/__init__.py diff --git a/xmldiff/_diff_match_patch_py2.py b/xmldiff/_diff_match_patch_py2.py new file mode 100644 index 0000000..806fe1e --- /dev/null +++ b/xmldiff/_diff_match_patch_py2.py @@ -0,0 +1,1919 @@ +#!/usr/bin/python2.4 + +from __future__ import division + +"""Diff Match and Patch +Copyright 2018 The diff-match-patch Authors. 
+https://github.com/google/diff-match-patch + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +"""Functions for diff, match and patch. + +Computes the difference between two texts to create a patch. +Applies the patch onto another text, allowing for errors. +""" + +__author__ = 'fraser@google.com (Neil Fraser)' + +import re +import sys +import time +import urllib + + +class diff_match_patch: + """Class containing the diff, match and patch methods. + + Also contains the behaviour settings. + """ + + def __init__(self): + """Inits a diff_match_patch object with default settings. + Redefine these in your program to override the defaults. + """ + + # Number of seconds to map a diff before giving up (0 for infinity). + self.Diff_Timeout = 1.0 + # Cost of an empty edit operation in terms of edit characters. + self.Diff_EditCost = 4 + # At what point is no match declared (0.0 = perfection, 1.0 = very loose). + self.Match_Threshold = 0.5 + # How far to search for a match (0 = exact location, 1000+ = broad match). + # A match this many characters away from the expected location will add + # 1.0 to the score (0.0 is a perfect match). + self.Match_Distance = 1000 + # When deleting a large block of text (over ~64 characters), how close do + # the contents have to be to match the expected contents. (0.0 = perfection, + # 1.0 = very loose). Note that Match_Threshold controls how closely the + # end points of a delete need to match. + self.Patch_DeleteThreshold = 0.5 + # Chunk size for context length. + self.Patch_Margin = 4 + + # The number of bits in an int. + # Python has no maximum, thus to disable patch splitting set to 0. + # However to avoid long patches in certain pathological cases, use 32. + # Multiple short patches (using native ints) are much faster than long ones. + self.Match_MaxBits = 32 + + # DIFF FUNCTIONS + + # The data structure representing a diff is an array of tuples: + # [(DIFF_DELETE, "Hello"), (DIFF_INSERT, "Goodbye"), (DIFF_EQUAL, " world.")] + # which means: delete "Hello", add "Goodbye" and keep " world." + DIFF_DELETE = -1 + DIFF_INSERT = 1 + DIFF_EQUAL = 0 + + def diff_main(self, text1, text2, checklines=True, deadline=None): + """Find the differences between two texts. Simplifies the problem by + stripping any common prefix or suffix off the texts before diffing. + + Args: + text1: Old string to be diffed. + text2: New string to be diffed. + checklines: Optional speedup flag. If present and false, then don't run + a line-level diff first to identify the changed areas. + Defaults to true, which does a faster, slightly less optimal diff. + deadline: Optional time when the diff should be complete by. Used + internally for recursive calls. Users should set DiffTimeout instead. + + Returns: + Array of changes. + """ + # Set a deadline by which time the diff must be complete. + if deadline == None: + # Unlike in most languages, Python counts time in seconds. + if self.Diff_Timeout <= 0: + deadline = sys.maxint + else: + deadline = time.time() + self.Diff_Timeout + + # Check for null inputs. 
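# Aside: a minimal usage sketch of the vendored class (illustrative only,
# not part of the patched file). It assumes the accompanying five-line
# xmldiff/diff_match_patch.py shim re-exports the class; the later asides
# assume the same import.
from xmldiff.diff_match_patch import diff_match_patch

dmp = diff_match_patch()
diffs = dmp.diff_main("The quick fox", "The slow fox")
# diff_main returns (op, text) tuples, op in {-1: delete, 0: equal, 1: insert},
# e.g.: [(0, 'The '), (-1, 'quick'), (1, 'slow'), (0, ' fox')]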
+ if text1 == None or text2 == None: + raise ValueError("Null inputs. (diff_main)") + + # Check for equality (speedup). + if text1 == text2: + if text1: + return [(self.DIFF_EQUAL, text1)] + return [] + + # Trim off common prefix (speedup). + commonlength = self.diff_commonPrefix(text1, text2) + commonprefix = text1[:commonlength] + text1 = text1[commonlength:] + text2 = text2[commonlength:] + + # Trim off common suffix (speedup). + commonlength = self.diff_commonSuffix(text1, text2) + if commonlength == 0: + commonsuffix = '' + else: + commonsuffix = text1[-commonlength:] + text1 = text1[:-commonlength] + text2 = text2[:-commonlength] + + # Compute the diff on the middle block. + diffs = self.diff_compute(text1, text2, checklines, deadline) + + # Restore the prefix and suffix. + if commonprefix: + diffs[:0] = [(self.DIFF_EQUAL, commonprefix)] + if commonsuffix: + diffs.append((self.DIFF_EQUAL, commonsuffix)) + self.diff_cleanupMerge(diffs) + return diffs + + def diff_compute(self, text1, text2, checklines, deadline): + """Find the differences between two texts. Assumes that the texts do not + have any common prefix or suffix. + + Args: + text1: Old string to be diffed. + text2: New string to be diffed. + checklines: Speedup flag. If false, then don't run a line-level diff + first to identify the changed areas. + If true, then run a faster, slightly less optimal diff. + deadline: Time when the diff should be complete by. + + Returns: + Array of changes. + """ + if not text1: + # Just add some text (speedup). + return [(self.DIFF_INSERT, text2)] + + if not text2: + # Just delete some text (speedup). + return [(self.DIFF_DELETE, text1)] + + if len(text1) > len(text2): + (longtext, shorttext) = (text1, text2) + else: + (shorttext, longtext) = (text1, text2) + i = longtext.find(shorttext) + if i != -1: + # Shorter text is inside the longer text (speedup). + diffs = [(self.DIFF_INSERT, longtext[:i]), (self.DIFF_EQUAL, shorttext), + (self.DIFF_INSERT, longtext[i + len(shorttext):])] + # Swap insertions for deletions if diff is reversed. + if len(text1) > len(text2): + diffs[0] = (self.DIFF_DELETE, diffs[0][1]) + diffs[2] = (self.DIFF_DELETE, diffs[2][1]) + return diffs + + if len(shorttext) == 1: + # Single character string. + # After the previous speedup, the character can't be an equality. + return [(self.DIFF_DELETE, text1), (self.DIFF_INSERT, text2)] + + # Check to see if the problem can be split in two. + hm = self.diff_halfMatch(text1, text2) + if hm: + # A half-match was found, sort out the return data. + (text1_a, text1_b, text2_a, text2_b, mid_common) = hm + # Send both pairs off for separate processing. + diffs_a = self.diff_main(text1_a, text2_a, checklines, deadline) + diffs_b = self.diff_main(text1_b, text2_b, checklines, deadline) + # Merge the results. + return diffs_a + [(self.DIFF_EQUAL, mid_common)] + diffs_b + + if checklines and len(text1) > 100 and len(text2) > 100: + return self.diff_lineMode(text1, text2, deadline) + + return self.diff_bisect(text1, text2, deadline) + + def diff_lineMode(self, text1, text2, deadline): + """Do a quick line-level diff on both strings, then rediff the parts for + greater accuracy. + This speedup can produce non-minimal diffs. + + Args: + text1: Old string to be diffed. + text2: New string to be diffed. + deadline: Time when the diff should be complete by. + + Returns: + Array of changes. + """ + + # Scan the text on a line-by-line basis first. 
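# Aside: a sketch of the line-mode encoding used just below (not part of
# the patch). Each unique line is mapped to a single character, so the
# expensive character-level diff runs on a much shorter string:
from xmldiff.diff_match_patch import diff_match_patch

dmp = diff_match_patch()
chars1, chars2, line_array = dmp.diff_linesToChars("alpha\nbeta\n",
                                                   "alpha\ngamma\n")
# chars1 == '\x01\x02', chars2 == '\x01\x03', line_array[1] == 'alpha\n'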
+ (text1, text2, linearray) = self.diff_linesToChars(text1, text2) + + diffs = self.diff_main(text1, text2, False, deadline) + + # Convert the diff back to original text. + self.diff_charsToLines(diffs, linearray) + # Eliminate freak matches (e.g. blank lines) + self.diff_cleanupSemantic(diffs) + + # Rediff any replacement blocks, this time character-by-character. + # Add a dummy entry at the end. + diffs.append((self.DIFF_EQUAL, '')) + pointer = 0 + count_delete = 0 + count_insert = 0 + text_delete = '' + text_insert = '' + while pointer < len(diffs): + if diffs[pointer][0] == self.DIFF_INSERT: + count_insert += 1 + text_insert += diffs[pointer][1] + elif diffs[pointer][0] == self.DIFF_DELETE: + count_delete += 1 + text_delete += diffs[pointer][1] + elif diffs[pointer][0] == self.DIFF_EQUAL: + # Upon reaching an equality, check for prior redundancies. + if count_delete >= 1 and count_insert >= 1: + # Delete the offending records and add the merged ones. + subDiff = self.diff_main(text_delete, text_insert, False, deadline) + diffs[pointer - count_delete - count_insert : pointer] = subDiff + pointer = pointer - count_delete - count_insert + len(subDiff) + count_insert = 0 + count_delete = 0 + text_delete = '' + text_insert = '' + + pointer += 1 + + diffs.pop() # Remove the dummy entry at the end. + + return diffs + + def diff_bisect(self, text1, text2, deadline): + """Find the 'middle snake' of a diff, split the problem in two + and return the recursively constructed diff. + See Myers 1986 paper: An O(ND) Difference Algorithm and Its Variations. + + Args: + text1: Old string to be diffed. + text2: New string to be diffed. + deadline: Time at which to bail if not yet complete. + + Returns: + Array of diff tuples. + """ + + # Cache the text lengths to prevent multiple calls. + text1_length = len(text1) + text2_length = len(text2) + max_d = (text1_length + text2_length + 1) // 2 + v_offset = max_d + v_length = 2 * max_d + v1 = [-1] * v_length + v1[v_offset + 1] = 0 + v2 = v1[:] + delta = text1_length - text2_length + # If the total number of characters is odd, then the front path will + # collide with the reverse path. + front = (delta % 2 != 0) + # Offsets for start and end of k loop. + # Prevents mapping of space beyond the grid. + k1start = 0 + k1end = 0 + k2start = 0 + k2end = 0 + for d in xrange(max_d): + # Bail out if deadline is reached. + if time.time() > deadline: + break + + # Walk the front path one step. + for k1 in xrange(-d + k1start, d + 1 - k1end, 2): + k1_offset = v_offset + k1 + if k1 == -d or (k1 != d and + v1[k1_offset - 1] < v1[k1_offset + 1]): + x1 = v1[k1_offset + 1] + else: + x1 = v1[k1_offset - 1] + 1 + y1 = x1 - k1 + while (x1 < text1_length and y1 < text2_length and + text1[x1] == text2[y1]): + x1 += 1 + y1 += 1 + v1[k1_offset] = x1 + if x1 > text1_length: + # Ran off the right of the graph. + k1end += 2 + elif y1 > text2_length: + # Ran off the bottom of the graph. + k1start += 2 + elif front: + k2_offset = v_offset + delta - k1 + if k2_offset >= 0 and k2_offset < v_length and v2[k2_offset] != -1: + # Mirror x2 onto top-left coordinate system. + x2 = text1_length - v2[k2_offset] + if x1 >= x2: + # Overlap detected. + return self.diff_bisectSplit(text1, text2, x1, y1, deadline) + + # Walk the reverse path one step. 
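# Aside: diff_bisect is the Myers O(ND) core. It expects texts with no
# common prefix/suffix plus a wall-clock deadline (sketch, not part of
# the patch):
import sys
from xmldiff.diff_match_patch import diff_match_patch

dmp = diff_match_patch()
diffs = dmp.diff_bisect("cat", "map", sys.maxsize)
# [(-1, 'c'), (1, 'm'), (0, 'a'), (-1, 't'), (1, 'p')]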
+ for k2 in xrange(-d + k2start, d + 1 - k2end, 2): + k2_offset = v_offset + k2 + if k2 == -d or (k2 != d and + v2[k2_offset - 1] < v2[k2_offset + 1]): + x2 = v2[k2_offset + 1] + else: + x2 = v2[k2_offset - 1] + 1 + y2 = x2 - k2 + while (x2 < text1_length and y2 < text2_length and + text1[-x2 - 1] == text2[-y2 - 1]): + x2 += 1 + y2 += 1 + v2[k2_offset] = x2 + if x2 > text1_length: + # Ran off the left of the graph. + k2end += 2 + elif y2 > text2_length: + # Ran off the top of the graph. + k2start += 2 + elif not front: + k1_offset = v_offset + delta - k2 + if k1_offset >= 0 and k1_offset < v_length and v1[k1_offset] != -1: + x1 = v1[k1_offset] + y1 = v_offset + x1 - k1_offset + # Mirror x2 onto top-left coordinate system. + x2 = text1_length - x2 + if x1 >= x2: + # Overlap detected. + return self.diff_bisectSplit(text1, text2, x1, y1, deadline) + + # Diff took too long and hit the deadline or + # number of diffs equals number of characters, no commonality at all. + return [(self.DIFF_DELETE, text1), (self.DIFF_INSERT, text2)] + + def diff_bisectSplit(self, text1, text2, x, y, deadline): + """Given the location of the 'middle snake', split the diff in two parts + and recurse. + + Args: + text1: Old string to be diffed. + text2: New string to be diffed. + x: Index of split point in text1. + y: Index of split point in text2. + deadline: Time at which to bail if not yet complete. + + Returns: + Array of diff tuples. + """ + text1a = text1[:x] + text2a = text2[:y] + text1b = text1[x:] + text2b = text2[y:] + + # Compute both diffs serially. + diffs = self.diff_main(text1a, text2a, False, deadline) + diffsb = self.diff_main(text1b, text2b, False, deadline) + + return diffs + diffsb + + def diff_linesToChars(self, text1, text2): + """Split two texts into an array of strings. Reduce the texts to a string + of hashes where each Unicode character represents one line. + + Args: + text1: First string. + text2: Second string. + + Returns: + Three element tuple, containing the encoded text1, the encoded text2 and + the array of unique strings. The zeroth element of the array of unique + strings is intentionally blank. + """ + lineArray = [] # e.g. lineArray[4] == "Hello\n" + lineHash = {} # e.g. lineHash["Hello\n"] == 4 + + # "\x00" is a valid character, but various debuggers don't like it. + # So we'll insert a junk entry to avoid generating a null character. + lineArray.append('') + + def diff_linesToCharsMunge(text): + """Split a text into an array of strings. Reduce the texts to a string + of hashes where each Unicode character represents one line. + Modifies linearray and linehash through being a closure. + + Args: + text: String to encode. + + Returns: + Encoded string. + """ + chars = [] + # Walk the text, pulling out a substring for each line. + # text.split('\n') would would temporarily double our memory footprint. + # Modifying text would create many large strings to garbage collect. + lineStart = 0 + lineEnd = -1 + while lineEnd < len(text) - 1: + lineEnd = text.find('\n', lineStart) + if lineEnd == -1: + lineEnd = len(text) - 1 + line = text[lineStart:lineEnd + 1] + + if line in lineHash: + chars.append(unichr(lineHash[line])) + else: + if len(lineArray) == maxLines: + # Bail out at 65535 because unichr(65536) throws. + line = text[lineStart:] + lineEnd = len(text) + lineArray.append(line) + lineHash[line] = len(lineArray) - 1 + chars.append(unichr(len(lineArray) - 1)) + lineStart = lineEnd + 1 + return "".join(chars) + + # Allocate 2/3rds of the space for text1, the rest for text2. 
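# Aside: the full line-mode round trip (sketch, not part of the patch):
# encode lines as characters, diff the short strings, then rehydrate.
from xmldiff.diff_match_patch import diff_match_patch

dmp = diff_match_patch()
c1, c2, lines = dmp.diff_linesToChars("a\nb\n", "a\nc\n")
diffs = dmp.diff_main(c1, c2, False)
dmp.diff_charsToLines(diffs, lines)
# diffs now holds whole-line edits: [(0, 'a\n'), (-1, 'b\n'), (1, 'c\n')]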
+ maxLines = 40000 + chars1 = diff_linesToCharsMunge(text1) + maxLines = 65535 + chars2 = diff_linesToCharsMunge(text2) + return (chars1, chars2, lineArray) + + def diff_charsToLines(self, diffs, lineArray): + """Rehydrate the text in a diff from a string of line hashes to real lines + of text. + + Args: + diffs: Array of diff tuples. + lineArray: Array of unique strings. + """ + for i in xrange(len(diffs)): + text = [] + for char in diffs[i][1]: + text.append(lineArray[ord(char)]) + diffs[i] = (diffs[i][0], "".join(text)) + + def diff_commonPrefix(self, text1, text2): + """Determine the common prefix of two strings. + + Args: + text1: First string. + text2: Second string. + + Returns: + The number of characters common to the start of each string. + """ + # Quick check for common null cases. + if not text1 or not text2 or text1[0] != text2[0]: + return 0 + # Binary search. + # Performance analysis: https://neil.fraser.name/news/2007/10/09/ + pointermin = 0 + pointermax = min(len(text1), len(text2)) + pointermid = pointermax + pointerstart = 0 + while pointermin < pointermid: + if text1[pointerstart:pointermid] == text2[pointerstart:pointermid]: + pointermin = pointermid + pointerstart = pointermin + else: + pointermax = pointermid + pointermid = (pointermax - pointermin) // 2 + pointermin + return pointermid + + def diff_commonSuffix(self, text1, text2): + """Determine the common suffix of two strings. + + Args: + text1: First string. + text2: Second string. + + Returns: + The number of characters common to the end of each string. + """ + # Quick check for common null cases. + if not text1 or not text2 or text1[-1] != text2[-1]: + return 0 + # Binary search. + # Performance analysis: https://neil.fraser.name/news/2007/10/09/ + pointermin = 0 + pointermax = min(len(text1), len(text2)) + pointermid = pointermax + pointerend = 0 + while pointermin < pointermid: + if (text1[-pointermid:len(text1) - pointerend] == + text2[-pointermid:len(text2) - pointerend]): + pointermin = pointermid + pointerend = pointermin + else: + pointermax = pointermid + pointermid = (pointermax - pointermin) // 2 + pointermin + return pointermid + + def diff_commonOverlap(self, text1, text2): + """Determine if the suffix of one string is the prefix of another. + + Args: + text1 First string. + text2 Second string. + + Returns: + The number of characters common to the end of the first + string and the start of the second string. + """ + # Cache the text lengths to prevent multiple calls. + text1_length = len(text1) + text2_length = len(text2) + # Eliminate the null case. + if text1_length == 0 or text2_length == 0: + return 0 + # Truncate the longer string. + if text1_length > text2_length: + text1 = text1[-text2_length:] + elif text1_length < text2_length: + text2 = text2[:text1_length] + text_length = min(text1_length, text2_length) + # Quick check for the worst case. + if text1 == text2: + return text_length + + # Start by looking for a single character match + # and increase length until no match is found. + # Performance analysis: https://neil.fraser.name/news/2010/11/04/ + best = 0 + length = 1 + while True: + pattern = text1[-length:] + found = text2.find(pattern) + if found == -1: + return best + length += found + if found == 0 or text1[-length:] == text2[:length]: + best = length + length += 1 + + def diff_halfMatch(self, text1, text2): + """Do the two texts share a substring which is at least half the length of + the longer text? + This speedup can produce non-minimal diffs. + + Args: + text1: First string. 
+ text2: Second string. + + Returns: + Five element Array, containing the prefix of text1, the suffix of text1, + the prefix of text2, the suffix of text2 and the common middle. Or None + if there was no match. + """ + if self.Diff_Timeout <= 0: + # Don't risk returning a non-optimal diff if we have unlimited time. + return None + if len(text1) > len(text2): + (longtext, shorttext) = (text1, text2) + else: + (shorttext, longtext) = (text1, text2) + if len(longtext) < 4 or len(shorttext) * 2 < len(longtext): + return None # Pointless. + + def diff_halfMatchI(longtext, shorttext, i): + """Does a substring of shorttext exist within longtext such that the + substring is at least half the length of longtext? + Closure, but does not reference any external variables. + + Args: + longtext: Longer string. + shorttext: Shorter string. + i: Start index of quarter length substring within longtext. + + Returns: + Five element Array, containing the prefix of longtext, the suffix of + longtext, the prefix of shorttext, the suffix of shorttext and the + common middle. Or None if there was no match. + """ + seed = longtext[i:i + len(longtext) // 4] + best_common = '' + j = shorttext.find(seed) + while j != -1: + prefixLength = self.diff_commonPrefix(longtext[i:], shorttext[j:]) + suffixLength = self.diff_commonSuffix(longtext[:i], shorttext[:j]) + if len(best_common) < suffixLength + prefixLength: + best_common = (shorttext[j - suffixLength:j] + + shorttext[j:j + prefixLength]) + best_longtext_a = longtext[:i - suffixLength] + best_longtext_b = longtext[i + prefixLength:] + best_shorttext_a = shorttext[:j - suffixLength] + best_shorttext_b = shorttext[j + prefixLength:] + j = shorttext.find(seed, j + 1) + + if len(best_common) * 2 >= len(longtext): + return (best_longtext_a, best_longtext_b, + best_shorttext_a, best_shorttext_b, best_common) + else: + return None + + # First check if the second quarter is the seed for a half-match. + hm1 = diff_halfMatchI(longtext, shorttext, (len(longtext) + 3) // 4) + # Check again based on the third quarter. + hm2 = diff_halfMatchI(longtext, shorttext, (len(longtext) + 1) // 2) + if not hm1 and not hm2: + return None + elif not hm2: + hm = hm1 + elif not hm1: + hm = hm2 + else: + # Both matched. Select the longest. + if len(hm1[4]) > len(hm2[4]): + hm = hm1 + else: + hm = hm2 + + # A half-match was found, sort out the return data. + if len(text1) > len(text2): + (text1_a, text1_b, text2_a, text2_b, mid_common) = hm + else: + (text2_a, text2_b, text1_a, text1_b, mid_common) = hm + return (text1_a, text1_b, text2_a, text2_b, mid_common) + + def diff_cleanupSemantic(self, diffs): + """Reduce the number of edits by eliminating semantically trivial + equalities. + + Args: + diffs: Array of diff tuples. + """ + changes = False + equalities = [] # Stack of indices where equalities are found. + lastEquality = None # Always equal to diffs[equalities[-1]][1] + pointer = 0 # Index of current position. + # Number of chars that changed prior to the equality. + length_insertions1, length_deletions1 = 0, 0 + # Number of chars that changed after the equality. + length_insertions2, length_deletions2 = 0, 0 + while pointer < len(diffs): + if diffs[pointer][0] == self.DIFF_EQUAL: # Equality found. + equalities.append(pointer) + length_insertions1, length_insertions2 = length_insertions2, 0 + length_deletions1, length_deletions2 = length_deletions2, 0 + lastEquality = diffs[pointer][1] + else: # An insertion or deletion. 
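# Aside: diff_cleanupSemantic in practice (sketch, not part of the patch).
# Equalities that are cheaper to re-edit than to keep are folded away,
# mutating the list in place:
from xmldiff.diff_match_patch import diff_match_patch

dmp = diff_match_patch()
diffs = [(-1, 'a'), (0, 'b'), (-1, 'c')]
dmp.diff_cleanupSemantic(diffs)
# The one-character equality 'b' is eliminated:
# diffs == [(-1, 'abc'), (1, 'b')]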
+ if diffs[pointer][0] == self.DIFF_INSERT: + length_insertions2 += len(diffs[pointer][1]) + else: + length_deletions2 += len(diffs[pointer][1]) + # Eliminate an equality that is smaller or equal to the edits on both + # sides of it. + if (lastEquality and (len(lastEquality) <= + max(length_insertions1, length_deletions1)) and + (len(lastEquality) <= max(length_insertions2, length_deletions2))): + # Duplicate record. + diffs.insert(equalities[-1], (self.DIFF_DELETE, lastEquality)) + # Change second copy to insert. + diffs[equalities[-1] + 1] = (self.DIFF_INSERT, + diffs[equalities[-1] + 1][1]) + # Throw away the equality we just deleted. + equalities.pop() + # Throw away the previous equality (it needs to be reevaluated). + if len(equalities): + equalities.pop() + if len(equalities): + pointer = equalities[-1] + else: + pointer = -1 + # Reset the counters. + length_insertions1, length_deletions1 = 0, 0 + length_insertions2, length_deletions2 = 0, 0 + lastEquality = None + changes = True + pointer += 1 + + # Normalize the diff. + if changes: + self.diff_cleanupMerge(diffs) + self.diff_cleanupSemanticLossless(diffs) + + # Find any overlaps between deletions and insertions. + # e.g: abcxxxxxxdef + # -> abcxxxdef + # e.g: xxxabcdefxxx + # -> defxxxabc + # Only extract an overlap if it is as big as the edit ahead or behind it. + pointer = 1 + while pointer < len(diffs): + if (diffs[pointer - 1][0] == self.DIFF_DELETE and + diffs[pointer][0] == self.DIFF_INSERT): + deletion = diffs[pointer - 1][1] + insertion = diffs[pointer][1] + overlap_length1 = self.diff_commonOverlap(deletion, insertion) + overlap_length2 = self.diff_commonOverlap(insertion, deletion) + if overlap_length1 >= overlap_length2: + if (overlap_length1 >= len(deletion) / 2.0 or + overlap_length1 >= len(insertion) / 2.0): + # Overlap found. Insert an equality and trim the surrounding edits. + diffs.insert(pointer, (self.DIFF_EQUAL, + insertion[:overlap_length1])) + diffs[pointer - 1] = (self.DIFF_DELETE, + deletion[:len(deletion) - overlap_length1]) + diffs[pointer + 1] = (self.DIFF_INSERT, + insertion[overlap_length1:]) + pointer += 1 + else: + if (overlap_length2 >= len(deletion) / 2.0 or + overlap_length2 >= len(insertion) / 2.0): + # Reverse overlap found. + # Insert an equality and swap and trim the surrounding edits. + diffs.insert(pointer, (self.DIFF_EQUAL, deletion[:overlap_length2])) + diffs[pointer - 1] = (self.DIFF_INSERT, + insertion[:len(insertion) - overlap_length2]) + diffs[pointer + 1] = (self.DIFF_DELETE, deletion[overlap_length2:]) + pointer += 1 + pointer += 1 + pointer += 1 + + def diff_cleanupSemanticLossless(self, diffs): + """Look for single edits surrounded on both sides by equalities + which can be shifted sideways to align the edit to a word boundary. + e.g: The cat came. -> The cat came. + + Args: + diffs: Array of diff tuples. + """ + + def diff_cleanupSemanticScore(one, two): + """Given two strings, compute a score representing whether the + internal boundary falls on logical boundaries. + Scores range from 6 (best) to 0 (worst). + Closure, but does not reference any external variables. + + Args: + one: First string. + two: Second string. + + Returns: + The score. + """ + if not one or not two: + # Edges are the best. + return 6 + + # Each port of this function behaves slightly differently due to + # subtle differences in each language's definition of things like + # 'whitespace'. 
Since this function's purpose is largely cosmetic, + # the choice has been made to use each language's native features + # rather than force total conformity. + char1 = one[-1] + char2 = two[0] + nonAlphaNumeric1 = not char1.isalnum() + nonAlphaNumeric2 = not char2.isalnum() + whitespace1 = nonAlphaNumeric1 and char1.isspace() + whitespace2 = nonAlphaNumeric2 and char2.isspace() + lineBreak1 = whitespace1 and (char1 == "\r" or char1 == "\n") + lineBreak2 = whitespace2 and (char2 == "\r" or char2 == "\n") + blankLine1 = lineBreak1 and self.BLANKLINEEND.search(one) + blankLine2 = lineBreak2 and self.BLANKLINESTART.match(two) + + if blankLine1 or blankLine2: + # Five points for blank lines. + return 5 + elif lineBreak1 or lineBreak2: + # Four points for line breaks. + return 4 + elif nonAlphaNumeric1 and not whitespace1 and whitespace2: + # Three points for end of sentences. + return 3 + elif whitespace1 or whitespace2: + # Two points for whitespace. + return 2 + elif nonAlphaNumeric1 or nonAlphaNumeric2: + # One point for non-alphanumeric. + return 1 + return 0 + + pointer = 1 + # Intentionally ignore the first and last element (don't need checking). + while pointer < len(diffs) - 1: + if (diffs[pointer - 1][0] == self.DIFF_EQUAL and + diffs[pointer + 1][0] == self.DIFF_EQUAL): + # This is a single edit surrounded by equalities. + equality1 = diffs[pointer - 1][1] + edit = diffs[pointer][1] + equality2 = diffs[pointer + 1][1] + + # First, shift the edit as far left as possible. + commonOffset = self.diff_commonSuffix(equality1, edit) + if commonOffset: + commonString = edit[-commonOffset:] + equality1 = equality1[:-commonOffset] + edit = commonString + edit[:-commonOffset] + equality2 = commonString + equality2 + + # Second, step character by character right, looking for the best fit. + bestEquality1 = equality1 + bestEdit = edit + bestEquality2 = equality2 + bestScore = (diff_cleanupSemanticScore(equality1, edit) + + diff_cleanupSemanticScore(edit, equality2)) + while edit and equality2 and edit[0] == equality2[0]: + equality1 += edit[0] + edit = edit[1:] + equality2[0] + equality2 = equality2[1:] + score = (diff_cleanupSemanticScore(equality1, edit) + + diff_cleanupSemanticScore(edit, equality2)) + # The >= encourages trailing rather than leading whitespace on edits. + if score >= bestScore: + bestScore = score + bestEquality1 = equality1 + bestEdit = edit + bestEquality2 = equality2 + + if diffs[pointer - 1][1] != bestEquality1: + # We have an improvement, save it back to the diff. + if bestEquality1: + diffs[pointer - 1] = (diffs[pointer - 1][0], bestEquality1) + else: + del diffs[pointer - 1] + pointer -= 1 + diffs[pointer] = (diffs[pointer][0], bestEdit) + if bestEquality2: + diffs[pointer + 1] = (diffs[pointer + 1][0], bestEquality2) + else: + del diffs[pointer + 1] + pointer -= 1 + pointer += 1 + + # Define some regex patterns for matching boundaries. + BLANKLINEEND = re.compile(r"\n\r?\n$") + BLANKLINESTART = re.compile(r"^\r?\n\r?\n") + + def diff_cleanupEfficiency(self, diffs): + """Reduce the number of edits by eliminating operationally trivial + equalities. + + Args: + diffs: Array of diff tuples. + """ + changes = False + equalities = [] # Stack of indices where equalities are found. + lastEquality = None # Always equal to diffs[equalities[-1]][1] + pointer = 0 # Index of current position. + pre_ins = False # Is there an insertion operation before the last equality. + pre_del = False # Is there a deletion operation before the last equality. 
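# Aside: diff_cleanupSemanticLossless in practice (sketch, not part of
# the patch). An edit wedged between two equalities is slid sideways to
# the nearest word boundary:
from xmldiff.diff_match_patch import diff_match_patch

dmp = diff_match_patch()
diffs = [(0, 'The c'), (1, 'ow and the c'), (0, 'at.')]
dmp.diff_cleanupSemanticLossless(diffs)
# diffs == [(0, 'The '), (1, 'cow and the '), (0, 'cat.')]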
+ post_ins = False # Is there an insertion operation after the last equality. + post_del = False # Is there a deletion operation after the last equality. + while pointer < len(diffs): + if diffs[pointer][0] == self.DIFF_EQUAL: # Equality found. + if (len(diffs[pointer][1]) < self.Diff_EditCost and + (post_ins or post_del)): + # Candidate found. + equalities.append(pointer) + pre_ins = post_ins + pre_del = post_del + lastEquality = diffs[pointer][1] + else: + # Not a candidate, and can never become one. + equalities = [] + lastEquality = None + + post_ins = post_del = False + else: # An insertion or deletion. + if diffs[pointer][0] == self.DIFF_DELETE: + post_del = True + else: + post_ins = True + + # Five types to be split: + # ABXYCD + # AXCD + # ABXC + # AXCD + # ABXC + + if lastEquality and ((pre_ins and pre_del and post_ins and post_del) or + ((len(lastEquality) < self.Diff_EditCost / 2) and + (pre_ins + pre_del + post_ins + post_del) == 3)): + # Duplicate record. + diffs.insert(equalities[-1], (self.DIFF_DELETE, lastEquality)) + # Change second copy to insert. + diffs[equalities[-1] + 1] = (self.DIFF_INSERT, + diffs[equalities[-1] + 1][1]) + equalities.pop() # Throw away the equality we just deleted. + lastEquality = None + if pre_ins and pre_del: + # No changes made which could affect previous entry, keep going. + post_ins = post_del = True + equalities = [] + else: + if len(equalities): + equalities.pop() # Throw away the previous equality. + if len(equalities): + pointer = equalities[-1] + else: + pointer = -1 + post_ins = post_del = False + changes = True + pointer += 1 + + if changes: + self.diff_cleanupMerge(diffs) + + def diff_cleanupMerge(self, diffs): + """Reorder and merge like edit sections. Merge equalities. + Any edit section can move as long as it doesn't cross an equality. + + Args: + diffs: Array of diff tuples. + """ + diffs.append((self.DIFF_EQUAL, '')) # Add a dummy entry at the end. + pointer = 0 + count_delete = 0 + count_insert = 0 + text_delete = '' + text_insert = '' + while pointer < len(diffs): + if diffs[pointer][0] == self.DIFF_INSERT: + count_insert += 1 + text_insert += diffs[pointer][1] + pointer += 1 + elif diffs[pointer][0] == self.DIFF_DELETE: + count_delete += 1 + text_delete += diffs[pointer][1] + pointer += 1 + elif diffs[pointer][0] == self.DIFF_EQUAL: + # Upon reaching an equality, check for prior redundancies. + if count_delete + count_insert > 1: + if count_delete != 0 and count_insert != 0: + # Factor out any common prefixies. + commonlength = self.diff_commonPrefix(text_insert, text_delete) + if commonlength != 0: + x = pointer - count_delete - count_insert - 1 + if x >= 0 and diffs[x][0] == self.DIFF_EQUAL: + diffs[x] = (diffs[x][0], diffs[x][1] + + text_insert[:commonlength]) + else: + diffs.insert(0, (self.DIFF_EQUAL, text_insert[:commonlength])) + pointer += 1 + text_insert = text_insert[commonlength:] + text_delete = text_delete[commonlength:] + # Factor out any common suffixies. + commonlength = self.diff_commonSuffix(text_insert, text_delete) + if commonlength != 0: + diffs[pointer] = (diffs[pointer][0], text_insert[-commonlength:] + + diffs[pointer][1]) + text_insert = text_insert[:-commonlength] + text_delete = text_delete[:-commonlength] + # Delete the offending records and add the merged ones. 
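# Aside: diff_cleanupEfficiency in practice (sketch, not part of the
# patch). With the default Diff_EditCost of 4, a short equality boxed in
# by four edits costs more to keep than to re-edit:
from xmldiff.diff_match_patch import diff_match_patch

dmp = diff_match_patch()
diffs = [(-1, 'ab'), (1, '12'), (0, 'xyz'), (-1, 'cd'), (1, '34')]
dmp.diff_cleanupEfficiency(diffs)
# diffs == [(-1, 'abxyzcd'), (1, '12xyz34')]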
+ new_ops = [] + if len(text_delete) != 0: + new_ops.append((self.DIFF_DELETE, text_delete)) + if len(text_insert) != 0: + new_ops.append((self.DIFF_INSERT, text_insert)) + pointer -= count_delete + count_insert + diffs[pointer : pointer + count_delete + count_insert] = new_ops + pointer += len(new_ops) + 1 + elif pointer != 0 and diffs[pointer - 1][0] == self.DIFF_EQUAL: + # Merge this equality with the previous one. + diffs[pointer - 1] = (diffs[pointer - 1][0], + diffs[pointer - 1][1] + diffs[pointer][1]) + del diffs[pointer] + else: + pointer += 1 + + count_insert = 0 + count_delete = 0 + text_delete = '' + text_insert = '' + + if diffs[-1][1] == '': + diffs.pop() # Remove the dummy entry at the end. + + # Second pass: look for single edits surrounded on both sides by equalities + # which can be shifted sideways to eliminate an equality. + # e.g: ABAC -> ABAC + changes = False + pointer = 1 + # Intentionally ignore the first and last element (don't need checking). + while pointer < len(diffs) - 1: + if (diffs[pointer - 1][0] == self.DIFF_EQUAL and + diffs[pointer + 1][0] == self.DIFF_EQUAL): + # This is a single edit surrounded by equalities. + if diffs[pointer][1].endswith(diffs[pointer - 1][1]): + # Shift the edit over the previous equality. + if diffs[pointer - 1][1] != "": + diffs[pointer] = (diffs[pointer][0], + diffs[pointer - 1][1] + + diffs[pointer][1][:-len(diffs[pointer - 1][1])]) + diffs[pointer + 1] = (diffs[pointer + 1][0], + diffs[pointer - 1][1] + diffs[pointer + 1][1]) + del diffs[pointer - 1] + changes = True + elif diffs[pointer][1].startswith(diffs[pointer + 1][1]): + # Shift the edit over the next equality. + diffs[pointer - 1] = (diffs[pointer - 1][0], + diffs[pointer - 1][1] + diffs[pointer + 1][1]) + diffs[pointer] = (diffs[pointer][0], + diffs[pointer][1][len(diffs[pointer + 1][1]):] + + diffs[pointer + 1][1]) + del diffs[pointer + 1] + changes = True + pointer += 1 + + # If shifts were made, the diff needs reordering and another shift sweep. + if changes: + self.diff_cleanupMerge(diffs) + + def diff_xIndex(self, diffs, loc): + """loc is a location in text1, compute and return the equivalent location + in text2. e.g. "The cat" vs "The big cat", 1->1, 5->8 + + Args: + diffs: Array of diff tuples. + loc: Location within text1. + + Returns: + Location within text2. + """ + chars1 = 0 + chars2 = 0 + last_chars1 = 0 + last_chars2 = 0 + for x in xrange(len(diffs)): + (op, text) = diffs[x] + if op != self.DIFF_INSERT: # Equality or deletion. + chars1 += len(text) + if op != self.DIFF_DELETE: # Equality or insertion. + chars2 += len(text) + if chars1 > loc: # Overshot the location. + break + last_chars1 = chars1 + last_chars2 = chars2 + + if len(diffs) != x and diffs[x][0] == self.DIFF_DELETE: + # The location was deleted. + return last_chars2 + # Add the remaining len(character). + return last_chars2 + (loc - last_chars1) + + def diff_prettyHtml(self, diffs): + """Convert a diff array into a pretty HTML report. + + Args: + diffs: Array of diff tuples. + + Returns: + HTML representation. + """ + html = [] + for (op, data) in diffs: + text = (data.replace("&", "&").replace("<", "<") + .replace(">", ">").replace("\n", "¶
")) + if op == self.DIFF_INSERT: + html.append("%s" % text) + elif op == self.DIFF_DELETE: + html.append("%s" % text) + elif op == self.DIFF_EQUAL: + html.append("%s" % text) + return "".join(html) + + def diff_text1(self, diffs): + """Compute and return the source text (all equalities and deletions). + + Args: + diffs: Array of diff tuples. + + Returns: + Source text. + """ + text = [] + for (op, data) in diffs: + if op != self.DIFF_INSERT: + text.append(data) + return "".join(text) + + def diff_text2(self, diffs): + """Compute and return the destination text (all equalities and insertions). + + Args: + diffs: Array of diff tuples. + + Returns: + Destination text. + """ + text = [] + for (op, data) in diffs: + if op != self.DIFF_DELETE: + text.append(data) + return "".join(text) + + def diff_levenshtein(self, diffs): + """Compute the Levenshtein distance; the number of inserted, deleted or + substituted characters. + + Args: + diffs: Array of diff tuples. + + Returns: + Number of changes. + """ + levenshtein = 0 + insertions = 0 + deletions = 0 + for (op, data) in diffs: + if op == self.DIFF_INSERT: + insertions += len(data) + elif op == self.DIFF_DELETE: + deletions += len(data) + elif op == self.DIFF_EQUAL: + # A deletion and an insertion is one substitution. + levenshtein += max(insertions, deletions) + insertions = 0 + deletions = 0 + levenshtein += max(insertions, deletions) + return levenshtein + + def diff_toDelta(self, diffs): + """Crush the diff into an encoded string which describes the operations + required to transform text1 into text2. + E.g. =3\t-2\t+ing -> Keep 3 chars, delete 2 chars, insert 'ing'. + Operations are tab-separated. Inserted text is escaped using %xx notation. + + Args: + diffs: Array of diff tuples. + + Returns: + Delta text. + """ + text = [] + for (op, data) in diffs: + if op == self.DIFF_INSERT: + # High ascii will raise UnicodeDecodeError. Use Unicode instead. + data = data.encode("utf-8") + text.append("+" + urllib.quote(data, "!~*'();/?:@&=+$,# ")) + elif op == self.DIFF_DELETE: + text.append("-%d" % len(data)) + elif op == self.DIFF_EQUAL: + text.append("=%d" % len(data)) + return "\t".join(text) + + def diff_fromDelta(self, text1, delta): + """Given the original text1, and an encoded string which describes the + operations required to transform text1 into text2, compute the full diff. + + Args: + text1: Source string for the diff. + delta: Delta text. + + Returns: + Array of diff tuples. + + Raises: + ValueError: If invalid input. + """ + if type(delta) == unicode: + # Deltas should be composed of a subset of ascii chars, Unicode not + # required. If this encode raises UnicodeEncodeError, delta is invalid. + delta = delta.encode("ascii") + diffs = [] + pointer = 0 # Cursor in text1 + tokens = delta.split("\t") + for token in tokens: + if token == "": + # Blank tokens are ok (from a trailing \t). + continue + # Each token begins with a one character parameter which specifies the + # operation of this token (delete, insert, equality). 
+ param = token[1:] + if token[0] == "+": + param = urllib.unquote(param).decode("utf-8") + diffs.append((self.DIFF_INSERT, param)) + elif token[0] == "-" or token[0] == "=": + try: + n = int(param) + except ValueError: + raise ValueError("Invalid number in diff_fromDelta: " + param) + if n < 0: + raise ValueError("Negative number in diff_fromDelta: " + param) + text = text1[pointer : pointer + n] + pointer += n + if token[0] == "=": + diffs.append((self.DIFF_EQUAL, text)) + else: + diffs.append((self.DIFF_DELETE, text)) + else: + # Anything else is an error. + raise ValueError("Invalid diff operation in diff_fromDelta: " + + token[0]) + if pointer != len(text1): + raise ValueError( + "Delta length (%d) does not equal source text length (%d)." % + (pointer, len(text1))) + return diffs + + # MATCH FUNCTIONS + + def match_main(self, text, pattern, loc): + """Locate the best instance of 'pattern' in 'text' near 'loc'. + + Args: + text: The text to search. + pattern: The pattern to search for. + loc: The location to search around. + + Returns: + Best match index or -1. + """ + # Check for null inputs. + if text == None or pattern == None: + raise ValueError("Null inputs. (match_main)") + + loc = max(0, min(loc, len(text))) + if text == pattern: + # Shortcut (potentially not guaranteed by the algorithm) + return 0 + elif not text: + # Nothing to match. + return -1 + elif text[loc:loc + len(pattern)] == pattern: + # Perfect match at the perfect spot! (Includes case of null pattern) + return loc + else: + # Do a fuzzy compare. + match = self.match_bitap(text, pattern, loc) + return match + + def match_bitap(self, text, pattern, loc): + """Locate the best instance of 'pattern' in 'text' near 'loc' using the + Bitap algorithm. + + Args: + text: The text to search. + pattern: The pattern to search for. + loc: The location to search around. + + Returns: + Best match index or -1. + """ + # Python doesn't have a maxint limit, so ignore this check. + #if self.Match_MaxBits != 0 and len(pattern) > self.Match_MaxBits: + # raise ValueError("Pattern too long for this application.") + + # Initialise the alphabet. + s = self.match_alphabet(pattern) + + def match_bitapScore(e, x): + """Compute and return the score for a match with e errors and x location. + Accesses loc and pattern through being a closure. + + Args: + e: Number of errors in match. + x: Location of match. + + Returns: + Overall score for match (0.0 = good, 1.0 = bad). + """ + accuracy = float(e) / len(pattern) + proximity = abs(loc - x) + if not self.Match_Distance: + # Dodge divide by zero error. + return proximity and 1.0 or accuracy + return accuracy + (proximity / float(self.Match_Distance)) + + # Highest score beyond which we give up. + score_threshold = self.Match_Threshold + # Is there a nearby exact match? (speedup) + best_loc = text.find(pattern, loc) + if best_loc != -1: + score_threshold = min(match_bitapScore(0, best_loc), score_threshold) + # What about in the other direction? (speedup) + best_loc = text.rfind(pattern, loc + len(pattern)) + if best_loc != -1: + score_threshold = min(match_bitapScore(0, best_loc), score_threshold) + + # Initialise the bit arrays. + matchmask = 1 << (len(pattern) - 1) + best_loc = -1 + + bin_max = len(pattern) + len(text) + # Empty initialization added to appease pychecker. + last_rd = None + for d in xrange(len(pattern)): + # Scan for the best match each iteration allows for one more error. + # Run a binary search to determine how far from 'loc' we can stray at + # this error level. 
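# Aside: fuzzy matching in practice (sketch, not part of the patch).
# match_main trades off errors against distance from the expected
# location, weighted by Match_Threshold and Match_Distance:
from xmldiff.diff_match_patch import diff_match_patch

dmp = diff_match_patch()
loc = dmp.match_main("I am the very model of a modern major general",
                     " that berry ", 5)
# loc == 4 -- ' the very ' is the best fuzzy match near position 5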
+ bin_min = 0 + bin_mid = bin_max + while bin_min < bin_mid: + if match_bitapScore(d, loc + bin_mid) <= score_threshold: + bin_min = bin_mid + else: + bin_max = bin_mid + bin_mid = (bin_max - bin_min) // 2 + bin_min + + # Use the result from this iteration as the maximum for the next. + bin_max = bin_mid + start = max(1, loc - bin_mid + 1) + finish = min(loc + bin_mid, len(text)) + len(pattern) + + rd = [0] * (finish + 2) + rd[finish + 1] = (1 << d) - 1 + for j in xrange(finish, start - 1, -1): + if len(text) <= j - 1: + # Out of range. + charMatch = 0 + else: + charMatch = s.get(text[j - 1], 0) + if d == 0: # First pass: exact match. + rd[j] = ((rd[j + 1] << 1) | 1) & charMatch + else: # Subsequent passes: fuzzy match. + rd[j] = (((rd[j + 1] << 1) | 1) & charMatch) | ( + ((last_rd[j + 1] | last_rd[j]) << 1) | 1) | last_rd[j + 1] + if rd[j] & matchmask: + score = match_bitapScore(d, j - 1) + # This match will almost certainly be better than any existing match. + # But check anyway. + if score <= score_threshold: + # Told you so. + score_threshold = score + best_loc = j - 1 + if best_loc > loc: + # When passing loc, don't exceed our current distance from loc. + start = max(1, 2 * loc - best_loc) + else: + # Already passed loc, downhill from here on in. + break + # No hope for a (better) match at greater error levels. + if match_bitapScore(d + 1, loc) > score_threshold: + break + last_rd = rd + return best_loc + + def match_alphabet(self, pattern): + """Initialise the alphabet for the Bitap algorithm. + + Args: + pattern: The text to encode. + + Returns: + Hash of character locations. + """ + s = {} + for char in pattern: + s[char] = 0 + for i in xrange(len(pattern)): + s[pattern[i]] |= 1 << (len(pattern) - i - 1) + return s + + # PATCH FUNCTIONS + + def patch_addContext(self, patch, text): + """Increase the context until it is unique, + but don't let the pattern expand beyond Match_MaxBits. + + Args: + patch: The patch to grow. + text: Source text. + """ + if len(text) == 0: + return + pattern = text[patch.start2 : patch.start2 + patch.length1] + padding = 0 + + # Look for the first and last matches of pattern in text. If two different + # matches are found, increase the pattern length. + while (text.find(pattern) != text.rfind(pattern) and (self.Match_MaxBits == + 0 or len(pattern) < self.Match_MaxBits - self.Patch_Margin - + self.Patch_Margin)): + padding += self.Patch_Margin + pattern = text[max(0, patch.start2 - padding) : + patch.start2 + patch.length1 + padding] + # Add one chunk for good luck. + padding += self.Patch_Margin + + # Add the prefix. + prefix = text[max(0, patch.start2 - padding) : patch.start2] + if prefix: + patch.diffs[:0] = [(self.DIFF_EQUAL, prefix)] + # Add the suffix. + suffix = text[patch.start2 + patch.length1 : + patch.start2 + patch.length1 + padding] + if suffix: + patch.diffs.append((self.DIFF_EQUAL, suffix)) + + # Roll back the start points. + patch.start1 -= len(prefix) + patch.start2 -= len(prefix) + # Extend lengths. + patch.length1 += len(prefix) + len(suffix) + patch.length2 += len(prefix) + len(suffix) + + def patch_make(self, a, b=None, c=None): + """Compute a list of patches to turn text1 into text2. + Use diffs if provided, otherwise compute it ourselves. 
+ There are four ways to call this function, depending on what data is + available to the caller: + Method 1: + a = text1, b = text2 + Method 2: + a = diffs + Method 3 (optimal): + a = text1, b = diffs + Method 4 (deprecated, use method 3): + a = text1, b = text2, c = diffs + + Args: + a: text1 (methods 1,3,4) or Array of diff tuples for text1 to + text2 (method 2). + b: text2 (methods 1,4) or Array of diff tuples for text1 to + text2 (method 3) or undefined (method 2). + c: Array of diff tuples for text1 to text2 (method 4) or + undefined (methods 1,2,3). + + Returns: + Array of Patch objects. + """ + text1 = None + diffs = None + # Note that texts may arrive as 'str' or 'unicode'. + if isinstance(a, basestring) and isinstance(b, basestring) and c is None: + # Method 1: text1, text2 + # Compute diffs from text1 and text2. + text1 = a + diffs = self.diff_main(text1, b, True) + if len(diffs) > 2: + self.diff_cleanupSemantic(diffs) + self.diff_cleanupEfficiency(diffs) + elif isinstance(a, list) and b is None and c is None: + # Method 2: diffs + # Compute text1 from diffs. + diffs = a + text1 = self.diff_text1(diffs) + elif isinstance(a, basestring) and isinstance(b, list) and c is None: + # Method 3: text1, diffs + text1 = a + diffs = b + elif (isinstance(a, basestring) and isinstance(b, basestring) and + isinstance(c, list)): + # Method 4: text1, text2, diffs + # text2 is not used. + text1 = a + diffs = c + else: + raise ValueError("Unknown call format to patch_make.") + + if not diffs: + return [] # Get rid of the None case. + patches = [] + patch = patch_obj() + char_count1 = 0 # Number of characters into the text1 string. + char_count2 = 0 # Number of characters into the text2 string. + prepatch_text = text1 # Recreate the patches to determine context info. + postpatch_text = text1 + for x in xrange(len(diffs)): + (diff_type, diff_text) = diffs[x] + if len(patch.diffs) == 0 and diff_type != self.DIFF_EQUAL: + # A new patch starts here. + patch.start1 = char_count1 + patch.start2 = char_count2 + if diff_type == self.DIFF_INSERT: + # Insertion + patch.diffs.append(diffs[x]) + patch.length2 += len(diff_text) + postpatch_text = (postpatch_text[:char_count2] + diff_text + + postpatch_text[char_count2:]) + elif diff_type == self.DIFF_DELETE: + # Deletion. + patch.length1 += len(diff_text) + patch.diffs.append(diffs[x]) + postpatch_text = (postpatch_text[:char_count2] + + postpatch_text[char_count2 + len(diff_text):]) + elif (diff_type == self.DIFF_EQUAL and + len(diff_text) <= 2 * self.Patch_Margin and + len(patch.diffs) != 0 and len(diffs) != x + 1): + # Small equality inside a patch. + patch.diffs.append(diffs[x]) + patch.length1 += len(diff_text) + patch.length2 += len(diff_text) + + if (diff_type == self.DIFF_EQUAL and + len(diff_text) >= 2 * self.Patch_Margin): + # Time for a new patch. + if len(patch.diffs) != 0: + self.patch_addContext(patch, prepatch_text) + patches.append(patch) + patch = patch_obj() + # Unlike Unidiff, our patch lists have a rolling context. + # https://github.com/google/diff-match-patch/wiki/Unidiff + # Update prepatch text & pos to reflect the application of the + # just completed patch. + prepatch_text = postpatch_text + char_count1 = char_count2 + + # Update the current character count. + if diff_type != self.DIFF_INSERT: + char_count1 += len(diff_text) + if diff_type != self.DIFF_DELETE: + char_count2 += len(diff_text) + + # Pick up the leftover patch if not empty. 
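# Aside: making and serializing patches (sketch, not part of the patch;
# the exact context lines depend on Patch_Margin, default 4):
from xmldiff.diff_match_patch import diff_match_patch

dmp = diff_match_patch()
patches = dmp.patch_make("The quick brown fox", "The slow brown fox")
print(dmp.patch_toText(patches))
# Roughly:
# @@ -1,13 +1,12 @@
#  The 
# -quick
# +slow
#   bro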
+ if len(patch.diffs) != 0: + self.patch_addContext(patch, prepatch_text) + patches.append(patch) + return patches + + def patch_deepCopy(self, patches): + """Given an array of patches, return another array that is identical. + + Args: + patches: Array of Patch objects. + + Returns: + Array of Patch objects. + """ + patchesCopy = [] + for patch in patches: + patchCopy = patch_obj() + # No need to deep copy the tuples since they are immutable. + patchCopy.diffs = patch.diffs[:] + patchCopy.start1 = patch.start1 + patchCopy.start2 = patch.start2 + patchCopy.length1 = patch.length1 + patchCopy.length2 = patch.length2 + patchesCopy.append(patchCopy) + return patchesCopy + + def patch_apply(self, patches, text): + """Merge a set of patches onto the text. Return a patched text, as well + as a list of true/false values indicating which patches were applied. + + Args: + patches: Array of Patch objects. + text: Old text. + + Returns: + Two element Array, containing the new text and an array of boolean values. + """ + if not patches: + return (text, []) + + # Deep copy the patches so that no changes are made to originals. + patches = self.patch_deepCopy(patches) + + nullPadding = self.patch_addPadding(patches) + text = nullPadding + text + nullPadding + self.patch_splitMax(patches) + + # delta keeps track of the offset between the expected and actual location + # of the previous patch. If there are patches expected at positions 10 and + # 20, but the first patch was found at 12, delta is 2 and the second patch + # has an effective expected position of 22. + delta = 0 + results = [] + for patch in patches: + expected_loc = patch.start2 + delta + text1 = self.diff_text1(patch.diffs) + end_loc = -1 + if len(text1) > self.Match_MaxBits: + # patch_splitMax will only provide an oversized pattern in the case of + # a monster delete. + start_loc = self.match_main(text, text1[:self.Match_MaxBits], + expected_loc) + if start_loc != -1: + end_loc = self.match_main(text, text1[-self.Match_MaxBits:], + expected_loc + len(text1) - self.Match_MaxBits) + if end_loc == -1 or start_loc >= end_loc: + # Can't find valid trailing context. Drop this patch. + start_loc = -1 + else: + start_loc = self.match_main(text, text1, expected_loc) + if start_loc == -1: + # No match found. :( + results.append(False) + # Subtract the delta for this failed patch from subsequent patches. + delta -= patch.length2 - patch.length1 + else: + # Found a match. :) + results.append(True) + delta = start_loc - expected_loc + if end_loc == -1: + text2 = text[start_loc : start_loc + len(text1)] + else: + text2 = text[start_loc : end_loc + self.Match_MaxBits] + if text1 == text2: + # Perfect match, just shove the replacement text in. + text = (text[:start_loc] + self.diff_text2(patch.diffs) + + text[start_loc + len(text1):]) + else: + # Imperfect match. + # Run a diff to get a framework of equivalent indices. + diffs = self.diff_main(text1, text2, False) + if (len(text1) > self.Match_MaxBits and + self.diff_levenshtein(diffs) / float(len(text1)) > + self.Patch_DeleteThreshold): + # The end points match, but the content is unacceptably bad. 
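# Aside: applying patches (sketch, not part of the patch). patch_apply
# returns the new text plus one boolean per patch:
from xmldiff.diff_match_patch import diff_match_patch

dmp = diff_match_patch()
patches = dmp.patch_make("The quick brown fox", "The slow brown fox")
new_text, results = dmp.patch_apply(patches, "The quick brown fox")
# ('The slow brown fox', [True]); thanks to the fuzzy matching above,
# the same patches should also apply to slightly drifted source text.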
+ results[-1] = False + else: + self.diff_cleanupSemanticLossless(diffs) + index1 = 0 + for (op, data) in patch.diffs: + if op != self.DIFF_EQUAL: + index2 = self.diff_xIndex(diffs, index1) + if op == self.DIFF_INSERT: # Insertion + text = text[:start_loc + index2] + data + text[start_loc + + index2:] + elif op == self.DIFF_DELETE: # Deletion + text = text[:start_loc + index2] + text[start_loc + + self.diff_xIndex(diffs, index1 + len(data)):] + if op != self.DIFF_DELETE: + index1 += len(data) + # Strip the padding off. + text = text[len(nullPadding):-len(nullPadding)] + return (text, results) + + def patch_addPadding(self, patches): + """Add some padding on text start and end so that edges can match + something. Intended to be called only from within patch_apply. + + Args: + patches: Array of Patch objects. + + Returns: + The padding string added to each side. + """ + paddingLength = self.Patch_Margin + nullPadding = "" + for x in xrange(1, paddingLength + 1): + nullPadding += chr(x) + + # Bump all the patches forward. + for patch in patches: + patch.start1 += paddingLength + patch.start2 += paddingLength + + # Add some padding on start of first diff. + patch = patches[0] + diffs = patch.diffs + if not diffs or diffs[0][0] != self.DIFF_EQUAL: + # Add nullPadding equality. + diffs.insert(0, (self.DIFF_EQUAL, nullPadding)) + patch.start1 -= paddingLength # Should be 0. + patch.start2 -= paddingLength # Should be 0. + patch.length1 += paddingLength + patch.length2 += paddingLength + elif paddingLength > len(diffs[0][1]): + # Grow first equality. + extraLength = paddingLength - len(diffs[0][1]) + newText = nullPadding[len(diffs[0][1]):] + diffs[0][1] + diffs[0] = (diffs[0][0], newText) + patch.start1 -= extraLength + patch.start2 -= extraLength + patch.length1 += extraLength + patch.length2 += extraLength + + # Add some padding on end of last diff. + patch = patches[-1] + diffs = patch.diffs + if not diffs or diffs[-1][0] != self.DIFF_EQUAL: + # Add nullPadding equality. + diffs.append((self.DIFF_EQUAL, nullPadding)) + patch.length1 += paddingLength + patch.length2 += paddingLength + elif paddingLength > len(diffs[-1][1]): + # Grow last equality. + extraLength = paddingLength - len(diffs[-1][1]) + newText = diffs[-1][1] + nullPadding[:extraLength] + diffs[-1] = (diffs[-1][0], newText) + patch.length1 += extraLength + patch.length2 += extraLength + + return nullPadding + + def patch_splitMax(self, patches): + """Look through the patches and break up any which are longer than the + maximum limit of the match algorithm. + Intended to be called only from within patch_apply. + + Args: + patches: Array of Patch objects. + """ + patch_size = self.Match_MaxBits + if patch_size == 0: + # Python has the option of not splitting strings due to its ability + # to handle integers of arbitrary precision. + return + for x in xrange(len(patches)): + if patches[x].length1 <= patch_size: + continue + bigpatch = patches[x] + # Remove the big old patch. + del patches[x] + x -= 1 + start1 = bigpatch.start1 + start2 = bigpatch.start2 + precontext = '' + while len(bigpatch.diffs) != 0: + # Create one of several smaller patches. 
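# Aside: patch_splitMax in practice (sketch, not part of the patch).
# Patches longer than Match_MaxBits (32) are broken up so the bitap
# matcher can still locate them:
from xmldiff.diff_match_patch import diff_match_patch

dmp = diff_match_patch()
patches = dmp.patch_make(
    'abcdefghijklmnopqrstuvwxyz01234567890',
    'XabXcdXefXghXijXklXmnXopXqrXstXuvXwxXyzX01X23X45X67X89X0')
dmp.patch_splitMax(patches)
# Every resulting patch now fits within the 32-character match window.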
+ patch = patch_obj() + empty = True + patch.start1 = start1 - len(precontext) + patch.start2 = start2 - len(precontext) + if precontext: + patch.length1 = patch.length2 = len(precontext) + patch.diffs.append((self.DIFF_EQUAL, precontext)) + + while (len(bigpatch.diffs) != 0 and + patch.length1 < patch_size - self.Patch_Margin): + (diff_type, diff_text) = bigpatch.diffs[0] + if diff_type == self.DIFF_INSERT: + # Insertions are harmless. + patch.length2 += len(diff_text) + start2 += len(diff_text) + patch.diffs.append(bigpatch.diffs.pop(0)) + empty = False + elif (diff_type == self.DIFF_DELETE and len(patch.diffs) == 1 and + patch.diffs[0][0] == self.DIFF_EQUAL and + len(diff_text) > 2 * patch_size): + # This is a large deletion. Let it pass in one chunk. + patch.length1 += len(diff_text) + start1 += len(diff_text) + empty = False + patch.diffs.append((diff_type, diff_text)) + del bigpatch.diffs[0] + else: + # Deletion or equality. Only take as much as we can stomach. + diff_text = diff_text[:patch_size - patch.length1 - + self.Patch_Margin] + patch.length1 += len(diff_text) + start1 += len(diff_text) + if diff_type == self.DIFF_EQUAL: + patch.length2 += len(diff_text) + start2 += len(diff_text) + else: + empty = False + + patch.diffs.append((diff_type, diff_text)) + if diff_text == bigpatch.diffs[0][1]: + del bigpatch.diffs[0] + else: + bigpatch.diffs[0] = (bigpatch.diffs[0][0], + bigpatch.diffs[0][1][len(diff_text):]) + + # Compute the head context for the next patch. + precontext = self.diff_text2(patch.diffs) + precontext = precontext[-self.Patch_Margin:] + # Append the end context for this patch. + postcontext = self.diff_text1(bigpatch.diffs)[:self.Patch_Margin] + if postcontext: + patch.length1 += len(postcontext) + patch.length2 += len(postcontext) + if len(patch.diffs) != 0 and patch.diffs[-1][0] == self.DIFF_EQUAL: + patch.diffs[-1] = (self.DIFF_EQUAL, patch.diffs[-1][1] + + postcontext) + else: + patch.diffs.append((self.DIFF_EQUAL, postcontext)) + + if not empty: + x += 1 + patches.insert(x, patch) + + def patch_toText(self, patches): + """Take a list of patches and return a textual representation. + + Args: + patches: Array of Patch objects. + + Returns: + Text representation of patches. + """ + text = [] + for patch in patches: + text.append(str(patch)) + return "".join(text) + + def patch_fromText(self, textline): + """Parse a textual representation of patches and return a list of patch + objects. + + Args: + textline: Text representation of patches. + + Returns: + Array of Patch objects. + + Raises: + ValueError: If invalid input. + """ + if type(textline) == unicode: + # Patches should be composed of a subset of ascii chars, Unicode not + # required. If this encode raises UnicodeEncodeError, patch is invalid. 
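+    # (a serialized patch is a "@@ -1,8 +1,9 @@" header followed by
+    # ' '/'-'/'+'-prefixed, %xx-escaped lines, so ASCII always suffices)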
+ textline = textline.encode("ascii") + patches = [] + if not textline: + return patches + text = textline.split('\n') + while len(text) != 0: + m = re.match("^@@ -(\d+),?(\d*) \+(\d+),?(\d*) @@$", text[0]) + if not m: + raise ValueError("Invalid patch string: " + text[0]) + patch = patch_obj() + patches.append(patch) + patch.start1 = int(m.group(1)) + if m.group(2) == '': + patch.start1 -= 1 + patch.length1 = 1 + elif m.group(2) == '0': + patch.length1 = 0 + else: + patch.start1 -= 1 + patch.length1 = int(m.group(2)) + + patch.start2 = int(m.group(3)) + if m.group(4) == '': + patch.start2 -= 1 + patch.length2 = 1 + elif m.group(4) == '0': + patch.length2 = 0 + else: + patch.start2 -= 1 + patch.length2 = int(m.group(4)) + + del text[0] + + while len(text) != 0: + if text[0]: + sign = text[0][0] + else: + sign = '' + line = urllib.unquote(text[0][1:]) + line = line.decode("utf-8") + if sign == '+': + # Insertion. + patch.diffs.append((self.DIFF_INSERT, line)) + elif sign == '-': + # Deletion. + patch.diffs.append((self.DIFF_DELETE, line)) + elif sign == ' ': + # Minor equality. + patch.diffs.append((self.DIFF_EQUAL, line)) + elif sign == '@': + # Start of next patch. + break + elif sign == '': + # Blank line? Whatever. + pass + else: + # WTF? + raise ValueError("Invalid patch mode: '%s'\n%s" % (sign, line)) + del text[0] + return patches + + +class patch_obj: + """Class representing one patch operation. + """ + + def __init__(self): + """Initializes with an empty list of diffs. + """ + self.diffs = [] + self.start1 = None + self.start2 = None + self.length1 = 0 + self.length2 = 0 + + def __str__(self): + """Emulate GNU diff's format. + Header: @@ -382,8 +481,9 @@ + Indices are printed as 1-based, not 0-based. + + Returns: + The GNU diff string. + """ + if self.length1 == 0: + coords1 = str(self.start1) + ",0" + elif self.length1 == 1: + coords1 = str(self.start1 + 1) + else: + coords1 = str(self.start1 + 1) + "," + str(self.length1) + if self.length2 == 0: + coords2 = str(self.start2) + ",0" + elif self.length2 == 1: + coords2 = str(self.start2 + 1) + else: + coords2 = str(self.start2 + 1) + "," + str(self.length2) + text = ["@@ -", coords1, " +", coords2, " @@\n"] + # Escape the body of the patch with %xx notation. + for (op, data) in self.diffs: + if op == diff_match_patch.DIFF_INSERT: + text.append("+") + elif op == diff_match_patch.DIFF_DELETE: + text.append("-") + elif op == diff_match_patch.DIFF_EQUAL: + text.append(" ") + # High ascii will raise UnicodeDecodeError. Use Unicode instead. + data = data.encode("utf-8") + text.append(urllib.quote(data, "!~*'();/?:@&=+$,# ") + "\n") + return "".join(text) diff --git a/xmldiff/_diff_match_patch_py3.py b/xmldiff/_diff_match_patch_py3.py new file mode 100644 index 0000000..906d16d --- /dev/null +++ b/xmldiff/_diff_match_patch_py3.py @@ -0,0 +1,1907 @@ +#!/usr/bin/python3 + +"""Diff Match and Patch +Copyright 2018 The diff-match-patch Authors. +https://github.com/google/diff-match-patch + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +"""Functions for diff, match and patch. + +Computes the difference between two texts to create a patch. +Applies the patch onto another text, allowing for errors. +""" + +__author__ = 'fraser@google.com (Neil Fraser)' + +import re +import sys +import time +import urllib.parse + + +class diff_match_patch: + """Class containing the diff, match and patch methods. + + Also contains the behaviour settings. + """ + + def __init__(self): + """Inits a diff_match_patch object with default settings. + Redefine these in your program to override the defaults. + """ + + # Number of seconds to map a diff before giving up (0 for infinity). + self.Diff_Timeout = 1.0 + # Cost of an empty edit operation in terms of edit characters. + self.Diff_EditCost = 4 + # At what point is no match declared (0.0 = perfection, 1.0 = very loose). + self.Match_Threshold = 0.5 + # How far to search for a match (0 = exact location, 1000+ = broad match). + # A match this many characters away from the expected location will add + # 1.0 to the score (0.0 is a perfect match). + self.Match_Distance = 1000 + # When deleting a large block of text (over ~64 characters), how close do + # the contents have to be to match the expected contents. (0.0 = perfection, + # 1.0 = very loose). Note that Match_Threshold controls how closely the + # end points of a delete need to match. + self.Patch_DeleteThreshold = 0.5 + # Chunk size for context length. + self.Patch_Margin = 4 + + # The number of bits in an int. + # Python has no maximum, thus to disable patch splitting set to 0. + # However to avoid long patches in certain pathological cases, use 32. + # Multiple short patches (using native ints) are much faster than long ones. + self.Match_MaxBits = 32 + + # DIFF FUNCTIONS + + # The data structure representing a diff is an array of tuples: + # [(DIFF_DELETE, "Hello"), (DIFF_INSERT, "Goodbye"), (DIFF_EQUAL, " world.")] + # which means: delete "Hello", add "Goodbye" and keep " world." + DIFF_DELETE = -1 + DIFF_INSERT = 1 + DIFF_EQUAL = 0 + + def diff_main(self, text1, text2, checklines=True, deadline=None): + """Find the differences between two texts. Simplifies the problem by + stripping any common prefix or suffix off the texts before diffing. + + Args: + text1: Old string to be diffed. + text2: New string to be diffed. + checklines: Optional speedup flag. If present and false, then don't run + a line-level diff first to identify the changed areas. + Defaults to true, which does a faster, slightly less optimal diff. + deadline: Optional time when the diff should be complete by. Used + internally for recursive calls. Users should set DiffTimeout instead. + + Returns: + Array of changes. + """ + # Set a deadline by which time the diff must be complete. + if deadline == None: + # Unlike in most languages, Python counts time in seconds. + if self.Diff_Timeout <= 0: + deadline = sys.maxsize + else: + deadline = time.time() + self.Diff_Timeout + + # Check for null inputs. + if text1 == None or text2 == None: + raise ValueError("Null inputs. (diff_main)") + + # Check for equality (speedup). + if text1 == text2: + if text1: + return [(self.DIFF_EQUAL, text1)] + return [] + + # Trim off common prefix (speedup). + commonlength = self.diff_commonPrefix(text1, text2) + commonprefix = text1[:commonlength] + text1 = text1[commonlength:] + text2 = text2[commonlength:] + + # Trim off common suffix (speedup). 
+ commonlength = self.diff_commonSuffix(text1, text2) + if commonlength == 0: + commonsuffix = '' + else: + commonsuffix = text1[-commonlength:] + text1 = text1[:-commonlength] + text2 = text2[:-commonlength] + + # Compute the diff on the middle block. + diffs = self.diff_compute(text1, text2, checklines, deadline) + + # Restore the prefix and suffix. + if commonprefix: + diffs[:0] = [(self.DIFF_EQUAL, commonprefix)] + if commonsuffix: + diffs.append((self.DIFF_EQUAL, commonsuffix)) + self.diff_cleanupMerge(diffs) + return diffs + + def diff_compute(self, text1, text2, checklines, deadline): + """Find the differences between two texts. Assumes that the texts do not + have any common prefix or suffix. + + Args: + text1: Old string to be diffed. + text2: New string to be diffed. + checklines: Speedup flag. If false, then don't run a line-level diff + first to identify the changed areas. + If true, then run a faster, slightly less optimal diff. + deadline: Time when the diff should be complete by. + + Returns: + Array of changes. + """ + if not text1: + # Just add some text (speedup). + return [(self.DIFF_INSERT, text2)] + + if not text2: + # Just delete some text (speedup). + return [(self.DIFF_DELETE, text1)] + + if len(text1) > len(text2): + (longtext, shorttext) = (text1, text2) + else: + (shorttext, longtext) = (text1, text2) + i = longtext.find(shorttext) + if i != -1: + # Shorter text is inside the longer text (speedup). + diffs = [(self.DIFF_INSERT, longtext[:i]), (self.DIFF_EQUAL, shorttext), + (self.DIFF_INSERT, longtext[i + len(shorttext):])] + # Swap insertions for deletions if diff is reversed. + if len(text1) > len(text2): + diffs[0] = (self.DIFF_DELETE, diffs[0][1]) + diffs[2] = (self.DIFF_DELETE, diffs[2][1]) + return diffs + + if len(shorttext) == 1: + # Single character string. + # After the previous speedup, the character can't be an equality. + return [(self.DIFF_DELETE, text1), (self.DIFF_INSERT, text2)] + + # Check to see if the problem can be split in two. + hm = self.diff_halfMatch(text1, text2) + if hm: + # A half-match was found, sort out the return data. + (text1_a, text1_b, text2_a, text2_b, mid_common) = hm + # Send both pairs off for separate processing. + diffs_a = self.diff_main(text1_a, text2_a, checklines, deadline) + diffs_b = self.diff_main(text1_b, text2_b, checklines, deadline) + # Merge the results. + return diffs_a + [(self.DIFF_EQUAL, mid_common)] + diffs_b + + if checklines and len(text1) > 100 and len(text2) > 100: + return self.diff_lineMode(text1, text2, deadline) + + return self.diff_bisect(text1, text2, deadline) + + def diff_lineMode(self, text1, text2, deadline): + """Do a quick line-level diff on both strings, then rediff the parts for + greater accuracy. + This speedup can produce non-minimal diffs. + + Args: + text1: Old string to be diffed. + text2: New string to be diffed. + deadline: Time when the diff should be complete by. + + Returns: + Array of changes. + """ + + # Scan the text on a line-by-line basis first. + (text1, text2, linearray) = self.diff_linesToChars(text1, text2) + + diffs = self.diff_main(text1, text2, False, deadline) + + # Convert the diff back to original text. + self.diff_charsToLines(diffs, linearray) + # Eliminate freak matches (e.g. blank lines) + self.diff_cleanupSemantic(diffs) + + # Rediff any replacement blocks, this time character-by-character. + # Add a dummy entry at the end. 
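+    # (the trailing empty equality forces the loop below to flush any
+    # pending delete/insert run before the list ends)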
+ diffs.append((self.DIFF_EQUAL, '')) + pointer = 0 + count_delete = 0 + count_insert = 0 + text_delete = '' + text_insert = '' + while pointer < len(diffs): + if diffs[pointer][0] == self.DIFF_INSERT: + count_insert += 1 + text_insert += diffs[pointer][1] + elif diffs[pointer][0] == self.DIFF_DELETE: + count_delete += 1 + text_delete += diffs[pointer][1] + elif diffs[pointer][0] == self.DIFF_EQUAL: + # Upon reaching an equality, check for prior redundancies. + if count_delete >= 1 and count_insert >= 1: + # Delete the offending records and add the merged ones. + subDiff = self.diff_main(text_delete, text_insert, False, deadline) + diffs[pointer - count_delete - count_insert : pointer] = subDiff + pointer = pointer - count_delete - count_insert + len(subDiff) + count_insert = 0 + count_delete = 0 + text_delete = '' + text_insert = '' + + pointer += 1 + + diffs.pop() # Remove the dummy entry at the end. + + return diffs + + def diff_bisect(self, text1, text2, deadline): + """Find the 'middle snake' of a diff, split the problem in two + and return the recursively constructed diff. + See Myers 1986 paper: An O(ND) Difference Algorithm and Its Variations. + + Args: + text1: Old string to be diffed. + text2: New string to be diffed. + deadline: Time at which to bail if not yet complete. + + Returns: + Array of diff tuples. + """ + + # Cache the text lengths to prevent multiple calls. + text1_length = len(text1) + text2_length = len(text2) + max_d = (text1_length + text2_length + 1) // 2 + v_offset = max_d + v_length = 2 * max_d + v1 = [-1] * v_length + v1[v_offset + 1] = 0 + v2 = v1[:] + delta = text1_length - text2_length + # If the total number of characters is odd, then the front path will + # collide with the reverse path. + front = (delta % 2 != 0) + # Offsets for start and end of k loop. + # Prevents mapping of space beyond the grid. + k1start = 0 + k1end = 0 + k2start = 0 + k2end = 0 + for d in range(max_d): + # Bail out if deadline is reached. + if time.time() > deadline: + break + + # Walk the front path one step. + for k1 in range(-d + k1start, d + 1 - k1end, 2): + k1_offset = v_offset + k1 + if k1 == -d or (k1 != d and + v1[k1_offset - 1] < v1[k1_offset + 1]): + x1 = v1[k1_offset + 1] + else: + x1 = v1[k1_offset - 1] + 1 + y1 = x1 - k1 + while (x1 < text1_length and y1 < text2_length and + text1[x1] == text2[y1]): + x1 += 1 + y1 += 1 + v1[k1_offset] = x1 + if x1 > text1_length: + # Ran off the right of the graph. + k1end += 2 + elif y1 > text2_length: + # Ran off the bottom of the graph. + k1start += 2 + elif front: + k2_offset = v_offset + delta - k1 + if k2_offset >= 0 and k2_offset < v_length and v2[k2_offset] != -1: + # Mirror x2 onto top-left coordinate system. + x2 = text1_length - v2[k2_offset] + if x1 >= x2: + # Overlap detected. + return self.diff_bisectSplit(text1, text2, x1, y1, deadline) + + # Walk the reverse path one step. + for k2 in range(-d + k2start, d + 1 - k2end, 2): + k2_offset = v_offset + k2 + if k2 == -d or (k2 != d and + v2[k2_offset - 1] < v2[k2_offset + 1]): + x2 = v2[k2_offset + 1] + else: + x2 = v2[k2_offset - 1] + 1 + y2 = x2 - k2 + while (x2 < text1_length and y2 < text2_length and + text1[-x2 - 1] == text2[-y2 - 1]): + x2 += 1 + y2 += 1 + v2[k2_offset] = x2 + if x2 > text1_length: + # Ran off the left of the graph. + k2end += 2 + elif y2 > text2_length: + # Ran off the top of the graph. 
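+          # (narrow the reverse k loop so later iterations stop exploring
+          # beyond the edit grid, mirroring the front-path bookkeeping above)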
+ k2start += 2 + elif not front: + k1_offset = v_offset + delta - k2 + if k1_offset >= 0 and k1_offset < v_length and v1[k1_offset] != -1: + x1 = v1[k1_offset] + y1 = v_offset + x1 - k1_offset + # Mirror x2 onto top-left coordinate system. + x2 = text1_length - x2 + if x1 >= x2: + # Overlap detected. + return self.diff_bisectSplit(text1, text2, x1, y1, deadline) + + # Diff took too long and hit the deadline or + # number of diffs equals number of characters, no commonality at all. + return [(self.DIFF_DELETE, text1), (self.DIFF_INSERT, text2)] + + def diff_bisectSplit(self, text1, text2, x, y, deadline): + """Given the location of the 'middle snake', split the diff in two parts + and recurse. + + Args: + text1: Old string to be diffed. + text2: New string to be diffed. + x: Index of split point in text1. + y: Index of split point in text2. + deadline: Time at which to bail if not yet complete. + + Returns: + Array of diff tuples. + """ + text1a = text1[:x] + text2a = text2[:y] + text1b = text1[x:] + text2b = text2[y:] + + # Compute both diffs serially. + diffs = self.diff_main(text1a, text2a, False, deadline) + diffsb = self.diff_main(text1b, text2b, False, deadline) + + return diffs + diffsb + + def diff_linesToChars(self, text1, text2): + """Split two texts into an array of strings. Reduce the texts to a string + of hashes where each Unicode character represents one line. + + Args: + text1: First string. + text2: Second string. + + Returns: + Three element tuple, containing the encoded text1, the encoded text2 and + the array of unique strings. The zeroth element of the array of unique + strings is intentionally blank. + """ + lineArray = [] # e.g. lineArray[4] == "Hello\n" + lineHash = {} # e.g. lineHash["Hello\n"] == 4 + + # "\x00" is a valid character, but various debuggers don't like it. + # So we'll insert a junk entry to avoid generating a null character. + lineArray.append('') + + def diff_linesToCharsMunge(text): + """Split a text into an array of strings. Reduce the texts to a string + of hashes where each Unicode character represents one line. + Modifies linearray and linehash through being a closure. + + Args: + text: String to encode. + + Returns: + Encoded string. + """ + chars = [] + # Walk the text, pulling out a substring for each line. + # text.split('\n') would would temporarily double our memory footprint. + # Modifying text would create many large strings to garbage collect. + lineStart = 0 + lineEnd = -1 + while lineEnd < len(text) - 1: + lineEnd = text.find('\n', lineStart) + if lineEnd == -1: + lineEnd = len(text) - 1 + line = text[lineStart:lineEnd + 1] + + if line in lineHash: + chars.append(chr(lineHash[line])) + else: + if len(lineArray) == maxLines: + # Bail out at 1114111 because chr(1114112) throws. + line = text[lineStart:] + lineEnd = len(text) + lineArray.append(line) + lineHash[line] = len(lineArray) - 1 + chars.append(chr(len(lineArray) - 1)) + lineStart = lineEnd + 1 + return "".join(chars) + + # Allocate 2/3rds of the space for text1, the rest for text2. + maxLines = 666666 + chars1 = diff_linesToCharsMunge(text1) + maxLines = 1114111 + chars2 = diff_linesToCharsMunge(text2) + return (chars1, chars2, lineArray) + + def diff_charsToLines(self, diffs, lineArray): + """Rehydrate the text in a diff from a string of line hashes to real lines + of text. + + Args: + diffs: Array of diff tuples. + lineArray: Array of unique strings. 
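+
+    Example (illustrative): if lineArray[1] == "alpha\n", the diff tuple
+    (DIFF_EQUAL, "\x01") is rehydrated to (DIFF_EQUAL, "alpha\n").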
+ """ + for i in range(len(diffs)): + text = [] + for char in diffs[i][1]: + text.append(lineArray[ord(char)]) + diffs[i] = (diffs[i][0], "".join(text)) + + def diff_commonPrefix(self, text1, text2): + """Determine the common prefix of two strings. + + Args: + text1: First string. + text2: Second string. + + Returns: + The number of characters common to the start of each string. + """ + # Quick check for common null cases. + if not text1 or not text2 or text1[0] != text2[0]: + return 0 + # Binary search. + # Performance analysis: https://neil.fraser.name/news/2007/10/09/ + pointermin = 0 + pointermax = min(len(text1), len(text2)) + pointermid = pointermax + pointerstart = 0 + while pointermin < pointermid: + if text1[pointerstart:pointermid] == text2[pointerstart:pointermid]: + pointermin = pointermid + pointerstart = pointermin + else: + pointermax = pointermid + pointermid = (pointermax - pointermin) // 2 + pointermin + return pointermid + + def diff_commonSuffix(self, text1, text2): + """Determine the common suffix of two strings. + + Args: + text1: First string. + text2: Second string. + + Returns: + The number of characters common to the end of each string. + """ + # Quick check for common null cases. + if not text1 or not text2 or text1[-1] != text2[-1]: + return 0 + # Binary search. + # Performance analysis: https://neil.fraser.name/news/2007/10/09/ + pointermin = 0 + pointermax = min(len(text1), len(text2)) + pointermid = pointermax + pointerend = 0 + while pointermin < pointermid: + if (text1[-pointermid:len(text1) - pointerend] == + text2[-pointermid:len(text2) - pointerend]): + pointermin = pointermid + pointerend = pointermin + else: + pointermax = pointermid + pointermid = (pointermax - pointermin) // 2 + pointermin + return pointermid + + def diff_commonOverlap(self, text1, text2): + """Determine if the suffix of one string is the prefix of another. + + Args: + text1 First string. + text2 Second string. + + Returns: + The number of characters common to the end of the first + string and the start of the second string. + """ + # Cache the text lengths to prevent multiple calls. + text1_length = len(text1) + text2_length = len(text2) + # Eliminate the null case. + if text1_length == 0 or text2_length == 0: + return 0 + # Truncate the longer string. + if text1_length > text2_length: + text1 = text1[-text2_length:] + elif text1_length < text2_length: + text2 = text2[:text1_length] + text_length = min(text1_length, text2_length) + # Quick check for the worst case. + if text1 == text2: + return text_length + + # Start by looking for a single character match + # and increase length until no match is found. + # Performance analysis: https://neil.fraser.name/news/2010/11/04/ + best = 0 + length = 1 + while True: + pattern = text1[-length:] + found = text2.find(pattern) + if found == -1: + return best + length += found + if found == 0 or text1[-length:] == text2[:length]: + best = length + length += 1 + + def diff_halfMatch(self, text1, text2): + """Do the two texts share a substring which is at least half the length of + the longer text? + This speedup can produce non-minimal diffs. + + Args: + text1: First string. + text2: Second string. + + Returns: + Five element Array, containing the prefix of text1, the suffix of text1, + the prefix of text2, the suffix of text2 and the common middle. Or None + if there was no match. + """ + if self.Diff_Timeout <= 0: + # Don't risk returning a non-optimal diff if we have unlimited time. 
+ return None + if len(text1) > len(text2): + (longtext, shorttext) = (text1, text2) + else: + (shorttext, longtext) = (text1, text2) + if len(longtext) < 4 or len(shorttext) * 2 < len(longtext): + return None # Pointless. + + def diff_halfMatchI(longtext, shorttext, i): + """Does a substring of shorttext exist within longtext such that the + substring is at least half the length of longtext? + Closure, but does not reference any external variables. + + Args: + longtext: Longer string. + shorttext: Shorter string. + i: Start index of quarter length substring within longtext. + + Returns: + Five element Array, containing the prefix of longtext, the suffix of + longtext, the prefix of shorttext, the suffix of shorttext and the + common middle. Or None if there was no match. + """ + seed = longtext[i:i + len(longtext) // 4] + best_common = '' + j = shorttext.find(seed) + while j != -1: + prefixLength = self.diff_commonPrefix(longtext[i:], shorttext[j:]) + suffixLength = self.diff_commonSuffix(longtext[:i], shorttext[:j]) + if len(best_common) < suffixLength + prefixLength: + best_common = (shorttext[j - suffixLength:j] + + shorttext[j:j + prefixLength]) + best_longtext_a = longtext[:i - suffixLength] + best_longtext_b = longtext[i + prefixLength:] + best_shorttext_a = shorttext[:j - suffixLength] + best_shorttext_b = shorttext[j + prefixLength:] + j = shorttext.find(seed, j + 1) + + if len(best_common) * 2 >= len(longtext): + return (best_longtext_a, best_longtext_b, + best_shorttext_a, best_shorttext_b, best_common) + else: + return None + + # First check if the second quarter is the seed for a half-match. + hm1 = diff_halfMatchI(longtext, shorttext, (len(longtext) + 3) // 4) + # Check again based on the third quarter. + hm2 = diff_halfMatchI(longtext, shorttext, (len(longtext) + 1) // 2) + if not hm1 and not hm2: + return None + elif not hm2: + hm = hm1 + elif not hm1: + hm = hm2 + else: + # Both matched. Select the longest. + if len(hm1[4]) > len(hm2[4]): + hm = hm1 + else: + hm = hm2 + + # A half-match was found, sort out the return data. + if len(text1) > len(text2): + (text1_a, text1_b, text2_a, text2_b, mid_common) = hm + else: + (text2_a, text2_b, text1_a, text1_b, mid_common) = hm + return (text1_a, text1_b, text2_a, text2_b, mid_common) + + def diff_cleanupSemantic(self, diffs): + """Reduce the number of edits by eliminating semantically trivial + equalities. + + Args: + diffs: Array of diff tuples. + """ + changes = False + equalities = [] # Stack of indices where equalities are found. + lastEquality = None # Always equal to diffs[equalities[-1]][1] + pointer = 0 # Index of current position. + # Number of chars that changed prior to the equality. + length_insertions1, length_deletions1 = 0, 0 + # Number of chars that changed after the equality. + length_insertions2, length_deletions2 = 0, 0 + while pointer < len(diffs): + if diffs[pointer][0] == self.DIFF_EQUAL: # Equality found. + equalities.append(pointer) + length_insertions1, length_insertions2 = length_insertions2, 0 + length_deletions1, length_deletions2 = length_deletions2, 0 + lastEquality = diffs[pointer][1] + else: # An insertion or deletion. + if diffs[pointer][0] == self.DIFF_INSERT: + length_insertions2 += len(diffs[pointer][1]) + else: + length_deletions2 += len(diffs[pointer][1]) + # Eliminate an equality that is smaller or equal to the edits on both + # sides of it. 
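+        # (e.g. in -abc, =d, +efg the one-char equality "d" is cheaper to
+        # re-edit than to keep, so it is folded into the edits: -abcd, +defg)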
+        if (lastEquality and (len(lastEquality) <=
+            max(length_insertions1, length_deletions1)) and
+            (len(lastEquality) <= max(length_insertions2, length_deletions2))):
+          # Duplicate record.
+          diffs.insert(equalities[-1], (self.DIFF_DELETE, lastEquality))
+          # Change second copy to insert.
+          diffs[equalities[-1] + 1] = (self.DIFF_INSERT,
+                                       diffs[equalities[-1] + 1][1])
+          # Throw away the equality we just deleted.
+          equalities.pop()
+          # Throw away the previous equality (it needs to be reevaluated).
+          if len(equalities):
+            equalities.pop()
+          if len(equalities):
+            pointer = equalities[-1]
+          else:
+            pointer = -1
+          # Reset the counters.
+          length_insertions1, length_deletions1 = 0, 0
+          length_insertions2, length_deletions2 = 0, 0
+          lastEquality = None
+          changes = True
+      pointer += 1
+
+    # Normalize the diff.
+    if changes:
+      self.diff_cleanupMerge(diffs)
+    self.diff_cleanupSemanticLossless(diffs)
+
+    # Find any overlaps between deletions and insertions.
+    # e.g: <del>abcxxx</del><ins>xxxdef</ins>
+    #   -> <del>abc</del>xxx<ins>def</ins>
+    # e.g: <del>xxxabc</del><ins>defxxx</ins>
+    #   -> <ins>def</ins>xxx<del>abc</del>
+    # Only extract an overlap if it is as big as the edit ahead or behind it.
+    pointer = 1
+    while pointer < len(diffs):
+      if (diffs[pointer - 1][0] == self.DIFF_DELETE and
+          diffs[pointer][0] == self.DIFF_INSERT):
+        deletion = diffs[pointer - 1][1]
+        insertion = diffs[pointer][1]
+        overlap_length1 = self.diff_commonOverlap(deletion, insertion)
+        overlap_length2 = self.diff_commonOverlap(insertion, deletion)
+        if overlap_length1 >= overlap_length2:
+          if (overlap_length1 >= len(deletion) / 2.0 or
+              overlap_length1 >= len(insertion) / 2.0):
+            # Overlap found.  Insert an equality and trim the surrounding edits.
+            diffs.insert(pointer, (self.DIFF_EQUAL,
+                                   insertion[:overlap_length1]))
+            diffs[pointer - 1] = (self.DIFF_DELETE,
+                                  deletion[:len(deletion) - overlap_length1])
+            diffs[pointer + 1] = (self.DIFF_INSERT,
+                                  insertion[overlap_length1:])
+            pointer += 1
+        else:
+          if (overlap_length2 >= len(deletion) / 2.0 or
+              overlap_length2 >= len(insertion) / 2.0):
+            # Reverse overlap found.
+            # Insert an equality and swap and trim the surrounding edits.
+            diffs.insert(pointer, (self.DIFF_EQUAL, deletion[:overlap_length2]))
+            diffs[pointer - 1] = (self.DIFF_INSERT,
+                                  insertion[:len(insertion) - overlap_length2])
+            diffs[pointer + 1] = (self.DIFF_DELETE, deletion[overlap_length2:])
+            pointer += 1
+        pointer += 1
+      pointer += 1
+
+  def diff_cleanupSemanticLossless(self, diffs):
+    """Look for single edits surrounded on both sides by equalities
+    which can be shifted sideways to align the edit to a word boundary.
+    e.g: The c<ins>at c</ins>ame. -> The <ins>cat c</ins>ame.
+
+    Args:
+      diffs: Array of diff tuples.
+    """
+
+    def diff_cleanupSemanticScore(one, two):
+      """Given two strings, compute a score representing whether the
+      internal boundary falls on logical boundaries.
+      Scores range from 6 (best) to 0 (worst).
+      Closure, but does not reference any external variables.
+
+      Args:
+        one: First string.
+        two: Second string.
+
+      Returns:
+        The score.
+      """
+      if not one or not two:
+        # Edges are the best.
+        return 6
+
+      # Each port of this function behaves slightly differently due to
+      # subtle differences in each language's definition of things like
+      # 'whitespace'.  Since this function's purpose is largely cosmetic,
+      # the choice has been made to use each language's native features
+      # rather than force total conformity.
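+      # (illustrative scores: "The cat " / "came." scores 2 for trailing
+      # whitespace; "The cat\n\n" / "came." scores 5 for the blank line)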
+ char1 = one[-1] + char2 = two[0] + nonAlphaNumeric1 = not char1.isalnum() + nonAlphaNumeric2 = not char2.isalnum() + whitespace1 = nonAlphaNumeric1 and char1.isspace() + whitespace2 = nonAlphaNumeric2 and char2.isspace() + lineBreak1 = whitespace1 and (char1 == "\r" or char1 == "\n") + lineBreak2 = whitespace2 and (char2 == "\r" or char2 == "\n") + blankLine1 = lineBreak1 and self.BLANKLINEEND.search(one) + blankLine2 = lineBreak2 and self.BLANKLINESTART.match(two) + + if blankLine1 or blankLine2: + # Five points for blank lines. + return 5 + elif lineBreak1 or lineBreak2: + # Four points for line breaks. + return 4 + elif nonAlphaNumeric1 and not whitespace1 and whitespace2: + # Three points for end of sentences. + return 3 + elif whitespace1 or whitespace2: + # Two points for whitespace. + return 2 + elif nonAlphaNumeric1 or nonAlphaNumeric2: + # One point for non-alphanumeric. + return 1 + return 0 + + pointer = 1 + # Intentionally ignore the first and last element (don't need checking). + while pointer < len(diffs) - 1: + if (diffs[pointer - 1][0] == self.DIFF_EQUAL and + diffs[pointer + 1][0] == self.DIFF_EQUAL): + # This is a single edit surrounded by equalities. + equality1 = diffs[pointer - 1][1] + edit = diffs[pointer][1] + equality2 = diffs[pointer + 1][1] + + # First, shift the edit as far left as possible. + commonOffset = self.diff_commonSuffix(equality1, edit) + if commonOffset: + commonString = edit[-commonOffset:] + equality1 = equality1[:-commonOffset] + edit = commonString + edit[:-commonOffset] + equality2 = commonString + equality2 + + # Second, step character by character right, looking for the best fit. + bestEquality1 = equality1 + bestEdit = edit + bestEquality2 = equality2 + bestScore = (diff_cleanupSemanticScore(equality1, edit) + + diff_cleanupSemanticScore(edit, equality2)) + while edit and equality2 and edit[0] == equality2[0]: + equality1 += edit[0] + edit = edit[1:] + equality2[0] + equality2 = equality2[1:] + score = (diff_cleanupSemanticScore(equality1, edit) + + diff_cleanupSemanticScore(edit, equality2)) + # The >= encourages trailing rather than leading whitespace on edits. + if score >= bestScore: + bestScore = score + bestEquality1 = equality1 + bestEdit = edit + bestEquality2 = equality2 + + if diffs[pointer - 1][1] != bestEquality1: + # We have an improvement, save it back to the diff. + if bestEquality1: + diffs[pointer - 1] = (diffs[pointer - 1][0], bestEquality1) + else: + del diffs[pointer - 1] + pointer -= 1 + diffs[pointer] = (diffs[pointer][0], bestEdit) + if bestEquality2: + diffs[pointer + 1] = (diffs[pointer + 1][0], bestEquality2) + else: + del diffs[pointer + 1] + pointer -= 1 + pointer += 1 + + # Define some regex patterns for matching boundaries. + BLANKLINEEND = re.compile(r"\n\r?\n$") + BLANKLINESTART = re.compile(r"^\r?\n\r?\n") + + def diff_cleanupEfficiency(self, diffs): + """Reduce the number of edits by eliminating operationally trivial + equalities. + + Args: + diffs: Array of diff tuples. + """ + changes = False + equalities = [] # Stack of indices where equalities are found. + lastEquality = None # Always equal to diffs[equalities[-1]][1] + pointer = 0 # Index of current position. + pre_ins = False # Is there an insertion operation before the last equality. + pre_del = False # Is there a deletion operation before the last equality. + post_ins = False # Is there an insertion operation after the last equality. + post_del = False # Is there a deletion operation after the last equality. 
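+    # (roughly: an equality shorter than Diff_EditCost characters that is
+    # wedged between edits on both sides costs more to keep than to fold
+    # back into the surrounding edits)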
+    while pointer < len(diffs):
+      if diffs[pointer][0] == self.DIFF_EQUAL:  # Equality found.
+        if (len(diffs[pointer][1]) < self.Diff_EditCost and
+            (post_ins or post_del)):
+          # Candidate found.
+          equalities.append(pointer)
+          pre_ins = post_ins
+          pre_del = post_del
+          lastEquality = diffs[pointer][1]
+        else:
+          # Not a candidate, and can never become one.
+          equalities = []
+          lastEquality = None
+
+        post_ins = post_del = False
+      else:  # An insertion or deletion.
+        if diffs[pointer][0] == self.DIFF_DELETE:
+          post_del = True
+        else:
+          post_ins = True
+
+        # Five types to be split:
+        # <ins>A</ins><del>B</del>XY<ins>C</ins><del>D</del>
+        # <ins>A</ins>X<ins>C</ins><del>D</del>
+        # <ins>A</ins><del>B</del>X<ins>C</ins>
+        # <ins>A</ins>X<ins>C</ins><del>D</del>
+        # <ins>A</ins><del>B</del>X<del>C</del>
+
+        if lastEquality and ((pre_ins and pre_del and post_ins and post_del) or
+                             ((len(lastEquality) < self.Diff_EditCost / 2) and
+                              (pre_ins + pre_del + post_ins + post_del) == 3)):
+          # Duplicate record.
+          diffs.insert(equalities[-1], (self.DIFF_DELETE, lastEquality))
+          # Change second copy to insert.
+          diffs[equalities[-1] + 1] = (self.DIFF_INSERT,
+                                       diffs[equalities[-1] + 1][1])
+          equalities.pop()  # Throw away the equality we just deleted.
+          lastEquality = None
+          if pre_ins and pre_del:
+            # No changes made which could affect previous entry, keep going.
+            post_ins = post_del = True
+            equalities = []
+          else:
+            if len(equalities):
+              equalities.pop()  # Throw away the previous equality.
+            if len(equalities):
+              pointer = equalities[-1]
+            else:
+              pointer = -1
+            post_ins = post_del = False
+          changes = True
+      pointer += 1
+
+    if changes:
+      self.diff_cleanupMerge(diffs)
+
+  def diff_cleanupMerge(self, diffs):
+    """Reorder and merge like edit sections.  Merge equalities.
+    Any edit section can move as long as it doesn't cross an equality.
+
+    Args:
+      diffs: Array of diff tuples.
+    """
+    diffs.append((self.DIFF_EQUAL, ''))  # Add a dummy entry at the end.
+    pointer = 0
+    count_delete = 0
+    count_insert = 0
+    text_delete = ''
+    text_insert = ''
+    while pointer < len(diffs):
+      if diffs[pointer][0] == self.DIFF_INSERT:
+        count_insert += 1
+        text_insert += diffs[pointer][1]
+        pointer += 1
+      elif diffs[pointer][0] == self.DIFF_DELETE:
+        count_delete += 1
+        text_delete += diffs[pointer][1]
+        pointer += 1
+      elif diffs[pointer][0] == self.DIFF_EQUAL:
+        # Upon reaching an equality, check for prior redundancies.
+        if count_delete + count_insert > 1:
+          if count_delete != 0 and count_insert != 0:
+            # Factor out any common prefixes.
+            commonlength = self.diff_commonPrefix(text_insert, text_delete)
+            if commonlength != 0:
+              x = pointer - count_delete - count_insert - 1
+              if x >= 0 and diffs[x][0] == self.DIFF_EQUAL:
+                diffs[x] = (diffs[x][0], diffs[x][1] +
+                            text_insert[:commonlength])
+              else:
+                diffs.insert(0, (self.DIFF_EQUAL, text_insert[:commonlength]))
+                pointer += 1
+              text_insert = text_insert[commonlength:]
+              text_delete = text_delete[commonlength:]
+            # Factor out any common suffixes.
+            commonlength = self.diff_commonSuffix(text_insert, text_delete)
+            if commonlength != 0:
+              diffs[pointer] = (diffs[pointer][0], text_insert[-commonlength:] +
+                                diffs[pointer][1])
+              text_insert = text_insert[:-commonlength]
+              text_delete = text_delete[:-commonlength]
+          # Delete the offending records and add the merged ones.
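+          # (e.g. the run -a, +b, -c, +d merges into the single pair -ac, +bd)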
+          new_ops = []
+          if len(text_delete) != 0:
+            new_ops.append((self.DIFF_DELETE, text_delete))
+          if len(text_insert) != 0:
+            new_ops.append((self.DIFF_INSERT, text_insert))
+          pointer -= count_delete + count_insert
+          diffs[pointer : pointer + count_delete + count_insert] = new_ops
+          pointer += len(new_ops) + 1
+        elif pointer != 0 and diffs[pointer - 1][0] == self.DIFF_EQUAL:
+          # Merge this equality with the previous one.
+          diffs[pointer - 1] = (diffs[pointer - 1][0],
+                                diffs[pointer - 1][1] + diffs[pointer][1])
+          del diffs[pointer]
+        else:
+          pointer += 1
+
+        count_insert = 0
+        count_delete = 0
+        text_delete = ''
+        text_insert = ''
+
+    if diffs[-1][1] == '':
+      diffs.pop()  # Remove the dummy entry at the end.
+
+    # Second pass: look for single edits surrounded on both sides by equalities
+    # which can be shifted sideways to eliminate an equality.
+    # e.g: A<ins>BA</ins>C -> <ins>AB</ins>AC
+    changes = False
+    pointer = 1
+    # Intentionally ignore the first and last element (don't need checking).
+    while pointer < len(diffs) - 1:
+      if (diffs[pointer - 1][0] == self.DIFF_EQUAL and
+          diffs[pointer + 1][0] == self.DIFF_EQUAL):
+        # This is a single edit surrounded by equalities.
+        if diffs[pointer][1].endswith(diffs[pointer - 1][1]):
+          # Shift the edit over the previous equality.
+          if diffs[pointer - 1][1] != "":
+            diffs[pointer] = (diffs[pointer][0],
+                              diffs[pointer - 1][1] +
+                              diffs[pointer][1][:-len(diffs[pointer - 1][1])])
+            diffs[pointer + 1] = (diffs[pointer + 1][0],
+                                  diffs[pointer - 1][1] + diffs[pointer + 1][1])
+          del diffs[pointer - 1]
+          changes = True
+        elif diffs[pointer][1].startswith(diffs[pointer + 1][1]):
+          # Shift the edit over the next equality.
+          diffs[pointer - 1] = (diffs[pointer - 1][0],
+                                diffs[pointer - 1][1] + diffs[pointer + 1][1])
+          diffs[pointer] = (diffs[pointer][0],
+                            diffs[pointer][1][len(diffs[pointer + 1][1]):] +
+                            diffs[pointer + 1][1])
+          del diffs[pointer + 1]
+          changes = True
+      pointer += 1
+
+    # If shifts were made, the diff needs reordering and another shift sweep.
+    if changes:
+      self.diff_cleanupMerge(diffs)
+
+  def diff_xIndex(self, diffs, loc):
+    """loc is a location in text1, compute and return the equivalent location
+    in text2.  e.g. "The cat" vs "The big cat", 1->1, 5->8
+
+    Args:
+      diffs: Array of diff tuples.
+      loc: Location within text1.
+
+    Returns:
+      Location within text2.
+    """
+    chars1 = 0
+    chars2 = 0
+    last_chars1 = 0
+    last_chars2 = 0
+    for x in range(len(diffs)):
+      (op, text) = diffs[x]
+      if op != self.DIFF_INSERT:  # Equality or deletion.
+        chars1 += len(text)
+      if op != self.DIFF_DELETE:  # Equality or insertion.
+        chars2 += len(text)
+      if chars1 > loc:  # Overshot the location.
+        break
+      last_chars1 = chars1
+      last_chars2 = chars2
+
+    if len(diffs) != x and diffs[x][0] == self.DIFF_DELETE:
+      # The location was deleted.
+      return last_chars2
+    # Add the remaining len(character).
+    return last_chars2 + (loc - last_chars1)
+
+  def diff_prettyHtml(self, diffs):
+    """Convert a diff array into a pretty HTML report.
+
+    Args:
+      diffs: Array of diff tuples.
+
+    Returns:
+      HTML representation.
+    """
+    html = []
+    for (op, data) in diffs:
+      text = (data.replace("&", "&amp;").replace("<", "&lt;")
+                  .replace(">", "&gt;").replace("\n", "&para;<br>"))
")) + if op == self.DIFF_INSERT: + html.append("%s" % text) + elif op == self.DIFF_DELETE: + html.append("%s" % text) + elif op == self.DIFF_EQUAL: + html.append("%s" % text) + return "".join(html) + + def diff_text1(self, diffs): + """Compute and return the source text (all equalities and deletions). + + Args: + diffs: Array of diff tuples. + + Returns: + Source text. + """ + text = [] + for (op, data) in diffs: + if op != self.DIFF_INSERT: + text.append(data) + return "".join(text) + + def diff_text2(self, diffs): + """Compute and return the destination text (all equalities and insertions). + + Args: + diffs: Array of diff tuples. + + Returns: + Destination text. + """ + text = [] + for (op, data) in diffs: + if op != self.DIFF_DELETE: + text.append(data) + return "".join(text) + + def diff_levenshtein(self, diffs): + """Compute the Levenshtein distance; the number of inserted, deleted or + substituted characters. + + Args: + diffs: Array of diff tuples. + + Returns: + Number of changes. + """ + levenshtein = 0 + insertions = 0 + deletions = 0 + for (op, data) in diffs: + if op == self.DIFF_INSERT: + insertions += len(data) + elif op == self.DIFF_DELETE: + deletions += len(data) + elif op == self.DIFF_EQUAL: + # A deletion and an insertion is one substitution. + levenshtein += max(insertions, deletions) + insertions = 0 + deletions = 0 + levenshtein += max(insertions, deletions) + return levenshtein + + def diff_toDelta(self, diffs): + """Crush the diff into an encoded string which describes the operations + required to transform text1 into text2. + E.g. =3\t-2\t+ing -> Keep 3 chars, delete 2 chars, insert 'ing'. + Operations are tab-separated. Inserted text is escaped using %xx notation. + + Args: + diffs: Array of diff tuples. + + Returns: + Delta text. + """ + text = [] + for (op, data) in diffs: + if op == self.DIFF_INSERT: + # High ascii will raise UnicodeDecodeError. Use Unicode instead. + data = data.encode("utf-8") + text.append("+" + urllib.parse.quote(data, "!~*'();/?:@&=+$,# ")) + elif op == self.DIFF_DELETE: + text.append("-%d" % len(data)) + elif op == self.DIFF_EQUAL: + text.append("=%d" % len(data)) + return "\t".join(text) + + def diff_fromDelta(self, text1, delta): + """Given the original text1, and an encoded string which describes the + operations required to transform text1 into text2, compute the full diff. + + Args: + text1: Source string for the diff. + delta: Delta text. + + Returns: + Array of diff tuples. + + Raises: + ValueError: If invalid input. + """ + diffs = [] + pointer = 0 # Cursor in text1 + tokens = delta.split("\t") + for token in tokens: + if token == "": + # Blank tokens are ok (from a trailing \t). + continue + # Each token begins with a one character parameter which specifies the + # operation of this token (delete, insert, equality). + param = token[1:] + if token[0] == "+": + param = urllib.parse.unquote(param) + diffs.append((self.DIFF_INSERT, param)) + elif token[0] == "-" or token[0] == "=": + try: + n = int(param) + except ValueError: + raise ValueError("Invalid number in diff_fromDelta: " + param) + if n < 0: + raise ValueError("Negative number in diff_fromDelta: " + param) + text = text1[pointer : pointer + n] + pointer += n + if token[0] == "=": + diffs.append((self.DIFF_EQUAL, text)) + else: + diffs.append((self.DIFF_DELETE, text)) + else: + # Anything else is an error. 
+ raise ValueError("Invalid diff operation in diff_fromDelta: " + + token[0]) + if pointer != len(text1): + raise ValueError( + "Delta length (%d) does not equal source text length (%d)." % + (pointer, len(text1))) + return diffs + + # MATCH FUNCTIONS + + def match_main(self, text, pattern, loc): + """Locate the best instance of 'pattern' in 'text' near 'loc'. + + Args: + text: The text to search. + pattern: The pattern to search for. + loc: The location to search around. + + Returns: + Best match index or -1. + """ + # Check for null inputs. + if text == None or pattern == None: + raise ValueError("Null inputs. (match_main)") + + loc = max(0, min(loc, len(text))) + if text == pattern: + # Shortcut (potentially not guaranteed by the algorithm) + return 0 + elif not text: + # Nothing to match. + return -1 + elif text[loc:loc + len(pattern)] == pattern: + # Perfect match at the perfect spot! (Includes case of null pattern) + return loc + else: + # Do a fuzzy compare. + match = self.match_bitap(text, pattern, loc) + return match + + def match_bitap(self, text, pattern, loc): + """Locate the best instance of 'pattern' in 'text' near 'loc' using the + Bitap algorithm. + + Args: + text: The text to search. + pattern: The pattern to search for. + loc: The location to search around. + + Returns: + Best match index or -1. + """ + # Python doesn't have a maxint limit, so ignore this check. + #if self.Match_MaxBits != 0 and len(pattern) > self.Match_MaxBits: + # raise ValueError("Pattern too long for this application.") + + # Initialise the alphabet. + s = self.match_alphabet(pattern) + + def match_bitapScore(e, x): + """Compute and return the score for a match with e errors and x location. + Accesses loc and pattern through being a closure. + + Args: + e: Number of errors in match. + x: Location of match. + + Returns: + Overall score for match (0.0 = good, 1.0 = bad). + """ + accuracy = float(e) / len(pattern) + proximity = abs(loc - x) + if not self.Match_Distance: + # Dodge divide by zero error. + return proximity and 1.0 or accuracy + return accuracy + (proximity / float(self.Match_Distance)) + + # Highest score beyond which we give up. + score_threshold = self.Match_Threshold + # Is there a nearby exact match? (speedup) + best_loc = text.find(pattern, loc) + if best_loc != -1: + score_threshold = min(match_bitapScore(0, best_loc), score_threshold) + # What about in the other direction? (speedup) + best_loc = text.rfind(pattern, loc + len(pattern)) + if best_loc != -1: + score_threshold = min(match_bitapScore(0, best_loc), score_threshold) + + # Initialise the bit arrays. + matchmask = 1 << (len(pattern) - 1) + best_loc = -1 + + bin_max = len(pattern) + len(text) + # Empty initialization added to appease pychecker. + last_rd = None + for d in range(len(pattern)): + # Scan for the best match each iteration allows for one more error. + # Run a binary search to determine how far from 'loc' we can stray at + # this error level. + bin_min = 0 + bin_mid = bin_max + while bin_min < bin_mid: + if match_bitapScore(d, loc + bin_mid) <= score_threshold: + bin_min = bin_mid + else: + bin_max = bin_mid + bin_mid = (bin_max - bin_min) // 2 + bin_min + + # Use the result from this iteration as the maximum for the next. + bin_max = bin_mid + start = max(1, loc - bin_mid + 1) + finish = min(loc + bin_mid, len(text)) + len(pattern) + + rd = [0] * (finish + 2) + rd[finish + 1] = (1 << d) - 1 + for j in range(finish, start - 1, -1): + if len(text) <= j - 1: + # Out of range. 
+ charMatch = 0 + else: + charMatch = s.get(text[j - 1], 0) + if d == 0: # First pass: exact match. + rd[j] = ((rd[j + 1] << 1) | 1) & charMatch + else: # Subsequent passes: fuzzy match. + rd[j] = (((rd[j + 1] << 1) | 1) & charMatch) | ( + ((last_rd[j + 1] | last_rd[j]) << 1) | 1) | last_rd[j + 1] + if rd[j] & matchmask: + score = match_bitapScore(d, j - 1) + # This match will almost certainly be better than any existing match. + # But check anyway. + if score <= score_threshold: + # Told you so. + score_threshold = score + best_loc = j - 1 + if best_loc > loc: + # When passing loc, don't exceed our current distance from loc. + start = max(1, 2 * loc - best_loc) + else: + # Already passed loc, downhill from here on in. + break + # No hope for a (better) match at greater error levels. + if match_bitapScore(d + 1, loc) > score_threshold: + break + last_rd = rd + return best_loc + + def match_alphabet(self, pattern): + """Initialise the alphabet for the Bitap algorithm. + + Args: + pattern: The text to encode. + + Returns: + Hash of character locations. + """ + s = {} + for char in pattern: + s[char] = 0 + for i in range(len(pattern)): + s[pattern[i]] |= 1 << (len(pattern) - i - 1) + return s + + # PATCH FUNCTIONS + + def patch_addContext(self, patch, text): + """Increase the context until it is unique, + but don't let the pattern expand beyond Match_MaxBits. + + Args: + patch: The patch to grow. + text: Source text. + """ + if len(text) == 0: + return + pattern = text[patch.start2 : patch.start2 + patch.length1] + padding = 0 + + # Look for the first and last matches of pattern in text. If two different + # matches are found, increase the pattern length. + while (text.find(pattern) != text.rfind(pattern) and (self.Match_MaxBits == + 0 or len(pattern) < self.Match_MaxBits - self.Patch_Margin - + self.Patch_Margin)): + padding += self.Patch_Margin + pattern = text[max(0, patch.start2 - padding) : + patch.start2 + patch.length1 + padding] + # Add one chunk for good luck. + padding += self.Patch_Margin + + # Add the prefix. + prefix = text[max(0, patch.start2 - padding) : patch.start2] + if prefix: + patch.diffs[:0] = [(self.DIFF_EQUAL, prefix)] + # Add the suffix. + suffix = text[patch.start2 + patch.length1 : + patch.start2 + patch.length1 + padding] + if suffix: + patch.diffs.append((self.DIFF_EQUAL, suffix)) + + # Roll back the start points. + patch.start1 -= len(prefix) + patch.start2 -= len(prefix) + # Extend lengths. + patch.length1 += len(prefix) + len(suffix) + patch.length2 += len(prefix) + len(suffix) + + def patch_make(self, a, b=None, c=None): + """Compute a list of patches to turn text1 into text2. + Use diffs if provided, otherwise compute it ourselves. + There are four ways to call this function, depending on what data is + available to the caller: + Method 1: + a = text1, b = text2 + Method 2: + a = diffs + Method 3 (optimal): + a = text1, b = diffs + Method 4 (deprecated, use method 3): + a = text1, b = text2, c = diffs + + Args: + a: text1 (methods 1,3,4) or Array of diff tuples for text1 to + text2 (method 2). + b: text2 (methods 1,4) or Array of diff tuples for text1 to + text2 (method 3) or undefined (method 2). + c: Array of diff tuples for text1 to text2 (method 4) or + undefined (methods 1,2,3). + + Returns: + Array of Patch objects. + """ + text1 = None + diffs = None + if isinstance(a, str) and isinstance(b, str) and c is None: + # Method 1: text1, text2 + # Compute diffs from text1 and text2. 
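+      # (illustrative: patch_make("abc", "abd") computes the diff itself,
+      # while patch_make("abc", precomputed_diffs) reuses an existing one)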
+ text1 = a + diffs = self.diff_main(text1, b, True) + if len(diffs) > 2: + self.diff_cleanupSemantic(diffs) + self.diff_cleanupEfficiency(diffs) + elif isinstance(a, list) and b is None and c is None: + # Method 2: diffs + # Compute text1 from diffs. + diffs = a + text1 = self.diff_text1(diffs) + elif isinstance(a, str) and isinstance(b, list) and c is None: + # Method 3: text1, diffs + text1 = a + diffs = b + elif (isinstance(a, str) and isinstance(b, str) and + isinstance(c, list)): + # Method 4: text1, text2, diffs + # text2 is not used. + text1 = a + diffs = c + else: + raise ValueError("Unknown call format to patch_make.") + + if not diffs: + return [] # Get rid of the None case. + patches = [] + patch = patch_obj() + char_count1 = 0 # Number of characters into the text1 string. + char_count2 = 0 # Number of characters into the text2 string. + prepatch_text = text1 # Recreate the patches to determine context info. + postpatch_text = text1 + for x in range(len(diffs)): + (diff_type, diff_text) = diffs[x] + if len(patch.diffs) == 0 and diff_type != self.DIFF_EQUAL: + # A new patch starts here. + patch.start1 = char_count1 + patch.start2 = char_count2 + if diff_type == self.DIFF_INSERT: + # Insertion + patch.diffs.append(diffs[x]) + patch.length2 += len(diff_text) + postpatch_text = (postpatch_text[:char_count2] + diff_text + + postpatch_text[char_count2:]) + elif diff_type == self.DIFF_DELETE: + # Deletion. + patch.length1 += len(diff_text) + patch.diffs.append(diffs[x]) + postpatch_text = (postpatch_text[:char_count2] + + postpatch_text[char_count2 + len(diff_text):]) + elif (diff_type == self.DIFF_EQUAL and + len(diff_text) <= 2 * self.Patch_Margin and + len(patch.diffs) != 0 and len(diffs) != x + 1): + # Small equality inside a patch. + patch.diffs.append(diffs[x]) + patch.length1 += len(diff_text) + patch.length2 += len(diff_text) + + if (diff_type == self.DIFF_EQUAL and + len(diff_text) >= 2 * self.Patch_Margin): + # Time for a new patch. + if len(patch.diffs) != 0: + self.patch_addContext(patch, prepatch_text) + patches.append(patch) + patch = patch_obj() + # Unlike Unidiff, our patch lists have a rolling context. + # https://github.com/google/diff-match-patch/wiki/Unidiff + # Update prepatch text & pos to reflect the application of the + # just completed patch. + prepatch_text = postpatch_text + char_count1 = char_count2 + + # Update the current character count. + if diff_type != self.DIFF_INSERT: + char_count1 += len(diff_text) + if diff_type != self.DIFF_DELETE: + char_count2 += len(diff_text) + + # Pick up the leftover patch if not empty. + if len(patch.diffs) != 0: + self.patch_addContext(patch, prepatch_text) + patches.append(patch) + return patches + + def patch_deepCopy(self, patches): + """Given an array of patches, return another array that is identical. + + Args: + patches: Array of Patch objects. + + Returns: + Array of Patch objects. + """ + patchesCopy = [] + for patch in patches: + patchCopy = patch_obj() + # No need to deep copy the tuples since they are immutable. + patchCopy.diffs = patch.diffs[:] + patchCopy.start1 = patch.start1 + patchCopy.start2 = patch.start2 + patchCopy.length1 = patch.length1 + patchCopy.length2 = patch.length2 + patchesCopy.append(patchCopy) + return patchesCopy + + def patch_apply(self, patches, text): + """Merge a set of patches onto the text. Return a patched text, as well + as a list of true/false values indicating which patches were applied. + + Args: + patches: Array of Patch objects. + text: Old text. 
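+
+    Example (illustrative): a patch made from "The quick brown fox" ->
+    "The slow brown fox" still applies to the slightly different text
+    "The quick red fox", yielding ("The slow red fox", [True]), thanks
+    to the fuzzy match step below.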
+ + Returns: + Two element Array, containing the new text and an array of boolean values. + """ + if not patches: + return (text, []) + + # Deep copy the patches so that no changes are made to originals. + patches = self.patch_deepCopy(patches) + + nullPadding = self.patch_addPadding(patches) + text = nullPadding + text + nullPadding + self.patch_splitMax(patches) + + # delta keeps track of the offset between the expected and actual location + # of the previous patch. If there are patches expected at positions 10 and + # 20, but the first patch was found at 12, delta is 2 and the second patch + # has an effective expected position of 22. + delta = 0 + results = [] + for patch in patches: + expected_loc = patch.start2 + delta + text1 = self.diff_text1(patch.diffs) + end_loc = -1 + if len(text1) > self.Match_MaxBits: + # patch_splitMax will only provide an oversized pattern in the case of + # a monster delete. + start_loc = self.match_main(text, text1[:self.Match_MaxBits], + expected_loc) + if start_loc != -1: + end_loc = self.match_main(text, text1[-self.Match_MaxBits:], + expected_loc + len(text1) - self.Match_MaxBits) + if end_loc == -1 or start_loc >= end_loc: + # Can't find valid trailing context. Drop this patch. + start_loc = -1 + else: + start_loc = self.match_main(text, text1, expected_loc) + if start_loc == -1: + # No match found. :( + results.append(False) + # Subtract the delta for this failed patch from subsequent patches. + delta -= patch.length2 - patch.length1 + else: + # Found a match. :) + results.append(True) + delta = start_loc - expected_loc + if end_loc == -1: + text2 = text[start_loc : start_loc + len(text1)] + else: + text2 = text[start_loc : end_loc + self.Match_MaxBits] + if text1 == text2: + # Perfect match, just shove the replacement text in. + text = (text[:start_loc] + self.diff_text2(patch.diffs) + + text[start_loc + len(text1):]) + else: + # Imperfect match. + # Run a diff to get a framework of equivalent indices. + diffs = self.diff_main(text1, text2, False) + if (len(text1) > self.Match_MaxBits and + self.diff_levenshtein(diffs) / float(len(text1)) > + self.Patch_DeleteThreshold): + # The end points match, but the content is unacceptably bad. + results[-1] = False + else: + self.diff_cleanupSemanticLossless(diffs) + index1 = 0 + for (op, data) in patch.diffs: + if op != self.DIFF_EQUAL: + index2 = self.diff_xIndex(diffs, index1) + if op == self.DIFF_INSERT: # Insertion + text = text[:start_loc + index2] + data + text[start_loc + + index2:] + elif op == self.DIFF_DELETE: # Deletion + text = text[:start_loc + index2] + text[start_loc + + self.diff_xIndex(diffs, index1 + len(data)):] + if op != self.DIFF_DELETE: + index1 += len(data) + # Strip the padding off. + text = text[len(nullPadding):-len(nullPadding)] + return (text, results) + + def patch_addPadding(self, patches): + """Add some padding on text start and end so that edges can match + something. Intended to be called only from within patch_apply. + + Args: + patches: Array of Patch objects. + + Returns: + The padding string added to each side. + """ + paddingLength = self.Patch_Margin + nullPadding = "" + for x in range(1, paddingLength + 1): + nullPadding += chr(x) + + # Bump all the patches forward. + for patch in patches: + patch.start1 += paddingLength + patch.start2 += paddingLength + + # Add some padding on start of first diff. + patch = patches[0] + diffs = patch.diffs + if not diffs or diffs[0][0] != self.DIFF_EQUAL: + # Add nullPadding equality. 
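+      # (nullPadding is "\x01\x02\x03\x04" with the default Patch_Margin of 4;
+      # such control characters are unlikely to occur in the text being patched)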
+ diffs.insert(0, (self.DIFF_EQUAL, nullPadding)) + patch.start1 -= paddingLength # Should be 0. + patch.start2 -= paddingLength # Should be 0. + patch.length1 += paddingLength + patch.length2 += paddingLength + elif paddingLength > len(diffs[0][1]): + # Grow first equality. + extraLength = paddingLength - len(diffs[0][1]) + newText = nullPadding[len(diffs[0][1]):] + diffs[0][1] + diffs[0] = (diffs[0][0], newText) + patch.start1 -= extraLength + patch.start2 -= extraLength + patch.length1 += extraLength + patch.length2 += extraLength + + # Add some padding on end of last diff. + patch = patches[-1] + diffs = patch.diffs + if not diffs or diffs[-1][0] != self.DIFF_EQUAL: + # Add nullPadding equality. + diffs.append((self.DIFF_EQUAL, nullPadding)) + patch.length1 += paddingLength + patch.length2 += paddingLength + elif paddingLength > len(diffs[-1][1]): + # Grow last equality. + extraLength = paddingLength - len(diffs[-1][1]) + newText = diffs[-1][1] + nullPadding[:extraLength] + diffs[-1] = (diffs[-1][0], newText) + patch.length1 += extraLength + patch.length2 += extraLength + + return nullPadding + + def patch_splitMax(self, patches): + """Look through the patches and break up any which are longer than the + maximum limit of the match algorithm. + Intended to be called only from within patch_apply. + + Args: + patches: Array of Patch objects. + """ + patch_size = self.Match_MaxBits + if patch_size == 0: + # Python has the option of not splitting strings due to its ability + # to handle integers of arbitrary precision. + return + for x in range(len(patches)): + if patches[x].length1 <= patch_size: + continue + bigpatch = patches[x] + # Remove the big old patch. + del patches[x] + x -= 1 + start1 = bigpatch.start1 + start2 = bigpatch.start2 + precontext = '' + while len(bigpatch.diffs) != 0: + # Create one of several smaller patches. + patch = patch_obj() + empty = True + patch.start1 = start1 - len(precontext) + patch.start2 = start2 - len(precontext) + if precontext: + patch.length1 = patch.length2 = len(precontext) + patch.diffs.append((self.DIFF_EQUAL, precontext)) + + while (len(bigpatch.diffs) != 0 and + patch.length1 < patch_size - self.Patch_Margin): + (diff_type, diff_text) = bigpatch.diffs[0] + if diff_type == self.DIFF_INSERT: + # Insertions are harmless. + patch.length2 += len(diff_text) + start2 += len(diff_text) + patch.diffs.append(bigpatch.diffs.pop(0)) + empty = False + elif (diff_type == self.DIFF_DELETE and len(patch.diffs) == 1 and + patch.diffs[0][0] == self.DIFF_EQUAL and + len(diff_text) > 2 * patch_size): + # This is a large deletion. Let it pass in one chunk. + patch.length1 += len(diff_text) + start1 += len(diff_text) + empty = False + patch.diffs.append((diff_type, diff_text)) + del bigpatch.diffs[0] + else: + # Deletion or equality. Only take as much as we can stomach. + diff_text = diff_text[:patch_size - patch.length1 - + self.Patch_Margin] + patch.length1 += len(diff_text) + start1 += len(diff_text) + if diff_type == self.DIFF_EQUAL: + patch.length2 += len(diff_text) + start2 += len(diff_text) + else: + empty = False + + patch.diffs.append((diff_type, diff_text)) + if diff_text == bigpatch.diffs[0][1]: + del bigpatch.diffs[0] + else: + bigpatch.diffs[0] = (bigpatch.diffs[0][0], + bigpatch.diffs[0][1][len(diff_text):]) + + # Compute the head context for the next patch. + precontext = self.diff_text2(patch.diffs) + precontext = precontext[-self.Patch_Margin:] + # Append the end context for this patch. 
+ postcontext = self.diff_text1(bigpatch.diffs)[:self.Patch_Margin] + if postcontext: + patch.length1 += len(postcontext) + patch.length2 += len(postcontext) + if len(patch.diffs) != 0 and patch.diffs[-1][0] == self.DIFF_EQUAL: + patch.diffs[-1] = (self.DIFF_EQUAL, patch.diffs[-1][1] + + postcontext) + else: + patch.diffs.append((self.DIFF_EQUAL, postcontext)) + + if not empty: + x += 1 + patches.insert(x, patch) + + def patch_toText(self, patches): + """Take a list of patches and return a textual representation. + + Args: + patches: Array of Patch objects. + + Returns: + Text representation of patches. + """ + text = [] + for patch in patches: + text.append(str(patch)) + return "".join(text) + + def patch_fromText(self, textline): + """Parse a textual representation of patches and return a list of patch + objects. + + Args: + textline: Text representation of patches. + + Returns: + Array of Patch objects. + + Raises: + ValueError: If invalid input. + """ + patches = [] + if not textline: + return patches + text = textline.split('\n') + while len(text) != 0: + m = re.match("^@@ -(\d+),?(\d*) \+(\d+),?(\d*) @@$", text[0]) + if not m: + raise ValueError("Invalid patch string: " + text[0]) + patch = patch_obj() + patches.append(patch) + patch.start1 = int(m.group(1)) + if m.group(2) == '': + patch.start1 -= 1 + patch.length1 = 1 + elif m.group(2) == '0': + patch.length1 = 0 + else: + patch.start1 -= 1 + patch.length1 = int(m.group(2)) + + patch.start2 = int(m.group(3)) + if m.group(4) == '': + patch.start2 -= 1 + patch.length2 = 1 + elif m.group(4) == '0': + patch.length2 = 0 + else: + patch.start2 -= 1 + patch.length2 = int(m.group(4)) + + del text[0] + + while len(text) != 0: + if text[0]: + sign = text[0][0] + else: + sign = '' + line = urllib.parse.unquote(text[0][1:]) + if sign == '+': + # Insertion. + patch.diffs.append((self.DIFF_INSERT, line)) + elif sign == '-': + # Deletion. + patch.diffs.append((self.DIFF_DELETE, line)) + elif sign == ' ': + # Minor equality. + patch.diffs.append((self.DIFF_EQUAL, line)) + elif sign == '@': + # Start of next patch. + break + elif sign == '': + # Blank line? Whatever. + pass + else: + # WTF? + raise ValueError("Invalid patch mode: '%s'\n%s" % (sign, line)) + del text[0] + return patches + + +class patch_obj: + """Class representing one patch operation. + """ + + def __init__(self): + """Initializes with an empty list of diffs. + """ + self.diffs = [] + self.start1 = None + self.start2 = None + self.length1 = 0 + self.length2 = 0 + + def __str__(self): + """Emulate GNU diff's format. + Header: @@ -382,8 +481,9 @@ + Indices are printed as 1-based, not 0-based. + + Returns: + The GNU diff string. + """ + if self.length1 == 0: + coords1 = str(self.start1) + ",0" + elif self.length1 == 1: + coords1 = str(self.start1 + 1) + else: + coords1 = str(self.start1 + 1) + "," + str(self.length1) + if self.length2 == 0: + coords2 = str(self.start2) + ",0" + elif self.length2 == 1: + coords2 = str(self.start2 + 1) + else: + coords2 = str(self.start2 + 1) + "," + str(self.length2) + text = ["@@ -", coords1, " +", coords2, " @@\n"] + # Escape the body of the patch with %xx notation. + for (op, data) in self.diffs: + if op == diff_match_patch.DIFF_INSERT: + text.append("+") + elif op == diff_match_patch.DIFF_DELETE: + text.append("-") + elif op == diff_match_patch.DIFF_EQUAL: + text.append(" ") + # High ascii will raise UnicodeDecodeError. Use Unicode instead. 
+        data = data.encode("utf-8")
+        text.append(urllib.parse.quote(data, "!~*'();/?:@&=+$,# ") + "\n")
+    return "".join(text)
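The vendored library above is driven in three steps: diff two texts, turn the diff into patch objects, then fuzzily apply those patches to a target text that may have drifted. A minimal sketch, assuming the module is imported through the xmldiff.diff_match_patch shim added later in this patch; the sample strings are arbitrary:

    from xmldiff.diff_match_patch import diff_match_patch

    dmp = diff_match_patch()
    # Method 1 of patch_make(): compute the diff internally from two texts.
    patches = dmp.patch_make("The quick brown fox.", "The slow brown fox.")
    # Prints GNU-diff-style hunks ("@@ -m,n +m,n @@" plus %xx-escaped lines):
    print(dmp.patch_toText(patches))
    # patch_apply() tolerates drift in the target; it returns the patched
    # text and one True/False per patch.
    new_text, applied = dmp.patch_apply(patches, "The quick brown fox jumps.")
    # new_text should be "The slow brown fox jumps."; applied should be [True].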
diff --git a/xmldiff/diff.py b/xmldiff/diff.py
new file mode 100644
index 0000000..2dd61ed
--- /dev/null
+++ b/xmldiff/diff.py
@@ -0,0 +1,399 @@
+from __future__ import division
+
+from collections import namedtuple
+from copy import deepcopy
+from difflib import SequenceMatcher
+from lxml import etree
+from xmldiff import utils
+
+
+# Update, Move, Delete and Insert are the edit script actions:
+DeleteNode = namedtuple('DeleteNode', 'node')
+InsertNode = namedtuple('InsertNode', 'target tag position')
+MoveNode = namedtuple('MoveNode', 'node target position')
+
+UpdateTextIn = namedtuple('UpdateTextIn', 'node text')
+UpdateTextAfter = namedtuple('UpdateTextAfter', 'node text')
+
+UpdateAttrib = namedtuple('UpdateAttrib', 'node name value')
+DeleteAttrib = namedtuple('DeleteAttrib', 'node name')
+InsertAttrib = namedtuple('InsertAttrib', 'node name value')
+RenameAttrib = namedtuple('RenameAttrib', 'node oldname newname')
+
+
+class Differ(object):
+
+    def __init__(self, F=0.5, uniqueattrs=None):
+        # The minimum similarity between two nodes to consider them equal
+        self.F = F
+        # uniqueattrs is a list of attributes that uniquely identify a node
+        # inside a document. Defaults to 'xml:id'.
+        if uniqueattrs is None:
+            uniqueattrs = ['{http://www.w3.org/XML/1998/namespace}id']
+        self.uniqueattrs = uniqueattrs
+        self.clear()
+        # Avoid recreating this for every node
+        self._sequencematcher = SequenceMatcher()
+
+    def clear(self):
+        # Use None for all values, as markings that they aren't done yet.
+        self.left = None
+        self.right = None
+        self._matches = None
+        self._l2rmap = None
+        self._r2lmap = None
+        self._inorder = None
+
+    def set_trees(self, left, right):
+        self.clear()
+
+        # Make sure we were passed two lxml elements:
+        if not (etree.iselement(left) and etree.iselement(right)):
+            raise TypeError("The 'left' and 'right' parameters must be "
+                            "lxml Elements.")
+
+        # Left gets modified as a part of the diff, deepcopy it first.
+        self.left = deepcopy(left)
+        self.right = right
+
+    def append_match(self, lnode, rnode, max_match):
+        self._matches.append((lnode, rnode, max_match))
+        self._l2rmap[id(lnode)] = rnode
+        self._r2lmap[id(rnode)] = lnode
+
+    def match(self, left=None, right=None):
+        # This is not a generator, because the diff() function needs
+        # _l2rmap and _r2lmap, so if match() was a generator, then
+        # diff() would have to first do list(self.match()) without storing
+        # the result, and that would be silly.
+
+        # Nothing in this library is actually using the resulting list of
+        # matches match() returns, but it may be useful for somebody who
+        # does not actually want a diff, but only a list of matches.
+        # It also makes testing the match function easier.
+
+        if left is not None or right is not None:
+            self.set_trees(left, right)
+
+        if self._matches is not None:
+            # We already matched these sequences, use the cache
+            return self._matches
+
+        # Initialize the caches:
+        self._matches = []
+        self._l2rmap = {}
+        self._r2lmap = {}
+        self._inorder = set()
+
+        # Let's just do the naive slow matchings, we can implement
+        # FastMatch later
+        lroot = self.left.getroottree()
+        lnodes = utils.post_order_traverse(self.left)
+
+        rroot = self.right.getroottree()
+        rnodes = list(utils.post_order_traverse(self.right))
+
+        for lnode in lnodes:
+            max_match = 0
+            match_node = None
+
+            for rnode in rnodes:
+                match = self.leaf_ratio(lnode, rnode)
+                child_ratio = self.child_ratio(lnode, rnode)
+                if child_ratio is not None:
+                    match = (match + child_ratio) / 2
+                if match > max_match:
+                    match_node = rnode
+                    max_match = match
+
+                # Try to shortcut for nodes that are not only equal but also
+                # in the same place in the tree
+                if (match == 1.0 and
+                        utils.getpath(lnode, lroot) ==
+                        utils.getpath(rnode, rroot)):
+                    # This is a complete match, break here
+                    break
+
+            if max_match >= self.F:
+                self.append_match(lnode, match_node, max_match)
+
+                # We don't want to check nodes that are already matched
+                if match_node is not None:
+                    rnodes.remove(match_node)
+
+        # TODO: If the roots do not match, we should create new roots, and
+        # have the old roots be children of the new roots, but let's skip
+        # that for now, we don't need it. That's strictly a part of the
+        # insert phase, but hey, even the paper defining the phases
+        # ignores the phases, so...
+        # For now, just make sure the roots are matched:
+        if id(self.left) not in self._l2rmap:
+            self.append_match(self.left, self.right, 1.0)
+
+        return self._matches
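Since match() returns the raw (left_node, right_node, similarity) triples, it can also be used on its own when only the node mapping is wanted. An illustrative sketch, not part of the patch; the sample documents are arbitrary:

    from lxml import etree
    from xmldiff.diff import Differ

    differ = Differ()
    left = etree.fromstring('<doc><p>Hello</p></doc>')
    right = etree.fromstring('<doc><p>Hello!</p></doc>')
    for lnode, rnode, ratio in differ.match(left, right):
        print(lnode.tag, rnode.tag, round(ratio, 2))
    # Left tree is walked post-order, so roughly:
    #   p p 0.91
    #   doc doc 1.0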
+
+    def node_text(self, node):
+        texts = node.xpath('text()')
+
+        for each in sorted(node.attrib.items()):
+            texts.append(':'.join(each))
+
+        text = u' '.join(texts).strip()
+        return utils.cleanup_whitespace(text)
+
+    def leaf_ratio(self, left, right):
+        # How similar two nodes are, with no consideration of their children
+        if (isinstance(left, etree._Comment) or
+                isinstance(right, etree._Comment)):
+            if (isinstance(left, etree._Comment) and
+                    isinstance(right, etree._Comment)):
+                # comments
+                self._sequencematcher.set_seqs(left.text, right.text)
+                return self._sequencematcher.ratio()
+            # One is a comment, the other is not:
+            return 0
+
+        # Get rid of the URN in the tag for comparison:
+        ltag = left.tag.rsplit('}')[-1]
+        rtag = right.tag.rsplit('}')[-1]
+        # If the prefix and the tag are the same, that's OK. But if the
+        # prefix or the tag has changed, then it's definitely not the same.
+        # However, this way we allow changing the URN of the prefix.
+        if (left.prefix, ltag) != (right.prefix, rtag):
+            # Different tags == not the same node at all
+            return 0
+
+        for attr in self.uniqueattrs:
+            if attr in left.attrib or attr in right.attrib:
+                # One of the nodes has a unique attribute, so we check only
+                # that. If only one node has it, they are not the same.
+                return int(left.attrib.get(attr) == right.attrib.get(attr))
+
+        # We use a simple ratio here; I tried Levenshtein distances,
+        # but that took about 100 times longer.
+        ltext = self.node_text(left)
+        rtext = self.node_text(right)
+        self._sequencematcher.set_seqs(ltext, rtext)
+        return self._sequencematcher.ratio()
+
+    def child_ratio(self, left, right):
+        # How similar the children of two nodes are
+        left_children = left.getchildren()
+        right_children = right.getchildren()
+        if not left_children and not right_children:
+            return None
+        count = 0
+        child_count = max((len(left_children), len(right_children)))
+        for lchild in left_children:
+            for rchild in right_children:
+                if self._l2rmap.get(id(lchild)) is rchild:
+                    count += 1
+                    right_children.remove(rchild)
+                    break
+
+        return count / child_count
+
+    def update_node_attr(self, left, right):
+        left_xpath = utils.getpath(left)
+        right_xpath = utils.getpath(right)
+
+        # Update: Look for differences in attributes
+
+        left_keys = set(left.attrib.keys())
+        right_keys = set(right.attrib.keys())
+        new_keys = right_keys.difference(left_keys)
+        removed_keys = left_keys.difference(right_keys)
+        common_keys = left_keys.intersection(right_keys)
+
+        # We sort the attributes to get a consistent order in the edit
+        # script. That's only so we can do testing in a reasonable way...
+        for key in sorted(common_keys):
+            if left.attrib[key] != right.attrib[key]:
+                yield UpdateAttrib(left_xpath, key, right.attrib[key])
+                left.attrib[key] = right.attrib[key]
+
+        # Align: Not needed here, we don't care about the order of
+        # attributes.
+
+        # Move: Check if any of the new attributes have the same value
+        # as the removed attributes. If they do, it's actually
+        # a renaming, and a move is one action instead of remove + insert
+        newattrmap = {v: k for (k, v) in right.attrib.items()
+                      if k in new_keys}
+        for lk in sorted(removed_keys):
+            value = left.attrib[lk]
+            if value in newattrmap:
+                rk = newattrmap[value]
+                yield RenameAttrib(left_xpath, lk, rk)
+                # Remove from list of new attributes
+                new_keys.remove(rk)
+                # Update left node
+                left.attrib[rk] = value
+                del left.attrib[lk]
+
+        # Insert: Find new attributes
+        for key in sorted(new_keys):
+            yield InsertAttrib(right_xpath, key, right.attrib[key])
+            left.attrib[key] = right.attrib[key]
+
+        # Delete: remove removed attributes
+        for key in sorted(removed_keys):
+            if key not in left.attrib:
+                # This was already moved
+                continue
+            yield DeleteAttrib(left_xpath, key)
+            del left.attrib[key]
+
+    def update_node_text(self, left, right):
+        left_xpath = utils.getpath(left)
+
+        # Lastly, do the differences in texts. This inserts nodes
+        # when making an XML diff, so it's best to have this last.
+        if left.text != right.text:
+            yield UpdateTextIn(left_xpath, right.text)
+            left.text = right.text
+
+        if left.tail != right.tail:
+            yield UpdateTextAfter(left_xpath, right.tail)
+            left.tail = right.tail
+
+    def find_pos(self, child):
+        parent = child.getparent()
+        # The paper here first checks if the child is the first child in
+        # order, but I am entirely unable to actually make that happen, and
+        # if it does, the "else:" will catch that case anyway, and it also
+        # deals with the case of no child being in order.
+
+        # Find the last sibling before the child that is in order
+        i = parent.index(child) - 1
+        while i >= 0:
+            sibling = parent[i]
+            if sibling in self._inorder:
+                # That's it
+                break
+            i -= 1
+        else:
+            # No previous sibling in order.
+            return 0
+
+        # Now find the partner of this in the left tree
+        sibling_match = self._r2lmap[id(sibling)]
+        i = 0
+        for child in sibling_match.getparent().getchildren():
+            if child in self._inorder:
+                i += 1
+            if child is sibling_match:
+                break
+        return i
+
+    def align_children(self, left, right):
+        lchildren = [c for c in left.getchildren()
+                     if (id(c) in self._l2rmap and
+                         self._l2rmap[id(c)].getparent() is right)]
+        rchildren = [c for c in right.getchildren()
+                     if (id(c) in self._r2lmap and
+                         self._r2lmap[id(c)].getparent() is left)]
+        if not lchildren or not rchildren:
+            # Nothing to align
+            return
+
+        lcs = utils.longest_common_subsequence(
+            lchildren, rchildren,
+            lambda x, y: self._l2rmap[id(x)] is y)
+
+        for x, y in lcs:
+            # Mark these as in order
+            self._inorder.add(lchildren[x])
+            self._inorder.add(rchildren[y])
+
+        # Go over those children that are not in order:
+        for unaligned_left in set(lchildren) - self._inorder:
+            unaligned_right = self._l2rmap[id(unaligned_left)]
+            right_pos = self.find_pos(unaligned_right)
+            rtarget = unaligned_right.getparent()
+            ltarget = self._r2lmap[id(rtarget)]
+            yield MoveNode(
+                utils.getpath(unaligned_left),
+                utils.getpath(rtarget),
+                right_pos)
+            # Do the actual move:
+            left.remove(unaligned_left)
+            ltarget.insert(right_pos, unaligned_left)
+
+    def diff(self, left=None, right=None):
+        # Make sure the matching is done first, diff() needs the l2r/r2l
+        # maps.
+        if not self._matches:
+            self.match(left, right)
+
+        # The paper talks about the five phases, and then does four of them
+        # in one phase, in a different order than described. This
+        # implementation in turn differs in order yet again.
+        ltree = self.left.getroottree()
+        rtree = self.right.getroottree()
+
+        for rnode in utils.breadth_first_traverse(self.right):
+            # (a)
+            rparent = rnode.getparent()
+            ltarget = self._r2lmap.get(id(rparent))
+
+            # (b) Insert
+            if id(rnode) not in self._r2lmap:
+                # (i)
+                pos = self.find_pos(rnode)
+                # (ii)
+                yield InsertNode(utils.getpath(ltarget, ltree), rnode.tag,
+                                 pos)
+                # (iii)
+                lnode = ltarget.makeelement(rnode.tag)
+                self.append_match(lnode, rnode, 1.0)
+                ltarget.insert(pos, lnode)
+                self._inorder.add(lnode)
+                self._inorder.add(rnode)
+                # And then we update attributes. This is different from the
+                # paper, because the paper assumes nodes only have labels
+                # and values. Nodes also have text, which we handle later.
+                for action in self.update_node_attr(lnode, rnode):
+                    yield action
+
+            # (c)
+            else:
+                # Normally there is a check that rnode isn't a root,
+                # but that's perhaps only because comparing valueless
+                # roots is pointless, and in an elementtree we have no such
+                # thing as a valueless root anyway.
+                # (i)
+                lnode = self._r2lmap[id(rnode)]
+
+                # (ii) Update
+                # XXX If they are exactly equal, we can skip this,
+                # maybe store match results in a cache?
+                for action in self.update_node_attr(lnode, rnode):
+                    yield action
+
+                # (iii) Move
+                lparent = lnode.getparent()
+                if ltarget is not lparent:
+                    pos = self.find_pos(rnode)
+                    yield MoveNode(
+                        utils.getpath(lnode, ltree),
+                        utils.getpath(rparent, rtree),
+                        pos)
+                    # Move the node from current parent to target
+                    lparent.remove(lnode)
+                    ltarget.insert(pos, lnode)
+
+            # (d) Align
+            for action in self.align_children(lnode, rnode):
+                yield action
+
+            # And lastly, we update all node texts. We do this after
+            # aligning children, because when you generate an XML diff
+            # from this, that XML diff update generates more children,
+            # confusing later inserts or deletes.
+            lnode = self._r2lmap[id(rnode)]
+            for action in self.update_node_text(lnode, rnode):
+                yield action
+
+        for lnode in utils.reverse_post_order_traverse(self.left):
+            if id(lnode) not in self._l2rmap:
+                # No match
+                yield DeleteNode(utils.getpath(lnode, ltree))
+                lnode.getparent().remove(lnode)
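That completes the edit script: inserts, moves, attribute changes and text updates are yielded while walking the right tree breadth-first, and deletes come last, in reverse post-order of the left tree. An illustrative run, not part of the patch (element names and texts are sample data; exact actions depend on the inputs):

    from lxml import etree
    from xmldiff.diff import Differ

    left = etree.fromstring('<doc><p>The quick brown fox</p></doc>')
    right = etree.fromstring('<doc><p>The quick red fox</p><p>Extra</p></doc>')
    for action in Differ().diff(left, right):
        print(action)
    # Expected to be along these lines:
    #   UpdateTextIn(node='/doc/p[1]', text='The quick red fox')
    #   InsertNode(target='/doc[1]', tag='p', position=1)
    #   UpdateTextIn(node='/doc/p[2]', text='Extra')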
+ """ + + def __init__(self, text_tags=(), formatting_tags=()): + self.text_tags = text_tags + self.formatting_tags = formatting_tags + self.placeholder2tag = {} + self.tag2placeholder = {} + # This number represents the beginning of the largest private-use + # block (13,000 characters) in the unicode space. + self.placeholder = 0xf0000 + + insert_elem = etree.Element('{%s}insert' % DIFF_NS) + insert_close = self.get_placeholder( + insert_elem, T_CLOSE, None) + insert_open = self.get_placeholder( + insert_elem, T_OPEN, insert_close) + + delete_elem = etree.Element('{%s}delete' % DIFF_NS) + delete_close = self.get_placeholder( + delete_elem, T_CLOSE, None) + delete_open = self.get_placeholder( + delete_elem, T_OPEN, delete_close) + + self.diff_tags = { + 'insert': (insert_open, insert_close), + 'delete': (delete_open, delete_close)} + + def get_placeholder(self, element, ttype, close_ph): + tag = etree.tounicode(element) + ph = self.tag2placeholder.get((tag, ttype, close_ph)) + if ph is not None: + return ph + + self.placeholder += 1 + ph = six.unichr(self.placeholder) + self.placeholder2tag[ph] = PlaceholderEntry(element, ttype, close_ph) + self.tag2placeholder[tag, ttype, close_ph] = ph + return ph + + def is_placeholder(self, char): + return len(char) == 1 and char in self.placeholder2tag + + def is_formatting(self, element): + return element.tag in self.formatting_tags + + def do_element(self, element): + for child in element: + # Resolve all formatting text by allowing the inside text to + # participate in the text diffing. + tail = child.tail or u'' + child.tail = u'' + new_text = element.text or u'' + + if self.is_formatting(child): + ph_close = self.get_placeholder(child, T_CLOSE, None) + ph_open = self.get_placeholder(child, T_OPEN, ph_close) + # If it's known text formatting tags, do this hierarchically + self.do_element(child) + text = child.text or u'' + child.text = u'' + # Stick the placeholder in instead of the start and end tags: + element.text = new_text + ph_open + text + ph_close + tail + else: + ph_single = self.get_placeholder(child, T_SINGLE, None) + # Replace the whole tag including content: + element.text = new_text + ph_single + tail + + # Remove the element from the tree now that we have inserted a + # placeholder. + element.remove(child) + + def do_tree(self, tree): + if self.text_tags: + for elem in tree.xpath('//'+'|//'.join(self.text_tags)): + self.do_element(elem) + + def split_string(self, text): + regexp = u'([%s])' % u''.join(self.placeholder2tag) + return re.split(regexp, text, flags=re.MULTILINE) + + def undo_string(self, text): + result = u'' + segments = self.split_string(text) + + while segments: + seg = segments.pop(0) + + # Segments can be either plain string or placeholders. + if self.is_placeholder(seg): + entry = self.placeholder2tag[seg] + element = entry.element + # Is this a open/close segment? 
+                if entry.ttype == T_OPEN:
+                    # Yup
+                    next_seg = segments.pop(0)
+                    new_text = u''
+                    while next_seg != entry.close_ph:
+                        new_text += next_seg
+                        next_seg = segments.pop(0)
+                    element.text = new_text
+
+                new_text = etree.tounicode(element)
+                result += self.undo_string(new_text)
+            else:
+                result += seg
+        return result
+
+    def undo_element(self, elem):
+        if self.placeholder2tag:
+            if elem.text:
+                index = 0
+                new_text = self.undo_string(elem.text)
+                content = etree.fromstring(u'<wrap>%s</wrap>' % new_text)
+                elem.text = content.text
+                for child in content:
+                    self.undo_element(child)
+                    elem.insert(index, child)
+                    index += 1
+
+            for child in elem:
+                self.undo_element(child)
+
+            if elem.tail:
+                new_text = self.undo_string(elem.tail)
+                content = etree.fromstring(u'<wrap>%s</wrap>' % new_text)
+                elem.tail = content.text
+                parent = elem.getparent()
+                index = parent.index(elem) + 1
+                for child in content:
+                    self.undo_element(child)
+                    parent.insert(index, child)
+                    index += 1
+
+    def undo_tree(self, tree):
+        self.undo_element(tree)
+
+    def mark_diff(self, ph, action):
+        entry = self.placeholder2tag[ph]
+        if entry.ttype == T_CLOSE:
+            # Close tag, nothing to mark
+            return ph
+
+        # Mark the tag as having a diff-action. We do need to
+        # make a copy of it and get a new placeholder:
+        elem = entry.element
+        elem = deepcopy(elem)
+        if self.is_formatting(elem):
+            # Formatting element, add a diff attribute
+            action += '-formatting'
+            elem.attrib['{%s}%s' % (DIFF_NS, action)] = ''
+        else:
+            # Not formatting, wrap content
+            elem.text = self.wrap_diff(elem.text, action)
+
+        # And make a new placeholder for this new entry:
+        return self.get_placeholder(elem, entry.ttype, entry.close_ph)
+
+    def wrap_diff(self, text, action):
+        open_ph, close_ph = self.diff_tags[action]
+        return open_ph + text + close_ph
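A short demonstration of the placeholder round trip (a sketch only, not part of the patch; the tag names are arbitrary):

    from lxml import etree
    from xmldiff.formatting import PlaceholderMaker

    ph = PlaceholderMaker(text_tags=('para',), formatting_tags=('b',))
    tree = etree.fromstring('<doc><para>Hello <b>brave</b> world</para></doc>')
    ph.do_tree(tree)
    # The <b> element is now a pair of private-use characters in the text:
    print(repr(tree[0].text))
    # e.g. 'Hello \U000f0006brave\U000f0005 world' (exact code points
    # depend on allocation order)
    ph.undo_tree(tree)
    print(etree.tounicode(tree))  # the original markup is restored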
+
+
+class XMLFormatter(BaseFormatter):
+    """A formatter that also replaces formatting tags with unicode characters
+
+    The idea of this differ is to replace structured content (in this case
+    XML elements) with unicode characters which then participate in the
+    regular text diffing algorithm. This is done in the prepare() step.
+
+    Each identical XML element will get a unique unicode character. If the
+    node is changed for any reason, a new unicode character is assigned to
+    the node. This allows identity detection of structured content between
+    the two text versions while still allowing customization during diffing
+    time, such as marking a new formatting node. The latter feature allows
+    for granular style change detection independently of text changes.
+
+    In order for the algorithm to not go crazy and convert entire XML
+    documents to text (though that is perfectly doable), a few rules have
+    been defined.
+
+    - The ``text_tags`` attribute lists all the XML nodes by name which can
+      contain text. All XML nodes within those text nodes are converted to
+      unicode placeholders. If you want better control over which parts of
+      your XML document are considered text, you can simply override the
+      placeholderer's ``do_tree(tree)`` method. It is purposefully kept
+      small to allow easy subclassing.
+
+    - By default, all tags inside text tags are treated as immutable
+      units. That means the node itself, including its entire
+      sub-structure, is assigned one unicode character.
+
+    - The ``formatting_tags`` attribute is used to specify tags that format
+      the text. For these tags, the opening and closing tags receive unique
+      unicode characters, allowing for sub-structure change detection and
+      formatting changes. During the diff markup phase, formatting nodes
+      are annotated to mark them as inserted or deleted, allowing for
+      markup specific to those formatting changes.
+
+    The diffed version of the structural tree is passed into the
+    ``finalize(tree)`` method to convert all the placeholders back into
+    structural content before formatting.
+
+    The ``normalize`` parameter decides how to normalize whitespace.
+    WS_TEXT normalizes only inside text_tags, WS_TAGS will remove ignorable
+    whitespace between tags, WS_BOTH does both, and WS_NONE will preserve
+    all whitespace.
+    """
+
+    def __init__(self, normalize=WS_NONE, pretty_print=True,
+                 text_tags=(), formatting_tags=()):
+        # Mapping from placeholders -> structural content and vice versa.
+        self.normalize = normalize
+        self.pretty_print = pretty_print
+        self.text_tags = text_tags
+        self.formatting_tags = formatting_tags
+        self.placeholderer = PlaceholderMaker(
+            text_tags=text_tags, formatting_tags=formatting_tags)
+
+    def prepare(self, left_tree, right_tree):
+        """prepare() is run on the trees before diffing
+
+        This is so the formatter can apply magic before diffing."""
+        self.placeholderer.do_tree(left_tree)
+        self.placeholderer.do_tree(right_tree)
+
+    def finalize(self, result_tree):
+        """finalize() is run on the resulting tree before returning it
+
+        This is so the formatter can apply magic after diffing."""
+        self.placeholderer.undo_tree(result_tree)
+
+    def format(self, diff, orig_tree):
+        # Make a new tree, both because we want to add the diff namespace
+        # and also because we don't want to modify the original tree.
+        result = deepcopy(orig_tree)
+        etree.register_namespace(DIFF_PREFIX, DIFF_NS)
+
+        deferred = []
+        for action in diff:
+            if isinstance(action, (UpdateTextIn, UpdateTextAfter)):
+                # We need to do text updates last
+                deferred.append(action)
+                continue
+            self.handle_action(action, result)
+
+        for action in reversed(deferred):
+            self.handle_action(action, result)
+
+        self.finalize(result)
+
+        etree.cleanup_namespaces(result, top_nsmap={DIFF_PREFIX: DIFF_NS})
+        return etree.tounicode(result, pretty_print=self.pretty_print)
+
+    def handle_action(self, action, result):
+        action_type = type(action)
+        method = getattr(self, '_handle_' + action_type.__name__)
+        method(action, result)
+
+    def _xpath(self, node, xpath):
+        # This method finds an element with xpath and makes sure that
+        # one and exactly one element is found. This is to protect against
+        # formatting a diff on the wrong tree, or against using ambiguous
+        # edit script xpaths.
+        result = node.xpath(xpath, namespaces=node.nsmap)
+        if len(result) == 0:
+            raise ValueError('xpath %s not found.' % xpath)
+        if len(result) > 1:
+            raise ValueError('Multiple nodes found for xpath %s.'
+                             % xpath)
+        return result[0]
+
+    def _extend_diff_attr(self, node, action, value):
+        diffattr = '{%s}%s-attr' % (DIFF_NS, action)
+        oldvalue = node.attrib.get(diffattr, '')
+        if oldvalue:
+            value = oldvalue + ';' + value
+        node.attrib[diffattr] = value
+
+    def _delete_attrib(self, node, name):
+        del node.attrib[name]
+        self._extend_diff_attr(node, 'delete', name)
+
+    def _handle_DeleteAttrib(self, action, tree):
+        node = self._xpath(tree, action.node)
+        self._delete_attrib(node, action.name)
+
+    def _delete_node(self, node):
+        node.attrib['{%s}delete' % DIFF_NS] = ''
+
+    def _handle_DeleteNode(self, action, tree):
+        node = self._xpath(tree, action.node)
+        self._delete_node(node)
+
+    def _insert_attrib(self, node, name, value):
+        node.attrib[name] = value
+        self._extend_diff_attr(node, 'add', name)
+
+    def _handle_InsertAttrib(self, action, tree):
+        node = self._xpath(tree, action.node)
+        self._insert_attrib(node, action.name, action.value)
+
+    def _insert_node(self, target, node, position):
+        # Insert node as a child. However, position is the position in the
+        # new tree, and the diff tree may have deleted children, so we must
+        # adjust the position for that.
+        pos = 0
+        offset = 0
+        for child in target.getchildren():
+            if '{%s}delete' % DIFF_NS in child.attrib:
+                offset += 1
+            else:
+                pos += 1
+            if pos > position:
+                # We found the right offset
+                break
+
+        node.attrib['{%s}insert' % DIFF_NS] = ''
+        target.insert(position + offset, node)
+
+    def _handle_InsertNode(self, action, tree):
+        target = self._xpath(tree, action.target)
+        new_node = target.makeelement(action.tag, nsmap=target.nsmap)
+        self._insert_node(target, new_node, action.position)
+
+    def _rename_attrib(self, node, oldname, newname):
+        node.attrib[newname] = node.attrib[oldname]
+        del node.attrib[oldname]
+        self._extend_diff_attr(node, 'rename', '%s:%s' % (oldname, newname))
+
+    def _handle_RenameAttrib(self, action, tree):
+        node = self._xpath(tree, action.node)
+        self._rename_attrib(node, action.oldname, action.newname)
+
+    def _handle_MoveNode(self, action, tree):
+        node = self._xpath(tree, action.node)
+        inserted = deepcopy(node)
+        target = self._xpath(tree, action.target)
+        self._delete_node(node)
+        self._insert_node(target, inserted, action.position)
+
+    def _update_attrib(self, node, name, value):
+        oldval = node.attrib[name]
+        node.attrib[name] = value
+        self._extend_diff_attr(node, 'update', '%s:%s' % (name, oldval))
+
+    def _handle_UpdateAttrib(self, action, tree):
+        node = self._xpath(tree, action.node)
+        self._update_attrib(node, action.name, action.value)
+
+    def _realign_placeholders(self, diff):
+        # Since the differ always deletes first and inserts second,
+        # placeholders that represent XML open and close tags will get
+        # misaligned. This method will fix that order.
+        new_diff = []  # Diff list with proper tree structure.
+        stack = []  # Current node path.
+
+        def _stack_pop():
+            return stack.pop() if stack else (None, None)
+
+        for op, text in diff:
+            segments = self.placeholderer.split_string(text)
+            new_text = u''
+            for seg in segments:
+                if not seg:
+                    continue
+                # There is nothing to do for regular text.
+                if not self.placeholderer.is_placeholder(seg):
+                    new_text += seg
+                    continue
+                # Handle all structural replacement elements.
+                entry = self.placeholderer.placeholder2tag[seg]
+                if entry.ttype == T_SINGLE:
+                    # There is nothing to do for singletons since they are
+                    # fully self-contained.
+                    new_text += seg
+                    continue
+                elif entry.ttype == T_OPEN:
+                    # Opening tags are added to the stack, so we know what
+                    # needs to be closed when. We are assuming that tags are
+                    # opened in the desired order.
+                    stack.append((op, entry))
+                    new_text += seg
+                    continue
+                elif entry.ttype == T_CLOSE:
+                    # Due to the nature of the text diffing algorithm,
+                    # closing tags can be out of order. But since we know
+                    # what we need to close, we simply peek at the stack to
+                    # see what needs to be closed before the requested node
+                    # closure can happen.
+                    stack_op, stack_entry = _stack_pop()
+                    while (
+                        stack_entry is not None and
+                        stack_entry.close_ph != seg
+                    ):
+                        new_diff.append((stack_op, stack_entry.close_ph))
+                        stack_op, stack_entry = _stack_pop()
+
+                    # Stephan: We have situations where the opening tag
+                    # remains in place but the closing text moves from one
+                    # position to another. In those cases, we will have two
+                    # closing tags for one opening one. Since we want to
+                    # prefer the new version over the old in terms of
+                    # formatting, we ignore the deletion and close the tag
+                    # where it was inserted.
+                    # Lennart: I could not make any case that made
+                    # stack_op > op, so I removed the handling, and
+                    # put in an assert
+                    if stack_entry is not None:
+                        assert stack_op <= op
+                        new_text += seg
+            if new_text:
+                new_diff.append((op, new_text))
+        return new_diff
+
+    def _make_diff_tags(self, left_value, right_value, node, target=None):
+        if bool(self.normalize & WS_TEXT):
+            left_value = cleanup_whitespace(left_value or u'').strip()
+            right_value = cleanup_whitespace(right_value or u'').strip()
+
+        text_diff = diff_match_patch()
+        diff = text_diff.diff_main(left_value or '', right_value or '')
+        text_diff.diff_cleanupSemantic(diff)
+
+        diff = self._realign_placeholders(diff)
+
+        cur_child = None
+        if target is None:
+            target = node
+        else:
+            cur_child = node
+
+        for op, text in diff:
+            if op == 0:
+                if cur_child is None:
+                    node.text = (node.text or u'') + text
+                else:
+                    cur_child.tail = (cur_child.tail or u'') + text
+                continue
+
+            if op == -1:
+                action = 'delete'
+            elif op == 1:
+                action = 'insert'
+
+            if self.placeholderer.is_placeholder(text):
+                ph = self.placeholderer.mark_diff(text, action)
+
+                if cur_child is None:
+                    node.text = (node.text or u'') + ph
+
+            else:
+                new_text = self.placeholderer.wrap_diff(text, action)
+
+                if cur_child is None:
+                    node.text = (node.text or u'') + new_text
+                else:
+                    cur_child.tail = (cur_child.tail or u'') + new_text
+
+    def _handle_UpdateTextIn(self, action, tree):
+        node = self._xpath(tree, action.node)
+        left_value = node.text
+        right_value = action.text
+        node.text = None
+
+        self._make_diff_tags(left_value, right_value, node)
+
+        return node
+
+    def _handle_UpdateTextAfter(self, action, tree):
+        node = self._xpath(tree, action.node)
+        left_value = node.tail
+        right_value = action.text
+        node.tail = None
+
+        self._make_diff_tags(left_value, right_value, node, node.getparent())
+
+        return node
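With the handlers above in place, the formatter can render a complete diff as XML with inline diff markup. A hedged usage sketch (tag names and texts are sample data; diff_texts comes from xmldiff/main.py, added later in this patch):

    from xmldiff import main, formatting

    formatter = formatting.XMLFormatter(text_tags=('para',),
                                        formatting_tags=('b',))
    left = u'<doc><para>The quick brown fox</para></doc>'
    right = u'<doc><para>The quick red fox</para></doc>'
    print(main.diff_texts(left, right, formatter=formatter))
    # Expected, roughly:
    #   <doc xmlns:diff="http://namespaces.shoobx.com/diff">
    #     <para>The quick <diff:delete>brown</diff:delete><diff:insert>red
    #     </diff:insert> fox</para>
    #   </doc>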
+
+
+class RMLFormatter(XMLFormatter):
+
+    def __init__(self, normalize=WS_BOTH, pretty_print=True,
+                 text_tags=('para', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'),
+                 formatting_tags=('b', 'u', 'i', 'strike', 'em', 'super',
+                                  'sup', 'sub', 'link', 'a', 'span')):
+        super(RMLFormatter, self).__init__(
+            normalize=normalize, pretty_print=pretty_print,
+            text_tags=text_tags, formatting_tags=formatting_tags)
+
+
+class DiffFormatter(BaseFormatter):
+
+    def __init__(self, normalize=WS_TAGS, pretty_print=False):
+        self.normalize = normalize
+        # No pretty print support, nothing to be pretty about
+
+    # Nothing to prepare or finalize (one-liners for code coverage)
+    def prepare(self, left, right): return
+
+    def finalize(self, left, right): return
+
+    def format(self, diff, orig_tree):
+        # This formatter doesn't need the left tree, but the XMLFormatter
+        # does, so the parameter is required.
+        res = u'\n'.join(self._format_action(action) for action in diff)
+        return res
+
+    def _format_action(self, action):
+        return u'[%s]' % self.handle_action(action)
+
+    def handle_action(self, action):
+        action_type = type(action)
+        method = getattr(self, '_handle_' + action_type.__name__)
+        return u', '.join(method(action))
+
+    def _handle_DeleteAttrib(self, action):
+        return u"delete-attribute", action.node, action.name
+
+    def _handle_DeleteNode(self, action):
+        return u"delete", action.node
+
+    def _handle_InsertAttrib(self, action):
+        return (u"insert-attribute", action.node, action.name,
+                json.dumps(action.value))
+
+    def _handle_InsertNode(self, action):
+        return u"insert", action.target, action.tag, str(action.position)
+
+    def _handle_RenameAttrib(self, action):
+        return (u"move-attribute", action.node,
+                action.oldname, action.newname)
+
+    def _handle_MoveNode(self, action):
+        return u"move", action.node, action.target, str(action.position)
+
+    def _handle_UpdateAttrib(self, action):
+        return (u"update-attribute", action.node, action.name,
+                json.dumps(action.value))
+
+    def _handle_UpdateTextIn(self, action):
+        return u"update-text", action.node, json.dumps(action.text)
+
+    def _handle_UpdateTextAfter(self, action):
+        return u"update-text-after", action.node, json.dumps(action.text)
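The DiffFormatter renders each edit script action as one bracketed, comma-separated line instead of markup, which is what the command line tool prints by default. An illustrative sketch, not part of the patch:

    from xmldiff import main, formatting

    left = u'<doc><para>The quick brown fox</para></doc>'
    right = u'<doc><para>The quick red fox</para></doc>'
    print(main.diff_texts(left, right,
                          formatter=formatting.DiffFormatter()))
    # Expected: [update-text, /doc/para[1], "The quick red fox"]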
diff --git a/xmldiff/main.py b/xmldiff/main.py
new file mode 100644
index 0000000..758670a
--- /dev/null
+++ b/xmldiff/main.py
@@ -0,0 +1,85 @@
+"""All major API points and command line tools"""
+import pkg_resources
+
+from argparse import ArgumentParser, FileType
+from lxml import etree
+from xmldiff import diff, formatting
+
+__version__ = pkg_resources.require("xmldiff")[0].version
+
+
+def diff_trees(left, right, F=0.5, uniqueattrs=None, formatter=None):
+    """Takes two lxml root elements or element trees"""
+    if isinstance(left, etree._ElementTree):
+        left = left.getroot()
+    if isinstance(right, etree._ElementTree):
+        right = right.getroot()
+    if formatter is not None:
+        formatter.prepare(left, right)
+    differ = diff.Differ(F=F, uniqueattrs=uniqueattrs)
+    diffs = differ.diff(left, right)
+
+    if formatter is None:
+        return list(diffs)
+
+    return formatter.format(diffs, left)
+
+
+def diff_texts(left, right, F=0.5, uniqueattrs=None, formatter=None):
+    """Takes two Unicode strings containing XML"""
+    normalize = bool(getattr(formatter, 'normalize', 1) & formatting.WS_TAGS)
+    parser = etree.XMLParser(remove_blank_text=normalize)
+    left_tree = etree.fromstring(left, parser)
+    right_tree = etree.fromstring(right, parser)
+    return diff_trees(left_tree, right_tree, F=F, uniqueattrs=uniqueattrs,
+                      formatter=formatter)
+
+
+def diff_files(left, right, F=0.5, uniqueattrs=None, formatter=None):
+    """Takes two filenames or streams, and diffs the XML in those files"""
+    normalize = bool(getattr(formatter, 'normalize', 1) & formatting.WS_TAGS)
+    parser = etree.XMLParser(remove_blank_text=normalize)
+    left_tree = etree.parse(left, parser)
+    right_tree = etree.parse(right, parser)
+    return diff_trees(left_tree, right_tree, F=F, uniqueattrs=uniqueattrs,
+                      formatter=formatter)
+
+
+def make_parser():
+    parser = ArgumentParser(description='Create a diff for two XML files.')
+    parser.add_argument('file1', type=FileType('r'),
+                        help='the first input file')
+    parser.add_argument('file2', type=FileType('r'),
+                        help='the second input file')
+    parser.add_argument('-f', '--formatter', default='diff',
+                        choices=['diff', 'xml', 'rml'],
+                        help='formatter selection')
+    parser.add_argument('-w', '--keep-whitespace', action='store_true',
+                        help="do not strip ignorable whitespace")
+    parser.add_argument('-p', '--pretty-print', action='store_true',
+                        help="try to make XML output more readable")
+    parser.add_argument('-v', '--version', action='version',
+                        help='display version and exit.',
+                        version="xmldiff %s" % __version__)
+    return parser
+
+
+def run(args=None):
+    parser = make_parser()
+    args = parser.parse_args(args=args)
+
+    if args.keep_whitespace:
+        normalize = formatting.WS_NONE
+    else:
+        normalize = formatting.WS_BOTH
+
+    FORMATTERS = {
+        'diff': formatting.DiffFormatter,
+        'xml': formatting.XMLFormatter,
+        'rml': formatting.RMLFormatter,
+    }
+
+    formatter = FORMATTERS[args.formatter](normalize=normalize,
+                                           pretty_print=args.pretty_print)
+    result = diff_files(args.file1, args.file2, formatter=formatter)
+    print(result)
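run() can be exercised directly, which is also how a console script entry point would call it. A sketch with hypothetical file names (old.xml and new.xml are placeholders for real files on disk):

    from xmldiff.main import run

    run(['old.xml', 'new.xml', '--formatter', 'xml', '--pretty-print'])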
diff --git a/xmldiff/utils.py b/xmldiff/utils.py
new file mode 100644
index 0000000..5fe6139
--- /dev/null
+++ b/xmldiff/utils.py
@@ -0,0 +1,121 @@
+from __future__ import division
+
+import re
+from operator import eq
+
+
+def post_order_traverse(node):
+    for child in node.getchildren():
+        # PY3: Man, I want yield from!
+        for item in post_order_traverse(child):
+            yield item
+    yield node
+
+
+def reverse_post_order_traverse(node):
+    for child in reversed(node.getchildren()):
+        # PY3: Man, I want yield from!
+        for item in reverse_post_order_traverse(child):
+            yield item
+    yield node
+
+
+def breadth_first_traverse(node):
+    # First yield the root node
+    yield node
+
+    # Then go into the recursing part:
+    for item in _breadth_first_recurse(node):
+        yield item
+
+
+def _breadth_first_recurse(node):
+    for child in node.getchildren():
+        yield child
+
+    for child in node.getchildren():
+        for item in _breadth_first_recurse(child):
+            # PY3: Man, I want yield from!
+            yield item
+
+
+# LCS from Myers: An O(ND) Difference Algorithm and Its Variations. This
+# implementation uses Chris Marchetti's technique of only keeping the
+# history per dpath, and not per node, so it should be vastly less memory
+# intensive. It also skips any items that are equal at the beginning and
+# end, speeding up the search, and using even less memory.
+def longest_common_subsequence(left_sequence, right_sequence, eqfn=eq):
+
+    start = 0
+    lend = lslen = len(left_sequence)
+    rend = rslen = len(right_sequence)
+
+    # Trim off the matching items at the beginning
+    while (start < lend and start < rend and
+           eqfn(left_sequence[start], right_sequence[start])):
+        start += 1
+
+    # Trim off the matching items at the end
+    while (start < lend and start < rend and
+           eqfn(left_sequence[lend - 1], right_sequence[rend - 1])):
+        lend -= 1
+        rend -= 1
+
+    left = left_sequence[start:lend]
+    right = right_sequence[start:rend]
+
+    lmax = len(left)
+    rmax = len(right)
+    furthest = {1: (0, [])}
+
+    if not lmax + rmax:
+        # The sequences are equal
+        r = range(lslen)
+        return zip(r, r)
+
+    for d in range(0, lmax + rmax + 1):
+        for k in range(-d, d + 1, 2):
+            if (k == -d or
+                    (k != d and furthest[k - 1][0] < furthest[k + 1][0])):
+                # Go down
+                old_x, history = furthest[k + 1]
+                x = old_x
+            else:
+                # Go left
+                old_x, history = furthest[k - 1]
+                x = old_x + 1
+
+            # Copy the history
+            history = history[:]
+            y = x - k
+
+            while x < lmax and y < rmax and eqfn(left[x], right[y]):
+                # We found a match
+                history.append((x + start, y + start))
+                x += 1
+                y += 1
+
+            if x >= lmax and y >= rmax:
+                # This is the best match
+                return [(e, e) for e in range(start)] + history + \
+                    list(zip(range(lend, lslen), range(rend, rslen)))
+            else:
+                furthest[k] = (x, history)
+
+
+WHITESPACE = re.compile(r'\s+', flags=re.MULTILINE)
+
+
+def cleanup_whitespace(text):
+    return WHITESPACE.sub(' ', text)
+
+
+def getpath(element, tree=None):
+    if tree is None:
+        tree = element.getroottree()
+    xpath = tree.getpath(element)
+    if xpath[-1] != ']':
+        # The path is unique without specifying a count. However, we always
+        # want that count, so we add [1].
+        xpath = xpath + '[1]'
+    return xpath
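For reference, a hedged sketch of the LCS helper on plain strings; it returns (left_index, right_index) pairs of the common subsequence, in order:

    from xmldiff.utils import longest_common_subsequence

    pairs = list(longest_common_subsequence('XMJYAUZ', 'MZJAWXU'))
    # The classic textbook example; expected: [(1, 0), (2, 2), (4, 3), (5, 6)],
    # i.e. the subsequence 'MJAU'.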