From 17b6160d410486f9f67c33a991feca5cf8d54072 Mon Sep 17 00:00:00 2001
From: Han Seoul-Oh
Date: Tue, 4 May 2021 01:20:53 -0700
Subject: [PATCH 1/2] Improve diff readability for eg "Bob," -> "Bob"

If a hunk consists of only deleting or only adding at the beginning or
end of a word, then combine them into one hunk.

Examples:
  Changes from simplediff:
    - Alice, Bob and Charlie
    + Alice, Bob, and Charlie
    simplediff: Alice, <del>Bob</del> <ins>Bob,</ins> and Charlie
    this diff:  Alice, Bob<ins>,</ins> and Charlie

    - Alice Bob Charlie's Angels And David
    + Alice Bob Charlie David
    simplediff: Alice Bob <del>Charlie's Angels And</del> <ins>Charlie</ins> David
    this diff:  Alice Bob Charlie<del>'s Angels And</del> David

  Same as simplediff:
    hunks you wouldn't want simplified:
      - Alice Bob Charlie
      + Alice Robert Charlie
      diff: Alice <del>Bob</del> <ins>Robert</ins> Charlie

    if the change isn't only at the beginning or end:
      - Alice Bob Charlie
      + Alice Blob Charlie
      diff: Alice <del>Bob</del> <ins>Blob</ins> Charlie

      - Alice Bobby Charlie
      + Alice bb Charlie
      diff: Alice <del>Bobby</del> <ins>bb</ins> Charlie

      - Alice Zeneca Charlie
      + Alice AstraZeneca Charlie's
      diff: Alice <del>Zeneca Charlie</del> <ins>AstraZeneca Charlie's</ins>
---
Review notes (ignored by git am; not part of the commit message):
  * html_diff() is added between methods and is called as
    self.html_diff(old, new), so it now takes `self`; without it the
    call raises TypeError.
  * The final append is guarded with `hunks and`, so an empty hunk list
    returns '' instead of raising IndexError on hunks[-1].
  * <del>/<ins> markup was missing from this copy of the patch; it is
    restored here on the assumption that it matches
    simplediff.html_diff() -- confirm against the original commit.
    The HTML template context lines in show_diff() could not be
    recovered and are left as blank context lines, and the index
    hashes predate these edits.

 nytdiff.py | 93 ++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 90 insertions(+), 3 deletions(-)

diff --git a/nytdiff.py b/nytdiff.py
index e5fd615735..9261f434f4 100644
--- a/nytdiff.py
+++ b/nytdiff.py
@@ -15,7 +15,7 @@ from pytz import timezone
 
 import requests
 import tweepy
-from simplediff import html_diff
+from simplediff import string_diff
 from selenium import webdriver
 
 TIMEZONE = 'America/Buenos_Aires'
@@ -183,12 +183,99 @@ def strip_html(self, html_str):
                             styles=styles,
                             strip=strip)
 
+    def html_diff(self, old, new):
+        """
+        Like simplediff.html_diff(), with a tweak: if a hunk
+        consists of only deleting or only adding at the beginning
+        or end of a word, then it's combined into one hunk.
+
+        Examples:
+            Changes from simplediff:
+                - Alice, Bob and Charlie
+                + Alice, Bob, and Charlie
+                simplediff: Alice, <del>Bob</del> <ins>Bob,</ins> and Charlie
+                this diff:  Alice, Bob<ins>,</ins> and Charlie
+
+                - Alice Bob Charlie's Angels And David
+                + Alice Bob Charlie David
+                simplediff: Alice Bob <del>Charlie's Angels And</del> <ins>Charlie</ins> David
+                this diff:  Alice Bob Charlie<del>'s Angels And</del> David
+
+            Same as simplediff:
+                hunks you wouldn't want simplified:
+                    - Alice Bob Charlie
+                    + Alice Robert Charlie
+                    diff: Alice <del>Bob</del> <ins>Robert</ins> Charlie
+
+                if the change isn't only at the beginning or end:
+                    - Alice Bob Charlie
+                    + Alice Blob Charlie
+                    diff: Alice <del>Bob</del> <ins>Bob</ins> Charlie
+
+                    - Alice Bobby Charlie
+                    + Alice bb Charlie
+                    diff: Alice <del>Bobby</del> <ins>bb</ins>
+
+                    - Alice Zeneca Charlie
+                    + Alice AstraZeneca Charlie's
+                    diff: Alice <del>Zeneca Charlie</del> <ins>AstraZeneca Charlie's</ins>
+        """
+        def hunk_to_html(op, words):
+            words = ' '.join(words)
+            if op == '-':
+                return '<del>{}</del>'.format(words)
+            if op == '+':
+                return '<ins>{}</ins>'.format(words)
+            return words
+
+        hunks = string_diff(old, new)
+        html = []
+        skip_next = False
+        for (prev_op, prev_words), (next_op, next_words) in zip(hunks[:-1], hunks[1:]):
+            if prev_op == '-' and next_op == '+':
+                if len(prev_words) == 1:
+                    [old_word] = prev_words
+                    first_new_word, last_new_word = next_words[0], next_words[-1]
+                    if first_new_word.startswith(old_word):
+                        next_words[0] = old_word + '<ins>' + first_new_word[len(old_word):]
+                        html.append(' '.join(next_words) + '</ins>')
+                        skip_next = True
+                        continue
+                    elif last_new_word.endswith(old_word):
+                        next_words[-1] = last_new_word[:-len(old_word)] + '</ins>' + old_word
+                        html.append('<ins>' + ' '.join(next_words))
+                        skip_next = True
+                        continue
+                if len(next_words) == 1:
+                    [new_word] = next_words
+                    first_old_word, last_old_word = prev_words[0], prev_words[-1]
+                    if first_old_word.startswith(new_word):
+                        prev_words[0] = new_word + '<del>' + first_old_word[len(new_word):]
+                        html.append(' '.join(prev_words) + '</del>')
+                        skip_next = True
+                        continue
+                    elif last_old_word.endswith(new_word):
+                        prev_words[-1] = last_old_word[:-len(new_word)] + '</del>' + new_word
+                        html.append('<del>' + ' '.join(prev_words))
+                        skip_next = True
+                        continue
+            if skip_next:
+                skip_next = False
+                continue
+            html.append(hunk_to_html(prev_op, prev_words))
+
+        if hunks and not skip_next:
+            html.append(hunk_to_html(*(hunks[-1])))
+
+        return ' '.join(html)
+
     def show_diff(self, old, new):
         if len(old) == 0 or len(new) == 0:
             logging.info('Old or New empty')
             return False
         new_hash = hashlib.sha224(new.encode('utf8')).hexdigest()
-        logging.info(html_diff(old, new))
+        htmldiff = self.html_diff(old, new)
+        logging.info(htmldiff)
         html = """
 
 
@@ -202,7 +289,7 @@ def show_diff(self, old, new):
 
 
 
-        """.format(html_diff(old, new))
+        """.format(htmldiff)
         with open('tmp.html', 'w') as f:
             f.write(html)
 

From a5524c4885d08d050e2861d0c0a0b21a7d4c35f7 Mon Sep 17 00:00:00 2001
From: Han Seoul-Oh
Date: Tue, 4 May 2021 01:30:10 -0700
Subject: [PATCH 2/2] Fix html_diff docstring typos

---
 nytdiff.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/nytdiff.py b/nytdiff.py
index 9261f434f4..c2a7c69ecd 100644
--- a/nytdiff.py
+++ b/nytdiff.py
@@ -210,11 +210,11 @@ def html_diff(self, old, new):
                 if the change isn't only at the beginning or end:
                     - Alice Bob Charlie
                     + Alice Blob Charlie
-                    diff: Alice <del>Bob</del> <ins>Bob</ins> Charlie
+                    diff: Alice <del>Bob</del> <ins>Blob</ins> Charlie
 
                     - Alice Bobby Charlie
                     + Alice bb Charlie
-                    diff: Alice <del>Bobby</del> <ins>bb</ins>
+                    diff: Alice <del>Bobby</del> <ins>bb</ins> Charlie
 
                     - Alice Zeneca Charlie
                     + Alice AstraZeneca Charlie's