Feat/limit tweet (#12)

* Moved html to template and prevented multiple browsers
* Changed image diff to be a class
* Added script
* Added validator infrastructure
* Fixed script
* Added tweet validator support and content_validator.py
* Fixed CR comments
alonmln · Feb 3, 2021 · 29f0552 · 29f0552
1 parent 9409397
commit 29f0552
Show file tree

Hide file tree

Showing 9 changed files with 187 additions and 93 deletions.
diff --git a/base_parser.py b/base_parser.py
@@ -7,7 +7,7 @@
 
 from data_provider import DataProvider
 from twitter_helper import upload_media, tweet_text, tweet_with_media
-from image_diff_generator import generate_image_diff
+from image_diff_generator import ImageDiffGenerator
 
 if 'TESTING' in os.environ:
     if os.environ['TESTING'] == 'False':
@@ -38,13 +38,19 @@ def should_use_first_item_dedup(self):
     def get_source():
         raise NotImplemented()
 
-    def _validate_change(self, url: str, new: str):
-        return True
+    def get_integrity_validators(self):
+        return []
 
-    def validate_change(self, url: str, old: str, new: str):
-        if not self._validate_change(url, new):
-            logging.info(f"Detected error. old was {old} new was {new} url {url}")
-            return False
+    def get_tweet_validators(self):
+        return []
+
+    @staticmethod
+    def validate(validators: list, url: str, old: str, new: str):
+        for validator in validators:
+            if not validator.validate_change(url, old, new):
+                logging.info(
+                    f"Detected error. old was \n{old}\n new was \n{new}\n url {url} type: {validator.__name__}")
+                return False
         return True
 
     def tweet(self, text: str, article_id: str, url: str, image_path: str):
@@ -68,14 +74,14 @@ def tweet(self, text: str, article_id: str, url: str, image_path: str):
     def store_data(self, data: Dict):
         if self.data_provider.is_article_tracked(data['article_id'], self.get_source()):
             count = self.data_provider.get_article_version_count(data[
-                    'article_id'], self.get_source(), data['hash'])
+                                                                     'article_id'], self.get_source(), data['hash'])
             if count != 1:  # Changed
                 self.tweet_all_changes(data)
         else:
             self.data_provider.track_article(data)
 
     def tweet_change(self, previous_data: str, current_data: str, text_to_tweet: str, article_id: str, url: str):
-        saved_image_diff_path = generate_image_diff(previous_data, current_data, text_to_tweet)
+        saved_image_diff_path = ImageDiffGenerator.generate_image_diff(previous_data, current_data, text_to_tweet)
         self.tweet(text_to_tweet, article_id, url, saved_image_diff_path)
 
     def tweet_all_changes(self, data: Dict):
@@ -85,25 +91,26 @@ def tweet_all_changes(self, data: Dict):
 
         save_to_db = False
 
-        if self.should_tweet(url, previous_version['title'], data['title']):
-            self.tweet_change(previous_version['title'], data['title'], "שינוי בכותרת", article_id, url)
+        if self.validate(self.get_integrity_validators(), url, previous_version['title'], data['title']):
             save_to_db = True
+            if self.should_tweet(url, previous_version['title'], data['title']):
+                self.tweet_change(previous_version['title'], data['title'], "שינוי בכותרת", article_id, url)
 
-        if self.should_tweet(url, previous_version['abstract'], data['abstract']):
-            self.tweet_change(previous_version['abstract'], data['abstract'], "שינוי בתת כותרת", article_id, url)
+        if self.validate(self.get_integrity_validators(), url, previous_version['abstract'], data['abstract']):
             save_to_db = True
+            if self.should_tweet(url, previous_version['abstract'], data['abstract']):
+                self.tweet_change(previous_version['abstract'], data['abstract'], "שינוי בתת כותרת", article_id, url)
 
         if save_to_db:
             self.data_provider.increase_article_version(data)
 
     def should_tweet(self, url: str, previous_data: str, current_data: str):
         if len(previous_data) == 0 or len(current_data) == 0:
-            logging.info('Old or New empty')
             return False
         if previous_data == current_data:
             return False
-        if not self.validate_change(url, previous_data, current_data):
-            return
+        if not self.validate(self.get_tweet_validators(), url, previous_data, current_data):
+            return False
 
         return True
 
@@ -120,4 +127,4 @@ def loop_entries(self, entries):
             try:
                 self.store_data(article_dict)
             except BaseException as e:
-                logging.exception(f'Problem looping entry: {article_dict}')
+                logging.exception(f'Problem looping entry: {article_dict}')
diff --git a/css/styles.css b/css/styles.css
@@ -40,6 +40,7 @@ img {
     vertical-align: middle;
     margin-right: -5px;
     margin-bottom: 5px;
+    width: 30px;
 }
 
 .alignleft {

diff --git a/haaretz_parser.py b/haaretz_parser.py
@@ -2,7 +2,8 @@
 import hashlib
 from datetime import datetime
 
-from validators import validate_string_in_html
+import validators.html_validator
+import validators.content_validator
 from rss_parser import RSSParser
 
 HAARETZ_RSS = "https://www.haaretz.co.il/cmlink/1.1617539"
@@ -20,9 +21,11 @@ def get_source():
     def should_use_first_item_dedup(self):
         return True
 
-    def _validate_change(self, url: str, new: str):
-        return validate_string_in_html(url, new)
+    def get_integrity_validators(self):
+        return [validators.html_validator]
 
+    def get_tweet_validators(self):
+        return [validators.content_validator]
 
     def entry_to_dict(self, article):
         article_dict = dict()
@@ -36,4 +39,3 @@ def entry_to_dict(self, article):
             repr(od.items()).encode('utf-8')).hexdigest()
         article_dict['date_time'] = datetime.now(self.tz)
         return article_dict
-
diff --git a/image_diff_generator.py b/image_diff_generator.py
@@ -8,80 +8,64 @@
 
 from html_utils import strip_html
 
-PHANTOMJS_PATH = os.environ['PHANTOMJS_PATH']
 
+class ImageDiffGenerator:
+    html_template = None
+    driver = None
+    phantomjs_path = None
 
-def generate_image_diff(old: str, new: str, text_to_tweet: str):
-    stripped_old = strip_html(old)
-    stripped_new = strip_html(new)
-    new_hash = hashlib.sha224(stripped_new.encode('utf8')).hexdigest()
-    diff_html = html_diff(stripped_old, stripped_new)
-    html = f"""
-    <!doctype html>
-    <html lang="en">
+    @staticmethod
+    def init():
+        if ImageDiffGenerator.html_template is None:
+            with open("template.html", "r", encoding="utf-8") as html_file:
+                ImageDiffGenerator.html_template = html_file.read()
 
-    <head>
-      <meta charset="utf-8">
-      <link rel="stylesheet" href="./css/styles.css">
-    </head>
+            ImageDiffGenerator.phantomjs_path = os.environ['PHANTOMJS_PATH']
+            ImageDiffGenerator.driver = webdriver.PhantomJS(executable_path=ImageDiffGenerator.phantomjs_path)
 
-    <body style="width: 500px;">
-      <div id="wrapper">
-        <div>
-          {text_to_tweet}:
-        </div>
-        <p>
-          {diff_html}
-        </p>
-        <div>
-            <p class="alignleft">
-              <img src="img/twitter.png" width="30">
-              @ILNewsDiff
-              <span class="alignright">
-                כותרת בשינוי אדרת
-              </span>
-            </p>
-        </div>
-      </div>
-    </body>
+    @staticmethod
+    def generate_image_diff(old: str, new: str, text_to_tweet: str):
+        ImageDiffGenerator.init()
+        stripped_old = strip_html(old)
+        stripped_new = strip_html(new)
+        new_hash = hashlib.sha224(stripped_new.encode('utf8')).hexdigest()
+        diff_html = html_diff(stripped_old, stripped_new)
 
-    </html>
-    """
-    with open('tmp.html', 'w', encoding="utf-8") as f:
-        f.write(html)
-    driver = webdriver.PhantomJS(
-        executable_path=PHANTOMJS_PATH)
+        html = ImageDiffGenerator.html_template.replace("text_to_tweet", text_to_tweet) \
+            .replace("diff_html", diff_html)
 
-    driver.get('tmp.html')
+        with open('tmp.html', 'w', encoding="utf-8") as f:
+            f.write(html)
 
-    e = driver.find_element_by_id('wrapper')
-    start_height = e.location['y']
-    block_height = e.size['height']
-    end_height = start_height
-    start_width = e.location['x']
-    block_width = e.size['width']
-    end_width = start_width
-    total_height = start_height + block_height + end_height
-    total_width = 510  # Override because body width is set to 500
-    timestamp = str(int(time.time()))
-    driver.save_screenshot('./tmp.png')
-    img = Image.open('./tmp.png')
-    img2 = img.crop((0, 0, total_width, total_height))
-    if int(total_width) > int(total_height * 2):
-        background = Image.new('RGBA', (total_width, int(total_width / 2)),
-                               (255, 255, 255, 0))
-        bg_w, bg_h = background.size
-        offset = (int((bg_w - total_width) / 2),
-                  int((bg_h - total_height) / 2))
-    else:
-        background = Image.new('RGBA', (total_width, total_height),
-                               (255, 255, 255, 0))
-        bg_w, bg_h = background.size
-        offset = (int((bg_w - total_width) / 2),
-                  int((bg_h - total_height) / 2))
-    background.paste(img2, offset)
-    filename = timestamp + new_hash
-    saved_file_path = f'./output/{filename}.png'
-    background.save(saved_file_path)
-    return saved_file_path
+        ImageDiffGenerator.driver.get('tmp.html')
 
+        e = ImageDiffGenerator.driver.find_element_by_id('wrapper')
+        start_height = e.location['y']
+        block_height = e.size['height']
+        end_height = start_height
+        start_width = e.location['x']
+        block_width = e.size['width']
+        end_width = start_width
+        total_height = start_height + block_height + end_height
+        total_width = 510  # Override because body width is set to 500
+        timestamp = str(int(time.time()))
+        ImageDiffGenerator.driver.save_screenshot('./tmp.png')
+        img = Image.open('./tmp.png')
+        img2 = img.crop((0, 0, total_width, total_height))
+        if int(total_width) > int(total_height * 2):
+            background = Image.new('RGBA', (total_width, int(total_width / 2)),
+                                   (255, 255, 255, 0))
+            bg_w, bg_h = background.size
+            offset = (int((bg_w - total_width) / 2),
+                      int((bg_h - total_height) / 2))
+        else:
+            background = Image.new('RGBA', (total_width, total_height),
+                                   (255, 255, 255, 0))
+            bg_w, bg_h = background.size
+            offset = (int((bg_w - total_width) / 2),
+                      int((bg_h - total_height) / 2))
+        background.paste(img2, offset)
+        filename = timestamp + new_hash
+        saved_file_path = f'./output/{filename}.png'
+        background.save(saved_file_path)
+        return saved_file_path
diff --git a/israel_hayom_parser.py b/israel_hayom_parser.py
@@ -2,7 +2,7 @@
 import hashlib
 from datetime import datetime
 
-from validators import validate_string_in_html
+import validators.content_validator
 from rss_parser import RSSParser
 
 ISRAEL_HAYOM_RSS = "https://www.israelhayom.co.il/rss.xml"
@@ -20,6 +20,9 @@ def get_source():
     def should_use_first_item_dedup(self):
         return True
 
+    def get_tweet_validators(self):
+        return [validators.content_validator]
+
     def entry_to_dict(self, article):
         article_dict = dict()
         article_dict['article_id'] = article.guid

diff --git a/scripts/validate_parser.py b/scripts/validate_parser.py
@@ -0,0 +1,35 @@
+from pytz import timezone
+
+import feedparser
+
+from validators import html_validator
+from israel_hayom_parser import IsraelHayomParser as Parser
+
+TIMEZONE = 'Israel'
+LOCAL_TZ = timezone(TIMEZONE)
+
+
+def main():
+    parser = Parser(LOCAL_TZ)
+    r = feedparser.parse(parser.url)
+
+    if r is None:
+        print("RSS was empty")
+        return
+
+    print(f"Checking {len(r.entries)} entries")
+    for entry in r.entries[::-1]:
+        entry_dict = parser.entry_to_dict(entry)
+        url = entry_dict["url"]
+        title = entry_dict["title"]
+        description = entry_dict["abstract"]
+        if not html_validator.validate_change(url, "", title):
+            print(f"Could not find title \n{title} \nin {url}")
+        if not html_validator.validate_change(url, "", description):
+            print(f"Could not find description \n{description}\nin {url}")
+
+    print("Finished iterating")
+
+
+if __name__ == '__main__':
+    main()
diff --git a/template.html b/template.html
@@ -0,0 +1,27 @@
+<!doctype html>
+<html lang="en">
+    <head>
+        <meta charset="utf-8">
+        <link rel="stylesheet" href="./css/styles.css">
+    </head>
+
+    <body style="width: 500px;">
+        <div id="wrapper">
+            <div>
+                text_to_tweet:
+            </div>
+            <p>
+                diff_html
+            </p>
+            <div>
+                <p class="alignleft">
+                    <img src="img/twitter.png"/>
+                    @ILNewsDiff
+                    <span class="alignright">
+                        כותרת בשינוי אדרת
+                    </span>
+                </p>
+            </div>
+        </div>
+    </body>
+</html>
diff --git a/validators/content_validator.py b/validators/content_validator.py
@@ -0,0 +1,35 @@
+import math
+import re
+
+
+def one_char_difference(first: str, second: str):
+    """
+    Checks if there is at most a one-char difference (substitution, insertion or deletion) between first and second
+    """
+    if math.fabs(len(first) - len(second)) >= 2:
+        return False
+
+    for i, (c1, c2) in enumerate(zip(first, second)):
+        if c1 != c2:
+            return first[i + 1:] == second[i + 1:] \
+                   or first[i:] == second[i + 1:] \
+                   or first[i + 1:] == second[i:]
+
+    return True
+
+
+ALPHABET_WITH_HEBREW_PATTERN = r"[^a-zA-Zא-ת0-9]"
+
+
+def validate_change(url: str, old: str, new: str):
+    """
+    Returns True if old and new differ by more than 1 char when only letters (Latin/Hebrew) and digits are compared,
+    or if one has '?' and the other doesn't
+    """
+    if ("?" in old and "?" not in new) or ("?" in new and "?" not in old):
+        return True
+
+    old_stripped = re.sub(ALPHABET_WITH_HEBREW_PATTERN, '', old)
+    new_stripped = re.sub(ALPHABET_WITH_HEBREW_PATTERN, '', new)
+
+    return not one_char_difference(old_stripped, new_stripped)
diff --git a/validators.py → validators/html_validator.py b/validators.py → validators/html_validator.py
@@ -2,7 +2,7 @@
 from bs4 import BeautifulSoup
 
 
-def validate_string_in_html(url: str, string_to_validate: str):
+def validate_change(url: str, old: str, new: str):
     page = requests.get(url)
     soup = BeautifulSoup(page.content, "html.parser")
-    return soup.find(string=string_to_validate) is not None
+    return soup.find(string=new) is not None