Skip to content

Commit

Permalink
Feat/limit tweet (#12)
Browse files Browse the repository at this point in the history
* Moved html to template and prevented multiple browsers

* Changed image diff to be a class

* Added script

* Added validator infrastructure

* Fixed script

* Added tweet validator support and content_validator.py

* Fixed CR comments
  • Loading branch information
alonmln authored Feb 3, 2021
1 parent 9409397 commit 29f0552
Show file tree
Hide file tree
Showing 9 changed files with 187 additions and 93 deletions.
41 changes: 24 additions & 17 deletions base_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

from data_provider import DataProvider
from twitter_helper import upload_media, tweet_text, tweet_with_media
from image_diff_generator import generate_image_diff
from image_diff_generator import ImageDiffGenerator

if 'TESTING' in os.environ:
if os.environ['TESTING'] == 'False':
Expand Down Expand Up @@ -38,13 +38,19 @@ def should_use_first_item_dedup(self):
def get_source():
    """Placeholder for the parser's source identifier.

    This base version has no source of its own and always raises.

    Raises:
        NotImplementedError: always.
    """
    # NotImplemented is a comparison sentinel and is not callable; calling it
    # raised TypeError instead of signalling an unimplemented method.
    raise NotImplementedError()

def _validate_change(self, url: str, new: str):
return True
def get_integrity_validators(self):
return []

def validate_change(self, url: str, old: str, new: str):
if not self._validate_change(url, new):
logging.info(f"Detected error. old was {old} new was {new} url {url}")
return False
def get_tweet_validators(self):
return []

@staticmethod
def validate(validators: list, url: str, old: str, new: str):
for validator in validators:
if not validator.validate_change(url, old, new):
logging.info(
f"Detected error. old was \n{old}\n new was \n{new}\n url {url} type: {validator.__name__}")
return False
return True

def tweet(self, text: str, article_id: str, url: str, image_path: str):
Expand All @@ -68,14 +74,14 @@ def tweet(self, text: str, article_id: str, url: str, image_path: str):
def store_data(self, data: Dict):
if self.data_provider.is_article_tracked(data['article_id'], self.get_source()):
count = self.data_provider.get_article_version_count(data[
'article_id'], self.get_source(), data['hash'])
'article_id'], self.get_source(), data['hash'])
if count != 1: # Changed
self.tweet_all_changes(data)
else:
self.data_provider.track_article(data)

def tweet_change(self, previous_data: str, current_data: str, text_to_tweet: str, article_id: str, url: str):
saved_image_diff_path = generate_image_diff(previous_data, current_data, text_to_tweet)
saved_image_diff_path = ImageDiffGenerator.generate_image_diff(previous_data, current_data, text_to_tweet)
self.tweet(text_to_tweet, article_id, url, saved_image_diff_path)

def tweet_all_changes(self, data: Dict):
Expand All @@ -85,25 +91,26 @@ def tweet_all_changes(self, data: Dict):

save_to_db = False

if self.should_tweet(url, previous_version['title'], data['title']):
self.tweet_change(previous_version['title'], data['title'], "שינוי בכותרת", article_id, url)
if self.validate(self.get_integrity_validators(), url, previous_version['title'], data['title']):
save_to_db = True
if self.should_tweet(url, previous_version['title'], data['title']):
self.tweet_change(previous_version['title'], data['title'], "שינוי בכותרת", article_id, url)

if self.should_tweet(url, previous_version['abstract'], data['abstract']):
self.tweet_change(previous_version['abstract'], data['abstract'], "שינוי בתת כותרת", article_id, url)
if self.validate(self.get_integrity_validators(), url, previous_version['abstract'], data['abstract']):
save_to_db = True
if self.should_tweet(url, previous_version['abstract'], data['abstract']):
self.tweet_change(previous_version['abstract'], data['abstract'], "שינוי בתת כותרת", article_id, url)

if save_to_db:
self.data_provider.increase_article_version(data)

def should_tweet(self, url: str, previous_data: str, current_data: str):
if len(previous_data) == 0 or len(current_data) == 0:
logging.info('Old or New empty')
return False
if previous_data == current_data:
return False
if not self.validate_change(url, previous_data, current_data):
return
if not self.validate(self.get_tweet_validators(), url, previous_data, current_data):
return False

return True

Expand All @@ -120,4 +127,4 @@ def loop_entries(self, entries):
try:
self.store_data(article_dict)
except BaseException as e:
logging.exception(f'Problem looping entry: {article_dict}')
logging.exception(f'Problem looping entry: {article_dict}')
1 change: 1 addition & 0 deletions css/styles.css
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ img {
vertical-align: middle;
margin-right: -5px;
margin-bottom: 5px;
width: 30px;
}

.alignleft {
Expand Down
10 changes: 6 additions & 4 deletions haaretz_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@
import hashlib
from datetime import datetime

from validators import validate_string_in_html
import validators.html_validator
import validators.content_validator
from rss_parser import RSSParser

HAARETZ_RSS = "https://www.haaretz.co.il/cmlink/1.1617539"
Expand All @@ -20,9 +21,11 @@ def get_source():
def should_use_first_item_dedup(self):
return True

def _validate_change(self, url: str, new: str):
return validate_string_in_html(url, new)
def get_integrity_validators(self):
return [validators.html_validator]

def get_tweet_validators(self):
return [validators.content_validator]

def entry_to_dict(self, article):
article_dict = dict()
Expand All @@ -36,4 +39,3 @@ def entry_to_dict(self, article):
repr(od.items()).encode('utf-8')).hexdigest()
article_dict['date_time'] = datetime.now(self.tz)
return article_dict

122 changes: 53 additions & 69 deletions image_diff_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,80 +8,64 @@

from html_utils import strip_html

PHANTOMJS_PATH = os.environ['PHANTOMJS_PATH']

class ImageDiffGenerator:
html_template = None
driver = None
phantomjs_path = None

def generate_image_diff(old: str, new: str, text_to_tweet: str):
stripped_old = strip_html(old)
stripped_new = strip_html(new)
new_hash = hashlib.sha224(stripped_new.encode('utf8')).hexdigest()
diff_html = html_diff(stripped_old, stripped_new)
html = f"""
<!doctype html>
<html lang="en">
@staticmethod
def init():
if ImageDiffGenerator.html_template is None:
with open("template.html", "r", encoding="utf-8") as html_file:
ImageDiffGenerator.html_template = html_file.read()

<head>
<meta charset="utf-8">
<link rel="stylesheet" href="./css/styles.css">
</head>
ImageDiffGenerator.phantomjs_path = os.environ['PHANTOMJS_PATH']
ImageDiffGenerator.driver = webdriver.PhantomJS(executable_path=ImageDiffGenerator.phantomjs_path)

<body style="width: 500px;">
<div id="wrapper">
<div>
{text_to_tweet}:
</div>
<p>
{diff_html}
</p>
<div>
<p class="alignleft">
<img src="img/twitter.png" width="30">
@ILNewsDiff
<span class="alignright">
כותרת בשינוי אדרת
</span>
</p>
</div>
</div>
</body>
@staticmethod
def generate_image_diff(old: str, new: str, text_to_tweet: str):
ImageDiffGenerator.init()
stripped_old = strip_html(old)
stripped_new = strip_html(new)
new_hash = hashlib.sha224(stripped_new.encode('utf8')).hexdigest()
diff_html = html_diff(stripped_old, stripped_new)

</html>
"""
with open('tmp.html', 'w', encoding="utf-8") as f:
f.write(html)
driver = webdriver.PhantomJS(
executable_path=PHANTOMJS_PATH)
html = ImageDiffGenerator.html_template.replace("text_to_tweet", text_to_tweet) \
.replace("diff_html", diff_html)

driver.get('tmp.html')
with open('tmp.html', 'w', encoding="utf-8") as f:
f.write(html)

e = driver.find_element_by_id('wrapper')
start_height = e.location['y']
block_height = e.size['height']
end_height = start_height
start_width = e.location['x']
block_width = e.size['width']
end_width = start_width
total_height = start_height + block_height + end_height
total_width = 510 # Override because body width is set to 500
timestamp = str(int(time.time()))
driver.save_screenshot('./tmp.png')
img = Image.open('./tmp.png')
img2 = img.crop((0, 0, total_width, total_height))
if int(total_width) > int(total_height * 2):
background = Image.new('RGBA', (total_width, int(total_width / 2)),
(255, 255, 255, 0))
bg_w, bg_h = background.size
offset = (int((bg_w - total_width) / 2),
int((bg_h - total_height) / 2))
else:
background = Image.new('RGBA', (total_width, total_height),
(255, 255, 255, 0))
bg_w, bg_h = background.size
offset = (int((bg_w - total_width) / 2),
int((bg_h - total_height) / 2))
background.paste(img2, offset)
filename = timestamp + new_hash
saved_file_path = f'./output/{filename}.png'
background.save(saved_file_path)
return saved_file_path
ImageDiffGenerator.driver.get('tmp.html')

e = ImageDiffGenerator.driver.find_element_by_id('wrapper')
start_height = e.location['y']
block_height = e.size['height']
end_height = start_height
start_width = e.location['x']
block_width = e.size['width']
end_width = start_width
total_height = start_height + block_height + end_height
total_width = 510 # Override because body width is set to 500
timestamp = str(int(time.time()))
ImageDiffGenerator.driver.save_screenshot('./tmp.png')
img = Image.open('./tmp.png')
img2 = img.crop((0, 0, total_width, total_height))
if int(total_width) > int(total_height * 2):
background = Image.new('RGBA', (total_width, int(total_width / 2)),
(255, 255, 255, 0))
bg_w, bg_h = background.size
offset = (int((bg_w - total_width) / 2),
int((bg_h - total_height) / 2))
else:
background = Image.new('RGBA', (total_width, total_height),
(255, 255, 255, 0))
bg_w, bg_h = background.size
offset = (int((bg_w - total_width) / 2),
int((bg_h - total_height) / 2))
background.paste(img2, offset)
filename = timestamp + new_hash
saved_file_path = f'./output/{filename}.png'
background.save(saved_file_path)
return saved_file_path
5 changes: 4 additions & 1 deletion israel_hayom_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import hashlib
from datetime import datetime

from validators import validate_string_in_html
import validators.content_validator
from rss_parser import RSSParser

ISRAEL_HAYOM_RSS = "https://www.israelhayom.co.il/rss.xml"
Expand All @@ -20,6 +20,9 @@ def get_source():
def should_use_first_item_dedup(self):
return True

def get_tweet_validators(self):
return [validators.content_validator]

def entry_to_dict(self, article):
article_dict = dict()
article_dict['article_id'] = article.guid
Expand Down
35 changes: 35 additions & 0 deletions scripts/validate_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
from pytz import timezone

import feedparser

from validators import html_validator
from israel_hayom_parser import IsraelHayomParser as Parser

TIMEZONE = 'Israel'
LOCAL_TZ = timezone(TIMEZONE)


def main():
    """Check each RSS entry's title and abstract against its article page.

    Parses the feed at ``parser.url``, converts every entry with
    ``parser.entry_to_dict`` and prints a message for each title or abstract
    for which ``html_validator.validate_change`` returns a falsy result.
    """
    parser = Parser(LOCAL_TZ)
    feed = feedparser.parse(parser.url)

    # feedparser.parse() hands back a result object even when fetching or
    # parsing fails, so an empty feed shows up as an empty ``entries`` list
    # rather than as None.
    if feed is None or not feed.entries:
        print("RSS was empty")
        return

    print(f"Checking {len(feed.entries)} entries")
    # Entries are visited in the reverse of the order the feed lists them.
    for entry in reversed(feed.entries):
        entry_dict = parser.entry_to_dict(entry)
        url = entry_dict["url"]
        title = entry_dict["title"]
        description = entry_dict["abstract"]
        if not html_validator.validate_change(url, "", title):
            print(f"Could not find title \n{title} \nin {url}")
        if not html_validator.validate_change(url, "", description):
            print(f"Could not find description \n{description}\nin {url}")

    print("Finished iterating")


if __name__ == '__main__':
main()
27 changes: 27 additions & 0 deletions template.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<link rel="stylesheet" href="./css/styles.css">
</head>

<body style="width: 500px;">
<div id="wrapper">
<div>
text_to_tweet:
</div>
<p>
diff_html
</p>
<div>
<p class="alignleft">
<img src="img/twitter.png"/>
@ILNewsDiff
<span class="alignright">
כותרת בשינוי אדרת
</span>
</p>
</div>
</div>
</body>
</html>
35 changes: 35 additions & 0 deletions validators/content_validator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import math
import re


def one_char_difference(first: str, second: str):
    """
    Return True when first and second are equal or differ by a single
    substituted, inserted or deleted character; False otherwise.
    """
    if abs(len(first) - len(second)) > 1:
        return False

    # Index of the first position where the two strings disagree, if any.
    mismatch_at = next(
        (pos for pos, (left, right) in enumerate(zip(first, second)) if left != right),
        None,
    )
    if mismatch_at is None:
        # One string is a prefix of the other and the lengths differ by at most one.
        return True

    first_tail = first[mismatch_at + 1:]
    second_tail = second[mismatch_at + 1:]
    if first_tail == second_tail:
        # Same length, one character replaced.
        return True
    if first[mismatch_at:] == second_tail:
        # second carries one extra character at the mismatch position.
        return True
    # Remaining possibility: first carries the extra character.
    return first_tail == second[mismatch_at:]


# Regex character class matching any single character that is NOT an ASCII
# letter, a Hebrew letter in the range א-ת, or an ASCII digit.
ALPHABET_WITH_HEBREW_PATTERN = r"[^a-zA-Zא-ת0-9]"


def validate_change(url: str, old: str, new: str):
    """
    Decide whether the change from old to new passes this validator.

    Returns True when exactly one of the two texts contains '?'. Otherwise every
    character matched by ALPHABET_WITH_HEBREW_PATTERN is removed from both texts and
    the result is True only when one_char_difference returns False for the
    remainders. url is accepted but not used by this check.
    """
    if ("?" in old) != ("?" in new):
        return True

    stripped_old, stripped_new = (
        re.sub(ALPHABET_WITH_HEBREW_PATTERN, '', text) for text in (old, new)
    )
    return not one_char_difference(stripped_old, stripped_new)
4 changes: 2 additions & 2 deletions validators.py → validators/html_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from bs4 import BeautifulSoup


def validate_string_in_html(url: str, string_to_validate: str):
def validate_change(url: str, old: str, new: str):
page = requests.get(url)
soup = BeautifulSoup(page.content, "html.parser")
return soup.find(string=string_to_validate) is not None
return soup.find(string=new) is not None

0 comments on commit 29f0552

Please sign in to comment.