Skip to content

Commit

Permalink
Update news scraper
Browse files Browse the repository at this point in the history
  • Loading branch information
noidexe committed Dec 3, 2022
1 parent b6cb8b4 commit d598cb5
Show file tree
Hide file tree
Showing 3 changed files with 64 additions and 19 deletions.
79 changes: 62 additions & 17 deletions scenes/NewsFeed.gd
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@ var news_item_scene = preload("res://scenes/NewsItem.tscn")

const news_cache_file = "user://news_cache.bin"

const VOID_HTML_ELEMENTS = ["area", "base", "br", "col", "command", "embed", "hr",
"img", "input", "keygen", "link", "meta", "param", "source", "track", "wbr"]

onready var feed_vbox = $"Feed"
onready var loading_text = $"Feed/Loading"

Expand Down Expand Up @@ -58,7 +61,6 @@ func _save_news_cache(news : Array):
# Analyzes html for <div class="news-item"> elements
# which will be further parsed by _parse_news_item()
func _get_news(buffer) -> Array:
var parsed_news = []

var xml = XMLParser.new()
var error = xml.open_buffer(buffer)
Expand All @@ -72,20 +74,40 @@ func _get_news(buffer) -> Array:
print("Error %s reading XML" % err)
break

# Look for <div class="news-item"> elements
# Look for <div class="posts"> element
if xml.get_node_type() == XMLParser.NODE_ELEMENT and xml.get_node_name() == "div":
var class_attr = xml.get_named_attribute_value_safe("class")
# Take note of the offsets from <div class="news-item> to </div>
# Take note of the offsets from <div class="posts"> to </div>
# to further analyze
if "news-item" in class_attr:
if "posts" in class_attr:
var tag_open_offset = xml.get_node_offset()
xml.skip_section()
xml.read()
_skip_section_handle_void_elements(xml)
var tag_close_offset = xml.get_node_offset()
parsed_news.append(_parse_news_item(buffer, tag_open_offset, tag_close_offset))

return _parse_posts(buffer, tag_open_offset, tag_close_offset)
else:
print("Error %s getting download info" % error)
# return an empty array by default
return []



func _parse_posts(buffer, begin_ofs, end_ofs):
var parsed_news = []

var xml = XMLParser.new()
var error = xml.open_buffer(buffer)
if error != OK:
printerr("Error parsing news item. Error code: %s" % error)
xml.seek(begin_ofs) # automatically does xml.read()
while(xml.get_node_offset() != end_ofs):
if xml.get_node_type() == XMLParser.NODE_ELEMENT:
match xml.get_node_name():
"a":
var tag_open_offset = xml.get_node_offset()
_skip_section_handle_void_elements(xml)
var tag_close_offset = xml.get_node_offset()
parsed_news.append(_parse_news_item(buffer, tag_open_offset, tag_close_offset))
xml.read()
return parsed_news


Expand All @@ -106,30 +128,33 @@ func _parse_news_item(buffer, begin_ofs, end_ofs):
while(xml.get_node_offset() != end_ofs):
if xml.get_node_type() == XMLParser.NODE_ELEMENT:
match xml.get_node_name():
"a":
parsed_item["link"] = xml.get_named_attribute_value_safe("href")
"div":
if "image" in xml.get_named_attribute_value_safe("class"):
if "thumbnail" in xml.get_named_attribute_value_safe("class"):
var image_style = xml.get_named_attribute_value_safe("style")
var url_start = image_style.find("'") + 1
var url_end = image_style.find_last("'")
var image_url = image_style.substr(url_start,url_end - url_start)

parsed_item["image"] = image_url
parsed_item["link"] = xml.get_named_attribute_value_safe("href")
"h3":
if "title" in xml.get_named_attribute_value_safe("class"):
xml.read()
parsed_item["title"] = xml.get_node_data().strip_edges() if xml.get_node_type() == XMLParser.NODE_TEXT else ""
xml.read()
parsed_item["title"] = xml.get_node_data().strip_edges() if xml.get_node_type() == XMLParser.NODE_TEXT else ""
"h4":
if "author" in xml.get_named_attribute_value_safe("class"):
xml.read()
parsed_item["author"] = xml.get_node_data().strip_edges() if xml.get_node_type() == XMLParser.NODE_TEXT else ""
"span":
if "date" in xml.get_named_attribute_value_safe("class"):
xml.read()
parsed_item["date"] = xml.get_node_data().strip_edges() if xml.get_node_type() == XMLParser.NODE_TEXT else ""
parsed_item["date"] = xml.get_node_data().strip_edges().lstrip("&nbsp;-&nbsp;") if xml.get_node_type() == XMLParser.NODE_TEXT else ""
if "by" in xml.get_named_attribute_value_safe("class"):
xml.read()
parsed_item["author"] = xml.get_node_data().strip_edges() if xml.get_node_type() == XMLParser.NODE_TEXT else ""
"p":
xml.read()
parsed_item["contents"] = xml.get_node_data().strip_edges() if xml.get_node_type() == XMLParser.NODE_TEXT else ""
if "excerpt" in xml.get_named_attribute_value_safe("class"):
xml.read()
parsed_item["contents"] = xml.get_node_data().strip_edges() if xml.get_node_type() == XMLParser.NODE_TEXT else ""
xml.read()

# Return the dictionary with the news entry once we are done
Expand All @@ -138,3 +163,23 @@ func _parse_news_item(buffer, begin_ofs, end_ofs):
func _on_screen_resized():
visible = get_viewport_rect().size.x > 1046



# This is line by line a gdscript implementation of XMLParser
# the only difference is that void html elements do not increase
# tagcount
func _skip_section_handle_void_elements(xml : XMLParser ):
if xml.is_empty():
return

var tagcount : int = 1

while tagcount and xml.read() == OK:
if (
xml.get_node_type() == XMLParser.NODE_ELEMENT and
!xml.is_empty() and
!xml.get_node_name() in VOID_HTML_ELEMENTS
):
tagcount += 1
elif xml.get_node_type() == XMLParser.NODE_ELEMENT_END:
tagcount -= 1
2 changes: 1 addition & 1 deletion scenes/NewsItem.gd
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ func set_info(info : Dictionary):
url = info.link
hint_tooltip = url
title.text = info.title
author.text = "%s (%s)" % [info.author, info.date]
author.text = "%s - %s" % [info.author, info.date]
contents.text = info.contents
_load_image(info.image)

Expand Down
2 changes: 1 addition & 1 deletion scenes/Version.gd
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ const api_endpoint = "https://api.github.com/repos/noidexe/godot-version-manager
# Add '-devel' for versions not intended for release
# Remove '-devel' when commiting a build to be tagged as release
# Remember to update version in export settings before exporting
const version_tag = "v1.6.1"
const version_tag = "v1.6.2"

# Initialized to the release list page as a fallback in case it fails to
# get the link to the latest release for some reason
Expand Down

0 comments on commit d598cb5

Please sign in to comment.