Skip to content

Commit

Permalink
Drop now useless html_get_article test. Simplify code.
Browse files Browse the repository at this point in the history
  • Loading branch information
lwindolf committed Jan 16, 2025
1 parent 57b4687 commit 0dfc348
Show file tree
Hide file tree
Showing 4 changed files with 2 additions and 139 deletions.
28 changes: 1 addition & 27 deletions src/html.c
Original file line number Diff line number Diff line change
Expand Up @@ -325,30 +325,4 @@ html_discover_favicon (const gchar * data, const gchar * defaultBaseUri)
xmlFreeDoc (doc);

return results;
}

/**
 * html_get_article:
 * @data:	the HTML document to process (may be NULL)
 * @baseUri:	URI of the downloaded HTML; handed on to xhtml_extract() and
 *		used in the debug message on parse failure
 *
 * Parses @data with xhtml_parse() and returns whatever xhtml_extract()
 * produces for the root element of the parsed document.
 *
 * Returns: the string returned by xhtml_extract(), or NULL if @data is NULL
 *          or could not be parsed. NOTE(review): callers in the test suite
 *          release the result with g_free() -- confirm against xhtml_extract().
 */
gchar *
html_get_article (const gchar *data, const gchar *baseUri) {
	xmlDocPtr	doc;
	xmlNodePtr	root;
	gchar		*result = NULL;

	/* strlen() on a NULL pointer is undefined behaviour, so treat a
	   missing document like an unparsable one */
	if (!data)
		return NULL;

	doc = xhtml_parse ((gchar *)data, strlen (data));
	if (!doc) {
		debug (DEBUG_PARSING, "XHTML parsing error on '%s'", baseUri);
		return NULL;
	}

	/* In the past we did some special extraction, but to allow Readability.js
	   to extract different things like LD+JSON we now just pass the whole doc */
	root = xmlDocGetRootElement (doc);
	result = xhtml_extract (root, 1, baseUri);	/* NOTE(review): meaning of the literal 1 is defined by xhtml_extract() */
	xmlFreeDoc (doc);

	return result;
}

/*
 * Look up the AMP link of an HTML document.
 *
 * Delegates to search_links_dirty() with the LINK_AMPHTML selector and
 * returns its result unchanged.
 */
gchar *
html_get_amp_url (const gchar *data) {
	gchar *ampUrl = search_links_dirty (data, LINK_AMPHTML);

	return ampUrl;
}
}
23 changes: 0 additions & 23 deletions src/html.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,27 +49,4 @@ GSList * html_auto_discover_feed(const gchar* data, const gchar *baseUri);
*/
GSList * html_discover_favicon(const gchar* data, const gchar *baseUri);

/**
* html_get_article:
*
* Parse HTML as XHTML to extract the HTML5 article it contains.
*
* @data: the HTML to check
* @baseUri: URI of the downloaded HTML used to resolve relative URIs
*
* Returns: XHTML fragment representing the <article> or NULL
*/
gchar * html_get_article(const gchar *data, const gchar *baseUri);

/**
* html_get_amp_url:
*
* Parses HTML and returns the AMP URL if found
*
* @data: the HTML to check
*
* Returns: AMP URL or NULL. Must be freed by the caller
*/
gchar * html_get_amp_url(const gchar *data);

#endif
2 changes: 1 addition & 1 deletion src/node_providers/feed.c
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,7 @@ feed_enrich_item_cb (const UpdateResult * const result, gpointer userdata, updat
if (!item)
return;

article = html_get_article (result->data, result->source);
article = xhtml_extract_from_string (result->data, result->source);
if (article) {
// Enable AMP images by replacing <amp-img> by <img>
gchar **tmp_split = g_strsplit(article, "<amp-img", 0);
Expand Down
88 changes: 0 additions & 88 deletions src/tests/parse_html.c
Original file line number Diff line number Diff line change
Expand Up @@ -123,69 +123,6 @@ gchar *tc_xml_rce[] = {
NULL
};

/* HTML5 extraction test cases
 *
 * Every table below is registered in main() with tc_get_article() as its
 * test function. Element [0] is the HTML input and element [1] is the
 * expected extraction result; a NULL in [1] means the extraction is
 * expected to return NULL. */

/* content inside <article>, with <script> and <style> in front of <body> */
gchar *tc_article[] = {
"<html lang='fr'><script>blabla</script><style>body { background:red }</style><body><article><p>1</p></article></body></html>",
"<p>1</p>\n"
};

/* same as tc_article, but the content is wrapped in <main> */
gchar *tc_article_main[] = {
"<html lang='fr'><script>blabla</script><style>body { background:red }</style><body><main><p>1</p></main></body></html>",
"<p>1</p>\n"
};

/* <article> nested inside <main> */
gchar *tc_article_main2[] = {
"<html lang='fr'><script>blabla</script><style>body { background:red }</style><body><main><article><p>1</p></article></main></body></html>",
"<p>1</p>\n"
};

/* content marked by a <div property='articleBody'> */
gchar *tc_article_micro_format[] = {
"<html><head></head><body><div property='articleBody'><p>1</p></div></body></html>",
"<p>1</p>\n"
};

/* content marked by a <div id='content'> */
gchar *tc_article_cms_content_id[] = {
"<html><head></head><body><div id='content'><p>1</p></div></body></html>",
"<p>1</p>\n"
};

/* no article-like wrapper around the content: NULL is expected */
gchar *tc_article_missing[] = {
"<html><head></head><body><p>1</p></body></html>",
NULL
};

/* An empty tag "<x></x>" must not be collapsed to "<x/>"; it is expected
   to be output as "<x> </x>" instead */
gchar *tc_article_empty_tags[] = {
"<html><head></head><body><article><p>1</p><div class='something' data-nr='555'></div></article></body></html>",
"<p>1</p><div class=\"something\" data-nr=\"555\"> </div>\n"
};

/* Nested empty tags "<x><x></x></x>" are expected to be expanded to
   "<x><x> </x> </x>" */
gchar *tc_article_empty_tags_nested[] = {
"<html><head></head><body><article><p>1</p><div><div class='something' data-nr='555'></div></div></article></body></html>",
"<p>1</p><div>\n <div class=\"something\" data-nr=\"555\"> </div>\n</div>\n"
};

/* Self-closed XHTML tags "<x/>" are expected to be expanded to "<x> </x>" */
gchar *tc_article_self_closed_tags[] = {
"<html><head></head><body><article><p>1</p><div class='something' data-nr='555'/></article></body></html>",
"<p>1</p><div class=\"something\" data-nr=\"555\"> </div>\n"
};

/* Nested self-closed XHTML tags "<x/>" are expected to be expanded as well */
gchar *tc_article_self_closed_tags_nested[] = {
"<html><head></head><body><article><p>1</p><div><div class='something' data-nr='555'/></div></article></body></html>",
"<p>1</p><div>\n <div class=\"something\" data-nr=\"555\"> </div>\n</div>\n"
};

/* Inline <script> and <style> elements are expected to be stripped from
   the result */
gchar *tc_article_strip_inline_code[] = {
"<html><head></head><body><article><p>1<script>alert('Hallo');</script></p><style>p { font-size: 2em }</style></article></body></html>",
"<p>1</p>\n"
};

static void
tc_auto_discover_link (gconstpointer user_data)
{
Expand All @@ -208,19 +145,6 @@ tc_auto_discover_link (gconstpointer user_data)
g_slist_free_full (list, g_free);
}

/*
 * Data-driven check for html_get_article().
 *
 * user_data is a two element string array: [0] holds the HTML input,
 * [1] the expected output. A NULL expectation asserts that
 * html_get_article() returns NULL.
 */
static void
tc_get_article (gconstpointer user_data)
{
	gchar		**testCase = (gchar **)user_data;
	const gchar	*input = testCase[0];
	const gchar	*expected = testCase[1];
	gchar		*article = html_get_article (input, "https://example.com");

	if (expected)
		g_assert_cmpstr (expected, ==, article);
	else
		g_assert_null (article);

	g_free (article);
}

int
main (int argc, char *argv[])
{
Expand All @@ -242,17 +166,5 @@ main (int argc, char *argv[])
g_test_add_data_func ("/html/auto_discover_link_xml_atom3", &tc_xml_atom3, &tc_auto_discover_link);
g_test_add_data_func ("/html/auto_discover_link_xml_rce", &tc_xml_rce, &tc_auto_discover_link);

g_test_add_data_func ("/html/html5_extract_article", &tc_article, &tc_get_article);
g_test_add_data_func ("/html/html5_extract_article_main", &tc_article_main, &tc_get_article);
g_test_add_data_func ("/html/html5_extract_article_main2", &tc_article_main2, &tc_get_article);
g_test_add_data_func ("/html/html5_extract_article_micro_format", &tc_article_micro_format, &tc_get_article);
g_test_add_data_func ("/html/html5_extract_article_cms_content_id", &tc_article_cms_content_id, &tc_get_article);
g_test_add_data_func ("/html/html5_extract_article_missing", &tc_article_missing, &tc_get_article);
g_test_add_data_func ("/html/html5_extract_article_empty_tags", &tc_article_empty_tags, &tc_get_article);
g_test_add_data_func ("/html/html5_extract_article_empty_tags_nested", &tc_article_empty_tags_nested, &tc_get_article);
g_test_add_data_func ("/html/html5_extract_article_self_closed_tags", &tc_article_self_closed_tags, &tc_get_article);
g_test_add_data_func ("/html/html5_extract_article_self_closed_tags_nested", &tc_article_self_closed_tags_nested, &tc_get_article);
g_test_add_data_func ("/html/html5_extract_article_strip_inline_code", &tc_article_strip_inline_code, &tc_get_article);

return g_test_run();
}

0 comments on commit 0dfc348

Please sign in to comment.