-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathrss_finder.py
56 lines (47 loc) · 1.85 KB
/
rss_finder.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import io
import urllib.parse

import feedparser
import html5lib
import requests
# Feed MIME types in priority order: Atom is preferred over RSS.
_FEED_LINK_TYPES = ('application/atom+xml', 'application/rss+xml')

# Sent with every request; some servers reject the default python-requests agent.
_REQUEST_HEADERS = {'User-Agent': 'Mozilla/5.0'}

# Common feed locations, tried in this order when the page declares no feed.
_COMMON_FEED_SUFFIXES = (
    'feed', 'feed/', 'rss', 'atom', 'feed.xml',
    '/feed', '/feed/', '/rss', '/atom', '/feed.xml',
    'index.atom', 'index.rss', 'index.xml', 'atom.xml', 'rss.xml',
    '/index.atom', '/index.rss', '/index.xml', '/atom.xml', '/rss.xml',
    '.rss', '/.rss', '?rss=1', '?feed=rss2',
)


def _declared_feed_links(tree):
    """Return hrefs of feed <link> elements found in <head>, Atom before RSS."""
    by_type = {mime: [] for mime in _FEED_LINK_TYPES}
    for link in tree.findall('head/link'):
        # rel is a space-separated token list and, like type, case-insensitive.
        rel_tokens = link.attrib.get('rel', '').lower().split()
        mime = link.attrib.get('type', '').strip().lower()
        href = link.attrib.get('href', '').strip()
        if href and 'alternate' in rel_tokens and mime in by_type:
            by_type[mime].append(href)
    return [href for mime in _FEED_LINK_TYPES for href in by_type[mime]]


def _looks_like_feed(candidate, timeout):
    """Fetch *candidate* and report whether feedparser recognises it as a feed."""
    try:
        response = requests.get(candidate, headers=_REQUEST_HEADERS, timeout=timeout)
    except requests.RequestException:
        return False
    if response.status_code != 200:
        return False
    # A 200 alone proves nothing: many sites serve an HTML page for unknown
    # paths or ignored query strings. feedparser leaves ``version`` empty when
    # the payload is not a recognised feed format. The bytes are wrapped in a
    # stream so feedparser never interprets the body as a URL or file name.
    parsed = feedparser.parse(io.BytesIO(response.content))
    return bool(parsed.get('version'))


def find_feed(url, *, timeout=10.0):
    """Locate the Atom/RSS feed URL for the web page at *url*.

    The page is fetched and its ``<head>`` is searched for
    ``<link rel="alternate">`` elements advertising an Atom or RSS feed
    (Atom wins when both are present). If none is declared, a list of
    common feed paths is probed and the first one that returns HTTP 200
    *and* parses as a feed is returned.

    Args:
        url: Address of the page to inspect. An empty or ``None`` value
            yields ``None`` without any network access.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        The absolute feed URL as a string, or ``None`` when no feed was
        found or the page could not be fetched or parsed.

    This function is best-effort: failures while fetching or parsing the
    page are printed to stdout and reported as ``None`` instead of being
    raised.
    """
    if not url:
        return None

    try:
        response = requests.get(url, headers=_REQUEST_HEADERS, timeout=timeout)
        tree = html5lib.parse(response.text, namespaceHTMLElements=False)

        # Relative links resolve against the final URL after redirects,
        # adjusted by the first <base href> if present. The base href may
        # itself be relative, so it is joined rather than used verbatim.
        page_url = response.url or url
        base_href = next(
            (
                base.attrib['href'].strip()
                for base in tree.findall('.//base')
                if base.attrib.get('href', '').strip()
            ),
            '',
        )
        base_url = urllib.parse.urljoin(page_url, base_href)

        declared = _declared_feed_links(tree)
        if declared:
            return urllib.parse.urljoin(base_url, declared[0])

        # Heuristic search for common feed paths. Several suffixes can
        # resolve to the same URL, so each candidate is fetched only once.
        tried = set()
        for suffix in _COMMON_FEED_SUFFIXES:
            candidate = urllib.parse.urljoin(base_url, suffix)
            if candidate in tried:
                continue
            tried.add(candidate)
            if _looks_like_feed(candidate, timeout):
                return candidate
    except Exception as e:
        print(f"An error occurred: {e}")
    return None
if __name__ == "__main__":
    import sys

    # Command-line entry point: exactly one positional argument, the page
    # URL, is expected after the script name.
    cli_args = sys.argv[1:]
    if len(cli_args) != 1:
        print("Usage: python rss_finder.py [URL]")
        sys.exit(1)

    url = cli_args[0]
    feed_url = find_feed(url)
    # find_feed returns None (falsy) when nothing was discovered.
    print(f"Feed URL found: {feed_url}" if feed_url else "No feed URL found.")