# simplecrawl2.py
from bs4 import BeautifulSoup
import requests
from requests.exceptions import ReadTimeout
import urllib.parse as urlparse
import json
import operator
import os
import datetime
# Variables and constants
data_en = []
all_podcasts = {}
startlink = 'https://podcasts.apple.com/us/genre/podcasts/id26'


def get_id(url):
    """Extract the numeric iTunes ID from a podcast URL."""
    parts = urlparse.urlsplit(url)
    if parts.hostname == 'podcasts.apple.com':
        idstr = parts.path.rpartition('/')[2]  # extract 'id123456'
        if idstr.startswith('id'):
            try:
                return int(idstr[2:])
            except ValueError:
                pass
    raise ValueError("Invalid url: %r" % (url,))
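# A quick illustration of get_id (hypothetical URL, kept in a comment so it
# does not run during the crawl):
#   get_id('https://podcasts.apple.com/us/podcast/some-show/id123456')  # -> 123456
#   get_id('https://example.com/id123456')  # -> raises ValueError (wrong hostname)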
def savedata(the_data, filename):
    if len(the_data) > 0:
        with open(filename, 'w', newline="") as outfile:
            json.dump(the_data, outfile)


def saveall():
    print("saving data_en...")
    savedata(data_en, savedir + '/' + 'data_en.json')
    print("done.")
    # flush memory
    data_en.clear()
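# Note: data_en and saveall() are never called in this script; they appear to
# be helpers carried over from a larger crawl. For reference, savedata()
# writes its argument as a single JSON document, e.g. (illustrative):
#   savedata([{"a": 1}], 'out.json')  # out.json then contains: [{"a": 1}]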
GENRE_FILTER = [
    "Comedy",
    "Health & Fitness",
    "Kids & Family",
    "News",
    "Science",
    "Society & Culture",
    "Sports",
    "True Crime",
]
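# Only charts for the genres above are crawled. To include more, add the
# genre's exact display name as it appears on the Apple Podcasts genre page,
# e.g. "Technology" or "Business" (the membership test below is exact).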
allcatpage = requests.get(startlink, timeout=5)
categories = BeautifulSoup(allcatpage.content, "html.parser")

# Create a directory for the result data
savedir = "crawl_" + str(datetime.date.today())
if not os.path.exists(savedir):
    os.mkdir(savedir)

# Save links...
with open(savedir + '/' + 'allpodcastlinks.json', 'w') as outfile:
    # Step 1 - first we collect all podcast links from the iTunes page
    top_level_genres = categories.select('.top-level-genre')
    # print(repr(top_level_genres))
    genres = dict()
    for category in top_level_genres:  # Loop through all genres
        itunesGenre = category.get_text()
        # skip genres not in the filter
        if itunesGenre not in GENRE_FILTER:
            continue
        print(itunesGenre)
        sub_genres = category.parent.select('.top-level-subgenres li a')
        # print(str(sub_genres))
        subs = [category]
        for subcat in sub_genres:
            print("\t" + subcat.get_text())
            subs.append(subcat)
        genres[itunesGenre] = subs
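    # At this point, genres maps each kept genre name to a list of <a> tags:
    # the top-level genre link itself first, then its subgenre links, so every
    # chart is fetched once without a subgenre and once per subgenre. Roughly
    # (illustrative): {"Comedy": [<a Comedy>, <a Improv>, ...], ...}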
    print("\nfetch links")
    for (genre_name, category_genres) in genres.items():
        print(genre_name)
        for category in category_genres:
            subgenre_name = category.get_text()
            if genre_name == subgenre_name:
                subgenre_name = ""
            # show which chart is being fetched
            print("\t" + (subgenre_name or genre_name))
            try:
                categorypage = requests.get(category['href'], timeout=5)
            except ReadTimeout as e:
                print("error fetching {}/{}: {}".format(genre_name, subgenre_name, e))
                continue
            allpodcasts = BeautifulSoup(categorypage.content, 'html.parser')
            allpodcastlinks = allpodcasts.select('#selectedcontent ul>li a')
            rank = 0
            for link in allpodcastlinks:  # Finally! We loop through all podcast links! Yey!
                rank = rank + 1
                title = link.get_text()
                print("\t\t" + title)
                if "/id" in link['href']:
                    theID = get_id(link['href'])
                    genre = {
                        "genre": genre_name,
                        "subgenre": subgenre_name,
                        "rank": rank,
                    }
                    if theID not in all_podcasts:
                        all_podcasts[theID] = {
                            "itunesID": theID,
                            "title": title,
                            "genres": [genre],
                            "link": link['href'],
                        }
                    else:
                        all_podcasts[theID]["genres"].append(genre)
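                    # A show that ranks in several charts ends up as a single
                    # record with multiple "genres" entries, e.g. (illustrative):
                    #   {"itunesID": 123456, "title": "Some Show",
                    #    "genres": [{"genre": "Comedy", "subgenre": "", "rank": 3},
                    #               {"genre": "Comedy", "subgenre": "Improv", "rank": 1}],
                    #    "link": "https://podcasts.apple.com/.../id123456"}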
    print('json dump to outfile\n')
    podlist = list(all_podcasts.values())
    podlist.sort(key=operator.itemgetter("itunesID"))
    for pod in podlist:
        json.dump(pod, outfile)
        outfile.write("\n")
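# The output is JSON Lines (one object per line), not a single JSON array.
# A minimal read-back sketch, assuming the file written above:
#   with open(savedir + '/allpodcastlinks.json') as f:
#       podcasts = [json.loads(line) for line in f]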