InfiniteScraperOptionalArgs.py
# Crawls an npr.org show archive page and creates a CSV playlist of all the
# music interludes from each show, e.g.:
#   https://www.npr.org/programs/morning-edition/archive
#
# Steps:
# 1. Open a particular NPR show archive page and go back X number of days
#    before today (the archive usually holds 5 shows per page)
# 2. Scrape the music tags from the HTML to get artist and song names
# 3. Sanitize the names (remove special characters)
#    Reference: https://stackoverflow.com/questions/5843518/remove-all-special-characters-punctuation-and-spaces-from-string
# 4. Search YouTube for each artist/song and grab the first result
# 5. Create a custom shareable temporary YouTube playlist from the top results
#
# Author: Adam Harris
# Date: 11/27/2018
#
# Most of this code is referenced from:
#   https://medium.freecodecamp.org/how-to-scrape-websites-with-python-and-beautifulsoup-5946935d93fe
#   and https://codeburst.io/scraper-b82146396249
# import libraries
from bs4 import BeautifulSoup
import re  # used for regex to strip special chars from song and band names
import time
from sys import argv  # used for catching arguments on the command line
# libs for infinite scrolling
from selenium import webdriver
import requests
testing = 1  # If set to 1, only print a CSV list of band and song names; any other value queries YouTube and builds the playlist.
def scraper(linkToScrape):
    html_source = driver.page_source
    page = html_source.encode('utf-8')
    # parse the html using beautiful soup and store it in variable `soup`
    soup = BeautifulSoup(page, "html.parser")
    # reference: https://stackoverflow.com/questions/48484463/how-to-python-scrape-text-in-span-class
    # Get band names, sanitized of special characters
    # Reference: https://stackoverflow.com/questions/5843518/remove-all-special-characters-punctuation-and-spaces-from-string
    bands = []
    for band in soup.find_all('span', {'class': 'song-meta-artist'}):
        bands.append(re.sub(r'[^\w\s]', '', band.text))
    # Get song titles, sanitized the same way
    titles = []
    for title in soup.find_all('span', {'class': 'song-meta-title'}):
        titles.append(re.sub(r'[^\w\s]', '', title.text))
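    # A quick illustration of the sanitizer above (hypothetical input):
    #   re.sub(r'[^\w\s]', '', 'AC/DC!') -> 'ACDC'
    # Everything except word characters and whitespace is dropped, which keeps
    # the YouTube search query URL-friendly.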
    # 4. Search YouTube for each artist/song and grab the first result.
    # Make YouTube playlists without logging in, using just URLs.
    # Reference: https://www.labnol.org/internet/create-youtube-playlists/28827/
    url = "https://www.youtube.com/results?search_query="
    if testing == 1:
        playlistUrl = ''
    else:
        playlistUrl = 'http://www.youtube.com/watch_videos?video_ids='
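    # For example (hypothetical video IDs): two results abc123 and def456 would
    # yield the shareable URL below, which YouTube renders as a temporary,
    # unlisted playlist:
    #   http://www.youtube.com/watch_videos?video_ids=abc123,def456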
    for index, item in enumerate(bands):  # go through each song/band and search YouTube for the first result
        if testing == 1:  # just testing, only print names of bands and songs
            playlistUrl += bands[index] + ',' + titles[index] + '\n'
        else:  # otherwise query YouTube and build the full playlist URL
            r = requests.get(url + bands[index] + ' ' + titles[index])
            html = r.text
            soup = BeautifulSoup(html, 'html.parser')
            vid = soup.find('a', attrs={'class': 'yt-uix-tile-link'})  # first video result link in the page
            # 5. Create a custom shareable temporary YouTube playlist from the top result for each song
            playlistUrl += vid['href'][9:] + ','  # strip the "/watch?v=" prefix and append a comma to build the ID list
    print(playlistUrl.rstrip(','))  # drop the trailing comma rather than special-casing the last item in the loop
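# With testing == 1, scraper() prints CSV-style "band,title" lines, e.g.
# (hypothetical values):
#   The Beatles,Yesterday
#   Miles Davis,So What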
# ================== Main code below ====================
# Step 2: Get an ordered list of artists and song titles.
# Use chromedriver to load the autoscrolling NPR archive page.
driver = webdriver.Chrome(executable_path=r'chromedriver')  # basic non-headless chrome driver
driver.implicitly_wait(30)
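# A headless variant would look roughly like this (a sketch, assuming a
# chromedriver binary on the PATH and Selenium 3's chrome_options kwarg):
#   options = webdriver.ChromeOptions()
#   options.add_argument('--headless')
#   driver = webdriver.Chrome(executable_path=r'chromedriver', chrome_options=options)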
base_url = "https://www.npr.org/programs/morning-edition/archive"
driver.get(base_url)  # actually open the main page here
if len(argv) < 2:
    scrollTimes = 1
else:
    scrollTimes = int(argv[1])
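# Usage sketch: the optional first argument sets how many times to scroll, e.g.
#   python InfiniteScraperOptionalArgs.py 3
# scrolls the archive three times before collecting episode links.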
for i in range(scrollTimes):  # scroll down X times
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(4)
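# A more patient variant (a sketch) would poll document.body.scrollHeight and
# keep scrolling until it stops growing, instead of sleeping a fixed 4 seconds:
#   last_height = driver.execute_script("return document.body.scrollHeight")
#   while True:
#       driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
#       time.sleep(2)
#       new_height = driver.execute_script("return document.body.scrollHeight")
#       if new_height == last_height:
#           break
#       last_height = new_height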
# now that the page has scrolled the appropriate amount, collect all the
# "more from this episode" links:
links = driver.find_elements_by_xpath('//article/section/a')
for index, link in enumerate(links):
    newLink = link.get_attribute('href')
    driver.execute_script("window.open('');")  # open a blank tab for this episode
    time.sleep(1)
    driver.switch_to.window(driver.window_handles[index + 1])  # switch to the new tab
    # load the episode page into the new tab and scrape it
    driver.get(newLink)
    scraper(newLink)  # pass the new link to the scraper
    driver.switch_to.window(driver.window_handles[0])  # go back to the base archive tab
# kill the chrome window and release from memory:
driver.stop_client()
driver.close() #closes chromedriver browser window
driver.quit() #removes chromedriver cmd window