-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraper.py
153 lines (127 loc) · 3.88 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
# -*- coding: utf-8 -*-
from htmldom import htmldom
import html
import json
import spotipy
from oauth2 import Oauth2
import time
from functools import reduce
############################################################################
class SpotifyEditor:
    """
    Editor that performs functions on an authenticated spotify user's library

    The instance keeps the spotipy client in ``self.sp`` and the id of the
    authenticated user in ``self.id``.
    """
    # Number of search hits requested per lookup; only the first one is used
    MAX_SEARCH_HITS = 2
    # The Web API accepts at most 100 items per "add items to playlist" call
    MAX_TRACKS_PER_ADD = 100

    def __init__(self, token=None):
        """
        Builds the client and looks up the current user's id
        token: an OAuth access token passed straight to spotipy
        """
        self.sp = spotipy.Spotify(auth=token)
        # The token is a credential, so it is deliberately never printed
        me = self.sp.me()
        self.id = me["id"]

    def create_playlist(self, name, public=True):
        """
        Creates a playlist called name for the current user
        """
        self.sp.user_playlist_create(self.id, name, public)

    def get_playlist_id_by_name(self, name):
        """
        Returns the id of the first of the user's playlists whose name
        equals name, or None when no playlist matches
        """
        page = self.sp.user_playlists(self.id)
        # Playlists come back paged; follow every page, not just the first
        while page is not None:
            for playlist in page["items"]:
                if playlist["name"] == name:
                    return playlist["id"]
            page = self.sp.next(page)
        return None

    def add_records_to_playlist(self, records, playlist):
        """
        Adds a collection of records to a given playlist
        records: a collection of key value pairs Record(artist, song)
        playlist: a playlist name

        Records that cannot be found are skipped; nothing is added when
        the playlist itself cannot be found.
        """
        pId = self.get_playlist_id_by_name(playlist)
        if pId is None:
            print("Could not find playlist ", playlist)
            return
        ids = []
        for record in records:
            recordId = self._get_record_by_name(record)
            if recordId is not None:
                print("Found record: ",
                      record.song, " by ", record.artist,
                      " to playlist ", playlist)
                ids.append(recordId)
        self.add_uniques_to_playlist(pId, ids)

    def add_uniques_to_playlist(self, playlistId, ids):
        """
        Given a list of track identifiers, add a list of track ids
        to a playlist pid that do not already exist in the playlist

        Duplicates inside ids are added once, and the order of ids is kept.
        """
        existing = set()
        paging = self.sp.user_playlist(self.id, playlistId)["tracks"]
        while paging is not None:
            for item in paging["items"]:
                track = item["track"]
                # An item may carry no track object; skip it
                if track is not None:
                    existing.add(track["id"])
            paging = self.sp.next(paging)

        toAdd = []
        for trackId in ids:
            if trackId not in existing:
                # Remember it so a repeat later in ids is not added twice
                existing.add(trackId)
                toAdd.append(trackId)

        if not toAdd:
            print("No new items added to playlist")
            return
        step = SpotifyEditor.MAX_TRACKS_PER_ADD
        for start in range(0, len(toAdd), step):
            self.sp.user_playlist_add_tracks(self.id, playlistId,
                                             toAdd[start:start + step])

    def _get_record_by_name(self, record):
        """
        searches for a record using the api and returns the id of the
        first matching track, or None when the search has no hits
        """
        print("Getting: ", record.song, ", ", record.artist)
        # "track" is the search field filter for a song name
        query = "artist:" + record.artist + " track:" + record.song
        result = self.sp.search(query,
                                limit=SpotifyEditor.MAX_SEARCH_HITS,
                                offset=0,
                                type="track")
        items = result["tracks"]["items"]
        if not items:
            print("Could not find ", record.song, " by ", record.artist)
            return None
        return items[0]["id"]
############################################################################
class WTTSParser:
    """
    Reads a playlist page and collects its entries as Record(artist, song)
    """
    # Trailing tag removed from a song title before the record is stored
    _STUDIO_TAG = "(Sun King Studio 92)"
    # Curly double quotes are dropped, the curly apostrophe becomes a plain one
    _QUOTE_TABLE = str.maketrans({
        "\u201c": None,   # left double quotation mark
        "\u201d": None,   # right double quotation mark
        "\u2019": "'",    # right single quotation mark
    })

    def __init__(self, url='http://wttsfm.com/on-air/overeasy/'):
        """
        Loads the page at url and starts with an empty record list
        """
        self.dom = self.createDom(url)
        self.records = []

    def createDom(self, url):
        """
        Returns the htmldom DOM built from url
        """
        return htmldom.HtmlDom(url).createDom()

    def parseArtistSongList(self):
        """
        Fills the list of records
        Subclasses should override this method

        The text of each node following an h4 is read line by line:
        the first usable line is an artist, the next one its song, and
        so on in alternation. Lines of one character or less are skipped.
        """
        tag = self._STUDIO_TAG
        for node in self.dom.find("h4").next():
            # NOTE(review): the indentation of the original was lost; this
            # assumes skipped lines do not disturb the artist/song
            # alternation - confirm against the live page
            artist = None
            for raw in self._unescape(node.text()).split("\n"):
                line = raw.strip()
                if len(line) <= 1:
                    continue
                if artist is None:
                    artist = line
                    continue
                if line.endswith(tag):
                    # rstrip so the space that preceded the tag does not
                    # end up at the end of the title
                    line = line[:-len(tag)].rstrip()
                self.records.append(Record(artist, line))
                artist = None

    def getRecords(self):
        """
        Returns the list of records collected so far
        """
        return self.records

    def _unescape(self, text):
        """
        Simple unescaper for HTML

        Decodes HTML character references (named and numeric), then drops
        curly double quotes and turns the curly apostrophe into "'".
        """
        # Decode first so quotes written as references are normalised too
        return html.unescape(text).translate(self._QUOTE_TABLE)
############################################################################
class Record:
    """
    Plain container pairing an artist name with a song title
    """
    def __init__(self, artist, song):
        self.artist = artist
        self.song = song

    def __repr__(self):
        # Readable form for logs and debugging, e.g. Record('artist', 'song')
        return "{}({!r}, {!r})".format(
            type(self).__name__, self.artist, self.song)