-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathflickscrape.py
executable file
·292 lines (267 loc) · 9.43 KB
/
flickscrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
#!/usr/bin/python3
import os, sys, re, math, csv, requests
import lxml
from bs4 import BeautifulSoup
# Flicks whose download probe succeeded; offered for download at the end.
sovietMovies = []
# Flicks skipped due to errors (404 page, missing title, bad media file, ...).
skippedFlicks = []
# Site root; all movie pages and media files hang off this URL.
base = "https://sovietmoviesonline.com/"
# Index page listing every movie on the site.
link = base + "all_movies.html"
class Flick:
    """A single movie scraped from sovietmoviesonline.com.

    Class-level defaults are overridden per instance as scraping fills
    in data; the bad* flags start True and are cleared by downloadInit()
    only when a HEAD request confirms the corresponding file exists.
    """
    badFile = True    # True until the .mp4 HEAD request reports a size
    badSrt = True     # True until the .srt HEAD request reports a size
    fileSize = 0.0    # movie size in bytes
    srtSize = 0.0     # subtitle size in bytes
    title = ""
    downloadUrl = ""
    srtUrl = ""
    name = ""
    year = ""
    imdb = ""
    og = ""           # original (Russian) title
    director = ""

    def __init__(self, url, num):
        self.url = url    # movie page URL
        self.num = num    # numeric id used to build the download URLs

    def __str__(self):
        # BUG FIX: the SRT line previously printed fileSize; it now uses
        # srtSize, and the unit label matches the /1000 (bytes -> kb,
        # consistent with the CSV header "SRT Size (KB)").
        string = (
            "URL: " + self.url + "\n" +
            "Title: " + self.title + "\n" +
            "Download Url: " + self.downloadUrl + "\n" +
            "Sub Url: " + self.srtUrl + "\n" +
            "File Size: " + str(self.fileSize / 1000000000) + " gb\n" +
            "SRT Size: " + str(self.srtSize / 1000) + " kb\n" +
            "Bad File: " + str(self.badFile) + "\n" +
            "Bad SRT: " + str(self.badSrt) + "\n" +
            "Original Name: " + self.og + "\n" +
            "Year: " + self.year + "\n" +
            "IMDB: " + self.imdb + "\n" +
            "Director: " + self.director
        )
        return string
def csvOut(flick):
    """Render one Flick as a single pipe-delimited CSV line (with newline).

    Sizes are converted from bytes: movie -> GB, subtitles -> KB, to
    match the header row written by writeCsv().
    """
    fields = [
        flick.title,
        flick.year,
        flick.director,
        flick.imdb,
        flick.og,
        str(flick.fileSize / 1000000000),
        str(flick.srtSize / 1000),
        flick.url,
        flick.downloadUrl,
        flick.srtUrl,
        str(flick.badFile),
        str(flick.badSrt),
    ]
    return "|".join(fields) + "\n"
def errMsg(msg):
    """Write msg (plus a newline) to stderr and flush immediately."""
    print(msg, file=sys.stderr, flush=True)
#Try and download flick and english subtitles
def downloadFlicks(flick):
    """Download flick's .mp4 (and .srt when available) into a directory
    named after the title, created beside this script.

    Skips anything whose HEAD response looks like a text/HTML error page,
    or whose bad* flag was set by downloadInit().
    """
    # FIX: dropped the unused 'download' local and replaced the manual
    # rsplit('/', 1) with os.path.dirname.
    baseDir = os.path.dirname(os.path.realpath(__file__))
    chunkSize = 1024 * 1024  # stream the movie in 1 MiB chunks

    h = requests.head(flick.downloadUrl, allow_redirects=True)
    # FIX: a missing content-type header used to crash on None.lower();
    # treat it as unknown and proceed to the flag checks.
    content_type = h.headers.get('content-type') or ''
    if 'text' in content_type.lower() or 'html' in content_type.lower():
        # Server answered with a page, not media.
        print("Not downloadable")
        return
    if flick.badFile:
        print("Bad Movie File Set, skipping...")
        return

    titleNoSpace = flick.title.replace(' ', '_')
    newDir = baseDir + "/" + titleNoSpace
    if not os.path.isdir(newDir):
        os.mkdir(newDir)
    print("Downloading: " + flick.downloadUrl)
    r = requests.get(flick.downloadUrl, stream=True)
    with open((newDir + "/" + titleNoSpace + '.mp4'), 'wb') as f:
        for chunk in r.iter_content(chunk_size=chunkSize):
            if chunk:  # filter out keep-alive new chunks
                f.write(chunk)
    if flick.badSrt:
        print("Bad Srt set, skipping...")
        return
    r = requests.get(flick.srtUrl, stream=True)
    with open((newDir + "/" + titleNoSpace + '.srt'), 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:  # filter out keep-alive new chunks
                f.write(chunk)
#Creates CSV file containing film information
#Defaults to this script's directory
def writeCsv():
    """Write every scraped flick (good and skipped) to a pipe-delimited
    CSV next to this script, choosing a fresh flickscrape-output*.csv
    name so an existing file is never overwritten."""
    scriptDir = os.path.dirname(os.path.realpath(__file__))
    csvName = scriptDir + "/flickscrape-output"
    csvPath = csvName + ".csv"
    x = 1
    # Bump a numeric suffix until we find an unused filename.
    while os.path.exists(csvPath):
        csvPath = csvName + "_" + str(x) + ".csv"
        x += 1
    startLine = "Title|Year|Director|IMDb Score|Russian Title|Flick Size (in GB)|SRT Size (KB)|Flick URL|Download url|SRT url|Bad File|Bad Subtitles\n"
    # FIX: the file handle was opened with open() and never closed;
    # 'with' guarantees it is flushed and closed even on error.
    with open(csvPath, "w") as f:
        f.write(startLine)
        for flick in sovietMovies:
            print(csvOut(flick))
            f.write(csvOut(flick))
        for flick in skippedFlicks:
            # Sizes are meaningless for skipped flicks; zero them out.
            flick.fileSize = 0
            flick.srtSize = 0
            f.write(csvOut(flick))
#Flicks are initialized to not download.
#Cuts redundancy setting Flick
def downloadInit(flick):
    """Probe the .mp4 and .srt URLs with HEAD requests; record their
    sizes and clear the bad* flags only when a content-length header
    comes back (a missing header means the file does not exist)."""
    def record(response, sizeAttr, flagAttr, url):
        # Only a present content-length proves the file is downloadable.
        try:
            setattr(flick, sizeAttr, int(response.headers['content-length']))
            setattr(flick, flagAttr, False)
        except KeyError:
            errMsg("Key Error: " + url)

    movieHead = requests.head(flick.downloadUrl)
    subHead = requests.head(flick.srtUrl)
    record(subHead, 'srtSize', 'badSrt', flick.srtUrl)
    record(movieHead, 'fileSize', 'badFile', flick.downloadUrl)
#Scrape info from html page and create flick object
#Any content not found is simply skipped
#Assumptions:
# First Table Found Contains: Original Title, IMDB, Views, Year
# first <div class="director">...</div> contains films director
# There is <div id="error404">...</div> on 404 pages
def processFilm(flick):
    """Fill in flick's metadata from its movie page and file it into
    sovietMovies (downloadable) or skippedFlicks (anything broken)."""
    global link
    global base
    #Segmenting these strings more because I know these links aren't always perfect
    #ie if film has two parts it may be <num>-1.mp4 and <num>-2.mp4
    downloadBase = base + "movies/" + str(flick.num)
    flick.downloadUrl = downloadBase + ".mp4"
    flick.srtUrl = downloadBase + ".srt"
    downloadInit(flick)
    flickUrl = flick.url
    filmReq = requests.get(flickUrl)
    soup = BeautifulSoup(filmReq.text, "lxml")
    # FIX: 'i.id' in bs4 looks up a child <id> tag (always None), so the
    # 404 check never fired; the html attribute needs Tag.get('id').
    for i in soup.find_all('div'):
        if i.get('id') == "error404":
            skippedFlicks.append(flick)
            return
    try:
        flickTitle = soup.find_all('h1')
        flick.title = flickTitle[0].contents[0]
    except IndexError:
        errMsg("Flick Title not found: " + flickUrl)
        skippedFlicks.append(flick)
        return
    if filmReq.status_code == requests.codes.ok:
        try:
            #May be able to pull director data from Flick text on page
            #<div class="movie-description">...</div>
            director = soup.find('div', {'class': ['director']}).contents[0].contents[0]
            flick.director = director
        except (IndexError, AttributeError):
            errMsg("Director not found: " + flickUrl)
        table = soup.find_all('table')
        try:
            td = table[0].find_all('td')
            try:
                flick.og = td[0].contents[1].strip()
            except IndexError:
                errMsg("Original Title not found.")
            try:
                flick.imdb = td[1].contents[1].strip()
            except IndexError:
                errMsg("IMDB rating not found.")
            #Number of views is td[2].contents[1] hence the jump
            try:
                flick.year = td[3].contents[1].strip()
            except IndexError:
                errMsg("Release Year not found.")
            # FIX: an early 'return' here skipped the classification
            # below, so successfully-scraped flicks never reached
            # sovietMovies; fall through to the badFile check instead.
        except IndexError:
            errMsg("Error finding <td>'s, skipping: " + flickUrl)
            skippedFlicks.append(flick)
            # FIX: without this return the flick fell through and could
            # be filed a second time (or land in sovietMovies).
            return
    else:
        errMsg("Bad request code on flick page, skipping: " + flickUrl)
        skippedFlicks.append(flick)
        return
    if flick.badFile == True:
        skippedFlicks.append(flick)
    else:
        sovietMovies.append(flick)
def main():
    """Scrape the site index, process every unique movie page, write the
    CSV report, then optionally download everything found."""
    print("--- Extracting Film Links ---")
    global link
    sovietRequest = requests.get(link)
    data = sovietRequest.text
    soup = BeautifulSoup(data, "lxml")
    allLinks = []  #links before removing dupes
    links = []     #links after remove dupes
    #make sure it's a proper film link and not blog
    # FIX: hoisted and compiled once instead of rebuilding per anchor.
    movieRegex = re.compile(re.escape(base) + r"(?!blog).*\.html")
    # FIX: the loop variable used to be named 'link', silently clobbering
    # the module-level index URL declared 'global' above; renamed.
    for anchor in soup.find_all('a'):
        tempUrl = anchor.get('href')
        # FIX: <a> tags without an href yield None and crashed re.match.
        if tempUrl is None:
            continue
        if movieRegex.match(tempUrl):
            tempSplit = tempUrl.split('/')
            movieBase = tempSplit[-1]
            # Page names look like "<num>-title.html"; the leading number
            # is the movie id used for download URLs and de-duping.
            num = movieBase.split('-')[0]
            allLinks.append({"url": tempUrl, "num": int(num)})
    #Check for duplicate links
    seen = set()
    for x in allLinks:
        if x["num"] not in seen:
            links.append(x)
            seen.add(x["num"])
    print("--- Processing Flicks ---")
    for movie in links:
        tempFlick = Flick(movie["url"], movie["num"])
        processFilm(tempFlick)
    #CSV delimited by pipe "|" and nothing else.
    #Extra string delimiters will create an unruly CSV
    print("--- Generating CSV File ---")
    writeCsv()
    totalSize = 0.0
    print("--- Calculating Total Size ---")
    for i in sovietMovies:
        totalSize += i.fileSize
        totalSize += i.srtSize
    totalSize = totalSize / 1000000000  # bytes -> GB
    inputString = "\nTotal Download Size: " + str(round(totalSize, 2)) + " gb"
    inputString += "\nWould you like to download [y/n]? "
    downloadChoice = input(inputString)
    if downloadChoice != 'y':
        print("No Downloads Today")
        # FIX: sys.exit instead of the interactive-only exit() builtin.
        sys.exit(1)
    print("--- Downloading Files ---")
    for i in sovietMovies:
        downloadFlicks(i)
# Script entry point: only run the scraper when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()