-
Notifications
You must be signed in to change notification settings - Fork 20
/
Copy pathget_data.py
61 lines (54 loc) · 1.88 KB
/
get_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import os
import imdb
import imdb.helpers
import urllib
import json
from multiprocessing import Pool
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class PersonEncoder(json.JSONEncoder):
    """JSON encoder that can serialize the IMDbPY objects found in a movie.

    ``Person`` and ``Company`` objects are expanded into a plain dict of all
    their keys; nested ``Movie`` objects are reduced to their ``movieID``
    string. Anything else is delegated to the base encoder.
    """

    def default(self, o):
        """Return a JSON-serializable stand-in for ``o``.

        Args:
            o: An object the standard encoder could not serialize.

        Returns:
            A dict of ``o``'s key/value pairs for ``imdb.Person.Person`` and
            ``imdb.Company.Company`` instances, or ``str(o.movieID)`` for
            ``imdb.Movie.Movie`` instances.

        Raises:
            TypeError: Raised by the base class for any other type.
        """
        if isinstance(o, (imdb.Person.Person, imdb.Company.Company)):
            return {k: o[k] for k in o.keys()}
        if isinstance(o, imdb.Movie.Movie):
            # Only the id is kept for nested movies. No debugger hook here:
            # this runs inside multiprocessing workers with no usable stdin.
            return str(o.movieID)
        return super(PersonEncoder, self).default(o)
def save_movie(imdbId):
    """Fetch one movie's metadata and poster into ``mmimdb/dataset/``.

    Writes ``<imdbId>.json`` containing every key of the fetched movie object
    (encoded with ``PersonEncoder``) and, when a cover URL is available,
    ``<imdbId>.jpeg`` with the raw image bytes. If both files already exist
    the call returns immediately without contacting IMDb.

    Any exception raised along the way is logged and not re-raised; both
    output files for this id are then removed so a failed movie does not
    leave partial output behind.

    Args:
        imdbId: IMDb id as a string; used for the lookup and as the
            file-name stem.
    """
    json_path = 'mmimdb/dataset/' + imdbId + '.json'
    jpeg_path = 'mmimdb/dataset/' + imdbId + '.jpeg'
    try:
        if os.path.isfile(json_path) and os.path.isfile(jpeg_path):
            return

        # ``urllib.urlopen`` only exists on Python 2; on Python 3 the
        # function lives in ``urllib.request``. Support both.
        try:
            from urllib.request import urlopen
        except ImportError:
            from urllib import urlopen

        ia = imdb.IMDb()
        m = ia.get_movie(imdbId)
        jsonData = {k: m[k] for k in m.keys()}
        with open(json_path, 'w') as f:
            json.dump(jsonData, f, cls=PersonEncoder, indent=4)

        imgUrl = imdb.helpers.fullSizeCoverURL(m)
        if imgUrl is not None:
            # Reuse the URL computed above and always release the connection.
            response = urlopen(imgUrl)
            try:
                imageData = response.read()
            finally:
                response.close()
            with open(jpeg_path, 'wb') as f:
                f.write(imageData)
        else:
            logger.warning('%s:Does not have image poster', imdbId)
    except Exception as e:
        # Lazy %-formatting keeps the handler itself from failing while
        # building the message.
        logger.error('%s:%s', imdbId, e)
        for path in (json_path, jpeg_path):
            if os.path.isfile(path):
                os.remove(path)
if __name__ == "__main__":
    # links.csv is read as a header row followed by comma-separated rows
    # whose second column holds the IMDb id.
    with open('links.csv', 'r') as f:
        lines = f.readlines()

    # Skip the header; ignore blank or short rows instead of crashing on
    # them, and strip whitespace so ids never carry a trailing newline.
    imdbIds = []
    for line in lines[1:]:
        fields = line.split(',')
        if len(fields) > 1 and fields[1].strip():
            imdbIds.append(fields[1].strip())

    p = Pool(8)
    try:
        p.map(save_movie, imdbIds)
        p.close()
    except BaseException:
        # Do not leave workers grinding through queued downloads after a
        # failure or Ctrl-C.
        p.terminate()
        raise
    finally:
        p.join()