-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathimages.py
62 lines (49 loc) · 1.87 KB
/
images.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
from urlparse import urlsplit
from urlparse import urlparse
from bs4 import BeautifulSoup
import multiprocessing
import multiprocessing
import urllib2
import json
import os
import re
def process_url(raw_url):
    """Return *raw_url* made safe to request: trimmed and space-escaped.

    Trailing spaces are dropped and every remaining space is replaced by
    its percent-encoded form ``%20``. No other character is escaped.

    An empty string is returned unchanged (indexing its last character
    used to raise IndexError), and all trailing spaces are removed rather
    than just one, so ``"a.png  "`` no longer turns into ``"a.png%20"``.

    :param raw_url: URL text as found in the page, possibly with spaces.
    :return: the cleaned URL string.
    """
    return raw_url.rstrip(' ').replace(' ', '%20')
# get the images first and then join them later... required if parallelized later
# Browser-like User-agent sent with every request made by getImg.
_USER_AGENT = ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
               '(KHTML, like Gecko) Ubuntu Chromium/50.0.2661.102 '
               'Chrome/50.0.2661.102 Safari/537.36')


def _declared_size(image):
    """Return ``(height, width)`` from the tag's attributes, or None.

    None means the attributes are missing or are not plain integers
    (for example ``"100%"``), so the size has to be judged another way.
    """
    try:
        return int(image['height']), int(image['width'])
    except (KeyError, ValueError, TypeError):
        return None


def _download_size(opener, imgurl):
    """Return the byte length of *imgurl*, or None if it cannot be fetched."""
    try:
        response = opener.open(imgurl)
        try:
            return len(response.read())
        finally:
            response.close()
    except Exception:
        # Best effort: one broken, unsupported or unreachable image must not
        # abort the whole scan. Unlike a bare ``except`` this still lets
        # KeyboardInterrupt and SystemExit through.
        return None


def getImg(url, min_dimension=80, min_bytes=5000):
    """Collect the ``<img>`` tags on a page that look like real content.

    The page at *url* is fetched and parsed. Each image whose address does
    not end in ``svg`` is kept when it is large enough:

    * if the tag declares integer ``height`` and ``width`` attributes, both
      must be greater than *min_dimension*;
    * otherwise the image is downloaded and its size must be greater than
      *min_bytes*.

    Images without a ``src``, and images that cannot be downloaded, are
    skipped silently.

    :param url: address of the page to scan (must include the scheme).
    :param min_dimension: exclusive lower bound for declared height/width.
    :param min_bytes: exclusive lower bound for the downloaded size.
    :return: list of the matching BeautifulSoup ``img`` tags, in page order.
    :raises: whatever ``urllib2`` raises if the page itself cannot be fetched.
    """
    # Imported here because the module-level imports do not provide urljoin.
    from urlparse import urljoin

    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', _USER_AGENT)]

    page = opener.open(url)
    try:
        urlcontent = page.read()
    finally:
        page.close()
    soup = BeautifulSoup(urlcontent, "lxml")

    collected_images = []
    for image in soup.findAll("img"):
        # Read the attribute directly; a regex over the serialized tag picks
        # up look-alikes such as ``data-src``.
        src = image.get("src")
        if not src:
            continue
        if not isinstance(src, str):
            # Presumably a unicode value under Python 2; work on its UTF-8
            # bytes, which is what the serialized tag used to provide.
            src = src.encode("utf-8")

        imgurl = process_url(src)
        if not imgurl or imgurl[-3:] == "svg":
            continue

        size = _declared_size(image)
        if size is not None:
            height, width = size
            if height > min_dimension and width > min_dimension:
                collected_images.append(image)
            continue

        # No usable declared size: judge by download size. Resolve against
        # the page address so relative and protocol-relative sources work,
        # and reuse the opener so the request carries the same User-agent.
        nbytes = _download_size(opener, urljoin(url, imgurl))
        if nbytes is not None and nbytes > min_bytes:
            collected_images.append(image)
    return collected_images
if __name__ == '__main__':
    # Script entry point: ask for a page address on stdin, then print the
    # list of image tags that getImg selected from that page.
    target = raw_input("enter website to get images from\n")
    print(getImg(target))