Skip to content

Commit

Permalink
爬取《帅啊》网帅哥图片
Browse files Browse the repository at this point in the history
  • Loading branch information
Jack-Cherish committed May 22, 2017
1 parent 5aef747 commit e946faa
Showing 1 changed file with 49 additions and 0 deletions.
49 changes: 49 additions & 0 deletions shuaia.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# -*- coding:UTF-8 -*-
from bs4 import BeautifulSoup
from urllib.request import urlretrieve
import requests
import os
import time

if __name__ == '__main__':
list_url = []
for num in range(1,3):
if num == 1:
url = 'http://www.shuaia.net/index.html'
else:
url = 'http://www.shuaia.net/index_%d.html' % num
headers = {
"User-Agent":"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
}
req = requests.get(url = url,headers = headers)
req.encoding = 'utf-8'
html = req.text
bf = BeautifulSoup(html, 'lxml')
targets_url = bf.find_all(class_='item-img')

for each in targets_url:
list_url.append(each.img.get('alt') + '=' + each.get('href'))

print('连接采集完成')

for each_img in list_url:
img_info = each_img.split('=')
target_url = img_info[1]
filename = img_info[0] + '.jpg'
print('下载:' + filename)
headers = {
"User-Agent":"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
}
img_req = requests.get(url = target_url,headers = headers)
img_req.encoding = 'utf-8'
img_html = img_req.text
img_bf_1 = BeautifulSoup(img_html, 'lxml')
img_url = img_bf_1.find_all('div', class_='wr-single-content-list')
img_bf_2 = BeautifulSoup(str(img_url), 'lxml')
img_url = 'http://www.shuaia.net' + img_bf_2.div.img.get('src')
if 'images' not in os.listdir():
os.makedirs('images')
urlretrieve(url = img_url,filename = 'images/' + filename)
time.sleep(1)

print('下载完成!')

0 comments on commit e946faa

Please sign in to comment.