forked from jackfrued/Python-100-Days
Showing 6 changed files with 331 additions and 5 deletions.
@@ -0,0 +1,33 @@
import requests
from bs4 import BeautifulSoup
# Selenium is a browser automation tool; here it drives a real
# browser so that JavaScript-rendered content gets loaded too.
from selenium import webdriver


def main():
    # Download chromedriver first and put the executable on your PATH.
    # Start a Google Chrome browser instance.
    driver = webdriver.Chrome()
    # Load the page through the browser engine
    # (this also renders dynamically generated content).
    driver.get('https://www.taobao.com/markets/mm/mm2017')
    # driver.page_source includes content created by JavaScript.
    soup = BeautifulSoup(driver.page_source, 'lxml')
    # Shut the browser down once the HTML has been captured.
    driver.quit()
    all_images = soup.select('img[src]')
    for image in all_images:
        url = image.get('src')
        try:
            if not str(url).startswith('http'):
                url = 'http:' + url
            filename = url[url.rfind('/') + 1:]
            print(filename)
            resp = requests.get(url)
            # Assumes the c:/images/ directory already exists;
            # open() raises OSError otherwise, caught below.
            with open('c:/images/' + filename, 'wb') as f:
                f.write(resp.content)
        except OSError:
            print(filename + ' failed to download!')
    print('Image download finished!')


if __name__ == '__main__':
    main()
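
A note on the listing above: it opens a visible browser window, and page_source is read as soon as get() returns, so images injected by late-running scripts can be missed. A minimal headless-plus-explicit-wait sketch, assuming a reasonably recent Selenium release (the options= keyword is the modern spelling; very old Selenium 3 builds used chrome_options= instead):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

options = webdriver.ChromeOptions()
options.add_argument('--headless')  # run without opening a window
driver = webdriver.Chrome(options=options)
try:
    driver.get('https://www.taobao.com/markets/mm/mm2017')
    # Wait up to 10 seconds for at least one <img src=...> to appear
    # instead of reading page_source immediately.
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, 'img[src]')))
    html = driver.page_source
finally:
    driver.quit()  # always release the browser, even on errors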
@@ -0,0 +1,132 @@
from enum import Enum, unique
from queue import Queue
from random import random
from threading import Thread, current_thread
from time import sleep
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup


@unique
class SpiderStatus(Enum):
    IDLE = 0
    WORKING = 1


def decode_page(page_bytes, charsets=('utf-8',)):
    # Try each candidate charset in turn; return None if none decodes.
    page_html = None
    for charset in charsets:
        try:
            page_html = page_bytes.decode(charset)
            break
        except UnicodeDecodeError:
            pass
    return page_html


class Retry(object):
    """Decorator that retries the wrapped call a few times before giving up."""

    def __init__(self, *, retry_times=3,
                 wait_secs=5, errors=(Exception, )):
        self.retry_times = retry_times
        self.wait_secs = wait_secs
        self.errors = errors

    def __call__(self, fn):

        def wrapper(*args, **kwargs):
            for _ in range(self.retry_times):
                try:
                    return fn(*args, **kwargs)
                except self.errors as e:
                    print(e)
                    # Randomized back-off between attempts.
                    sleep((random() + 1) * self.wait_secs)
            return None

        return wrapper


class Spider(object):

    def __init__(self):
        self.status = SpiderStatus.IDLE

    @Retry()
    def fetch(self, current_url, *, charsets=('utf-8', ),
              user_agent=None, proxies=None):
        thread_name = current_thread().name
        print(f'[{thread_name}]: {current_url}')
        headers = {'user-agent': user_agent} if user_agent else {}
        resp = requests.get(current_url,
                            headers=headers, proxies=proxies)
        return decode_page(resp.content, charsets) \
            if resp.status_code == 200 else None

    def parse(self, html_page, *, domain='m.sohu.com'):
        soup = BeautifulSoup(html_page, 'lxml')
        url_links = []
        for a_tag in soup.body.select('a[href]'):
            parser = urlparse(a_tag.attrs['href'])
            scheme = parser.scheme or 'http'
            netloc = parser.netloc or domain
            # Skip javascript: pseudo-links and stay on the target domain.
            if scheme != 'javascript' and netloc == domain:
                path = parser.path
                query = '?' + parser.query if parser.query else ''
                full_url = f'{scheme}://{netloc}{path}{query}'
                if full_url not in visited_urls:
                    url_links.append(full_url)
        return url_links

    def extract(self, html_page):
        pass

    def store(self, data_dict):
        pass


class SpiderThread(Thread):

    def __init__(self, name, spider, tasks_queue):
        super().__init__(name=name, daemon=True)
        self.spider = spider
        self.tasks_queue = tasks_queue

    def run(self):
        while True:
            current_url = self.tasks_queue.get()
            # A link can be queued more than once before its first copy is
            # visited; recording it here keeps parse() from re-queueing it.
            visited_urls.add(current_url)
            self.spider.status = SpiderStatus.WORKING
            html_page = self.spider.fetch(current_url)
            if html_page not in [None, '']:
                url_links = self.spider.parse(html_page)
                for url_link in url_links:
                    self.tasks_queue.put(url_link)
            self.spider.status = SpiderStatus.IDLE


def is_any_alive(spider_threads):
    return any(spider_thread.spider.status == SpiderStatus.WORKING
               for spider_thread in spider_threads)


# Shared across all threads; the per-operation atomicity of CPython's
# set add/contains (under the GIL) is what this code relies on.
visited_urls = set()


def main():
    task_queue = Queue()
    task_queue.put('http://m.sohu.com/')
    spider_threads = [SpiderThread('thread-%d' % i, Spider(), task_queue)
                      for i in range(10)]
    for spider_thread in spider_threads:
        spider_thread.start()

    # Wait until the queue drains and no spider is still working;
    # sleep briefly so the loop does not spin at 100% CPU.
    while not task_queue.empty() or is_any_alive(spider_threads):
        sleep(0.1)

    print('Over!')


if __name__ == '__main__':
    main()
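
A weakness in the shutdown logic above: main() busy-waits on task_queue.empty() combined with the WORKING flags, and there is a window where a worker has taken the last URL off the queue but not yet flipped its status to WORKING, letting the wait loop exit early. The queue module's own accounting sidesteps this; a minimal sketch using task_done()/join() (worker and crawl are illustrative names, not part of the original code):

from queue import Queue
from threading import Thread

task_queue = Queue()

def crawl(url):
    # Stand-in for fetch/parse; any new links discovered here would be
    # task_queue.put() back, and join() still accounts for them.
    print('crawling', url)

def worker():
    while True:
        url = task_queue.get()      # blocks until a task is available
        try:
            crawl(url)
        finally:
            task_queue.task_done()  # tell the queue this task is finished

for i in range(4):
    Thread(target=worker, daemon=True).start()

task_queue.put('http://m.sohu.com/')
task_queue.join()  # unblocks only when every queued task was task_done()'d
print('Over!')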
@@ -0,0 +1,150 @@
import pickle
import zlib
from enum import Enum, unique
from hashlib import sha1
from random import random
from threading import Thread, current_thread
from time import sleep
from urllib.parse import urlparse

import pymongo
import redis
import requests
from bs4 import BeautifulSoup
from bson import Binary


@unique
class SpiderStatus(Enum):
    IDLE = 0
    WORKING = 1


def decode_page(page_bytes, charsets=('utf-8',)):
    # Try each candidate charset in turn; return None if none decodes.
    page_html = None
    for charset in charsets:
        try:
            page_html = page_bytes.decode(charset)
            break
        except UnicodeDecodeError:
            pass
    return page_html


class Retry(object):
    """Decorator that retries the wrapped call a few times before giving up."""

    def __init__(self, *, retry_times=3,
                 wait_secs=5, errors=(Exception, )):
        self.retry_times = retry_times
        self.wait_secs = wait_secs
        self.errors = errors

    def __call__(self, fn):

        def wrapper(*args, **kwargs):
            for _ in range(self.retry_times):
                try:
                    return fn(*args, **kwargs)
                except self.errors as e:
                    print(e)
                    # Randomized back-off between attempts.
                    sleep((random() + 1) * self.wait_secs)
            return None

        return wrapper


class Spider(object):

    def __init__(self):
        self.status = SpiderStatus.IDLE

    @Retry()
    def fetch(self, current_url, *, charsets=('utf-8', ),
              user_agent=None, proxies=None):
        thread_name = current_thread().name
        print(f'[{thread_name}]: {current_url}')
        headers = {'user-agent': user_agent} if user_agent else {}
        resp = requests.get(current_url,
                            headers=headers, proxies=proxies)
        return decode_page(resp.content, charsets) \
            if resp.status_code == 200 else None

    def parse(self, html_page, *, domain='m.sohu.com'):
        soup = BeautifulSoup(html_page, 'lxml')
        for a_tag in soup.body.select('a[href]'):
            parser = urlparse(a_tag.attrs['href'])
            scheme = parser.scheme or 'http'
            netloc = parser.netloc or domain
            if scheme != 'javascript' and netloc == domain:
                path = parser.path
                query = '?' + parser.query if parser.query else ''
                full_url = f'{scheme}://{netloc}{path}{query}'
                # Queue the URL in Redis so any worker process can pick it up.
                if not redis_client.sismember('visited_urls', full_url):
                    redis_client.rpush('m_sohu_task', full_url)

    def extract(self, html_page):
        pass

    def store(self, data_dict):
        pass


class SpiderThread(Thread):

    def __init__(self, name, spider):
        super().__init__(name=name, daemon=True)
        self.spider = spider

    def run(self):
        while True:
            current_url = redis_client.lpop('m_sohu_task')
            while not current_url:
                # Nothing queued yet; back off briefly instead of
                # hammering Redis in a tight loop.
                sleep(0.01)
                current_url = redis_client.lpop('m_sohu_task')
            self.spider.status = SpiderStatus.WORKING
            current_url = current_url.decode('utf-8')
            if not redis_client.sismember('visited_urls', current_url):
                redis_client.sadd('visited_urls', current_url)
                html_page = self.spider.fetch(current_url)
                if html_page not in [None, '']:
                    # Key pages by the SHA-1 of the URL so the same
                    # page is stored only once.
                    hasher = hasher_proto.copy()
                    hasher.update(current_url.encode('utf-8'))
                    doc_id = hasher.hexdigest()
                    if not sohu_data_coll.find_one({'_id': doc_id}):
                        sohu_data_coll.insert_one({
                            '_id': doc_id,
                            'url': current_url,
                            # Pickle + zlib-compress the HTML and wrap it in
                            # bson.Binary so MongoDB stores it as raw bytes.
                            'page': Binary(zlib.compress(pickle.dumps(html_page)))
                        })
                    self.spider.parse(html_page)
            self.spider.status = SpiderStatus.IDLE


def is_any_alive(spider_threads):
    return any(spider_thread.spider.status == SpiderStatus.WORKING
               for spider_thread in spider_threads)


# Connection details (including the password) are hard-coded for the demo.
redis_client = redis.Redis(host='120.77.222.217',
                           port=6379, password='1qaz2wsx')
mongo_client = pymongo.MongoClient(host='120.77.222.217', port=27017)
db = mongo_client.msohu
sohu_data_coll = db.webpages
# Prototype hasher that each thread copies instead of re-creating.
hasher_proto = sha1()


def main():
    if not redis_client.exists('m_sohu_task'):
        redis_client.rpush('m_sohu_task', 'http://m.sohu.com/')
    spider_threads = [SpiderThread('thread-%d' % i, Spider())
                      for i in range(10)]
    for spider_thread in spider_threads:
        spider_thread.start()

    # Sleep briefly so this wait loop does not spin at 100% CPU.
    while redis_client.exists('m_sohu_task') or is_any_alive(spider_threads):
        sleep(0.1)

    print('Over!')


if __name__ == '__main__':
    main()
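
A caveat on the distributed version: the sismember-then-sadd pair in run() takes two round trips, so two workers, possibly on different machines, can both pass the membership test and fetch the same URL. SADD itself reports how many members it actually added, which turns it into an atomic check-and-claim in a single call (similarly, BLPOP would replace the lpop polling loop with a blocking pop). A minimal sketch of the SADD idiom, with key names mirroring the ones above and a local Redis assumed:

import redis

redis_client = redis.Redis()  # assumes a local Redis instance for the sketch

def claim_url(url):
    # SADD returns 1 only for the caller that inserted the member first,
    # so exactly one worker wins each URL, even across processes.
    return redis_client.sadd('visited_urls', url) == 1

url = 'http://m.sohu.com/'
if claim_url(url):
    print('fetching', url)  # only the first claimant reaches this branch
else:
    print('already claimed, skipping')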