From 5f5869995cbabd9f9532b279a7f4a3e964c5d745 Mon Sep 17 00:00:00 2001
From: sunyongshi <337828064@163.com>
Date: Tue, 31 Jul 2018 17:09:07 +0800
Subject: [PATCH] biqukan would not open at the time, so I followed the
 tutorial and crawled biquge instead
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 one_hour_spider/biquge20180731.py | 54 ++++++++++++++++++++++++++++++
 1 file changed, 54 insertions(+)
 create mode 100644 one_hour_spider/biquge20180731.py

diff --git a/one_hour_spider/biquge20180731.py b/one_hour_spider/biquge20180731.py
new file mode 100644
index 00000000..5cc55760
--- /dev/null
+++ b/one_hour_spider/biquge20180731.py
@@ -0,0 +1,54 @@
+# -*- coding:utf-8 -*-
+import requests
+from bs4 import BeautifulSoup
+import os
+
+"""
+Crawl a novel from the biquge site www.biqubao.com. The site used in the
+original tutorial would not open for me at the time, so I followed the
+tutorial and crawled biquge instead.
+ 2018-07-31
+"""
+
+if __name__ == '__main__':
+    # Index page of the novel to crawl; change this URL for each run,
+    # and make sure the local save root below exists.
+    target = "https://www.biqubao.com/book/17570/"
+    # Local root directory for the crawled text
+    save_path = 'G:/pythonlearn'
+    # Site root, used to turn relative chapter links into absolute URLs
+    index_path = 'https://www.biqubao.com'
+
+    req = requests.get(url=target)
+    # The encoding requests guesses does not match the site's response,
+    # so switch to gbk, which the site actually uses.
+    print(req.encoding)
+    req.encoding = 'gbk'
+    # Parse the HTML
+    soup = BeautifulSoup(req.text, "html.parser")
+    list_tag = soup.find_all('div', id="list")
+    print('list_tag:', list_tag)
+    # Get the novel's title
+    story_title = list_tag[0].dl.dt.string
+    # Create a directory named after the novel if it does not exist yet
+    dir_path = os.path.join(save_path, story_title)
+    if not os.path.exists(dir_path):
+        os.mkdir(dir_path)
+    # Loop over the chapters, collecting each chapter's name and URL
+    for dd_tag in list_tag[0].dl.find_all('dd'):
+        # Chapter name
+        chapter_name = dd_tag.string
+        # Chapter URL (the href in the page is site-relative)
+        chapter_url = index_path + dd_tag.a.get('href')
+        # Fetch the chapter page and crawl the chapter body
+        chapter_req = requests.get(url=chapter_url)
+        chapter_req.encoding = 'gbk'
+        chapter_soup = BeautifulSoup(chapter_req.text, "html.parser")
+        # Locate the tag that holds the chapter body
+        content_tag = chapter_soup.div.find(id="content")
+        # Extract the text, replacing non-breaking spaces with newlines
+        content_text = str(content_tag.text.replace('\xa0', '\n'))
+        # Write this chapter to a txt file named after the chapter
+        with open(os.path.join(dir_path, chapter_name + '.txt'), 'w', encoding='utf-8') as f:
+            f.write('Source URL: ' + chapter_url + '\n')
+            f.write(content_text)
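
Note (not part of the patch): rather than hard-coding 'gbk', requests can
sniff the encoding from the response body via apparent_encoding, which is
handy if the site ever changes charset. A minimal sketch, assuming the same
target URL as the script:

    import requests

    req = requests.get("https://www.biqubao.com/book/17570/")
    # HTTP headers often omit the charset, so fall back to the
    # body-sniffed guess instead of requests' ISO-8859-1 default.
    req.encoding = req.apparent_encoding
    print(req.encoding)  # expected to be a GBK-family encoding here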
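
Note (not part of the patch): chapter titles scraped from the site can
contain characters that are invalid in Windows file names (e.g. '?', ':'),
which would make the open() call fail. The helper below is a hypothetical
hardening sketch, assuming the rest of the script is unchanged:

    import re

    def safe_filename(name):
        # Replace characters Windows forbids in file names with '_'
        return re.sub(r'[\\/:*?"<>|]', '_', name).strip()

    # usage inside the chapter loop:
    #   file_name = safe_filename(chapter_name) + '.txt'
    #   with open(os.path.join(dir_path, file_name), 'w',
    #             encoding='utf-8') as f:
    #       ...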