From 5f5869995cbabd9f9532b279a7f4a3e964c5d745 Mon Sep 17 00:00:00 2001
From: sunyongshi <337828064@163.com>
Date: Tue, 31 Jul 2018 17:09:07 +0800
Subject: [PATCH] biqukan would not open at the time, so I followed the
 tutorial and crawled biquge instead
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 one_hour_spider/biquge20180731.py | 54 ++++++++++++++++++++++++++++++
 1 file changed, 54 insertions(+)
 create mode 100644 one_hour_spider/biquge20180731.py

diff --git a/one_hour_spider/biquge20180731.py b/one_hour_spider/biquge20180731.py
new file mode 100644
index 00000000..5cc55760
--- /dev/null
+++ b/one_hour_spider/biquge20180731.py
@@ -0,0 +1,54 @@
+# -*- coding:utf-8 -*-
+import requests
+from bs4 import BeautifulSoup
+import os
+
+"""
+Crawl a novel from the biquge site www.biqubao.com. The site used in the
+original tutorial would not open for me at the time, so I followed the
+tutorial and crawled biquge instead.
+ 2018-07-31
+"""
+
+if __name__ == '__main__':
+    # Index page of the novel to crawl; change this URL for each run,
+    # and make sure the local save root below exists.
+    target = "https://www.biqubao.com/book/17570/"
+    # Local root directory for the crawled text
+    save_path = 'G:/pythonlearn'
+    # Site root, used to turn relative chapter links into absolute URLs
+    index_path = 'https://www.biqubao.com'
+
+    req = requests.get(url=target)
+    # The encoding requests guesses does not match the site's response,
+    # so switch to gbk, which the site actually uses.
+    print(req.encoding)
+    req.encoding = 'gbk'
+    # Parse the HTML
+    soup = BeautifulSoup(req.text, "html.parser")
+    list_tag = soup.find_all('div', id="list")
+    print('list_tag:', list_tag)
+    # Get the novel's title
+    story_title = list_tag[0].dl.dt.string
+    # Create a directory named after the novel if it does not exist yet
+    dir_path = os.path.join(save_path, story_title)
+    if not os.path.exists(dir_path):
+        os.mkdir(dir_path)
+    # Loop over the chapters, collecting each chapter's name and URL
+    for dd_tag in list_tag[0].dl.find_all('dd'):
+        # Chapter name
+        chapter_name = dd_tag.string
+        # Chapter URL (the href in the page is site-relative)
+        chapter_url = index_path + dd_tag.a.get('href')
+        # Fetch the chapter page and crawl the chapter body
+        chapter_req = requests.get(url=chapter_url)
+        chapter_req.encoding = 'gbk'
+        chapter_soup = BeautifulSoup(chapter_req.text, "html.parser")
+        # Locate the tag that holds the chapter body
+        content_tag = chapter_soup.div.find(id="content")
+        # Extract the text, replacing non-breaking spaces with newlines
+        content_text = str(content_tag.text.replace('\xa0', '\n'))
+        # Write this chapter to a txt file named after the chapter
+        with open(os.path.join(dir_path, chapter_name + '.txt'), 'w', encoding='utf-8') as f:
+            f.write('Source URL: ' + chapter_url + '\n')
+            f.write(content_text)
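
Note (not part of the patch): rather than hard-coding 'gbk', requests can
sniff the encoding from the response body via apparent_encoding, which is
handy if the site ever changes charset. A minimal sketch, assuming the same
target URL as the script:

    import requests

    req = requests.get("https://www.biqubao.com/book/17570/")
    # HTTP headers often omit the charset, so fall back to the
    # body-sniffed guess instead of requests' ISO-8859-1 default.
    req.encoding = req.apparent_encoding
    print(req.encoding)  # expected to be a GBK-family encoding here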
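
Note (not part of the patch): chapter titles scraped from the site can
contain characters that are invalid in Windows file names (e.g. '?', ':'),
which would make the open() call fail. The helper below is a hypothetical
hardening sketch, assuming the rest of the script is unchanged:

    import re

    def safe_filename(name):
        # Replace characters Windows forbids in file names with '_'
        return re.sub(r'[\\/:*?"<>|]', '_', name).strip()

    # usage inside the chapter loop:
    #   file_name = safe_filename(chapter_name) + '.txt'
    #   with open(os.path.join(dir_path, file_name), 'w',
    #             encoding='utf-8') as f:
    #       ...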