网易客户端爬虫

xiahui1986 · Dec 8, 2014 · dc26568 · dc26568
1 parent 9eb1cb8
commit dc26568
Show file tree

Hide file tree

Showing 8 changed files with 449 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,4 @@
+*.pyc
+*.swp
+/utils/logs/
+/spider/logs/
diff --git a/__init__.py b/__init__.py
@@ -0,0 +1,10 @@
+#!/usr/bin/env python
+#-*-coding:utf-8-*-
+
+
+def test():
+    """Placeholder hook; currently does nothing."""
+    pass
+
+# Call the placeholder test() when this file is executed directly.
+if __name__ == "__main__":
+    test()
+
diff --git a/database.sql b/database.sql
@@ -0,0 +1,34 @@
+-- Article records scraped from NetEase (originally "Qingsong Yike" data);
+-- item_type tells the columns apart.
+drop table if exists wangyi;
+create table wangyi(
+    id int not null auto_increment,
+    item_type varchar(32) not null,         -- column type, e.g. qingsongyike
+    title varchar(512) not null,
+    url varchar(512) not null,
+    docid varchar(32) not null,
+    cover_img varchar(512),
+    ptime varchar(32) not null,
+    today char(10) not null,
+    body text not null,
+    open_times int not null default 0,      -- number of times this page has been viewed; defaults to 0
+    KEY(id),
+    KEY(item_type),
+    CONSTRAINT docid_uniq PRIMARY KEY(item_type, docid)
+)ENGINE=InnoDB DEFAULT CHARSET=utf8;
+
+-- Run-control table: controls crawler runs and updates
+drop table if exists run_control;
+create table run_control(
+    id int not null auto_increment,
+    item varchar(32) not null,                              -- column type
+    total int not null,
+    one_page int not null,                       -- how many records a single request returns
+    last_run timestamp default current_timestamp on update current_timestamp,
+    KEY(id),
+    PRIMARY KEY(item)
+)ENGINE=InnoDB DEFAULT CHARSET=utf8;
+
+-- Seed one control row per crawled column (item, total, one_page).
+insert into run_control(item, total, one_page) values('qingsongyike', 400, 10);
+insert into run_control(item, total, one_page) values('huanqiukanke', 120, 10);
+insert into run_control(item, total, one_page) values('pangbianguaitan', 160, 10);
+insert into run_control(item, total, one_page) values('wangyigengtie', 380, 10);
diff --git a/spider/general_run.py b/spider/general_run.py
@@ -0,0 +1,55 @@
+#!/usr/bin/env python
+#-*-coding:utf-8-*-
+'''
+#=============================================================================
+# FileName:     general_run.py
+# Desc:         调用程序
+# Author:       leyle
+# Email:        [email protected]
+# HomePage:     http://www.leyle.com/
+# Git_page:     https://github.com/leyle
+# Version:      0.0.1
+# LastChange:   2014-12-08 10:15:23
+# History:      
+#=============================================================================
+'''
+
+"""
+    调用网易的各个栏目进行内容爬取
+"""
+
+from wangyi import  WANGYI
+import time
+
+def qingsongyike():
+    qsyk = WANGYI(list_url="http://c.m.163.com/nc/article/list/T1350383429665/", list_docid="T1350383429665", item_type="qingsongyike", title_key=["每日轻松一刻"])
+    qsyk.run()
+
+def pangbianguaitan():
+    pbgt = WANGYI(list_url="http://c.m.163.com/nc/article/list/T1396928667862/", list_docid="T1396928667862", item_type="pangbianguaitan", title_key=["胖编怪谈"])
+    pbgt.run()
+
+def huanqiukanke():
+    hqkk = WANGYI(list_url="http://c.m.163.com/nc/article/list/T1381482353221/", list_docid="T1381482353221", item_type="huanqiukanke", title_key=["今日环球侃客", "无德无信外国人"])
+    hqkk.run()
+
+def wangyizuigengtie():
+    wygt = WANGYI(list_url="http://c.m.163.com/nc/article/list/T1411719652285/", list_docid="T1411719652285", item_type="wangyigengtie", title_key=["网易新闻有态度"], key="adTitle")
+    wygt.run()
+
+def run_forever():
+    while True:
+        qingsongyike()
+        pangbianguaitan()
+        huanqiukanke()
+        wangyizuigengtie()
+
+        time.sleep(600)
+
+def test():
+    """Run only the wangyizuigengtie crawler, once."""
+    wangyizuigengtie()
+
+# Script entry point: start the endless crawl loop.
+if __name__ == "__main__":
+    run_forever()
+    # Alternative entry for a single manual run of one crawler:
+    #test()
+
diff --git a/spider/wangyi.py b/spider/wangyi.py
@@ -0,0 +1,161 @@
+#!/usr/bin/env python
+#-*-coding:utf-8-*-
+'''
+#=============================================================================
+# FileName:     wangyi.py
+# Desc:         爬虫主程序
+# Author:       leyle
+# Email:        [email protected]
+# HomePage:     http://www.leyle.com/
+# Git_page:     https://github.com/leyle
+# Version:      0.0.1
+# LastChange:   2014-12-08 10:15:07
+# History:      
+#=============================================================================
+'''
+
+"""
+    列表 url: http://c.m.163.com/nc/article/list/T1396928667862/150-10.html
+    胖编怪谈: http://c.m.163.com/nc/article/AC5QK4K400964JJM/full.html
+"""
+import sys
+import simplejson as json
+import MySQLdb
+import time
+
+# Put the parent of the current working directory on the import path so the
+# `utils` package below can be imported -- presumably the script is meant to
+# be started from inside spider/.
+sys.path.append("..")
+
+from utils import utils
+
+# Python 2 only: reload(sys) makes sys.setdefaultencoding reachable again so
+# the implicit str/unicode conversion codec can be switched to UTF-8.
+reload(sys)
+sys.setdefaultencoding('utf-8')
+
+class WANGYI(object):
+    def __init__(self, list_url, list_docid, item_type, title_key, key="title", start=0, end=10):
+        self._start = start
+        self._end = end
+        self._data = ''
+        self._list_url = list_url
+        self._list_docid = list_docid
+        self._item_type = item_type
+        self._title_key = title_key
+        self._key = key
+        self._docid = []
+        self._need_init = True
+
+    def run(self):
+        if self._need_init:
+            self.init_qsyk()
+
+        self.download_and_insert()
+
+    def clean_all(self):
+        """ 目的是清理掉全局变量的值，方便循环调用 """
+        self._data = ''
+        self._docid = []
+
+
+    def get_docid_from_json(self):
+        """ 根据指定的起始、结束区间，提取这个区间的每日轻松一刻的 url 关键元素 """
+        url = self._list_url + str(self._start) + "-" + str(self._end) + ".html"
+        self._data = utils.download_page(url)
+        if self._data:
+            self._data = json.loads(self._data)
+            if self._data.has_key(self._list_docid):
+                self._data = self._data[self._list_docid]
+                self.extract_docid()
+
+    def extract_docid(self):
+        if self._data:
+            for d in self._data:
+                for title in self._title_key:
+                    if str(d[self._key]).find(title) != -1:
+                        tmp = {}
+                        tmp["docid"] = d['docid']
+                        tmp["cover_img"] = d['imgsrc'] if d.has_key('imgsrc') else ''
+
+                        self._docid.append(tmp)
+
+    def download_and_insert(self):
+        if not self._docid:
+            self.get_docid_from_json()
+
+        if self._docid:
+            for docid in self._docid:
+                self.get_qsyk_and_insert(docid)
+
+        self.clean_all()
+
+    def get_qsyk_and_insert(self, docid):
+        cover_img = MySQLdb.escape_string(docid['cover_img'])
+        docid = docid['docid']
+
+        if self.db_has_exist(docid):
+            return
+
+        url = "http://c.3g.163.com/nc/article/%s/full.html" % str(docid)
+        data = utils.download_page(url, True)
+
+        if data:
+            data = data[docid]
+            if data:
+                ptime = data['ptime']
+                today = ptime.split(' ')[0]
+                imgs = data['img']
+                body = data['body'].encode('utf-8')
+
+                title = data['title']
+                title = title.replace(' ', '')
+                title = title.replace('（', '(')
+                title = title.replace('）', ')')
+                title = title.replace('(', '-')
+                title = title.replace(')', '')
+                title_hash = utils.url_hash(str(title))
+
+                for img in imgs:
+                    body = body.replace(img['ref'], "<img src=\"" + img['src'] + "\"/><hr>")
+
+                body = body.replace('%', '%%')
+                body = MySQLdb.escape_string(body)
+                sql = "insert into wangyi(item_type, title, url, docid, cover_img, ptime, today, body) values('%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s')" % (self._item_type, title, url, docid, cover_img, ptime, today, body)
+                utils.insert_mysql(sql)
+
+    def db_has_exist(self, docid):
+        sql = "select * from wangyi where docid='%s' and item_type='%s'" % (str(docid), self._item_type)
+        if utils.query_mysql(sql):
+            return True
+        else:
+            return False
+
+    def init_qsyk(self):
+        """ 检查 run_control 表中 total 数据是否是0，如果不是，就运行程序，直到满足了 total 为止，并将 total 置为 0 """
+        if self._need_init:
+            sql = "select total, one_page from run_control where item='%s'" % (self._item_type)
+            ret = utils.query_mysql(sql)
+
+            total = 0
+            one_page = 0
+            if ret:
+                total = int(ret[0]['total'])
+                one_page = int(ret[0]['one_page'])
+
+            if total > 0:
+                for i in range(0, total, one_page):
+                    self._start = i
+                    self.download_and_insert()
+
+                sql = "update run_control set total=0 where item='%s'" % (self._item_type)
+                utils.update_mysql(sql)
+
+            self._need_init = False
+
+def test():
+    #def __init__(self, list_url, list_docid, item_type, title_key, start=0, end=10):
+    #qsyk = WANGYI(list_url="http://c.m.163.com/nc/article/list/T1396928667862/", list_docid="T1396928667862", item_type="pangbianguaitan", title_key=["胖编怪谈"])
+    #qsyk = WANGYI(list_url="http://c.m.163.com/nc/article/list/T1350383429665/", list_docid="T1350383429665", item_type="qingsongyike", title_key=["每日轻松一刻"])
+    qsyk = WANGYI(list_url="http://c.m.163.com/nc/article/list/T1381482353221/", list_docid="T1381482353221", item_type="huanqiukanke", title_key=["今日环球侃客", "无德无信外国人"])
+    qsyk.run()
+
+# Running this module directly performs the single manual crawl in test().
+if __name__ == "__main__":
+    test()
+
diff --git a/utils/__init__.py b/utils/__init__.py
@@ -0,0 +1,10 @@
+#!/usr/bin/env python
+#-*-coding:utf-8-*-
+
+
+def test():
+    """Placeholder hook; currently does nothing."""
+    pass
+
+# Call the placeholder test() when this file is executed directly.
+if __name__ == "__main__":
+    test()
+
diff --git a/utils/mylogger.py b/utils/mylogger.py
@@ -0,0 +1,56 @@
+#!/usr/bin/env python
+#-*-coding:utf-8-*-
+'''
+#=============================================================================
+# FileName:     mylogger.py
+# Desc:         日志记录函数，可以滚动打包日志
+# Author:       leyle
+# Email:        [email protected]
+# HomePage:     http://www.leyle.com/
+# Git_page:     https://github.com/leyle
+# Version:      0.0.1
+# LastChange:   2014-12-08 10:13:38
+# History:      
+#=============================================================================
+'''
+
+import logging
+import logging.handlers
+import sys
+import os
+import time
+
+# Layout of each log record: logger name, level, timestamp, message.
+LOGGING_MSG_FORMAT = "%(name)s %(levelname)s %(asctime)s: %(message)s"
+# Date format handed to logging.Formatter for %(asctime)s.
+LOGGING_DATE_FORMAT = "%Y-%m-%d %H:%M:%S"
+MAXBYTE = 1024*1024*50  # 50M; passed as maxBytes to the rotating handler
+BACKUPCOUNT = 20  # maximum number of rotated backups (backupCount)
+
+def get_logger(logname):
+    root_logger = logging.getLogger(logname)
+    if len(root_logger.handlers) == 0:
+        path = os.path.join(sys.path[0], 'logs/')
+        if not os.path.isdir(path):
+            os.mkdir(path)
+        filename = path + logname + ".log"
+        handler = logging.handlers.RotatingFileHandler(
+                    filename,
+                    mode = "a",
+                    maxBytes = MAXBYTE,
+                    backupCount = BACKUPCOUNT,
+                    encoding = "utf-8"
+                    )
+        fmter = logging.Formatter(LOGGING_MSG_FORMAT, LOGGING_DATE_FORMAT)
+        handler.setFormatter(fmter)
+        root_logger.addHandler(handler)
+        root_logger.setLevel(logging.DEBUG)
+
+    line_name = "%s" % logname
+    return logging.getLogger(line_name)
+
+def test():
+    mylog = get_logger("log_name")
+    for i in range(0, 1000):
+        mylog.info("%d" % i)
+
+# Running this module directly exercises the logger via test().
+if __name__ == "__main__":
+    test()