Skip to content

Commit

Permalink
网易客户端爬虫
Browse files Browse the repository at this point in the history
  • Loading branch information
leyle committed Dec 8, 2014
1 parent 9eb1cb8 commit dc26568
Show file tree
Hide file tree
Showing 8 changed files with 449 additions and 0 deletions.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
*.pyc
*.swp
/utils/logs/
/spider/logs/
10 changes: 10 additions & 0 deletions __init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#!/usr/bin/env python
#-*-coding:utf-8-*-


def test():
    """Placeholder self-test for the package; intentionally does nothing."""
    return None


if __name__ == "__main__":
    # Running the package file directly only triggers the no-op self-test.
    test()

34 changes: 34 additions & 0 deletions database.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
-- NetEase "Qingsong Yike" (relaxing moment) data records; rows for every crawled column are stored here
drop table if exists wangyi;
create table wangyi(
id int not null auto_increment,
item_type varchar(32) not null, -- column type, e.g. qingsongyike
title varchar(512) not null,
url varchar(512) not null,
docid varchar(32) not null,
cover_img varchar(512),
ptime varchar(32) not null, -- publish time string taken from the article JSON
today char(10) not null, -- the part of ptime before the first space (filled in by spider/wangyi.py)
body text not null,
open_times int not null default 0, -- number of times this page has been viewed; defaults to 0
KEY(id),
KEY(item_type),
-- one row per (column, article): the crawler checks this pair before inserting
CONSTRAINT docid_uniq PRIMARY KEY(item_type, docid)
)ENGINE=InnoDB DEFAULT CHARSET=utf8;

-- Run-control table: controls running and updating of the crawler
drop table if exists run_control;
create table run_control(
id int not null auto_increment,
item varchar(32) not null, -- column type
total int not null, -- number of list entries still to backfill; spider/wangyi.py resets it to 0 once the backfill is done
one_page int not null, -- how many records come back from one request
last_run timestamp default current_timestamp on update current_timestamp,
KEY(id),
PRIMARY KEY(item)
)ENGINE=InnoDB DEFAULT CHARSET=utf8;

-- Seed one run_control row per crawled column: (item, entries to backfill, page size).
insert into run_control(item, total, one_page) values
('qingsongyike', 400, 10),
('huanqiukanke', 120, 10),
('pangbianguaitan', 160, 10),
('wangyigengtie', 380, 10);
55 changes: 55 additions & 0 deletions spider/general_run.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
#!/usr/bin/env python
#-*-coding:utf-8-*-
'''
#=============================================================================
# FileName: general_run.py
# Desc: 调用程序
# Author: leyle
# Email: [email protected]
# HomePage: http://www.leyle.com/
# Git_page: https://github.com/leyle
# Version: 0.0.1
# LastChange: 2014-12-08 10:15:23
# History:
#=============================================================================
'''

"""
调用网易的各个栏目进行内容爬取
"""

from wangyi import WANGYI
import time

def qingsongyike():
    """Crawl the "qingsongyike" column (list T1350383429665) once."""
    crawler = WANGYI(
        list_url="http://c.m.163.com/nc/article/list/T1350383429665/",
        list_docid="T1350383429665",
        item_type="qingsongyike",
        title_key=["每日轻松一刻"],
    )
    crawler.run()

def pangbianguaitan():
    """Crawl the "pangbianguaitan" column (list T1396928667862) once."""
    crawler = WANGYI(
        list_url="http://c.m.163.com/nc/article/list/T1396928667862/",
        list_docid="T1396928667862",
        item_type="pangbianguaitan",
        title_key=["胖编怪谈"],
    )
    crawler.run()

def huanqiukanke():
    """Crawl the "huanqiukanke" column (list T1381482353221) once.

    Two title keywords are accepted for this column.
    """
    crawler = WANGYI(
        list_url="http://c.m.163.com/nc/article/list/T1381482353221/",
        list_docid="T1381482353221",
        item_type="huanqiukanke",
        title_key=["今日环球侃客", "无德无信外国人"],
    )
    crawler.run()

def wangyizuigengtie():
    """Crawl the "wangyigengtie" column (list T1411719652285) once.

    Unlike the other columns, list entries are matched on their ``adTitle``
    field instead of the default ``title`` field.
    """
    crawler = WANGYI(
        list_url="http://c.m.163.com/nc/article/list/T1411719652285/",
        list_docid="T1411719652285",
        item_type="wangyigengtie",
        title_key=["网易新闻有态度"],
        key="adTitle",
    )
    crawler.run()

def run_forever():
    """Crawl every column in turn, then sleep 600 seconds; repeat endlessly.

    Never returns; an exception raised by any column's crawl propagates and
    ends the loop.
    """
    columns = (qingsongyike, pangbianguaitan, huanqiukanke, wangyizuigengtie)
    while True:
        for crawl_column in columns:
            crawl_column()

        time.sleep(600)

def test():
    """Manual check: crawl only the "wangyigengtie" column, once."""
    wangyizuigengtie()


if __name__ == "__main__":
    # Call test() here instead to crawl a single column once and exit.
    run_forever()

161 changes: 161 additions & 0 deletions spider/wangyi.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
#!/usr/bin/env python
#-*-coding:utf-8-*-
'''
#=============================================================================
# FileName: wangyi.py
# Desc: 爬虫主程序
# Author: leyle
# Email: [email protected]
# HomePage: http://www.leyle.com/
# Git_page: https://github.com/leyle
# Version: 0.0.1
# LastChange: 2014-12-08 10:15:07
# History:
#=============================================================================
'''

"""
列表 url: http://c.m.163.com/nc/article/list/T1396928667862/150-10.html
胖编怪谈: http://c.m.163.com/nc/article/AC5QK4K400964JJM/full.html
"""
import sys
import simplejson as json
import MySQLdb
import time

# Make the parent directory importable so the sibling ``utils`` package can be
# found.  NOTE(review): ".." is resolved against the current working
# directory, so this presumably expects the script to be started from inside
# ``spider/`` -- confirm.
sys.path.append("..")

from utils import utils

# Python 2 only: reload ``sys`` to get ``setdefaultencoding`` back, then make
# UTF-8 the interpreter-wide default codec.  The implicit str/unicode
# conversions further down (e.g. ``str(...)`` applied to JSON text) depend on
# this setting.
reload(sys)
sys.setdefaultencoding('utf-8')

class WANGYI(object):
    """Crawler for one column ("item") of the NetEase mobile news API.

    One crawl downloads a page of the column's article list, keeps the
    entries whose ``key`` field contains one of the ``title_key`` substrings,
    then downloads every article that is not stored yet and inserts it into
    the ``wangyi`` table.  The first ``run()`` of an instance additionally
    performs the backfill described by the column's ``run_control`` row.
    """

    def __init__(self, list_url, list_docid, item_type, title_key, key="title", start=0, end=10):
        """Store the crawl configuration; performs no I/O.

        Args:
            list_url: Base URL of the column's list API; the page suffix
                ``<start>-<end>.html`` is appended to it verbatim.
            list_docid: Key of the list JSON object that holds the entries.
            item_type: Column name written to ``wangyi.item_type`` and looked
                up in ``run_control.item``.
            title_key: Substrings; a list entry is kept when its ``key``
                field contains at least one of them.
            key: Name of the list-entry field matched against ``title_key``.
            start: First number of the page suffix.
            end: Second number of the page suffix.  NOTE(review): the sample
                URL in this module is ``.../150-10.html``, so this is
                presumably a page size rather than an end index -- confirm.
        """
        self._start = start
        self._end = end
        self._data = ''          # raw / parsed list payload of the current page
        self._list_url = list_url
        self._list_docid = list_docid
        self._item_type = item_type
        self._title_key = title_key
        self._key = key
        self._docid = []         # pending entries: {"docid": ..., "cover_img": ...}
        self._need_init = True   # the backfill check has not run yet

    def run(self):
        """Run the pending backfill (first call only), then crawl one page."""
        if self._need_init:
            self.init_qsyk()

        self.download_and_insert()

    def clean_all(self):
        """Reset the per-page state so the instance can be used in a loop."""
        self._data = ''
        self._docid = []

    @staticmethod
    def _sql_escape(value):
        """Return ``value`` escaped for use inside a quoted SQL literal.

        Unicode text is encoded as UTF-8 first, so the result does not depend
        on the interpreter's default encoding; other non-string values are
        stringified.
        """
        if isinstance(value, type(u'')):
            value = value.encode('utf-8')
        elif not isinstance(value, bytes):
            value = str(value)
        return MySQLdb.escape_string(value)

    @staticmethod
    def _normalize_title(title):
        """Strip spaces from ``title``, turn "(" into "-" and drop ")".

        NOTE(review): the previous code first replaced "(" with "(" and ")"
        with ")", which are no-ops as written; they presumably targeted the
        full-width parentheses U+FF08 / U+FF09, so both forms are handled
        here -- confirm against the original file.
        """
        title = title.replace(' ', '')
        for opening in (u'\uff08', u'('):
            title = title.replace(opening, u'-')
        for closing in (u'\uff09', u')'):
            title = title.replace(closing, u'')
        return title

    def get_docid_from_json(self):
        """Download the list page ``<start>-<end>`` and collect its matches.

        Leaves ``self._docid`` untouched when the download is empty or the
        response does not contain the ``list_docid`` key.
        """
        url = self._list_url + str(self._start) + "-" + str(self._end) + ".html"
        self._data = utils.download_page(url)
        if not self._data:
            return

        payload = json.loads(self._data)
        # Only the entry list may reach extract_docid(); an unexpected
        # response (no list key, or not a JSON object) counts as an empty page.
        if isinstance(payload, dict) and self._list_docid in payload:
            self._data = payload[self._list_docid]
            self.extract_docid()
        else:
            self._data = ''

    def extract_docid(self):
        """Append every matching entry of ``self._data`` to ``self._docid``.

        An entry matches when its ``key`` field contains any ``title_key``
        substring.  Entries without that field are skipped, and an entry is
        appended once even when several substrings match.
        """
        if not self._data:
            return

        for entry in self._data:
            text = str(entry.get(self._key, ''))
            if any(wanted in text for wanted in self._title_key):
                self._docid.append({
                    "docid": entry['docid'],
                    "cover_img": entry.get('imgsrc', ''),
                })

    def download_and_insert(self):
        """Fetch the current list page if needed and store its new articles."""
        if not self._docid:
            self.get_docid_from_json()

        for entry in self._docid:
            self.get_qsyk_and_insert(entry)

        self.clean_all()

    def get_qsyk_and_insert(self, docid):
        """Download one article and insert it unless it is already stored.

        Args:
            docid: Entry built by ``extract_docid()``: a dict with the keys
                ``docid`` and ``cover_img``.
        """
        escape = self._sql_escape
        cover_img = escape(docid['cover_img'])
        docid = docid['docid']

        if self.db_has_exist(docid):
            return

        url = "http://c.3g.163.com/nc/article/%s/full.html" % str(docid)
        data = utils.download_page(url, True)
        if not data:
            return
        try:
            data = data[docid]
        except KeyError:
            # The response carries no article under this docid.
            return
        if not data:
            return

        ptime = data['ptime']
        today = ptime.split(' ')[0]
        title = self._normalize_title(data['title'])

        # Swap each image placeholder in the body for a real <img> tag.
        body = data['body']
        for img in data['img']:
            body = body.replace(img['ref'], "<img src=\"" + img['src'] + "\"/><hr>")

        # NOTE(review): percent signs are doubled exactly as before,
        # presumably because utils.insert_mysql %-formats the statement --
        # confirm.
        body = body.replace('%', '%%')

        # Every value originating from the remote JSON is escaped before it
        # is placed between quotes.
        sql = "insert into wangyi(item_type, title, url, docid, cover_img, ptime, today, body) values('%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s')" % (
            self._item_type, escape(title), escape(url), escape(docid),
            cover_img, escape(ptime), escape(today), escape(body))
        utils.insert_mysql(sql)

    def db_has_exist(self, docid):
        """Return True when this column already stores the article ``docid``."""
        sql = "select * from wangyi where docid='%s' and item_type='%s'" % (self._sql_escape(docid), self._item_type)
        return bool(utils.query_mysql(sql))

    def init_qsyk(self):
        """Backfill the column as described by its ``run_control`` row.

        When ``total`` is non-zero, list pages are crawled at offsets
        ``0, one_page, 2 * one_page, ...`` below ``total`` and ``total`` is
        then set to 0.  Runs at most once per instance.
        """
        if not self._need_init:
            return

        sql = "select total, one_page from run_control where item='%s'" % (self._item_type)
        ret = utils.query_mysql(sql)

        total = 0
        one_page = 0
        if ret:
            total = int(ret[0]['total'])
            one_page = int(ret[0]['one_page'])

        # A non-positive page size cannot drive range(); leave total alone.
        if total > 0 and one_page > 0:
            configured_start = self._start
            try:
                for offset in range(0, total, one_page):
                    self._start = offset
                    self.download_and_insert()
            finally:
                # Give the offset back so the regular crawl that follows
                # fetches the configured page, not the last backfill page.
                self._start = configured_start

            sql = "update run_control set total=0 where item='%s'" % (self._item_type)
            utils.update_mysql(sql)

        self._need_init = False

def test():
    """Manual smoke test: crawl the "huanqiukanke" column once.

    Performs real network and database access.  The parameters of the other
    columns are listed in general_run.py.
    """
    crawler = WANGYI(
        list_url="http://c.m.163.com/nc/article/list/T1381482353221/",
        list_docid="T1381482353221",
        item_type="huanqiukanke",
        title_key=["今日环球侃客", "无德无信外国人"],
    )
    crawler.run()


if __name__ == "__main__":
    test()

10 changes: 10 additions & 0 deletions utils/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#!/usr/bin/env python
#-*-coding:utf-8-*-


def test():
    """Placeholder self-test for the ``utils`` package; does nothing."""
    return None


if __name__ == "__main__":
    # Running the package file directly only triggers the no-op self-test.
    test()

56 changes: 56 additions & 0 deletions utils/mylogger.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#!/usr/bin/env python
#-*-coding:utf-8-*-
'''
#=============================================================================
# FileName: mylogger.py
# Desc: 日志记录函数,可以滚动打包日志
# Author: leyle
# Email: [email protected]
# HomePage: http://www.leyle.com/
# Git_page: https://github.com/leyle
# Version: 0.0.1
# LastChange: 2014-12-08 10:13:38
# History:
#=============================================================================
'''

import logging
import logging.handlers
import sys
import os
import time

LOGGING_MSG_FORMAT = "%(name)s %(levelname)s %(asctime)s: %(message)s"
LOGGING_DATE_FORMAT = "%Y-%m-%d %H:%M:%S"
MAXBYTE = 1024*1024*50  # rotate a log file once it reaches 50 MB
BACKUPCOUNT = 20  # maximum number of rotated backup files to keep


def get_logger(logname):
    """Return the logger called ``logname``, configuring it on first use.

    The first call for a given name attaches a rotating, UTF-8 encoded file
    handler writing to ``<sys.path[0]>/logs/<logname>.log`` and sets the
    level to DEBUG.  Later calls return the same logger without adding
    another handler.

    Args:
        logname: Logger name; also the base name of the log file.

    Returns:
        The ``logging.Logger`` registered under ``logname``.

    Raises:
        OSError: If the log directory cannot be created.
    """
    logger = logging.getLogger(logname)
    if logger.handlers:
        # Already configured by an earlier call.
        return logger

    log_dir = os.path.join(sys.path[0], 'logs')
    try:
        # Creates missing parent directories too.  Attempting the creation
        # and inspecting the failure avoids the check-then-create race
        # between processes that start at the same time.
        os.makedirs(log_dir)
    except OSError:
        if not os.path.isdir(log_dir):
            raise

    handler = logging.handlers.RotatingFileHandler(
        os.path.join(log_dir, logname + ".log"),
        mode="a",
        maxBytes=MAXBYTE,
        backupCount=BACKUPCOUNT,
        encoding="utf-8",
    )
    handler.setFormatter(logging.Formatter(LOGGING_MSG_FORMAT, LOGGING_DATE_FORMAT))
    logger.addHandler(handler)
    logger.setLevel(logging.DEBUG)
    return logger

def test():
    """Manual check: write the numbers 0..999 to ``log_name.log`` at INFO level."""
    demo_log = get_logger("log_name")
    for number in range(1000):
        demo_log.info("%d" % number)


if __name__ == "__main__":
    test()
Loading

0 comments on commit dc26568

Please sign in to comment.