forked from leokelly/163spider
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
8 changed files
with
449 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
*.pyc | ||
*.swp | ||
/utils/logs/ | ||
/spider/logs/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
#!/usr/bin/env python | ||
#-*-coding:utf-8-*- | ||
|
||
|
||
def test():
    """Placeholder entry point for this module; intentionally does nothing."""
    return None


# Allow the module to be executed directly as a script.
if __name__ == "__main__":
    test()
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
-- Article records crawled from the NetEase (163.com) columns.
DROP TABLE IF EXISTS wangyi;
CREATE TABLE wangyi (
    id         INT          NOT NULL AUTO_INCREMENT,
    item_type  VARCHAR(32)  NOT NULL,            -- column type, e.g. qingsongyike
    title      VARCHAR(512) NOT NULL,
    url        VARCHAR(512) NOT NULL,
    docid      VARCHAR(32)  NOT NULL,
    cover_img  VARCHAR(512),
    ptime      VARCHAR(32)  NOT NULL,
    today      CHAR(10)     NOT NULL,
    body       TEXT         NOT NULL,
    open_times INT          NOT NULL DEFAULT 0,  -- number of times this page has been viewed; defaults to 0
    KEY (id),
    KEY (item_type),
    -- An article is unique within its column.
    CONSTRAINT docid_uniq PRIMARY KEY (item_type, docid)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
|
||
-- Run-control table: drives the initial backfill and later updates.
DROP TABLE IF EXISTS run_control;
CREATE TABLE run_control (
    id       INT         NOT NULL AUTO_INCREMENT,
    item     VARCHAR(32) NOT NULL,  -- column type
    total    INT         NOT NULL,
    one_page INT         NOT NULL,  -- how many records a single request returns
    last_run TIMESTAMP   DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
    KEY (id),
    PRIMARY KEY (item)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
|
||
-- Seed one control row per crawled column (backfill size, page size).
INSERT INTO run_control (item, total, one_page) VALUES
    ('qingsongyike', 400, 10),
    ('huanqiukanke', 120, 10),
    ('pangbianguaitan', 160, 10),
    ('wangyigengtie', 380, 10);
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
#!/usr/bin/env python | ||
#-*-coding:utf-8-*- | ||
''' | ||
#============================================================================= | ||
# FileName: general_run.py | ||
# Desc: 调用程序 | ||
# Author: leyle | ||
# Email: [email protected] | ||
# HomePage: http://www.leyle.com/ | ||
# Git_page: https://github.com/leyle | ||
# Version: 0.0.1 | ||
# LastChange: 2014-12-08 10:15:23 | ||
# History: | ||
#============================================================================= | ||
''' | ||
|
||
""" | ||
调用网易的各个栏目进行内容爬取 | ||
""" | ||
|
||
from wangyi import WANGYI | ||
import time | ||
|
||
def qingsongyike():
    """Crawl the "qingsongyike" column (list id T1350383429665) once."""
    crawler = WANGYI(
        list_url="http://c.m.163.com/nc/article/list/T1350383429665/",
        list_docid="T1350383429665",
        item_type="qingsongyike",
        title_key=["每日轻松一刻"]
    )
    crawler.run()
|
||
def pangbianguaitan():
    """Crawl the "pangbianguaitan" column (list id T1396928667862) once."""
    crawler = WANGYI(
        list_url="http://c.m.163.com/nc/article/list/T1396928667862/",
        list_docid="T1396928667862",
        item_type="pangbianguaitan",
        title_key=["胖编怪谈"]
    )
    crawler.run()
|
||
def huanqiukanke():
    """Crawl the "huanqiukanke" column (list id T1381482353221) once.

    Two title keywords are accepted for this column.
    """
    crawler = WANGYI(
        list_url="http://c.m.163.com/nc/article/list/T1381482353221/",
        list_docid="T1381482353221",
        item_type="huanqiukanke",
        title_key=["今日环球侃客", "无德无信外国人"]
    )
    crawler.run()
|
||
def wangyizuigengtie():
    """Crawl the "wangyigengtie" column (list id T1411719652285) once.

    This column is matched on the ``adTitle`` field instead of the default
    ``title`` field.
    """
    crawler = WANGYI(
        list_url="http://c.m.163.com/nc/article/list/T1411719652285/",
        list_docid="T1411719652285",
        item_type="wangyigengtie",
        title_key=["网易新闻有态度"],
        key="adTitle"
    )
    crawler.run()
|
||
def run_forever():
    """Crawl every configured column in an endless loop, pausing 10 minutes
    between rounds.

    Each column is run in isolation: an exception raised by one column is
    printed (with traceback) and the loop carries on with the next column,
    so a single transient failure no longer terminates the whole crawler.
    KeyboardInterrupt / SystemExit are not caught and still stop the loop.
    """
    # Local import keeps this block self-contained (no file-level change).
    import traceback

    jobs = (qingsongyike, pangbianguaitan, huanqiukanke, wangyizuigengtie)

    while True:
        for job in jobs:
            try:
                job()
            except Exception:
                # Top-level boundary of a long-running loop: report and go on.
                traceback.print_exc()

        time.sleep(600)
|
||
def test():
    """Run a single crawl of the "wangyigengtie" column for manual checking."""
    return wangyizuigengtie()
|
||
# Script entry point: start the endless crawl loop.
if __name__ == "__main__":
    run_forever()
    # For a one-off manual check, call test() here instead of run_forever().
    #test()
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,161 @@ | ||
#!/usr/bin/env python | ||
#-*-coding:utf-8-*- | ||
''' | ||
#============================================================================= | ||
# FileName: wangyi.py | ||
# Desc: 爬虫主程序 | ||
# Author: leyle | ||
# Email: [email protected] | ||
# HomePage: http://www.leyle.com/ | ||
# Git_page: https://github.com/leyle | ||
# Version: 0.0.1 | ||
# LastChange: 2014-12-08 10:15:07 | ||
# History: | ||
#============================================================================= | ||
''' | ||
|
||
""" | ||
列表 url: http://c.m.163.com/nc/article/list/T1396928667862/150-10.html | ||
胖编怪谈: http://c.m.163.com/nc/article/AC5QK4K400964JJM/full.html | ||
""" | ||
import sys | ||
import simplejson as json | ||
import MySQLdb | ||
import time | ||
|
||
sys.path.append("..") | ||
|
||
from utils import utils | ||
|
||
reload(sys) | ||
sys.setdefaultencoding('utf-8') | ||
|
||
class WANGYI(object):
    """Crawler for one NetEase (163.com) news column.

    The listing URL is ``list_url + "<start>-<end>.html"``; entries of the
    listing whose ``key`` field contains any of the ``title_key`` keywords are
    downloaded from ``http://c.3g.163.com/nc/article/<docid>/full.html`` and
    inserted into the ``wangyi`` table through the ``utils`` helpers.

    Parameters:
        list_url:   base URL of the column listing (ends with "/").
        list_docid: key under which the listing JSON stores its entries.
        item_type:  column name stored in ``wangyi.item_type`` and looked up
                    in ``run_control.item``.
        title_key:  list of keywords; an entry is kept when its ``key`` field
                    contains at least one of them.
        key:        name of the listing field to match against (default
                    ``"title"``).
        start, end: the two numbers placed in the listing URL.
                    NOTE(review): the sample URL in this file
                    (".../150-10.html") suggests offset and page size rather
                    than a start/end range - confirm.
    """

    def __init__(self, list_url, list_docid, item_type, title_key, key="title", start=0, end=10):
        self._start = start
        self._end = end
        self._data = ''
        self._list_url = list_url
        self._list_docid = list_docid
        self._item_type = item_type
        self._title_key = title_key
        self._key = key
        self._docid = []
        self._need_init = True

    @staticmethod
    def _escape(value):
        """Return ``value`` as a string escaped for use inside a quoted SQL literal."""
        return MySQLdb.escape_string(str(value))

    def run(self):
        """Perform the one-time backfill if still pending, then crawl one page."""
        if self._need_init:
            self.init_qsyk()

        self.download_and_insert()

    def clean_all(self):
        """Reset the per-page state so the instance can be reused for another page."""
        self._data = ''
        self._docid = []

    def get_docid_from_json(self):
        """Download the listing page for the current start/end values and
        collect the matching entries into ``self._docid``.

        Empty downloads, invalid JSON, or a listing without the expected
        ``list_docid`` key are treated as "nothing to do".
        """
        url = self._list_url + str(self._start) + "-" + str(self._end) + ".html"
        raw = utils.download_page(url)
        self._data = ''
        if not raw:
            return

        try:
            parsed = json.loads(raw)
        except ValueError:
            # Not JSON (e.g. an error page); skip this page.
            return

        if not isinstance(parsed, dict) or self._list_docid not in parsed:
            return

        self._data = parsed[self._list_docid]
        self.extract_docid()

    def extract_docid(self):
        """Append ``{"docid", "cover_img"}`` for every listing entry whose
        ``key`` field contains one of the configured keywords.

        Each entry is added at most once even when several keywords match;
        entries lacking a ``docid`` or the ``key`` field are skipped.
        """
        if not self._data:
            return

        for entry in self._data:
            if not isinstance(entry, dict) or 'docid' not in entry:
                continue

            text = str(entry.get(self._key) or '')
            if any(text.find(keyword) != -1 for keyword in self._title_key):
                self._docid.append({
                    "docid": entry['docid'],
                    "cover_img": entry['imgsrc'] if 'imgsrc' in entry else '',
                })

    def download_and_insert(self):
        """Fetch the listing if nothing is queued, store every queued article,
        then clear the per-page state."""
        if not self._docid:
            self.get_docid_from_json()

        for docid in self._docid:
            self.get_qsyk_and_insert(docid)

        self.clean_all()

    def get_qsyk_and_insert(self, docid):
        """Download one article and insert it into ``wangyi``.

        ``docid`` is a dict with the keys ``docid`` and ``cover_img`` as built
        by ``extract_docid``. Articles already present in the table, empty
        downloads, and payloads missing required fields are skipped.
        """
        cover_img = self._escape(docid['cover_img'])
        docid = docid['docid']

        if self.db_has_exist(docid):
            return

        url = "http://c.3g.163.com/nc/article/%s/full.html" % str(docid)
        # NOTE(review): the second argument presumably asks the helper for
        # parsed JSON - confirm against utils.download_page.
        data = utils.download_page(url, True)
        if not data:
            return

        try:
            article = data[docid]
        except (KeyError, TypeError):
            return
        if not article:
            return

        try:
            ptime = article['ptime']
            body = article['body'].encode('utf-8')
            title = article['title']
        except KeyError:
            # Incomplete article payload; the table columns are NOT NULL.
            return
        today = ptime.split(' ')[0]
        imgs = article.get('img') or []

        # Strip spaces and turn "(...)" into "-...". The full-width pair is
        # handled as well: in the reviewed text those two replacements were
        # ASCII-to-ASCII no-ops, presumably flattened full-width parentheses
        # - TODO confirm.
        for old, new in ((' ', ''), ('(', '('), (')', ')'), ('(', '-'), (')', '')):
            title = title.replace(old, new)

        # Swap every image placeholder in the body for an <img> tag.
        for img in imgs:
            body = body.replace(img['ref'], "<img src=\"" + img['src'] + "\"/><hr>")

        # NOTE(review): '%' is doubled as in the original code; whether this
        # is needed depends on how utils.insert_mysql executes the statement.
        body = body.replace('%', '%%')

        # Every interpolated value is escaped so that quotes in titles, urls
        # etc. cannot break (or inject into) the statement.
        values = (
            self._escape(self._item_type),
            self._escape(title),
            self._escape(url),
            self._escape(docid),
            cover_img,
            self._escape(ptime),
            self._escape(today),
            self._escape(body),
        )
        sql = "insert into wangyi(item_type, title, url, docid, cover_img, ptime, today, body) values('%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s')" % values
        utils.insert_mysql(sql)

    def db_has_exist(self, docid):
        """Return True when an article with this docid is already stored for
        this column, False otherwise."""
        sql = "select * from wangyi where docid='%s' and item_type='%s'" % (self._escape(docid), self._escape(self._item_type))
        return bool(utils.query_mysql(sql))

    def init_qsyk(self):
        """One-time backfill driven by the ``run_control`` table.

        When ``total`` for this column is greater than 0, listing pages are
        crawled at offsets 0, one_page, 2*one_page, ... below ``total``, and
        ``total`` is then set to 0 so the backfill is not repeated.
        Afterwards ``self._start`` is restored, so the regular crawl that
        follows in ``run()`` reads the page it was configured for rather than
        the last backfill offset.
        """
        if not self._need_init:
            return

        sql = "select total, one_page from run_control where item='%s'" % self._escape(self._item_type)
        ret = utils.query_mysql(sql)

        total = 0
        one_page = 0
        if ret:
            total = int(ret[0]['total'])
            one_page = int(ret[0]['one_page'])

        if total > 0:
            # range() rejects a zero step; fall back to a sane page size.
            step = one_page if one_page > 0 else 10
            saved_start = self._start
            try:
                for offset in range(0, total, step):
                    self._start = offset
                    self.download_and_insert()
            finally:
                self._start = saved_start

            sql = "update run_control set total=0 where item='%s'" % self._escape(self._item_type)
            utils.update_mysql(sql)

        self._need_init = False
|
||
def test():
    """Manual smoke test: run one crawl of the "huanqiukanke" column.

    Other columns can be tried by swapping in their list id, item_type and
    title keywords (e.g. list id T1396928667862 for "pangbianguaitan" or
    T1350383429665 for "qingsongyike").
    """
    crawler = WANGYI(
        list_url="http://c.m.163.com/nc/article/list/T1381482353221/",
        list_docid="T1381482353221",
        item_type="huanqiukanke",
        title_key=["今日环球侃客", "无德无信外国人"]
    )
    crawler.run()
|
||
# Script entry point: run the manual smoke test.
if __name__ == "__main__":
    test()
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
#!/usr/bin/env python | ||
#-*-coding:utf-8-*- | ||
|
||
|
||
def test():
    """Placeholder entry point for this module; intentionally does nothing."""
    return None


# Allow the module to be executed directly as a script.
if __name__ == "__main__":
    test()
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
#!/usr/bin/env python | ||
#-*-coding:utf-8-*- | ||
''' | ||
#============================================================================= | ||
# FileName: mylogger.py | ||
# Desc: 日志记录函数,可以滚动打包日志 | ||
# Author: leyle | ||
# Email: [email protected] | ||
# HomePage: http://www.leyle.com/ | ||
# Git_page: https://github.com/leyle | ||
# Version: 0.0.1 | ||
# LastChange: 2014-12-08 10:13:38 | ||
# History: | ||
#============================================================================= | ||
''' | ||
|
||
import logging | ||
import logging.handlers | ||
import sys | ||
import os | ||
import time | ||
|
||
LOGGING_MSG_FORMAT = "%(name)s %(levelname)s %(asctime)s: %(message)s"
LOGGING_DATE_FORMAT = "%Y-%m-%d %H:%M:%S"
MAXBYTE = 1024*1024*50  # rotate a log file once it reaches 50 MB
BACKUPCOUNT = 20  # maximum number of rotated backups to keep


def get_logger(logname):
    """Return the logger named ``logname``, configuring it on first use.

    On the first call for a given name a ``RotatingFileHandler`` writing to
    ``<sys.path[0]>/logs/<logname>.log`` (UTF-8, append mode, rotated at
    ``MAXBYTE`` bytes with ``BACKUPCOUNT`` backups) is attached and the level
    is set to DEBUG. Later calls return the same logger without adding
    another handler.

    Raises:
        OSError: if the ``logs`` directory cannot be created.
    """
    logger = logging.getLogger(logname)
    if not logger.handlers:
        log_dir = os.path.join(sys.path[0], 'logs')
        # Create-then-check avoids the race of checking first and creating
        # second when two processes start at the same time.
        try:
            os.makedirs(log_dir)
        except OSError:
            if not os.path.isdir(log_dir):
                raise

        handler = logging.handlers.RotatingFileHandler(
            os.path.join(log_dir, logname + ".log"),
            mode="a",
            maxBytes=MAXBYTE,
            backupCount=BACKUPCOUNT,
            encoding="utf-8"
        )
        handler.setFormatter(logging.Formatter(LOGGING_MSG_FORMAT, LOGGING_DATE_FORMAT))
        logger.addHandler(handler)
        logger.setLevel(logging.DEBUG)

    # logging.getLogger(name) always yields this same object, so return it
    # directly instead of looking it up again.
    return logger
|
||
def test():
    """Write the numbers 0..999 as INFO records to the "log_name" logger."""
    log = get_logger("log_name")
    for number in range(1000):
        log.info("%d" % number)
|
||
# Script entry point: exercise the logger when run directly.
if __name__ == "__main__":
    test()
Oops, something went wrong.