Upgrade to Scrapy 1.0
jiangzm committed May 12, 2016
1 parent c42497e commit 36c19bd
Showing 34 changed files with 115 additions and 103 deletions.
4 changes: 2 additions & 2 deletions alexa/alexa/pipelines.py
@@ -24,7 +24,7 @@ def process_item(self, item, spider):
         self.file.write(line)
         return item
 
-    def spider_closed(self, spider):
+    def close_spider(self, spider):
         self.file.close()
 
 
@@ -46,5 +46,5 @@ def process_item(self, item, spider):
         final_item = dict(item.items() + ritem.items())
         self.r.set(item['id'], final_item)
 
-    def spider_closed(self, spider):
+    def close_spider(self, spider):
         return
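Note on the rename: Scrapy looks up the open_spider/close_spider hooks on item pipelines by those exact method names, whereas a method called spider_closed only runs if it is explicitly connected to the spider_closed signal, so the output file here was previously never closed. A minimal sketch of a pipeline using the documented hook names; the class and output file name are illustrative, not this repository's code:

# Illustrative pipeline sketch for Scrapy 1.0+; not the repository's actual code.
import json
import codecs

class JsonLinesPipelineSketch(object):
    """Writes each scraped item as one JSON line, UTF-8 encoded."""

    def open_spider(self, spider):
        # Called once when the spider opens; Scrapy resolves this hook by name.
        self.file = codecs.open('items.jl', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        line = json.dumps(dict(item), ensure_ascii=False) + '\n'
        self.file.write(line)
        return item

    def close_spider(self, spider):
        # Called once when the spider closes; under the old spider_closed name
        # this would only run if wired to the spider_closed signal.
        self.file.close()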
8 changes: 4 additions & 4 deletions alexa/alexa/spiders/alexa_spider.py
@@ -6,12 +6,12 @@
 
 from scrapy.selector import Selector
 try:
-    from scrapy.spider import Spider
+    from scrapy.spiders import Spider
 except:
-    from scrapy.spider import BaseSpider as Spider
+    from scrapy.spiders import BaseSpider as Spider
 from scrapy.utils.response import get_base_url
-from scrapy.contrib.spiders import CrawlSpider, Rule
-from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor as sle
+from scrapy.spiders import CrawlSpider, Rule
+from scrapy.linkextractors import LinkExtractor as sle
 
 
 from alexa.items import *
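For reference, Scrapy 1.0 moved scrapy.spider to scrapy.spiders, folded the scrapy.contrib.* packages into top-level modules (scrapy.contrib.spiders becomes scrapy.spiders, scrapy.contrib.linkextractors becomes scrapy.linkextractors), and deprecated SgmlLinkExtractor in favour of LinkExtractor. A minimal CrawlSpider sketch with the new import paths; the spider name, domain, and URL patterns are placeholders rather than anything from this repository:

# Illustrative Scrapy 1.0+ CrawlSpider sketch; names and URLs are placeholders.
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class ExampleSpider(CrawlSpider):
    name = "example"
    allowed_domains = ["example.com"]
    start_urls = ["http://example.com/"]

    rules = [
        # LinkExtractor replaces the deprecated SgmlLinkExtractor; the common
        # keyword arguments (allow, deny, restrict_css, ...) stay compatible.
        Rule(LinkExtractor(allow=(r"/item/\d+$",)), callback="parse_item"),
        Rule(LinkExtractor(allow=(r"/list/",)), follow=True),
    ]

    def parse_item(self, response):
        # Scrapy 1.0 lets spiders yield plain dicts as anonymous items.
        yield {
            "url": response.url,
            "title": response.css("title::text").extract_first(),
        }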
4 changes: 2 additions & 2 deletions amazonbook/amazonbook/pipelines.py
@@ -24,7 +24,7 @@ def process_item(self, item, spider):
         self.file.write(line)
         return item
 
-    def spider_closed(self, spider):
+    def close_spider(self, spider):
         self.file.close()
 
 
@@ -46,5 +46,5 @@ def process_item(self, item, spider):
         final_item = dict(item.items() + ritem.items())
         self.r.set(item['id'], final_item)
 
-    def spider_closed(self, spider):
+    def close_spider(self, spider):
         return
8 changes: 4 additions & 4 deletions amazonbook/amazonbook/spiders/spider.py
@@ -6,12 +6,12 @@
 
 from scrapy.selector import Selector
 try:
-    from scrapy.spider import Spider
+    from scrapy.spiders import Spider
 except:
-    from scrapy.spider import BaseSpider as Spider
+    from scrapy.spiders import BaseSpider as Spider
 from scrapy.utils.response import get_base_url
-from scrapy.contrib.spiders import CrawlSpider, Rule
-from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor as sle
+from scrapy.spiders import CrawlSpider, Rule
+from scrapy.linkextractors import LinkExtractor as sle
 
 
 from amazonbook.items import *
4 changes: 2 additions & 2 deletions dmoz/dmoz/pipelines.py
@@ -24,7 +24,7 @@ def process_item(self, item, spider):
         self.file.write(line)
         return item
 
-    def spider_closed(self, spider):
+    def close_spider(self, spider):
         self.file.close()
 
 
@@ -46,5 +46,5 @@ def process_item(self, item, spider):
         final_item = dict(item.items() + ritem.items())
         self.r.set(item['id'], final_item)
 
-    def spider_closed(self, spider):
+    def close_spider(self, spider):
         return
8 changes: 4 additions & 4 deletions dmoz/dmoz/spiders/spider.py
@@ -6,12 +6,12 @@
 
 from scrapy.selector import Selector
 try:
-    from scrapy.spider import Spider
+    from scrapy.spiders import Spider
 except:
-    from scrapy.spider import BaseSpider as Spider
+    from scrapy.spiders import BaseSpider as Spider
 from scrapy.utils.response import get_base_url
-from scrapy.contrib.spiders import CrawlSpider, Rule
-from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor as sle
+from scrapy.spiders import CrawlSpider, Rule
+from scrapy.linkextractors import LinkExtractor as sle
 
 
 from dmoz.items import *
2 changes: 1 addition & 1 deletion doubanbook/doubanbook/pipelines.py
@@ -24,7 +24,7 @@ def process_item(self, item, spider):
         self.file.write(line)
         return item
 
-    def spider_closed(self, spider):
+    def close_spider(self, spider):
         self.file.close()
 
 
21 changes: 12 additions & 9 deletions doubanbook/doubanbook/spiders/douban_spider.py
@@ -4,12 +4,12 @@
 
 from scrapy.selector import Selector
 try:
-    from scrapy.spider import Spider
+    from scrapy.spiders import Spider
 except:
-    from scrapy.spider import BaseSpider as Spider
+    from scrapy.spiders import BaseSpider as Spider
 from scrapy.utils.response import get_base_url
-from scrapy.contrib.spiders import CrawlSpider, Rule
-from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor as sle
+from scrapy.spiders import CrawlSpider, Rule
+from scrapy.linkextractors import LinkExtractor as sle
 
 
 from doubanbook.items import *
@@ -20,12 +20,12 @@ class DoubanBookSpider(CrawlSpider):
     name = "doubanbook"
     allowed_domains = ["douban.com"]
     start_urls = [
-        "http://book.douban.com/tag/"
+        "https://book.douban.com/tag/"
    ]
     rules = [
-        Rule(sle(allow=("/subject/\d+/\?from=tag$")), callback='parse_2'),
-        Rule(sle(allow=("/tag/[^/]+/\?focus=book$", )), follow=True),
-        Rule(sle(allow=("/tag/$", )), follow=True),
+        Rule(sle(allow=("/subject/\d+$")), callback='parse_2'),
+        Rule(sle(allow=("/tag/[^/]+$", )), follow=True),
+        #Rule(sle(allow=("/tag/$", )), follow=True),
     ]
 
     def parse_2(self, response):
@@ -47,6 +47,9 @@ def parse_1(self, response):
         # url cannot encode to Chinese easily.. XXX
         info('parsed ' + str(response))
 
-    def _process_request(self, request):
+    def process_request(self, request):
         info('process ' + str(request))
         return request
+
+    def closed(self, reason):
+        info("DoubanBookSpider Closed:" + reason)
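The rule patterns change because Douban book pages are now reached over https as plain /subject/<id> URLs. On the renamed hook: in Scrapy's CrawlSpider a method like process_request only runs if a Rule passes it through its process_request argument (a callable or the name of a spider method); the rules above do not reference it, so the following is an assumed sketch of that wiring, not this spider's actual behaviour:

# Assumed sketch of Rule/process_request wiring in Scrapy 1.0; names and
# selectors are illustrative, not copied from the repository.
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class BookSpiderSketch(CrawlSpider):
    name = "book_sketch"
    allowed_domains = ["book.douban.com"]
    start_urls = ["https://book.douban.com/tag/"]

    rules = [
        Rule(LinkExtractor(allow=(r"/subject/\d+$",)),
             callback="parse_subject",
             process_request="process_request"),  # resolved to the method below
        Rule(LinkExtractor(allow=(r"/tag/[^/]+$",)), follow=True),
    ]

    def process_request(self, request):
        # In Scrapy 1.0 this hook receives the Request built from an extracted
        # link; return it (possibly modified) to keep it, or None to drop it.
        request.meta['from_rule'] = True
        return request

    def parse_subject(self, response):
        yield {"title": response.css("h1 span::text").extract_first()}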
8 changes: 6 additions & 2 deletions doubanmovie/doubanmovie/pipelines.py
@@ -24,9 +24,13 @@ def process_item(self, item, spider):
         self.file.write(line)
         return item
 
-    def spider_closed(self, spider):
+    def close_spider(self, spider):
+        print("JsonWithEncodingPipeline closed")
         self.file.close()
 
+    def open_spider(self, spider):
+        print("JsonWithEncodingPipeline opend")
+
 
 class RedisPipeline(object):
 
@@ -46,5 +50,5 @@ def process_item(self, item, spider):
         final_item = dict(item.items() + ritem.items())
         self.r.set(item['id'], final_item)
 
-    def spider_closed(self, spider):
+    def close_spider(self, spider):
         return
2 changes: 1 addition & 1 deletion doubanmovie/doubanmovie/settings.py
@@ -22,7 +22,7 @@
 #USER_AGENT = 'doubanmovie (+http://www.yourdomain.com)'
 
 DOWNLOADER_MIDDLEWARES = {
-    # 'misc.middleware.CustomHttpProxyMiddleware': 400,
+    #'misc.middleware.CustomHttpProxyMiddleware': 400,
     'misc.middleware.CustomUserAgentMiddleware': 401,
 }
 
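DOWNLOADER_MIDDLEWARES maps middleware class paths to priorities (lower numbers sit closer to the engine on the request path); only the commented-out proxy entry changes here. misc/middleware.py itself is not part of this commit, so the following user-agent middleware is an assumed sketch of the kind of class such an entry points to, not the project's implementation:

# Assumed sketch of a user-agent downloader middleware; the project's real
# misc/middleware.py is not shown in this commit.
import random


class CustomUserAgentMiddlewareSketch(object):
    USER_AGENTS = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11)",
    ]

    def process_request(self, request, spider):
        # Downloader middlewares see every outgoing request; returning None
        # lets processing continue with the header already set.
        request.headers.setdefault('User-Agent', random.choice(self.USER_AGENTS))
        return None

It would be enabled exactly like the entry above ('misc.middleware.CustomUserAgentMiddleware': 401), with the number controlling where it sits in the middleware chain.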
21 changes: 13 additions & 8 deletions doubanmovie/doubanmovie/spiders/spider.py
@@ -7,12 +7,12 @@
 
 from scrapy.selector import Selector
 try:
-    from scrapy.spider import Spider
+    from scrapy.spiders import Spider
 except:
-    from scrapy.spider import BaseSpider as Spider
+    from scrapy.spiders import BaseSpider as Spider
 from scrapy.utils.response import get_base_url
-from scrapy.contrib.spiders import CrawlSpider, Rule
-from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor as sle
+from scrapy.spiders import CrawlSpider, Rule
+from scrapy.linkextractors import LinkExtractor as sle
 
 
 from doubanmovie.items import *
@@ -24,17 +24,21 @@ class doubanmovieSpider(CommonSpider):
     name = "doubanmovie"
     allowed_domains = ["douban.com"]
     start_urls = [
-        "http://movie.douban.com/chart",
+        #"https://movie.douban.com/tag/",
+        "https://movie.douban.com/chart"
     ]
     rules = [
-        Rule(sle(allow=(".*movie.douban.com/subject/[0-9]+/$")), callback='parse_1', follow=True),
+        #Rule(sle(allow=("/tag/[0-9]{4}$")), follow=True),
+        #Rule(sle(allow=("/tag/[0-9]{4}/?start=[0-9]{2,4}&type=T$")), follow=True),
+        #Rule(sle(allow=("/subject/[0-9]+$")), callback='parse_1'),
+        Rule(sle(allow=("/subject/[0-9]+/$")), callback='parse_1', follow=True),
     ]
 
     list_css_rules = {
         '.linkto': {
             'url': 'a::attr(href)',
             'name': 'a::text',
         }
-    }
+    }
 
     list_css_rules_2 = {
@@ -54,5 +58,6 @@ class doubanmovieSpider(CommonSpider):
     def parse_1(self, response):
         info('Parse '+response.url)
         x = self.parse_with_rules(response, self.content_css_rules, dict)
-        print(repr(x).decode('raw_unicode_escape'))
+        return x
+        #print(repr(x).decode('raw_unicode_escape'))
         # return self.parse_with_rules(response, self.css_rules, doubanmovieItem)
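parse_with_rules and the list_css_rules/content_css_rules dicts come from the project's CommonSpider in misc/spider.py (only partially shown later in this commit); the idea is a nested mapping from a block-level CSS selector to per-field selectors. A simplified, assumed illustration of that idea, not the actual helper:

# Simplified, assumed illustration of the nested CSS-rule idea; the real
# CommonSpider.parse_with_rules in misc/spider.py is more general.
def apply_css_rules(response, rules):
    results = []
    for block_css, field_rules in rules.items():
        for block in response.css(block_css):
            item = {}
            for field, selector in field_rules.items():
                # Each leaf rule is a CSS selector string such as 'a::text'.
                item[field] = block.css(selector).extract_first()
            results.append(item)
    return results

# e.g. apply_css_rules(response, {'.linkto': {'url': 'a::attr(href)',
#                                             'name': 'a::text'}})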
4 changes: 2 additions & 2 deletions googlescholar/googlescholar/pipelines.py
@@ -24,7 +24,7 @@ def process_item(self, item, spider):
         self.file.write(line)
         return item
 
-    def spider_closed(self, spider):
+    def close_spider(self, spider):
         self.file.close()
 
 
@@ -46,5 +46,5 @@ def process_item(self, item, spider):
         final_item = dict(item.items() + ritem.items())
         self.r.set(item['id'], final_item)
 
-    def spider_closed(self, spider):
+    def close_spider(self, spider):
         return
8 changes: 4 additions & 4 deletions googlescholar/googlescholar/spiders/spider.py
@@ -7,12 +7,12 @@
 
 from scrapy.selector import Selector
 try:
-    from scrapy.spider import Spider
+    from scrapy.spiders import Spider
 except:
-    from scrapy.spider import BaseSpider as Spider
+    from scrapy.spiders import BaseSpider as Spider
 from scrapy.utils.response import get_base_url
-from scrapy.contrib.spiders import CrawlSpider, Rule
-from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor as sle
+from scrapy.spiders import CrawlSpider, Rule
+from scrapy.linkextractors import LinkExtractor as sle
 
 
 from googlescholar.items import *
2 changes: 1 addition & 1 deletion hrtencent/hrtencent/pipelines.py
@@ -20,5 +20,5 @@ def process_item(self, item, spider):
         self.file.write(line)
         return item
 
-    def spider_closed(self, spider):
+    def close_spider(self, spider):
         self.file.close()
8 changes: 4 additions & 4 deletions hrtencent/hrtencent/spiders/hrtencent_spider.py
@@ -4,12 +4,12 @@
 
 from scrapy.selector import Selector
 try:
-    from scrapy.spider import Spider
+    from scrapy.spiders import Spider
 except:
-    from scrapy.spider import BaseSpider as Spider
+    from scrapy.spiders import BaseSpider as Spider
 from scrapy.utils.response import get_base_url
-from scrapy.contrib.spiders import CrawlSpider, Rule
-from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor as sle
+from scrapy.spiders import CrawlSpider, Rule
+from scrapy.linkextractors import LinkExtractor as sle
 
 
 from hrtencent.items import *
4 changes: 2 additions & 2 deletions linkedin/linkedin/linkedin/spiders/LinkedinSpider.py
@@ -1,6 +1,6 @@
 from scrapy.selector import HtmlXPathSelector
-from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
-from scrapy.contrib.spiders import CrawlSpider, Rule
+from scrapy.linkextractors.sgml import SgmlLinkExtractor
+from scrapy.spiders import CrawlSpider, Rule
 from scrapy.http import Request
 from scrapy import log
 from linkedin.items import LinkedinItem, PersonProfileItem
2 changes: 1 addition & 1 deletion misc/proxy.py
@@ -5,7 +5,7 @@
 PROXIES = [
     #{"ip_port": "127.0.0.1:8087"}, #goagent
     #{"ip_port": "127.0.0.1:8118"}, #tor via privoxy
-    {"ip_port": "43.245.202.120:8080"}, #tor via privoxy
+    {"ip_port": "127.0.0.1:1080"}, #tor via privoxy
 ]
 
 FREE_PROXIES = [
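PROXIES is meant to be consumed by the CustomHttpProxyMiddleware referenced in the settings files; in Scrapy a per-request proxy is applied through request.meta['proxy']. Since misc/middleware.py is not included in this commit, the following is an assumed sketch of how such a middleware could use the list, not the project's implementation:

# Assumed sketch; the repository's CustomHttpProxyMiddleware is not part of
# this commit, so the implementation below is illustrative only.
import random

from misc.proxy import PROXIES


class HttpProxyMiddlewareSketch(object):
    def process_request(self, request, spider):
        # Scrapy's downloader honours request.meta['proxy'] for HTTP proxies.
        proxy = random.choice(PROXIES)
        request.meta['proxy'] = "http://%s" % proxy["ip_port"]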
8 changes: 4 additions & 4 deletions misc/spider.py
@@ -12,7 +12,7 @@
 from scrapy.spiders import BaseSpider as Spider
 from scrapy.utils.response import get_base_url
 from scrapy.spiders import CrawlSpider, Rule
-from scrapy.linkextractors.sgml import SgmlLinkExtractor as sle
+from scrapy.linkextractors import LinkExtractor as sle
 
 
 from .log import *
@@ -102,8 +102,8 @@ def traversal(self, sel, rules, item_class, item, items):
             self.traversal(i, nv, item_class, item, items)
 
     DEBUG=True
-    def debug(sth):
-        if DEBUG == True:
+    def debug(self, sth):
+        if self.DEBUG == True:
             print(sth)
 
     def deal_text(self, sel, item, force_1_item, k, v):
@@ -144,7 +144,7 @@ def dfs(self, sel, rules, item_class, force_1_item):
 
         items = []
         if item_class != dict:
-            self.traversal(sel, rules, item_class, None, items, force_1_item)
+            self.traversal(sel, rules, item_class, None, items)
         else:
             self.traversal_dict(sel, rules, item_class, None, items, force_1_item)
4 changes: 2 additions & 2 deletions proxylist/proxylist/pipelines.py
@@ -31,7 +31,7 @@ def process_item(self, item, spider):
         self.file.write(line)
         return item
 
-    def spider_closed(self, spider):
+    def close_spider(self, spider):
         self.file.close()
 
 
@@ -76,5 +76,5 @@ def process_item(self, item, spider):
         final_item = dict(item.items() + ritem.items())
         self.r.set(item['id'], final_item)
 
-    def spider_closed(self, spider):
+    def close_spider(self, spider):
         return
2 changes: 1 addition & 1 deletion proxylist/proxylist/spiders/spider.py
@@ -12,7 +12,7 @@
 from scrapy.spiders import BaseSpider as Spider
 from scrapy.utils.response import get_base_url
 from scrapy.spiders import CrawlSpider, Rule
-from scrapy.linkextractors.sgml import SgmlLinkExtractor as sle
+from scrapy.linkextractors import LinkExtractor as sle
 from scrapy.linkextractors import LinkExtractor as sle
 
 
4 changes: 2 additions & 2 deletions qqnews/qqnews/pipelines.py
@@ -24,7 +24,7 @@ def process_item(self, item, spider):
         self.file.write(line)
         return item
 
-    def spider_closed(self, spider):
+    def close_spider(self, spider):
         self.file.close()
 
 
@@ -46,5 +46,5 @@ def process_item(self, item, spider):
         final_item = dict(item.items() + ritem.items())
         self.r.set(item['id'], final_item)
 
-    def spider_closed(self, spider):
+    def close_spider(self, spider):
         return
8 changes: 4 additions & 4 deletions qqnews/qqnews/spiders/spider.py
@@ -4,12 +4,12 @@
 
 from scrapy.selector import Selector
 try:
-    from scrapy.spider import Spider
+    from scrapy.spiders import Spider
 except:
-    from scrapy.spider import BaseSpider as Spider
+    from scrapy.spiders import BaseSpider as Spider
 from scrapy.utils.response import get_base_url
-from scrapy.contrib.spiders import CrawlSpider, Rule
-from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor as sle
+from scrapy.spiders import CrawlSpider, Rule
+from scrapy.linkextractors import LinkExtractor as sle
 
 
 from qqnews.items import *