add an example for crawling ziroom
刘畅 authored and 刘畅 committed Nov 25, 2016
1 parent bcd35d0 commit 0861ab7
Showing 7 changed files with 168 additions and 0 deletions.
11 changes: 11 additions & 0 deletions ziroom/scrapy.cfg
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = ziroom.settings

[deploy]
#url = http://localhost:6800/
project = ziroom
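
With the scaffolding above in place, the crawl can be started from the project root with the scrapy CLI, or programmatically. A minimal sketch of the programmatic route (not part of the commit; it assumes the spider name 'ziroom' registered in spiders/spider.py below):

# run from the directory containing scrapy.cfg so the project settings resolve
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('ziroom')  # spider name defined in spiders/spider.py
process.start()          # blocks until the crawl finishes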
Empty file added ziroom/ziroom/__init__.py
Empty file.
17 changes: 17 additions & 0 deletions ziroom/ziroom/items.py
@@ -0,0 +1,17 @@
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class ZiroomItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    room_id = scrapy.Field()
    room_price = scrapy.Field()
    room_name = scrapy.Field()
    modifyDate = scrapy.Field()
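
For reference, ZiroomItem instances behave like dicts keyed by the declared fields. A tiny sketch with hypothetical values (not part of the commit):

item = ZiroomItem()
item['room_id'] = 60001234       # hypothetical id
item['room_price'] = 2760        # hypothetical monthly price
item['room_name'] = u'example room'
print dict(item)                 # items convert cleanly to plain dicts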
51 changes: 51 additions & 0 deletions ziroom/ziroom/pipelines.py
@@ -0,0 +1,51 @@
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import json
import codecs
from collections import OrderedDict

import redis


class JsonWithEncodingPipeline(object):

    def __init__(self):
        self.file = codecs.open('data_utf8.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        line = json.dumps(OrderedDict(item), ensure_ascii=False, sort_keys=False) + "\n"
        self.file.write(line)
        return item

    def close_spider(self, spider):
        self.file.close()


class RedisPipeline(object):

    def __init__(self):
        self.r = redis.StrictRedis(host='localhost', port=6379)

    def process_item(self, item, spider):
        # the item field is room_id (ZiroomItem declares no 'id' field)
        if not item.get('room_id'):
            spider.logger.warning('item without room_id, not stored in redis')
            return item

        key = item['room_id']
        str_recorded_item = self.r.get(key)
        if str_recorded_item is None:
            final_item = dict(item)
        else:
            # merge the previously stored record with the fresh scrape;
            # fields from the new item take precedence
            ritem = json.loads(str_recorded_item)
            final_item = dict(ritem, **dict(item))
        # store as JSON: redis values must be strings/bytes, not dicts,
        # and json.loads is safer than eval on the way back out
        self.r.set(key, json.dumps(final_item))
        return item

    # close_spider is the hook Scrapy calls on pipelines; nothing to clean up here
    def close_spider(self, spider):
        pass
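
To spot-check what RedisPipeline stored, a record can be read back by its room_id key. A sketch assuming a local redis on the default port and a hypothetical id (not part of the commit):

import json
import redis

r = redis.StrictRedis(host='localhost', port=6379)
raw = r.get(60001234)      # hypothetical room_id used as the key
if raw is not None:
    print json.loads(raw)  # the merged record written by the pipeline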
33 changes: 33 additions & 0 deletions ziroom/ziroom/settings.py
@@ -0,0 +1,33 @@
# -*- coding: utf-8 -*-
# Scrapy settings for ziroom project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
#

import sys
import os
from os.path import dirname

# make the sibling "misc" package (two levels up) importable
path = dirname(dirname(os.path.abspath(os.path.dirname(__file__))))
sys.path.append(path)

BOT_NAME = 'ziroom'

SPIDER_MODULES = ['ziroom.spiders']
NEWSPIDER_MODULE = 'ziroom.spiders'

DOWNLOADER_MIDDLEWARES = {
    #'misc.middleware.CustomHttpProxyMiddleware': 400,
    'misc.middleware.CustomUserAgentMiddleware': 401,
}

ITEM_PIPELINES = {
    'ziroom.pipelines.JsonWithEncodingPipeline': 300,
    #'ziroom.pipelines.RedisPipeline': 301,
}

LOG_LEVEL = 'INFO'
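
Note that misc.middleware is not part of this commit; the sys.path tweak above only makes a sibling misc package importable. As a rough idea of what a user-agent middleware of that name typically does, a sketch follows (the body is an assumption, not the actual misc code):

import random

# hypothetical reconstruction of misc/middleware.py
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12)',
]

class CustomUserAgentMiddleware(object):
    def process_request(self, request, spider):
        # pick a random User-Agent for each outgoing request
        request.headers['User-Agent'] = random.choice(USER_AGENTS)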
4 changes: 4 additions & 0 deletions ziroom/ziroom/spiders/__init__.py
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
52 changes: 52 additions & 0 deletions ziroom/ziroom/spiders/spider.py
@@ -0,0 +1,52 @@
# -*- coding: utf-8 -*-
from scrapy.spiders import Spider
from scrapy import Request
import re
import time

from ziroom.items import ZiroomItem

class Parse(object):

    def __init__(self, response):
        self.response = response
        self.room_detail = response.xpath('//div[@class="room_detail_right"]')[0]
        self.room_info = ' '.join(self.room_detail.xpath('.//ul[@class="detail_room"]/li/text()').extract())
        self.metro_info = ''.join(self.room_detail.xpath('.//span[@id="lineList"]/text()').extract()).replace(' ', '').replace('\n', '')

    def getID(self):
        # the room id is the first number in the detail-page URL
        return int(re.findall(r'\d+', self.response.url)[0])

    def getName(self):
        return self.room_detail.xpath('.//h2/text()').extract()[0].replace(' ', '').replace('\n', '')

    def getPrice(self):
        # strip the leading currency symbol before converting
        room_price = int(self.room_detail.xpath('.//span[@class="room_price"]/text()').extract()[0][1:])
        if room_price < 500:
            # small figures are daily rates; convert to an approximate monthly price
            room_price *= 30
        return room_price


class PagesSpider(Spider):
    name = "ziroom"
    start_urls = ['http://www.ziroom.com/z/nl/z3.html?p=1']

    def parse(self, response):
        self.logger.info(response.url)
        page = re.findall(r'p=(\d+)', response.url)[0]

        houseList = response.xpath('//ul[@id="houseList"]/li')
        for each in houseList:
            # hrefs are protocol-relative ("//www..."); drop the leading slashes
            url = each.xpath('div/h3/a/@href').extract()[0][2:].encode('utf-8')
            yield Request('http://' + url, self.parseItem)

        # pagination is prepared but disabled; replacing 'p=' + page avoids
        # clobbering other digits in the URL. Uncomment the yield to crawl all pages.
        url = response.url
        url_new = url.replace('p=' + page, 'p=' + str(int(page) + 1))
        # yield Request(url_new, self.parse)

    def parseItem(self, response):
        p = Parse(response)
        item = ZiroomItem()
        item['modifyDate'] = int(time.time())
        item['room_id'] = p.getID()
        item['room_price'] = p.getPrice()
        item['room_name'] = p.getName()
        yield item
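
The disabled pagination step simply rewrites the p= query parameter. A worked example of the replacement on a hypothetical listing URL:

import re

url = 'http://www.ziroom.com/z/nl/z3.html?p=1'
page = re.findall(r'p=(\d+)', url)[0]                          # -> '1'
url_new = url.replace('p=' + page, 'p=' + str(int(page) + 1))
print url_new  # http://www.ziroom.com/z/nl/z3.html?p=2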
