add an example for crawling ziroom
刘畅 authored and 刘畅 committed Nov 25, 2016
1 parent bcd35d0 commit 0861ab7
Showing 7 changed files with 168 additions and 0 deletions.
11 changes: 11 additions & 0 deletions ziroom/scrapy.cfg
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = ziroom.settings

[deploy]
#url = http://localhost:6800/
project = ziroom
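
With the scaffolding above in place, the crawl can be started from the project root with the scrapy CLI, or programmatically. A minimal sketch of the programmatic route (not part of the commit; it assumes the spider name 'ziroom' registered in spiders/spider.py below):

# run from the directory containing scrapy.cfg so the project settings resolve
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('ziroom')  # spider name defined in spiders/spider.py
process.start()          # blocks until the crawl finishes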
Empty file added ziroom/ziroom/__init__.py
Empty file.
17 changes: 17 additions & 0 deletions ziroom/ziroom/items.py
@@ -0,0 +1,17 @@
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class ZiroomItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    room_id = scrapy.Field()
    room_price = scrapy.Field()
    room_name = scrapy.Field()
    modifyDate = scrapy.Field()
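
For reference, ZiroomItem instances behave like dicts keyed by the declared fields. A tiny sketch with hypothetical values (not part of the commit):

item = ZiroomItem()
item['room_id'] = 60001234       # hypothetical id
item['room_price'] = 2760        # hypothetical monthly price
item['room_name'] = u'example room'
print dict(item)                 # items convert cleanly to plain dicts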
51 changes: 51 additions & 0 deletions ziroom/ziroom/pipelines.py
@@ -0,0 +1,51 @@
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import json
import codecs
from collections import OrderedDict

import redis


class JsonWithEncodingPipeline(object):

    def __init__(self):
        self.file = codecs.open('data_utf8.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        line = json.dumps(OrderedDict(item), ensure_ascii=False, sort_keys=False) + "\n"
        self.file.write(line)
        return item

    def close_spider(self, spider):
        self.file.close()


class RedisPipeline(object):

    def __init__(self):
        self.r = redis.StrictRedis(host='localhost', port=6379)

    def process_item(self, item, spider):
        # the item field is room_id (ZiroomItem declares no 'id' field)
        if not item.get('room_id'):
            spider.logger.warning('item without room_id, not stored in redis')
            return item

        key = item['room_id']
        str_recorded_item = self.r.get(key)
        if str_recorded_item is None:
            final_item = dict(item)
        else:
            # merge the previously stored record with the fresh scrape;
            # fields from the new item take precedence
            ritem = json.loads(str_recorded_item)
            final_item = dict(ritem, **dict(item))
        # store as JSON: redis values must be strings/bytes, not dicts,
        # and json.loads is safer than eval on the way back out
        self.r.set(key, json.dumps(final_item))
        return item

    # close_spider is the hook Scrapy calls on pipelines; nothing to clean up here
    def close_spider(self, spider):
        pass
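
To spot-check what RedisPipeline stored, a record can be read back by its room_id key. A sketch assuming a local redis on the default port and a hypothetical id (not part of the commit):

import json
import redis

r = redis.StrictRedis(host='localhost', port=6379)
raw = r.get(60001234)      # hypothetical room_id used as the key
if raw is not None:
    print json.loads(raw)  # the merged record written by the pipeline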
33 changes: 33 additions & 0 deletions ziroom/ziroom/settings.py
@@ -0,0 +1,33 @@
# -*- coding: utf-8 -*-
# Scrapy settings for ziroom project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
#

import sys
import os
from os.path import dirname

# make the sibling "misc" package (two levels up) importable
path = dirname(dirname(os.path.abspath(os.path.dirname(__file__))))
sys.path.append(path)

BOT_NAME = 'ziroom'

SPIDER_MODULES = ['ziroom.spiders']
NEWSPIDER_MODULE = 'ziroom.spiders'

DOWNLOADER_MIDDLEWARES = {
    #'misc.middleware.CustomHttpProxyMiddleware': 400,
    'misc.middleware.CustomUserAgentMiddleware': 401,
}

ITEM_PIPELINES = {
    'ziroom.pipelines.JsonWithEncodingPipeline': 300,
    #'ziroom.pipelines.RedisPipeline': 301,
}

LOG_LEVEL = 'INFO'
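
Note that misc.middleware is not part of this commit; the sys.path tweak above only makes a sibling misc package importable. As a rough idea of what a user-agent middleware of that name typically does, a sketch follows (the body is an assumption, not the actual misc code):

import random

# hypothetical reconstruction of misc/middleware.py
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12)',
]

class CustomUserAgentMiddleware(object):
    def process_request(self, request, spider):
        # pick a random User-Agent for each outgoing request
        request.headers['User-Agent'] = random.choice(USER_AGENTS)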
4 changes: 4 additions & 0 deletions ziroom/ziroom/spiders/__init__.py
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
52 changes: 52 additions & 0 deletions ziroom/ziroom/spiders/spider.py
@@ -0,0 +1,52 @@
# -*- coding: utf-8 -*-
from scrapy.spiders import Spider
from scrapy import Request
import re
import time

from ziroom.items import ZiroomItem

class Parse(object):

    def __init__(self, response):
        self.response = response
        self.room_detail = response.xpath('//div[@class="room_detail_right"]')[0]
        self.room_info = ' '.join(self.room_detail.xpath('.//ul[@class="detail_room"]/li/text()').extract())
        self.metro_info = ''.join(self.room_detail.xpath('.//span[@id="lineList"]/text()').extract()).replace(' ', '').replace('\n', '')

    def getID(self):
        # the room id is the first number in the detail-page URL
        return int(re.findall(r'\d+', self.response.url)[0])

    def getName(self):
        return self.room_detail.xpath('.//h2/text()').extract()[0].replace(' ', '').replace('\n', '')

    def getPrice(self):
        # strip the leading currency symbol before converting
        room_price = int(self.room_detail.xpath('.//span[@class="room_price"]/text()').extract()[0][1:])
        if room_price < 500:
            # small figures are daily rates; convert to an approximate monthly price
            room_price *= 30
        return room_price


class PagesSpider(Spider):
    name = "ziroom"
    start_urls = ['http://www.ziroom.com/z/nl/z3.html?p=1']

    def parse(self, response):
        self.logger.info(response.url)
        page = re.findall(r'p=(\d+)', response.url)[0]

        houseList = response.xpath('//ul[@id="houseList"]/li')
        for each in houseList:
            # hrefs are protocol-relative ("//www..."); drop the leading slashes
            url = each.xpath('div/h3/a/@href').extract()[0][2:].encode('utf-8')
            yield Request('http://' + url, self.parseItem)

        # pagination is prepared but disabled; replacing 'p=' + page avoids
        # clobbering other digits in the URL. Uncomment the yield to crawl all pages.
        url = response.url
        url_new = url.replace('p=' + page, 'p=' + str(int(page) + 1))
        # yield Request(url_new, self.parse)

    def parseItem(self, response):
        p = Parse(response)
        item = ZiroomItem()
        item['modifyDate'] = int(time.time())
        item['room_id'] = p.getID()
        item['room_price'] = p.getPrice()
        item['room_name'] = p.getName()
        yield item
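
The disabled pagination step simply rewrites the p= query parameter. A worked example of the replacement on a hypothetical listing URL:

import re

url = 'http://www.ziroom.com/z/nl/z3.html?p=1'
page = re.findall(r'p=(\d+)', url)[0]                          # -> '1'
url_new = url.replace('p=' + page, 'p=' + str(int(page) + 1))
print url_new  # http://www.ziroom.com/z/nl/z3.html?p=2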
