Skip to content

Commit

Permalink
add twitch as doing
Browse files Browse the repository at this point in the history
  • Loading branch information
geekan committed Jun 12, 2016
1 parent 63a762b commit f7ca890
Show file tree
Hide file tree
Showing 8 changed files with 170 additions and 0 deletions.
4 changes: 4 additions & 0 deletions underdev/twitch/README
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
This site loads its listing data via AJAX, so scraping the raw HTML requires a rendering engine (or use the JSON API below directly).

Data format may be JSON:
- https://api.twitch.tv/kraken/videos/top?limit=20&offset=0&period=week&broadcast_type=all&on_site=1
11 changes: 11 additions & 0 deletions underdev/twitch/scrapy.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# http://doc.scrapy.org/en/latest/topics/scrapyd.html

[settings]
default = twitch.settings

[deploy]
#url = http://localhost:6800/
project = twitch
Empty file.
11 changes: 11 additions & 0 deletions underdev/twitch/twitch/items.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

from scrapy.item import Item, Field

class twitchItem(Item):
    """Container for one scraped Twitch record.

    Only a ``name`` field is declared so far; add further ``Field()``
    declarations here as the spider starts extracting more data.
    """
    name = Field()

50 changes: 50 additions & 0 deletions underdev/twitch/twitch/pipelines.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import redis


from scrapy import signals


import json
import codecs
from collections import OrderedDict


class JsonWithEncodingPipeline(object):
    """Append each scraped item to ``data_utf8.json`` as one JSON line.

    The file is UTF-8 encoded and non-ASCII characters are written
    verbatim (``ensure_ascii=False``); the item's own field order is
    preserved by serializing through an ``OrderedDict``.
    """

    def __init__(self):
        # Opened eagerly so a permission/disk problem surfaces at startup.
        self.file = codecs.open('data_utf8.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Serialize *item* as a single JSON line, then pass it through."""
        record = OrderedDict(item)
        serialized = json.dumps(record, ensure_ascii=False, sort_keys=False)
        self.file.write(serialized + "\n")
        return item

    def close_spider(self, spider):
        """Release the output file when the crawl finishes."""
        self.file.close()


class RedisPipeline(object):
    """Persist scraped items in Redis, keyed by the item's ``id`` field.

    When a record with the same id already exists, its stored fields are
    merged over the incoming item (existing data wins on key conflicts),
    preserving the original accumulate-then-overwrite behaviour.

    Records are stored as JSON. The original implementation wrote
    ``str(dict)`` and read it back with ``eval()``, which executes
    arbitrary expressions from the datastore — replaced with
    ``json.dumps``/``json.loads``. NOTE(review): previously stored
    repr-format values will not parse; flush the key space on upgrade.
    """

    def __init__(self):
        self.r = redis.StrictRedis(host='localhost', port=6379)

    def process_item(self, item, spider):
        """Merge *item* into Redis under ``item['id']`` and pass it on."""
        if not item['id']:
            # Warn (as before) but still store under the falsy key,
            # matching the original behaviour.
            print('no id item!!')

        stored = self.r.get(item['id'])
        if stored is None:
            final_item = dict(item)
        else:
            previous = json.loads(stored)
            final_item = dict(item)
            final_item.update(previous)  # existing record's fields take precedence
        self.r.set(item['id'], json.dumps(final_item))
        # Pipelines must return the item so later pipeline stages receive it;
        # the original fell off the end and silently dropped every item.
        return item

    def close_spider(self, spider):
        return
36 changes: 36 additions & 0 deletions underdev/twitch/twitch/settings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Scrapy settings for twitch project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
#

import sys
import os
from os.path import dirname
# Put the repository root on sys.path so the shared top-level ``misc``
# package (middleware and logging helpers used below) can be imported
# from inside this sub-project.
path = dirname(dirname(os.path.abspath(os.path.dirname(__file__))))
sys.path.append(path)
from misc.log import *  # NOTE(review): star import of project log helpers

BOT_NAME = 'twitch'

SPIDER_MODULES = ['twitch.spiders']
NEWSPIDER_MODULE = 'twitch.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'twitch (+http://www.yourdomain.com)'

# Shared downloader middlewares; the proxy middleware exists but is disabled.
DOWNLOADER_MIDDLEWARES = {
    # 'misc.middleware.CustomHttpProxyMiddleware': 400,
    'misc.middleware.CustomUserAgentMiddleware': 401,
}

# Items go to a local JSON-lines file; the Redis pipeline is defined in
# pipelines.py but not enabled here.
ITEM_PIPELINES = {
    'twitch.pipelines.JsonWithEncodingPipeline': 300,
    #'twitch.pipelines.RedisPipeline': 301,
}

LOG_LEVEL = 'INFO'

# One second between downloads, to be polite to twitch.tv.
DOWNLOAD_DELAY = 1
4 changes: 4 additions & 0 deletions underdev/twitch/twitch/spiders/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
54 changes: 54 additions & 0 deletions underdev/twitch/twitch/spiders/spider.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
import re
import json
from urlparse import urlparse
import urllib
import pdb


from scrapy.selector import Selector
try:
from scrapy.spiders import Spider
except:
from scrapy.spiders import BaseSpider as Spider
from scrapy.utils.response import get_base_url
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor as sle


from twitch.items import *
from misc.log import *
from misc.spider import CommonSpider


class twitchSpider(CommonSpider):
    """Crawl the Twitch Hearthstone directory and extract stream metadata.

    NOTE(review): per the project README this directory page is rendered
    via AJAX, so the CSS rules below may match nothing on the raw HTML
    response — the JSON API listed in the README is the likely fix.
    """
    name = "twitch"
    allowed_domains = ["twitch.tv"]
    start_urls = [
        "https://www.twitch.tv/directory/game/Hearthstone%3A%20Heroes%20of%20Warcraft"
    ]
    rules = [
        Rule(sle(allow=("https://www.twitch.tv/directory/game/Hearthstone%3A%20Heroes%20of%20Warcraft")), callback='parse_1', follow=True),
    ]

    # Per-stream fields pulled from each directory entry.
    list_css_rules = {
        '.content': {
            'room_name': '.meta .title a::text',
            'author': '.meta .info a::text',
            'people_count': '.meta .info a::attr(data-ember-action)'
        }
    }

    # NOTE(review): these selectors target QQ article markup (template
    # leftovers) and are unused by parse_1 — candidates for removal.
    content_css_rules = {
        'text': '#Cnt-Main-Article-QQ p *::text',
        'images': '#Cnt-Main-Article-QQ img::attr(src)',
        'images-desc': '#Cnt-Main-Article-QQ div p+ p::text',
    }

    def parse_1(self, response):
        """Parse one directory listing page and yield the extracted items.

        Fixes: removed a leftover ``pdb.set_trace()`` breakpoint that froze
        the crawl on every page, and now returns the parsed items so the
        pipelines configured in settings.py actually receive them (the
        original returned ``None``, dropping everything).
        """
        info('Parse ' + response.url)
        items = self.parse_with_rules(response, self.list_css_rules, dict)
        # Debug aid kept from the original; remove once output is stable.
        print(json.dumps(items, ensure_ascii=False, indent=2))
        return items

0 comments on commit f7ca890

Please sign in to comment.