Skip to content

Commit

Permalink
add twitch as doing
Browse files Browse the repository at this point in the history
  • Loading branch information
geekan committed Jun 12, 2016
1 parent 63a762b commit f7ca890
Show file tree
Hide file tree
Showing 8 changed files with 170 additions and 0 deletions.
4 changes: 4 additions & 0 deletions underdev/twitch/README
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
This site loads its listing data via AJAX, so scraping the raw HTML requires a rendering engine (or use the JSON API below directly).

Data format may be JSON:
- https://api.twitch.tv/kraken/videos/top?limit=20&offset=0&period=week&broadcast_type=all&on_site=1
11 changes: 11 additions & 0 deletions underdev/twitch/scrapy.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# http://doc.scrapy.org/en/latest/topics/scrapyd.html

[settings]
default = twitch.settings

[deploy]
#url = http://localhost:6800/
project = twitch
Empty file.
11 changes: 11 additions & 0 deletions underdev/twitch/twitch/items.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

from scrapy.item import Item, Field

class twitchItem(Item):
    """Container for one scraped Twitch record.

    Only a ``name`` field is declared so far; add further ``Field()``
    declarations here as the spider starts extracting more data.
    """
    name = Field()

50 changes: 50 additions & 0 deletions underdev/twitch/twitch/pipelines.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import redis


from scrapy import signals


import json
import codecs
from collections import OrderedDict


class JsonWithEncodingPipeline(object):
    """Append each scraped item to ``data_utf8.json`` as one JSON line.

    The file is UTF-8 encoded and non-ASCII characters are written
    verbatim (``ensure_ascii=False``); the item's own field order is
    preserved by serializing through an ``OrderedDict``.
    """

    def __init__(self):
        # Opened eagerly so a permission/disk problem surfaces at startup.
        self.file = codecs.open('data_utf8.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Serialize *item* as a single JSON line, then pass it through."""
        record = OrderedDict(item)
        serialized = json.dumps(record, ensure_ascii=False, sort_keys=False)
        self.file.write(serialized + "\n")
        return item

    def close_spider(self, spider):
        """Release the output file when the crawl finishes."""
        self.file.close()


class RedisPipeline(object):
    """Persist scraped items in Redis, keyed by the item's ``id`` field.

    When a record with the same id already exists, its stored fields are
    merged over the incoming item (existing data wins on key conflicts),
    preserving the original accumulate-then-overwrite behaviour.

    Records are stored as JSON. The original implementation wrote
    ``str(dict)`` and read it back with ``eval()``, which executes
    arbitrary expressions from the datastore — replaced with
    ``json.dumps``/``json.loads``. NOTE(review): previously stored
    repr-format values will not parse; flush the key space on upgrade.
    """

    def __init__(self):
        self.r = redis.StrictRedis(host='localhost', port=6379)

    def process_item(self, item, spider):
        """Merge *item* into Redis under ``item['id']`` and pass it on."""
        if not item['id']:
            # Warn (as before) but still store under the falsy key,
            # matching the original behaviour.
            print('no id item!!')

        stored = self.r.get(item['id'])
        if stored is None:
            final_item = dict(item)
        else:
            previous = json.loads(stored)
            final_item = dict(item)
            final_item.update(previous)  # existing record's fields take precedence
        self.r.set(item['id'], json.dumps(final_item))
        # Pipelines must return the item so later pipeline stages receive it;
        # the original fell off the end and silently dropped every item.
        return item

    def close_spider(self, spider):
        return
36 changes: 36 additions & 0 deletions underdev/twitch/twitch/settings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Scrapy settings for twitch project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
#

import sys
import os
from os.path import dirname
# Put the repository root on sys.path so the shared top-level ``misc``
# package (middleware and logging helpers used below) can be imported
# from inside this sub-project.
path = dirname(dirname(os.path.abspath(os.path.dirname(__file__))))
sys.path.append(path)
from misc.log import *  # NOTE(review): star import of project log helpers

BOT_NAME = 'twitch'

SPIDER_MODULES = ['twitch.spiders']
NEWSPIDER_MODULE = 'twitch.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'twitch (+http://www.yourdomain.com)'

# Shared downloader middlewares; the proxy middleware exists but is disabled.
DOWNLOADER_MIDDLEWARES = {
    # 'misc.middleware.CustomHttpProxyMiddleware': 400,
    'misc.middleware.CustomUserAgentMiddleware': 401,
}

# Items go to a local JSON-lines file; the Redis pipeline is defined in
# pipelines.py but not enabled here.
ITEM_PIPELINES = {
    'twitch.pipelines.JsonWithEncodingPipeline': 300,
    #'twitch.pipelines.RedisPipeline': 301,
}

LOG_LEVEL = 'INFO'

# One second between downloads, to be polite to twitch.tv.
DOWNLOAD_DELAY = 1
4 changes: 4 additions & 0 deletions underdev/twitch/twitch/spiders/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
54 changes: 54 additions & 0 deletions underdev/twitch/twitch/spiders/spider.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
import re
import json
from urlparse import urlparse
import urllib
import pdb


from scrapy.selector import Selector
try:
from scrapy.spiders import Spider
except:
from scrapy.spiders import BaseSpider as Spider
from scrapy.utils.response import get_base_url
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor as sle


from twitch.items import *
from misc.log import *
from misc.spider import CommonSpider


class twitchSpider(CommonSpider):
    """Crawl the Twitch Hearthstone directory and extract stream metadata.

    NOTE(review): per the project README this directory page is rendered
    via AJAX, so the CSS rules below may match nothing on the raw HTML
    response — the JSON API listed in the README is the likely fix.
    """
    name = "twitch"
    allowed_domains = ["twitch.tv"]
    start_urls = [
        "https://www.twitch.tv/directory/game/Hearthstone%3A%20Heroes%20of%20Warcraft"
    ]
    rules = [
        Rule(sle(allow=("https://www.twitch.tv/directory/game/Hearthstone%3A%20Heroes%20of%20Warcraft")), callback='parse_1', follow=True),
    ]

    # Per-stream fields pulled from each directory entry.
    list_css_rules = {
        '.content': {
            'room_name': '.meta .title a::text',
            'author': '.meta .info a::text',
            'people_count': '.meta .info a::attr(data-ember-action)'
        }
    }

    # NOTE(review): these selectors target QQ article markup (template
    # leftovers) and are unused by parse_1 — candidates for removal.
    content_css_rules = {
        'text': '#Cnt-Main-Article-QQ p *::text',
        'images': '#Cnt-Main-Article-QQ img::attr(src)',
        'images-desc': '#Cnt-Main-Article-QQ div p+ p::text',
    }

    def parse_1(self, response):
        """Parse one directory listing page and yield the extracted items.

        Fixes: removed a leftover ``pdb.set_trace()`` breakpoint that froze
        the crawl on every page, and now returns the parsed items so the
        pipelines configured in settings.py actually receive them (the
        original returned ``None``, dropping everything).
        """
        info('Parse ' + response.url)
        items = self.parse_with_rules(response, self.list_css_rules, dict)
        # Debug aid kept from the original; remove once output is stable.
        print(json.dumps(items, ensure_ascii=False, indent=2))
        return items

0 comments on commit f7ca890

Please sign in to comment.