add youtube_trending

geekan · Jun 13, 2016 · f5e3b41 · f5e3b41
1 parent 2695027
commit f5e3b41
Show file tree

Hide file tree

Showing 7 changed files with 164 additions and 0 deletions.
diff --git a/youtube_trending/scrapy.cfg b/youtube_trending/scrapy.cfg
@@ -0,0 +1,11 @@
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# http://doc.scrapy.org/en/latest/topics/scrapyd.html
+
+[settings]
+default = youtube_trending.settings
+
+[deploy]
+#url = http://localhost:6800/
+project = youtube_trending
diff --git a/youtube_trending/youtube_trending/__init__.py b/youtube_trending/youtube_trending/__init__.py
diff --git a/youtube_trending/youtube_trending/items.py b/youtube_trending/youtube_trending/items.py
@@ -0,0 +1,11 @@
+# Define here the models for your scraped items
+#
+# See documentation in:
+# http://doc.scrapy.org/en/latest/topics/items.html
+
+from scrapy.item import Item, Field
+
+class youtube_trendingItem(Item):
+    # Scrapy Item for this project. Only the single `name` field from the
+    # project template is declared.
+    # NOTE(review): the spider's list_css_rules use the keys 'video_title' and
+    # 'author', and RedisPipeline reads 'id'; none of them is declared here.
+    # Confirm which fields this item is meant to carry.
+    # define the fields for your item here like:
+    name = Field()
+
diff --git a/youtube_trending/youtube_trending/pipelines.py b/youtube_trending/youtube_trending/pipelines.py
@@ -0,0 +1,50 @@
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+
+import redis
+
+
+from scrapy import signals
+
+
+import json
+import codecs
+from collections import OrderedDict
+
+
+class JsonWithEncodingPipeline(object):
+    """Write every item as one JSON document per line to ``data_utf8.json``.
+
+    Non-ASCII characters are written as-is (UTF-8 encoded) instead of being
+    escaped.
+    """
+
+    def __init__(self):
+        # Opened in write mode at construction time, so an existing file is
+        # truncated as soon as the pipeline is instantiated.
+        self.file = codecs.open('data_utf8.json', 'w', encoding='utf-8')
+
+    def process_item(self, item, spider):
+        """Append ``item`` to the output file and hand it on unchanged."""
+        record = OrderedDict(item)
+        serialized = json.dumps(record, sort_keys=False, ensure_ascii=False)
+        self.file.write(serialized + "\n")
+        return item
+
+    def close_spider(self, spider):
+        """Close the output file."""
+        self.file.close()
+
+
+class RedisPipeline(object):
+    """Merge each scraped item into the record kept in Redis under its id.
+
+    A record is stored as the ``repr`` of a plain dict, keyed by the item's
+    ``id`` value, and is read back with ``ast.literal_eval``.
+    """
+
+    def __init__(self):
+        self.r = redis.StrictRedis(host='localhost', port=6379)
+
+    def process_item(self, item, spider):
+        """Store ``item`` in Redis, merged with any record already saved.
+
+        When a record already exists for the id, its fields take precedence
+        over the freshly scraped ones (the precedence the merge always had).
+        Items without an id are reported and passed on without touching Redis.
+
+        Returns:
+            The item itself, so that later pipelines keep receiving it.
+
+        Raises:
+            ValueError, SyntaxError: if the stored record is not a plain
+                Python literal.
+        """
+        # Function-scope import keeps this block self-contained.
+        import ast
+
+        item_id = item.get('id')
+        if not item_id:
+            print('no id item!!')
+            return item
+
+        fields = dict(item)
+        stored = self.r.get(item_id)
+        if stored is not None:
+            if isinstance(stored, bytes):
+                stored = stored.decode('utf-8')
+            # literal_eval only accepts Python literals, so a tampered record
+            # cannot execute code the way eval() would.
+            fields.update(ast.literal_eval(stored))
+        # Serialise explicitly instead of relying on the client to stringify
+        # an Item/dict value.
+        self.r.set(item_id, repr(fields))
+        return item
+
+    def close_spider(self, spider):
+        """Nothing is released here; the Redis client is left as it is."""
+        return
diff --git a/youtube_trending/youtube_trending/settings.py b/youtube_trending/youtube_trending/settings.py
@@ -0,0 +1,36 @@
+# Scrapy settings for youtube_trending project
+#
+# For simplicity, this file contains only the most important settings by
+# default. All the other settings are documented here:
+#
+#     http://doc.scrapy.org/en/latest/topics/settings.html
+#
+
+import sys
+import os
+from os.path import dirname
+# `path` is the directory two levels above the one holding this settings file.
+# It is appended to sys.path before the `misc` import below, presumably because
+# the shared `misc` package lives there -- TODO confirm the repository layout.
+path = dirname(dirname(os.path.abspath(os.path.dirname(__file__))))
+sys.path.append(path)
+from misc.log import *
+
+BOT_NAME = 'youtube_trending'
+
+SPIDER_MODULES = ['youtube_trending.spiders']
+NEWSPIDER_MODULE = 'youtube_trending.spiders'
+
+# Crawl responsibly by identifying yourself (and your website) on the user-agent
+#USER_AGENT = 'youtube_trending (+http://www.yourdomain.com)'
+
+# Only the custom user-agent middleware is enabled; the proxy middleware is
+# commented out.
+DOWNLOADER_MIDDLEWARES = {
+   # 'misc.middleware.CustomHttpProxyMiddleware': 400,
+    'misc.middleware.CustomUserAgentMiddleware': 401,
+}
+
+# Only the JSON-lines pipeline is enabled; the Redis pipeline is commented out.
+ITEM_PIPELINES = {
+    'youtube_trending.pipelines.JsonWithEncodingPipeline': 300,
+    #'youtube_trending.pipelines.RedisPipeline': 301,
+}
+
+LOG_LEVEL = 'INFO'
+
+# Throttles the crawl (Scrapy documents this value as a delay in seconds).
+DOWNLOAD_DELAY = 1
diff --git a/youtube_trending/youtube_trending/spiders/__init__.py b/youtube_trending/youtube_trending/spiders/__init__.py
@@ -0,0 +1,4 @@
+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.
diff --git a/youtube_trending/youtube_trending/spiders/spider.py b/youtube_trending/youtube_trending/spiders/spider.py
@@ -0,0 +1,52 @@
+import re
+import json
+from urlparse import urlparse
+import urllib
+import pdb
+
+
+from scrapy.selector import Selector
+try:
+    from scrapy.spiders import Spider
+except ImportError:
+    # Scrapy releases before 1.0 expose the base class from the singular
+    # ``scrapy.spider`` module. Retrying ``scrapy.spiders`` here, as the
+    # previous fallback did, could never succeed.
+    from scrapy.spider import BaseSpider as Spider
+from scrapy.utils.response import get_base_url
+from scrapy.spiders import CrawlSpider, Rule
+from scrapy.linkextractors import LinkExtractor as sle
+
+
+from youtube_trending.items import *
+from misc.log import *
+from misc.spider import CommonSpider
+
+
+class youtube_trendingSpider(CommonSpider):
+    """Crawl YouTube's trending feed and print the title/author of each entry."""
+
+    name = "youtube_trending"
+    allowed_domains = ["youtube.com"]
+    start_urls = ["https://www.youtube.com/feed/trending"]
+
+    # Links whose URL ends in "feed/trending" are followed and handed to
+    # parse_1.
+    rules = [
+        Rule(
+            sle(allow="feed/trending$"),
+            callback='parse_1',
+            follow=True,
+        ),
+    ]
+
+    # Selector of one listing entry -> {output key: selector inside the entry}.
+    list_css_rules = {
+        '.yt-lockup-content': {
+            'video_title': '.yt-lockup-title a::text',
+            'author': '.yt-lockup-byline a::text',
+        },
+    }
+
+    # NOTE(review): parse_1 does not use these rules, and the selectors do not
+    # look YouTube-specific -- confirm whether they are still needed.
+    content_css_rules = {
+        'text': '#Cnt-Main-Article-QQ p *::text',
+        'images': '#Cnt-Main-Article-QQ img::attr(src)',
+        'images-desc': '#Cnt-Main-Article-QQ div p+ p::text',
+    }
+
+    def parse_1(self, response):
+        """Log the URL, apply ``list_css_rules`` and print the result as JSON.
+
+        NOTE(review): the extracted data is only printed; nothing is returned,
+        so no item reaches the configured pipelines -- confirm this is intended.
+        """
+        info('Parse ' + response.url)
+        extracted = self.parse_with_rules(response, self.list_css_rules, dict)
+        print(json.dumps(extracted, ensure_ascii=False, indent=2))