From 340181aa566df239f7ab223801f16f2eb87e0747 Mon Sep 17 00:00:00 2001
From: geekan
Date: Mon, 13 Jun 2016 15:59:23 +0000
Subject: [PATCH] add github_trending repo.

---
 github_trending/github_trending/__init__.py     |  0
 github_trending/github_trending/items.py        | 11 ++++
 github_trending/github_trending/pipelines.py    | 50 ++++++++++++++++++
 github_trending/github_trending/settings.py     | 36 +++++++++++++
 .../github_trending/spiders/__init__.py         |  4 ++
 .../github_trending/spiders/spider.py           | 52 +++++++++++++++++++
 github_trending/scrapy.cfg                      | 11 ++++
 7 files changed, 164 insertions(+)
 create mode 100644 github_trending/github_trending/__init__.py
 create mode 100644 github_trending/github_trending/items.py
 create mode 100644 github_trending/github_trending/pipelines.py
 create mode 100644 github_trending/github_trending/settings.py
 create mode 100644 github_trending/github_trending/spiders/__init__.py
 create mode 100644 github_trending/github_trending/spiders/spider.py
 create mode 100644 github_trending/scrapy.cfg

diff --git a/github_trending/github_trending/__init__.py b/github_trending/github_trending/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/github_trending/github_trending/items.py b/github_trending/github_trending/items.py
new file mode 100644
index 0000000..b06a114
--- /dev/null
+++ b/github_trending/github_trending/items.py
@@ -0,0 +1,11 @@
+# Define here the models for your scraped items
+#
+# See documentation in:
+# http://doc.scrapy.org/en/latest/topics/items.html
+
+from scrapy.item import Item, Field
+
+class github_trendingItem(Item):
+    # define the fields for your item here like:
+    name = Field()
+
diff --git a/github_trending/github_trending/pipelines.py b/github_trending/github_trending/pipelines.py
new file mode 100644
index 0000000..eca743c
--- /dev/null
+++ b/github_trending/github_trending/pipelines.py
@@ -0,0 +1,50 @@
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+
+import redis
+
+
+from scrapy import signals
+
+
+import json
+import codecs
+from collections import OrderedDict
+
+
+class JsonWithEncodingPipeline(object):
+
+    def __init__(self):
+        self.file = codecs.open('data_utf8.json', 'w', encoding='utf-8')
+
+    def process_item(self, item, spider):
+        line = json.dumps(OrderedDict(item), ensure_ascii=False, sort_keys=False) + "\n"
+        self.file.write(line)
+        return item
+
+    def close_spider(self, spider):
+        self.file.close()
+
+
+class RedisPipeline(object):
+
+    def __init__(self):
+        self.r = redis.StrictRedis(host='localhost', port=6379)
+
+    def process_item(self, item, spider):
+        if not item['id']:
+            print 'no id item!!'
+
+        str_recorded_item = self.r.get(item['id'])
+        final_item = None
+        if str_recorded_item is None:
+            final_item = item
+        else:
+            ritem = eval(self.r.get(item['id']))
+            final_item = dict(item.items() + ritem.items())
+        self.r.set(item['id'], final_item)
+
+    def close_spider(self, spider):
+        return
diff --git a/github_trending/github_trending/settings.py b/github_trending/github_trending/settings.py
new file mode 100644
index 0000000..e14f5f0
--- /dev/null
+++ b/github_trending/github_trending/settings.py
@@ -0,0 +1,36 @@
+# Scrapy settings for github_trending project
+#
+# For simplicity, this file contains only the most important settings by
+# default. All the other settings are documented here:
+#
+#     http://doc.scrapy.org/en/latest/topics/settings.html
+#
+
+import sys
+import os
+from os.path import dirname
+path = dirname(dirname(os.path.abspath(os.path.dirname(__file__))))
+sys.path.append(path)
+from misc.log import *
+
+BOT_NAME = 'github_trending'
+
+SPIDER_MODULES = ['github_trending.spiders']
+NEWSPIDER_MODULE = 'github_trending.spiders'
+
+# Crawl responsibly by identifying yourself (and your website) on the user-agent
+#USER_AGENT = 'github_trending (+http://www.yourdomain.com)'
+
+DOWNLOADER_MIDDLEWARES = {
+    # 'misc.middleware.CustomHttpProxyMiddleware': 400,
+    'misc.middleware.CustomUserAgentMiddleware': 401,
+}
+
+ITEM_PIPELINES = {
+    'github_trending.pipelines.JsonWithEncodingPipeline': 300,
+    #'github_trending.pipelines.RedisPipeline': 301,
+}
+
+LOG_LEVEL = 'INFO'
+
+DOWNLOAD_DELAY = 1
diff --git a/github_trending/github_trending/spiders/__init__.py b/github_trending/github_trending/spiders/__init__.py
new file mode 100644
index 0000000..ebd689a
--- /dev/null
+++ b/github_trending/github_trending/spiders/__init__.py
@@ -0,0 +1,4 @@
+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.
diff --git a/github_trending/github_trending/spiders/spider.py b/github_trending/github_trending/spiders/spider.py
new file mode 100644
index 0000000..74c7f0c
--- /dev/null
+++ b/github_trending/github_trending/spiders/spider.py
@@ -0,0 +1,52 @@
+import re
+import json
+from urlparse import urlparse
+import urllib
+import pdb
+
+
+from scrapy.selector import Selector
+try:
+    from scrapy.spiders import Spider
+except:
+    from scrapy.spiders import BaseSpider as Spider
+from scrapy.utils.response import get_base_url
+from scrapy.spiders import CrawlSpider, Rule
+from scrapy.linkextractors import LinkExtractor as sle
+
+
+from github_trending.items import *
+from misc.log import *
+from misc.spider import CommonSpider
+
+
+class github_trendingSpider(CommonSpider):
+    name = "github_trending"
+    allowed_domains = ["github.com"]
+    start_urls = [
+        "http://www.github.com/trending",
+    ]
+    rules = [
+        Rule(sle(allow=("/trending$")), callback='parse_1', follow=True),
+    ]
+
+    list_css_rules = {
+        '.repo-list-item': {
+            'repo_name': '.repo-list-name a::attr(href)',
+            'repo_meta': '.repo-list-meta::text',
+        }
+    }
+
+    content_css_rules = {
+        'text': '#Cnt-Main-Article-QQ p *::text',
+        'images': '#Cnt-Main-Article-QQ img::attr(src)',
+        'images-desc': '#Cnt-Main-Article-QQ div p+ p::text',
+    }
+
+    def parse_1(self, response):
+        info('Parse '+response.url)
+        x = self.parse_with_rules(response, self.list_css_rules, dict)
+        # x = self.parse_with_rules(response, self.content_css_rules, dict)
+        print(json.dumps(x, ensure_ascii=False, indent=2))
+        # pp.pprint(x)
+        # return self.parse_with_rules(response, self.css_rules, github_trendingItem)
diff --git a/github_trending/scrapy.cfg b/github_trending/scrapy.cfg
new file mode 100644
index 0000000..bd2d364
--- /dev/null
+++ b/github_trending/scrapy.cfg
@@ -0,0 +1,11 @@
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# http://doc.scrapy.org/en/latest/topics/scrapyd.html
+
+[settings]
+default = github_trending.settings
+
+[deploy]
+#url = http://localhost:6800/
+project = github_trending