From e4fb2e304b1efdba72123624793234a884bf0f9b Mon Sep 17 00:00:00 2001
From: dolguldor
Date: Tue, 23 Jan 2018 17:28:05 +0530
Subject: [PATCH 1/3] added generic crawler to cache scrapy response

---
 .../scrapy_ft_jobs_sites/crawler/__init__.py  |  0
 .../scrapy_ft_jobs_sites/crawler/services.py  | 39 +++++++++++++++++++
 .../scrapy_ft_jobs_sites/settings.py          | 15 ++++---
 .../scrapy_ft_jobs_sites/spiders/indeedin.py  | 11 ++++--
 4 files changed, 56 insertions(+), 9 deletions(-)
 create mode 100644 scrapy_ft_jobs_sites/scrapy_ft_jobs_sites/crawler/__init__.py
 create mode 100644 scrapy_ft_jobs_sites/scrapy_ft_jobs_sites/crawler/services.py

diff --git a/scrapy_ft_jobs_sites/scrapy_ft_jobs_sites/crawler/__init__.py b/scrapy_ft_jobs_sites/scrapy_ft_jobs_sites/crawler/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/scrapy_ft_jobs_sites/scrapy_ft_jobs_sites/crawler/services.py b/scrapy_ft_jobs_sites/scrapy_ft_jobs_sites/crawler/services.py
new file mode 100644
index 0000000..6e606e3
--- /dev/null
+++ b/scrapy_ft_jobs_sites/scrapy_ft_jobs_sites/crawler/services.py
@@ -0,0 +1,39 @@
+import os
+from hashlib import md5
+from datetime import datetime
+
+from scrapy_ft_jobs_sites.settings import CRAWLER_DIR
+
+MAX_HASH_CHARS = 8
+
+
+def check_or_create_directory(spider_name):
+    """
+    this is used to create directory for storing the html content
+    for a given spider on its domain name
+    """
+
+    crawl_id = md5(str(datetime.now().strftime("%Y%m%d"))
+                   ).hexdigest()[:MAX_HASH_CHARS]
+    base_directory = os.path.join(CRAWLER_DIR[0], spider_name)
+
+    all_directories = [base_directory, os.path.join(base_directory, crawl_id)]
+
+    # this is used to check if the directory exists or not then its creates one
+    for current_dir in all_directories:
+        if not os.path.exists(current_dir):
+            os.mkdir(current_dir)
+
+    return crawl_id
+
+
+def cache_response(spider_name, html_response):
+    """
+    this method is used to save HTML response to disk
+    """
+    crawl_id = check_or_create_directory(spider_name)
+    page_id = md5(str(datetime.now())).hexdigest()[:MAX_HASH_CHARS]
+    file_path = os.path.join(CRAWLER_DIR[0], spider_name, crawl_id, page_id)
+    with open(file_path, "w") as file_to_save:
+        file_to_save.write(html_response.encode("utf-8"))
+        file_to_save.close()
diff --git a/scrapy_ft_jobs_sites/scrapy_ft_jobs_sites/settings.py b/scrapy_ft_jobs_sites/scrapy_ft_jobs_sites/settings.py
index c37c62f..4f02504 100644
--- a/scrapy_ft_jobs_sites/scrapy_ft_jobs_sites/settings.py
+++ b/scrapy_ft_jobs_sites/scrapy_ft_jobs_sites/settings.py
@@ -8,13 +8,18 @@
 # http://doc.scrapy.org/en/latest/topics/settings.html
 # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
 # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
+import os
 
 BOT_NAME = 'scrapy_ft_jobs_sites'
 
 SPIDER_MODULES = ['scrapy_ft_jobs_sites.spiders']
 NEWSPIDER_MODULE = 'scrapy_ft_jobs_sites.spiders'
 
+BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+CRAWLER_DIR = (
+    os.path.join(BASE_DIR, "scrapy_ft_jobs_sites/data"),
+)
 
 # Crawl responsibly by identifying yourself (and your website) on the user-agent
 #USER_AGENT = 'scrapy_ft_jobs_sites (+http://www.yourdomain.com)'
 
@@ -39,32 +44,32 @@
 #TELNETCONSOLE_ENABLED = False
 
 # Override the default request headers:
-#DEFAULT_REQUEST_HEADERS = {
+# DEFAULT_REQUEST_HEADERS = {
 #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
 #   'Accept-Language': 'en',
 #}
 
 # Enable or disable spider middlewares
 # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
-#SPIDER_MIDDLEWARES = {
+# SPIDER_MIDDLEWARES = {
 #    'scrapy_ft_jobs_sites.middlewares.ScrapyFtJobsSitesSpiderMiddleware': 543,
 #}
 
 # Enable or disable downloader middlewares
 # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
-#DOWNLOADER_MIDDLEWARES = {
+# DOWNLOADER_MIDDLEWARES = {
 #    'scrapy_ft_jobs_sites.middlewares.MyCustomDownloaderMiddleware': 543,
 #}
 
 # Enable or disable extensions
 # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
-#EXTENSIONS = {
+# EXTENSIONS = {
 #    'scrapy.extensions.telnet.TelnetConsole': None,
 #}
 
 # Configure item pipelines
 # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
+# ITEM_PIPELINES = {
 #    'scrapy_ft_jobs_sites.pipelines.ScrapyFtJobsSitesPipeline': 300,
 #}
 
diff --git a/scrapy_ft_jobs_sites/scrapy_ft_jobs_sites/spiders/indeedin.py b/scrapy_ft_jobs_sites/scrapy_ft_jobs_sites/spiders/indeedin.py
index 1a03705..ad12e50 100644
--- a/scrapy_ft_jobs_sites/scrapy_ft_jobs_sites/spiders/indeedin.py
+++ b/scrapy_ft_jobs_sites/scrapy_ft_jobs_sites/spiders/indeedin.py
@@ -11,6 +11,8 @@
 from .base import BaseFTSpider
 from .parsers import IndeedParser
 
+from scrapy_ft_jobs_sites.crawler.services import cache_response
+
 
 class IndeedInSpider(BaseFTSpider):
     name = 'indeedin'
@@ -25,10 +27,8 @@ class IndeedInSpider(BaseFTSpider):
             follow=False),
     )
 
-
     def __init__(self, *args, **kwargs):
         super(IndeedInSpider, self).__init__(*args, **kwargs)
-
         # Logic for start_urls creations
         self.start_urls = []
 
@@ -37,10 +37,13 @@ def __init__(self, *args, **kwargs):
             self.start_urls.append(URL)
 
             for pagination in range(10, 60, 10):
-                URL = self.base_url_pattern + item.replace(" ", "+") + "&start=" + str(pagination)
+                URL = self.base_url_pattern + \
+                    item.replace(" ", "+") + "&start=" + str(pagination)
                 self.start_urls.append(URL)
 
     def parse_item(self, response):
         soup = BeautifulSoup(response.body, 'html.parser')
         parser = IndeedParser(soup, "".join(self.allowed_domains))
-        return parser.parse_response()
\ No newline at end of file
+        # this function will save the html content to disk
+        cache_response(self.name, soup.text)
+        return parser.parse_response()
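A note for anyone running this series under Python 3: the md5(str(...)) calls in crawler/services.py and the file_to_save.write(html_response.encode("utf-8")) line work only on Python 2, where str is a byte string; on Python 3, hashlib.md5() accepts bytes only, and writing encoded bytes to a file opened in text mode raises TypeError. Below is a rough Python 3 sketch of the same caching idea; CACHE_ROOT and the single flattened helper are illustrative stand-ins, not part of the patch.

# Python 3 sketch of the caching approach introduced in crawler/services.py.
# CACHE_ROOT is a hypothetical stand-in for CRAWLER_DIR[0] from settings.py.
import os
from datetime import datetime
from hashlib import md5

CACHE_ROOT = "/tmp/scrapy_ft_jobs_cache"
MAX_HASH_CHARS = 8


def cache_response(spider_name, html_text):
    """Save one page of HTML under <CACHE_ROOT>/<spider_name>/<crawl_id>/<page_id>."""
    now = datetime.now()
    # hashlib.md5 requires bytes on Python 3, hence the explicit .encode() calls
    crawl_id = md5(now.strftime("%Y%m%d").encode("utf-8")).hexdigest()[:MAX_HASH_CHARS]
    page_id = md5(now.isoformat().encode("utf-8")).hexdigest()[:MAX_HASH_CHARS]

    target_dir = os.path.join(CACHE_ROOT, spider_name, crawl_id)
    os.makedirs(target_dir, exist_ok=True)  # replaces the manual exists/mkdir loop

    # text mode plus encoding= handles the conversion; no .encode() on write
    with open(os.path.join(target_dir, page_id), "w", encoding="utf-8") as cached_file:
        cached_file.write(html_text)

Two smaller observations on the patch itself: page_id is derived from the current timestamp, so two pages cached in the same microsecond would overwrite each other (hashing the response URL instead would tie each cache file to the page it came from), and the spider passes soup.text, which is the tag-stripped text, so what lands on disk is not the raw HTML markup.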
From 4da0ae9f011a3311b312d374d77d709e058902f6 Mon Sep 17 00:00:00 2001
From: dolguldor
Date: Tue, 23 Jan 2018 17:30:07 +0530
Subject: [PATCH 2/3] updated readme.md

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index e5e64fd..b0819d4 100644
--- a/README.md
+++ b/README.md
@@ -7,7 +7,7 @@ This is a scrapers for Job Aggregator written with Scrapy
 - Shalini Chaturvedi
 - Yamini Parab
 - Tejashree Mane
-- Jitendra Varma
+- (Jitendra Varma)[https://github.com/jitendravarma]
 - Ravi Pal
 - Oankar Marathe
 - Nilam Pal
@@ -52,4 +52,4 @@
 - First commit
   - Sep 3 2017
   - Refactored Spiders and Parsers
-  - Added `test_parsers.py` which is test suite used for checking parser
\ No newline at end of file
+  - Added `test_parsers.py` which is test suite used for checking parser
From f72b5591b11b6a3a5bcea57a26a5a483b54990b5 Mon Sep 17 00:00:00 2001
From: dolguldor
Date: Tue, 23 Jan 2018 17:31:34 +0530
Subject: [PATCH 3/3] fixed syntax in readme.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index b0819d4..8e50bd8 100644
--- a/README.md
+++ b/README.md
@@ -7,7 +7,7 @@ This is a scrapers for Job Aggregator written with Scrapy
 - Shalini Chaturvedi
 - Yamini Parab
 - Tejashree Mane
-- (Jitendra Varma)[https://github.com/jitendravarma]
+- [Jitendra Varma](https://github.com/jitendravarma)
 - Ravi Pal
 - Oankar Marathe
 - Nilam Pal