Pull request for crawler #5

Open · wants to merge 3 commits into base: master
4 changes: 2 additions & 2 deletions README.md
@@ -7,7 +7,7 @@ This is a scrapers for Job Aggregator written with Scrapy
- Shalini Chaturvedi
- Yamini Parab
- Tejashree Mane
- Jitendra Varma
- [Jitendra Varma](https://github.com/jitendravarma)
- Ravi Pal
- Oankar Marathe
- Nilam Pal
@@ -52,4 +52,4 @@ This is a scrapers for Job Aggregator written with Scrapy
- First commit
- Sep 3 2017
- Refactored Spiders and Parsers
- Added `test_parsers.py` which is test suite used for checking parser
- Added `test_parsers.py` which is test suite used for checking parser
Empty file.
39 changes: 39 additions & 0 deletions scrapy_ft_jobs_sites/scrapy_ft_jobs_sites/crawler/services.py
@@ -0,0 +1,39 @@
import os
from hashlib import md5
Collaborator: This file should possibly be called utils.py

from datetime import datetime

from scrapy_ft_jobs_sites.settings import CRAWLER_DIR

MAX_HASH_CHARS = 8
Collaborator: Move this to settings.py
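If the constant does move, the change is small; a minimal sketch, mirroring how CRAWLER_DIR is already pulled in from settings (paths as in this PR):

```python
# scrapy_ft_jobs_sites/settings.py
MAX_HASH_CHARS = 8  # number of md5 hex characters kept for crawl/page ids

# scrapy_ft_jobs_sites/crawler/services.py
from scrapy_ft_jobs_sites.settings import CRAWLER_DIR, MAX_HASH_CHARS
```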



def check_or_create_directory(spider_name):
"""
this is used to create directory for storing the html content
for a given spider on its domain name
"""

crawl_id = md5(str(datetime.now().strftime("%Y%m%d"))
).hexdigest()[:MAX_HASH_CHARS]
base_directory = os.path.join(CRAWLER_DIR[0], spider_name)

all_directories = [base_directory, os.path.join(base_directory, crawl_id)]

    # create each directory if it does not already exist
for current_dir in all_directories:
if not os.path.exists(current_dir):
os.mkdir(current_dir)

return crawl_id


def cache_response(spider_name, html_response):
"""
this method is used to save HTML response to disk
"""
crawl_id = check_or_create_directory(spider_name)
page_id = md5(str(datetime.now())).hexdigest()[:MAX_HASH_CHARS]
file_path = os.path.join(CRAWLER_DIR[0], spider_name, crawl_id, page_id)
    with open(file_path, "w") as file_to_save:
        # the context manager closes the file, so no explicit close() is needed
        file_to_save.write(html_response.encode("utf-8"))
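For reference, a rough usage sketch of these helpers (the HTML string is a placeholder; the on-disk path assumes the CRAWLER_DIR setting added in settings.py below):

```python
from scrapy_ft_jobs_sites.crawler.services import cache_response

# Writes the page to <CRAWLER_DIR>/indeedin/<crawl_id>/<page_id>, where
# crawl_id hashes today's date and page_id hashes the current timestamp.
cache_response("indeedin", u"<html><body>placeholder page</body></html>")
```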
15 changes: 10 additions & 5 deletions scrapy_ft_jobs_sites/scrapy_ft_jobs_sites/settings.py
@@ -8,13 +8,18 @@
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
import os

BOT_NAME = 'scrapy_ft_jobs_sites'

SPIDER_MODULES = ['scrapy_ft_jobs_sites.spiders']
NEWSPIDER_MODULE = 'scrapy_ft_jobs_sites.spiders'

BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

CRAWLER_DIR = (
os.path.join(BASE_DIR, "scrapy_ft_jobs_sites/data"),
)
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'scrapy_ft_jobs_sites (+http://www.yourdomain.com)'

@@ -39,32 +44,32 @@
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# SPIDER_MIDDLEWARES = {
# 'scrapy_ft_jobs_sites.middlewares.ScrapyFtJobsSitesSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# DOWNLOADER_MIDDLEWARES = {
# 'scrapy_ft_jobs_sites.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# ITEM_PIPELINES = {
# 'scrapy_ft_jobs_sites.pipelines.ScrapyFtJobsSitesPipeline': 300,
#}

11 changes: 7 additions & 4 deletions scrapy_ft_jobs_sites/scrapy_ft_jobs_sites/spiders/indeedin.py
@@ -11,6 +11,8 @@
from .base import BaseFTSpider
from .parsers import IndeedParser

from scrapy_ft_jobs_sites.crawler.services import cache_response


class IndeedInSpider(BaseFTSpider):
name = 'indeedin'
@@ -25,10 +27,8 @@ class IndeedInSpider(BaseFTSpider):
follow=False),
)


def __init__(self, *args, **kwargs):
super(IndeedInSpider, self).__init__(*args, **kwargs)

# Logic for start_urls creations
self.start_urls = []

@@ -37,10 +37,13 @@ def __init__(self, *args, **kwargs):
self.start_urls.append(URL)

for pagination in range(10, 60, 10):
URL = self.base_url_pattern + item.replace(" ", "+") + "&start=" + str(pagination)
URL = self.base_url_pattern + \
item.replace(" ", "+") + "&start=" + str(pagination)
self.start_urls.append(URL)

def parse_item(self, response):
Collaborator: Make sure you have a test case for the parsing logic; this will help ensure our standard is enforced. (A minimal test sketch follows after this diff.)

soup = BeautifulSoup(response.body, 'html.parser')
parser = IndeedParser(soup, "".join(self.allowed_domains))
return parser.parse_response()
        # cache the fetched page content to disk
cache_response(self.name, soup.text)
return parser.parse_response()
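As requested in the review comment on parse_item, a parser test could look roughly like the following. This is a minimal sketch only: the import path for IndeedParser, the indeed.co.in domain string, and the fixture HTML are assumptions, and the assertion stops short of field-level checks because the exact shape returned by parse_response() is not shown in this PR.

```python
import unittest

from bs4 import BeautifulSoup

# Assumed import path, based on the relative import used in indeedin.py.
from scrapy_ft_jobs_sites.spiders.parsers import IndeedParser


class IndeedParserTest(unittest.TestCase):
    def test_parse_response_returns_result(self):
        # Placeholder fixture; a real test would load a saved indeed.co.in page.
        html = "<html><body><div class='result'>Python Developer</div></body></html>"
        soup = BeautifulSoup(html, "html.parser")
        parser = IndeedParser(soup, "indeed.co.in")  # assumed domain string
        result = parser.parse_response()
        # Assert only that parsing succeeds; field-level assertions belong in
        # test_parsers.py once the expected output shape is pinned down.
        self.assertIsNotNone(result)


if __name__ == "__main__":
    unittest.main()
```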