Commit 7f75800 (initial commit, 0 parents): 19,292 changed files with 3,118,415 additions and 0 deletions.
`.gitignore`:
```
/config.json
```
`README.md`:
# Job search project

## How to use this repository

- Extract data with the Scrapy crawler.
- Run `main.py` to preprocess and enrich the data.
- Run the Flask web app to display the results.

### Gather data from LinkedIn
The first step of the project is to extract job offerings from LinkedIn. Run the scrapy crawl command with three parameters:

- `keywords`: the position or job title to look for (e.g. data scientist)
- `location`: the location of the job (e.g. Berlin)
- `filter_time`: the time span of job postings to include (`"1"` for the last 24 hours, `"false"` for no time filter)

The command to extract all data scientist jobs in Germany looks like this (run it from the `/scraping` directory):

```console
scrapy crawl linkedin -a keywords="data scientist" -a location="deutschland" -a filter_time="false"
```

If you only want the jobs posted within the last 24 hours, use `filter_time="1"` instead.

### Preprocessing and company ratings

The script `main.py` loads the data from the database, preprocesses it, and finds ratings for new companies:

```console
python main.py
```

### Send job offerings by mail

```console
python mail_app.py
```

### Airflow

Copy the DAG definition into your Airflow DAGs folder and start the scheduler:

```console
cp automate.py ~/airflow/dags
airflow scheduler
```

In a new terminal, start the webserver:

```console
airflow webserver
```

Then unpause and trigger the DAG:

```console
airflow unpause 'job_postings' && airflow trigger_dag 'job_postings' --conf '{"keywords":"data analyst", "location":"münchen"}'
```
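
The DAG in `automate.py` reads its searches and mail credentials from `config.json` (which is gitignored). A minimal sketch of the shape it expects, inferred from the keys the DAG reads — all values here are placeholders:

```json
{
  "keywords": ["data scientist", "data analyst"],
  "location": ["berlin", "münchen"],
  "gmail_username": "[email protected]",
  "gmail_password": "your-app-password"
}
```

`keywords` and `location` are parallel lists: the DAG zips them, so the i-th keyword is searched in the i-th location.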
`automate.py`:
```python
from datetime import datetime
import json

from airflow import DAG
from airflow.operators.bash_operator import BashOperator

# load the search settings and mail credentials
with open("config.json") as json_data_file:
    config = json.load(json_data_file)

default_args = {
    'owner': 'airflow',
    'start_date': datetime(2020, 6, 13)
}

dag = DAG(dag_id='job_postings', default_args=default_args, schedule_interval='@daily')

# create a scrape -> enrich -> mail task chain for every search in the settings
for i, (keywords, location) in enumerate(zip(config['keywords'], config['location'])):

    # crawl LinkedIn for jobs posted in the last 24 hours
    scrape = BashOperator(
        task_id=f'crawl_{i}',
        bash_command=""" cd ~/Documents/job_mail/scraping/ && scrapy crawl linkedin -a keywords="{{params.keywords}}" -a location="{{params.location}}" -a filter_time="1" """,
        dag=dag,
        params={'keywords': keywords, 'location': location}
    )

    # preprocess the scraped data and look up company ratings
    enrich = BashOperator(
        task_id=f'enrich_{i}',
        bash_command='cd ~/Documents/job_mail/ && python main.py',
        dag=dag
    )

    # send the new job offerings per mail
    mail = BashOperator(
        task_id=f'mail_{i}',
        bash_command="""cd ~/Documents/job_mail/flask_mail && python mail_app.py --keywords "{{params.keywords}}" --location "{{params.location}}" --username "{{params.username}}" --password "{{params.password}}" """,
        dag=dag,
        params={'keywords': keywords, 'location': location,
                'username': config['gmail_username'], 'password': config['gmail_password']}
    )

    scrape >> enrich >> mail
```
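
To smoke-test a single task without waiting for the scheduler, the Airflow 1.x CLI (the same generation as the `trigger_dag` command used above) offers `airflow test`; the execution date is arbitrary:

```console
airflow test job_postings crawl_0 2020-06-13
```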
`flask_mail/mail_app.py`:
```python
import argparse
import ast
import datetime

import pandas as pd
from flask import Flask, render_template
from flask_mail import Mail, Message

# load the data
job_df = pd.read_sql_table('jobs_preprocessed_table', 'sqlite:///../scraping/jobs.db')
company_df = pd.read_sql_table('company_ratings', 'sqlite:///../scraping/jobs.db')

# join the jobs with their company ratings
job_data = pd.merge(job_df, company_df, how='left', on=['company_name', 'city'])

# sort the dataframe by rating, reviews and views; parse the tf-idf keywords
job_data = job_data.sort_values(['total', 'reviews', 'views'], ascending=False)
job_data['tfidf_data'] = [ast.literal_eval(job) for job in job_data['tfidf_data']]

# keep only the jobs which were posted today
today = datetime.datetime.now()
job_data['post_date_short'] = [job.date() for job in job_data['post_date']]
job_data = job_data[job_data['post_date_short'] == today.date()]

# read the search terms and mail credentials from the command line
parser = argparse.ArgumentParser()
parser.add_argument("-k", "--keywords", dest='keywords', help="define the keywords to search for", action='store')
parser.add_argument("-l", "--location", dest='location', help="define the location to search for", action='store')
parser.add_argument("-u", "--username", dest='username', help="username for the Gmail account", action='store')
parser.add_argument("-p", "--password", dest='password', help="password for the Gmail account", action='store')
args = parser.parse_args()

# filter by the search keywords and location
job_data = job_data[job_data['search_keywords'] == args.keywords]
job_data = job_data[job_data['search_location'] == args.location]

# convert the dataframe to a list of dicts for the template
job_data = job_data.to_dict('records')
num_results = len(job_data)

app = Flask(__name__)

# send via Gmail over SSL; MAIL_SERVER is the SMTP host
# (Flask's SERVER_NAME is for URL building and is not needed here)
mail_settings = {
    'MAIL_SERVER': 'smtp.gmail.com',
    'MAIL_PORT': 465,
    'MAIL_USERNAME': args.username,
    'MAIL_PASSWORD': args.password,
    'MAIL_USE_TLS': False,
    'MAIL_USE_SSL': True
}
app.config.update(mail_settings)
mail = Mail(app)

if __name__ == '__main__':
    with app.app_context():
        msg = Message(subject=f'Job offerings for {today.strftime("%Y-%m-%d %H:%M:%S")}',
                      sender='[email protected]',
                      recipients=['[email protected]'])
        msg.html = render_template('mail.html', job_data=job_data, num_results=num_results,
                                   search_keywords=args.keywords, search_location=args.location)
        mail.send(msg)
```
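
A usage sketch matching the flags defined above (the credentials are placeholders; Gmail over SSL typically requires an app password):

```console
python mail_app.py --keywords "data scientist" --location "berlin" --username "[email protected]" --password "your-app-password"
```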
`flask_mail/templates/mail.html`:
```html
<!-- use inline CSS because many mail providers strip external references -->
<head>
<style>
  .badge {
    display: inline-block;
    padding: 0.25em 0.4em;
    font-size: 75%;
    font-weight: 700;
    line-height: 1;
    text-align: center;
    white-space: nowrap;
    vertical-align: baseline;
    border-radius: 0.25rem;
    transition: color 0.15s ease-in-out, background-color 0.15s ease-in-out, border-color 0.15s ease-in-out, box-shadow 0.15s ease-in-out;
  }
  .badge-primary {
    color: #fff;
    background-color: #007bff;
  }
  .badge-warning {
    color: #212529;
    background-color: #ffc107;
  }
  .badge-info {
    color: #fff;
    background-color: #17a2b8;
  }
  .content-section {
    background: #ffffff;
    padding: 10px 20px;
    border: 1px solid #dddddd;
    border-radius: 3px;
    margin-bottom: 20px;
  }
  .article-title {
    color: #444444;
  }

  a.article-title:hover {
    color: #428bca;
    text-decoration: none;
  }

  .article-content {
    white-space: pre-line;
  }

  .article-metadata {
    padding-bottom: 1px;
    margin-bottom: 4px;
    border-bottom: 1px solid #e3e3e3;
  }
</style>
</head>
<h3>Hey there,</h3>
<p>I found <b>{{ num_results }}</b> new job offerings for your search <b>{{ search_keywords }} in {{ search_location }}</b>.</p>
<br>

{% for job in job_data %}
<article class="media content-section">
  <div class="media-body">
    <div class="article-metadata">
      <a class="mr-2" href="#">{{ job['company_name'] }}</a>
      <small class="text-muted">{{ job['city'] }}</small>
      <a class="mr-2" href="#">Rating: {{ job['total'] }} / {{ job['reviews'] }}</a>
      <a>Level: {{ job['rating_level'] }}</a>
      <h1></h1>
    </div>
    <!-- main part of the post -->
    <h3><a class="article-title" href="{{ ''.join(['http://', job.link]) }}">{{ job['title'] }}</a></h3>

    <p class="article-content">{{ job['description_text'][:400] }}</p>

    <!-- industry badges -->
    {% set industrie_list = job.industries.split('; ') %}
    <p>
    {% for industrie in industrie_list %}
      <span class="badge badge-primary">{{ industrie }}</span>
    {% endfor %}
    </p>
    <p>
      <span class="badge badge-warning">{{ job.seniority_level }}</span>
    </p>
    <!-- tf-idf keyword badges -->
    {% for word, score in job['tfidf_data'].items() %}
      <span class="badge badge-info">{{ word }}</span>
    {% endfor %}
  </div>

  <br>
</article>
{% endfor %}
```
Kununu ratings helper (file name not shown):
```python
import urllib.parse

import requests


def get_kununu_rating(company, location, first_iter=True):
    # drop common terms: sometimes the search term or the kununu company name
    # carries an unnecessary "GmbH", which confuses the search
    COMMON_TERMS = ['gmbh', 'ag', 'dach']
    THRESHOLD = 0.3

    base = 'https://api.kununu.com/v1/search/profiles?'
    # set the API parameters for the current search
    params = {'page': 1,
              'per_page': 18,
              'q': company,
              'location': location}
    # on the second pass, search without the location constraint
    if not first_iter:
        del params['location']
    # concatenate the url with the params
    url = base + urllib.parse.urlencode(params, quote_via=urllib.parse.quote)
    # query the search endpoint
    r = requests.get(url, headers={'Referer': 'https://www.kununu.com/us/search',
                                   'Accept': 'application/vnd.kununu.v1+json;version=2016-05-11',
                                   'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'})
    request_data = r.json()

    # normalize the input company name for the overlap comparison
    input_company_list = company.split(' ')
    input_company_set = set([word.lower() for word in input_company_list if word not in COMMON_TERMS])

    try:
        # best case: enough name overlap and the city matches exactly
        for profile in request_data['profiles']:
            res_company_list = profile['name'].lower().split(' ')
            res_company_set = [word for word in res_company_list if word not in COMMON_TERMS]
            overlap = input_company_set.intersection(res_company_set)
            if (len(overlap) / len(input_company_set)) > THRESHOLD and profile['city'].lower() == location.lower():
                profile['rating_level'] = 'match'
                profile['kununu_link'] = url
                return profile
        # next best: the name matches but the profile covers 'all' locations
        for profile in request_data['profiles']:
            res_company_list = profile['name'].lower().split(' ')
            res_company_set = [word for word in res_company_list if word not in COMMON_TERMS]
            overlap = input_company_set.intersection(res_company_set)
            if (len(overlap) / len(input_company_set)) > THRESHOLD and profile['city'] == 'all':
                profile['rating_level'] = 'all'
                profile['kununu_link'] = url
                return profile
        # last resort: a strong name match (stricter 0.8 overlap) at a different location
        for profile in request_data['profiles']:
            res_company_list = profile['name'].lower().split(' ')
            res_company_set = [word for word in res_company_list if word not in COMMON_TERMS]
            overlap = input_company_set.intersection(res_company_set)
            if (len(overlap) / len(input_company_set)) > 0.8:
                profile['rating_level'] = 'different location'
                profile['kununu_link'] = url
                return profile
        # nothing matched with the location constraint: retry once without it
        if first_iter:
            return get_kununu_rating(company=company, location=location, first_iter=False)
    except Exception:
        # if nothing works, return None
        return None
```
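
A hypothetical usage sketch — the company name and city below are made up, and a live call depends on the Kununu search API still serving this endpoint:

```python
profile = get_kununu_rating('Example Analytics GmbH', 'berlin')
if profile is not None:
    # the returned profile dict carries the fields the matcher sets
    print(profile['name'], profile['city'], profile['rating_level'])
```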