first commit
pirnerjonas committed Jun 15, 2020
0 parents commit 7f75800
Showing 19,292 changed files with 3,118,415 additions and 0 deletions.
The diff is too large to display in full; only the first 3,000 changed files are shown.
1 change: 1 addition & 0 deletions .gitignore
@@ -0,0 +1 @@
/config.json
45 changes: 45 additions & 0 deletions README.md
@@ -0,0 +1,45 @@
# Job search project

## How to use this repository

- extract the data with the Scrapy crawler
- run `main.py` to preprocess and enrich the data
- run the Flask web app to display the results

## Gather data from LinkedIn

The first step of the project is to extract job offerings from LinkedIn. You can run the `scrapy crawl` command with three parameters:
- `keywords`: the position or job title to look for (e.g. data scientist)
- `location`: the location of the job (e.g. Berlin)
- `filter_time`: the time span of job postings to include (`false` for no time filter, `1` for the last 24 hours)


The command to extract all data scientist jobs in Germany would look like this (run it in the `/scraping` directory):
``` console
scrapy crawl linkedin -a keywords="data scientist" -a location="deutschland" -a filter_time="false"
```
If you wish to extract only the jobs posted in the last 24 hours, set `filter_time="1"`.
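For example, the same search limited to the last day (this mirrors the crawl command used in `automate.py`):

``` console
scrapy crawl linkedin -a keywords="data scientist" -a location="deutschland" -a filter_time="1"
```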

## Preprocessing and company ratings
The script `main.py` loads the scraped data from the database, preprocesses it, and fetches Kununu ratings for new companies (see `kununu_ratings.py`).

``` console
python main.py
```

## Send job offerings by mail
``` console
python mail_app.py
```
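Besides the Gmail credentials, `mail_app.py` takes the search keywords and location as command-line arguments (see its argparse setup) so it can pick the matching postings. A full invocation, run from the `flask_mail` directory as in `automate.py`, could look like this (the credentials are placeholders):

``` console
python mail_app.py --keywords "data scientist" --location "deutschland" --username "[email protected]" --password "your-app-password"
```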

## Airflow

The Airflow DAG in `automate.py` chains the crawl, enrich and mail steps and runs them daily. Copy it into your Airflow DAGs folder and start the scheduler:

``` console
cp automate.py ~/airflow/dags
airflow scheduler
```

Start the webserver in a new terminal:

``` console
airflow webserver
```

Unpause the DAG and trigger a run manually:

``` console
airflow unpause 'job_postings' && airflow trigger_dag 'job_postings' --conf '{"keywords":"data analyst", "location":"münchen"}'
```
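`automate.py` reads its searches and the Gmail credentials from a `config.json` in the project root, which is excluded from version control via `.gitignore`. Judging from the keys accessed in `automate.py`, a minimal sketch of the file could look like this; all values are placeholders, and the `keywords` and `location` lists are zipped pairwise, so they should have the same length:

``` json
{
    "keywords": ["data scientist", "data analyst"],
    "location": ["berlin", "münchen"],
    "gmail_username": "[email protected]",
    "gmail_password": "your-app-password"
}
```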
42 changes: 42 additions & 0 deletions automate.py
@@ -0,0 +1,42 @@
from datetime import datetime, timedelta

import airflow
import pandas as pd
import json
from airflow import DAG
from airflow.operators.bash_operator import BashOperator

with open("config.json") as json_data_file:
    config = json.load(json_data_file)

default_args = {
    'owner': 'airflow',
    'start_date': datetime(2020, 6, 13)
}

dag = DAG(dag_id='job_postings', default_args=default_args, schedule_interval='@daily')

# create task for every search in settings
for i, (keywords, location) in enumerate(zip(config['keywords'], config['location'])):

    scrape = BashOperator(
        task_id=f'crawl_{i}',
        bash_command=""" cd ~/Documents/job_mail/scraping/ && scrapy crawl linkedin -a keywords="{{params.keywords}}" -a location="{{params.location}}" -a filter_time="1" """,
        dag=dag,
        params={'keywords': keywords, 'location': location}
    )

    enrich = BashOperator(
        task_id=f'enrich_{i}',
        bash_command='cd ~/Documents/job_mail/ && python main.py',
        dag=dag
    )

    mail = BashOperator(
        task_id=f'mail_{i}',
        bash_command="""cd ~/Documents/job_mail/flask_mail && python mail_app.py --keywords "{{params.keywords}}" --location "{{params.location}}" --username "{{params.username}}" --password "{{params.password}}" """,
        dag=dag,
        params={'keywords': keywords, 'location': location, 'username': config['gmail_username'], 'password': config['gmail_password']}
    )

    scrape >> enrich >> mail
67 changes: 67 additions & 0 deletions flask_mail/mail_app.py
@@ -0,0 +1,67 @@
from flask import Flask, render_template, url_for
from flask_mail import Mail, Message
import pandas as pd
import ast
import datetime
import argparse



# load the data
job_df = pd.read_sql_table('jobs_preprocessed_table','sqlite:///../scraping/jobs.db')
company_df = pd.read_sql_table('company_ratings','sqlite:///../scraping/jobs.db')

# join the data
job_data = pd.merge(job_df, company_df, how='left', on=['company_name','city'])

# sort dataframe
job_data = job_data.sort_values(['total','reviews','views'], ascending=False)
job_data['tfidf_data'] = [ast.literal_eval(job) for job in job_data['tfidf_data']]

# extract only the jobs which were posted today
today = datetime.datetime.now()
job_data['post_date_short'] = [job.date() for job in job_data['post_date']]
job_data = job_data[job_data['post_date_short']==today.date()]

# Initiate the parser
parser = argparse.ArgumentParser()
parser.add_argument("-k", "--keywords", dest='keywords', help="define the keywords to search for", action='store')
parser.add_argument("-l", "--location", dest='location',help="define the location to search for", action='store')
parser.add_argument("-u", "--username", dest='username',help="Username for gmail account", action='store')
parser.add_argument("-p", "--password", dest='password',help="Password for gmail account", action='store')

# Read arguments from the command line
args = parser.parse_args()

# filter keywords and location
job_data = job_data[job_data['search_keywords']==args.keywords]
job_data = job_data[job_data['search_location']==args.location]

# convert dataframe to list of dicts
job_data = job_data.to_dict('records')

num_results = len(job_data)


app = Flask(__name__)

mail_settings = {
    'SERVER_NAME': 'smtp.gmail.com',
    'MAIL_SERVER': 'smtp.gmail.com',
    'MAIL_PORT': 465,
    'MAIL_USERNAME': args.username,
    'MAIL_PASSWORD': args.password,
    'MAIL_USE_TLS': False,
    'MAIL_USE_SSL': True
}
app.config.update(mail_settings)
mail = Mail(app)

if __name__ == '__main__':
    with app.app_context():
        msg = Message(subject=f'Job offerings for {today.strftime("%Y-%m-%d %H:%M:%S")}',
                      sender='[email protected]',
                      recipients=['[email protected]'])
        msg.html = render_template('mail.html', job_data=job_data, num_results=num_results,
                                   search_keywords=args.keywords, search_location=args.location)
        mail.send(msg)
92 changes: 92 additions & 0 deletions flask_mail/templates/mail.html
@@ -0,0 +1,92 @@
<!-- use inline CSS because many mail providers strip external references -->
<head>
<style>
.badge {
display: inline-block;
padding: 0.25em 0.4em;
font-size: 75%;
font-weight: 700;
line-height: 1;
text-align: center;
white-space: nowrap;
vertical-align: baseline;
border-radius: 0.25rem;
transition: color 0.15s ease-in-out, background-color 0.15s ease-in-out, border-color 0.15s ease-in-out, box-shadow 0.15s ease-in-out;
}
.badge-primary {
color: #fff;
background-color: #007bff;
}
.badge-warning {
color: #212529;
background-color: #ffc107;
}
.badge-info {
color: #fff;
background-color: #17a2b8;
}
.content-section {
background: #ffffff;
padding: 10px 20px;
border: 1px solid #dddddd;
border-radius: 3px;
margin-bottom: 20px;
}
.article-title {
color: #444444;
}

a.article-title:hover {
color: #428bca;
text-decoration: none;
}

.article-content {
white-space: pre-line;
}

.article-metadata {
padding-bottom: 1px;
margin-bottom: 4px;
border-bottom: 1px solid #e3e3e3
}
</style>
</head>
<h3>Hey there,</h3>
<p>I found <b>{{ num_results }}</b> new job offerings for your search <b>{{ search_keywords }} in {{ search_location }}</b>.</p>
<br>

{% for job in job_data %}
<article class="media content-section">
<div class="media-body">
<div class="article-metadata">
<a class="mr-2" href="#">{{ job['company_name'] }}</a>
<small class="text-muted">{{ job['city'] }}</small>
<a class="mr-2" href="#">Rating: {{ job['total']}} / {{ job['reviews']}}</a>
<a> Level: {{ job['rating_level']}}</a>
<h1></h1>
</div>
<!-- Main part of the post -->
<h3><a class="article-title" href="{{ ''.join(['http://', job.link]) }}">{{ job['title'] }}</a></h3>

<p class="article-content">{{ job['description_text'][:400] }}</p>

<!-- Industries -->
{% set industrie_list = job.industries.split('; ') %}
<p>
{% for industrie in industrie_list %}
<span class="badge badge-primary">{{ industrie }}</span>
{% endfor %}
</p>
<p>
<span class="badge badge-warning">{{ job.seniority_level }}</span>
</p>
<!-- Tfidf -->
{% for word, score in job['tfidf_data'].items()%}
<span class="badge badge-info">{{ word }}</span>
{% endfor %}
</div>

<br>
</article>
{% endfor %}
85 changes: 85 additions & 0 deletions kununu_ratings.py
@@ -0,0 +1,85 @@
import requests
import pandas as pd
import numpy as np
import urllib


def get_kununu_rating(company, location, first_iter=True):
    # define common terms; sometimes the search term or the kununu company name has an
    # unnecessary GmbH which confuses the search
    COMMON_TERMS = ['gmbh', 'ag', 'dach']
    THRESHOLD = 0.3

    base = 'https://api.kununu.com/v1/search/profiles?'
    # set api parameters to the current tuple
    params = {'page': 1,
              'per_page': 18,
              'q': company,
              'location': location}
    if not first_iter:
        del params['location']
    # concat the url with params
    url = base + urllib.parse.urlencode(params, quote_via=urllib.parse.quote)
    # create the request
    r = requests.get(url, headers={'Referer': 'https://www.kununu.com/us/search',
                                   'Accept': 'application/vnd.kununu.v1+json;version=2016-05-11',
                                   'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'})
    request_data = r.json()

    # lowercase before filtering so terms like 'GmbH' are actually dropped
    input_company_list = company.split(' ')
    input_company_set = set([word.lower() for word in input_company_list if word.lower() not in COMMON_TERMS])

    # try the match strategies in order until a profile is found
    found = False

    try:
        # first best solution: everything is the same
        for profile in request_data['profiles']:
            res_company_list = profile['name'].lower().split(' ')
            res_company_set = [word for word in res_company_list if word not in COMMON_TERMS]

            overlap = input_company_set.intersection(res_company_set)
            overlap_ratio = len(overlap) / len(input_company_set)

            if overlap_ratio > THRESHOLD and profile['city'].lower() == location.lower():
                found = True
                profile['rating_level'] = 'match'
                profile['kununu_link'] = url
                return profile
        # next best solution: same name but 'all' location
        if not found:
            for profile in request_data['profiles']:
                res_company_list = profile['name'].lower().split(' ')
                res_company_set = [word for word in res_company_list if word not in COMMON_TERMS]

                overlap = input_company_set.intersection(res_company_set)
                overlap_ratio = len(overlap) / len(input_company_set)

                if overlap_ratio > THRESHOLD and profile['city'] == 'all':
                    found = True
                    profile['rating_level'] = 'all'
                    profile['kununu_link'] = url
                    return profile
        # next best solution: same name but different location
        if not found:
            for profile in request_data['profiles']:
                res_company_list = profile['name'].lower().split(' ')
                res_company_set = [word for word in res_company_list if word not in COMMON_TERMS]

                overlap = input_company_set.intersection(res_company_set)
                overlap_ratio = len(overlap) / len(input_company_set)

                if overlap_ratio > 0.8:
                    found = True
                    profile['rating_level'] = 'different location'
                    profile['kununu_link'] = url
                    return profile

        # try all options without the location once again
        if first_iter:
            return get_kununu_rating(company=company, location=location, first_iter=False)

    # if nothing works
    except:
        pass
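A quick usage sketch of `get_kununu_rating` (the company name and location below are made-up placeholders; on a match the function returns the Kununu profile dict with the extra `rating_level` and `kununu_link` keys set above, otherwise it returns None):

``` python
# hypothetical example; "Example GmbH" and "Berlin" are placeholders
profile = get_kununu_rating('Example GmbH', 'Berlin')
if profile is not None:
    print(profile['rating_level'], profile['name'], profile['kununu_link'])
else:
    print('no matching kununu profile found')
```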