first commit
pirnerjonas committed Jun 15, 2020
0 parents commit 7f75800
Showing 19,292 changed files with 3,118,415 additions and 0 deletions.
The diff is too large to display in full; only the first 3,000 changed files are shown.
1 change: 1 addition & 0 deletions .gitignore
@@ -0,0 +1 @@
/config.json
45 changes: 45 additions & 0 deletions README.md
@@ -0,0 +1,45 @@
# Job search project

## How to use this repository

- extract the data with the Scrapy crawler
- run `main.py` to preprocess and enrich the data
- run the Flask web app to display the results

## Gather data from LinkedIn

The first step of the project is to extract job offerings from LinkedIn. You can run the `scrapy crawl` command with three parameters:
- `keywords`: the position or job title to look for (e.g. data scientist)
- `location`: the location of the job (e.g. Berlin)
- `filter_time`: the time span of job postings to include (`false` for no time filter, `1` for the last 24 hours)


The command to extract all data scientist jobs in Germany would look like this (run it in the `/scraping` directory):
``` console
scrapy crawl linkedin -a keywords="data scientist" -a location="deutschland" -a filter_time="false"
```
If you wish to extract only the jobs posted in the last 24 hours, set `filter_time="1"`.
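For example, the same search limited to the last day (this mirrors the crawl command used in `automate.py`):

``` console
scrapy crawl linkedin -a keywords="data scientist" -a location="deutschland" -a filter_time="1"
```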

## Preprocessing and company ratings
The script `main.py` loads the scraped data from the database, preprocesses it, and fetches Kununu ratings for new companies (see `kununu_ratings.py`).

``` console
python main.py
```

## Send job offerings by mail
``` console
python mail_app.py
```
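Besides the Gmail credentials, `mail_app.py` takes the search keywords and location as command-line arguments (see its argparse setup) so it can pick the matching postings. A full invocation, run from the `flask_mail` directory as in `automate.py`, could look like this (the credentials are placeholders):

``` console
python mail_app.py --keywords "data scientist" --location "deutschland" --username "[email protected]" --password "your-app-password"
```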

## Airflow

The Airflow DAG in `automate.py` chains the crawl, enrich and mail steps and runs them daily. Copy it into your Airflow DAGs folder and start the scheduler:

``` console
cp automate.py ~/airflow/dags
airflow scheduler
```

Start the webserver in a new terminal:

``` console
airflow webserver
```

Unpause the DAG and trigger a run manually:

``` console
airflow unpause 'job_postings' && airflow trigger_dag 'job_postings' --conf '{"keywords":"data analyst", "location":"münchen"}'
```
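`automate.py` reads its searches and the Gmail credentials from a `config.json` in the project root, which is excluded from version control via `.gitignore`. Judging from the keys accessed in `automate.py`, a minimal sketch of the file could look like this; all values are placeholders, and the `keywords` and `location` lists are zipped pairwise, so they should have the same length:

``` json
{
    "keywords": ["data scientist", "data analyst"],
    "location": ["berlin", "münchen"],
    "gmail_username": "[email protected]",
    "gmail_password": "your-app-password"
}
```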
42 changes: 42 additions & 0 deletions automate.py
@@ -0,0 +1,42 @@
from datetime import datetime, timedelta

import airflow
import pandas as pd
import json
from airflow import DAG
from airflow.operators.bash_operator import BashOperator

with open("config.json") as json_data_file:
    config = json.load(json_data_file)

default_args = {
    'owner': 'airflow',
    'start_date': datetime(2020, 6, 13)
}

dag = DAG(dag_id='job_postings', default_args=default_args, schedule_interval='@daily')

# create task for every search in settings
for i, (keywords, location) in enumerate(zip(config['keywords'], config['location'])):

    scrape = BashOperator(
        task_id=f'crawl_{i}',
        bash_command=""" cd ~/Documents/job_mail/scraping/ && scrapy crawl linkedin -a keywords="{{params.keywords}}" -a location="{{params.location}}" -a filter_time="1" """,
        dag=dag,
        params={'keywords': keywords, 'location': location}
    )

    enrich = BashOperator(
        task_id=f'enrich_{i}',
        bash_command='cd ~/Documents/job_mail/ && python main.py',
        dag=dag
    )

    mail = BashOperator(
        task_id=f'mail_{i}',
        bash_command="""cd ~/Documents/job_mail/flask_mail && python mail_app.py --keywords "{{params.keywords}}" --location "{{params.location}}" --username "{{params.username}}" --password "{{params.password}}" """,
        dag=dag,
        params={'keywords': keywords, 'location': location, 'username': config['gmail_username'], 'password': config['gmail_password']}
    )

    scrape >> enrich >> mail
67 changes: 67 additions & 0 deletions flask_mail/mail_app.py
@@ -0,0 +1,67 @@
from flask import Flask, render_template, url_for
from flask_mail import Mail, Message
import pandas as pd
import ast
import datetime
import argparse



# load the data
job_df = pd.read_sql_table('jobs_preprocessed_table','sqlite:///../scraping/jobs.db')
company_df = pd.read_sql_table('company_ratings','sqlite:///../scraping/jobs.db')

# join the data
job_data = pd.merge(job_df, company_df, how='left', on=['company_name','city'])

# sort dataframe
job_data = job_data.sort_values(['total','reviews','views'], ascending=False)
job_data['tfidf_data'] = [ast.literal_eval(job) for job in job_data['tfidf_data']]

# extract only the jobs which were posted today
today = datetime.datetime.now()
job_data['post_date_short'] = [job.date() for job in job_data['post_date']]
job_data = job_data[job_data['post_date_short']==today.date()]

# Initiate the parser
parser = argparse.ArgumentParser()
parser.add_argument("-k", "--keywords", dest='keywords', help="define the keywords to search for", action='store')
parser.add_argument("-l", "--location", dest='location',help="define the location to search for", action='store')
parser.add_argument("-u", "--username", dest='username',help="Username for gmail account", action='store')
parser.add_argument("-p", "--password", dest='password',help="Password for gmail account", action='store')

# Read arguments from the command line
args = parser.parse_args()

# filter keywords and location
job_data = job_data[job_data['search_keywords']==args.keywords]
job_data = job_data[job_data['search_location']==args.location]

# convert dataframe to list of dicts
job_data = job_data.to_dict('records')

num_results = len(job_data)


app = Flask(__name__)

mail_settings = {
    'SERVER_NAME': 'smtp.gmail.com',
    'MAIL_SERVER': 'smtp.gmail.com',
    'MAIL_PORT': 465,
    'MAIL_USERNAME': args.username,
    'MAIL_PASSWORD': args.password,
    'MAIL_USE_TLS': False,
    'MAIL_USE_SSL': True
}
app.config.update(mail_settings)
mail = Mail(app)

if __name__ == '__main__':
    with app.app_context():
        msg = Message(subject=f'Job offerings for {today.strftime("%Y-%m-%d %H:%M:%S")}',
                      sender='[email protected]',
                      recipients=['[email protected]'])
        msg.html = render_template('mail.html', job_data=job_data, num_results=num_results,
                                   search_keywords=args.keywords, search_location=args.location)
        mail.send(msg)
92 changes: 92 additions & 0 deletions flask_mail/templates/mail.html
@@ -0,0 +1,92 @@
<!-- use inline CSS because many mail providers strip external references -->
<head>
<style>
.badge {
display: inline-block;
padding: 0.25em 0.4em;
font-size: 75%;
font-weight: 700;
line-height: 1;
text-align: center;
white-space: nowrap;
vertical-align: baseline;
border-radius: 0.25rem;
transition: color 0.15s ease-in-out, background-color 0.15s ease-in-out, border-color 0.15s ease-in-out, box-shadow 0.15s ease-in-out;
}
.badge-primary {
color: #fff;
background-color: #007bff;
}
.badge-warning {
color: #212529;
background-color: #ffc107;
}
.badge-info {
color: #fff;
background-color: #17a2b8;
}
.content-section {
background: #ffffff;
padding: 10px 20px;
border: 1px solid #dddddd;
border-radius: 3px;
margin-bottom: 20px;
}
.article-title {
color: #444444;
}

a.article-title:hover {
color: #428bca;
text-decoration: none;
}

.article-content {
white-space: pre-line;
}

.article-metadata {
padding-bottom: 1px;
margin-bottom: 4px;
border-bottom: 1px solid #e3e3e3
}
</style>
</head>
<h3>Hey there,</h3>
<p>I found <b>{{ num_results }}</b> new job offerings for your search <b>{{ search_keywords }} in {{ search_location }}</b>.</p>
<br>

{% for job in job_data %}
<article class="media content-section">
<div class="media-body">
<div class="article-metadata">
<a class="mr-2" href="#">{{ job['company_name'] }}</a>
<small class="text-muted">{{ job['city'] }}</small>
<a class="mr-2" href="#">Rating: {{ job['total']}} / {{ job['reviews']}}</a>
<a> Level: {{ job['rating_level']}}</a>
<h1></h1>
</div>
<!-- Main part of the post -->
<h3><a class="article-title" href="{{ ''.join(['http://', job.link]) }}">{{ job['title'] }}</a></h3>

<p class="article-content">{{ job['description_text'][:400] }}</p>

<!-- Industries -->
{% set industrie_list = job.industries.split('; ') %}
<p>
{% for industrie in industrie_list %}
<span class="badge badge-primary">{{ industrie }}</span>
{% endfor %}
</p>
<p>
<span class="badge badge-warning">{{ job.seniority_level }}</span>
</p>
<!-- Tfidf -->
{% for word, score in job['tfidf_data'].items()%}
<span class="badge badge-info">{{ word }}</span>
{% endfor %}
</div>

<br>
</article>
{% endfor %}
85 changes: 85 additions & 0 deletions kununu_ratings.py
@@ -0,0 +1,85 @@
import requests
import pandas as pd
import numpy as np
import urllib


def get_kununu_rating(company, location, first_iter=True):
    # define common terms; sometimes the search term or the kununu company name has an
    # unnecessary GmbH which confuses the search
    COMMON_TERMS = ['gmbh', 'ag', 'dach']
    THRESHOLD = 0.3

    base = 'https://api.kununu.com/v1/search/profiles?'
    # set api parameters to the current tuple
    params = {'page': 1,
              'per_page': 18,
              'q': company,
              'location': location}
    if not first_iter:
        del params['location']
    # concat the url with params
    url = base + urllib.parse.urlencode(params, quote_via=urllib.parse.quote)
    # create the request
    r = requests.get(url, headers={'Referer': 'https://www.kununu.com/us/search',
                                   'Accept': 'application/vnd.kununu.v1+json;version=2016-05-11',
                                   'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'})
    request_data = r.json()

    # lowercase before filtering so terms like 'GmbH' are actually dropped
    input_company_list = company.split(' ')
    input_company_set = set([word.lower() for word in input_company_list if word.lower() not in COMMON_TERMS])

    # try the match strategies in order until a profile is found
    found = False

    try:
        # first best solution: everything is the same
        for profile in request_data['profiles']:
            res_company_list = profile['name'].lower().split(' ')
            res_company_set = [word for word in res_company_list if word not in COMMON_TERMS]

            overlap = input_company_set.intersection(res_company_set)
            overlap_ratio = len(overlap) / len(input_company_set)

            if overlap_ratio > THRESHOLD and profile['city'].lower() == location.lower():
                found = True
                profile['rating_level'] = 'match'
                profile['kununu_link'] = url
                return profile
        # next best solution: same name but 'all' location
        if not found:
            for profile in request_data['profiles']:
                res_company_list = profile['name'].lower().split(' ')
                res_company_set = [word for word in res_company_list if word not in COMMON_TERMS]

                overlap = input_company_set.intersection(res_company_set)
                overlap_ratio = len(overlap) / len(input_company_set)

                if overlap_ratio > THRESHOLD and profile['city'] == 'all':
                    found = True
                    profile['rating_level'] = 'all'
                    profile['kununu_link'] = url
                    return profile
        # next best solution: same name but different location
        if not found:
            for profile in request_data['profiles']:
                res_company_list = profile['name'].lower().split(' ')
                res_company_set = [word for word in res_company_list if word not in COMMON_TERMS]

                overlap = input_company_set.intersection(res_company_set)
                overlap_ratio = len(overlap) / len(input_company_set)

                if overlap_ratio > 0.8:
                    found = True
                    profile['rating_level'] = 'different location'
                    profile['kununu_link'] = url
                    return profile

        # try all options without the location once again
        if first_iter:
            return get_kununu_rating(company=company, location=location, first_iter=False)

    # if nothing works
    except:
        pass
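A quick usage sketch of `get_kununu_rating` (the company name and location below are made-up placeholders; on a match the function returns the Kununu profile dict with the extra `rating_level` and `kununu_link` keys set above, otherwise it returns None):

``` python
# hypothetical example; "Example GmbH" and "Berlin" are placeholders
profile = get_kununu_rating('Example GmbH', 'Berlin')
if profile is not None:
    print(profile['rating_level'], profile['name'], profile['kununu_link'])
else:
    print('no matching kununu profile found')
```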