Skip to content

Commit

Permalink
Interactive skills hierarchy plot notebook (#63)
Browse files Browse the repository at this point in the history
* Add interactive skills hierarchy notebook and json files for renaming some skill groups in utils

* Add plot for number of job adverts and unique number of skills

* Look at other regions in the region to rest-of-UK level B comparisons

* Add proportions to transversal skill output and output level A covid proportion changes in table too

* black plot interactive hierarchy

* apply black to both transversal and geography application notebooks
  • Loading branch information
lizgzil authored Sep 29, 2021
1 parent e7ee1c4 commit ac4c3d4
Show file tree
Hide file tree
Showing 4 changed files with 233 additions and 51 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -95,4 +95,61 @@
)


# %% [markdown]
# ## Number of job adverts and unique number of skills

# %%
file_name_date = "2021.08.31"
sentence_data = load_s3_data(
    s3,
    BUCKET_NAME,
    f"outputs/skills_extraction/extracted_skills/{file_name_date}_sentences_data.json",
)


# %%
sentence_data = pd.DataFrame(sentence_data)
# Cluster number == -1 marks sentences not assigned to any skill cluster; drop them.
sentence_data = sentence_data[sentence_data["Cluster number"] != -1]

# %%
len(sentence_data)

# %%
# unique_skills[k] = number of unique skills seen in the first k sentences.
# The original loop recomputed nunique() over the prefix for every k, which is
# O(n^2) over the whole dataset; a cumulative sum of first occurrences is O(n)
# and yields identical values (nunique of rows [0, k) for each k).
is_first_occurrence = ~sentence_data["Cluster number"].duplicated()
cumulative_unique = is_first_occurrence.cumsum()
unique_skills = {
    k: (int(cumulative_unique.iloc[k - 1]) if k > 0 else 0)
    for k in range(len(sentence_data))
}

# %%
x = list(unique_skills.keys())
y = list(unique_skills.values())

nesta_orange = [255 / 255, 90 / 255, 0 / 255]  # also used by later figure cells
plt.plot(x, y, color="black")
plt.xlabel("Number of sentences")
plt.ylabel("Number of unique skills")
plt.savefig(
    "outputs/skills_extraction/figures/num_sent_num_skills.pdf", bbox_inches="tight"
)


# %% [markdown]
# ## Together

# %%
# Side-by-side comparison: vocabulary growth vs unique-skill growth as the
# number of processed sentences increases.
x_vocab = [pair[0] for pair in num_sentences_and_vocab_size]
y_vocab = [pair[1] for pair in num_sentences_and_vocab_size]

x_skills = list(unique_skills.keys())
y_skills = list(unique_skills.values())

# %%
fig, axs = plt.subplots(1, 2, figsize=(10, 3))

axs[0].plot(x_vocab, y_vocab, color="black")
# 322071: presumably the sentence count used for the final vocab — TODO confirm
axs[0].axvline(322071, color=nesta_orange, ls="--")
axs[0].set_xlabel("Number of sentences")
axs[0].set_ylabel("Number of unique words in vocab")

axs[1].plot(x_skills, y_skills, color="black")
axs[1].set_xlabel("Number of sentences")
axs[1].set_ylabel("Number of unique skills")

plt.tight_layout()
plt.savefig(
    "outputs/skills_extraction/figures/num_sent_num_skills_vocab_size.pdf",
    bbox_inches="tight",
)


# %%
Original file line number Diff line number Diff line change
Expand Up @@ -723,9 +723,20 @@ def print_untrans_skills(skill_group_scores, level_skill_group, cent_max, clust_
}

# %%
# Attach hierarchy info to the transversal level C skills table and compute,
# for each skill group, the share of all job adverts that mention it.
num_all_job_ads = sentence_data["job id"].nunique()
trans_skills_levc = pd.concat(
    [trans_skills_levc, pd.DataFrame(trans_skills_hier).T], axis=1
)


def _percent_adverts_with_skill(level_c):
    """Percentage (2 d.p.) of all job adverts containing this level C group."""
    n_ads = sentence_data[sentence_data["Hierarchy level C"] == int(level_c)][
        "job id"
    ].nunique()
    return round(n_ads * 100 / num_all_job_ads, 2)


trans_skills_levc["Percentage of job adverts with this skill"] = trans_skills_levc[
    "Level C"
].apply(_percent_adverts_with_skill)
trans_skills_levc.to_csv("outputs/skills_taxonomy/transversal/lev_c_trans_skills.csv")
trans_skills_levc

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,31 @@
)


# %%
# Compare the level A skill-group mix of pre- vs post-COVID job adverts and
# save the proportions (plus their ratio) to CSV.
sentence_data_postcovid = sentence_data_with_meta[
    sentence_data_with_meta["covid"] == "Post-COVID"
]
level_a_prop_post_covid = sentence_data_postcovid[
    "Hierarchy level A name"
].value_counts() / len(sentence_data_postcovid)

sentence_data_precovid = sentence_data_with_meta[
    sentence_data_with_meta["covid"] == "Pre-COVID"
]
level_a_prop_pre_covid = sentence_data_precovid[
    "Hierarchy level A name"
].value_counts() / len(sentence_data_precovid)

pre_col = "Proportion of level A skill group in pre-covid job adverts only"
post_col = "Proportion of level A skill group in post-covid job adverts only"
df = pd.concat(
    [
        pd.DataFrame(level_a_prop_pre_covid).rename(
            columns={"Hierarchy level A name": pre_col}
        ),
        pd.DataFrame(level_a_prop_post_covid).rename(
            columns={"Hierarchy level A name": post_col}
        ),
    ],
    axis=1,
)
# NOTE(review): despite the name, this column holds a ratio (post/pre), not a difference.
df["Increase from before to after COVID"] = df[post_col] / df[pre_col]
df.round(3).to_csv(
    "outputs/skills_taxonomy_application/covid_application/covid_prepost_leva.csv"
)



# %%
# NOTE(review): `prop_level_a_covid` is not defined in this visible chunk —
# presumably created in an earlier cell; verify before re-running.
# For each (level_1, covid) group, take the first "Hierarchy level A name".
prop_level_a_covid.reset_index().groupby(['level_1','covid']).apply(
    lambda x: x['Hierarchy level A name'].iloc[0]
)

# %% [markdown]
# ## pre vs post covid quotients

Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# -*- coding: utf-8 -*-
# ---
# jupyter:
# jupytext:
Expand Down Expand Up @@ -464,9 +465,12 @@
sum(sentence_data_with_meta["subregion"].notna())

# %%
# NOTE(review): this span is a GitHub diff rendering with the +/- markers
# stripped, so removed (old) lines and added (new) lines are interleaved
# and the span is not runnable as-is.
# Old behaviour: London compared against ALL adverts (level_b_prop_all).
# New behaviour: London compared against the rest of the UK (level_b_prop_rest).
level_b_prop_all = sentence_data_with_meta[
    "Hierarchy level B name"
].value_counts() / len(sentence_data_with_meta)
sentence_data_rest = sentence_data_with_meta[
    sentence_data_with_meta["NUTs region"] != "Greater London"
]
level_b_prop_rest = sentence_data_rest["Hierarchy level B name"].value_counts() / len(
    sentence_data_rest
)

# NOTE(review): the London subset is selected on "subregion" while the rest is
# selected on "NUTs region" — presumably equivalent for Greater London; verify
# against the metadata schema.
sentence_data_with_meta_filter = sentence_data_with_meta[
    sentence_data_with_meta["subregion"] == "Greater London"
Expand All @@ -475,7 +479,8 @@
    "Hierarchy level B name"
].value_counts() / len(sentence_data_with_meta_filter)

# Location quotient: London's level B share divided by the comparison share.
london_quotient = level_b_prop_london / level_b_prop_all
london_quotient = level_b_prop_london / level_b_prop_rest

london_quotient = london_quotient[pd.notnull(london_quotient)].sort_values(
    ascending=True
)
Expand All @@ -494,11 +499,12 @@
    bbox_inches="tight",
)


# %%
# NOTE(review): diff rendering with +/- markers stripped — old (vs-all) and
# new (vs-rest-of-UK) level A comparison lines are interleaved below.
level_a_prop_all = sentence_data_with_meta[
    "Hierarchy level A name"
].value_counts() / len(sentence_data_with_meta)
sentence_data_rest = sentence_data_with_meta[
    sentence_data_with_meta["NUTs region"] != "Greater London"
]
level_a_prop_rest = sentence_data_rest["Hierarchy level A name"].value_counts() / len(
    sentence_data_rest
)

sentence_data_with_meta_filter = sentence_data_with_meta[
    sentence_data_with_meta["subregion"] == "Greater London"
Expand All @@ -507,7 +513,7 @@
    "Hierarchy level A name"
].value_counts() / len(sentence_data_with_meta_filter)

# Location quotient for level A skill groups (old divisor vs new divisor).
london_quotient = level_a_prop_london / level_a_prop_all
london_quotient = level_a_prop_london / level_a_prop_rest
london_quotient = london_quotient[pd.notnull(london_quotient)].sort_values(
    ascending=True
)
Expand All @@ -526,69 +532,152 @@
    bbox_inches="tight",
)


# %% [markdown]
# ## Load other metadata
# ## Other outliers

# %%
# NOTE(review): diff rendering with +/- markers stripped — the old
# "Load other metadata" cell (job_id_data_dict, date-year Counter) is
# interleaved with the new "Other outliers" region-quotient cells below.
# Really big!
job_id_data_dict = load_s3_data(
    s3, bucket_name, "outputs/tk_data_analysis/metadata_date_dict.json"
# The North East has a much higher demand for “Teaching and care”.

region = "North East (England)"

# Level B skill-group shares within the region...
sentence_data_region = sentence_data_with_meta[
    sentence_data_with_meta["NUTs region"] == region
]
level_b_prop_region = sentence_data_region[
    "Hierarchy level B name"
].value_counts() / len(sentence_data_region)

# ...versus shares in the rest of the UK.
sentence_data_rest = sentence_data_with_meta[
    sentence_data_with_meta["NUTs region"] != region
]
level_b_prop_rest = sentence_data_rest["Hierarchy level B name"].value_counts() / len(
    sentence_data_rest
)
len(job_id_data_dict)

# Location quotient: >1 means the skill group is over-represented in the region.
region_quotient = level_b_prop_region / level_b_prop_rest
region_quotient = region_quotient[pd.notnull(region_quotient)].sort_values(
    ascending=True
)

region_quotient

# %%
date_years = [v[0][0:4] for k, v in job_id_data_dict.items()]
Counter(date_years)
# Drill down into the level C groups inside the clinical/nursing level B group.
sentence_data[sentence_data["Hierarchy level B name"] == "clinical-patients-nursing"][
    "Hierarchy level C name"
].value_counts()

# %% [markdown]
# ## Level B in 2019 compared to others

# %%
# NOTE(review): diff rendering with +/- markers stripped — the old
# 2019-vs-all "year quotient" cells are interleaved with the new Wales
# region-quotient cells below.
level_b_prop_all = skill_job_meta["Hierarchy level B name"].value_counts() / len(
    skill_job_meta
)
# Wales has a particular low demand for “Customer service and marketing” skills.
region = "Wales"

skill_job_meta_filter = skill_job_meta[skill_job_meta["job year"] == "2019"]
level_b_prop_2019 = skill_job_meta_filter[
sentence_data_region = sentence_data_with_meta[
    sentence_data_with_meta["NUTs region"] == region
]
level_b_prop_region = sentence_data_region[
    "Hierarchy level B name"
].value_counts() / len(skill_job_meta_filter)
].value_counts() / len(sentence_data_region)

# %%
year_quotient = level_b_prop_2019 / level_b_prop_all
year_quotient = year_quotient[pd.notnull(year_quotient)].sort_values(ascending=True)
sentence_data_rest = sentence_data_with_meta[
    sentence_data_with_meta["NUTs region"] != region
]
level_b_prop_rest = sentence_data_rest["Hierarchy level B name"].value_counts() / len(
    sentence_data_rest
)

# Location quotient for Wales vs the rest of the UK.
region_quotient = level_b_prop_region / level_b_prop_rest
region_quotient = region_quotient[pd.notnull(region_quotient)].sort_values(
    ascending=True
)

region_quotient

# %%
# NOTE(review): diff rendering with +/- markers stripped — the old 2019
# bar-chart cell is interleaved with the new Northern Ireland
# region-quotient cell below.
year_quotient.plot.barh(
    figsize=(8, 10),
    ylabel="Year quotient",
    xlabel="Level B hierarchy",
    title="Year 2019 quotient",

region = "Northern Ireland"

sentence_data_region = sentence_data_with_meta[
    sentence_data_with_meta["NUTs region"] == region
]
level_b_prop_region = sentence_data_region[
    "Hierarchy level B name"
].value_counts() / len(sentence_data_region)

sentence_data_rest = sentence_data_with_meta[
    sentence_data_with_meta["NUTs region"] != region
]
level_b_prop_rest = sentence_data_rest["Hierarchy level B name"].value_counts() / len(
    sentence_data_rest
)
plt.axvline(1)

# %% [markdown]
# ## Level A in 2019 compared to others

# Location quotient for Northern Ireland vs the rest of the UK.
region_quotient = level_b_prop_region / level_b_prop_rest
region_quotient = region_quotient[pd.notnull(region_quotient)].sort_values(
    ascending=True
)

region_quotient

# %%
# NOTE(review): diff rendering with +/- markers stripped — the old level A
# 2019 cells are interleaved with the new East Midlands region-quotient
# cell below.
level_a_prop_all = skill_job_meta["Hierarchy level A name"].value_counts() / len(
    skill_job_meta
region = "East Midlands (England)"

sentence_data_region = sentence_data_with_meta[
    sentence_data_with_meta["NUTs region"] == region
]
level_b_prop_region = sentence_data_region[
    "Hierarchy level B name"
].value_counts() / len(sentence_data_region)

sentence_data_rest = sentence_data_with_meta[
    sentence_data_with_meta["NUTs region"] != region
]
level_b_prop_rest = sentence_data_rest["Hierarchy level B name"].value_counts() / len(
    sentence_data_rest
)

skill_job_meta_filter = skill_job_meta[skill_job_meta["job year"] == "2019"]
level_a_prop_2019 = skill_job_meta_filter[
    "Hierarchy level A name"
].value_counts() / len(skill_job_meta_filter)

# Location quotient for the East Midlands vs the rest of the UK.
region_quotient = level_b_prop_region / level_b_prop_rest
region_quotient = region_quotient[pd.notnull(region_quotient)].sort_values(
    ascending=True
)

region_quotient

# %%
# NOTE(review): diff rendering with +/- markers stripped — the old level A
# year-quotient cells are interleaved with the new level C drill-down and
# London region-quotient cells below.
year_quotient = level_a_prop_2019 / level_a_prop_all
year_quotient = year_quotient[pd.notnull(year_quotient)].sort_values(ascending=True)
sentence_data[sentence_data["Hierarchy level B name"] == "driving-licence-vehicle"][
    "Hierarchy level C name"
].value_counts()

# %%
year_quotient.plot.barh(
    figsize=(8, 3),
    ylabel="Year quotient",
    xlabel="Level A hierarchy",
    title="Year 2019 quotient",

sentence_data[sentence_data["Hierarchy level B name"] == "stock-contractors-warehouse"][
    "Hierarchy level C name"
].value_counts()


# %%
# NOTE(review): other cells filter "NUTs region" with "Greater London";
# "London" here may not match any NUTs region value — verify.
region = "London"

sentence_data_region = sentence_data_with_meta[
    sentence_data_with_meta["NUTs region"] == region
]
level_b_prop_region = sentence_data_region[
    "Hierarchy level B name"
].value_counts() / len(sentence_data_region)

sentence_data_rest = sentence_data_with_meta[
    sentence_data_with_meta["NUTs region"] != region
]
level_b_prop_rest = sentence_data_rest["Hierarchy level B name"].value_counts() / len(
    sentence_data_rest
)


# Location quotient for London vs the rest of the UK.
region_quotient = level_b_prop_region / level_b_prop_rest
region_quotient = region_quotient[pd.notnull(region_quotient)].sort_values(
    ascending=True
)
plt.axvline(1)

region_quotient

# %%

0 comments on commit ac4c3d4

Please sign in to comment.