Skip to content

Commit

Permalink
Interactive skills hierarchy plot notebook (#63)
Browse files Browse the repository at this point in the history
* Add interactive skills hierarchy notebook and json files for renaming some skill groups in utils

* Add plot for number of job adverts and unique number of skills

* Look at other regions in the region to rest-of-UK level B comparisons

* Add proportions to transversal skill output and output level A covid proportion changes in table too

* black plot interactive hierarchy

* apply black to both transversal and geography application notebooks
  • Loading branch information
lizgzil authored Sep 29, 2021
1 parent e7ee1c4 commit ac4c3d4
Show file tree
Hide file tree
Showing 4 changed files with 233 additions and 51 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -95,4 +95,61 @@
)


# %% [markdown]
# ## Number of job adverts and unique number of skills

# %%
file_name_date = "2021.08.31"
sentence_data = load_s3_data(
    s3,
    BUCKET_NAME,
    f"outputs/skills_extraction/extracted_skills/{file_name_date}_sentences_data.json",
)


# %%
sentence_data = pd.DataFrame(sentence_data)
# Cluster number == -1 marks sentences not assigned to any skill cluster; drop them.
sentence_data = sentence_data[sentence_data["Cluster number"] != -1]

# %%
len(sentence_data)

# %%
# unique_skills[k] = number of unique skills seen in the first k sentences.
# The original loop recomputed nunique() over the prefix for every k, which is
# O(n^2) over the whole dataset; a cumulative sum of first occurrences is O(n)
# and yields identical values (nunique of rows [0, k) for each k).
is_first_occurrence = ~sentence_data["Cluster number"].duplicated()
cumulative_unique = is_first_occurrence.cumsum()
unique_skills = {
    k: (int(cumulative_unique.iloc[k - 1]) if k > 0 else 0)
    for k in range(len(sentence_data))
}

# %%
x = list(unique_skills.keys())
y = list(unique_skills.values())

nesta_orange = [255 / 255, 90 / 255, 0 / 255]  # also used by later figure cells
plt.plot(x, y, color="black")
plt.xlabel("Number of sentences")
plt.ylabel("Number of unique skills")
plt.savefig(
    "outputs/skills_extraction/figures/num_sent_num_skills.pdf", bbox_inches="tight"
)


# %% [markdown]
# ## Together

# %%
# Side-by-side comparison: vocabulary growth vs unique-skill growth as the
# number of processed sentences increases.
x_vocab = [pair[0] for pair in num_sentences_and_vocab_size]
y_vocab = [pair[1] for pair in num_sentences_and_vocab_size]

x_skills = list(unique_skills.keys())
y_skills = list(unique_skills.values())

# %%
fig, axs = plt.subplots(1, 2, figsize=(10, 3))

axs[0].plot(x_vocab, y_vocab, color="black")
# 322071: presumably the sentence count used for the final vocab — TODO confirm
axs[0].axvline(322071, color=nesta_orange, ls="--")
axs[0].set_xlabel("Number of sentences")
axs[0].set_ylabel("Number of unique words in vocab")

axs[1].plot(x_skills, y_skills, color="black")
axs[1].set_xlabel("Number of sentences")
axs[1].set_ylabel("Number of unique skills")

plt.tight_layout()
plt.savefig(
    "outputs/skills_extraction/figures/num_sent_num_skills_vocab_size.pdf",
    bbox_inches="tight",
)


# %%
Original file line number Diff line number Diff line change
Expand Up @@ -723,9 +723,20 @@ def print_untrans_skills(skill_group_scores, level_skill_group, cent_max, clust_
}

# %%
# Attach hierarchy info to the transversal level C skills table and compute,
# for each skill group, the share of all job adverts that mention it.
num_all_job_ads = sentence_data["job id"].nunique()
trans_skills_levc = pd.concat(
    [trans_skills_levc, pd.DataFrame(trans_skills_hier).T], axis=1
)


def _percent_adverts_with_skill(level_c):
    """Percentage (2 d.p.) of all job adverts containing this level C group."""
    n_ads = sentence_data[sentence_data["Hierarchy level C"] == int(level_c)][
        "job id"
    ].nunique()
    return round(n_ads * 100 / num_all_job_ads, 2)


trans_skills_levc["Percentage of job adverts with this skill"] = trans_skills_levc[
    "Level C"
].apply(_percent_adverts_with_skill)
trans_skills_levc.to_csv("outputs/skills_taxonomy/transversal/lev_c_trans_skills.csv")
trans_skills_levc

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,31 @@
)


# %%
# Compare the level A skill-group mix of pre- vs post-COVID job adverts and
# save the proportions (plus their ratio) to CSV.
sentence_data_postcovid = sentence_data_with_meta[
    sentence_data_with_meta["covid"] == "Post-COVID"
]
level_a_prop_post_covid = sentence_data_postcovid[
    "Hierarchy level A name"
].value_counts() / len(sentence_data_postcovid)

sentence_data_precovid = sentence_data_with_meta[
    sentence_data_with_meta["covid"] == "Pre-COVID"
]
level_a_prop_pre_covid = sentence_data_precovid[
    "Hierarchy level A name"
].value_counts() / len(sentence_data_precovid)

pre_col = "Proportion of level A skill group in pre-covid job adverts only"
post_col = "Proportion of level A skill group in post-covid job adverts only"
df = pd.concat(
    [
        pd.DataFrame(level_a_prop_pre_covid).rename(
            columns={"Hierarchy level A name": pre_col}
        ),
        pd.DataFrame(level_a_prop_post_covid).rename(
            columns={"Hierarchy level A name": post_col}
        ),
    ],
    axis=1,
)
# NOTE(review): despite the name, this column holds a ratio (post/pre), not a difference.
df["Increase from before to after COVID"] = df[post_col] / df[pre_col]
df.round(3).to_csv(
    "outputs/skills_taxonomy_application/covid_application/covid_prepost_leva.csv"
)



# %%
# NOTE(review): `prop_level_a_covid` is not defined in this visible chunk —
# presumably created in an earlier cell; verify before re-running.
# For each (level_1, covid) group, take the first "Hierarchy level A name".
prop_level_a_covid.reset_index().groupby(['level_1','covid']).apply(
    lambda x: x['Hierarchy level A name'].iloc[0]
)

# %% [markdown]
# ## pre vs post covid quotients

Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# -*- coding: utf-8 -*-
# ---
# jupyter:
# jupytext:
Expand Down Expand Up @@ -464,9 +465,12 @@
sum(sentence_data_with_meta["subregion"].notna())

# %%
# NOTE(review): this span is a GitHub diff rendering with the +/- markers
# stripped, so removed (old) lines and added (new) lines are interleaved
# and the span is not runnable as-is.
# Old behaviour: London compared against ALL adverts (level_b_prop_all).
# New behaviour: London compared against the rest of the UK (level_b_prop_rest).
level_b_prop_all = sentence_data_with_meta[
    "Hierarchy level B name"
].value_counts() / len(sentence_data_with_meta)
sentence_data_rest = sentence_data_with_meta[
    sentence_data_with_meta["NUTs region"] != "Greater London"
]
level_b_prop_rest = sentence_data_rest["Hierarchy level B name"].value_counts() / len(
    sentence_data_rest
)

# NOTE(review): the London subset is selected on "subregion" while the rest is
# selected on "NUTs region" — presumably equivalent for Greater London; verify
# against the metadata schema.
sentence_data_with_meta_filter = sentence_data_with_meta[
    sentence_data_with_meta["subregion"] == "Greater London"
Expand All @@ -475,7 +479,8 @@
    "Hierarchy level B name"
].value_counts() / len(sentence_data_with_meta_filter)

# Location quotient: London's level B share divided by the comparison share.
london_quotient = level_b_prop_london / level_b_prop_all
london_quotient = level_b_prop_london / level_b_prop_rest

london_quotient = london_quotient[pd.notnull(london_quotient)].sort_values(
    ascending=True
)
Expand All @@ -494,11 +499,12 @@
    bbox_inches="tight",
)


# %%
# NOTE(review): diff rendering with +/- markers stripped — old (vs-all) and
# new (vs-rest-of-UK) level A comparison lines are interleaved below.
level_a_prop_all = sentence_data_with_meta[
    "Hierarchy level A name"
].value_counts() / len(sentence_data_with_meta)
sentence_data_rest = sentence_data_with_meta[
    sentence_data_with_meta["NUTs region"] != "Greater London"
]
level_a_prop_rest = sentence_data_rest["Hierarchy level A name"].value_counts() / len(
    sentence_data_rest
)

sentence_data_with_meta_filter = sentence_data_with_meta[
    sentence_data_with_meta["subregion"] == "Greater London"
Expand All @@ -507,7 +513,7 @@
    "Hierarchy level A name"
].value_counts() / len(sentence_data_with_meta_filter)

# Location quotient for level A skill groups (old divisor vs new divisor).
london_quotient = level_a_prop_london / level_a_prop_all
london_quotient = level_a_prop_london / level_a_prop_rest
london_quotient = london_quotient[pd.notnull(london_quotient)].sort_values(
    ascending=True
)
Expand All @@ -526,69 +532,152 @@
    bbox_inches="tight",
)


# %% [markdown]
# ## Load other metadata
# ## Other outliers

# %%
# NOTE(review): diff rendering with +/- markers stripped — the old
# "Load other metadata" cell (job_id_data_dict, date-year Counter) is
# interleaved with the new "Other outliers" region-quotient cells below.
# Really big!
job_id_data_dict = load_s3_data(
    s3, bucket_name, "outputs/tk_data_analysis/metadata_date_dict.json"
# The North East has a much higher demand for “Teaching and care”.

region = "North East (England)"

# Level B skill-group shares within the region...
sentence_data_region = sentence_data_with_meta[
    sentence_data_with_meta["NUTs region"] == region
]
level_b_prop_region = sentence_data_region[
    "Hierarchy level B name"
].value_counts() / len(sentence_data_region)

# ...versus shares in the rest of the UK.
sentence_data_rest = sentence_data_with_meta[
    sentence_data_with_meta["NUTs region"] != region
]
level_b_prop_rest = sentence_data_rest["Hierarchy level B name"].value_counts() / len(
    sentence_data_rest
)
len(job_id_data_dict)

# Location quotient: >1 means the skill group is over-represented in the region.
region_quotient = level_b_prop_region / level_b_prop_rest
region_quotient = region_quotient[pd.notnull(region_quotient)].sort_values(
    ascending=True
)

region_quotient

# %%
date_years = [v[0][0:4] for k, v in job_id_data_dict.items()]
Counter(date_years)
# Drill down into the level C groups inside the clinical/nursing level B group.
sentence_data[sentence_data["Hierarchy level B name"] == "clinical-patients-nursing"][
    "Hierarchy level C name"
].value_counts()

# %% [markdown]
# ## Level B in 2019 compared to others

# %%
# NOTE(review): diff rendering with +/- markers stripped — the old
# 2019-vs-all "year quotient" cells are interleaved with the new Wales
# region-quotient cells below.
level_b_prop_all = skill_job_meta["Hierarchy level B name"].value_counts() / len(
    skill_job_meta
)
# Wales has a particular low demand for “Customer service and marketing” skills.
region = "Wales"

skill_job_meta_filter = skill_job_meta[skill_job_meta["job year"] == "2019"]
level_b_prop_2019 = skill_job_meta_filter[
sentence_data_region = sentence_data_with_meta[
    sentence_data_with_meta["NUTs region"] == region
]
level_b_prop_region = sentence_data_region[
    "Hierarchy level B name"
].value_counts() / len(skill_job_meta_filter)
].value_counts() / len(sentence_data_region)

# %%
year_quotient = level_b_prop_2019 / level_b_prop_all
year_quotient = year_quotient[pd.notnull(year_quotient)].sort_values(ascending=True)
sentence_data_rest = sentence_data_with_meta[
    sentence_data_with_meta["NUTs region"] != region
]
level_b_prop_rest = sentence_data_rest["Hierarchy level B name"].value_counts() / len(
    sentence_data_rest
)

# Location quotient for Wales vs the rest of the UK.
region_quotient = level_b_prop_region / level_b_prop_rest
region_quotient = region_quotient[pd.notnull(region_quotient)].sort_values(
    ascending=True
)

region_quotient

# %%
# NOTE(review): diff rendering with +/- markers stripped — the old 2019
# bar-chart cell is interleaved with the new Northern Ireland
# region-quotient cell below.
year_quotient.plot.barh(
    figsize=(8, 10),
    ylabel="Year quotient",
    xlabel="Level B hierarchy",
    title="Year 2019 quotient",

region = "Northern Ireland"

sentence_data_region = sentence_data_with_meta[
    sentence_data_with_meta["NUTs region"] == region
]
level_b_prop_region = sentence_data_region[
    "Hierarchy level B name"
].value_counts() / len(sentence_data_region)

sentence_data_rest = sentence_data_with_meta[
    sentence_data_with_meta["NUTs region"] != region
]
level_b_prop_rest = sentence_data_rest["Hierarchy level B name"].value_counts() / len(
    sentence_data_rest
)
plt.axvline(1)

# %% [markdown]
# ## Level A in 2019 compared to others

# Location quotient for Northern Ireland vs the rest of the UK.
region_quotient = level_b_prop_region / level_b_prop_rest
region_quotient = region_quotient[pd.notnull(region_quotient)].sort_values(
    ascending=True
)

region_quotient

# %%
# NOTE(review): diff rendering with +/- markers stripped — the old level A
# 2019 cells are interleaved with the new East Midlands region-quotient
# cell below.
level_a_prop_all = skill_job_meta["Hierarchy level A name"].value_counts() / len(
    skill_job_meta
region = "East Midlands (England)"

sentence_data_region = sentence_data_with_meta[
    sentence_data_with_meta["NUTs region"] == region
]
level_b_prop_region = sentence_data_region[
    "Hierarchy level B name"
].value_counts() / len(sentence_data_region)

sentence_data_rest = sentence_data_with_meta[
    sentence_data_with_meta["NUTs region"] != region
]
level_b_prop_rest = sentence_data_rest["Hierarchy level B name"].value_counts() / len(
    sentence_data_rest
)

skill_job_meta_filter = skill_job_meta[skill_job_meta["job year"] == "2019"]
level_a_prop_2019 = skill_job_meta_filter[
    "Hierarchy level A name"
].value_counts() / len(skill_job_meta_filter)

# Location quotient for the East Midlands vs the rest of the UK.
region_quotient = level_b_prop_region / level_b_prop_rest
region_quotient = region_quotient[pd.notnull(region_quotient)].sort_values(
    ascending=True
)

region_quotient

# %%
# NOTE(review): diff rendering with +/- markers stripped — the old level A
# year-quotient cells are interleaved with the new level C drill-down and
# London region-quotient cells below.
year_quotient = level_a_prop_2019 / level_a_prop_all
year_quotient = year_quotient[pd.notnull(year_quotient)].sort_values(ascending=True)
sentence_data[sentence_data["Hierarchy level B name"] == "driving-licence-vehicle"][
    "Hierarchy level C name"
].value_counts()

# %%
year_quotient.plot.barh(
    figsize=(8, 3),
    ylabel="Year quotient",
    xlabel="Level A hierarchy",
    title="Year 2019 quotient",

sentence_data[sentence_data["Hierarchy level B name"] == "stock-contractors-warehouse"][
    "Hierarchy level C name"
].value_counts()


# %%
# NOTE(review): other cells filter "NUTs region" with "Greater London";
# "London" here may not match any NUTs region value — verify.
region = "London"

sentence_data_region = sentence_data_with_meta[
    sentence_data_with_meta["NUTs region"] == region
]
level_b_prop_region = sentence_data_region[
    "Hierarchy level B name"
].value_counts() / len(sentence_data_region)

sentence_data_rest = sentence_data_with_meta[
    sentence_data_with_meta["NUTs region"] != region
]
level_b_prop_rest = sentence_data_rest["Hierarchy level B name"].value_counts() / len(
    sentence_data_rest
)


# Location quotient for London vs the rest of the UK.
region_quotient = level_b_prop_region / level_b_prop_rest
region_quotient = region_quotient[pd.notnull(region_quotient)].sort_values(
    ascending=True
)
plt.axvline(1)

region_quotient

# %%

0 comments on commit ac4c3d4

Please sign in to comment.