Re-run some analysis, and investigate and fix tk sample (#78)

* Update filter tk data for new sample * Use full job adverts sample in filter bulk data * use set * New method for filter metadata sample * Append to list not add to dict * Correct comma * New method to get bulk metadata without having problems with duplicate job ids * Add dependence * Save out remainder data * Add script to find the duplicated skill sentences * fix save issue * Add new figures and notebook analysis for new taxonomy * Add extra info about sample size to readmes * Use manual names for level A names in build taxonomy * Add script to find all the tk data with no text field * Get no text and full text counts * Add length diagnostic * Add script to get sentence skill preds for new job adverts and append to results * Improve predict replacement * Use chunks of job adverts for predicting extra skill sentences * Add init to sentence classifier folder, needed for metaflow * Add more diagnostic data to get_no_texts_tk_data.py * Add extra data to the tk sample to replace sample found from expired files * Add a print of the figure for the multiskill analysis
nestauk · Jan 10, 2022 · ce0f994 · ce0f994
1 parent 3aeee4b
commit ce0f994
Show file tree

Hide file tree

Showing 17 changed files with 2,305 additions and 659 deletions.
diff --git a/outputs/reports/figures/extract_skill_methodology_overview.jpg b/outputs/reports/figures/extract_skill_methodology_overview.jpg
diff --git a/outputs/reports/figures/hierarchy_numbers.jpg b/outputs/reports/figures/hierarchy_numbers.jpg
diff --git a/outputs/reports/figures/hierarchy_overview.jpg b/outputs/reports/figures/hierarchy_overview.jpg
diff --git a/skills_taxonomy_v2/analysis/skills_extraction/notebooks/Multi-skill sentences.py b/skills_taxonomy_v2/analysis/skills_extraction/notebooks/Multi-skill sentences.py
@@ -53,6 +53,9 @@
 # %%
 pd.notnull(tagged_sentences_data["Well split and formatted into single sentence?"][27])
 
+# %%
+tagged_sentences_data[tagged_sentences_data["Well split and formatted into single sentence?"]==False]["original sentence"].tolist()
+
 
 # %%
 def get_binary():
@@ -66,7 +69,7 @@ def get_binary():
 tagged_sentences_data.head(3)
 
 # %%
-tagged_sentences_data[["Well split and formatted into single sentence?", "One skill mentioned? (or at least very similar skills)"]].value_counts()
+tagged_sentences_data[["Well split and formatted into single sentence?", "One skill mentioned? (or at least very similar skills)"]].value_counts(dropna=False)
 
 # %%
 tagged_sentences_data.plot.scatter(x="length original", y = 'Well split binary')
@@ -98,7 +101,7 @@ def get_binary():
         one_skill_accuracy.append(filt["One skill binary"].sum()/filt["One skill binary"].notnull().sum())
         well_split_accuracy.append(filt["Well split binary"].sum()/filt["Well split binary"].notnull().sum())
         num_data.append(len(filt))
-    
+
 
 # %%
 fig, ax1 = plt.subplots(figsize=(8,4))
@@ -113,10 +116,12 @@ def get_binary():
 
 fig.tight_layout()  # otherwise the right y-label is slightly clipped
 
-
+ax1.set_xlabel("Sentence length threshold")
 plt.axvline(x=250, c="red")
+plt.savefig("multiskill_sents_analysis.png")
 plt.show()
 
+
 # %%
 tagged_sentences_data.boxplot(by=[
     "One skill mentioned? (or at least very similar skills)",