Commit
Cath analysis changes (#66)
* Add some analysis to look at how many sentences are predicted to be skill sentences from a sample

* Change alpha value for the skills-in-2D-space plot

* Use the percentage of skills from each level in the job title skills analysis outputs

* Some changes to the COVID analysis notebook: look at the date distribution, and only use skills which make up a certain percentage of the output

* Edit region application notebook plots to be colour coded rather than sized by radius (see the sketch below)
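
The last bullet (colour coding instead of sizing by radius) touches a notebook that is not shown in the diff below. A minimal sketch of that pattern with Bokeh, not taken from this commit, assuming a hypothetical "value" column to colour by:

from bokeh.models import ColumnDataSource
from bokeh.palettes import Viridis256
from bokeh.plotting import figure, show
from bokeh.transform import linear_cmap

# Hypothetical data: "value" is the quantity that would otherwise drive circle radius.
source = ColumnDataSource(dict(x=[1, 2, 3], y=[2, 1, 3], value=[0.1, 0.5, 0.9]))

p = figure(plot_width=500, plot_height=500)
p.circle(
    x="x",
    y="y",
    radius=0.05,  # fixed radius; the quantity of interest drives colour instead
    color=linear_cmap("value", palette=Viridis256, low=0, high=1),
    source=source,
)
show(p)
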
lizgzil authored Oct 1, 2021
1 parent ac4c3d4 commit a71366d
Showing 5 changed files with 447 additions and 76 deletions.
@@ -24,6 +24,9 @@
# 1. How many skill sentences now? 5,823,903
# 2. Distribution of length of them?

# %%
# cd ../../../..

# %%
import json

@@ -33,14 +36,138 @@
import pandas as pd

# %%
# cd ../../../..
from skills_taxonomy_v2.pipeline.sentence_classifier.predict_sentence_class import *

# %%
from skills_taxonomy_v2.pipeline.sentence_classifier.predict_sentence_class import (
    get_s3_data_paths,
    load_neccessary,
)

# %%
input_dir = "inputs/data/"
model_config_name = "2021.08.16"
data_dir = "textkernel-files/"

# %%
sent_classifier, _ = load_model(model_config_name)
nlp = spacy.load("en_core_web_sm")

# %%
root = os.path.join(input_dir, data_dir)

s3 = boto3.resource("s3")
bucket = s3.Bucket(BUCKET_NAME)
data_paths = get_s3_data_paths(bucket, root)

# %% [markdown]
# ### Load one file of data

# %%
data_path = data_paths[0]
data = load_s3_data(data_path, s3)

# %%
data_5000 = data[0:5000]

# %%
with Pool(4) as pool:  # 4 cpus
    partial_split_sentence = partial(split_sentence, nlp=nlp, min_length=30)
    split_sentence_pool_output = pool.map(partial_split_sentence, data_5000)

# %%
# Process output into one list of sentences for all documents
sentences = []
job_ids = []
for i, (job_id, s) in enumerate(split_sentence_pool_output):
    if s:
        sentences += s
        job_ids += [job_id] * len(s)

# %%
print(f"There were {len(sentences)} sentences in {len(data_5000)} job adverts")
print(f"This is about {len(sentences)/len(data_5000)} sentences in each job advert")

# %%
print(
    f"So in a sample of 1 mil job adverts we'd expect {(len(sentences)/len(data_5000))*1000000} sentences"
)
print("And since we found 4 million skill sentences in this sample")
print(
    f"it means {round(4000000*100/((len(sentences)/len(data_5000))*1000000),1)}% of sentences are skill sentences"
)

# %% [markdown]
# ### Predict skill sentences for a small number of job adverts

# %%
data_path = data_paths[0]
data = load_s3_data(data_path, s3)

# %%
# Only use a small sample
data = data[0:100]

# %%
with Pool(4) as pool:  # 4 cpus
    partial_split_sentence = partial(split_sentence, nlp=nlp, min_length=30)
    split_sentence_pool_output = pool.map(partial_split_sentence, data)

# %%
# Process output into one list of sentences for all documents
sentences = []
job_ids = []
for i, (job_id, s) in enumerate(split_sentence_pool_output):
    if s:
        sentences += s
        job_ids += [job_id] * len(s)

# %%
sentences_vec = sent_classifier.transform(sentences)
pool_sentences_vec = [(vec_ix, [vec]) for vec_ix, vec in enumerate(sentences_vec)]

# Manually chunk up the data to predict multiple in a pool
# This is because predict can't deal with massive vectors
pool_sentences_vecs = []
pool_sentences_vec = []
for vec_ix, vec in enumerate(sentences_vec):
    pool_sentences_vec.append((vec_ix, vec))
    if len(pool_sentences_vec) > 1000:
        pool_sentences_vecs.append(pool_sentences_vec)
        pool_sentences_vec = []
if len(pool_sentences_vec) != 0:
    # Add the final chunk if not empty
    pool_sentences_vecs.append(pool_sentences_vec)
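
# %%
# Sketch (not part of this commit): roughly the same chunking expressed as a small
# helper. `chunk_indexed_vectors` is a hypothetical name, assuming the predict step
# just needs lists of (index, vector) pairs of bounded size.
def chunk_indexed_vectors(vectors, chunk_size=1000):
    """Yield lists of (index, vector) pairs with at most chunk_size items each."""
    indexed = list(enumerate(vectors))
    for start in range(0, len(indexed), chunk_size):
        yield indexed[start : start + chunk_size]


# pool_sentences_vecs = list(chunk_indexed_vectors(sentences_vec))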


# %%
with Pool(4) as pool:  # 4 cpus
    partial_predict_sentences = partial(
        predict_sentences, sent_classifier=sent_classifier
    )
    predict_sentences_pool_output = pool.map(
        partial_predict_sentences, pool_sentences_vecs
    )

# %%
skill_sentences_dict = defaultdict(list)
for chunk_output in predict_sentences_pool_output:
    for (sent_ix, pred) in chunk_output:
        if pred == 1:
            job_id = job_ids[sent_ix]
            sentence = sentences[sent_ix]
            skill_sentences_dict[job_id].append(sentence)

# %%
num_skill_sent_job = [len(s) for s in skill_sentences_dict.values()]

# %%
print(f"From a sample of {len(data)} job adverts")
print(f"There were {len(sentences)} sentences found")
print(f"{len(skill_sentences_dict)} job adverts had skill sentences in")
print(f"There were {sum(num_skill_sent_job)} skill sentences found")
print(
f"Each job had a mean number of {round(np.mean(num_skill_sent_job),1)} skill sentences in"
)
print(f"{sum(num_skill_sent_job)*100/len(sentences)}% of sentences are skill sentences")


# %% [markdown]
# ## Older code

# %%
def load_s3_json_data(file_name, s3, bucket_name):
@@ -114,7 +114,8 @@
nlp = spacy.load("en_core_web_sm")

bert_vectorizer = BertVectorizer(
-    bert_model_name="sentence-transformers/paraphrase-MiniLM-L6-v2", multi_process=True,
+    bert_model_name="sentence-transformers/paraphrase-MiniLM-L6-v2",
+    multi_process=True,
)
bert_vectorizer.fit()

@@ -256,9 +257,15 @@
color_palette = viridis

ds_dict = dict(
-    x=reduced_x, y=reduced_y, texts=sentence_clusters["description"].tolist(),
+    x=reduced_x,
+    y=reduced_y,
+    texts=sentence_clusters["description"].tolist(),
)
-hover = HoverTool(tooltips=[("Sentence", "@texts"),])
+hover = HoverTool(
+    tooltips=[
+        ("Sentence", "@texts"),
+    ]
+)
source = ColumnDataSource(ds_dict)

p = figure(
@@ -269,7 +276,12 @@
toolbar_location="below",
)
p.circle(
x="x", y="y", radius=0.01, alpha=0.1, source=source, color="black",
x="x",
y="y",
radius=0.01,
alpha=0.1,
source=source,
color="black",
)
p.xaxis.visible = False
p.xgrid.visible = False
@@ -294,7 +306,12 @@
texts=sentence_clusters["description"].tolist(),
label=colors_by_labels,
)
-hover = HoverTool(tooltips=[("Sentence", "@texts"), ("Skill cluster", "@label"),])
+hover = HoverTool(
+    tooltips=[
+        ("Sentence", "@texts"),
+        ("Skill cluster", "@label"),
+    ]
+)
source = ColumnDataSource(ds_dict)
unique_colors = list(set(colors_by_labels))
num_unique_colors = len(unique_colors)
@@ -351,7 +368,12 @@
)
source2 = ColumnDataSource(ds_dict_2)

-hover = HoverTool(tooltips=[("Sentence", "@texts"), ("Clustered", "@label"),])
+hover = HoverTool(
+    tooltips=[
+        ("Sentence", "@texts"),
+        ("Clustered", "@label"),
+    ]
+)

p = figure(
plot_width=500,
@@ -361,10 +383,20 @@
toolbar_location="below",
)
p.circle(
x="x", y="y", radius=0.01, alpha=0.5, source=source1, color="grey",
x="x",
y="y",
radius=0.01,
alpha=0.5,
source=source1,
color="grey",
)
p.circle(
x="x", y="y", radius=0.01, alpha=0.5, source=source2, color="red",
x="x",
y="y",
radius=0.01,
alpha=0.5,
source=source2,
color="red",
)
p.xaxis.visible = False
p.xgrid.visible = False
@@ -391,7 +423,12 @@
texts=sentence_clusters_notnone["description"].tolist(),
label=colors_by_labels,
)
-hover = HoverTool(tooltips=[("Sentence", "@texts"), ("Skill cluster", "@label"),])
+hover = HoverTool(
+    tooltips=[
+        ("Sentence", "@texts"),
+        ("Skill cluster", "@label"),
+    ]
+)
source = ColumnDataSource(ds_dict)
unique_colors = list(set(colors_by_labels))
num_unique_colors = len(unique_colors)
@@ -519,7 +556,7 @@
toolbar_location="below",
)

p.circle(x="x", y="y", radius=0.04, alpha=0.15, source=source, color="grey")
p.circle(x="x", y="y", radius=0.04, alpha=0.08, source=source, color="grey")

skills_clusters_sample_n = [
13189,