Commit
Cath analysis changes (#66)
* Add some analysis to look at how many sentences are predicted to be skill sentences from a sample

* Change alpha value for the skills-in-2D-space plot

* Use the percentage of skills from each level in the job title skills analysis outputs

* Some changes to the COVID analysis notebook: look at the date distribution, and only use skills which make up a certain percentage of the output

* Edit region application notebook plots to be colour coded rather than sized by radius (see the sketch below)
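
The last bullet (colour coding instead of sizing by radius) touches a notebook that is not shown in the diff below. A minimal sketch of that pattern with Bokeh, not taken from this commit, assuming a hypothetical "value" column to colour by:

from bokeh.models import ColumnDataSource
from bokeh.palettes import Viridis256
from bokeh.plotting import figure, show
from bokeh.transform import linear_cmap

# Hypothetical data: "value" is the quantity that would otherwise drive circle radius.
source = ColumnDataSource(dict(x=[1, 2, 3], y=[2, 1, 3], value=[0.1, 0.5, 0.9]))

p = figure(plot_width=500, plot_height=500)
p.circle(
    x="x",
    y="y",
    radius=0.05,  # fixed radius; the quantity of interest drives colour instead
    color=linear_cmap("value", palette=Viridis256, low=0, high=1),
    source=source,
)
show(p)
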
lizgzil authored Oct 1, 2021
1 parent ac4c3d4 commit a71366d
Showing 5 changed files with 447 additions and 76 deletions.
@@ -24,6 +24,9 @@
# 1. How many skill sentences now? 5,823,903
# 2. Distribution of length of them?

# %%
# cd ../../../..

# %%
import json

@@ -33,14 +36,138 @@
import pandas as pd

# %%
# cd ../../../..
from skills_taxonomy_v2.pipeline.sentence_classifier.predict_sentence_class import *

# %%
from skills_taxonomy_v2.pipeline.sentence_classifier.predict_sentence_class import (
    get_s3_data_paths,
    load_neccessary,
)

# %%
input_dir = "inputs/data/"
model_config_name = "2021.08.16"
data_dir = "textkernel-files/"

# %%
sent_classifier, _ = load_model(model_config_name)
nlp = spacy.load("en_core_web_sm")

# %%
root = os.path.join(input_dir, data_dir)

s3 = boto3.resource("s3")
bucket = s3.Bucket(BUCKET_NAME)
data_paths = get_s3_data_paths(bucket, root)

# %% [markdown]
# ### Load one file of data

# %%
data_path = data_paths[0]
data = load_s3_data(data_path, s3)

# %%
data_5000 = data[0:5000]

# %%
with Pool(4) as pool:  # 4 cpus
    partial_split_sentence = partial(split_sentence, nlp=nlp, min_length=30)
    split_sentence_pool_output = pool.map(partial_split_sentence, data_5000)

# %%
# Process output into one list of sentences for all documents
sentences = []
job_ids = []
for i, (job_id, s) in enumerate(split_sentence_pool_output):
    if s:
        sentences += s
        job_ids += [job_id] * len(s)

# %%
print(f"There were {len(sentences)} sentences in {len(data_5000)} job adverts")
print(f"This is about {len(sentences)/len(data_5000)} sentences in each job advert")

# %%
print(
    f"So in a sample of 1 mil job adverts we'd expect {(len(sentences)/len(data_5000))*1000000} sentences"
)
print("And since we found 4 million skill sentences in this sample")
print(
    f"it means {round(4000000*100/((len(sentences)/len(data_5000))*1000000),1)}% of sentences are skill sentences"
)

# %% [markdown]
# ### Predict skill sentences for a small number of job adverts

# %%
data_path = data_paths[0]
data = load_s3_data(data_path, s3)

# %%
# Only use a small sample
data = data[0:100]

# %%
with Pool(4) as pool:  # 4 cpus
    partial_split_sentence = partial(split_sentence, nlp=nlp, min_length=30)
    split_sentence_pool_output = pool.map(partial_split_sentence, data)

# %%
# Process output into one list of sentences for all documents
sentences = []
job_ids = []
for i, (job_id, s) in enumerate(split_sentence_pool_output):
    if s:
        sentences += s
        job_ids += [job_id] * len(s)

# %%
sentences_vec = sent_classifier.transform(sentences)
pool_sentences_vec = [(vec_ix, [vec]) for vec_ix, vec in enumerate(sentences_vec)]

# Manually chunk up the data to predict multiple in a pool
# This is because predict can't deal with massive vectors
pool_sentences_vecs = []
pool_sentences_vec = []
for vec_ix, vec in enumerate(sentences_vec):
    pool_sentences_vec.append((vec_ix, vec))
    if len(pool_sentences_vec) > 1000:
        pool_sentences_vecs.append(pool_sentences_vec)
        pool_sentences_vec = []
if len(pool_sentences_vec) != 0:
    # Add the final chunk if not empty
    pool_sentences_vecs.append(pool_sentences_vec)
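
# %%
# Sketch (not part of this commit): roughly the same chunking expressed as a small
# helper. `chunk_indexed_vectors` is a hypothetical name, assuming the predict step
# just needs lists of (index, vector) pairs of bounded size.
def chunk_indexed_vectors(vectors, chunk_size=1000):
    """Yield lists of (index, vector) pairs with at most chunk_size items each."""
    indexed = list(enumerate(vectors))
    for start in range(0, len(indexed), chunk_size):
        yield indexed[start : start + chunk_size]


# pool_sentences_vecs = list(chunk_indexed_vectors(sentences_vec))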


# %%
with Pool(4) as pool:  # 4 cpus
    partial_predict_sentences = partial(
        predict_sentences, sent_classifier=sent_classifier
    )
    predict_sentences_pool_output = pool.map(
        partial_predict_sentences, pool_sentences_vecs
    )

# %%
skill_sentences_dict = defaultdict(list)
for chunk_output in predict_sentences_pool_output:
    for (sent_ix, pred) in chunk_output:
        if pred == 1:
            job_id = job_ids[sent_ix]
            sentence = sentences[sent_ix]
            skill_sentences_dict[job_id].append(sentence)

# %%
num_skill_sent_job = [len(s) for s in skill_sentences_dict.values()]

# %%
print(f"From a sample of {len(data)} job adverts")
print(f"There were {len(sentences)} sentences found")
print(f"{len(skill_sentences_dict)} job adverts had skill sentences in")
print(f"There were {sum(num_skill_sent_job)} skill sentences found")
print(
f"Each job had a mean number of {round(np.mean(num_skill_sent_job),1)} skill sentences in"
)
print(f"{sum(num_skill_sent_job)*100/len(sentences)}% of sentences are skill sentences")


# %% [markdown]
# ## Older code

# %%
def load_s3_json_data(file_name, s3, bucket_name):
@@ -114,7 +114,8 @@
nlp = spacy.load("en_core_web_sm")

bert_vectorizer = BertVectorizer(
-    bert_model_name="sentence-transformers/paraphrase-MiniLM-L6-v2", multi_process=True,
+    bert_model_name="sentence-transformers/paraphrase-MiniLM-L6-v2",
+    multi_process=True,
)
bert_vectorizer.fit()

@@ -256,9 +257,15 @@
color_palette = viridis

ds_dict = dict(
-    x=reduced_x, y=reduced_y, texts=sentence_clusters["description"].tolist(),
+    x=reduced_x,
+    y=reduced_y,
+    texts=sentence_clusters["description"].tolist(),
)
-hover = HoverTool(tooltips=[("Sentence", "@texts"),])
+hover = HoverTool(
+    tooltips=[
+        ("Sentence", "@texts"),
+    ]
+)
source = ColumnDataSource(ds_dict)

p = figure(
@@ -269,7 +276,12 @@
toolbar_location="below",
)
p.circle(
x="x", y="y", radius=0.01, alpha=0.1, source=source, color="black",
x="x",
y="y",
radius=0.01,
alpha=0.1,
source=source,
color="black",
)
p.xaxis.visible = False
p.xgrid.visible = False
@@ -294,7 +306,12 @@
texts=sentence_clusters["description"].tolist(),
label=colors_by_labels,
)
-hover = HoverTool(tooltips=[("Sentence", "@texts"), ("Skill cluster", "@label"),])
+hover = HoverTool(
+    tooltips=[
+        ("Sentence", "@texts"),
+        ("Skill cluster", "@label"),
+    ]
+)
source = ColumnDataSource(ds_dict)
unique_colors = list(set(colors_by_labels))
num_unique_colors = len(unique_colors)
@@ -351,7 +368,12 @@
)
source2 = ColumnDataSource(ds_dict_2)

-hover = HoverTool(tooltips=[("Sentence", "@texts"), ("Clustered", "@label"),])
+hover = HoverTool(
+    tooltips=[
+        ("Sentence", "@texts"),
+        ("Clustered", "@label"),
+    ]
+)

p = figure(
plot_width=500,
@@ -361,10 +383,20 @@
toolbar_location="below",
)
p.circle(
x="x", y="y", radius=0.01, alpha=0.5, source=source1, color="grey",
x="x",
y="y",
radius=0.01,
alpha=0.5,
source=source1,
color="grey",
)
p.circle(
x="x", y="y", radius=0.01, alpha=0.5, source=source2, color="red",
x="x",
y="y",
radius=0.01,
alpha=0.5,
source=source2,
color="red",
)
p.xaxis.visible = False
p.xgrid.visible = False
@@ -391,7 +423,12 @@
texts=sentence_clusters_notnone["description"].tolist(),
label=colors_by_labels,
)
-hover = HoverTool(tooltips=[("Sentence", "@texts"), ("Skill cluster", "@label"),])
+hover = HoverTool(
+    tooltips=[
+        ("Sentence", "@texts"),
+        ("Skill cluster", "@label"),
+    ]
+)
source = ColumnDataSource(ds_dict)
unique_colors = list(set(colors_by_labels))
num_unique_colors = len(unique_colors)
@@ -519,7 +556,7 @@
toolbar_location="below",
)

p.circle(x="x", y="y", radius=0.04, alpha=0.15, source=source, color="grey")
p.circle(x="x", y="y", radius=0.04, alpha=0.08, source=source, color="grey")

skills_clusters_sample_n = [
13189,