Rerunning skills extraction (#75)
* Add a new config for the new skills extraction

* Use better transformer model and make prints nicer

* Add script to get an embedding sample

* Use min in sample

* Filter by sentence length for sample too

* Split up outputs

* Add script to experiment with umap parameters

* Just change n_neighbours

* Add script for reducing embeddings

* Fix indents

* Save out intermediate results

* Output 300k sample embeds

* Put script to get embeddings sample in analysis folder

* Add notebook to investigate embedding sample size for reducer

* Add analysis of multiskill sentences

* Delete script to look at reduction params

* Add notebook to find good clustering parameters

* Analysis cluster dist threshold notebook

* Add new method for extracting skills from reduced embeddings

* Neaten up reduce_embeddings.py

* Add notebooks to analyse merging cluster threshold and extracted skills

* Update README with latest skills extraction info
lizgzil authored Nov 29, 2021
1 parent 078e135 commit 0b8f9ae
Showing 11 changed files with 2,180 additions and 8 deletions.
@@ -0,0 +1,93 @@
"""
Import the embeddings.
Find a good sample number to train the reducer class on.
Find a good number of dimensions to reduce the embeddings to.
"""

import random

from tqdm import tqdm
import boto3

from skills_taxonomy_v2.getters.s3_data import get_s3_data_paths, save_to_s3, load_s3_data
from skills_taxonomy_v2 import BUCKET_NAME

sentence_embeddings_dir = 'outputs/skills_extraction/word_embeddings/data/2021.11.05'

s3 = boto3.resource("s3")

sentence_embeddings_dirs = get_s3_data_paths(s3, BUCKET_NAME, sentence_embeddings_dir, file_types=["*.json"])

# We want a sample of ~1 million embeddings (far more than we will actually need),
# so take a random 2000 from each file.

# Only keep an embedding if its original sentence is under 250 characters
# and the sentence hasn't been seen before (no repeats).

original_sentences = {}
for embedding_dir in sentence_embeddings_dirs:
    if "original_sentences.json" in embedding_dir:
        original_sentences.update(load_s3_data(s3, BUCKET_NAME, embedding_dir))

n_each_file = 2000
sent_thresh = 250

n_all_each_file = {}
n_in_sample_each_file = {}
unique_sentences = set()
embeddings_sample = []


for embedding_dir in tqdm(sentence_embeddings_dirs):
    if "embeddings.json" in embedding_dir:
        sentence_embeddings = load_s3_data(s3, BUCKET_NAME, embedding_dir)
        n_all_each_file[embedding_dir] = len(sentence_embeddings)
        # Reseed per file so the sample is reproducible
        random.seed(42)
        sentence_embeddings_sample = random.sample(
            sentence_embeddings, min(len(sentence_embeddings), n_each_file)
        )
        count = 0
        for _, sent_id, words, embedding in sentence_embeddings_sample:
            if words not in unique_sentences:
                original_sentence = original_sentences[str(sent_id)]
                if len(original_sentence) < sent_thresh:
                    unique_sentences.add(words)
                    embeddings_sample.append(embedding)
                    count += 1
        n_in_sample_each_file[embedding_dir] = count

save_to_s3(
    s3, BUCKET_NAME, n_in_sample_each_file,
    "outputs/skills_extraction/word_embeddings/data/2021.11.05_n_in_sample_each_file.json",
)
save_to_s3(
    s3, BUCKET_NAME, n_all_each_file,
    "outputs/skills_extraction/word_embeddings/data/2021.11.05_n_all_each_file.json",
)

save_to_s3(
    s3, BUCKET_NAME, embeddings_sample[0:250000],
    "outputs/skills_extraction/word_embeddings/data/2021.11.05_sample_0.json",
)
save_to_s3(
    s3, BUCKET_NAME, embeddings_sample[250000:500000],
    "outputs/skills_extraction/word_embeddings/data/2021.11.05_sample_1.json",
)
save_to_s3(
    s3, BUCKET_NAME, embeddings_sample[500000:750000],
    "outputs/skills_extraction/word_embeddings/data/2021.11.05_sample_2.json",
)
save_to_s3(
    s3, BUCKET_NAME, embeddings_sample[750000:],
    "outputs/skills_extraction/word_embeddings/data/2021.11.05_sample_3.json",
)

# The order is random anyway so no need to resample
save_to_s3(
    s3,
    BUCKET_NAME,
    embeddings_sample[0:300000],
    "outputs/skills_extraction/word_embeddings/data/2021.11.05_sample_300k.json",
)
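
The docstring above also mentions choosing the number of dimensions to reduce the embeddings to, which this script stops short of. A minimal sketch of that likely next step, assuming the saved 300k sample is loaded back and fed to UMAP (the parameter values here are illustrative assumptions, not settings from this commit):

# Minimal sketch: fit a UMAP reducer on the saved 300k embedding sample.
# n_neighbors, min_dist and n_components are assumed values for illustration only.
import boto3
import umap.umap_ as umap

from skills_taxonomy_v2.getters.s3_data import load_s3_data
from skills_taxonomy_v2 import BUCKET_NAME

s3 = boto3.resource("s3")
embeddings_sample = load_s3_data(
    s3,
    BUCKET_NAME,
    "outputs/skills_extraction/word_embeddings/data/2021.11.05_sample_300k.json",
)

reducer_class = umap.UMAP(n_neighbors=10, min_dist=0.0, n_components=2, random_state=42)
reduced_points = reducer_class.fit_transform(embeddings_sample)
print(reduced_points.shape)  # (n_samples, n_components)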
@@ -0,0 +1,236 @@
# ---
# jupyter:
# jupytext:
# cell_metadata_filter: -all
# comment_magics: true
# text_representation:
# extension: .py
# format_name: percent
# format_version: '1.3'
# jupytext_version: 1.11.4
# kernelspec:
# display_name: Python 3 (ipykernel)
# language: python
# name: python3
# ---

# %% [markdown]
# ## Analysing labelled data to find a good distance threshold for merging clusters
#
# The data that was labelled was created by clustering with `dbscan_eps = 0.01` and `dbscan_min_samples = 4`. When fit to 300,000 random sentences this produces:
# - 11,551 clusters
# - 0 clusters larger than 10,000 sentences
# - 8,892 clusters with <10 sentences
# - 117,923 sentences not clustered
# - An average cluster size of 16 sentences
#
# Here we find that a threshold of about `0.05` gives a good prediction of whether two clusters should be merged, based on 108 labelled data points.
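
# %% [markdown]
# As a rough sketch of what this threshold does (illustrative, not code from this
# commit): cluster pairs whose centroids sit closer than the threshold in the
# reduced embedding space are merged. The centroids below are made-up values.

# %%
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

# Hypothetical 2D centroids for four clusters
centroids = np.array([[0.10, 0.20], [0.13, 0.21], [0.90, 0.80], [0.50, 0.50]])
merge_thresh = 0.05  # the threshold this notebook settles on

dists = euclidean_distances(centroids)
# Keep pairs (i, j), i < j, whose centroids are within the threshold
merge_pairs = [
    (i, j)
    for i in range(len(centroids))
    for j in range(i + 1, len(centroids))
    if dists[i, j] < merge_thresh
]
print(merge_pairs)  # [(0, 1)] with these made-up centroids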

# %%
import yaml
import random
from tqdm import tqdm
import json
from collections import Counter

import pandas as pd
import numpy as np
import boto3
from sklearn import metrics
import matplotlib.pyplot as plt
import umap.umap_ as umap
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import DBSCAN
from sklearn.metrics.pairwise import euclidean_distances

from skills_taxonomy_v2.getters.s3_data import get_s3_data_paths, save_to_s3, load_s3_data
from skills_taxonomy_v2.pipeline.skills_extraction.extract_skills_utils import (
    load_sentences_embeddings,
    ExtractSkills,
)
from skills_taxonomy_v2 import BUCKET_NAME

# %%
test_merge_thresh_labelled = pd.read_csv("test_merge_thresh_manual_labels_IMAN2.csv")

# %%
test_merge_thresh_labelled = test_merge_thresh_labelled[test_merge_thresh_labelled["Should merge?"].notnull()]
test_merge_thresh_labelled["Should merge?"] = test_merge_thresh_labelled["Should merge?"].str.rstrip()
test_merge_thresh_labelled["Small cluster is a skill?"] = test_merge_thresh_labelled["Small cluster is a skill?"].str.rstrip()

test_merge_thresh_labelled["Centroid distance log"] = test_merge_thresh_labelled["Centroid distance"].apply(lambda x: np.log10(x))
test_merge_thresh_labelled["Average small cluster sentence length"] = test_merge_thresh_labelled["Small cluster sentences"].apply(lambda x: np.mean([len(s) for s in x.split(",")]))

len(test_merge_thresh_labelled)


# %%
test_merge_thresh_labelled["Should merge?"].value_counts()

# %%
test_merge_thresh_labelled.boxplot(column="Centroid distance", by="Should merge?")

# %%
distances = test_merge_thresh_labelled["Centroid distance"].tolist()
truth = test_merge_thresh_labelled["Should merge?"].tolist()
# If you want to convert the maybes to true:
truth = ["TRUE" if t=="MAYBE" else t for t in truth]
# # If you want to convert the maybes to false:
# truth = ["FALSE" if t=="MAYBE" else t for t in truth]

dist_thresh_dict = {}
for dist_thresh in list(np.arange(0.01, 0.5, step=0.001)):
    prediction = ["TRUE" if d < dist_thresh else "FALSE" for d in distances]
    num_greater_than_thresh = len([d for d in distances if d >= dist_thresh])

    dist_thresh_dict[dist_thresh] = {
        "prop_correct": sum([a == b for a, b in zip(prediction, truth)]) / len(truth),
        "prop_correct_truth_true": sum([((a == b) and (b == "TRUE")) for a, b in zip(prediction, truth)]) / len(truth),
        "prop_correct_truth_false": sum([((a == b) and (b == "FALSE")) for a, b in zip(prediction, truth)]) / len(truth),
        "prop_incorrect_truth_true": sum([((a != b) and (b == "TRUE")) for a, b in zip(prediction, truth)]) / len(truth),
        "prop_incorrect_truth_false": sum([((a != b) and (b == "FALSE")) for a, b in zip(prediction, truth)]) / len(truth),
        "num_data": num_greater_than_thresh,
    }

# %%
# np.arange accumulates floating point error, so look up the key closest to 0.05
best_thresh = min(dist_thresh_dict, key=lambda k: abs(k - 0.05))
best_stats = dist_thresh_dict[best_thresh]
print(f"The predictions are correct {round(best_stats['prop_correct'],2)} of the time")
print(f"Correct merges (truth is merge, we merge): {round(best_stats['prop_correct_truth_true'],2)} of all data points")
print(f"Correct non-merges (truth is don't merge, we don't): {round(best_stats['prop_correct_truth_false'],2)} of all data points")
print(f"Missed merges (truth is merge, we don't): {round(best_stats['prop_incorrect_truth_true'],2)} of all data points")
print(f"Wrong merges (truth is don't merge, we do): {round(best_stats['prop_incorrect_truth_false'],2)} of all data points")


# %%
fig,ax = plt.subplots()
x = [k for k,v in dist_thresh_dict.items()]
y = [v["prop_correct"] for k,v in dist_thresh_dict.items()]
yt = [v["prop_correct_truth_true"] for k,v in dist_thresh_dict.items()]
yf = [v["prop_correct_truth_false"] for k,v in dist_thresh_dict.items()]
y2 = [v["num_data"] for k,v in dist_thresh_dict.items()]

ax.plot(x, y, color="black", label="Accuracy")
ax.plot(x, yt, color="red", label="TP rate")
ax.plot(x, yf, color="purple", label="TN rate")
ax.set_ylabel("Proportion of data points\ncorrectly predicted")
ax.set_xlabel("Centroid distance threshold")
plt.title("Finding a threshold distance to merge clusters\n(if distance between 2 clusters is < this then merge)")
plt.legend()

ax2=ax.twinx()
ax2.plot(x, y2, color="blue")
ax2.set_ylabel("Number of data points with\ndistance greater than this", color="blue")

plt.axvline(0.05, color="orange");
plt.savefig("../figures/nov_2021/finding_good_merge_params2.pdf")
plt.savefig("../figures/nov_2021/finding_good_merge_params2.png")


# %%
### Where did things go wrong?
dist_thresh = 0.05
predicted = []
wrong_rows = pd.DataFrame()
for i, row in test_merge_thresh_labelled.iterrows():
    pred_label = "FALSE"
    if row["Centroid distance"] < dist_thresh:
        pred_label = "TRUE"
    predicted.append(pred_label)
    true_label = "TRUE" if row["Should merge?"] == "MAYBE" else row["Should merge?"]
    if pred_label != true_label:
        new_row = pd.DataFrame(row).T
        new_row["Merge prediction"] = pred_label
        wrong_rows = pd.concat([wrong_rows, new_row])

# %%
test_merge_thresh_labelled["Merge prediction"] = predicted
test_merge_thresh_labelled["Should merge - maybe is true"] = test_merge_thresh_labelled["Should merge?"].apply(
lambda x: "TRUE" if x=="MAYBE" else x)


# %%
test_merge_thresh_labelled.groupby(
    ["Should merge - maybe is true", "Merge prediction"]
)["Centroid distance"].count()


# %%
test_merge_thresh_labelled.groupby(["Should merge - maybe is true","Merge prediction","Small cluster is a skill?"])['Centroid distance'].count()


# %%
test_merge_thresh_labelled["Should merge?"].value_counts()

# %%
wrong_rows["Should merge?"].value_counts()

# %%
wrong_rows["Merge prediction"].value_counts()

# %%
right_rows = test_merge_thresh_labelled[~test_merge_thresh_labelled["Unnamed: 0"].isin(wrong_rows["Unnamed: 0"].tolist())]

wrong_rows["Average small cluster sentence length"].hist(alpha=0.5, color="green")
right_rows["Average small cluster sentence length"].hist(alpha=0.5, color="red")


# %%
print(f"With a distance threshold of {dist_thresh} we find {round(len(right_rows)/len(test_merge_thresh_labelled),2)} correct merge decisions on our sample of {len(test_merge_thresh_labelled)} data points.")
print(f"We find the mean sentence length for wrong merge decisions as {wrong_rows['Average small cluster sentence length'].mean().round(2)},")
print(f"and the mean sentence length of correct merge decisions as {right_rows['Average small cluster sentence length'].mean().round(2)}.")
print(f"The proportion of small clusters not being a good skill cluster is {round(wrong_rows['Small cluster is a skill?'].value_counts()['FALSE']/len(wrong_rows),2)} in the wrong merge decisions")
print(f"and {round(right_rows['Small cluster is a skill?'].value_counts()['FALSE']/len(right_rows),2)} in the correct merge decisions.")
print("In all - bad merge decisions are often because the cluster isn't really a skill anyway.")


# %%
# Truth is don't merge and the small cluster is a good skill cluster
wrong_rows[
    (wrong_rows["Should merge?"] == "FALSE") & (wrong_rows["Small cluster is a skill?"] != "FALSE")
][["Small cluster sentences", "Merge into cluster sentences (10 examples)"]].values.tolist()

# %%
test_merge_thresh_labelled_nomaybe = test_merge_thresh_labelled[test_merge_thresh_labelled["Should merge?"]!="MAYBE"]
distances = test_merge_thresh_labelled_nomaybe["Centroid distance"].tolist()
truth = test_merge_thresh_labelled_nomaybe["Should merge?"].tolist()

dist_thresh_dict_nomaybe = {}
for dist_thresh in list(np.arange(0.01, 0.5, step=0.001)):
    prediction = ["TRUE" if d < dist_thresh else "FALSE" for d in distances]
    num_greater_than_thresh = len([d for d in distances if d >= dist_thresh])
    dist_thresh_dict_nomaybe[dist_thresh] = {
        "prop_correct": sum([a == b for a, b in zip(prediction, truth)]) / len(truth),
        "num_data": num_greater_than_thresh,
    }

fig,ax = plt.subplots()
x = [k for k,v in dist_thresh_dict_nomaybe.items()]
y = [v["prop_correct"] for k,v in dist_thresh_dict_nomaybe.items()]
y2 = [v["num_data"] for k,v in dist_thresh_dict_nomaybe.items()]

ax.plot(x, y, color="black")
ax.set_ylabel("Proportion of data points\ncorrectly predicted")
ax.set_xlabel("Centroid distance threshold")
plt.title("Finding a threshold distance to merge clusters\n(if distance between 2 clusters is < this then merge)")

ax2=ax.twinx()
ax2.plot(x, y2, color="blue")
ax2.set_ylabel("Number of data points with\ndistance greater than this", color="blue")

plt.axvline(0.04, color="orange");


# %% [markdown]
# ## Is there anything about the small cluster being a good skill or not based on average sentence length?
# Nothing apparent for this sample.

# %%
test_merge_thresh_labelled["Small cluster is a skill?"].value_counts()

# %%
test_merge_thresh_labelled.boxplot(column="Average small cluster sentence length", by="Small cluster is a skill?")


# %%