Commit
* Add scripts for building the taxonomy
* Add location data to saving out metadata from tk files
* Save out metadata every 100 tk files
* Put metadata files in folders
* Fix tabs
* Save out less metadata info and more regularly since files are really large
* Fix some getting of bulk metadata
Showing 5 changed files with 410 additions and 27 deletions.
15 changes: 15 additions & 0 deletions
skills_taxonomy_v2/config/skills_taxonomy/2021.09.06.yaml
@@ -0,0 +1,15 @@
flows:
  build_taxonomy_flow:
    params:
      clustered_sentences_path: "outputs/skills_extraction/extracted_skills/2021.08.31_sentences_data.json"
      skills_data_path: 'outputs/skills_extraction/extracted_skills/2021.08.31_skills_data.json'
      level_c_n: 250
      level_b_n: 50
      level_a_n: 10
      k_means_max_iter: 5000
      check_low_siloutte_b: True
      silhouette_threshold: 0
      level_a_consensus_numclust_its: 100
      level_names_tfidif_n: 3
      use_level_a_consensus: True
      output_dir: "outputs/skills_hierarchy/"
7 changes: 6 additions & 1 deletion
skills_taxonomy_v2/pipeline/skills_taxonomy/Skills Taxonomy.md
@@ -1,3 +1,8 @@
# Skills Taxonomy

In this pipeline we build the taxonomy from skills extracted from TextKernel job adverts.

This is run by:
```
python -i skills_taxonomy_v2/pipeline/skills_taxonomy/build_taxonomy.py --config_path 'skills_taxonomy_v2/config/skills_taxonomy/2021.09.06.yaml'
```
195 changes: 195 additions & 0 deletions
skills_taxonomy_v2/pipeline/skills_taxonomy/build_taxonomy.py
@@ -0,0 +1,195 @@
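"""
Build the skills taxonomy from previously extracted skills.

The mean (UMAP-reduced) sentence embedding of each skill is clustered into
three nested levels (Level C, then Level B, then Level A), a Level D is added
by merging Level C skills that share a name, each level group is named with
its top TF-IDF terms, and the per-skill hierarchy plus the full hierarchy
structure are saved to S3.
"""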
from argparse import ArgumentParser
import logging
import yaml

import pandas as pd
from tqdm import tqdm
import boto3

from skills_taxonomy_v2.getters.s3_data import load_s3_data, get_s3_data_paths, save_to_s3
from skills_taxonomy_v2 import BUCKET_NAME
from skills_taxonomy_v2.pipeline.skills_taxonomy.build_taxonomy_utils import (
    get_many_clusters,
    get_consensus_clusters_mappings,
    get_top_tf_idf_words,
    get_level_names,
    get_new_level_consensus,
    get_new_level)
from skills_taxonomy_v2.pipeline.skills_extraction.extract_skills_utils import (
    get_output_config_stamped,
)

logger = logging.getLogger(__name__)

def parse_arguments(parser):

    parser.add_argument(
        "--config_path",
        help="Path to config file",
        default="skills_taxonomy_v2/config/skills_taxonomy/2021.09.06.yaml",
    )

    return parser.parse_args()

if __name__ == "__main__":

    parser = ArgumentParser()
    args = parse_arguments(parser)

    with open(args.config_path, "r") as f:
        config = yaml.load(f, Loader=yaml.FullLoader)

    FLOW_ID = "build_taxonomy_flow"

    flow_config = config["flows"][FLOW_ID]
    params = flow_config["params"]

    s3 = boto3.resource("s3")

    sentence_embs = load_s3_data(s3, BUCKET_NAME, params["clustered_sentences_path"])
    sentence_embs = pd.DataFrame(sentence_embs)
    # Remove not clustered sentences
    sentence_embs = sentence_embs[sentence_embs['Cluster number']!=-1]

    skills_data = load_s3_data(s3, BUCKET_NAME, params["skills_data_path"])

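    # Each extracted skill is identified by its 'Cluster number'. The hierarchy
    # is built bottom-up: Level C clusters the skills, Level B clusters the
    # Level C groups, and Level A clusters the Level B groups (Level D is added
    # later by merging Level C skills with the same name).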
logger.info("Getting lowest level hierarchy ...") | ||
level_c_cluster_mapper = get_new_level( | ||
sentence_embs, | ||
previous_level_col='Cluster number', | ||
k_means_n=params["level_c_n"], | ||
k_means_max_iter=params["k_means_max_iter"]) | ||
sentence_embs['Level C'] = sentence_embs['Cluster number'].apply(lambda x: level_c_cluster_mapper[x]) | ||
logger.info(f"Lowest level hierarchy has {sentence_embs['Level C'].nunique()} sections") | ||
|
||
logger.info("Getting mid level hierarchy ...") | ||
if params["check_low_siloutte_b"]: | ||
logger.info("Points with low siloutte scores are put in their own cluster") | ||
level_b_cluster_mapper = get_new_level( | ||
sentence_embs, | ||
previous_level_col='Level C', | ||
k_means_n=params["level_b_n"], | ||
k_means_max_iter=params["k_means_max_iter"], | ||
check_low_siloutte=params["check_low_siloutte_b"], | ||
silhouette_threshold=params["silhouette_threshold"]) | ||
sentence_embs['Level B'] = sentence_embs['Level C'].apply(lambda x: level_b_cluster_mapper[x]) | ||
logger.info(f"Mid level hierarchy has {sentence_embs['Level B'].nunique()} sections") | ||
|
||
logger.info("Getting top level hierarchy ...") | ||
if params["use_level_a_consensus"]: | ||
logger.info("... using consensus clustering") | ||
level_a_cluster_mapper = get_new_level_consensus( | ||
sentence_embs, | ||
previous_level_col='Level B', | ||
k_means_n=params["level_a_n"], | ||
numclust_its=params["level_a_consensus_numclust_its"] | ||
) | ||
sentence_embs['Level A'] = sentence_embs['Level B'].apply(lambda x: level_a_cluster_mapper[x]) | ||
else: | ||
level_a_cluster_mapper = get_new_level( | ||
sentence_embs, | ||
previous_level_col='Level B', | ||
k_means_n=params["level_a_n"], | ||
k_means_max_iter=params["k_means_max_iter"] | ||
) | ||
sentence_embs['Level A'] = sentence_embs['Level B'].apply(lambda x: level_a_cluster_mapper[x]) | ||
logger.info(f"Top level hierarchy has {sentence_embs['Level A'].nunique()} sections") | ||
|
||
# Level D is just a merging of level C skills which were given the same name (i.e. no clustering) | ||
# If there are skills with the same name in level C then group these | ||
sentence_embs['Skill name'] = sentence_embs['Cluster number'].apply(lambda x: skills_data[str(x)]['Skills name']) | ||
level_c_name_mapper = {} | ||
for level_c_num, level_c_data in sentence_embs.groupby('Level C'): | ||
level_c_skill_names = level_c_data['Skill name'].unique().tolist() | ||
level_c_name_mapper[level_c_num] = {level_c_skill_name:i for i, level_c_skill_name in enumerate(level_c_skill_names)} | ||
|
||
def get_level_c_merged_names(skill): | ||
return level_c_name_mapper[skill['Level C']][skill['Skill name']] | ||
|
||
sentence_embs['Level D'] = sentence_embs.apply(get_level_c_merged_names, axis=1) | ||
|
||
# Level names | ||
level_a_names = get_level_names(sentence_embs, 'Level A', top_n=params["level_names_tfidif_n"]) | ||
level_b_names = get_level_names(sentence_embs, 'Level B', top_n=params["level_names_tfidif_n"]) | ||
level_c_names = get_level_names(sentence_embs, 'Level C', top_n=params["level_names_tfidif_n"]) | ||
|
||
logger.info("Creating and saving dictionary of hierarchical information per skill ...") | ||
# Dict of hierarchy information per skill | ||
# {skill_num: {hierarchy info for this skill}} | ||
|
||
skill_hierarchy = {} | ||
for skill_num, skill_info in skills_data.items(): | ||
skill_num = int(skill_num) | ||
if skill_num != -1: | ||
hier_info = {} | ||
level_c = level_c_cluster_mapper[skill_num] | ||
level_b = level_b_cluster_mapper[level_c] | ||
level_a = level_a_cluster_mapper[level_b] | ||
hier_info['Skill name'] = skill_info['Skills name'] | ||
hier_info['Hierarchy level A'] = level_a | ||
hier_info['Hierarchy level B'] = level_b | ||
hier_info['Hierarchy level C'] = level_c | ||
hier_info['Hierarchy level D'] = level_c_name_mapper[level_c][skill_info['Skills name']] | ||
hier_info['Hierarchy level A name'] = level_a_names[level_a] | ||
hier_info['Hierarchy level B name'] = level_b_names[level_b] | ||
hier_info['Hierarchy level C name'] = level_c_names[level_c] | ||
hier_info['Hierarchy ID'] = f"{level_a}-{level_b}-{level_c}" | ||
hier_info['Number of sentences that created skill'] = len(skill_info['Texts']) | ||
skill_hierarchy[skill_num] = hier_info | ||
|
||
# Save json | ||
skill_hierarchy_file = get_output_config_stamped( | ||
args.config_path, params["output_dir"], "skills_hierarchy.json" | ||
) | ||
save_to_s3(s3, BUCKET_NAME, skill_hierarchy, skill_hierarchy_file) | ||
logger.info(f"Saved to {skill_hierarchy_file}") | ||
|
||
|
||
logger.info("Creating and saving dictionary of hierarchical information per level ...") | ||
# Dict of hierarchy information per level | ||
# {level_a_num: {level_b_info: {level_c_info}}} | ||
|
||
hier_structure = {} | ||
for level_a_num, level_a_num_data in sentence_embs.groupby('Level A'): | ||
level_b_structure = {} | ||
for level_b_num, level_b_num_data in level_a_num_data.groupby('Level B'): | ||
level_c_structure = {} | ||
for level_c_num, level_c_num_data in level_b_num_data.groupby('Level C'): | ||
level_d_structure = {} | ||
for level_d_num, level_d_num_data in level_c_num_data.groupby('Level D'): | ||
skill_nums = level_d_num_data['Cluster number'].unique().tolist() | ||
# The name at this level is the skill names all these level Ds are grouped on | ||
level_d_structure[level_d_num] = { | ||
'Name': skill_hierarchy[skill_nums[0]]['Skill name'], | ||
'Number of skills': len(skill_nums), | ||
'Skills': {k: { | ||
'Skill name': skill_hierarchy[k]['Skill name'], | ||
'Number of sentences that created skill': skill_hierarchy[k]['Number of sentences that created skill'], | ||
} for k in skill_nums} | ||
} | ||
skill_nums_c = level_c_num_data['Cluster number'].unique().tolist() | ||
level_c_structure[level_c_num] = { | ||
'Name': level_c_names[level_c_num], | ||
'Number of skills': len(skill_nums_c), | ||
'Level D': level_d_structure | ||
} | ||
skill_nums_b = level_b_num_data['Cluster number'].unique().tolist() | ||
level_b_structure[level_b_num] = { | ||
'Name': level_b_names[level_b_num], | ||
'Number of skills': len(skill_nums_b), | ||
'Level C': level_c_structure | ||
} | ||
skill_nums_a = level_a_num_data['Cluster number'].unique().tolist() | ||
hier_structure[level_a_num] = { | ||
'Name': level_a_names[level_a_num], | ||
'Number of skills': len(skill_nums_a), | ||
'Level B': level_b_structure | ||
} | ||
|
||
# Save json | ||
hier_structure_file = get_output_config_stamped( | ||
args.config_path, params["output_dir"], "hierarchy_structure.json" | ||
) | ||
save_to_s3(s3, BUCKET_NAME, hier_structure, hier_structure_file) | ||
|
||
logger.info(f"Saved to {hier_structure_file}") |
148 changes: 148 additions & 0 deletions
skills_taxonomy_v2/pipeline/skills_taxonomy/build_taxonomy_utils.py
@@ -0,0 +1,148 @@
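"""
Helper functions for build_taxonomy.py: k-means clustering of the mean
embedding of each group at a hierarchy level, consensus clustering over
repeated k-means runs, and TF-IDF based naming of the level groups.
"""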
import json
import pickle
import random
from collections import Counter, defaultdict

import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples
from Levenshtein import distance

def get_many_clusters(skill_nums, average_emb_clust, n_clusters, numclust_its=10):

    # numclust_its iterations of k-means over the same data,
    # each with different initial conditions (random_state=i)
    clustering_results = pd.DataFrame(index=skill_nums)
    for i in range(numclust_its):
        clustering = KMeans(n_clusters=n_clusters, random_state=i)
        cluster_num = clustering.fit_predict(
            [average_emb_clust[k] for k in skill_nums]).tolist()
        new_clustering_results = pd.DataFrame(cluster_num, index=skill_nums, columns=[f'Cluster set {i}'])
        clustering_results = pd.concat([clustering_results, new_clustering_results], axis=1)

    return clustering_results

def get_consensus_clusters_mappings(consensus_results_df, k):
    """
    consensus_results_df: a dataframe of each skill and the cluster it was
        assigned to in each of the repeated clustering iterations
    k: the number of consensus clusters to group the repeated results into
    """

    consensus_sets = ["".join([str(cc) for cc in c]) for c in consensus_results_df.values.tolist()]

    consensus_sets_unique = set(consensus_sets)

    # e.g. how similar is '1234' to '1235'?
    all_dists_matrix = []
    for set_1 in consensus_sets_unique:
        temp_list = []
        for set_2 in consensus_sets_unique:
            lev_dist = distance(set_1, set_2)
            temp_list.append(lev_dist)
        all_dists_matrix.append(temp_list)

    # Cluster the consensus sets to group them together
    # e.g. '1234', '1235' and '1233' in group 1
    # '5478' and '5479' in group 2

    clustering_dists = KMeans(n_clusters=k, random_state=42)
    cluster_num = clustering_dists.fit_predict(all_dists_matrix).tolist()
    consensus_set_mapper = dict(zip(list(consensus_sets_unique), cluster_num))

    return [consensus_set_mapper[c] for c in consensus_sets]

def get_top_tf_idf_words(vect, feature_names, top_n=2):
    """
    From https://stackoverflow.com/questions/34232190/scikit-learn-tfidfvectorizer-how-to-get-top-n-terms-with-highest-tf-idf-score
    """
    sorted_nzs = np.argsort(vect.data)[: -(top_n + 1) : -1]
    return feature_names[vect.indices[sorted_nzs]].tolist()

def get_level_names(sentence_embs, level_col_name, top_n):

    # Merge all the texts within each subsection of this level
    hier_level_texts = []
    level_nums = []
    for level_num, level_data in sentence_embs.groupby(level_col_name):
        hier_level_texts.append(" ".join(level_data['description'].tolist()))
        level_nums.append(level_num)

    vectorizer = TfidfVectorizer()
    vect = vectorizer.fit_transform(hier_level_texts)

    feature_names = np.array(vectorizer.get_feature_names())

    level_names = {level_num: '-'.join(
        get_top_tf_idf_words(doc_vec, feature_names, top_n=top_n)
    ) for level_num, doc_vec in zip(level_nums, vect)}

    return level_names

def get_new_level(sentence_embs, previous_level_col, k_means_n, k_means_max_iter, check_low_siloutte=False, silhouette_threshold=0):

    # Mean sentence embedding for the previous level
    average_emb_dict = dict(
        sentence_embs.groupby(previous_level_col)['reduced_points_umap'].apply(lambda x: np.mean(x.tolist(), axis=0).tolist()))

    cluster_mapper = cluster_level_mapper(
        average_emb_dict,
        k_means_n=k_means_n,
        k_means_max_iter=k_means_max_iter,
        check_low_siloutte=check_low_siloutte,
        silhouette_threshold=silhouette_threshold
    )

    return cluster_mapper

def cluster_level_mapper(embeddings_dict, k_means_n, k_means_max_iter=5000, check_low_siloutte=False, silhouette_threshold=0):
    """
    Cluster the embeddings in embeddings_dict values to create a mapper dictionary
    from the embeddings_dict keys to the cluster number.
    e.g. embeddings_dict = {0: [1.23,5.67], 1: [4.56,7.8],...}
    prev2next_map = {0:5, 1:34, ...}
    """

    clustering = KMeans(n_clusters=k_means_n, max_iter=k_means_max_iter, random_state=42)
    cluster_num = clustering.fit_predict(list(embeddings_dict.values())).tolist()

    if check_low_siloutte:
        # The silhouette coefficient measures how similar a point is to its own
        # cluster compared with the other clusters.
        silhouette_samples_n = silhouette_samples(list(embeddings_dict.values()), cluster_num)
        # Give each point with a silhouette score below the threshold its own new cluster number
        not_well_clust = list(np.argwhere(silhouette_samples_n < silhouette_threshold).flatten())
        new_cluster_num = k_means_n
        for ix in not_well_clust:
            cluster_num[ix] = new_cluster_num
            new_cluster_num += 1

    cluster_mapper = {k: v for k, v in zip(list(embeddings_dict.keys()), cluster_num)}

    return cluster_mapper

def get_new_level_consensus(sentence_embs, previous_level_col, k_means_n, numclust_its):

    # Mean sentence embedding for the previous level
    average_emb_dict = dict(
        sentence_embs.groupby(previous_level_col)['reduced_points_umap'].apply(lambda x: np.mean(x.tolist(), axis=0).tolist()))

    clustering_results = get_many_clusters(
        list(average_emb_dict.keys()),
        list(average_emb_dict.values()),
        n_clusters=k_means_n,
        numclust_its=numclust_its
    )

    consensus_set_mappings = get_consensus_clusters_mappings(clustering_results, k=k_means_n)

    cluster_mapper = dict(zip(
        list(average_emb_dict.keys()),
        consensus_set_mappings
    ))

    return cluster_mapper