Build taxonomy (#58)
* Add scripts for building the taxonomy

* Add location data to saving out metadata from tk files

* Save out metadata every 100 tk files

* put metadata files in folders

* fix tabs

* Save out less metadata info and more regularly since files are really large

* fix some getting of bulk metadata
lizgzil authored Sep 13, 2021
1 parent 479c233 commit 029714f
Showing 5 changed files with 410 additions and 27 deletions.
15 changes: 15 additions & 0 deletions skills_taxonomy_v2/config/skills_taxonomy/2021.09.06.yaml
@@ -0,0 +1,15 @@
flows:
  build_taxonomy_flow:
    params:
      clustered_sentences_path: "outputs/skills_extraction/extracted_skills/2021.08.31_sentences_data.json"
      skills_data_path: "outputs/skills_extraction/extracted_skills/2021.08.31_skills_data.json"
      level_c_n: 250
      level_b_n: 50
      level_a_n: 10
      k_means_max_iter: 5000
      check_low_siloutte_b: True
      silhouette_threshold: 0
      level_a_consensus_numclust_its: 100
      level_names_tfidif_n: 3
      use_level_a_consensus: True
      output_dir: "outputs/skills_hierarchy/"
@@ -1,3 +1,8 @@
# Skills Taxonomy

In this pipeline we build the taxonomy from skills extracted from TextKernel job adverts.

This is run by:
```
python -i skills_taxonomy_v2/pipeline/skills_taxonomy/build_taxonomy.py --config_path 'skills_taxonomy_v2/config/skills_taxonomy/2021.09.06.yaml'
```
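
The flow writes two JSON files to S3 under the configured `output_dir`, with names that appear to be stamped from the config file by `get_output_config_stamped`: a per-skill hierarchy (`..._skills_hierarchy.json`) and a nested per-level structure (`..._hierarchy_structure.json`). A rough sketch of their shapes, using illustrative values rather than real output:

```
# skills_hierarchy.json - keyed by skill number
{"354": {"Skill name": "...", "Hierarchy level A": 2, "Hierarchy level B": 14,
         "Hierarchy level C": 103, "Hierarchy level D": 7,
         "Hierarchy level A name": "...", "Hierarchy level B name": "...",
         "Hierarchy level C name": "...", "Hierarchy ID": "2-14-103",
         "Number of sentences that created skill": 48}}

# hierarchy_structure.json - nested Level A > Level B > Level C > Level D > skills
{"2": {"Name": "...", "Number of skills": 1043,
       "Level B": {"14": {"Name": "...", "Number of skills": 97,
                          "Level C": {"103": {"Name": "...", "Level D": {"...": "..."}}}}}}}
```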
195 changes: 195 additions & 0 deletions skills_taxonomy_v2/pipeline/skills_taxonomy/build_taxonomy.py
@@ -0,0 +1,195 @@
from argparse import ArgumentParser
import logging
import yaml

import pandas as pd
from tqdm import tqdm
import boto3

from skills_taxonomy_v2.getters.s3_data import load_s3_data, get_s3_data_paths, save_to_s3
from skills_taxonomy_v2 import BUCKET_NAME
from skills_taxonomy_v2.pipeline.skills_taxonomy.build_taxonomy_utils import (
    get_many_clusters,
    get_consensus_clusters_mappings,
    get_top_tf_idf_words,
    get_level_names,
    get_new_level_consensus,
    get_new_level,
)
from skills_taxonomy_v2.pipeline.skills_extraction.extract_skills_utils import (
    get_output_config_stamped,
)

logger = logging.getLogger(__name__)

def parse_arguments(parser):

    parser.add_argument(
        "--config_path",
        help="Path to config file",
        default="skills_taxonomy_v2/config/skills_taxonomy/2021.09.06.yaml",
    )

    return parser.parse_args()

if __name__ == "__main__":

    parser = ArgumentParser()
    args = parse_arguments(parser)

    with open(args.config_path, "r") as f:
        config = yaml.load(f, Loader=yaml.FullLoader)

    FLOW_ID = "build_taxonomy_flow"

    flow_config = config["flows"][FLOW_ID]
    params = flow_config["params"]

    s3 = boto3.resource("s3")

    sentence_embs = load_s3_data(s3, BUCKET_NAME, params["clustered_sentences_path"])
    sentence_embs = pd.DataFrame(sentence_embs)
    # Remove not clustered sentences
    sentence_embs = sentence_embs[sentence_embs['Cluster number'] != -1]

    skills_data = load_s3_data(s3, BUCKET_NAME, params["skills_data_path"])

    logger.info("Getting lowest level hierarchy ...")
    level_c_cluster_mapper = get_new_level(
        sentence_embs,
        previous_level_col='Cluster number',
        k_means_n=params["level_c_n"],
        k_means_max_iter=params["k_means_max_iter"])
    sentence_embs['Level C'] = sentence_embs['Cluster number'].apply(lambda x: level_c_cluster_mapper[x])
    logger.info(f"Lowest level hierarchy has {sentence_embs['Level C'].nunique()} sections")

logger.info("Getting mid level hierarchy ...")
if params["check_low_siloutte_b"]:
logger.info("Points with low siloutte scores are put in their own cluster")
level_b_cluster_mapper = get_new_level(
sentence_embs,
previous_level_col='Level C',
k_means_n=params["level_b_n"],
k_means_max_iter=params["k_means_max_iter"],
check_low_siloutte=params["check_low_siloutte_b"],
silhouette_threshold=params["silhouette_threshold"])
sentence_embs['Level B'] = sentence_embs['Level C'].apply(lambda x: level_b_cluster_mapper[x])
logger.info(f"Mid level hierarchy has {sentence_embs['Level B'].nunique()} sections")

logger.info("Getting top level hierarchy ...")
if params["use_level_a_consensus"]:
logger.info("... using consensus clustering")
level_a_cluster_mapper = get_new_level_consensus(
sentence_embs,
previous_level_col='Level B',
k_means_n=params["level_a_n"],
numclust_its=params["level_a_consensus_numclust_its"]
)
sentence_embs['Level A'] = sentence_embs['Level B'].apply(lambda x: level_a_cluster_mapper[x])
else:
level_a_cluster_mapper = get_new_level(
sentence_embs,
previous_level_col='Level B',
k_means_n=params["level_a_n"],
k_means_max_iter=params["k_means_max_iter"]
)
sentence_embs['Level A'] = sentence_embs['Level B'].apply(lambda x: level_a_cluster_mapper[x])
logger.info(f"Top level hierarchy has {sentence_embs['Level A'].nunique()} sections")

    # Level D is just a merging of level C skills which were given the same name (i.e. no clustering)
    # If there are skills with the same name in level C then group these
    sentence_embs['Skill name'] = sentence_embs['Cluster number'].apply(lambda x: skills_data[str(x)]['Skills name'])
    level_c_name_mapper = {}
    for level_c_num, level_c_data in sentence_embs.groupby('Level C'):
        level_c_skill_names = level_c_data['Skill name'].unique().tolist()
        level_c_name_mapper[level_c_num] = {level_c_skill_name: i for i, level_c_skill_name in enumerate(level_c_skill_names)}

    def get_level_c_merged_names(skill):
        return level_c_name_mapper[skill['Level C']][skill['Skill name']]

    sentence_embs['Level D'] = sentence_embs.apply(get_level_c_merged_names, axis=1)

    # Level names
    level_a_names = get_level_names(sentence_embs, 'Level A', top_n=params["level_names_tfidif_n"])
    level_b_names = get_level_names(sentence_embs, 'Level B', top_n=params["level_names_tfidif_n"])
    level_c_names = get_level_names(sentence_embs, 'Level C', top_n=params["level_names_tfidif_n"])

    logger.info("Creating and saving dictionary of hierarchical information per skill ...")
    # Dict of hierarchy information per skill
    # {skill_num: {hierarchy info for this skill}}
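    # e.g. (illustrative values, not real output):
    # skill_hierarchy[354] = {"Skill name": "...", "Hierarchy level A": 2,
    #     "Hierarchy level B": 14, "Hierarchy level C": 103, "Hierarchy level D": 7,
    #     ..., "Hierarchy ID": "2-14-103", "Number of sentences that created skill": 48}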

    skill_hierarchy = {}
    for skill_num, skill_info in skills_data.items():
        skill_num = int(skill_num)
        if skill_num != -1:
            hier_info = {}
            level_c = level_c_cluster_mapper[skill_num]
            level_b = level_b_cluster_mapper[level_c]
            level_a = level_a_cluster_mapper[level_b]
            hier_info['Skill name'] = skill_info['Skills name']
            hier_info['Hierarchy level A'] = level_a
            hier_info['Hierarchy level B'] = level_b
            hier_info['Hierarchy level C'] = level_c
            hier_info['Hierarchy level D'] = level_c_name_mapper[level_c][skill_info['Skills name']]
            hier_info['Hierarchy level A name'] = level_a_names[level_a]
            hier_info['Hierarchy level B name'] = level_b_names[level_b]
            hier_info['Hierarchy level C name'] = level_c_names[level_c]
            hier_info['Hierarchy ID'] = f"{level_a}-{level_b}-{level_c}"
            hier_info['Number of sentences that created skill'] = len(skill_info['Texts'])
            skill_hierarchy[skill_num] = hier_info

    # Save json
    skill_hierarchy_file = get_output_config_stamped(
        args.config_path, params["output_dir"], "skills_hierarchy.json"
    )
    save_to_s3(s3, BUCKET_NAME, skill_hierarchy, skill_hierarchy_file)
    logger.info(f"Saved to {skill_hierarchy_file}")


logger.info("Creating and saving dictionary of hierarchical information per level ...")
# Dict of hierarchy information per level
# {level_a_num: {level_b_info: {level_c_info}}}
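    # e.g. (illustrative shape): hier_structure[2] = {"Name": "...", "Number of skills": 1043, "Level B": {...}}
    # where "Level B" maps level B numbers to {"Name", "Number of skills", "Level C": {...}},
    # and so on down through "Level C" and "Level D" to the individual "Skills".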

    hier_structure = {}
    for level_a_num, level_a_num_data in sentence_embs.groupby('Level A'):
        level_b_structure = {}
        for level_b_num, level_b_num_data in level_a_num_data.groupby('Level B'):
            level_c_structure = {}
            for level_c_num, level_c_num_data in level_b_num_data.groupby('Level C'):
                level_d_structure = {}
                for level_d_num, level_d_num_data in level_c_num_data.groupby('Level D'):
                    skill_nums = level_d_num_data['Cluster number'].unique().tolist()
                    # The name at this level is the skill name these level D skills were grouped on
                    level_d_structure[level_d_num] = {
                        'Name': skill_hierarchy[skill_nums[0]]['Skill name'],
                        'Number of skills': len(skill_nums),
                        'Skills': {k: {
                            'Skill name': skill_hierarchy[k]['Skill name'],
                            'Number of sentences that created skill': skill_hierarchy[k]['Number of sentences that created skill'],
                        } for k in skill_nums}
                    }
                skill_nums_c = level_c_num_data['Cluster number'].unique().tolist()
                level_c_structure[level_c_num] = {
                    'Name': level_c_names[level_c_num],
                    'Number of skills': len(skill_nums_c),
                    'Level D': level_d_structure
                }
            skill_nums_b = level_b_num_data['Cluster number'].unique().tolist()
            level_b_structure[level_b_num] = {
                'Name': level_b_names[level_b_num],
                'Number of skills': len(skill_nums_b),
                'Level C': level_c_structure
            }
        skill_nums_a = level_a_num_data['Cluster number'].unique().tolist()
        hier_structure[level_a_num] = {
            'Name': level_a_names[level_a_num],
            'Number of skills': len(skill_nums_a),
            'Level B': level_b_structure
        }

    # Save json
    hier_structure_file = get_output_config_stamped(
        args.config_path, params["output_dir"], "hierarchy_structure.json"
    )
    save_to_s3(s3, BUCKET_NAME, hier_structure, hier_structure_file)

    logger.info(f"Saved to {hier_structure_file}")
148 changes: 148 additions & 0 deletions skills_taxonomy_v2/pipeline/skills_taxonomy/build_taxonomy_utils.py
@@ -0,0 +1,148 @@
import json
import pickle
import random
from collections import Counter, defaultdict

import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples
from Levenshtein import distance

def get_many_clusters(skill_nums, average_emb_clust, n_clusters, numclust_its=10):

    # Run numclust_its iterations of KMeans clustering,
    # each with different initial conditions (random_state=i)
    clustering_results = pd.DataFrame(index=skill_nums)
    for i in range(numclust_its):
        clustering = KMeans(n_clusters=n_clusters, random_state=i)
        cluster_num = clustering.fit_predict(
            [average_emb_clust[k] for k in skill_nums]).tolist()
        new_clustering_results = pd.DataFrame(cluster_num, index=skill_nums, columns=[f'Cluster set {i}'])
        clustering_results = pd.concat([clustering_results, new_clustering_results], axis=1)

    return clustering_results

def get_consensus_clusters_mappings(consensus_results_df, k):
    """
    consensus_results_df: a dataframe with one row per skill/group and one column per
        clustering iteration, giving the cluster it was assigned to in that iteration.
    Returns a list with one consensus cluster number (from k clusters) per row.
    """

    consensus_sets = ["".join([str(cc) for cc in c]) for c in consensus_results_df.values.tolist()]

    consensus_sets_unique = set(consensus_sets)

    # Pairwise Levenshtein distances between the unique assignment strings,
    # e.g. how similar is '1234' to '1235'?
    all_dists_matrix = []
    for set_1 in consensus_sets_unique:
        temp_list = []
        for set_2 in consensus_sets_unique:
            lev_dist = distance(set_1, set_2)
            temp_list.append(lev_dist)
        all_dists_matrix.append(temp_list)

    # Cluster the consensus sets to group them together
    # e.g. '1234', '1235' and '1233' in group 1
    # '5478' and '5479' in group 2

    clustering_dists = KMeans(n_clusters=k, random_state=42)
    cluster_num = clustering_dists.fit_predict(all_dists_matrix).tolist()
    consensus_set_mapper = dict(zip(list(consensus_sets_unique), cluster_num))

    return [consensus_set_mapper[c] for c in consensus_sets]

def get_top_tf_idf_words(vect, feature_names, top_n=2):
    """
    From https://stackoverflow.com/questions/34232190/scikit-learn-tfidfvectorizer-how-to-get-top-n-terms-with-highest-tf-idf-score
    """
    sorted_nzs = np.argsort(vect.data)[: -(top_n + 1) : -1]
    return feature_names[vect.indices[sorted_nzs]].tolist()

def get_level_names(sentence_embs, level_col_name, top_n):

    # Merge all the texts within each subsection of this level
    hier_level_texts = []
    level_nums = []
    for level_num, level_data in sentence_embs.groupby(level_col_name):
        hier_level_texts.append(" ".join(level_data['description'].tolist()))
        level_nums.append(level_num)

    vectorizer = TfidfVectorizer()
    vect = vectorizer.fit_transform(hier_level_texts)

    feature_names = np.array(vectorizer.get_feature_names())

    level_names = {level_num: '-'.join(
        get_top_tf_idf_words(doc_vec, feature_names, top_n=top_n)
    ) for level_num, doc_vec in zip(level_nums, vect)}
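    # e.g. level_names might look like {0: "nursing-care-patients", 1: "software-data-cloud", ...}
    # (illustrative values - each name joins the level's top_n tf-idf words with '-')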

    return level_names

def get_new_level(sentence_embs, previous_level_col, k_means_n, k_means_max_iter, check_low_siloutte=False, silhouette_threshold=0):

    # Mean sentence embedding for the previous level
    average_emb_dict = dict(
        sentence_embs.groupby(previous_level_col)['reduced_points_umap'].apply(lambda x: np.mean(x.tolist(), axis=0).tolist()))

    cluster_mapper = cluster_level_mapper(
        average_emb_dict,
        k_means_n=k_means_n,
        k_means_max_iter=k_means_max_iter,
        check_low_siloutte=check_low_siloutte,
        silhouette_threshold=silhouette_threshold
    )

    return cluster_mapper

def cluster_level_mapper(embeddings_dict, k_means_n, k_means_max_iter=5000, check_low_siloutte=False, silhouette_threshold=0):
    """
    Cluster the embeddings in embeddings_dict values to create a mapper dictionary
    from the embeddings_dict keys to the cluster number.
    e.g. embeddings_dict = {0: [1.23, 5.67], 1: [4.56, 7.8], ...}
    prev2next_map = {0: 5, 1: 34, ...}
    """

    clustering = KMeans(n_clusters=k_means_n, max_iter=k_means_max_iter, random_state=42)
    cluster_num = clustering.fit_predict(list(embeddings_dict.values())).tolist()

    if check_low_siloutte:
        # The Silhouette Coefficient is a measure of how well samples are clustered with samples
        # that are similar to themselves.
        silhouette_samples_n = silhouette_samples(list(embeddings_dict.values()), cluster_num)
        # Give any not well clustered points a new cluster number
        not_well_clust = list(np.argwhere(silhouette_samples_n < silhouette_threshold).flatten())
        new_cluster_num = k_means_n
        for ix in not_well_clust:
            cluster_num[ix] = new_cluster_num
            new_cluster_num += 1

    cluster_mapper = {k: v for k, v in zip(list(embeddings_dict.keys()), cluster_num)}

    return cluster_mapper

def get_new_level_consensus(sentence_embs, previous_level_col, k_means_n, numclust_its):

    # Mean sentence embedding for the previous level
    average_emb_dict = dict(
        sentence_embs.groupby(previous_level_col)['reduced_points_umap'].apply(lambda x: np.mean(x.tolist(), axis=0).tolist()))

    clustering_results = get_many_clusters(
        list(average_emb_dict.keys()),
        list(average_emb_dict.values()),
        n_clusters=k_means_n,
        numclust_its=numclust_its
    )

    consensus_set_mappings = get_consensus_clusters_mappings(clustering_results, k=k_means_n)

    cluster_mapper = dict(zip(
        list(average_emb_dict.keys()),
        consensus_set_mappings
    ))

    return cluster_mapper
