Rerun skills taxonomy (#77)
* Correct mistake in skills extraction readme

* Add little script to calculate mean embeddings for each skill

* Update new config for skills taxonomy

* Update build_taxonomy.py to have the option of not creating a level D, and to handle cases where skill names might not exist. Also separated out the loading of data and added an option for the new format of skills data.

* Add function in build taxonomy utils to append manual interventions to the clustering

* Add json for manual naming of level A groups and manual grouping of level B skills

* Add other analysis bits to readme

* Add new manual mapper dict from consultation with India and George. Also update some of the manual mapper functions

* Edits to the naming process - silence some annoying nltk logs, add logging to skills_naming_utils, use the same config file as the newest skill extraction, and fix a bug in clean_cluster_description as well as clean out some unnecessary processing bits

* The way the embeddings were loaded wasn't picking up all of them for some reason - so changed how this is done; also added some safety to get_skill_info so it saves out every 100 skills rather than waiting a whole day until saving

* Remove unnecessary imports from naming skills

* Re-add numba and remove unnecessary imports

* Fix index bug in cluster embeddings

* Add some naming fixes to the building and outputting of the taxonomy - no duplicate names; also output the centroid
lizgzil authored Dec 21, 2021
1 parent ef386a8 commit 3aeee4b
Showing 18 changed files with 734 additions and 367 deletions.
2 changes: 1 addition & 1 deletion requirements.txt
@@ -28,4 +28,4 @@ geopandas
rtree
urllib3
shapely
pattern
pattern
9 changes: 7 additions & 2 deletions skills_taxonomy_v2/analysis/README.md
@@ -39,8 +39,13 @@ Within `notebooks`, there's a number of `.ipynb` and `.py` files related to expe

In this folder we have a number of notebooks and scripts for various bits of analysis and figure plotting after extracting skills:

1. `Effect of sample size.ipynb` - Investigate the effect of sample size of skill sentences and how many words are in the vocab.
2. `Skills Extraction Analysis and Figures.ipynb` - Various analysis and figure generation of the skills extracted. Outputs are in `outputs/skills_extraction/figures/..`
1. `Multi-skill sentences.ipynb` - Analysis of multi-skill sentences - how long sentences typically are when they contain multiple skills.
2. `Experiment - Data reduction sample size.ipynb` - Analysis of finding a good sample size of embeddings to fit the reducer class to.
3. `Effect of sample size.ipynb` - Investigate the effect of sample size of skill sentences and how many words are in the vocab.
4. `Experiment - Clustering parameters.ipynb` - Analysis of the optimal clustering parameters for extracting skills.
5. `Effect of merging clusters distance threshold.ipynb` - Investigation of the distance threshold for which two skill clusters should be merged into one.
6. `Skills Extraction Analysis and Figures.ipynb` - Various analysis and figure generation of the skills extracted. Outputs are in `outputs/skills_extraction/figures/..`
7. `Extracted skills - 2021.11.05.py`

In this folder we also have experimentation notebooks showing 4 approaches for skills extraction, including:

6 changes: 4 additions & 2 deletions skills_taxonomy_v2/config/skills_extraction/2021.11.05.yaml
@@ -30,8 +30,10 @@ flows:
output_dir: "outputs/skills_extraction/extracted_skills/"
name_skills:
params:
sentence_skills_path: 'outputs/skills_extraction/extracted_skills/2021.11.05_sentences_data.json'
embedding_sample_path: 'outputs/skills_extraction/extracted_skills/2021.11.05_sentence_id_2_embedding_dict.json.gz'
skill_sentences_path: "outputs/skills_extraction/extracted_skills/2021.11.05_sentences_skills_data.json"
skills_path: "outputs/skills_extraction/extracted_skills/2021.11.05_skills_data.json"
skills_embeds_path: "outputs/skills_extraction/reduced_embeddings/"
mean_skills_embeds_path: "outputs/skills_extraction/extracted_skills/2021.11.05_skill_mean_embeddings.json"
ngram: 3
min_count: 3
threshold: 0.25
44 changes: 0 additions & 44 deletions skills_taxonomy_v2/config/skills_extraction/2021.11.09.yaml

This file was deleted.

46 changes: 0 additions & 46 deletions skills_taxonomy_v2/config/skills_extraction/2021.12.07.yaml

This file was deleted.

18 changes: 18 additions & 0 deletions skills_taxonomy_v2/config/skills_taxonomy/2021.11.30.yaml
@@ -0,0 +1,18 @@
flows:
build_taxonomy_flow:
params:
reduced_embeddings_dir: "outputs/skills_extraction/reduced_embeddings/"
clustered_sentences_path: "outputs/skills_extraction/extracted_skills/2021.11.05_sentences_skills_data.json"
skills_data_path: "outputs/skills_extraction/extracted_skills/2021.11.05_skills_data.json"
skills_names_data_path: "outputs/skills_extraction/extracted_skills/2021.11.05_skills_data_named.json"
cluster_column_name: "Cluster number predicted"
embedding_column_name: "embedding"
skills_data_texts_name: "Sentences"
level_c_n: 250
level_b_n: 60
k_means_max_iter: 5000
check_low_siloutte_b: False
create_level_d: False
level_names_tfidif_n: 3
level_a_manual_clusters_path: "skills_taxonomy_v2/utils/2021.12.20_level_a_mapper_dict.json"
output_dir: "outputs/skills_taxonomy/"
4 changes: 2 additions & 2 deletions skills_taxonomy_v2/pipeline/sentence_classifier/utils.py
@@ -21,8 +21,8 @@

import nltk

nltk.download("averaged_perceptron_tagger")
nltk.download("punkt")
nltk.download("averaged_perceptron_tagger", quiet=True)
nltk.download("punkt", quiet=True)


# ---------------------------------------------------------------------------------
12 changes: 11 additions & 1 deletion skills_taxonomy_v2/pipeline/skills_extraction/README.md
@@ -95,8 +95,18 @@ Furthermore, for the 12 times the clusters shouldn't be merged but were, 11 of

We thus fit our clustering algorithm on 300,000 random <100 character sentences - this created 11551 clusters. Then for the 8892 clusters found with fewer than 10 sentences in them, we iteratively merged nearest neighbours when the Euclidean distance was less than 0.05. This resulted in us finding 6784 skill clusters.
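
Below is a minimal sketch of this merging step, assuming each cluster is represented by its centroid and sentence count; the function name and the size-weighted centroid update are illustrative, not taken from `cluster_embeddings.py`.

```python
# Sketch only - clusters held as {cluster_id: centroid array} and {cluster_id: size};
# merge any cluster with fewer than 10 sentences into its nearest neighbour if the
# Euclidean distance between centroids is under 0.05 (the thresholds used above).
import numpy as np

def merge_small_clusters(centroids, sizes, min_size=10, max_dist=0.05):
    for cid in [c for c, s in sizes.items() if s < min_size]:
        others = [c for c in centroids if c != cid]
        dists = np.array([np.linalg.norm(centroids[cid] - centroids[o]) for o in others])
        nearest = others[int(dists.argmin())]
        if dists.min() < max_dist:
            # Fold the small cluster into its neighbour with a size-weighted centroid
            total = sizes[cid] + sizes[nearest]
            centroids[nearest] = (
                centroids[cid] * sizes[cid] + centroids[nearest] * sizes[nearest]
            ) / total
            sizes[nearest] = total
            del centroids[cid], sizes[cid]
    return centroids, sizes
```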

Out of the 4,097,008 sentences in our sample, 1,465,639 had under 100 characters - using these we then went about predicting clusters using the centroids from these 6784 clusters. We use the predicted clusters for all 1,465,639 sentences as our 8892 skills. Note that for 35% of the sentences which were used in the fitting of the clustering algorithm, the predicted cluster was different to the cluster it had been assigned in the fitting. Analysis and figure plotting of these clusters is given in `Extracted skills - 2021.11.05.ipynb`, and these clusters were created by running `cluster_embeddings.py --config_path skills_taxonomy_v2/config/skills_extraction/2021.11.05.yaml`. This saves two files: `s3://skills-taxonomy-v2/outputs/skills_extraction/extracted_skills/2021.11.05_sentences_skills_data.json` (each sentence's ID/file ID with the cluster it was predicted to be in, and if in the training set - also the cluster it was originally assigned to) and `s3://skills-taxonomy-v2/outputs/skills_extraction/extracted_skills/2021.11.05_skills_data.json` (a dictionary of skill numbers and the sentences in them, along with the skill centroid coordinate).
Out of the 4,097,008 sentences in our sample, 1,465,639 had under 100 characters - using these we then went about predicting clusters using the centroids from these 6784 clusters. We use the predicted clusters for all 1,465,639 sentences as our 6784 skills. Note that for 35% of the sentences which were used in the fitting of the clustering algorithm, the predicted cluster was different to the cluster it had been assigned in the fitting. Analysis and figure plotting of these clusters is given in `Extracted skills - 2021.11.05.ipynb`, and these clusters were created by running `cluster_embeddings.py --config_path skills_taxonomy_v2/config/skills_extraction/2021.11.05.yaml`. This saves two files: `s3://skills-taxonomy-v2/outputs/skills_extraction/extracted_skills/2021.11.05_sentences_skills_data.json` (each sentence's ID/file ID with the cluster it was predicted to be in, and if in the training set - also the cluster it was originally assigned to) and `s3://skills-taxonomy-v2/outputs/skills_extraction/extracted_skills/2021.11.05_skills_data.json` (a dictionary of skill numbers and the sentences in them, along with the skill centroid coordinate).
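
A minimal sketch of this nearest-centroid prediction, assuming the reduced sentence embeddings and the 6784 skill centroids are available as numpy arrays; the function name is illustrative and this is not the repo's `predict_clusters` implementation.

```python
# Illustrative only - assign each reduced sentence embedding to its closest
# skill centroid by Euclidean distance (sklearn's pairwise_distances_argmin).
import numpy as np
from sklearn.metrics import pairwise_distances_argmin

def predict_skill_clusters(sentence_embeddings: np.ndarray, centroids: np.ndarray) -> np.ndarray:
    """Return, for each sentence embedding, the index of the nearest skill centroid."""
    return pairwise_distances_argmin(sentence_embeddings, centroids, metric="euclidean")

# e.g. predicted = predict_skill_clusters(reduced_embeddings, skill_centroids)
```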

### 4. Skills naming

This has also been improved (see `skills_taxonomy_v2/analysis/skills_extraction/Skill Naming Experiments.md` for details).

It can be run with:
```
python skills_taxonomy_v2/pipeline/skills_extraction/skills_naming.py --config_path 'skills_taxonomy_v2/config/skills_extraction/2021.11.05.yaml'
```

This will output `outputs/skills_extraction/extracted_skills/2021.11.05_skills_data_named.json`.
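
For a quick check of this output, it can be loaded with the same S3 getters used elsewhere in this repo - a sketch, assuming the file exists at the path above:

```python
# Sketch using the repo's existing getters (see the imports in skills_naming.py);
# only for inspecting the named skills output, not part of the pipeline itself.
import boto3

from skills_taxonomy_v2 import BUCKET_NAME
from skills_taxonomy_v2.getters.s3_data import load_s3_data

s3 = boto3.resource("s3")
named_skills = load_s3_data(
    s3,
    BUCKET_NAME,
    "outputs/skills_extraction/extracted_skills/2021.11.05_skills_data_named.json",
)
print(f"{len(named_skills)} skills with names")
```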

## `2021.08.31.yaml` summary

@@ -251,6 +251,14 @@ def parse_arguments(parser):

new_sentences_data = cluster_embeddings.predict_clusters(sentences_data)

# Create mapping of 'Cluster number predicted' to new index
# At the moment this isn't 0:num skills, it's 0,1,2,5,10,11,12..
num_skills = new_sentences_data['Cluster number predicted'].nunique()
skill_nums = [n for n in new_sentences_data['Cluster number predicted'].unique() if n!=-2]
reindex_map = dict(zip(skill_nums, range(0, num_skills - 1)))
reindex_map[-2] = -2
new_sentences_data['Cluster number predicted'] = new_sentences_data['Cluster number predicted'].apply(lambda x: reindex_map[x])

# Merge in the original cluster nums (for those in sample)
clustered_sentences_data = pd.merge(
new_sentences_data[['job id', 'sentence id', 'Cluster number predicted']],
70 changes: 44 additions & 26 deletions skills_taxonomy_v2/pipeline/skills_extraction/skills_naming.py
@@ -11,29 +11,21 @@
Usage:
python -i skills_taxonomy_v2/pipeline/skills_extraction/skills_naming.py --config_path 'skills_taxonomy_v2/config/skills_extraction/2021.12.07.yaml'
python -i skills_taxonomy_v2/pipeline/skills_extraction/skills_naming.py --config_path 'skills_taxonomy_v2/config/skills_extraction/2021.11.05.yaml'
"""

from argparse import ArgumentParser
import logging
import yaml
import itertools
import pickle

import numpy as np
import pandas as pd
from tqdm import tqdm
import boto3

from sklearn.metrics.pairwise import cosine_similarity

from skills_taxonomy_v2.getters.s3_data import load_s3_data, save_to_s3
from skills_taxonomy_v2.getters.s3_data import load_s3_data, save_to_s3, get_s3_data_paths
from skills_taxonomy_v2 import BUCKET_NAME

from skills_taxonomy_v2.pipeline.skills_extraction.skills_naming_utils import (
get_new_skills_embeds,
clean_cluster_description,
get_clean_ngrams,
get_skill_info,
)
from skills_taxonomy_v2.pipeline.skills_extraction.extract_skills_utils import (
@@ -42,13 +34,26 @@

logger = logging.getLogger(__name__)

def load_process_sentence_data(s3, reduced_embeddings_paths):
sentences_data = pd.DataFrame()
for reduced_embeddings_path in tqdm(reduced_embeddings_paths):
sentences_data_i = load_s3_data(
s3, BUCKET_NAME,
reduced_embeddings_path
)
sentences_data = pd.concat([sentences_data, pd.DataFrame(sentences_data_i)])
sentences_data.reset_index(drop=True, inplace=True)
logger.info(f"{len(sentences_data)} sentences loaded")
return sentences_data



def parse_arguments(parser):

parser.add_argument(
"--config_path",
help="Path to config file",
default="skills_taxonomy_v2/config/skills_extraction/2021.12.07.yaml",
default="skills_taxonomy_v2/config/skills_extraction/2021.11.05.yaml",
)

return parser.parse_args()
@@ -71,28 +76,34 @@ def parse_arguments(parser):

# Load data
skill_sentences = load_s3_data(s3, BUCKET_NAME, params["skill_sentences_path"])
skills_embeds = get_new_skills_embeds(params["skills_embeds_path"], BUCKET_NAME)

reduced_embeddings_paths = get_s3_data_paths(
s3,
BUCKET_NAME,
params["skills_embeds_path"],
file_types=["*sentences_data_*.json"]
)

skills_embeds_df = load_process_sentence_data(s3, reduced_embeddings_paths)

sent_cluster_embeds = load_s3_data(
s3, BUCKET_NAME, params["mean_skills_embeds_path"]
)
skills = load_s3_data(s3, BUCKET_NAME, params["skills_path"])

# wrangle data in the format needed
skills_embeds_df = pd.DataFrame(skills_embeds)[
["original sentence", "job id", "sentence id", "embedding"]
]


skill_sentences_df = pd.DataFrame(skill_sentences)[
["job id", "sentence id", "Cluster number predicted"]
]

skill_sentences_df = skill_sentences_df[
skill_sentences_df["Cluster number predicted"] != -2
]

merged_sents_embeds = pd.merge(
skills_embeds_df, skill_sentences_df, on=["job id", "sentence id"]
)
skills_embeds_df, skill_sentences_df, on=["job id", "sentence id"], how='left'
)

merged_sents_embeds = merged_sents_embeds[
merged_sents_embeds["Cluster number predicted"] != -2
]

skills_df = pd.DataFrame(skills).T
skills_df["Skill number"] = skills_df.index
Expand All @@ -109,15 +120,22 @@ def parse_arguments(parser):
)
)

logger.info(f"Generating skill names for {len(skills_df)} skills...")

# Save skill information
skills_data_output_path = params["skills_path"].split(".json")[0] + "_named.json"
logger.info(f"Saving skill names to {skills_data_output_path}")

# generate skills names
# and save iteratively
skills_data = get_skill_info(
skills_df,
params["num_top_sent"],
params["ngram"],
params["min_count"],
params["threshold"],
s3,
BUCKET_NAME,
skills_data_output_path,
)

# Save skill information
skills_data_output_path = params["skills_path"].split(".json")[0] + "_named.json"
save_to_s3(s3, BUCKET_NAME, skills_data, skills_data_output_path)
logger.info(f"{len(skills_data)} skills names saved")
@@ -0,0 +1,42 @@
"""
For the skill naming we need the mean embedding for each skill.
"""
import pandas as pd
import numpy as np
import boto3
from tqdm import tqdm

from collections import defaultdict

from skills_taxonomy_v2.getters.s3_data import get_s3_data_paths, save_to_s3, load_s3_data
from skills_taxonomy_v2 import BUCKET_NAME

s3 = boto3.resource("s3")

# Load skills
# The sentences ID + cluster num
sentence_embs = load_s3_data(s3, BUCKET_NAME, "outputs/skills_extraction/extracted_skills/2021.11.05_sentences_skills_data.json")
sentence_embs = pd.DataFrame(sentence_embs)
sentence_embs = sentence_embs[sentence_embs["Cluster number predicted"] >= 0]

# Load embeddings
sentence_embeddings_dirs = get_s3_data_paths(
s3, BUCKET_NAME, 'outputs/skills_extraction/word_embeddings/data/2021.11.05', file_types=["*embeddings.json"])

skill_embeddings = defaultdict(list)
for embedding_dir in tqdm(sentence_embeddings_dirs):
sentence_embeddings = load_s3_data(s3, BUCKET_NAME, embedding_dir)
sentence_embeddings_df = pd.DataFrame(sentence_embeddings)
temp_merge = pd.merge(sentence_embs, sentence_embeddings_df, how="inner", left_on=['job id', 'sentence id'], right_on=[0,1])
for skill_num, embeddings in temp_merge.groupby('Cluster number predicted'):
skill_embeddings[skill_num].extend(embeddings[3].tolist())

# Get mean embedding for each skill number
print("Getting mean embeddings")
mean_skill_embeddings = {}
for skill_num, embeddings_list in skill_embeddings.items():
mean_skill_embeddings[skill_num] = np.mean(embeddings_list, axis=0).tolist()

# Save out
print("Saving mean embeddings")
save_to_s3(s3, BUCKET_NAME, mean_skill_embeddings, 'outputs/skills_extraction/extracted_skills/2021.11.05_skill_mean_embeddings.json')
