diff --git a/README.md b/README.md
index 1b9e43e..f654667 100644
--- a/README.md
+++ b/README.md
@@ -27,6 +27,10 @@ More details of the steps included in this project, and running instructions, ca
 3. [skills_extraction](skills_taxonomy_v2/pipeline/skills_extraction/README.md) - Extracting skills from skill sentences.
 4. [skills_taxonomy](skills_taxonomy_v2/pipeline/skills_taxonomy/README.md) - Building the skills taxonomy from extracted skills.
 
+### Extract skills example
+
+A simple example of extracting skills from a toy dataset of job adverts is given in [the examples folder](skills_taxonomy_v2/examples/extract_skills.py).
+
 ### Analysis
 
 This repository also contains various pieces of analysis of the taxonomy. These are discussed in the main analysis [README file](skills_taxonomy_v2/analysis/README.md).
diff --git a/skills_taxonomy_v2/examples/Extract Skills.py b/skills_taxonomy_v2/examples/Extract Skills.py
new file mode 100644
index 0000000..3d6aa6a
--- /dev/null
+++ b/skills_taxonomy_v2/examples/Extract Skills.py
@@ -0,0 +1,93 @@
+# ---
+# jupyter:
+#   jupytext:
+#     cell_metadata_filter: -all
+#     comment_magics: true
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#     jupytext_version: 1.11.4
+#   kernelspec:
+#     display_name: Python 3 (ipykernel)
+#     language: python
+#     name: python3
+# ---
+
+# %%
+# cd ../..
+
+# %%
+import json
+
+import pandas as pd
+import matplotlib.pyplot as plt
+
+# %%
+from skills_taxonomy_v2.examples.extract_skills import (
+    load_prerequisites, split_skills, predict_skill_sents, reduce_embeddings, get_skill_name, cluster_embeddings
+)
+from skills_taxonomy_v2.pipeline.skills_extraction.get_sentence_embeddings import get_embeddings
+
+# %%
+sent_classifier_model_dir = 'outputs/sentence_classifier/models/2021_08_16.pkl'
+job_adverts_file = 'skills_taxonomy_v2/examples/job_advert_examples.txt'
+
+# Parameters - these will need tweaking depending on your input data
+reduction_n_neighbors = 6
+reduction_min_dist = 0.0
+clustering_eps = 1
+clustering_min_samples = 1
+
+# %%
+nlp, bert_vectorizer, sent_classifier = load_prerequisites(sent_classifier_model_dir)
+
+# Load your job advert texts; a list of dicts with the keys "full_text" and "job_id"
+with open(job_adverts_file) as f:
+    job_adverts = json.load(f)
+
+# Run the pipeline to extract skills
+all_job_ids, all_sentences = split_skills(job_adverts)
+skill_sentences_dict = predict_skill_sents(sent_classifier, all_job_ids, all_sentences)
+sentence_embeddings, original_sentences = get_embeddings(skill_sentences_dict, nlp, bert_vectorizer)
+sentences_data_df = reduce_embeddings(sentence_embeddings, original_sentences, reduction_n_neighbors, reduction_min_dist)
+sentences_clustered = cluster_embeddings(sentences_data_df, clustering_eps, clustering_min_samples)
+
+# %% [markdown]
+# ## Look at skills extracted
+
+# %%
+skill_name_dict = sentences_clustered.groupby('cluster_number').apply(lambda x: get_skill_name(x)).to_dict()
+job_skills_dict = sentences_clustered.groupby('job id')['cluster_number'].unique().to_dict()
+
+for job_advert in job_adverts:
+    job_advert["Skills"] = [skill_name_dict[skill_num] for skill_num in job_skills_dict[job_advert['job_id']]]
+
+print(f"There are {len(skill_name_dict)} skills extracted using this data")
+for i in range(3):
+    print(f'The job advert: \n{job_adverts[i]["full_text"]} \nHas skills: \n{job_adverts[i]["Skills"]}\n')
+
+# %% [markdown]
+# ## Plot sentences coloured by which skill they are assigned to
+
+# %%
+fig, ax = plt.subplots(figsize=(8,8))
+# plot
+sentences_clustered.plot.scatter(
+    x = 'reduced_points x',
+    y = 'reduced_points y',
+    c = 'cluster_number',
+    cmap = "rainbow",
+    colorbar=False,
+    ax=ax, s=50, alpha=0.6);
+ax.xaxis.set_visible(False)
+ax.yaxis.set_visible(False)
+ax.axis('off')
+# annotate points in axis
+for idx, row in sentences_clustered.iterrows():
+    ax.annotate(row['original sentence'], (row['reduced_points x'], row['reduced_points y']))
+
+# %%
diff --git a/skills_taxonomy_v2/examples/extract_skills.py b/skills_taxonomy_v2/examples/extract_skills.py
new file mode 100644
index 0000000..92fd314
--- /dev/null
+++ b/skills_taxonomy_v2/examples/extract_skills.py
@@ -0,0 +1,210 @@
+"""
+This pipeline extracts skills from an input list of job adverts.
+
+Prerequisites:
+- You have access to our S3 bucket or have the sentence classifier pkl file stored locally
+- You have installed the environment requirements for this repo
+
+Notes:
+- This script is not optimised for processing 1000s of job adverts, so we recommend modifying this
+code if you want to process many job adverts.
+- You are advised to tune the dimension reduction and clustering parameters, as these are
+very sensitive to the input data.
+- The variable sentences_clustered contains the information needed for plotting sentences in 2D
+and for finding which specific sentences were assigned to each skill.
+
+Running this script with the default job_advert_examples.txt file will print out:
+
+The job advert:
+This is a sentence about the company and the salary. We require applicants to have skills in Microsoft Excel.
+Has skills:
+['microsoft-excel-require']
+
+The job advert:
+We want Microsoft Excel skills for this role. Communication skills are also essential.
+Has skills:
+['microsoft-excel-require', 'communication-important-essential']
+
+The job advert:
+This role has a very competitive starting salary. Skills for good communication are very important.
+Has skills:
+['communication-important-essential']
+
+"""
+
+from skills_taxonomy_v2.pipeline.sentence_classifier.utils import split_sentence
+from skills_taxonomy_v2.pipeline.sentence_classifier.sentence_classifier import (
+    SentenceClassifier, BertVectorizer
+)
+from skills_taxonomy_v2.pipeline.skills_extraction.get_sentence_embeddings import get_embeddings
+from skills_taxonomy_v2.pipeline.skills_extraction.cluster_embeddings import ClusterEmbeddings
+
+import umap.umap_ as umap
+import spacy
+import pandas as pd
+
+from tqdm import tqdm
+from collections import Counter, defaultdict
+import pickle
+import json
+import os
+import logging
+logger = logging.getLogger(__name__)
+
+sent_classifier_model_dir = 'outputs/sentence_classifier/models/2021_08_16.pkl'
+job_adverts_file = 'skills_taxonomy_v2/examples/job_advert_examples.txt'
+
+# Parameters - these will need tweaking depending on your input data
+reduction_n_neighbors = 6
+reduction_min_dist = 0.0
+clustering_eps = 1
+clustering_min_samples = 1
+
+def load_prerequisites(sent_classifier_model_dir):
+
+    nlp = spacy.load("en_core_web_sm")
+    bert_vectorizer = BertVectorizer(
+        bert_model_name="sentence-transformers/all-MiniLM-L6-v2",
+    )
+    bert_vectorizer.fit()
+
+    # Load the sentence classifier (either locally or from S3)
+    sent_classifier = SentenceClassifier(bert_model_name="bert-base-uncased")
+
+    if os.path.exists(os.path.join(os.getcwd(), sent_classifier_model_dir)):
+        # pickle.load needs a file object, not a file path
+        with open(sent_classifier_model_dir, "rb") as f:
+            sent_classifier.classifier = pickle.load(f)
+        sent_classifier.load_bert()
+    else:
+        logger.info("Sentence classifier not found locally, so trying to find it on S3")
+        sent_classifier.load_model(sent_classifier_model_dir.split('/')[-1])
+
+    return nlp, bert_vectorizer, sent_classifier
+
+def split_skills(job_adverts):
+
+    all_job_ids = []
+    all_sentences = []
+    for job_advert in job_adverts:
+        job_id, sentences = split_sentence(job_advert, min_length=30)
+        all_job_ids += [job_id] * len(sentences)
+        all_sentences += sentences
+
+    return all_job_ids, all_sentences
+
+def predict_skill_sents(sent_classifier, all_job_ids, all_sentences):
+
+    sentences_vec = sent_classifier.transform(all_sentences)
+    sentences_pred = sent_classifier.predict(sentences_vec)
+
+    skill_sentences_dict = defaultdict(list)
+    for job_id, sent, pred in zip(all_job_ids, all_sentences, sentences_pred):
+        if pred == 1:
+            skill_sentences_dict[job_id].append(sent)
+
+    return skill_sentences_dict
+
+def reduce_embeddings(sentence_embeddings, original_sentences, reduction_n_neighbors, reduction_min_dist):
+
+    reducer_class = umap.UMAP(
+        n_neighbors=reduction_n_neighbors,
+        min_dist=reduction_min_dist,
+        random_state=42,
+        n_components=2,
+    )
+    reducer_class.fit([e for _, _, _, e in sentence_embeddings])
+
+    sent_thresh = 250
+    mask_seq = "[MASK]"
+    embedding_list = []
+    words_list = []
+    sentence_list = []
+    jobid_list = []
+    sentid_list = []
+    for job_id, sent_id, words, embedding in sentence_embeddings:
+        original_sentence = original_sentences[sent_id]
+        if len(original_sentence) < sent_thresh:
+            embedding_list.append(embedding)
+            words_list.append(words.replace(mask_seq, "").split())
+            sentence_list.append(original_sentence)
+            jobid_list.append(job_id)
+            sentid_list.append(sent_id)
+
+    sentence_embeddings_red = reducer_class.transform(embedding_list).tolist()
+    sentences_data = {
+        "description": words_list,
+        "original sentence": sentence_list,
+        "job id": jobid_list,
+        "sentence id": sentid_list,
+        "embedding": sentence_embeddings_red,
+    }
+
+    sentences_data_df = pd.DataFrame(sentences_data)
+
+    sentences_data_df["reduced_points x"] = sentences_data_df["embedding"].apply(lambda x: x[0])
+    sentences_data_df["reduced_points y"] = sentences_data_df["embedding"].apply(lambda x: x[1])
+    sentences_data_df["original sentence length"] = sentences_data_df["original sentence"].apply(lambda x: len(x))
+
+    return sentences_data_df
+
+def get_skill_name(skill_data):
+    """
+    Use the most common words in the skill descriptions as a way to name skills
+    """
+    common_description_words = Counter([v for d in skill_data['description'] for v in d]).most_common(3)
+    return '-'.join([c[0] for c in common_description_words])
+
+def cluster_embeddings(sentences_data_df, clustering_eps, clustering_min_samples):
+
+    cluster_embeddings = ClusterEmbeddings(
+        dbscan_eps=clustering_eps,
+        dbscan_min_samples=clustering_min_samples,
+        train_cluster_n=len(sentences_data_df),
+    )
+    _ = cluster_embeddings.get_clusters(sentences_data_df)
+    sentences_clustered = cluster_embeddings.sentences_data_short_sample
+
+    return sentences_clustered
+
+if __name__ == '__main__':
+
+    logger.info("Loading pre-trained models and data ...")
+
+    # Load pre-trained models needed for this pipeline
+    nlp, bert_vectorizer, sent_classifier = load_prerequisites(sent_classifier_model_dir)
+
+    # Load your job advert texts; a list of dicts with the keys "full_text" and "job_id"
+    with open(job_adverts_file) as f:
+        job_adverts = json.load(f)
+
+    # Run the pipeline to extract skills
+    logger.info("Split the sentences ...")
+    all_job_ids, all_sentences = split_skills(job_adverts)
+
+    logger.info("Predict skill sentences ...")
+    skill_sentences_dict = predict_skill_sents(sent_classifier, all_job_ids, all_sentences)
+
+    logger.info("Embed skill sentences ...")
+    sentence_embeddings, original_sentences = get_embeddings(skill_sentences_dict, nlp, bert_vectorizer)
+
+    logger.info("Reduce embeddings ...")
+    sentences_data_df = reduce_embeddings(sentence_embeddings, original_sentences, reduction_n_neighbors, reduction_min_dist)
+
+    logger.info("Cluster the reduced embeddings ...")
+    sentences_clustered = cluster_embeddings(sentences_data_df, clustering_eps, clustering_min_samples)
+
+    logger.info("Summarise skill information ...")
+    skill_name_dict = sentences_clustered.groupby('cluster_number').apply(lambda x: get_skill_name(x)).to_dict()
+    job_skills_dict = sentences_clustered.groupby('job id')['cluster_number'].unique().to_dict()
+
+    for job_advert in job_adverts:
+        job_advert["Skills"] = [skill_name_dict[skill_num] for skill_num in job_skills_dict[job_advert['job_id']]]
+
+    print(f"There are {len(skill_name_dict)} skills extracted using this data")
+    for i in range(3):
+        print(f'The job advert: \n{job_adverts[i]["full_text"]} \nHas skills: \n{job_adverts[i]["Skills"]}\n')
+
diff --git a/skills_taxonomy_v2/examples/job_advert_examples.txt b/skills_taxonomy_v2/examples/job_advert_examples.txt
new file mode 100644
index 0000000..9695037
--- /dev/null
+++ b/skills_taxonomy_v2/examples/job_advert_examples.txt
@@ -0,0 +1 @@
+[{"full_text": "This is a sentence about the company and the salary. We require applicants to have skills in Microsoft Excel.", "job_id": 0 }, {"full_text": "We want Microsoft Excel skills for this role. Communication skills are also essential.", "job_id": 1 }, {"full_text": "This role has a very competitive starting salary. Skills for good communication are very important.", "job_id": 2 }]
\ No newline at end of file
diff --git a/skills_taxonomy_v2/pipeline/skills_extraction/cluster_embeddings.py b/skills_taxonomy_v2/pipeline/skills_extraction/cluster_embeddings.py
index 4270bc2..68ed485 100644
--- a/skills_taxonomy_v2/pipeline/skills_extraction/cluster_embeddings.py
+++ b/skills_taxonomy_v2/pipeline/skills_extraction/cluster_embeddings.py
@@ -52,14 +52,15 @@ def load_process_sentence_data(s3, reduced_embeddings_paths):
 class ClusterEmbeddings():
     def __init__(
         self,
-        dbscan_eps,
-        dbscan_min_samples,
-        max_length,
-        train_cluster_n,
-        train_cluster_rand_seed,
-        small_cluster_size_threshold,
-        max_centroid_dist_before_merge
+        dbscan_eps=0.01,
+        dbscan_min_samples=4,
+        max_length=100,
+        train_cluster_n=300000,
+        train_cluster_rand_seed=42,
+        small_cluster_size_threshold=10,
+        max_centroid_dist_before_merge=0.05
     ):
+
         self.dbscan_eps = dbscan_eps
         self.dbscan_min_samples = dbscan_min_samples
         self.max_length = max_length
@@ -112,7 +113,8 @@ def merge_clusters(self):
         logger.info(f"Merging small clusters where suitable ...")
         cluster_centroids = self.sentences_data_short_sample.groupby('cluster_number')["embedding"].apply(
             lambda x: np.mean(x.tolist(), axis=0).tolist()).to_dict()
-        _ = cluster_centroids.pop(-1)
+        if -1 in cluster_centroids:
+            _ = cluster_centroids.pop(-1)
 
         all_cluster_size_dict = {k:v for k,v in Counter(clustering_number).items() if k!=-1}
         small_cluster_size_dict = {k:v for k,v in all_cluster_size_dict.items() if v < self.small_cluster_size_threshold}
@@ -160,7 +162,8 @@ def predict_clusters(self, sentences_data):
         # Re-calculate cluster centroids
         merged_cluster_centroids = self.sentences_data_short_sample.groupby('Merged clusters')["embedding"].apply(
             lambda x: np.mean(x.tolist(), axis=0).tolist()).to_dict()
-        _ = merged_cluster_centroids.pop(-1)
+        if -1 in merged_cluster_centroids:
+            _ = merged_cluster_centroids.pop(-1)
         # Careful of order of list when it came from a dict (can get messed up)
         merged_cluster_embeddings = list(merged_cluster_centroids.values())
         merged_cluster_nums = list(merged_cluster_centroids.keys())
diff --git a/skills_taxonomy_v2/pipeline/skills_extraction/get_sentence_embeddings.py b/skills_taxonomy_v2/pipeline/skills_extraction/get_sentence_embeddings.py
index c45be9b..3091ec3 100644
--- a/skills_taxonomy_v2/pipeline/skills_extraction/get_sentence_embeddings.py
+++ b/skills_taxonomy_v2/pipeline/skills_extraction/get_sentence_embeddings.py
@@ -59,6 +59,51 @@ with open(custom_stopwords_dir) as file:
     custom_stopwords = file.read().splitlines()
 
+def get_embeddings(data, nlp, bert_vectorizer, token_len_threshold=20, stopwords=stopwords.words(), custom_stopwords=custom_stopwords):
+    logger.info(f"Processing {len(data)} sentences...")
+    start_time = time.time()
+
+    # For each sentence mask out stop words, proper nouns etc.
+    masked_sentences = []
+    sentence_job_ids = []
+    sentence_hashes = []
+    original_sentences = {}
+
+    for job_id, sentences in tqdm(data.items()):
+        for sentence in sentences:
+            masked_sentence = process_sentence_mask(
+                sentence,
+                nlp,
+                bert_vectorizer,
+                token_len_threshold,
+                stopwords=stopwords,
+                custom_stopwords=custom_stopwords,
+            )
+            if masked_sentence.replace("[MASK]", "").replace(" ", ""):
+                # Don't include sentence if it only consists of masked words
+                masked_sentences.append(masked_sentence)
+                sentence_job_ids.append(job_id)
+                # Keep a record of the original sentence via a hashed id
+                original_sentence_id = hash(sentence)
+                sentence_hashes.append(original_sentence_id)
+                original_sentences[original_sentence_id] = sentence
+    logger.info(f"Processing sentences took {time.time() - start_time} seconds")
+
+    logger.info(f"Getting embeddings for {len(masked_sentences)} sentences...")
+    start_time = time.time()
+    # Find sentence embeddings in bulk for all masked sentences
+    masked_sentence_embeddings = bert_vectorizer.transform(masked_sentences)
+    output_tuple_list = [
+        (job_id, sent_id, sent, emb.tolist())
+        for job_id, sent_id, sent, emb in zip(
+            sentence_job_ids,
+            sentence_hashes,
+            masked_sentences,
+            masked_sentence_embeddings,
+        )
+    ]
+    logger.info(f"Getting embeddings took {time.time() - start_time} seconds")
+    return output_tuple_list, original_sentences
 
 
 def parse_arguments(parser):
     parser.add_argument(
@@ -135,49 +180,8 @@ def try_less_data(s3, bucket_name, data, output_file_dir, stop_recursion=False):
         )
         data = load_s3_data(s3, BUCKET_NAME, data_path)
 
-        logger.info(f"Processing {len(data)} sentences...")
-        start_time = time.time()
-        # For each sentence mask out stop words, proper nouns etc.
-        masked_sentences = []
-        sentence_job_ids = []
-        sentence_hashes = []
-        original_sentences = {}
-
-        for job_id, sentences in tqdm(data.items()):
-            for sentence in sentences:
-                masked_sentence = process_sentence_mask(
-                    sentence,
-                    nlp,
-                    bert_vectorizer,
-                    token_len_threshold,
-                    stopwords=stopwords.words(),
-                    custom_stopwords=custom_stopwords,
-                )
-                if masked_sentence.replace("[MASK]", "").replace(" ", ""):
-                    # Don't include sentence if it only consists of masked words
-                    masked_sentences.append(masked_sentence)
-                    sentence_job_ids.append(job_id)
-                    # Keep a record of the original sentence via a hashed id
-                    original_sentence_id = hash(sentence)
-                    sentence_hashes.append(original_sentence_id)
-                    original_sentences[original_sentence_id] = sentence
-        logger.info(f"Processing sentences took {time.time() - start_time} seconds")
-
-        logger.info(f"Getting embeddings for {len(masked_sentences)} sentences...")
-        start_time = time.time()
-        # Find sentence embeddings in bulk for all masked sentences
-        masked_sentence_embeddings = bert_vectorizer.transform(masked_sentences)
-        output_tuple_list = [
-            (job_id, sent_id, sent, emb.tolist())
-            for job_id, sent_id, sent, emb in zip(
-                sentence_job_ids,
-                sentence_hashes,
-                masked_sentences,
-                masked_sentence_embeddings,
-            )
-        ]
-        logger.info(f"Getting embeddings took {time.time() - start_time} seconds")
-
+        output_tuple_list, original_sentences = get_embeddings(data, nlp, bert_vectorizer, token_len_threshold=token_len_threshold, stopwords=stopwords.words(), custom_stopwords=custom_stopwords)
+
         # Save the output in a folder with a similar naming structure to the input
         data_dir = os.path.relpath(data_path, skill_sentences_dir)
         output_file_dir = os.path.join(
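
Usage note for reviewers: to try the new example on your own adverts rather than the bundled toy file, the input only needs to be a JSON list of dicts with "full_text" and "job_id" keys, matching job_advert_examples.txt above. A minimal sketch (the my_job_adverts.json path and the advert texts below are made up purely for illustration) could be:

    # Sketch: build an input file in the same format as job_advert_examples.txt,
    # then point job_adverts_file in skills_taxonomy_v2/examples/extract_skills.py at it
    # and run: python skills_taxonomy_v2/examples/extract_skills.py
    import json

    my_job_adverts = [
        {"full_text": "We are looking for someone with strong Python and SQL skills.", "job_id": 0},
        {"full_text": "Experience managing stakeholders and communicating results is essential.", "job_id": 1},
    ]

    # Hypothetical path; any local path works as long as job_adverts_file is updated to match
    with open("my_job_adverts.json", "w") as f:
        json.dump(my_job_adverts, f)

As the script's docstring notes, the dimension reduction and clustering parameters at the top of extract_skills.py will almost certainly need re-tuning for anything larger than the toy dataset.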