diff --git a/README.md b/README.md
index b876a89..281ff83 100644
--- a/README.md
+++ b/README.md
@@ -24,6 +24,30 @@ poetry install
 ```
 
+## Usage
+
+```python
+from nlp_link.linker import NLPLinker
+
+nlp_link = NLPLinker()
+
+# dict inputs
+comparison_data = {'a': 'cats', 'b': 'dogs', 'd': 'rats', 'e': 'birds'}
+input_data = {'x': 'owls', 'y': 'feline', 'z': 'doggies', 'za': 'dogs', 'zb': 'chair'}
+nlp_link.load(comparison_data)
+matches = nlp_link.link_dataset(input_data)
+# Top match output
+print(matches)
+
+# list inputs
+comparison_data = ['cats', 'dogs', 'rats', 'birds']
+input_data = ['owls', 'feline', 'doggies', 'dogs', 'chair']
+nlp_link.load(comparison_data)
+matches = nlp_link.link_dataset(input_data)
+# Top match output
+print(matches)
+```
+
 ## Tests
 
 To run tests:
 
@@ -39,7 +63,7 @@
 
 Docs for this repo are automatically published to the gh-pages branch via GitHub Actions.
 
 However, if you are editing the docs you can test them out locally by running
 
 ```
-cd guidelines
-pip install -r docs/requirements.txt
+cd docs
 mkdocs serve
 ```
diff --git a/docs/README.md b/docs/README.md
index 5a47f18..6398ce2 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -1,5 +1,104 @@
-# nlp-link
+# 🖇️ NLP Link
 
-Documentation for NLP Link
+NLP Link finds the most similar word (or words) in a reference list to an inputted word. For example, if you are trying to find which word is most similar to 'puppies' from a reference list of `['cats', 'dogs', 'rats', 'birds']`, nlp-link will return 'dogs'.
 
-- [Page1](./page1.md)
+This package can also use the linking methodology to find the [SOC](https://www.ons.gov.uk/methodology/classificationsandstandards/standardoccupationalclassificationsoc) code most similar to an inputted job title. More on this [here](./page1.md).
+
+## 🔨 Usage
+
+Install the package using pip:
+
+```bash
+pip install nlp-link
+```
+
+### Basic usage
+
+Match two lists in Python:
+
+```python
+from nlp_link.linker import NLPLinker
+
+nlp_link = NLPLinker()
+
+# list inputs
+comparison_data = ['cats', 'dogs', 'rats', 'birds']
+input_data = ['owls', 'feline', 'doggies', 'dogs', 'chair']
+nlp_link.load(comparison_data)
+matches = nlp_link.link_dataset(input_data)
+# Top match output
+print(matches)
+```
+
+Which outputs:
+
+```
+  input_id input_text  link_id link_text  similarity
+0        0       owls        3     birds    0.613577
+1        1     feline        0      cats    0.669633
+2        2    doggies        1      dogs    0.757443
+3        3       dogs        1      dogs    1.000000
+4        4      chair        0      cats    0.331178
+```
+
+### Extended usage
+
+Match using dictionary inputs (where the key is a unique ID):
+
+```python
+from nlp_link.linker import NLPLinker
+
+nlp_link = NLPLinker()
+
+# dict inputs
+comparison_data = {'a': 'cats', 'b': 'dogs', 'd': 'rats', 'e': 'birds'}
+input_data = {'x': 'owls', 'y': 'feline', 'z': 'doggies', 'za': 'dogs', 'zb': 'chair'}
+nlp_link.load(comparison_data)
+matches = nlp_link.link_dataset(input_data)
+# Top match output
+print(matches)
+```
+
+Which outputs:
+
+```
+  input_id input_text link_id link_text  similarity
+0        x       owls       e     birds    0.613577
+1        y     feline       a      cats    0.669633
+2        z    doggies       b      dogs    0.757443
+3       za       dogs       b      dogs    1.000000
+4       zb      chair       a      cats    0.331178
+```
+
+Output several of the most similar matches using the `top_n` argument (`format_output` needs to be set to False for this):
+
+```python
+from nlp_link.linker import NLPLinker
+
+nlp_link = NLPLinker()
+
+comparison_data = {'a': 'cats', 'b': 'dogs', 'c': 'kittens', 'd': 'rats', 'e': 'birds'}
+input_data = {'x': 'pets', 'y': 'feline'}
+nlp_link.load(comparison_data)
+matches = nlp_link.link_dataset(input_data, top_n=2, format_output=False)
+# Top match output
+print(matches)
+# Format output for ease of reading
+print({input_data[k]: [comparison_data[r] for r, _ in v] for k, v in matches.items()})
+```
+
+Which will output:
+
+```
+{'x': [['b', 0.8171109], ['a', 0.7650396]], 'y': [['a', 0.6696329], ['c', 0.5778763]]}
+{'pets': ['dogs', 'cats'], 'feline': ['cats', 'kittens']}
+```
+
+The `drop_most_similar` argument can be set to True if you don't want to output the most similar match - this might be the case if you were comparing a list with itself. For this you would run `nlp_link.link_dataset(input_data, drop_most_similar=True)`.
diff --git a/docs/mkdocs.yaml b/docs/mkdocs.yaml
index 7e1a52a..7cc1433 100644
--- a/docs/mkdocs.yaml
+++ b/docs/mkdocs.yaml
@@ -39,6 +39,6 @@ theme:
         name: Switch to light mode
 nav:
   - Home: README.md
-  - Page 1: page1.md
+  - SOCMapper: page1.md
 plugins:
   - same-dir
diff --git a/docs/page1.md b/docs/page1.md
index 65e1cf4..ca22904 100644
--- a/docs/page1.md
+++ b/docs/page1.md
@@ -1 +1,27 @@
-## Title
+# 🗺️ SOC Mapper
+
+The SOC mapper relies on the [SOC coding index](https://www.ons.gov.uk/methodology/classificationsandstandards/standardoccupationalclassificationsoc/soc2020/soc2020volume2codingrulesandconventions) released by the ONS. This dataset contains over 30,000 job titles along with their SOC codes.
+
+The `SOCMapper` class in `soc_map.py` maps job title(s) to SOC(s).
+ +## 🔨 Core functionality + +``` +from nlp_link.soc_mapper.soc_map import SOCMapper + +soc_mapper = SOCMapper() +soc_mapper.load() +job_titles=["data scientist", "Assistant nurse", "Senior financial consultant - London"] + +soc_mapper.get_soc(job_titles, return_soc_name=True) +``` + +Which will output + +``` +[((('2433/04', 'Statistical data scientists'), ('2433', 'Actuaries, economists and statisticians'), '2425'), 'Data scientist'), ((('6131/99', 'Nursing auxiliaries and assistants n.e.c.'), ('6131', 'Nursing auxiliaries and assistants'), '6141'), 'Assistant nurse'), ((('2422/02', 'Financial advisers and planners'), ('2422', 'Finance and investment analysts and advisers'), '3534'), 'Financial consultant')] +``` + +## 📖 Read more + +Read more about the methods and evaluation of the SOCMapper [here](https://github.com/nestauk/nlp-link/soc_mapper/README.md). diff --git a/nlp_link/linker.py b/nlp_link/linker.py index 6a60dd5..f06f5a9 100644 --- a/nlp_link/linker.py +++ b/nlp_link/linker.py @@ -25,19 +25,18 @@ """ -from sentence_transformers import SentenceTransformer -import torch from tqdm import tqdm from sklearn.metrics.pairwise import cosine_similarity import numpy as np import pandas as pd from typing import Union, Optional -import logging -from nlp_link.linker_utils import chunk_list +from nlp_link.linker_utils import chunk_list, get_embeddings, load_bert -logger = logging.getLogger(__name__) +from wasabi import msg, Printer + +msg_print = Printer() # TO DO: cosine or euclidean? @@ -76,16 +75,16 @@ def _process_dataset( try: return dict(zip(input_data[id_column], input_data[text_column])) except: - logger.warning( + msg.warn( "Input is a dataframe, please specify id_column and text_column" ) else: - logger.warning( + msg.warn( "The input_data input must be a dictionary, a list or pandas dataframe" ) if not isinstance(input_data[0], str): - logger.warning( + msg.warn( "The input_data input must be a list of texts, or a dictionary where the values are texts" ) @@ -100,12 +99,7 @@ def load( A list of texts or a dictionary of texts where the key is the unique id. If a list is given then a unique id will be assigned with the index order. """ - logger.info("Loading model") - device = torch.device(f"cuda:0" if torch.cuda.is_available() else "cpu") - self.bert_model = SentenceTransformer( - "sentence-transformers/all-MiniLM-L6-v2", device=device - ) - self.bert_model.max_seq_length = 512 + self.bert_model = load_bert() self.comparison_data = self._process_dataset(comparison_data) self.comparison_data_texts = list(self.comparison_data.values()) @@ -123,19 +117,12 @@ def _get_embeddings(self, text_list: list) -> np.array: np.array: The embeddings for the input list of texts """ - logger.info( - f"Finding embeddings for {len(text_list)} texts chunked into {round(len(text_list)/self.embed_chunk_size)} chunks" + return get_embeddings( + text_list=text_list, + embed_chunk_size=self.embed_chunk_size, + batch_size=self.batch_size, + bert_model=self.bert_model, ) - all_embeddings = [] - for batch_texts in tqdm(chunk_list(text_list, self.embed_chunk_size)): - all_embeddings.append( - self.bert_model.encode( - np.array(batch_texts), batch_size=self.batch_size - ) - ) - all_embeddings = np.concatenate(all_embeddings) - - return all_embeddings def get_matches( self, @@ -161,7 +148,7 @@ def get_matches( dict: The top matches for each input id. 
""" - logger.info( + msg.info( f"Finding the top dataset matches for {len(input_data_ids)} input texts chunked into {round(len(input_data_ids)/self.match_chunk_size)}" ) @@ -222,11 +209,11 @@ def link_dataset( """ try: - logger.info( + msg.info( f"Comparing {len(input_data)} input texts to {len(self.comparison_embeddings)} comparison texts" ) except: - logger.warning( + msg.warning( "self.comparison_embeddings does not exist - you may have not run load()" ) diff --git a/nlp_link/linker_utils.py b/nlp_link/linker_utils.py index 3618a54..1746610 100644 --- a/nlp_link/linker_utils.py +++ b/nlp_link/linker_utils.py @@ -1,3 +1,56 @@ +from tqdm import tqdm + +import numpy as np +from sentence_transformers import SentenceTransformer +import torch + +from wasabi import msg, Printer + +msg_print = Printer() + + def chunk_list(orig_list, n_chunks): for i in range(0, len(orig_list), n_chunks): yield orig_list[i : i + n_chunks] + + +def load_bert(bert_model_name="sentence-transformers/all-MiniLM-L6-v2"): + + with msg_print.loading("Loading BERT model"): + device = torch.device(f"cuda:0" if torch.cuda.is_available() else "cpu") + bert_model = SentenceTransformer(bert_model_name, device=device) + bert_model.max_seq_length = 512 + msg.good("BERT model loaded") + return bert_model + + +def get_embeddings( + text_list: list, + bert_model, + embed_chunk_size: int = 500, + batch_size: int = 32, +) -> np.array: + """ + Get embeddings for a list of texts + + Args: + text_list (list): A lists of texts. + bert_model: An initialised SentenceTransformer BERT model. + embed_chunk_size (int): The number of texts per chunk to process. + batch_size (int): BERT batch_size. + Returns: + np.array: The embeddings for the input list of texts + """ + + msg.info( + f"Finding embeddings for {len(text_list)} texts chunked into {round(len(text_list)/embed_chunk_size)} chunks" + ) + all_embeddings = [] + for batch_texts in tqdm(chunk_list(text_list, embed_chunk_size)): + all_embeddings.append( + bert_model.encode(np.array(batch_texts), batch_size=batch_size) + ) + all_embeddings = np.concatenate(all_embeddings) + msg.good("Texts embedded.") + + return all_embeddings diff --git a/nlp_link/soc_mapper/README.md b/nlp_link/soc_mapper/README.md new file mode 100644 index 0000000..a3fd793 --- /dev/null +++ b/nlp_link/soc_mapper/README.md @@ -0,0 +1,236 @@ +# 🗺️ SOCMapper + +Key files and folders in this directory are: + +1. [soc_map.py](https://github.com/nestauk/nlp-link/soc_mapper/soc_map.py): The script containing the `SOCMapper` class. +2. [soc_map_utils.py](https://github.com/nestauk/nlp-link/soc_mapper/soc_map_utils.py): Functions for loading data and cleaning job titles for the `SOCMapper` class. +3. [config.yaml](ttps://github.com/nestauk/nlp-link/soc_mapper/config.yaml): The default arguments for the `SOCMapper` class. + +# 🗺️ SOC Mapper + +The SOC mapper relies on the [SOC coding index](https://www.ons.gov.uk/methodology/classificationsandstandards/standardoccupationalclassificationsoc/soc2020/soc2020volume2codingrulesandconventions) released by the ONS. This dataset contains over 30,000 job titles with the SOC code. + +The `SOCMapper` class in `soc_map.py` maps job title(s) to SOC(s). 
+
+## 🔨 Core functionality
+
+```python
+from nlp_link.soc_mapper.soc_map import SOCMapper
+
+soc_mapper = SOCMapper()
+soc_mapper.load()
+job_titles = ["data scientist", "Assistant nurse", "Senior financial consultant - London"]
+
+soc_mapper.get_soc(job_titles, return_soc_name=True)
+```
+
+### Modifications
+
+If you want to match job titles to a different, locally saved, version of a SOC coding index file, you can do this with:
+
+```python
+from nlp_link.soc_mapper.soc_map import SOCMapper
+
+soc_mapper = SOCMapper(soc_dir=LOCAL_DIR_OF_SOC_CODING_INDEX)
+soc_mapper.load(save_embeds=True)
+```
+
+Where `LOCAL_DIR_OF_SOC_CODING_INDEX` is the location of your locally saved version of the SOC coding index xlsx file, e.g. `data/soc2020volume2thecodingindexexcel22022024.xlsx`. If `save_embeds=True` then the embeddings will be saved in this same directory.
+
+If this file has different column names from what is outlined in `nlp_link/soc_mapper/config.yaml`, then you can edit them individually by running:
+
+```python
+from nlp_link.soc_mapper.soc_map import SOCMapper
+
+soc_mapper = SOCMapper(soc_dir=LOCAL_DIR_OF_SOC_CODING_INDEX)
+soc_mapper.soc_mapper_config['sheet_name'] = 'Name of new sheet name'
+soc_mapper.load(save_embeds=True)
+```
+
+## 📤 Output
+
+The output for one job title is in the format:
+
+```
+(((SOC 2020 Extension code, SOC 2020 Extension name), (SOC 2020 4-digit code, SOC 2020 4-digit name), SOC 2010 code), job title matched to in SOC data)
+```
+
+for example
+
+```
+((('2422/02', 'Financial advisors and planners'), ('2422', 'Finance and investment analysts and advisers'), '3534'), 'financial consultant')
+```
+
+If the names of the SOC codes aren't needed then you can set `return_soc_name=False`. The variables `soc_mapper.soc_2020_6_dict` and `soc_mapper.soc_2020_4_dict` give the names of the SOC 2020 6-digit and 4-digit codes.
+
+The following table gives the results of using the SOCMapper on the job titles in the "Input job title" column.
+
+| Input job title | SOC 2020 EXT code | SOC 2020 sub-unit group | SOC 2020 unit group | SOC 2010 code | SOC data job title |
+| ------------------------------------ | ----------------- | ----------------------------------------- | -------------------------------------------- | ------------- | -------------------- |
+| data scientist | 2433/04 | Statistical data scientists | Actuaries, economists and statisticians | 2425 | Data scientist |
+| Assistant nurse | 6131/99 | Nursing auxiliaries and assistants n.e.c. | Nursing auxiliaries and assistants | 6141 | Assistant nurse |
+| Senior financial consultant - London | 2422/02 | Financial advisors and planners | Finance and investment analysts and advisers | 3534 | Financial consultant |
+
+## 🖊️ Methodology
+
+The SOCMapper works by finding the semantically closest job titles between the inputted job titles and the job titles in the ONS SOC dataset. An overview of the methodology is in the diagram below.
+
+![An overview diagram of the SOCMapper methodology](SOCMapper_overview.jpg)
+
+**Step 1:** We clean the inputted job title. This cleaning involves removing words which describe the job conditions rather than the job itself, e.g. common placenames or words like "part-time".
+
+For example, if our inputted job adverts were
+
+```
+["Data Scientist - London", "Electric motor assembler - part time", "Data visualisation developer £30k per annum"]
+```
+
+these would be cleaned to
+
+```
+['Data Scientist', 'Electric motor assembler', 'Data visualisation developer']
+```
+
+**Step 2:** We process the [ONS SOC coding index](https://www.ons.gov.uk/methodology/classificationsandstandards/standardoccupationalclassificationsoc/standardoccupationalclassificationsocextensionproject). An example of this dataset is:
+
+| SOC 2010 | SOC 2020 | SOC 2020 Ext Code | INDEXOCC | ADD | IND | INDEXOCC NATURAL WORD ORDER | SOC 2020 UNIT GROUP DESCRIPTIONS | SUB-UNIT GROUP DESCRIPTIONS |
+| -------- | -------- | ----------------- | ------------------ | --------- | ----------- | --------------------------- | -------------------------------------------------- | ---------------------------------------------------------- |
+| 2425 | 2433 | 2433/04 | Scientist, data | | | Data scientist | Actuaries, economists and statisticians | Statistical data scientists |
+| 2136 | 2134 | 2134/99 | Analyst, data | computing | | data analyst | Programmers and software development professionals | Programmers and software development professionals n.e.c. |
+| 3539 | 3544 | 3544/00 | Analyst, data | | | data analyst | Data analysts | Data analysts |
+| 2136 | 2134 | 2134/03 | Developer, analyst | | | analyst developer | Programmers and software development professionals | Software developers |
+| 8139 | 8149 | 8149/00 | Assembler, meter | | | meter assembler | Assemblers and routine operatives n.e.c. | Assemblers and routine operatives n.e.c. |
+| 8131 | 8141 | 8141/00 | Assembler, motor | electric | | motor assembler | Assemblers (electrical and electronic products) | Assemblers (electrical and electronic products) |
+| 8132 | 8142 | 8142/02 | Assembler, motor | | engineering | motor assembler | Assemblers (vehicles and metal goods) | Vehicle and vehicle part assemblers |
+
+We can combine the `INDEXOCC NATURAL WORD ORDER`, `ADD` and `IND` columns to create unique job titles. The dictionary of unique job titles to SOC information would be:
+
+```
+{"data scientist": ("2433/04", "2433", "2425"), "data analyst computing": ("2134/99", "2134", "2136"), "data analyst": ("3544/00", "3544", "3539"), "analyst developer": ("2134/03", "2134", "2136"), "meter assembler": ("8149/00", "8149", "8139"), "motor assembler electric": ("8141/00", "8141", "8131"), "motor assembler engineering": ("8142/02", "8142", "8132")}
+```
+
+**Step 3:** We embed these unique ONS job titles and the input job title using the `all-MiniLM-L6-v2` Sentence Transformers pretrained model.
+
+**Step 4:** We then calculate the cosine similarity between the embedded input job title and all the embedded ONS job titles.
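+
+As a minimal, self-contained sketch of Steps 3 and 4 (assuming the `sentence-transformers` and `scikit-learn` libraries this package already depends on; within the package this logic is wrapped by `get_embeddings` in `linker_utils.py` and `NLPLinker.get_matches`):
+
+```python
+from sentence_transformers import SentenceTransformer
+from sklearn.metrics.pairwise import cosine_similarity
+
+ons_job_titles = ["data scientist", "data analyst computing", "motor assembler electric"]
+input_job_titles = ["Data Scientist", "Electric motor assembler"]
+
+# Step 3: embed both sets of job titles with the same pretrained model
+model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
+ons_embeddings = model.encode(ons_job_titles)
+input_embeddings = model.encode(input_job_titles)
+
+# Step 4: one row of similarity scores per input job title,
+# one column per ONS job title
+similarities = cosine_similarity(input_embeddings, ons_embeddings)
+print(similarities.round(2))
+```
+
+The resulting matrix of similarity scores is what the threshold rules in Step 5 (below) operate on.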
+
+In our example, the cosine similarity scores for each input job title (rows) and each ONS SOC data job title (columns) are:
+
+| | data scientist | data analyst computing | data analyst | analyst developer | meter assembler | motor assembler electric | motor assembler engineering |
+| ---------------------------- | -------------- | ---------------------- | ------------ | ----------------- | --------------- | ------------------------ | --------------------------- |
+| Data Scientist | **1.0** | 0.69 | 0.81 | 0.56 | 0.20 | 0.07 | 0.15 |
+| Electric motor assembler | 0.07 | 0.20 | 0.11 | 0.15 | 0.52 | **0.81** | 0.80 |
+| Data visualisation developer | 0.53 | 0.57 | 0.59 | 0.47 | 0.22 | 0.04 | 0.17 |
+
+**Step 5:** Finally, we find the SOC information for the ONS job title with the highest similarity, as long as it is over a certain threshold (the default is 0.67). If there is no single job title with a particularly high similarity, then we use a consensus approach at the SOC 2020 4-digit level (by default, at least 3 matches with over 0.5 similarity, where over half of them share the same 4-digit SOC).
+
+With the default values, the final matches for each inputted job title would be:
+
+| | ONS job title matched to | SOC | SOC description |
+| ---------------------------- | ------------------------ | ------- | ----------------------------------------------- |
+| Data Scientist | data scientist | 2433/04 | Statistical data scientists |
+| Electric motor assembler | motor assembler electric | 8141/00 | Assemblers (electrical and electronic products) |
+| Data visualisation developer | None | None | None |
+
+However, if we had set slightly different conditions for the consensus approach, another outcome could be that the "Data visualisation developer" job title was mapped to the SOC "2134 - Programmers and software development professionals", since 2 out of the 4 matches with over 0.45 similarity were from this 4-digit SOC.
+
+## 🤔 Evaluation
+
+To get the evaluation sample we found the most common job titles in Nesta's job advert dataset, and also took a random sample of job titles.
+
+These datasets were manually labelled with how well we thought the job title was matched to a SOC code. We chose 3 categories: excellent, good or poor.
+
+### From a **random sample** of 200 job titles:
+
+- 59.6% had a SOC 6-digit code matched
+- 5% were only able to have a SOC 4-digit code matched
+- 35.5% had no SOC matched
+
+Of the 118 job titles from the random sample with a SOC 6-digit code matched:
+
+- 66% had excellent quality SOC matches
+- 23% had good quality SOC matches
+- 11% had poor quality SOC matches
+
+Of the 10 job titles (5%) from the random sample with only a SOC 4-digit code matched:
+
+- 80% had excellent quality SOC matches
+- 10% had good quality SOC matches
+- 10% had poor quality SOC matches
+
+### We also labelled 300 of the **most commonly occurring** job titles in our dataset with quality measures.
+
+- 89% had a SOC 6-digit code matched
+- 4% were only able to have a SOC 4-digit code matched
+- 7% had no SOC matched
+
+Of the 255 most commonly occurring job titles with a SOC 6-digit code matched:
+
+- 82% had excellent quality SOC matches
+- 10% had good quality SOC matches
+- 8% had poor quality SOC matches
+
+Of the 20 most commonly occurring job titles with only a SOC 4-digit code matched:
+
+- 95% had excellent quality SOC matches
+- 5% had good quality SOC matches
+
+We note that the results for the most commonly occurring job titles are probably better because these job titles tend to be cleaner and more standardised.
+ +### Examples of **excellent** matches: + +| job_title | num_job_ads | prop_job_ads | soc_2020_6_name | occ_matched | match_prob | +| ----------------------------------------- | ----------- | ------------ | ---------------------------------------- | ------------------------ | ---------- | +| Care Assistant - Bank - Care Home | 22444 | 0.0031 | Domiciliary care workers | home care assistant | 0.78 | +| Pastry Demi Chef de Partie | 1 | 0.0000 | Chefs | chef de partie | 0.79 | +| Forklift Driver | 2922 | 0.0004 | Fork-lift truck drivers | fork lift truck driver | 0.88 | +| Finance ManagerRB | 1 | 0.0000 | Company secretaries and finance managers | finance manager | 0.85 | +| Service Engineer Carpentry and Decorating | 1 | 0.0000 | Painters and decorators | decorating contractor | 0.72 | +| Senior Software Engineer | 2681 | 0.0004 | Software developers | senior software engineer | 1.00 | +| Change Business Analyst - FMCG experience | 1 | 0.0000 | Business analysts | business change analyst | 0.69 | +| Private Client Solicitor | 5338 | 0.0007 | Solicitors and lawyers n.e.c. | solicitor | 0.75 | +| Internal Sales Executive | 2281 | 0.0003 | Business sales executives | sales executive | 0.85 | +| HR Advisor | 10386 | 0.0014 | Human resources advisors | human resources adviser | 0.85 | + +### Examples of **good** matches: + +| job_title | num_job_ads | prop_job_ads | soc_2020_6_name | occ_matched | match_prob | +| ---------------------------------------- | ----------- | ------------ | ----------------------------------------------------- | ------------------------------- | ---------- | +| Domestic Assistant | 2934 | 0.0004 | Commercial cleaners | domestic assistant | 1 | +| Holiday Club Admin Manager | 1 | 0.0000 | Hotel and accommodation managers and proprietors | holiday centre manager | 0.79 | +| Training and Support Manager | 1 | 0.0000 | Education managers | learning support manager | 0.80 | +| Digital Marketing Executive | 4554 | 0.0006 | Marketing consultants | digital marketing executive | 1 | +| Operations Manager -Commercial Insurance | 1 | 0.0000 | Financial managers and directors n.e.c. | insurance company manager | 0.79 | +| Field Service Engineer | 7272 | 0.0010 | Telecoms and related network installers and repairers | home service field engineer | 0.87 | +| Tutor | 3370 | 0.0005 | Higher education teaching professionals n.e.c. | course tutor | 0.92 | +| Assistant Manager - Truro | 2 | 0.0000 | Other administrative occupations n.e.c. | manager's assistant | 0.78 | +| Marketing Executive | 8363 | 0.0012 | Marketing consultants | marketing executive | 1 | +| Chartered Financial Advisor - Berkshire | 2 | 0.0000 | Financial accountants | chartered management accountant | 0.70 | + +### Examples of **bad** matches: + +| job_title | num_job_ads | prop_job_ads | soc_2020_6_name | occ_matched | match_prob | +| ----------------------------------------------------------------- | ----------- | ------------ | ------------------------------------------------------- | --------------------------------- | ---------- | +| Academic Mentor | 2847 | 0.000 | Learning and behaviour mentors | learning mentor | 0.85 | +| Electronics Assembly Technician - Oxford - £30,000 per annum | 2 | 0.000 | Metal working production and maintenance fitters n.e.c. 
| assembly engineer | 0.67 | +| Senior Administrator | 2315 | 0.000 | Registrars | senior registration administrator | 0.84 | +| Census officer | 3547 | 0.000 | Office managers | census district manager | 0.80 | +| Operative | 2201 | 0.000 | Textile process operatives n.e.c. | general operative | 0.77 | +| Business Case Manager Business | 1 | 0.000 | National government administrative occupations n.e.c. | case manager | 0.76 | +| Production Operative | 16113 | 0.002 | Printing machine assistants | finishing operative | 0.76 | +| Night Care Assistant | 11302 | 0.002 | Shelf fillers | night assistant | 0.86 | +| Supply Teacher | 7316 | 0.001 | Inventory and stock controllers | supplies superintendent | 0.72 | +| Carpenter - Timber Frame | 1 | 0.000 | Agricultural and fishing trades n.e.c. | timber contractor | 0.72 | + +### Observations + +A random sample of the job titles that don't match to a SOC revealed that extra cleaning may help them match to SOC codes. Some of the job titles that didn't match include: + +['Disability Assessor Homebase / Front Office', 'Clinical Fellow ST3 Stroke medicine', 'IT Engineer - £25-£30k - Normanton / Hybrid', 'Entry Level Graduate Scheme', 'Operatives Needed', 'Waiting Staff 1', 'Staff Nurse, General Surgery - Band 5', 'PHP Developer Laravel Vue.js', 'Bike Courier parttime Liverpool', 'E&I Technician Days', '1-1 Tutor Required - Wakefield', 'Flexcube Analyst permanent', 'Infection Prevention Control Nurse - Band 8a', 'Blinds and Curtains Installer', 'Senior Community Host - Woking', 'Data Architect, Microsoft Stack, Remote', 'Factory Cleaning Operative - £1000 sign on bonus!', 'Retail Customer Service CSM 30hrs - Multi-site', 'Retail Customer Service CSA 30hrs', 'Driver weekend Liverpool'] + +## Future work + +It'd be good to compare our SOCmapper performance to other mappers out there, for example [this python package that maps to 3-digit SOC](https://github.com/aeturrell/occupationcoder) or [the online tool from Cascot](https://cascotweb.warwick.ac.uk/#/classification/soc2020). diff --git a/nlp_link/soc_mapper/SOCMapper_overview.jpg b/nlp_link/soc_mapper/SOCMapper_overview.jpg new file mode 100644 index 0000000..01e6865 Binary files /dev/null and b/nlp_link/soc_mapper/SOCMapper_overview.jpg differ diff --git a/nlp_link/soc_mapper/__init__.py b/nlp_link/soc_mapper/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/nlp_link/soc_mapper/config.yaml b/nlp_link/soc_mapper/config.yaml new file mode 100644 index 0000000..95e8346 --- /dev/null +++ b/nlp_link/soc_mapper/config.yaml @@ -0,0 +1,18 @@ +soc_data: + soc_dir: "s3://nesta-open-data/soc_mapper/soc_coding_index_soc2020_volume2_22022024/soc2020volume2thecodingindexexcel22022024.xlsx" + sheet_name: "SOC2020 coding index" + soc_2020_ext_col: "SOC 2020 ext" + soc_2020_col: "SOC 2020" + soc_2010_col: "SOC 2010" + natural_order_col: "INDEXOCC - natural word order" + sug_col: "SOC 2020 ext SUG title" + ug_col: "SOC2020 unit group title" + add_col: "ADD" + ind_col: "IND" +soc_mapper: + reset_embeddings: False + match_top_n: 10 + sim_threshold: 0.67 + top_n_sim_threshold: 0.5 + minimum_n: 3 + minimum_prop: 0.5 diff --git a/nlp_link/soc_mapper/soc_map.py b/nlp_link/soc_mapper/soc_map.py new file mode 100644 index 0000000..d1b78c1 --- /dev/null +++ b/nlp_link/soc_mapper/soc_map.py @@ -0,0 +1,406 @@ +""" +A class to map inputted job titles to their most likely SOC 2020 4-digit codes. 
+ +Usage: + +from soc_mapper.soc_map import SOCMapper + +soc_mapper = SOCMapper() +soc_mapper.load() +job_titles=["data scientist", "Assistant nurse", "Senior financial consultant - London"] + +soc_mapper.get_soc(job_titles, return_soc_name=True) + +""" + +from collections import Counter +import os +from typing import List, Union + +import pandas as pd +from tqdm import tqdm +import numpy as np + +from nlp_link.soc_mapper.soc_map_utils import ( + load_job_title_soc, + process_job_title_soc, + job_title_cleaner, + unique_soc_job_titles, +) +from nlp_link.linker import NLPLinker + +from nlp_link.linker_utils import load_bert + +from utils.utils import ( + soc_mapper_config, + load_s3_json, + load_local_json, + save_to_s3, + save_json_dict, +) + +from wasabi import msg, Printer + +msg_print = Printer() + + +class SOCMapper(object): + """Class for linking job titles to SOC codes. + + The input job title is matched to a dataset of job titles with their 2020 SOC. + - If the most similar job title is very similar, then the corresponding 6-digit SOC is outputted. + - Otherwise, we look at a group of the most similar job titles, and if they all have the same 4-digit SOC, then this is outputted. + + ---------- + + Args: + soc_dir (str): The directory of the SOC coding index xlsx file. + match_top_n (int): The number of most similar SOC matches to consider when calculating the final SOC and outputing + sim_threshold (float): The similarity threshold for outputting the most similar SOC match. + top_n_sim_threshold (float): The similarity threshold for a match being added to a group of SOC matches. + minimum_n (int): The minimum size of a group of SOC matches. + minimum_prop (float): If a group of SOC matches have a high proportion (>= minimum_prop) of the same SOC being matched, then use this SOC. 
+ + ---------- + Methods + ---------- + + load_process_soc_data(): + Load the SOC data + load(reset_embeddings=False, save_embeds=False): + Load everything to use this class, recalculate SOC embeddings and save if desired + find_most_similar_matches(job_titles, job_title_embeddings): + Using the inputted job title embeddings and the SOC embeddings, find the full information about the most similar SOC job titles + find_most_likely_soc(match_row): + For the full match information for one job title, find the most likely SOC (via top match, or group of top matches) + get_soc(job_titles, additional_info=False): + (main function) For inputted job titles, output the best SOC match, add extra information about matches using the additional_info argument + + ---------- + Usage + ---------- + from soc_mapper.soc_map import SOCMapper + soc_mapper = SOCMapper() + soc_mapper.load() + matches = soc_mapper.get_soc(job_titles=["data scientist", "Assistant nurse", "Senior financial consultant - London"]) + >>> [(('2433/02', '2433', '2425'), 'data scientist'), (('6131/99', '6131', '6141'), 'assistant nurse'), (('2422/02', '2422', '3534'), 'financial consultant')] + """ + + def __init__( + self, + soc_dir: str = soc_mapper_config["soc_data"]["soc_dir"], + match_top_n: int = soc_mapper_config["soc_mapper"]["match_top_n"], + sim_threshold: float = soc_mapper_config["soc_mapper"]["sim_threshold"], + top_n_sim_threshold: float = soc_mapper_config["soc_mapper"][ + "top_n_sim_threshold" + ], + minimum_n: int = soc_mapper_config["soc_mapper"]["minimum_n"], + minimum_prop: float = soc_mapper_config["soc_mapper"]["minimum_prop"], + ): + self.soc_dir = soc_dir + self.match_top_n = match_top_n + self.sim_threshold = sim_threshold + self.top_n_sim_threshold = top_n_sim_threshold + self.minimum_n = minimum_n + self.minimum_prop = minimum_prop + + self.soc_mapper_config = soc_mapper_config # This is so a user could change the soc_data values easily if needed + + def load_process_soc_data(self): + """ + Load the job titles to SOC codes dataset as found on the ONS website. + A small amount of processing. + """ + + jobtitle_soc_data = process_job_title_soc( + load_job_title_soc(soc_mapper_config=self.soc_mapper_config), + soc_mapper_config=self.soc_mapper_config, + ) + + return jobtitle_soc_data + + def load( + self, + reset_embeddings: bool = soc_mapper_config["soc_mapper"]["reset_embeddings"], + save_embeds: bool = False, + ): + """ + Load the BERT model, SOC coding index data, and load or calculate embeddings for the job titles in this dataset. + Args: + reset_embeddings (bool): Whether to re-calculate and save soc coding index embeddings or not. Will be done anyway if + an embeddings file isn't found. 
+            save_embeds (bool): Whether to save out the embeddings or not (only used if the embeddings weren't loaded).
+        """
+
+        self.nlp_link = NLPLinker()
+        self.nlp_link.bert_model = load_bert()
+
+        self.jobtitle_soc_data = self.load_process_soc_data()
+
+        self.soc_2020_6_dict = dict(
+            zip(
+                self.jobtitle_soc_data["SOC_2020_EXT"],
+                self.jobtitle_soc_data["SUB-UNIT GROUP DESCRIPTIONS"],
+            )
+        )
+        self.soc_2020_4_dict = dict(
+            zip(
+                self.jobtitle_soc_data["SOC_2020"],
+                self.jobtitle_soc_data["SOC 2020 UNIT GROUP DESCRIPTIONS"],
+            )
+        )
+        self.job_title_2_soc6_4 = unique_soc_job_titles(self.jobtitle_soc_data)
+
+        embeddings_output_dir = os.path.dirname(self.soc_dir)
+
+        if "s3://" in embeddings_output_dir:
+            s3_bucket_name = embeddings_output_dir.split("s3://")[1].split("/")[0]
+            embeddings_output_s3_folder = "/".join(
+                embeddings_output_dir.split("s3://")[1].split("/")[1:]
+            )
+            embeddings_path = os.path.join(
+                embeddings_output_s3_folder, "soc_job_embeddings.json"
+            )
+            job_titles_path = os.path.join(
+                embeddings_output_s3_folder, "soc_job_embeddings_titles.json"
+            )
+        else:
+            s3_bucket_name = None
+            embeddings_path = os.path.join(
+                embeddings_output_dir, "soc_job_embeddings.json"
+            )
+            job_titles_path = os.path.join(
+                embeddings_output_dir, "soc_job_embeddings_titles.json"
+            )
+
+        try:
+            if not reset_embeddings:
+                try:
+                    if s3_bucket_name:
+                        with msg_print.loading(
+                            "Loading SOC job title embeddings from S3 ..."
+                        ):
+                            self.all_soc_embeddings = load_s3_json(
+                                s3_bucket_name, embeddings_path
+                            )
+                            self.soc_job_titles = load_s3_json(
+                                s3_bucket_name, job_titles_path
+                            )
+                    else:
+                        with msg_print.loading(
+                            "Loading SOC job title embeddings locally ..."
+                        ):
+                            self.all_soc_embeddings = load_local_json(embeddings_path)
+                            self.soc_job_titles = load_local_json(job_titles_path)
+                    msg.good("SOC job title embeddings loaded.")
+                except:
+                    msg.warn("SOC job title embeddings not found.")
+                    raise
+            else:
+                raise
+        except:
+            msg.info("Calculating SOC job title embeddings ...")
+
+            # Embed the SOC job titles
+            self.soc_job_titles = list(self.job_title_2_soc6_4.keys())
+
+            self.all_soc_embeddings = self.nlp_link._get_embeddings(self.soc_job_titles)
+
+            if save_embeds:
+                msg.info("Saving SOC job title embeddings")
+                if s3_bucket_name:
+                    save_to_s3(s3_bucket_name, self.all_soc_embeddings, embeddings_path)
+                    save_to_s3(s3_bucket_name, self.soc_job_titles, job_titles_path)
+                    msg.good(f"Saved to s3://{s3_bucket_name}/{embeddings_path} ...")
+                else:
+                    save_json_dict(self.all_soc_embeddings, embeddings_path)
+                    save_json_dict(self.soc_job_titles, job_titles_path)
+                    msg.good(f"Saved to {embeddings_path} ...")
+            else:
+                msg.warn(
+                    "Newly calculated SOC job title embeddings were not saved - set save_embeds=True if you'd like to save them to speed up future use."
+                )
+
+    def find_most_similar_matches(
+        self,
+        job_titles: Union[str, List[str]],
+        job_title_embeddings: np.ndarray,
+    ) -> list:
+        """
+        Using the job title embeddings and the SOC job title embeddings,
+        find the top n SOC job titles which are most similar to each input job title.
+
+        Args:
+            job_titles (str or list of strings): One or a list of inputted job titles.
+            job_title_embeddings (np.ndarray): The embeddings for the inputted job titles.
+
+        Outputs:
+            list: A list of the most similar SOC data for each inputted job title.
+ """ + + matches_topn_dict = self.nlp_link.get_matches( + input_data_ids=list(range(len(job_titles))), + input_embeddings=job_title_embeddings, + comparison_data_ids=list(range(len(self.all_soc_embeddings))), + comparison_embeddings=self.all_soc_embeddings, + top_n=self.match_top_n, + ) + + job_top_soc_matches = [] + for k, v in matches_topn_dict.items(): + top_soc_matches = [] + for top_match in v: + soc_ix = top_match[0] + similarity = top_match[1] + soc_text = self.soc_job_titles[soc_ix] + top_soc_matches.append( + [ + soc_text, + self.job_title_2_soc6_4[soc_text][0], # 6 digit + self.job_title_2_soc6_4[soc_text][1], # 4 digit + self.job_title_2_soc6_4[soc_text][2], # 2010 4 digit + similarity, + ] + ) + job_top_soc_matches.append( + { + "job_title": job_titles[k], + "top_soc_matches": top_soc_matches, + } + ) + + return job_top_soc_matches + + def find_most_likely_soc( + self, + match_row: dict, + ) -> tuple: + """ + For a single job title and the details of the most similar SOC matches, find a single most likely SOC + 1. If the top match has a really high similarity score (>sim_threshold) at the 6-digit level then use this. + This will return (soc, job_title) + 2. Get the 4-digit SOCs of the good (>top_n_sim_threshold) matches in the top n most similar. + 3. If there are a few of these (>=minimum_n) and over a certain proportion (>minimum_prop) of these are the same at the 4 digit level - use this as the SOC. + This will return (soc, the job titles given for this same soc) + + Returns data in the format ((soc_2020_6, soc_2020_4, soc_2010), job_title) or None + If pathway 1. (above) isn't true then the output will be ((None, soc_2020_4, None), job_title) and job_title will be a set of multiple + + Args: + match_row: One element from the list outputted in find_most_similar_matches. + e.g. {"job_title": 'principal data scientist', "top_soc_matches": [["data scientist", 6digit SOC, 4 digit SOC, 4 digit 2010 SOC, similarity_score], ...]} + + Output: + tuple, None: Details of the most likely SOC match for this job title. + """ + + top_soc_match = match_row["top_soc_matches"][0][0] + top_soc_match_code = ( + match_row["top_soc_matches"][0][1], + match_row["top_soc_matches"][0][2], + match_row["top_soc_matches"][0][3], + ) # 6 digit, 4 digit, 4 2010 + top_soc_match_score = match_row["top_soc_matches"][0][4] # The score + + if top_soc_match_score > self.sim_threshold: + return (top_soc_match_code, top_soc_match) + else: + all_good_socs = [ + t[2] # 4 digit 2020 SOC + for t in match_row["top_soc_matches"] + if t[4] > self.top_n_sim_threshold + ] + if len(all_good_socs) >= self.minimum_n: + common_soc, num_common_soc = Counter(all_good_socs).most_common(1)[0] + prop_most_common_soc = num_common_soc / len(all_good_socs) + if prop_most_common_soc > self.minimum_prop: + return ( + (None, common_soc, None), + set( + [ + t[0] + for t in match_row["top_soc_matches"] + if ( + (t[4] > self.top_n_sim_threshold) + and (t[2] == common_soc) + ) + ] + ), + ) + else: + return None + else: + return None + + def get_soc( + self, + job_titles: Union[str, List[str]], + additional_info: bool = False, + return_soc_name: bool = False, + clean_job_title: bool = True, + ) -> list: + """ + Get the most likely SOC for each inputted job title + + Args: + job_titles (str, list): A single job title or a list of raw job titles + additional_info (bool): Whether to provide additional information about the matches. 
+ Return just the most likely soc match (False) or the top soc matches (True) + return_soc_name (bool): Whether to output the SOC names of the most likely SOC (or just the codes). + When applied to lots of data this might not be as desirable. + clean_job_title (bool): Whether to apply the cleaning function to the job title. + + Output: + list: A list of the top matches for each job title inputted + + """ + + if isinstance(job_titles, str): + job_titles = [job_titles] + + # Clean the job titles + if clean_job_title: + job_titles = [job_title_cleaner(job_title) for job_title in job_titles] + + # Embed the input job titles + job_title_embeddings = self.nlp_link._get_embeddings(job_titles) + + top_soc_matches = self.find_most_similar_matches( + job_titles, job_title_embeddings + ) + + msg.info(f"Finding most likely SOC") + found_count = 0 + for job_matches in top_soc_matches: + most_likely_soc = self.find_most_likely_soc(job_matches) + if most_likely_soc: + ((soc_2020_6, soc_2020_4, soc_2010_4), job_title) = most_likely_soc + if return_soc_name: + job_matches["most_likely_soc"] = ( + ( + (soc_2020_6, self.soc_2020_6_dict.get(soc_2020_6)), + (soc_2020_4, self.soc_2020_4_dict.get(soc_2020_4)), + soc_2010_4, + ), + job_title, + ) + else: + job_matches["most_likely_soc"] = ( + (soc_2020_6, soc_2020_4, soc_2010_4), + job_title, + ) + else: + job_matches["most_likely_soc"] = None + if most_likely_soc: + found_count += 1 + + msg.good( + f"Found SOCs for {found_count*100/len(top_soc_matches)}% of the job titles" + ) + + if additional_info: + return top_soc_matches + else: + return [ + job_matches.get("most_likely_soc") for job_matches in top_soc_matches + ] diff --git a/nlp_link/soc_mapper/soc_map_utils.py b/nlp_link/soc_mapper/soc_map_utils.py new file mode 100644 index 0000000..facf8d4 --- /dev/null +++ b/nlp_link/soc_mapper/soc_map_utils.py @@ -0,0 +1,285 @@ +import pandas as pd + +import re + +from utils.utils import soc_mapper_config + + +def load_job_title_soc(soc_mapper_config: dict = soc_mapper_config) -> pd.DataFrame(): + """ + Load the ONS dataset which gives SOC codes for thousands of job titles + """ + + jobtitle_soc_data = pd.read_excel( + soc_mapper_config["soc_data"]["soc_dir"], + sheet_name=soc_mapper_config["soc_data"]["sheet_name"], + converters={ + soc_mapper_config["soc_data"]["soc_2020_ext_col"]: str, + soc_mapper_config["soc_data"]["soc_2020_col"]: str, + soc_mapper_config["soc_data"]["soc_2010_col"]: str, + }, + ) + + return jobtitle_soc_data + + +def process_job_title_soc( + jobtitle_soc_data: pd.DataFrame(), soc_mapper_config: dict = soc_mapper_config +) -> pd.DataFrame(): + """Standardise the column names for use in soc_map.py + Args: + jobtitle_soc_data (pd.DataFrame): the raw ONS SOC coding index dataset + Returns: + pd.DataFrame: the cleaned ONS SOC coding index dataset + """ + + jobtitle_soc_data = jobtitle_soc_data.rename( + columns={ + soc_mapper_config["soc_data"]["soc_2020_ext_col"]: "SOC_2020_EXT", + soc_mapper_config["soc_data"]["soc_2020_col"]: "SOC_2020", + soc_mapper_config["soc_data"]["soc_2010_col"]: "SOC_2010", + soc_mapper_config["soc_data"][ + "natural_order_col" + ]: "INDEXOCC NATURAL WORD ORDER", + soc_mapper_config["soc_data"]["sug_col"]: "SUB-UNIT GROUP DESCRIPTIONS", + soc_mapper_config["soc_data"]["ug_col"]: "SOC 2020 UNIT GROUP DESCRIPTIONS", + soc_mapper_config["soc_data"]["add_col"]: "ADD", + soc_mapper_config["soc_data"]["ind_col"]: "IND", + } + ) + + # Clean + jobtitle_soc_data = jobtitle_soc_data[jobtitle_soc_data["SOC_2020"] != "}}}}"] + + 
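+    # (the filter above drops rows whose SOC 2020 code is the placeholder string "}}}}")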
return jobtitle_soc_data + + +def unique_soc_job_titles(jobtitle_soc_data: pd.DataFrame()) -> dict: + """ + Taking the dataset of job titles and which SOC they belong to - create a unique + dictionary where each key is a job title and the value is the SOC code. + There are additional words to include in the job title if at first + it is not unique. + + Args: + jobtitle_soc_data (pd.DataFrame): the cleaned ONS SOC coding index dataset. + + Returns: + dict: A dictionary where each key is a job title and the value is the SOC code. + + """ + + col_name_0 = "INDEXOCC NATURAL WORD ORDER" + col_name_1 = "ADD" + col_name_2 = "IND" + + jobtitle_soc_data[f"{col_name_0} and {col_name_1}"] = jobtitle_soc_data.apply( + lambda x: ( + x[col_name_0] + " " + x[col_name_1] + if pd.notnull(x[col_name_1]) + else x[col_name_0] + ), + axis=1, + ) + jobtitle_soc_data[f"{col_name_0} and {col_name_1} and {col_name_2}"] = ( + jobtitle_soc_data.apply( + lambda x: ( + x[f"{col_name_0} and {col_name_1}"] + " " + x[col_name_2] + if pd.notnull(x[col_name_2]) + else x[f"{col_name_0} and {col_name_1}"] + ), + axis=1, + ) + ) + + # Try to find a unique job title to SOC 2020 4 or 6 code mapping + job_title_2_soc6_4 = {} + for job_title, grouped_soc_data in jobtitle_soc_data.groupby(col_name_0): + if grouped_soc_data["SOC_2020_EXT"].nunique() == 1: + job_title_2_soc6_4[job_title] = ( + grouped_soc_data["SOC_2020_EXT"].unique()[0], + grouped_soc_data["SOC_2020"].unique()[0], + grouped_soc_data["SOC_2010"].unique()[0], + ) + else: + for job_title_1, grouped_soc_data_1 in grouped_soc_data.groupby( + f"{col_name_0} and {col_name_1}" + ): + if grouped_soc_data_1["SOC_2020_EXT"].nunique() == 1: + job_title_2_soc6_4[job_title_1] = ( + grouped_soc_data_1["SOC_2020_EXT"].unique()[0], + grouped_soc_data_1["SOC_2020"].unique()[0], + grouped_soc_data_1["SOC_2010"].unique()[0], + ) + else: + for ( + job_title_2, + grouped_soc_data_2, + ) in grouped_soc_data_1.groupby( + f"{col_name_0} and {col_name_1} and {col_name_2}" + ): + if grouped_soc_data_2["SOC_2020_EXT"].nunique() == 1: + job_title_2_soc6_4[job_title_2] = ( + grouped_soc_data_2["SOC_2020_EXT"].unique()[0], + grouped_soc_data_2["SOC_2020"].unique()[0], + grouped_soc_data_2["SOC_2010"].unique()[0], + ) + + return job_title_2_soc6_4 + + +def unique_soc_descriptions(soc_data: pd.DataFrame()) -> dict: + """ + Taking the dataset of SOC and their descriptions - create a unique + dictionary where each key is a description and the value is the SOC code. + + Args: + soc_data (pd.DataFrame): the cleaned ONS SOC coding index dataset. + + Returns: + dict: A dictionary where each key is a SOC description and the value is the SOC code. 
+ + """ + soc_data["SUB-UNIT GROUP DESCRIPTIONS"] = soc_data[ + "SUB-UNIT GROUP DESCRIPTIONS" + ].apply(lambda x: x.replace(" n.e.c.", "").replace(" n.e.c", "")) + + dd = soc_data[ + ["SUB-UNIT GROUP DESCRIPTIONS", "SOC_2020_EXT", "SOC_2020", "SOC_2010"] + ].drop_duplicates() + + # There can be multiple 2010 codes for each 6 digit, so just output the most common + soc_desc_2_code = {} + for description, soc_info in dd.groupby("SUB-UNIT GROUP DESCRIPTIONS"): + soc_2020_6 = soc_info["SOC_2020_EXT"].value_counts().index[0] + soc_2020_4 = soc_info["SOC_2020"].value_counts().index[0] + soc_2010 = list(soc_info["SOC_2010"].unique()) + soc_desc_2_code[description] = (soc_2020_6, soc_2020_4, soc_2010) + + return soc_desc_2_code + + +major_places = [ + "Central London", + "Midlands", + "London", + "Birmingham", + "Leeds", + "Glasgow", + "Sheffield", + "Bradford", + "Manchester", + "Edinburgh", + "Liverpool", + "Bristol", + "Cardiff", + "Coventry", + "Nottingham", + "Leicester", + "Sunderland", + "Belfast", + "Newcastle", + "Brighton", + "Hull", + "Plymouth", + "Carlisle", + "Berkshire", + "Doncaster", + "Bedford", + "Chichester", + "Wakefield", +] +lower_case_end_words = [ + "nights", + "part time", + "full time", + "hybrid", + "maternity cover", + "remote", + "self employed", + "work from home", + "benefits", + "flexible", + "Office Based", +] + +lower_case_all_end_words = [ + word.lower() for word in major_places + lower_case_end_words +] + + +def job_title_cleaner( + text: str, lower_case_all_end_words: list = lower_case_all_end_words +) -> str: + """ + Will apply a bunch of cleaning to a job title + - removing certain things (locations or work type after a "-") + - fixes some unicode £ -> £ + - Removes text after "£"" + + Assumption: weird bad stuff comes after dashes or £ signs. + So this won't work well for e.g "£30k Data Scientist" or "Remote - London Data Scientist" + + This isn't perfect, but should hopefully help quite a few examples + + Examples: + 'Part Home Based Block Manager - Chichester' -> 'Part Home Based Block Manager' + 'Employment Solicitor - Claimant - Leeds' -> 'Employment Solicitor - Claimant' + 'Retail Customer Service CSM 16hrs' -> 'Retail Customer Service CSM' + 'Bike Delivery Driver - London' -> 'Bike Delivery Driver' + 'Fulfillment Associate - £1000 Sign on Bonus!' -> 'Fulfillment Associate' + + Args: + text (str): the text of the job title you want to clean + lower_case_all_end_words (list): a list of all the words to clean out + if they are at the end of the job title. + Returns: + str: the cleaned job title + + """ + if text: + text = str(text) + + findreplace = { + "&": " and ", + " ": " ", + "£": "£", + "(part time)": " ", + } + for f, r in findreplace.items(): + text = text.replace(f, r) + # Get rid of any double + spaces + text = re.sub(r"\s{2,}", " ", text).strip() + + # Remove mentions of hours e.g. Customer Service 30hrs -> Customer Service + text = re.sub(r"\d+\s*hrs", "", text).strip() + + # If there is a "£" remove everything after it (e.g. 
£30k per annum) + # Unless it occurs very early in the text + if "£" in text: + matches = re.findall(r"£", text) + index_found = text.index(matches[0]) + if index_found > 4: + text = " ".join(text.split("£")[0:-1]).strip() if "£" in text else text + + # Remove certain things after the last dash + if " - " in text: + last_bit = text.split(" - ")[-1].strip().lower() + # if any of the target words are in this, then remove everything after the dash + # e.g "Data Scientist - remote, London" -> "Data Scientist" + found = False + for word in lower_case_all_end_words: + if word in last_bit: + found = True + break + if last_bit == "": # This may happen if a £ was found + found = True + if found: + # Remove everything after the lastedash + text = " - ".join(text.split(" - ")[0:-1]).strip() + + if text: # The cleaning may make it so we are left with nothing + if text[-1] == "-": + text = text[0:-1].strip() + + return text diff --git a/pyproject.toml b/pyproject.toml index 44deb8b..4acb1d1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,6 +15,10 @@ torch = "^1.13.1" pytest = "^8.3.2" tqdm = "^4.66.4" numpy = "^1.26.4" +s3fs = "^2022.5.0" +openpyxl = "^3.1.3" +boto3 = "^1.21.21" +wasabi = "^1.1.3" [build-system] requires = ["poetry-core"] diff --git a/tests/test_soc_mapper.py b/tests/test_soc_mapper.py new file mode 100644 index 0000000..1e635a9 --- /dev/null +++ b/tests/test_soc_mapper.py @@ -0,0 +1,71 @@ +# Needed for Github Actions to not fail (see torch bug https://github.com/pytorch/pytorch/issues/121101) +import torch + +torch.set_num_threads(1) + +from nlp_link.soc_mapper.soc_map import SOCMapper + + +def test_find_most_likely_soc(): + + # Made up top match data + match_row = { + "job_title": "data scientist", + "top_soc_matches": [ + ["Data scientist", "2433/04", "2433", "2425", 0.95], + ["Data scientist computing", "2133/99", "2133", "2135", 0.9], + ["Data engineer", "2133/03", "2133", "2135", 0.8], + ["Data analyst", "3544/00", "3544", "3539", 0.8], + ["Computer scientist", "2133/01", "2133", "2135", 0.7], + ["Finance adviser", "2422/02", "2422", "3534", 0.5], + ], + } + + # When there is a top match over the sim_threshold + soc_mapper = SOCMapper( + sim_threshold=0.91, + ) + + result = soc_mapper.find_most_likely_soc(match_row) + assert result[1] == "Data scientist" + + # When there is no top match over the sim_threshold but at least minimum_n and over minimum_prop of + # the matches over top_n_sim_threshold similarity are the same + soc_mapper = SOCMapper( + sim_threshold=0.98, top_n_sim_threshold=0.65, minimum_n=3, minimum_prop=0.5 + ) + + result = soc_mapper.find_most_likely_soc(match_row) + assert result[0][1] == "2133" + + # When there is no top match over the sim_threshold and there are some + # matches over top_n_sim_threshold similarity but not enough (