
Merge pull request #5 from nestauk/add-soc-mapper
Add socmapper functionality
lizgzil authored Oct 28, 2024
2 parents 15e40fe + 49ee5e9 commit 921d886
Showing 15 changed files with 1,355 additions and 36 deletions.
28 changes: 26 additions & 2 deletions README.md
@@ -24,6 +24,30 @@ poetry install

```

## Usage

```python
from nlp_link.linker import NLPLinker

nlp_link = NLPLinker()

# dict inputs
comparison_data = {'a': 'cats', 'b': 'dogs', 'd': 'rats', 'e': 'birds'}
input_data = {'x': 'owls', 'y': 'feline', 'z': 'doggies', 'za': 'dogs', 'zb': 'chair'}
nlp_link.load(comparison_data)
matches = nlp_link.link_dataset(input_data)
# Top match output
print(matches)

# list inputs
comparison_data = ['cats', 'dogs', 'rats', 'birds']
input_data = ['owls', 'feline', 'doggies', 'dogs','chair']
nlp_link.load(comparison_data)
matches = nlp_link.link_dataset(input_data)
# Top match output
print(matches)
```

## Tests

To run tests:
@@ -39,7 +63,7 @@ Docs for this repo are automatically published to gh-pages branch via. Github ac
However, if you are editing the docs you can test them out locally by running

```
cd guidelines
pip install -r docs/requirements.txt
cd docs
<!-- pip install -r docs/requirements.txt -->
mkdocs serve
```
105 changes: 102 additions & 3 deletions docs/README.md
@@ -1,5 +1,104 @@
# nlp-link
# 🖇️ NLP Link

Documentation for NLP Link
NLP Link finds the word (or words) in a reference list that is most similar to an inputted word. For example, if you want to know which word in the reference list `['cats', 'dogs', 'rats', 'birds']` is most similar to 'puppies', nlp-link will return 'dogs'.

- [Page1](./page1.md)
This package can also apply the same linking methodology to find the [SOC](https://www.ons.gov.uk/methodology/classificationsandstandards/standardoccupationalclassificationsoc) code that best matches an inputted job title. More on this [here](./page1.md).

## 🔨 Usage

Install the package using pip:

```bash
pip install nlp-link
```

### Basic usage

Match two lists in python:

```python

from nlp_link.linker import NLPLinker

nlp_link = NLPLinker()

# list inputs
comparison_data = ['cats', 'dogs', 'rats', 'birds']
input_data = ['owls', 'feline', 'doggies', 'dogs','chair']
nlp_link.load(comparison_data)
matches = nlp_link.link_dataset(input_data)
# Top match output
print(matches)

```

Which outputs:

```
input_id input_text link_id link_text similarity
0 0 owls 3 birds 0.613577
1 1 feline 0 cats 0.669633
2 2 doggies 1 dogs 0.757443
3 3 dogs 1 dogs 1.000000
4 4 chair 0 cats 0.331178

```

### Extended usage

Match using dictionary inputs (where the key is a unique ID):

```python

from nlp_link.linker import NLPLinker

nlp_link = NLPLinker()

# dict inputs
comparison_data = {'a': 'cats', 'b': 'dogs', 'd': 'rats', 'e': 'birds'}
input_data = {'x': 'owls', 'y': 'feline', 'z': 'doggies', 'za': 'dogs', 'zb': 'chair'}
nlp_link.load(comparison_data)
matches = nlp_link.link_dataset(input_data)
# Top match output
print(matches)

```

Which outputs:

```
input_id input_text link_id link_text similarity
0 x owls e birds 0.613577
1 y feline a cats 0.669633
2 z doggies b dogs 0.757443
3 za dogs b dogs 1.000000
4 zb chair a cats 0.331178

```

To output more than one match per input text, use the `top_n` argument (`format_output` needs to be set to `False` for this):

```python

from nlp_link.linker import NLPLinker

nlp_link = NLPLinker()

comparison_data = {'a': 'cats', 'b': 'dogs', 'c': 'kittens', 'd': 'rats', 'e': 'birds'}
input_data = {'x': 'pets', 'y': 'feline'}
nlp_link.load(comparison_data)
matches = nlp_link.link_dataset(input_data, top_n=2, format_output=False)
# Top match output
print(matches)
# Format output for ease of reading
print({input_data[k]: [comparison_data[r] for r, _ in v] for k,v in matches.items()})
```

Which will output:

```
{'x': [['b', 0.8171109], ['a', 0.7650396]], 'y': [['a', 0.6696329], ['c', 0.5778763]]}
{'pets': ['dogs', 'cats'], 'feline': ['cats', 'kittens']}
```

The `drop_most_similar` argument can be set to `True` if you don't want to output the most similar match - this is useful when comparing a list with itself, as in the sketch below. For this you would run `nlp_link.link_dataset(input_data, drop_most_similar=True)`.
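
A minimal sketch of that self-comparison case, using the same `NLPLinker` API as above:

```python
from nlp_link.linker import NLPLinker

nlp_link = NLPLinker()

# Compare a list with itself: without drop_most_similar=True every text
# would trivially match itself with a similarity of 1.0.
texts = ['cats', 'dogs', 'rats', 'birds']
nlp_link.load(texts)
matches = nlp_link.link_dataset(texts, drop_most_similar=True)
print(matches)
```
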
2 changes: 1 addition & 1 deletion docs/mkdocs.yaml
@@ -39,6 +39,6 @@ theme:
name: Switch to light mode
nav:
- Home: README.md
- Page 1: page1.md
- SOCMapper: page1.md
plugins:
- same-dir
28 changes: 27 additions & 1 deletion docs/page1.md
@@ -1 +1,27 @@
## Title
# 🗺️ SOC Mapper

The SOC mapper relies on the [SOC coding index](https://www.ons.gov.uk/methodology/classificationsandstandards/standardoccupationalclassificationsoc/soc2020/soc2020volume2codingrulesandconventions) released by the ONS. This dataset contains over 30,000 job titles along with their SOC codes.

The `SOCMapper` class in `soc_map.py` maps job title(s) to SOC(s).

## 🔨 Core functionality

```python
from nlp_link.soc_mapper.soc_map import SOCMapper

soc_mapper = SOCMapper()
soc_mapper.load()
job_titles = ["data scientist", "Assistant nurse", "Senior financial consultant - London"]

soc_mapper.get_soc(job_titles, return_soc_name=True)
```

Which will output:

```
[((('2433/04', 'Statistical data scientists'), ('2433', 'Actuaries, economists and statisticians'), '2425'), 'Data scientist'), ((('6131/99', 'Nursing auxiliaries and assistants n.e.c.'), ('6131', 'Nursing auxiliaries and assistants'), '6141'), 'Assistant nurse'), ((('2422/02', 'Financial advisers and planners'), ('2422', 'Finance and investment analysts and advisers'), '3534'), 'Financial consultant')]
```
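
Each result appears to pair the SOC information with the job title text that was actually matched (note that "Senior financial consultant - London" comes back as 'Financial consultant'). A minimal sketch of unpacking the output, assuming the tuple layout shown above (the variable names here are illustrative):

```python
results = soc_mapper.get_soc(job_titles, return_soc_name=True)

for soc_info, matched_title in results:
    # soc_info appears to contain an extended SOC code and a 4-digit SOC code
    # (each with its name), plus one further code, as in the example above.
    (ext_code, ext_name), (soc_code, soc_name), other_code = soc_info
    print(f"{matched_title}: {soc_code} - {soc_name}")
```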

## 📖 Read more

Read more about the methods and evaluation of the SOCMapper [here](https://github.com/nestauk/nlp-link/soc_mapper/README.md).
45 changes: 16 additions & 29 deletions nlp_link/linker.py
@@ -25,19 +25,18 @@

"""

from sentence_transformers import SentenceTransformer
import torch
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd

from typing import Union, Optional
import logging

from nlp_link.linker_utils import chunk_list
from nlp_link.linker_utils import chunk_list, get_embeddings, load_bert

logger = logging.getLogger(__name__)
from wasabi import msg, Printer

msg_print = Printer()

# TO DO: cosine or euclidean?

@@ -76,16 +75,16 @@ def _process_dataset(
try:
return dict(zip(input_data[id_column], input_data[text_column]))
except:
logger.warning(
msg.warn(
"Input is a dataframe, please specify id_column and text_column"
)
else:
logger.warning(
msg.warn(
"The input_data input must be a dictionary, a list or pandas dataframe"
)

if not isinstance(input_data[0], str):
logger.warning(
msg.warn(
"The input_data input must be a list of texts, or a dictionary where the values are texts"
)

@@ -100,12 +99,7 @@ def load(
A list of texts or a dictionary of texts where the key is the unique id.
If a list is given then a unique id will be assigned with the index order.
"""
logger.info("Loading model")
device = torch.device(f"cuda:0" if torch.cuda.is_available() else "cpu")
self.bert_model = SentenceTransformer(
"sentence-transformers/all-MiniLM-L6-v2", device=device
)
self.bert_model.max_seq_length = 512
self.bert_model = load_bert()

self.comparison_data = self._process_dataset(comparison_data)
self.comparison_data_texts = list(self.comparison_data.values())
@@ -123,19 +117,12 @@ def _get_embeddings(self, text_list: list) -> np.array:
np.array: The embeddings for the input list of texts
"""

logger.info(
f"Finding embeddings for {len(text_list)} texts chunked into {round(len(text_list)/self.embed_chunk_size)} chunks"
return get_embeddings(
text_list=text_list,
embed_chunk_size=self.embed_chunk_size,
batch_size=self.batch_size,
bert_model=self.bert_model,
)
all_embeddings = []
for batch_texts in tqdm(chunk_list(text_list, self.embed_chunk_size)):
all_embeddings.append(
self.bert_model.encode(
np.array(batch_texts), batch_size=self.batch_size
)
)
all_embeddings = np.concatenate(all_embeddings)

return all_embeddings

def get_matches(
self,
@@ -161,7 +148,7 @@ def get_matches(
dict: The top matches for each input id.
"""

logger.info(
msg.info(
f"Finding the top dataset matches for {len(input_data_ids)} input texts chunked into {round(len(input_data_ids)/self.match_chunk_size)}"
)

@@ -222,11 +209,11 @@ def link_dataset(
"""

try:
logger.info(
msg.info(
f"Comparing {len(input_data)} input texts to {len(self.comparison_embeddings)} comparison texts"
)
except:
logger.warning(
msg.warn(
"self.comparison_embeddings does not exist - you may have not run load()"
)

53 changes: 53 additions & 0 deletions nlp_link/linker_utils.py
@@ -1,3 +1,56 @@
from tqdm import tqdm

import numpy as np
from sentence_transformers import SentenceTransformer
import torch

from wasabi import msg, Printer

msg_print = Printer()


def chunk_list(orig_list, n_chunks):
    """Yield successive chunks of `orig_list`; `n_chunks` is the number of items per chunk."""
    for i in range(0, len(orig_list), n_chunks):
        yield orig_list[i : i + n_chunks]


def load_bert(bert_model_name="sentence-transformers/all-MiniLM-L6-v2"):

    with msg_print.loading("Loading BERT model"):
        device = torch.device(f"cuda:0" if torch.cuda.is_available() else "cpu")
        bert_model = SentenceTransformer(bert_model_name, device=device)
        bert_model.max_seq_length = 512
    msg.good("BERT model loaded")
    return bert_model


def get_embeddings(
    text_list: list,
    bert_model,
    embed_chunk_size: int = 500,
    batch_size: int = 32,
) -> np.array:
    """
    Get embeddings for a list of texts

    Args:
        text_list (list): A list of texts.
        bert_model: An initialised SentenceTransformer BERT model.
        embed_chunk_size (int): The number of texts per chunk to process.
        batch_size (int): BERT batch_size.
    Returns:
        np.array: The embeddings for the input list of texts
    """

    msg.info(
        f"Finding embeddings for {len(text_list)} texts chunked into {round(len(text_list)/embed_chunk_size)} chunks"
    )
    all_embeddings = []
    for batch_texts in tqdm(chunk_list(text_list, embed_chunk_size)):
        all_embeddings.append(
            bert_model.encode(np.array(batch_texts), batch_size=batch_size)
        )
    all_embeddings = np.concatenate(all_embeddings)
    msg.good("Texts embedded.")

    return all_embeddings
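
For context, a minimal sketch of how these two helpers fit together (assuming the signatures above):

```python
from nlp_link.linker_utils import get_embeddings, load_bert

# Load the sentence-transformers model once, then embed a small list of texts.
bert_model = load_bert()
embeddings = get_embeddings(
    text_list=["cats", "dogs", "rats"],
    bert_model=bert_model,
    embed_chunk_size=500,
    batch_size=32,
)
print(embeddings.shape)  # e.g. (3, 384) for all-MiniLM-L6-v2
```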
