diff --git a/nlp_link/soc_mapper/soc_map.py b/nlp_link/soc_mapper/soc_map.py
index 39d5a9f..89605d2 100644
--- a/nlp_link/soc_mapper/soc_map.py
+++ b/nlp_link/soc_mapper/soc_map.py
@@ -27,6 +27,8 @@
     load_job_title_soc,
     process_job_title_soc,
     job_title_cleaner,
+    unique_soc_job_titles,
+    unique_soc_descriptions,
 )
 
 from nlp_link.linker_utils import chunk_list
@@ -35,43 +37,25 @@
 import logging
 
-# from dap_prinz_green_jobs import BUCKET_NAME, logger, config, PROJECT_DIR
-
-
 class SOCMapper(object):
     """Class for linking job titles to SOC codes.
-    The input job title is matched to a dataset of job titles with their 2020 SOC.
-    - If the most similar job title is very similar, then the corresponding 6-digit SOC is outputted.
-    - Otherwise, we look at a group of the most similar job titles, and if they all have the same 4-digit SOC, then this is outputted.
+    The input job title is matched to a dataset of job titles with their 2020 SOC.
+    - If the most similar job title is very similar, then the corresponding 6-digit SOC is output.
+    - Otherwise, we look at a group of the most similar job titles, and if they all have the same 4-digit SOC, then this is output.
 
-    Attributes
-    ----------
-    :param local: Whether to read data from a local location or not, defaults to True
-    :type local: bool
-
-    :param embeddings_output_dir: (optional) The directory the embeddings are stored, or will be stored if saved.
-        You are unlikely to need to change this from "outputs/data/green_occupations/soc_matching/" unless the SOC data changes
-    :type embeddings_output_dir: str, None
-
-    :param batch_size: How many job titles per batch for embedding, defaults to 500
-    :type batch_size: int
-
-    :param match_top_n: The number of most similar SOC matches to consider when calculating the final SOC and outputing
-    :type match_top_n: int
-
-    :param sim_threshold: The similarity threshold for outputting the most similar SOC match.
-    :type sim_threshold: float
-
-    :param top_n_sim_threshold: The similarity threshold for a match being added to a group of SOC matches.
-    :type top_n_sim_threshold: float
-
-    :param minimum_n: The minimum size of a group of SOC matches.
-    :type minimum_n: int
-
-    :param minimum_prop: If a group of SOC matches have a high proportion (>= minimum_prop) of the same SOC being matched, then use this SOC.
-    :type minimum_prop: float
+    Args:
+        local (bool): Whether to read data from a local location or not, defaults to True
+        embeddings_output_dir (str, optional): The directory where the embeddings are stored, or will be stored if saved.
+            You are unlikely to need to change this from "outputs/data/green_occupations/soc_matching/" unless the SOC data changes
+        batch_size (int): How many job titles per batch for embedding, defaults to 500
+        match_top_n (int): The number of most similar SOC matches to consider when calculating the final SOC and outputting
+        sim_threshold (float): The similarity threshold for outputting the most similar SOC match.
+        top_n_sim_threshold (float): The similarity threshold for a match being added to a group of SOC matches.
+        minimum_n (int): The minimum size of a group of SOC matches.
+        minimum_prop (float): If a group of SOC matches has a high proportion (>= minimum_prop) of the same SOC being matched, then use this SOC.
 
     ----------
     Methods
@@ -79,20 +63,18 @@ class SOCMapper(object):
     load_process_soc_data():
         Load the SOC data
-    unique_soc_job_titles(jobtitle_soc_data):
-        Convert the SOC data into a dict where each key is a job title and the value is the SOC code
-    embed_texts(texts):
-        Get sentence embeddings for a list of input texts
-    load(save_embeds=False):
-        Load everything to use this class, calculate SOC embeddings if they weren't inputted, save embeddings if desired
-    find_most_similar_matches(job_titles, job_title_embeddings):
-        Using the inputted job title embeddings and the SOC embeddings, find the full information about the most similar SOC job titles
-    find_most_likely_soc(match_row):
-        For the full match information for one job title, find the most likely SOC (via top match, or group of top matches)
-    get_soc(job_titles, additional_info=False):
-        (main function) For inputted job titles, output the best SOC match, add extra information about matches using the additional_info argument
-
-    ----------
+    embed_texts(texts):
+        Get sentence embeddings for a list of input texts
+    load(save_embeds=False):
+        Load everything to use this class, calculate SOC embeddings if they weren't inputted, save embeddings if desired
+    find_most_similar_matches(job_titles, job_title_embeddings):
+        Using the inputted job title embeddings and the SOC embeddings, find the full information about the most similar SOC job titles
+    find_most_likely_soc(match_row):
+        For the full match information for one job title, find the most likely SOC (via top match, or group of top matches)
+    get_soc(job_titles, additional_info=False):
+        (main function) For inputted job titles, output the best SOC match, add extra information about matches using the additional_info argument
+
+    ----------
     Usage
     ----------
     from soc_mapper.soc_map import SOCMapper
@@ -136,95 +118,6 @@ def load_process_soc_data(self):
         return jobtitle_soc_data
 
-    def unique_soc_job_titles(self, jobtitle_soc_data: pd.DataFrame()) -> dict:
-        """
-        Taking the dataset of job titles and which SOC they belong to - create a unique
-        dictionary where each key is a job title and the value is the SOC code.
-        There are additional words to include in the job title if at first
-        it is not unique.
- """ - - col_name_0 = "INDEXOCC NATURAL WORD ORDER" - col_name_1 = "ADD" - col_name_2 = "IND" - - jobtitle_soc_data[f"{col_name_0} and {col_name_1}"] = jobtitle_soc_data.apply( - lambda x: ( - x[col_name_0] + " " + x[col_name_1] - if pd.notnull(x[col_name_1]) - else x[col_name_0] - ), - axis=1, - ) - jobtitle_soc_data[f"{col_name_0} and {col_name_1} and {col_name_2}"] = ( - jobtitle_soc_data.apply( - lambda x: ( - x[f"{col_name_0} and {col_name_1}"] + " " + x[col_name_2] - if pd.notnull(x[col_name_2]) - else x[f"{col_name_0} and {col_name_1}"] - ), - axis=1, - ) - ) - - # Try to find a unique job title to SOC 2020 4 or 6 code mapping - job_title_2_soc6_4 = {} - for job_title, grouped_soc_data in jobtitle_soc_data.groupby(col_name_0): - if grouped_soc_data["SOC_2020_EXT"].nunique() == 1: - job_title_2_soc6_4[job_title] = ( - grouped_soc_data["SOC_2020_EXT"].unique()[0], - grouped_soc_data["SOC_2020"].unique()[0], - grouped_soc_data["SOC_2010"].unique()[0], - ) - else: - for job_title_1, grouped_soc_data_1 in grouped_soc_data.groupby( - f"{col_name_0} and {col_name_1}" - ): - if grouped_soc_data_1["SOC_2020_EXT"].nunique() == 1: - job_title_2_soc6_4[job_title_1] = ( - grouped_soc_data_1["SOC_2020_EXT"].unique()[0], - grouped_soc_data_1["SOC_2020"].unique()[0], - grouped_soc_data_1["SOC_2010"].unique()[0], - ) - else: - for ( - job_title_2, - grouped_soc_data_2, - ) in grouped_soc_data_1.groupby( - f"{col_name_0} and {col_name_1} and {col_name_2}" - ): - if grouped_soc_data_2["SOC_2020_EXT"].nunique() == 1: - job_title_2_soc6_4[job_title_2] = ( - grouped_soc_data_2["SOC_2020_EXT"].unique()[0], - grouped_soc_data_2["SOC_2020"].unique()[0], - grouped_soc_data_2["SOC_2010"].unique()[0], - ) - - return job_title_2_soc6_4 - - def unique_soc_descriptions(self, soc_data: pd.DataFrame()) -> dict: - """ - Taking the dataset of SOC and their descriptions - create a unique - dictionary where each key is a description and the value is the SOC code. 
- """ - soc_data["SUB-UNIT GROUP DESCRIPTIONS"] = soc_data[ - "SUB-UNIT GROUP DESCRIPTIONS" - ].apply(lambda x: x.replace(" n.e.c.", "").replace(" n.e.c", "")) - - dd = soc_data[ - ["SUB-UNIT GROUP DESCRIPTIONS", "SOC_2020_EXT", "SOC_2020", "SOC_2010"] - ].drop_duplicates() - - # There can be multiple 2010 codes for each 6 digit, so just output the most common - soc_desc_2_code = {} - for description, soc_info in dd.groupby("SUB-UNIT GROUP DESCRIPTIONS"): - soc_2020_6 = soc_info["SOC_2020_EXT"].value_counts().index[0] - soc_2020_4 = soc_info["SOC_2020"].value_counts().index[0] - soc_2010 = list(soc_info["SOC_2010"].unique()) - soc_desc_2_code[description] = (soc_2020_6, soc_2020_4, soc_2010) - - return soc_desc_2_code - def embed_texts( self, texts: list, @@ -259,13 +152,11 @@ def load(self, save_embeds=False, job_titles=True): ) if job_titles: - self.job_title_2_soc6_4 = self.unique_soc_job_titles(self.jobtitle_soc_data) + self.job_title_2_soc6_4 = unique_soc_job_titles(self.jobtitle_soc_data) else: # This is a bit of an appended use case - so I've called the variable the same # so it fits in with the rest of the pipeline - self.job_title_2_soc6_4 = self.unique_soc_descriptions( - self.jobtitle_soc_data - ) + self.job_title_2_soc6_4 = unique_soc_descriptions(self.jobtitle_soc_data) embeddings_path = os.path.join( self.embeddings_output_dir, "soc_job_embeddings.json" diff --git a/nlp_link/soc_mapper/soc_map_utils.py b/nlp_link/soc_mapper/soc_map_utils.py index d7d6389..86479b1 100644 --- a/nlp_link/soc_mapper/soc_map_utils.py +++ b/nlp_link/soc_mapper/soc_map_utils.py @@ -24,7 +24,13 @@ def load_job_title_soc() -> pd.DataFrame(): def process_job_title_soc(jobtitle_soc_data: pd.DataFrame()) -> pd.DataFrame(): - # Standardise the column names for use in soc_map.py + """Standardise the column names for use in soc_map.py + Args: + jobtitle_soc_data (pd.DataFrame): the raw ONS SOC coding index dataset + Returns: + pd.DataFrame: the cleaned ONS SOC coding index dataset + """ + jobtitle_soc_data = jobtitle_soc_data.rename( columns={ soc_mapper_config["soc_data"]["soc_2020_ext_col"]: "SOC_2020_EXT", @@ -46,6 +52,111 @@ def process_job_title_soc(jobtitle_soc_data: pd.DataFrame()) -> pd.DataFrame(): return jobtitle_soc_data +def unique_soc_job_titles(jobtitle_soc_data: pd.DataFrame()) -> dict: + """ + Taking the dataset of job titles and which SOC they belong to - create a unique + dictionary where each key is a job title and the value is the SOC code. + There are additional words to include in the job title if at first + it is not unique. + + Args: + jobtitle_soc_data (pd.DataFrame): the cleaned ONS SOC coding index dataset. + + Returns: + dict: A dictionary where each key is a job title and the value is the SOC code. 
+ + """ + + col_name_0 = "INDEXOCC NATURAL WORD ORDER" + col_name_1 = "ADD" + col_name_2 = "IND" + + jobtitle_soc_data[f"{col_name_0} and {col_name_1}"] = jobtitle_soc_data.apply( + lambda x: ( + x[col_name_0] + " " + x[col_name_1] + if pd.notnull(x[col_name_1]) + else x[col_name_0] + ), + axis=1, + ) + jobtitle_soc_data[f"{col_name_0} and {col_name_1} and {col_name_2}"] = ( + jobtitle_soc_data.apply( + lambda x: ( + x[f"{col_name_0} and {col_name_1}"] + " " + x[col_name_2] + if pd.notnull(x[col_name_2]) + else x[f"{col_name_0} and {col_name_1}"] + ), + axis=1, + ) + ) + + # Try to find a unique job title to SOC 2020 4 or 6 code mapping + job_title_2_soc6_4 = {} + for job_title, grouped_soc_data in jobtitle_soc_data.groupby(col_name_0): + if grouped_soc_data["SOC_2020_EXT"].nunique() == 1: + job_title_2_soc6_4[job_title] = ( + grouped_soc_data["SOC_2020_EXT"].unique()[0], + grouped_soc_data["SOC_2020"].unique()[0], + grouped_soc_data["SOC_2010"].unique()[0], + ) + else: + for job_title_1, grouped_soc_data_1 in grouped_soc_data.groupby( + f"{col_name_0} and {col_name_1}" + ): + if grouped_soc_data_1["SOC_2020_EXT"].nunique() == 1: + job_title_2_soc6_4[job_title_1] = ( + grouped_soc_data_1["SOC_2020_EXT"].unique()[0], + grouped_soc_data_1["SOC_2020"].unique()[0], + grouped_soc_data_1["SOC_2010"].unique()[0], + ) + else: + for ( + job_title_2, + grouped_soc_data_2, + ) in grouped_soc_data_1.groupby( + f"{col_name_0} and {col_name_1} and {col_name_2}" + ): + if grouped_soc_data_2["SOC_2020_EXT"].nunique() == 1: + job_title_2_soc6_4[job_title_2] = ( + grouped_soc_data_2["SOC_2020_EXT"].unique()[0], + grouped_soc_data_2["SOC_2020"].unique()[0], + grouped_soc_data_2["SOC_2010"].unique()[0], + ) + + return job_title_2_soc6_4 + + +def unique_soc_descriptions(soc_data: pd.DataFrame()) -> dict: + """ + Taking the dataset of SOC and their descriptions - create a unique + dictionary where each key is a description and the value is the SOC code. + + Args: + soc_data (pd.DataFrame): the cleaned ONS SOC coding index dataset. + + Returns: + dict: A dictionary where each key is a SOC description and the value is the SOC code. 
+ + """ + soc_data["SUB-UNIT GROUP DESCRIPTIONS"] = soc_data[ + "SUB-UNIT GROUP DESCRIPTIONS" + ].apply(lambda x: x.replace(" n.e.c.", "").replace(" n.e.c", "")) + + dd = soc_data[ + ["SUB-UNIT GROUP DESCRIPTIONS", "SOC_2020_EXT", "SOC_2020", "SOC_2010"] + ].drop_duplicates() + + # There can be multiple 2010 codes for each 6 digit, so just output the most common + soc_desc_2_code = {} + for description, soc_info in dd.groupby("SUB-UNIT GROUP DESCRIPTIONS"): + soc_2020_6 = soc_info["SOC_2020_EXT"].value_counts().index[0] + soc_2020_4 = soc_info["SOC_2020"].value_counts().index[0] + soc_2010 = list(soc_info["SOC_2010"].unique()) + soc_desc_2_code[description] = (soc_2020_6, soc_2020_4, soc_2010) + + return soc_desc_2_code + + major_places = [ "Central London", "Midlands", @@ -95,7 +206,9 @@ def process_job_title_soc(jobtitle_soc_data: pd.DataFrame()) -> pd.DataFrame(): ] -def job_title_cleaner(text, lower_case_all_end_words=lower_case_all_end_words): +def job_title_cleaner( + text: str, lower_case_all_end_words: list = lower_case_all_end_words +) -> str: """ Will apply a bunch of cleaning to a job title - removing certain things (locations or work type after a "-") @@ -113,6 +226,14 @@ def job_title_cleaner(text, lower_case_all_end_words=lower_case_all_end_words): 'Retail Customer Service CSM 16hrs' -> 'Retail Customer Service CSM' 'Bike Delivery Driver - London' -> 'Bike Delivery Driver' 'Fulfillment Associate - £1000 Sign on Bonus!' -> 'Fulfillment Associate' + + Args: + text (str): the text of the job title you want to clean + lower_case_all_end_words (list): a list of all the words to clean out + if they are at the end of the job title. + Returns: + str: the cleaned job title + """ if text: text = str(text)