diff --git a/README.md b/README.md
index 1b9e43e..f654667 100644
--- a/README.md
+++ b/README.md
@@ -27,6 +27,10 @@ More details of the steps included in this project, and running instructions, ca
 3. [skills_extraction](skills_taxonomy_v2/pipeline/skills_extraction/README.md) - Extracting skills from skill sentences.
 4. [skills_taxonomy](skills_taxonomy_v2/pipeline/skills_taxonomy/README.md) - Building the skills taxonomy from extracted skills.
 
+### Extract skills example
+
+A simple example of extracting skills from a toy dataset of job adverts is given in [the examples folder](skills_taxonomy_v2/examples/extract_skills.py).
+
 ### Analysis
 
 This repository also contains various pieces of analysis of the taxonomy. These are discussed in the main analysis [README file](skills_taxonomy_v2/analysis/README.md).
diff --git a/skills_taxonomy_v2/examples/Extract Skills.py b/skills_taxonomy_v2/examples/Extract Skills.py
new file mode 100644
index 0000000..3d6aa6a
--- /dev/null
+++ b/skills_taxonomy_v2/examples/Extract Skills.py
@@ -0,0 +1,93 @@
+# ---
+# jupyter:
+#   jupytext:
+#     cell_metadata_filter: -all
+#     comment_magics: true
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#     jupytext_version: 1.11.4
+#   kernelspec:
+#     display_name: Python 3 (ipykernel)
+#     language: python
+#     name: python3
+# ---
+
+# %%
+# cd ../..
+
+# %%
+import json
+
+import pandas as pd
+import matplotlib.pyplot as plt
+
+# %%
+from skills_taxonomy_v2.examples.extract_skills import (
+    load_prerequisites, split_skills, predict_skill_sents, reduce_embeddings, get_skill_name, cluster_embeddings
+)
+from skills_taxonomy_v2.pipeline.skills_extraction.get_sentence_embeddings import get_embeddings
+
+# %%
+sent_classifier_model_dir = 'outputs/sentence_classifier/models/2021_08_16.pkl'
+job_adverts_file = 'skills_taxonomy_v2/examples/job_advert_examples.txt'
+
+# Parameters - these will need tweaking depending on your input data
+reduction_n_neighbors = 6
+reduction_min_dist = 0.0
+clustering_eps = 1
+clustering_min_samples = 1
+
+# %%
+nlp, bert_vectorizer, sent_classifier = load_prerequisites(sent_classifier_model_dir)
+
+# Load your job advert texts; a list of dicts with the keys "full_text" and "job_id"
+with open(job_adverts_file) as f:
+    job_adverts = json.load(f)
+
+# Run the pipeline to extract skills
+all_job_ids, all_sentences = split_skills(job_adverts)
+skill_sentences_dict = predict_skill_sents(sent_classifier, all_job_ids, all_sentences)
+sentence_embeddings, original_sentences = get_embeddings(skill_sentences_dict, nlp, bert_vectorizer)
+sentences_data_df = reduce_embeddings(sentence_embeddings, original_sentences, reduction_n_neighbors, reduction_min_dist)
+sentences_clustered = cluster_embeddings(sentences_data_df, clustering_eps, clustering_min_samples)
+
+# %% [markdown]
+# ## Look at skills extracted
+
+# %%
+skill_name_dict = sentences_clustered.groupby('cluster_number').apply(lambda x: get_skill_name(x)).to_dict()
+job_skills_dict = sentences_clustered.groupby('job id')['cluster_number'].unique().to_dict()
+
+for job_advert in job_adverts:
+    job_advert["Skills"] = [skill_name_dict[skill_num] for skill_num in job_skills_dict[job_advert['job_id']]]
+
+print(f"There are {len(skill_name_dict)} skills extracted using this data")
+for i in range(3):
+    print(f'The job advert: \n{job_adverts[i]["full_text"]} \nHas skills: \n{job_adverts[i]["Skills"]}\n')
+
+# %% [markdown]
+# ## Plot sentences coloured by which skill they are assigned to
+
+# %%
+fig, ax = plt.subplots(figsize=(8,8))
+# plot
+sentences_clustered.plot.scatter(
+    x = 'reduced_points x',
+    y = 'reduced_points y',
+    c = 'cluster_number',
+    cmap = "rainbow",
+    colorbar=False,
+    ax=ax, s=50, alpha=0.6);
+ax.xaxis.set_visible(False)
+ax.yaxis.set_visible(False)
+ax.axis('off')
+# annotate points in axis
+for idx, row in sentences_clustered.iterrows():
+    ax.annotate(row['original sentence'], (row['reduced_points x'], row['reduced_points y']))
+
+# %%
diff --git a/skills_taxonomy_v2/examples/extract_skills.py b/skills_taxonomy_v2/examples/extract_skills.py
new file mode 100644
index 0000000..92fd314
--- /dev/null
+++ b/skills_taxonomy_v2/examples/extract_skills.py
@@ -0,0 +1,210 @@
+"""
+This pipeline extracts skills from an input list of job adverts.
+
+Prerequisites:
+- You have access to our S3 bucket or have the sentence classifier pkl file stored locally
+- You have installed the environment requirements for this repo
+
+Notes:
+- This script is not optimised for processing 1000s of job adverts, so we recommend modifying this
+code if you want to process many job adverts.
+- You are advised to tune the dimension reduction and clustering parameters, as these are
+very sensitive to the input data.
+- The variable sentences_clustered contains the information needed for plotting sentences in 2D
+and for finding which specific sentences were assigned to each skill.
+
+Running this script with the default job_advert_examples.txt file will print out:
+
+The job advert:
+This is a sentence about the company and the salary. We require applicants to have skills in Microsoft Excel.
+Has skills:
+['microsoft-excel-require']
+
+The job advert:
+We want Microsoft Excel skills for this role. Communication skills are also essential.
+Has skills:
+['microsoft-excel-require', 'communication-important-essential']
+
+The job advert:
+This role has a very competitive starting salary. Skills for good communication are very important.
+Has skills:
+['communication-important-essential']
+
+"""
+
+from skills_taxonomy_v2.pipeline.sentence_classifier.utils import split_sentence
+from skills_taxonomy_v2.pipeline.sentence_classifier.sentence_classifier import (
+    SentenceClassifier, BertVectorizer
+)
+from skills_taxonomy_v2.pipeline.skills_extraction.get_sentence_embeddings import get_embeddings
+from skills_taxonomy_v2.pipeline.skills_extraction.cluster_embeddings import ClusterEmbeddings
+
+import umap.umap_ as umap
+import spacy
+import pandas as pd
+
+from tqdm import tqdm
+from collections import Counter, defaultdict
+import pickle
+import json
+import os
+import logging
+logger = logging.getLogger(__name__)
+
+sent_classifier_model_dir = 'outputs/sentence_classifier/models/2021_08_16.pkl'
+job_adverts_file = 'skills_taxonomy_v2/examples/job_advert_examples.txt'
+
+# Parameters - these will need tweaking depending on your input data
+reduction_n_neighbors = 6
+reduction_min_dist = 0.0
+clustering_eps = 1
+clustering_min_samples = 1
+
+def load_prerequisites(sent_classifier_model_dir):
+
+    nlp = spacy.load("en_core_web_sm")
+    bert_vectorizer = BertVectorizer(
+        bert_model_name="sentence-transformers/all-MiniLM-L6-v2",
+    )
+    bert_vectorizer.fit()
+
+    # Load the sentence classifier (either locally or from S3)
+    sent_classifier = SentenceClassifier(bert_model_name="bert-base-uncased")
+
+    if os.path.exists(os.path.join(os.getcwd(), sent_classifier_model_dir)):
+        # pickle.load needs a file object, not a file path
+        with open(sent_classifier_model_dir, "rb") as f:
+            sent_classifier.classifier = pickle.load(f)
+        sent_classifier.load_bert()
+    else:
+        logger.info("Sentence classifier not found locally, so trying to find it on S3")
+        sent_classifier.load_model(sent_classifier_model_dir.split('/')[-1])
+
+    return nlp, bert_vectorizer, sent_classifier
+
+def split_skills(job_adverts):
+
+    all_job_ids = []
+    all_sentences = []
+    for job_advert in job_adverts:
+        job_id, sentences = split_sentence(job_advert, min_length=30)
+        all_job_ids += [job_id] * len(sentences)
+        all_sentences += sentences
+
+    return all_job_ids, all_sentences
+
+def predict_skill_sents(sent_classifier, all_job_ids, all_sentences):
+
+    sentences_vec = sent_classifier.transform(all_sentences)
+    sentences_pred = sent_classifier.predict(sentences_vec)
+
+    skill_sentences_dict = defaultdict(list)
+    for job_id, sent, pred in zip(all_job_ids, all_sentences, sentences_pred):
+        if pred == 1:
+            skill_sentences_dict[job_id].append(sent)
+
+    return skill_sentences_dict
+
+def reduce_embeddings(sentence_embeddings, original_sentences, reduction_n_neighbors, reduction_min_dist):
+
+    reducer_class = umap.UMAP(
+        n_neighbors=reduction_n_neighbors,
+        min_dist=reduction_min_dist,
+        random_state=42,
+        n_components=2,
+    )
+    reducer_class.fit([e for _, _, _, e in sentence_embeddings])
+
+    sent_thresh = 250
+    mask_seq = "[MASK]"
+    embedding_list = []
+    words_list = []
+    sentence_list = []
+    jobid_list = []
+    sentid_list = []
+    for job_id, sent_id, words, embedding in sentence_embeddings:
+        original_sentence = original_sentences[sent_id]
+        if len(original_sentence) < sent_thresh:
+            embedding_list.append(embedding)
+            words_list.append(words.replace(mask_seq, "").split())
+            sentence_list.append(original_sentence)
+            jobid_list.append(job_id)
+            sentid_list.append(sent_id)
+
+    sentence_embeddings_red = reducer_class.transform(embedding_list).tolist()
+    sentences_data = {
+        "description": words_list,
+        "original sentence": sentence_list,
+        "job id": jobid_list,
+        "sentence id": sentid_list,
+        "embedding": sentence_embeddings_red,
+    }
+
+    sentences_data_df = pd.DataFrame(sentences_data)
+
+    sentences_data_df["reduced_points x"] = sentences_data_df["embedding"].apply(lambda x: x[0])
+    sentences_data_df["reduced_points y"] = sentences_data_df["embedding"].apply(lambda x: x[1])
+    sentences_data_df["original sentence length"] = sentences_data_df["original sentence"].apply(lambda x: len(x))
+
+    return sentences_data_df
+
+def get_skill_name(skill_data):
+    """
+    Use the most common words in the skill descriptions as a way to name skills
+    """
+    common_description_words = Counter([v for d in skill_data['description'] for v in d]).most_common(3)
+    return '-'.join([c[0] for c in common_description_words])
+
+def cluster_embeddings(sentences_data_df, clustering_eps, clustering_min_samples):
+
+    cluster_embeddings = ClusterEmbeddings(
+        dbscan_eps=clustering_eps,
+        dbscan_min_samples=clustering_min_samples,
+        train_cluster_n=len(sentences_data_df),
+    )
+    _ = cluster_embeddings.get_clusters(sentences_data_df)
+    sentences_clustered = cluster_embeddings.sentences_data_short_sample
+
+    return sentences_clustered
+
+if __name__ == '__main__':
+
+    logger.info("Loading pre-trained models and data ...")
+
+    # Load pre-trained models needed for this pipeline
+    nlp, bert_vectorizer, sent_classifier = load_prerequisites(sent_classifier_model_dir)
+
+    # Load your job advert texts; a list of dicts with the keys "full_text" and "job_id"
+    with open(job_adverts_file) as f:
+        job_adverts = json.load(f)
+
+    # Run the pipeline to extract skills
+    logger.info("Split the sentences ...")
+    all_job_ids, all_sentences = split_skills(job_adverts)
+
+    logger.info("Predict skill sentences ...")
+    skill_sentences_dict = predict_skill_sents(sent_classifier, all_job_ids, all_sentences)
+
+    logger.info("Embed skill sentences ...")
+    sentence_embeddings, original_sentences = get_embeddings(skill_sentences_dict, nlp, bert_vectorizer)
+
+    logger.info("Reduce embeddings ...")
+    sentences_data_df = reduce_embeddings(sentence_embeddings, original_sentences, reduction_n_neighbors, reduction_min_dist)
+
+    logger.info("Cluster the reduced embeddings ...")
+    sentences_clustered = cluster_embeddings(sentences_data_df, clustering_eps, clustering_min_samples)
+
+    logger.info("Summarise skill information ...")
+    skill_name_dict = sentences_clustered.groupby('cluster_number').apply(lambda x: get_skill_name(x)).to_dict()
+    job_skills_dict = sentences_clustered.groupby('job id')['cluster_number'].unique().to_dict()
+
+    for job_advert in job_adverts:
+        job_advert["Skills"] = [skill_name_dict[skill_num] for skill_num in job_skills_dict[job_advert['job_id']]]
+
+    print(f"There are {len(skill_name_dict)} skills extracted using this data")
+    for i in range(3):
+        print(f'The job advert: \n{job_adverts[i]["full_text"]} \nHas skills: \n{job_adverts[i]["Skills"]}\n')
+
diff --git a/skills_taxonomy_v2/examples/job_advert_examples.txt b/skills_taxonomy_v2/examples/job_advert_examples.txt
new file mode 100644
index 0000000..9695037
--- /dev/null
+++ b/skills_taxonomy_v2/examples/job_advert_examples.txt
@@ -0,0 +1 @@
+[{"full_text": "This is a sentence about the company and the salary. We require applicants to have skills in Microsoft Excel.", "job_id": 0 }, {"full_text": "We want Microsoft Excel skills for this role. Communication skills are also essential.", "job_id": 1 }, {"full_text": "This role has a very competitive starting salary. Skills for good communication are very important.", "job_id": 2 }]
\ No newline at end of file
diff --git a/skills_taxonomy_v2/pipeline/skills_extraction/cluster_embeddings.py b/skills_taxonomy_v2/pipeline/skills_extraction/cluster_embeddings.py
index 4270bc2..68ed485 100644
--- a/skills_taxonomy_v2/pipeline/skills_extraction/cluster_embeddings.py
+++ b/skills_taxonomy_v2/pipeline/skills_extraction/cluster_embeddings.py
@@ -52,14 +52,15 @@ def load_process_sentence_data(s3, reduced_embeddings_paths):
 class ClusterEmbeddings():
     def __init__(
         self,
-        dbscan_eps,
-        dbscan_min_samples,
-        max_length,
-        train_cluster_n,
-        train_cluster_rand_seed,
-        small_cluster_size_threshold,
-        max_centroid_dist_before_merge
+        dbscan_eps=0.01,
+        dbscan_min_samples=4,
+        max_length=100,
+        train_cluster_n=300000,
+        train_cluster_rand_seed=42,
+        small_cluster_size_threshold=10,
+        max_centroid_dist_before_merge=0.05
     ):
+
         self.dbscan_eps = dbscan_eps
         self.dbscan_min_samples = dbscan_min_samples
         self.max_length = max_length
@@ -112,7 +113,8 @@ def merge_clusters(self):
         logger.info(f"Merging small clusters where suitable ...")
         cluster_centroids = self.sentences_data_short_sample.groupby('cluster_number')["embedding"].apply(
             lambda x: np.mean(x.tolist(), axis=0).tolist()).to_dict()
-        _ = cluster_centroids.pop(-1)
+        if -1 in cluster_centroids:
+            _ = cluster_centroids.pop(-1)
 
         all_cluster_size_dict = {k:v for k,v in Counter(clustering_number).items() if k!=-1}
         small_cluster_size_dict = {k:v for k,v in all_cluster_size_dict.items() if v < self.small_cluster_size_threshold}
@@ -160,7 +162,8 @@ def predict_clusters(self, sentences_data):
         # Re-calculate cluster centroids
         merged_cluster_centroids = self.sentences_data_short_sample.groupby('Merged clusters')["embedding"].apply(
             lambda x: np.mean(x.tolist(), axis=0).tolist()).to_dict()
-        _ = merged_cluster_centroids.pop(-1)
+        if -1 in merged_cluster_centroids:
+            _ = merged_cluster_centroids.pop(-1)
         # Careful of order of list when it came from a dict (can get messed up)
         merged_cluster_embeddings = list(merged_cluster_centroids.values())
         merged_cluster_nums = list(merged_cluster_centroids.keys())
diff --git a/skills_taxonomy_v2/pipeline/skills_extraction/get_sentence_embeddings.py b/skills_taxonomy_v2/pipeline/skills_extraction/get_sentence_embeddings.py
index c45be9b..3091ec3 100644
--- a/skills_taxonomy_v2/pipeline/skills_extraction/get_sentence_embeddings.py
+++ b/skills_taxonomy_v2/pipeline/skills_extraction/get_sentence_embeddings.py
@@ -59,6 +59,51 @@ with open(custom_stopwords_dir) as file:
     custom_stopwords = file.read().splitlines()
 
+def get_embeddings(data, nlp, bert_vectorizer, token_len_threshold=20, stopwords=stopwords.words(), custom_stopwords=custom_stopwords):
+    logger.info(f"Processing {len(data)} sentences...")
+    start_time = time.time()
+
+    # For each sentence mask out stop words, proper nouns etc.
+    masked_sentences = []
+    sentence_job_ids = []
+    sentence_hashes = []
+    original_sentences = {}
+
+    for job_id, sentences in tqdm(data.items()):
+        for sentence in sentences:
+            masked_sentence = process_sentence_mask(
+                sentence,
+                nlp,
+                bert_vectorizer,
+                token_len_threshold,
+                stopwords=stopwords,
+                custom_stopwords=custom_stopwords,
+            )
+            if masked_sentence.replace("[MASK]", "").replace(" ", ""):
+                # Don't include sentence if it only consists of masked words
+                masked_sentences.append(masked_sentence)
+                sentence_job_ids.append(job_id)
+                # Keep a record of the original sentence via a hashed id
+                original_sentence_id = hash(sentence)
+                sentence_hashes.append(original_sentence_id)
+                original_sentences[original_sentence_id] = sentence
+    logger.info(f"Processing sentences took {time.time() - start_time} seconds")
+
+    logger.info(f"Getting embeddings for {len(masked_sentences)} sentences...")
+    start_time = time.time()
+    # Find sentence embeddings in bulk for all masked sentences
+    masked_sentence_embeddings = bert_vectorizer.transform(masked_sentences)
+    output_tuple_list = [
+        (job_id, sent_id, sent, emb.tolist())
+        for job_id, sent_id, sent, emb in zip(
+            sentence_job_ids,
+            sentence_hashes,
+            masked_sentences,
+            masked_sentence_embeddings,
+        )
+    ]
+    logger.info(f"Getting embeddings took {time.time() - start_time} seconds")
+    return output_tuple_list, original_sentences
 
 
 def parse_arguments(parser):
     parser.add_argument(
@@ -135,49 +180,8 @@ def try_less_data(s3, bucket_name, data, output_file_dir, stop_recursion=False):
         )
         data = load_s3_data(s3, BUCKET_NAME, data_path)
 
-        logger.info(f"Processing {len(data)} sentences...")
-        start_time = time.time()
-        # For each sentence mask out stop words, proper nouns etc.
-        masked_sentences = []
-        sentence_job_ids = []
-        sentence_hashes = []
-        original_sentences = {}
-
-        for job_id, sentences in tqdm(data.items()):
-            for sentence in sentences:
-                masked_sentence = process_sentence_mask(
-                    sentence,
-                    nlp,
-                    bert_vectorizer,
-                    token_len_threshold,
-                    stopwords=stopwords.words(),
-                    custom_stopwords=custom_stopwords,
-                )
-                if masked_sentence.replace("[MASK]", "").replace(" ", ""):
-                    # Don't include sentence if it only consists of masked words
-                    masked_sentences.append(masked_sentence)
-                    sentence_job_ids.append(job_id)
-                    # Keep a record of the original sentence via a hashed id
-                    original_sentence_id = hash(sentence)
-                    sentence_hashes.append(original_sentence_id)
-                    original_sentences[original_sentence_id] = sentence
-        logger.info(f"Processing sentences took {time.time() - start_time} seconds")
-
-        logger.info(f"Getting embeddings for {len(masked_sentences)} sentences...")
-        start_time = time.time()
-        # Find sentence embeddings in bulk for all masked sentences
-        masked_sentence_embeddings = bert_vectorizer.transform(masked_sentences)
-        output_tuple_list = [
-            (job_id, sent_id, sent, emb.tolist())
-            for job_id, sent_id, sent, emb in zip(
-                sentence_job_ids,
-                sentence_hashes,
-                masked_sentences,
-                masked_sentence_embeddings,
-            )
-        ]
-        logger.info(f"Getting embeddings took {time.time() - start_time} seconds")
-
+        output_tuple_list, original_sentences = get_embeddings(data, nlp, bert_vectorizer, token_len_threshold=token_len_threshold, stopwords=stopwords.words(), custom_stopwords=custom_stopwords)
+
         # Save the output in a folder with a similar naming structure to the input
         data_dir = os.path.relpath(data_path, skill_sentences_dir)
         output_file_dir = os.path.join(
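
Usage note for reviewers: to try the new example on your own adverts rather than the bundled toy file, the input only needs to be a JSON list of dicts with "full_text" and "job_id" keys, matching job_advert_examples.txt above. A minimal sketch (the my_job_adverts.json path and the advert texts below are made up purely for illustration) could be:

    # Sketch: build an input file in the same format as job_advert_examples.txt,
    # then point job_adverts_file in skills_taxonomy_v2/examples/extract_skills.py at it
    # and run: python skills_taxonomy_v2/examples/extract_skills.py
    import json

    my_job_adverts = [
        {"full_text": "We are looking for someone with strong Python and SQL skills.", "job_id": 0},
        {"full_text": "Experience managing stakeholders and communicating results is essential.", "job_id": 1},
    ]

    # Hypothetical path; any local path works as long as job_adverts_file is updated to match
    with open("my_job_adverts.json", "w") as f:
        json.dump(my_job_adverts, f)

As the script's docstring notes, the dimension reduction and clustering parameters at the top of extract_skills.py will almost certainly need re-tuning for anything larger than the toy dataset.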