From 13a52e73807df5fbc7beb851c8bea67774c26a0f Mon Sep 17 00:00:00 2001 From: RFOxbury Date: Fri, 20 Dec 2024 18:21:36 +0000 Subject: [PATCH] Add BIT France analysis --- .gitignore | 4 +- .../pipeline/bit_france/README.md | 16 ++ .../pipeline/bit_france/__init__.py | 0 .../pipeline/bit_france/analysis/__init__.py | 0 .../bit_france/analysis/assign_roles.py | 35 +++ .../analysis/cluster_analysis_utils.py | 95 ++++++++ .../bit_france/analysis/convert_txt.py | 39 ++++ .../bit_france/analysis/convert_txt_df.py | 79 +++++++ .../analysis/get_smallest_noise_prop.py | 53 +++++ .../bit_france/analysis/prep_outputs.py | 153 +++++++++++++ .../analysis/prep_outputs_for_report.py | 63 ++++++ .../bit_france/analysis/topic_modelling.py | 199 +++++++++++++++++ .../bit_france/report/bit_france_report.qmd | 206 ++++++++++++++++++ .../pipeline/name_clusters.py | 40 ++-- dsp_interview_transcripts/utils/repr_docs.py | 11 +- pyproject.toml | 6 +- 16 files changed, 972 insertions(+), 27 deletions(-) create mode 100644 dsp_interview_transcripts/pipeline/bit_france/README.md create mode 100644 dsp_interview_transcripts/pipeline/bit_france/__init__.py create mode 100644 dsp_interview_transcripts/pipeline/bit_france/analysis/__init__.py create mode 100644 dsp_interview_transcripts/pipeline/bit_france/analysis/assign_roles.py create mode 100644 dsp_interview_transcripts/pipeline/bit_france/analysis/cluster_analysis_utils.py create mode 100644 dsp_interview_transcripts/pipeline/bit_france/analysis/convert_txt.py create mode 100644 dsp_interview_transcripts/pipeline/bit_france/analysis/convert_txt_df.py create mode 100644 dsp_interview_transcripts/pipeline/bit_france/analysis/get_smallest_noise_prop.py create mode 100644 dsp_interview_transcripts/pipeline/bit_france/analysis/prep_outputs.py create mode 100644 dsp_interview_transcripts/pipeline/bit_france/analysis/prep_outputs_for_report.py create mode 100644 dsp_interview_transcripts/pipeline/bit_france/analysis/topic_modelling.py create mode 100644 dsp_interview_transcripts/pipeline/bit_france/report/bit_france_report.qmd diff --git a/.gitignore b/.gitignore index bf3fc2e..6389739 100644 --- a/.gitignore +++ b/.gitignore @@ -176,4 +176,6 @@ outputs/* # data files *.csv poetry.lock -poetry.lock + +# quarto +*.html diff --git a/dsp_interview_transcripts/pipeline/bit_france/README.md b/dsp_interview_transcripts/pipeline/bit_france/README.md new file mode 100644 index 0000000..da67022 --- /dev/null +++ b/dsp_interview_transcripts/pipeline/bit_france/README.md @@ -0,0 +1,16 @@ + +The scripts run in this order: + +1. `analysis/convert_txt.py` + +2. `analysis/convert_txt_df.py` + +3. `analysis/assign_roles.py` + +4. `analysis/topic_modelling.py` + +5. `analysis/prep_outputs.py` + +6. `../name_clusters.py` (yes, this script is outside the `bit_france/` directory) + +7. 
`analysis/prep_outputs_for_report.py` diff --git a/dsp_interview_transcripts/pipeline/bit_france/__init__.py b/dsp_interview_transcripts/pipeline/bit_france/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/dsp_interview_transcripts/pipeline/bit_france/analysis/__init__.py b/dsp_interview_transcripts/pipeline/bit_france/analysis/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/dsp_interview_transcripts/pipeline/bit_france/analysis/assign_roles.py b/dsp_interview_transcripts/pipeline/bit_france/analysis/assign_roles.py new file mode 100644 index 0000000..d3038f8 --- /dev/null +++ b/dsp_interview_transcripts/pipeline/bit_france/analysis/assign_roles.py @@ -0,0 +1,35 @@ +""" +Assign interview roles based on mapping table +""" +import pandas as pd + +from dsp_interview_transcripts import PROJECT_DIR +from dsp_interview_transcripts import logger + + +def assign_role(row): + # This rule is an exception because for some reason this file name doesn't parse + if "E25" in row["file_name"] and row["speaker_id"] == 0: + return "informant" + if row["informant"] == 0 and row["speaker_id"] == 0: + return "informant" + elif row["informant"] == 1 and row["speaker_id"] == 1: + return "informant" + else: + return "other" + + +if __name__ == "__main__": + mapping = pd.read_csv(f"{PROJECT_DIR}/data/bit_france/converted/file_mapping.csv") + data = pd.read_csv(f"{PROJECT_DIR}/data/bit_france/converted/combined_transcripts.csv") + + data_with_mapping = data.merge(mapping[["file_name", "informant"]], on="file_name", how="left") + + data_with_mapping["role"] = data_with_mapping.apply(assign_role, axis=1) + roles = data_with_mapping[data_with_mapping["role"] == "informant"][["file_name", "speaker_id"]].drop_duplicates() + for idx, row in roles.iterrows(): + logger.info(f"File: {row['file_name']} Speaker ID: {row['speaker_id']}") + + data_with_mapping.to_csv( + f"{PROJECT_DIR}/data/bit_france/converted/combined_transcripts_with_roles.csv", index=False + ) diff --git a/dsp_interview_transcripts/pipeline/bit_france/analysis/cluster_analysis_utils.py b/dsp_interview_transcripts/pipeline/bit_france/analysis/cluster_analysis_utils.py new file mode 100644 index 0000000..1bf8702 --- /dev/null +++ b/dsp_interview_transcripts/pipeline/bit_france/analysis/cluster_analysis_utils.py @@ -0,0 +1,95 @@ +import re + +from collections import defaultdict +from typing import Dict +from typing import Iterator + +import numpy as np + +from sklearn.feature_extraction.text import TfidfVectorizer + + +def remove_non_alphanumeric(text: str) -> str: + return re.sub(r"[^0-9a-zA-Z ]+", "", text) + + +def simple_tokenizer(text: str) -> Iterator[str]: + return [token.strip() for token in text.split(" ") if len(token) > 0] + + +def cluster_texts(documents: Iterator[str], cluster_labels: Iterator) -> Dict: + """ + Creates a large text string for each cluster, by joining up the + text strings (documents) belonging to the same cluster + Args: + documents: A list of text strings + cluster_labels: A list of cluster labels, indicating the membership of the text strings + Returns: + A dictionary where keys are cluster labels, and values are cluster text documents + """ + + assert len(documents) == len(cluster_labels) + doc_type = type(documents[0]) + + cluster_text_dict = defaultdict(doc_type) + for i, doc in enumerate(documents): + if doc_type is str: + cluster_text_dict[cluster_labels[i]] += doc + " " + elif doc_type is list: + cluster_text_dict[cluster_labels[i]] += doc + return cluster_text_dict + + +def 
cluster_keywords( + documents: Iterator[str], + cluster_labels: Iterator[int], + n: int = 10, + tokenizer=simple_tokenizer, + max_df: float = 0.90, + min_df: float = 0.01, + Vectorizer=TfidfVectorizer, +) -> Dict: + """ + Generates keywords that characterise the cluster, using the specified Vectorizer + Args: + documents: List of (preprocessed) text documents + cluster_labels: List of integer cluster labels + n: Number of top keywords to return + Vectorizer: Vectorizer object to use (eg, TfidfVectorizer, CountVectorizer) + tokenizer: Function to use to tokenise the input documents; by default splits the document into words + Returns: + Dictionary that maps cluster integer labels to a list of keywords + """ + + # Define vectorizer + vectorizer = Vectorizer( + analyzer="word", + tokenizer=tokenizer, + preprocessor=lambda x: x, + token_pattern=None, + max_df=max_df, + min_df=min_df, + max_features=10000, + ) + + # Create cluster text documents + cluster_documents = cluster_texts(documents, cluster_labels) + unique_cluster_labels = list(cluster_documents.keys()) + + # Apply the vectorizer + token_score_matrix = vectorizer.fit_transform(list(cluster_documents.values())) + + # Create a token lookup dictionary + id_to_token = dict(zip(list(vectorizer.vocabulary_.values()), list(vectorizer.vocabulary_.keys()))) + + # For each cluster, check the top n tokens + top_cluster_tokens = {} + for i in range(token_score_matrix.shape[0]): + # Get the cluster feature vector + x = token_score_matrix[i, :].todense() + # Find the indices of the top n tokens + x = list(np.flip(np.argsort(np.array(x)))[0])[0:n] + # Find the tokens corresponding to the top n indices + top_cluster_tokens[unique_cluster_labels[i]] = [id_to_token[j] for j in x] + + return top_cluster_tokens diff --git a/dsp_interview_transcripts/pipeline/bit_france/analysis/convert_txt.py b/dsp_interview_transcripts/pipeline/bit_france/analysis/convert_txt.py new file mode 100644 index 0000000..1b766eb --- /dev/null +++ b/dsp_interview_transcripts/pipeline/bit_france/analysis/convert_txt.py @@ -0,0 +1,39 @@ +import os + +from pathlib import Path + +from docx import Document + +from dsp_interview_transcripts import PROJECT_DIR + + +def convert_docx_to_txt(input_dir, output_dir): + os.makedirs(output_dir, exist_ok=True) + + for file_name in os.listdir(input_dir): + if file_name.endswith(".docx"): + input_path = os.path.join(input_dir, file_name) + output_file_name = os.path.splitext(file_name)[0] + ".txt" + output_path = os.path.join(output_dir, output_file_name) + + try: + doc = Document(input_path) + with open(output_path, "w", encoding="utf-8") as txt_file: + for paragraph in doc.paragraphs: + txt_file.write(paragraph.text + "\n") + print(f"Converted: {file_name} -> {output_file_name}") + except Exception as e: + print(f"Failed to convert {file_name}: {e}") + + +if __name__ == "__main__": + + for input_directory in [ + f"{PROJECT_DIR}/data/bit_france/Salariés", + f"{PROJECT_DIR}/data/bit_france/Elus", + f"{PROJECT_DIR}/data/bit_france/Décideurs", + f"{PROJECT_DIR}/data/bit_france/Médecins du travail", + ]: + profession = input_directory.split("/")[-1] + output_directory = PROJECT_DIR / f"data/bit_france/converted/{profession}" + convert_docx_to_txt(input_directory, output_directory) diff --git a/dsp_interview_transcripts/pipeline/bit_france/analysis/convert_txt_df.py b/dsp_interview_transcripts/pipeline/bit_france/analysis/convert_txt_df.py new file mode 100644 index 0000000..f994ec7 --- /dev/null +++ 
b/dsp_interview_transcripts/pipeline/bit_france/analysis/convert_txt_df.py @@ -0,0 +1,79 @@ +import os +import re + +import pandas as pd + +from dsp_interview_transcripts import PROJECT_DIR + + +def process_text_files(input_dir, output_csv, profession): + data = [] # List to store rows for the DataFrame + + # Iterate through all files in the input directory + for file_name in os.listdir(input_dir): + if file_name.endswith(".txt"): + file_path = os.path.join(input_dir, file_name) + + # Read the file content + with open(file_path, "r", encoding="utf-8") as file: + content = file.read() + + # Use regex to extract speaker ID and their spoken text + # Pattern 1: Speaker [\d]\n- Text + pattern1 = re.findall(r"Speaker (\d)\n- (.*?)\n", content, re.DOTALL) + + # Pattern 2: Speaker [\d] [timestamp] - Text + pattern2 = re.findall(r"Speaker (\d)\s+\d{2}:\d{2}\s+-\s+(.*?)\n", content, re.DOTALL) + + turns = pattern1 + pattern2 + + # Add each turn as a row in the DataFrame + for speaker_id, text in turns: + data.append( + { + "speaker_id": int(speaker_id), + "text": text.strip(), + "file_name": file_name, + "profession": profession, + } + ) + + # Create a DataFrame from the collected data + df = pd.DataFrame(data, columns=["speaker_id", "text", "file_name", "profession"]) + + # Save the DataFrame to a CSV file + df.to_csv(output_csv, index=False, encoding="utf-8") + + print(f"Processed files saved to {output_csv}") + + +if __name__ == "__main__": + + input_dirs = [ + f"{PROJECT_DIR}/data/bit_france/converted/Salariés", + f"{PROJECT_DIR}/data/bit_france/converted/Elus", + f"{PROJECT_DIR}/data/bit_france/converted/Décideurs", + f"{PROJECT_DIR}/data/bit_france/converted/Médecins du travail", + ] + + for input_directory in input_dirs: + profession = input_directory.split("/")[-1] + output_directory = PROJECT_DIR / f"data/bit_france/converted/{profession}/output.csv" + + process_text_files(input_directory, output_directory, profession) + + dataframes = [] + + for directory in input_dirs: + file_path = os.path.join(directory, "output.csv") + if os.path.exists(file_path): # Ensure the file exists + df = pd.read_csv(file_path) + dataframes.append(df) + else: + print(f"File not found: {file_path}") + + combined_df = pd.concat(dataframes, ignore_index=True) + output_path = f"{PROJECT_DIR}/data/bit_france/converted/combined_transcripts.csv" + combined_df.to_csv(output_path, index=False) + + print(f"Combined CSV saved to {output_path}") diff --git a/dsp_interview_transcripts/pipeline/bit_france/analysis/get_smallest_noise_prop.py b/dsp_interview_transcripts/pipeline/bit_france/analysis/get_smallest_noise_prop.py new file mode 100644 index 0000000..86fdf15 --- /dev/null +++ b/dsp_interview_transcripts/pipeline/bit_france/analysis/get_smallest_noise_prop.py @@ -0,0 +1,53 @@ +import os +import re + +from dsp_interview_transcripts import PROJECT_DIR + + +def find_file_with_smallest_number(directory): + """ + Finds the file ending in '.txt' within the given directory that contains the smallest float number. + + Parameters: + directory (str): The path to the directory containing the '.txt' files. + + Returns: + str: The name of the file containing the smallest float number, or None if no '.txt' files are found. 
+ """ + smallest_number = float("inf") + smallest_file = None + + # Regex to match float numbers in the file content + float_pattern = re.compile(r"[-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?") + + # Iterate through files in the directory + for filename in os.listdir(directory): + if filename.endswith(".txt"): + file_path = os.path.join(directory, filename) + try: + with open(file_path, "r") as file: + content = file.read() + + # Find all float numbers in the file content + numbers = [float(match) for match in float_pattern.findall(content)] + + # Check for the smallest number + if numbers: + min_number = min(numbers) + if min_number < smallest_number: + smallest_number = min_number + smallest_file = filename + + except (ValueError, IOError) as e: + print(f"Error processing file {filename}: {e}") + + return smallest_file + + +# Example usage +directory_path = f"{PROJECT_DIR}/dsp_interview_transcripts/pipeline/bit_france/outputs" +result = find_file_with_smallest_number(directory_path) +if result: + print(f"The file with the smallest number is: {result}") +else: + print("No '.txt' files with valid numbers were found in the directory.") diff --git a/dsp_interview_transcripts/pipeline/bit_france/analysis/prep_outputs.py b/dsp_interview_transcripts/pipeline/bit_france/analysis/prep_outputs.py new file mode 100644 index 0000000..622aeaf --- /dev/null +++ b/dsp_interview_transcripts/pipeline/bit_france/analysis/prep_outputs.py @@ -0,0 +1,153 @@ +import ast +import re + +import cluster_analysis_utils +import numpy as np +import pandas as pd +import plotly.express as px + +from nltk.corpus import stopwords +from nltk.stem import WordNetLemmatizer +from sklearn.cluster import KMeans +from sklearn.feature_extraction.text import CountVectorizer +from sklearn.feature_extraction.text import TfidfVectorizer +from umap import UMAP + +from dsp_interview_transcripts import PROJECT_DIR +from dsp_interview_transcripts.utils.repr_docs import * + + +lemmatizer = WordNetLemmatizer() + +french_stopwords = stopwords.words("french") + ["aussi", "moi"] + + +def preproc(text: str) -> str: + text = re.sub(r"[\.,]+", "", text).lower() + text = cluster_analysis_utils.simple_tokenizer(text) + text = [t for t in text if t not in french_stopwords] + return " ".join(text) + + +def merge_2d_embeddings(speaker_data_topics): + speaker_data_topics["embeddings"] = speaker_data_topics["embeddings"].apply(ast.literal_eval) + embeddings_array = np.vstack(speaker_data_topics["embeddings"].values) + + umap_2d = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric="cosine", random_state=42) + embeddings_2d = umap_2d.fit_transform(embeddings_array) + + embeddings_df = pd.DataFrame(embeddings_2d, columns=["x", "y"]) + return pd.concat([speaker_data_topics, embeddings_df], axis=1) + + +def save_repr_docs(speaker_data_topics, outpath): + _, clustered_data = get_min_radius( + speaker_data_topics, k_neighbours=10, topic_col="topic", embedding_col="embeddings", metric="cosine" + ) + + clustered_data["quartile"] = clustered_data.groupby("topic")["radius_10"].transform( + lambda x: pd.qcut(x, q=4, labels=["1st", "2nd", "3rd", "greater than 3rd"]) + ) + repr_docs = extract_repr_docs(clustered_data, n=10, random_seed=42, user_id_col="file_name") + + repr_docs = repr_docs[["Topic", "Name", "Representation", "file_name", "profession", "text"]] + + repr_docs.to_csv(outpath, index=False) + + return repr_docs + + +def get_data_w_keywords(speaker_data_topics): + clusterer = KMeans(n_clusters=35, random_state=10) + 
clusterer.fit(speaker_data_topics[["x", "y"]]) + soft_clusters = list(clusterer.labels_) + + texts = speaker_data_topics["text"].apply(preproc) + + cluster_texts = cluster_analysis_utils.cluster_texts(texts, soft_clusters) + + cluster_keywords = cluster_analysis_utils.cluster_keywords( + documents=list(cluster_texts.values()), + cluster_labels=list(cluster_texts.keys()), + n=2, + max_df=0.90, + min_df=0.01, + Vectorizer=TfidfVectorizer, + ) + + speaker_data_topics["soft_cluster"] = soft_clusters + speaker_data_topics["soft_cluster_"] = [str(x) for x in soft_clusters] + + centroids = ( + speaker_data_topics.groupby("soft_cluster") + .agg(x_c=("x", "mean"), y_c=("y", "mean")) + .reset_index() + .assign(keywords=lambda x: x.soft_cluster.apply(lambda y: ", ".join(cluster_keywords[y]))) + ) + + data_viz = pd.merge(speaker_data_topics, centroids, on="soft_cluster", how="left") + + data_viz.loc[data_viz["Topic"] == -1, "keywords"] = "" + + return data_viz, centroids + + +def get_topics_by_profession(speaker_data_topics): + + speaker_data_grouped = ( + speaker_data_topics.groupby(["profession", "Topic", "Name"]) + .agg( + # median_word_count=('word_count', 'median'), + unique_file_names=("file_name", "nunique"), + count=("file_name", "count"), + ) + .reset_index() + .rename(columns={"profession": "informant"}) + ) + + # Calculate the total count of records per profession + speaker_data_grouped["total_count"] = speaker_data_grouped.groupby("informant")["count"].transform("sum") + + # Calculate the proportion for each topic within each profession + speaker_data_grouped["proportion"] = speaker_data_grouped["count"] / speaker_data_grouped["total_count"] + + return speaker_data_grouped + + +if __name__ == "__main__": + speaker_data_topics = pd.read_csv( + f"{PROJECT_DIR}/dsp_interview_transcripts/pipeline/bit_france/outputs/speaker_data_topics_selection_leaf_min_length_9_min_cluster_50_red_probabilities.csv" + ) + topic_info = pd.read_csv( + f"{PROJECT_DIR}/dsp_interview_transcripts/pipeline/bit_france/outputs/bertopic_topic_info_selection_leaf_min_length_9_min_cluster_50_red_probabilities.csv" + ) + + speaker_data_topics = merge_2d_embeddings(speaker_data_topics) + + speaker_data_topics = pd.merge( + speaker_data_topics, + topic_info[["Topic", "Name", "Representation"]], + left_on="topic", + right_on="Topic", + how="left", + ) + + speaker_data_topics.to_csv( + f"{PROJECT_DIR}/dsp_interview_transcripts/pipeline/bit_france/outputs/speaker_data_topics.csv", index=False + ) + + repr_docs = save_repr_docs( + speaker_data_topics, + outpath=f"{PROJECT_DIR}/dsp_interview_transcripts/pipeline/bit_france/outputs/repr_docs.csv", + ) + + data_viz, centroids = get_data_w_keywords(speaker_data_topics) + + data_viz.to_csv(f"{PROJECT_DIR}/dsp_interview_transcripts/pipeline/bit_france/report/data_viz.csv", index=False) + centroids.to_csv(f"{PROJECT_DIR}/dsp_interview_transcripts/pipeline/bit_france/report/centroids.csv", index=False) + + speaker_data_grouped = get_topics_by_profession(speaker_data_topics) + + speaker_data_grouped.to_csv( + f"{PROJECT_DIR}/dsp_interview_transcripts/pipeline/bit_france/report/speaker_data_grouped.csv", index=False + ) diff --git a/dsp_interview_transcripts/pipeline/bit_france/analysis/prep_outputs_for_report.py b/dsp_interview_transcripts/pipeline/bit_france/analysis/prep_outputs_for_report.py new file mode 100644 index 0000000..b2b132b --- /dev/null +++ b/dsp_interview_transcripts/pipeline/bit_france/analysis/prep_outputs_for_report.py @@ -0,0 +1,63 @@ +""" +This script comes after 
prep_outputs.py :) +""" +import pandas as pd + +from dsp_interview_transcripts import PROJECT_DIR + + +DATA_PATH = f"{PROJECT_DIR}/dsp_interview_transcripts/pipeline/bit_france" + +if __name__ == "__main__": + + speaker_data_topics = pd.read_csv(f"{DATA_PATH}/outputs/speaker_data_topics.csv") + + names_descriptions = pd.read_csv(f"{DATA_PATH}/outputs/topic_names_and_descriptions.csv") + + repr_docs = pd.read_csv(f"{DATA_PATH}/outputs/repr_docs.csv") + + repr_docs_final = pd.merge( + repr_docs, names_descriptions[["Name", "llama3.2_name", "llama3.2_description"]], on="Name", how="left" + ) + repr_docs_final = repr_docs_final[ + ["Topic", "Name", "Representation", "llama3.2_name", "llama3.2_description", "profession", "file_name", "text"] + ].rename( + columns={ + "Representation": "Keywords", + "llama3.2_name": "AI-generated name", + "llama3.2_description": "AI-generated description", + } + ) + + repr_docs_final.to_csv(f"{DATA_PATH}/report/repr_docs_final.csv", index=False) + + repr_docs_brief = repr_docs_final[ + ["Topic", "Name", "Keywords", "AI-generated name", "AI-generated description"] + ].drop_duplicates() + repr_docs_brief.to_csv(f"{DATA_PATH}/report/repr_docs_brief.csv", index=False) + + speaker_data_names_descriptions = pd.merge(speaker_data_topics, repr_docs_brief, on="Name", how="left") + + raw_transcripts = pd.read_csv(f"{PROJECT_DIR}/data/bit_france/converted/combined_transcripts.csv") + + transcripts = pd.merge( + raw_transcripts, + speaker_data_names_descriptions[ + ["text", "file_name", "Name", "Keywords", "AI-generated name", "AI-generated description"] + ], + on=["file_name", "text"], + how="left", + ) + + transcripts[ + [ + "profession", + "file_name", + "speaker_id", + "text", + "Name", + "Keywords", + "AI-generated name", + "AI-generated description", + ] + ].to_csv(f"{DATA_PATH}/report/full_data.csv", index=False) diff --git a/dsp_interview_transcripts/pipeline/bit_france/analysis/topic_modelling.py b/dsp_interview_transcripts/pipeline/bit_france/analysis/topic_modelling.py new file mode 100644 index 0000000..bbe3ce5 --- /dev/null +++ b/dsp_interview_transcripts/pipeline/bit_france/analysis/topic_modelling.py @@ -0,0 +1,199 @@ +""" +Example usage: +``` +python dsp_interview_transcripts/pipeline/bit_france/analysis/topic_modelling.py -s eom -l 9 -c 50 +``` +""" + +import random + +import nltk +import numpy as np +import pandas as pd +import plac +import torch + +from bertopic import BERTopic +from bertopic.dimensionality import BaseDimensionalityReduction +from bertopic.representation import KeyBERTInspired +from bertopic.representation import MaximalMarginalRelevance +from hdbscan import HDBSCAN +from nltk.corpus import stopwords +from nltk.tokenize import sent_tokenize +from sentence_transformers import SentenceTransformer +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.preprocessing import normalize +from umap import UMAP + +from dsp_interview_transcripts import PROJECT_DIR +from dsp_interview_transcripts import S3_BUCKET +from dsp_interview_transcripts import config +from dsp_interview_transcripts import logger +from dsp_interview_transcripts.getters.data_getters import save_to_s3 +from dsp_interview_transcripts.getters.interim import get_cleaned_data +from dsp_interview_transcripts.utils.repr_docs import * + + +nltk.download("stopwords") +nltk.download("punkt") + +french_stopwords = stopwords.words("french") + +# Set random seeds +RANDOM_SEED = 42 +from numpy import random as npr + + +npr.seed(RANDOM_SEED) +random.seed(RANDOM_SEED) 
+torch.manual_seed(RANDOM_SEED) + +# SENTENCE_MODEL = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2') +model_name = "dangvantuan/sentence-camembert-large" +SENTENCE_MODEL = SentenceTransformer(model_name) + +MIN_CLUSTER_SIZE = 50 +MIN_LENGTH = 5 +SELECTION = "eom" + + +@plac.opt("selection", "Cluster selection method ('eom' or 'leaf')", type=str) +@plac.opt("min_length", "Minimum length of word count for filtering", type=int, abbrev="l") +@plac.opt("min_cluster_size", "Minimum cluster size for HDBSCAN", type=int, abbrev="c") +@plac.opt( + "reduction_strategy", + "Strategy for reducing outliers ('embeddings', 'probabilities', 'distributions', 'ctfidf')", + type=str, + abbrev="r", +) +def main(selection="eom", min_length=5, min_cluster_size=50, reduction_strategy="embeddings"): + + if reduction_strategy == "ctfidf": + reduction_strategy = "c-tf-idf" + + data = pd.read_csv(f"{PROJECT_DIR}/data/bit_france/converted/combined_transcripts_with_roles.csv") + logger.info(f"N records: {len(data)}") + # We have to deduplicate because there seems to be at least one duplicate interview + data = data.drop_duplicates(subset=["text"]) + logger.info(f"N records after deduplication: {len(data)}") + + logger.info(data["role"].value_counts()) + + # Isolate just the interviewees + speaker_data = data[data["role"] == "informant"] + + speaker_data["word_count"] = speaker_data["text"].apply(lambda x: len(x.split())) + + speaker_data_filtered = speaker_data[speaker_data["word_count"] > min_length] + + docs = speaker_data_filtered["text"].tolist() + logger.info("Embedding user messages...") + embeddings = SENTENCE_MODEL.encode(docs, show_progress_bar=True) + embeddings_path = ( + f"{PROJECT_DIR}/dsp_interview_transcripts/pipeline/bit_france/outputs/embeddings_min_length_{min_length}.npy" + ) + np.save(embeddings_path, embeddings) + + # empty_reduction_model = BaseDimensionalityReduction() + + umap_model = UMAP( + n_neighbors=15, + n_components=50, + min_dist=0.1, + metric="cosine", + random_state=RANDOM_SEED, + ) + + hdbscan_model = HDBSCAN( + min_samples=5, + min_cluster_size=min_cluster_size, + metric="euclidean", + cluster_selection_method=selection, + prediction_data=True, + ) + + vectorizer_model = TfidfVectorizer( + stop_words=french_stopwords, + min_df=1, + max_df=0.85, + ngram_range=(1, 3), + ) + + # KeyBERT + keybert_model = KeyBERTInspired() + + # MMR + mmr_model = MaximalMarginalRelevance(diversity=0.3) + + # All representation models + representation_model = { + "KeyBERT": keybert_model, + "MMR": mmr_model, + } + + topic_model = BERTopic( + # Pipeline models + embedding_model=model_name, + umap_model=umap_model, + hdbscan_model=hdbscan_model, + vectorizer_model=vectorizer_model, + representation_model=representation_model, + # Hyperparameters + top_n_words=10, + verbose=True, + calculate_probabilities=True, + ) + + topics, probs = topic_model.fit_transform(docs, embeddings) + + if reduction_strategy == "probabilities": + new_topics = topic_model.reduce_outliers(docs, topics, probabilities=probs, strategy=reduction_strategy) + else: + new_topics = topic_model.reduce_outliers(docs, topics, strategy=reduction_strategy) + + topic_model.update_topics( + docs, + topics=new_topics, + top_n_words=10, + n_gram_range=(1, 3), + vectorizer_model=vectorizer_model, + ctfidf_model=None, + representation_model=representation_model, + ) + + # Default BERTopic scatterplot + fig = topic_model.visualize_documents(docs, embeddings=embeddings) + output_path = 
f"{PROJECT_DIR}/dsp_interview_transcripts/pipeline/bit_france/outputs/bertopic_visualization_selection_{selection}_min_length_{min_length}_min_cluster_{min_cluster_size}_red_{reduction_strategy}.html" + fig.write_html(output_path) + + # Default BERTopic summary info + topic_info = topic_model.get_topic_info() + topic_info_path = f"{PROJECT_DIR}/dsp_interview_transcripts/pipeline/bit_france/outputs/bertopic_topic_info_selection_{selection}_min_length_{min_length}_min_cluster_{min_cluster_size}_red_{reduction_strategy}.csv" + topic_info.to_csv(topic_info_path, index=False) + + # What proportion is noise? + total_elements = len(new_topics) + count_noise = new_topics.count(-1) + proportion_noise = count_noise / total_elements + logger.info(f"{proportion_noise}") + noise_path = f"{PROJECT_DIR}/dsp_interview_transcripts/pipeline/bit_france/outputs/noise_prop_selection_{selection}_min_length_{min_length}_min_cluster_{min_cluster_size}_red_{reduction_strategy}.txt" + with open(noise_path, "w") as f: + f.write(str(proportion_noise)) + + speaker_data_filtered["embeddings"] = embeddings.tolist() + speaker_data_filtered["topic"] = new_topics + + data_out_path = f"{PROJECT_DIR}/dsp_interview_transcripts/pipeline/bit_france/outputs/speaker_data_topics_selection_{selection}_min_length_{min_length}_min_cluster_{min_cluster_size}_red_{reduction_strategy}.csv" + speaker_data_filtered.to_csv(data_out_path, index=False) + + # Check for topics that have a low median word count (although if min_len is high, there won't be any) + check_word_count = speaker_data_filtered.groupby("topic").agg( + median_word_count=("word_count", "median"), unique_file_names=("file_name", "nunique") + ) + logger.info(check_word_count) + word_count_path = f"{PROJECT_DIR}/dsp_interview_transcripts/pipeline/bit_france/outputs/topic_median_word_count_selection_{selection}_min_length_{min_length}_min_cluster_{min_cluster_size}_red_{reduction_strategy}.csv" + check_word_count.to_csv(word_count_path, index=False) + + +if __name__ == "__main__": + plac.call(main) diff --git a/dsp_interview_transcripts/pipeline/bit_france/report/bit_france_report.qmd b/dsp_interview_transcripts/pipeline/bit_france/report/bit_france_report.qmd new file mode 100644 index 0000000..f52bd7c --- /dev/null +++ b/dsp_interview_transcripts/pipeline/bit_france/report/bit_france_report.qmd @@ -0,0 +1,206 @@ +--- +title: "BIT France analysis" +# Changing the background colour for the title block +title-block-banner: "#0000FF" +# Changing the text colour for the title block +title-block-banner-color: "#FFFFFF" +# Do we want to add a table of contents: true/false +toc: true +# Where do we want the table of contents to be located: left/right/body +toc-location: left +# Do we want the sections numbered: true/false +number-sections: true +format: + html: + page-layout: full + code-fold: true +jupyter: python3 +--- + +# Introduction + +We have prepared a preliminary analysis of the interview transcripts. Below you will find a section explaining our methods, and then a section presenting some visual representations of the data. + +Any questions, please do not hesitate to contact [rosie.oxbury@nesta.org.uk](@mailto:rosie.oxbury@nesta.org.uk) or [karlis.kanders@nesta.org.uk](@mailto:karlis.kanders@nesta.org.uk). + +# Methods + +## Text preprocessing +* We avoided manipulating the text beyond converting `.docx` files to `.txt`. +* The topic modelling is currently based **only on sections of the interview spoken by the informant**, not the interviewer. 
+* For the topic modelling, each "turn" in conversation (i.e. each utterance) is treated as a single data point.
+* Currently, **turns shorter than 9 words are excluded from the topic modelling**. This is because we have found that such short utterances tend to "confuse" the model and result in topics that are harder to interpret.
+
+## Topic modelling
+We used the popular library [BERTopic](https://maartengr.github.io/BERTopic/api/bertopic.html#bertopic._bertopic.BERTopic.get_representative_docs) to extract topics from the data. This method works by:
+
+* Embedding the text data - this means translating the text into a numerical representation (vectors) that a computer can understand.
+* Applying a clustering algorithm to the vectors. This groups together similar vectors (and therefore similar text) into clusters.
+* Extracting the key words that distinguish each cluster from the others.
+
+## Scope of topic modelling
+For this first iteration, we created one topic model across all the data (rather than individual models for the 4 types of informants). In the next iteration, we can create a separate model for each group of informants if that is likely to be useful. Below, we have provided a plot that shows a breakdown of which topics were mentioned relatively more often in each group of informants.
+
+## Topic names and descriptions
+We then used an LLM (llama3.2) to give each cluster a name and description. This part is somewhat experimental. We have shown the resulting names and descriptions as "AI-generated name" and "AI-generated description" in the table below, but in the plots we use the keyword-based topic names that come from the topic model.
+
+# Results
+
+```{python}
+#| echo: false
+
+import pandas as pd
+import plotly.express as px
+import plotly.graph_objects as go
+
+NESTA_COLOURS = [
+    "#0000FF",
+    "#FDB633",
+    "#18A48C",
+    "#9A1BBE",
+    "#EB003B",
+    "#FF6E47",
+    "#646363",
+    "#0F294A",
+    "#97D9E3",
+    "#A59BEE",
+    "#F6A4B7",
+    "#D2C9C0",
+    # "#FFFFFF",
+    "#000000",
+]
+```
+
+## List of topics
+The table below shows the topics that we derived:
+
+```{python}
+#| echo: false
+#| tbl-cap: "Topic names and keywords, and AI-generated names and descriptions"
+
+df = pd.read_csv('repr_docs_brief.csv')
+df[['Name', 'Keywords', 'AI-generated name', 'AI-generated description']].style
+```
+
+## Mentions of topics by informant group
+
+The plot below shows the proportion of utterances mentioning each topic within each informant group - so within a given group of informants, e.g. Salariés, the proportions sum to 1 (and because there are ~20 topics, we would expect the average proportion to be ~0.05).
+
+For example, we can see that the group "Médecins du travail" mentions the topic "8_médecine_médecine travail_prévention_médecin travail" relatively frequently. Meanwhile, the group "Salariés" mentions the topic "5_café_midi_heures jour_10 minutes" more often than the other groups.
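+
+For readers who want to trace how these proportions are derived, the short sketch below mirrors `get_topics_by_profession()` in `analysis/prep_outputs.py` (column names follow that script; the block is illustrative only and is not executed when the report is rendered):
+
+```python
+# Count utterances per (informant group, topic), then normalise within each group
+grouped = (
+    speaker_data_topics.groupby(["profession", "Topic", "Name"])
+    .agg(count=("file_name", "count"))
+    .reset_index()
+    .rename(columns={"profession": "informant"})
+)
+grouped["total_count"] = grouped.groupby("informant")["count"].transform("sum")
+# within each informant group, these proportions sum to 1
+grouped["proportion"] = grouped["count"] / grouped["total_count"]
+```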
+
+```{python}
+#| echo: false
+#| label: fig-professions
+#| fig-cap: "Distributions of mentions of each topic within informant groups"
+#| fig-fullwidth: true
+speaker_data_grouped = pd.read_csv('speaker_data_grouped.csv')
+
+fig_proportion = px.bar(
+    speaker_data_grouped[speaker_data_grouped['Topic']!=-1],
+    x='informant',
+    y='proportion',
+    color='Name',
+    title='Mentions of each topic by informant group',
+    labels={'proportion': 'Proportion of Records', 'informant': 'Informant group'},
+    barmode='group',
+    color_discrete_sequence=NESTA_COLOURS,
+)
+
+fig_proportion.update_layout(
+    width=1200,  # Increase width
+    height=600,  # Adjust height
+    title={'x': 0.5},  # Center title
+    margin=dict(l=50, r=50, t=50, b=50),  # Adjust margins
+    legend_title_text="",  # Hide legend title
+    plot_bgcolor="white",  # Set background of the plot area to white
+    paper_bgcolor="white",  # Set background of the entire figure to white
+)
+
+fig_proportion.show()
+```
+
+## Interactive mapping of the data
+
+The next plot shows each turn/utterance as a single point. **The x and y axes are abstract and do not have any meaning in themselves**, though points that are closer together are more similar semantically, and points that are further apart are less similar. Each point on the plot is coloured by topic.
+
+The keywords superimposed on the graph are words that can be considered distinctive of the texts in that area of the plot.
+
+The plot is interactive and you can:
+
+* hover over a point to see the file it comes from, the topic, and the text of the utterance
+* click a topic in the legend to remove it from the plot (click it again to add it back)
+
+```{python}
+#| echo: false
+data_viz = pd.read_csv('data_viz.csv')
+centroids = pd.read_csv('centroids.csv')
+
+data_viz['text_wrapped'] = data_viz['text'].str.wrap(30)
+data_viz['text_wrapped'] = data_viz['text_wrapped'].apply(lambda x: x.replace('\n', '<br>
')) + +duplicates = data_viz.duplicated(subset=['x_c','y_c', + 'keywords'], keep='first') + +data_viz.loc[duplicates, 'keywords'] = "" +data_viz['keywords'] = data_viz['keywords'].fillna("") + +professions = data_viz['profession'].unique() + +cluster_names = data_viz['Name'].unique() + +sorted_list = sorted( + cluster_names, + key=lambda x: int(x.split('_')[0]) # Extract the numeric prefix and convert to int +) + +``` + +```{python} +#| echo: false +#| label: fig-scatter +#| fig-cap: "Interactive visualisation of interview transcripts. One point = one turn in conversation. The points are coloured by topic. Keywords indicate the theme of nearby points." +#| fig-fullwidth: true +extended_colours = (NESTA_COLOURS * ((len(sorted_list) // len(NESTA_COLOURS)) + 1))[:len(sorted_list)] + +fig = px.scatter( + data_viz, + x="x", + y="y", + text="keywords", + color="Name", + # hover_data=["file_name", "text_wrapped"], + hover_data={ + "Name": True, + "file_name": True, + "text_wrapped": True, + "keywords": False, + "x": False, + "y": False + }, + custom_data=["Name", "file_name", "text_wrapped"], + color_discrete_sequence=NESTA_COLOURS, + opacity=0.3, + category_orders={"Name": sorted_list} + ) + +fig.update_traces( + textposition="top center", + textfont=dict( + size=14, # Increase font size + color="black" # Text color + ), +) + +fig.update_layout( + width=1200, # Increase width + height=800, # Adjust height + xaxis=dict(showticklabels=False, title_text=""), # Hide x-axis ticks and title + yaxis=dict(showticklabels=False, title_text=""), # Hide y-axis ticks and title + legend_title_text="", # Hide legend title + plot_bgcolor="white", # Set background of the plot area to white + paper_bgcolor="white", # Set background of the entire figure to white + ) + +fig + +``` diff --git a/dsp_interview_transcripts/pipeline/name_clusters.py b/dsp_interview_transcripts/pipeline/name_clusters.py index 078da5c..ae89a91 100644 --- a/dsp_interview_transcripts/pipeline/name_clusters.py +++ b/dsp_interview_transcripts/pipeline/name_clusters.py @@ -26,26 +26,19 @@ class NameDescription(BaseModel): prompt = """ - I have performed text clustering on some interviews where users were asked about their knowledge of - and opinions on different home heating options. In the interview, users were asked about their knowledge - of the Boiler Upgrade Scheme, a scheme that provides a subsidy to homeowners wishing to install a heatpump - instead of getting a new gas boiler for their home. + I have performed text clustering on some interviews where professionals were asked about the occupational risks of sedentary behaviour at + work, and the challenges organisations face in reducing sedentary behaviour. \n One of the clusters contains the following user responses from the interviews: {docs} The cluster is described by the following keywords: {keywords} \n - Based on the information above, please provide a name and summary for the cluster as a JSON object with two fields: - - name: A short, informative name for the cluster - - description: A summary of views of users within the cluster. You can include sentiments they express, reasons for their views, their knowledge levels, and any other relevant information. 
+ Based on the information above, please provide a **French language** name and description for the cluster as a JSON object with two fields: + - name: A short, informative name for the cluster **in French** + - description: A short description of the cluster, based on the user responses and keywords provided **in French** \n Provide nothing except for this JSON dict. \n - Example: - {{ - "name": "Energy Efficiency", - "description": "This cluster contains user responses about energy efficiency when choosing home heating options. The users have varying degrees of knowledge about the efficiency of different systems. Some reasons for wanting to improve efficiency include environmental concerns and cost concerns." - }} """ parser = JsonOutputParser(pydantic_object=NameDescription) @@ -128,19 +121,21 @@ def name_topics( def main(production: bool = False): - MIN_LEN = config["min_length"] - if production: - OUT_PATH = config["prod_paths"]["interim_w_names_s3_path"].format(MIN_LEN=MIN_LEN) - else: - OUT_PATH = config["test_paths"]["interim_w_names_s3_path"].format(MIN_LEN=MIN_LEN) + # MIN_LEN = config["min_length"] + # if production: + # OUT_PATH = config["prod_paths"]["interim_w_names_s3_path"].format(MIN_LEN=MIN_LEN) + # else: + # OUT_PATH = config["test_paths"]["interim_w_names_s3_path"].format(MIN_LEN=MIN_LEN) + + # topic_info = get_rep_docs(production=production) - topic_info = get_rep_docs(production=production) + topic_info = pd.read_csv(f"{PROJECT_DIR}/dsp_interview_transcripts/pipeline/bit_france/outputs/repr_docs.csv") - topic_info = topic_info.groupby(["Topic", "Name", "Representation"])["text_clean"].apply(list).reset_index() + topic_info = topic_info.groupby(["Topic", "Name", "Representation"])["text"].apply(list).reset_index() topic_info["Topic"] = topic_info["Topic"].astype(str) results = name_topics( - topic_info, llm_chain, text_col="text_clean", top_words_col="Representation", topic_label_col="Topic" + topic_info, llm_chain, text_col="text", top_words_col="Representation", topic_label_col="Topic" ) # Some complicated conditionals to check that what's in `results` can be parsed @@ -156,7 +151,10 @@ def main(production: bool = False): ) logger.info("Saving output...") - save_to_s3(S3_BUCKET, topic_info, OUT_PATH) + # save_to_s3(S3_BUCKET, topic_info, OUT_PATH) + topic_info.to_csv( + f"{PROJECT_DIR}/dsp_interview_transcripts/pipeline/bit_france/outputs/topic_names_and_descriptions.csv" + ) logger.info("Done!") diff --git a/dsp_interview_transcripts/utils/repr_docs.py b/dsp_interview_transcripts/utils/repr_docs.py index eb83a4e..152249a 100644 --- a/dsp_interview_transcripts/utils/repr_docs.py +++ b/dsp_interview_transcripts/utils/repr_docs.py @@ -14,6 +14,7 @@ def get_min_radius( k_neighbours: int = 10, topic_col: str = "topic", embedding_col: str = "norm_embedding", + metric="cosine", ) -> Tuple[Dict[int, np.ndarray], pd.DataFrame]: """ Calculate the minimum radius that contains `k_neighbours` neighbors for each point in each cluster. @@ -44,7 +45,7 @@ def get_min_radius( # Calculate pairwise distances within the cluster # Because the embeddings are normalized, it shouldn't matter if we use euclidean or cosine? But euclidean is a bit easier to think about/write tests for? - distances = pairwise_distances(embeddings, metric="euclidean") + distances = pairwise_distances(embeddings, metric=metric) # Order the matrix so that the 0th column is the distance to itself, # 1 column is distance to closest neighbour, 2 column is the distance to the second closest neighbour, etc. 
@@ -60,7 +61,9 @@ def get_min_radius( return radius_distributions, clustered_data_copy -def extract_repr_docs(clustered_data: pd.DataFrame, n: int = 10, random_seed: int = 42) -> pd.DataFrame: +def extract_repr_docs( + clustered_data: pd.DataFrame, n: int = 10, random_seed: int = 42, user_id_col="conversation" +) -> pd.DataFrame: """ Extract a representative sample of N documents for each topic. @@ -80,12 +83,12 @@ def extract_repr_docs(clustered_data: pd.DataFrame, n: int = 10, random_seed: in # Get info on the number of conversations (number of users) per topic # - so that we know one user isn't dominating - distinct_conversations = first_quartile_data.groupby("topic")["conversation"].nunique().reset_index() + distinct_conversations = first_quartile_data.groupby("topic")[user_id_col].nunique().reset_index() distinct_conversations.columns = ["topic", "distinct_conversations_in_1st_quartile"] logger.info(f"Number of distinct conversations per topic: {distinct_conversations}") # Keep only one response per user (conversation) in each topic - unique_conversations = first_quartile_data.drop_duplicates(subset=["topic", "conversation"]) + unique_conversations = first_quartile_data.drop_duplicates(subset=["topic", user_id_col]) # Take a random sample of 10 sampled_data = ( unique_conversations.groupby("topic") diff --git a/pyproject.toml b/pyproject.toml index dca63d1..b165411 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,15 +23,19 @@ emoji = "^2.13.2" ftfy = "^6.2.3" streamlit = "^1.39.0" nltk = "^3.9.1" +numpy = "<2.0" transformers = "^4.45.2" torch = "^2.4.1" wordcloud = "^1.9.3" seaborn = "^0.13.2" openpyxl = "^3.1.5" langchain = "^0.3.7" -langchain-community = "^0.3.5" s3fs = "^2024.10.0" plac = "^1.4.3" +python-docx = "^1.1.2" +sentencepiece = "^0.2.0" +quarto = "^0.1.0" +langchain-community = "^0.3.13" [tool.poetry.group.test]
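For reviewers who want to reproduce the analysis end to end, here is a rough driver sketch that strings the scripts together in the order given in `bit_france/README.md`. It is not part of this patch: the topic-modelling flags are an assumption chosen to match the `selection_leaf_min_length_9_min_cluster_50_red_probabilities` file names hard-coded in `prep_outputs.py`, and `name_clusters.py` additionally requires the local llama3.2 model mentioned in the report.

```python
# Hypothetical driver for the bit_france pipeline -- not part of this patch.
# Script order follows bit_france/README.md; the topic_modelling.py flags are
# assumptions that reproduce the file names prep_outputs.py reads.
import subprocess

PIPELINE = "dsp_interview_transcripts/pipeline"
STEPS = [
    f"{PIPELINE}/bit_france/analysis/convert_txt.py",
    f"{PIPELINE}/bit_france/analysis/convert_txt_df.py",
    f"{PIPELINE}/bit_france/analysis/assign_roles.py",
    f"{PIPELINE}/bit_france/analysis/topic_modelling.py -s leaf -l 9 -c 50 -r probabilities",
    f"{PIPELINE}/bit_france/analysis/prep_outputs.py",
    f"{PIPELINE}/name_clusters.py",  # lives outside bit_france/ (README step 6)
    f"{PIPELINE}/bit_france/analysis/prep_outputs_for_report.py",
]

for step in STEPS:
    subprocess.run(["python", *step.split()], check=True)
```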