From c2c32bbeb1e0bed857f2d3ee627847a6dbc80712 Mon Sep 17 00:00:00 2001
From: henrywo1fe
Date: Wed, 14 Aug 2024 15:12:39 +0100
Subject: [PATCH 1/5] Use old code to run Rosie's pipeline

---
 .gitignore                            |  3 ++
 README.md                             |  9 ++--
 dsp_ai_eval/config/base.yaml          | 49 ++++++++++--------
 dsp_ai_eval/getters/scite.py          | 20 ++------
 .../ask_gpt_for_themes.py             |  4 +-
 .../embed_scite_abstracts.py          | 50 +++++++++++--------
 .../plot_abstract_clusters.py         |  4 +-
 dsp_ai_eval/utils/clustering_utils.py | 33 +++++++++---
 8 files changed, 99 insertions(+), 73 deletions(-)

diff --git a/.gitignore b/.gitignore
index 1dc6ea2..cf82948 100644
--- a/.gitignore
+++ b/.gitignore
@@ -127,3 +127,6 @@ target/
 
 # NPM
 node_modules/
+
+# data
+data/
diff --git a/README.md b/README.md
index 4b51ae2..cd8e084 100644
--- a/README.md
+++ b/README.md
@@ -8,6 +8,7 @@
 - Setup the conda environment
 - Configure `pre-commit`
 - Make sure you have a `.env` file with the following keys:
+
 ```
 OPENAI_API_KEY = 'YOUR-KEY-HERE'
 ```
@@ -20,21 +21,23 @@ OPENAI_API_KEY = 'YOUR-KEY-HERE'
 - `generate_themes_with_gpt/`: pipeline for obtaining repeated GPT answers to the research question.
 - `process_abstracts/`: pipeline for performing text clustering on research abstracts
 - `process_gpt_summaries/`: pipeline for performing text clustering on the summaries obtained with the `generate_themes_with_gpt/` pipeline
-
+
 ## How to update the pipeline to run with new data
 
 At the moment the workflow is not fully reproducible - that work is forthcoming! To update the pipeline to work with new research abstracts:
+
 - Download your own research abstracts and upload to the s3 bucket
 - Update relevant paths to your data in `dsp_ai_eval/config/base.yaml`
 - Update the getters in `dsp_ai_eval/getters/scite.py` (you may have more or fewer data files to be concatenated than in the previous iteration of this project)
 - Update the data deduplication steps in `dsp_ai_eval/pipeline/process_abstracts/embed_scite_abstracts.py`.
 
 If you wish to repeat the experiment prompting GPT repeatedly for answers to a research question:
+
-- Update the RQ in `dsp_ai_eval/config/base.yaml`
+- Update the 'research_question' in `dsp_ai_eval/config/base.yaml`
 - Update the filepaths under `gpt_themes_pipeline` in `dsp_ai_eval/config/base.yaml` as desired
-- You should now be able to run `dsp_ai_eval/pipeline/generate_themes_with_gpt/ask_gpt+for_themes.py`
+- You should now be able to run `dsp_ai_eval/pipeline/generate_themes_with_gpt/ask_gpt_for_themes.py`
 
 ## Contributor guidelines
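The `base.yaml` changes below, together with the getter changes later in this series, move the S3 prefix handling around: older code prepended `rq_prefix` to each config path at read time, while these patches bake the full prefix into the config paths themselves. A minimal sketch of the two conventions (values are illustrative only):

```python
# Sketch: how a base.yaml path becomes an S3 key under each convention.
config = {
    "rq_prefix": "early_health_auto_lit_analysis/RQ1",  # illustrative value
    "gpt_themes_pipeline": {
        # full prefix baked into the path, as in this patch
        "path_raw_data": "early_health_auto_lit_analysis/RQ1/inputs/data/gpt/gpt_themes_repeats.jsonl",
    },
}

# Old convention: prefix applied at read time by the getter
key_old = f'{config["rq_prefix"]}/inputs/data/gpt/gpt_themes_repeats.jsonl'

# New convention (this series): the config value already is the full key
key_new = config["gpt_themes_pipeline"]["path_raw_data"]

assert key_old == key_new
```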
diff --git a/dsp_ai_eval/config/base.yaml b/dsp_ai_eval/config/base.yaml
index a932dca..fffcec8 100644
--- a/dsp_ai_eval/config/base.yaml
+++ b/dsp_ai_eval/config/base.yaml
@@ -32,38 +32,47 @@ oa_reclustering_pipeline:
   path_vis_data: outputs/reclustering/data/visualization_data.parquet
 
 gpt_themes_pipeline:
-  path_raw_data: inputs/data/gpt/gpt_themes_repeats.jsonl
-  path_cleaned_data: inputs/data/gpt/gpt_themes_repeats_cleaned.csv
-  path_cleaned_data_w_embeddings: inputs/data/gpt/gpt_themes_repeats_cleaned_embeddings.csv
+  research_question: What health and disease information relating to cancer do the public have high interest in?
+  path_raw_data: early_health_auto_lit_analysis/RQ1/inputs/data/gpt/gpt_themes_repeats.jsonl
+  path_cleaned_data: early_health_auto_lit_analysis/RQ1/inputs/data/gpt/gpt_themes_repeats_cleaned.csv
+  path_cleaned_data_w_embeddings: early_health_auto_lit_analysis/RQ1/inputs/data/gpt/gpt_themes_repeats_cleaned_embeddings.csv
   hdsbscan_min_cluster_size: 20
   n_topics: 12
   tfidf_ngram_min: 1
   tfidf_ngram_max: 3
-  dir_topic_model: outputs/models/bertopic_gpt_themes_model
-  path_probs: outputs/data/bertopic_gpt_themes_model_probs.npy
-  path_topics: outputs/data/bertopic_gpt_themes_model_topics.pkl
-  path_repr_docs: outputs/data/bertopic_gpt_themes_representative_docs.pkl
-  path_summaries: outputs/data/gpt_theme_cluster_summaries.json
-  path_summaries_cleaned: outputs/data/gpt_theme_cluster_summaries_cleaned.csv
+  dir_topic_model: early_health_auto_lit_analysis/RQ1/outputs/models/bertopic_gpt_themes_model
+  path_probs: early_health_auto_lit_analysis/RQ1/outputs/data/bertopic_gpt_themes_model_probs.npy
+  path_topics: early_health_auto_lit_analysis/RQ1/outputs/data/bertopic_gpt_themes_model_topics.pkl
+  path_repr_docs: early_health_auto_lit_analysis/RQ1/outputs/data/bertopic_gpt_themes_representative_docs.pkl
+  path_summaries: early_health_auto_lit_analysis/RQ1/outputs/data/gpt_theme_cluster_summaries.json
+  path_summaries_cleaned: early_health_auto_lit_analysis/RQ1/outputs/data/gpt_theme_cluster_summaries_cleaned.csv
   cluster_colours: tableau20
 
 abstracts_pipeline:
-  path_scite_core_references: inputs/data/scite/How_does_technology_diffusion_impact_UK_growth_and_productivity_3F.csv
-  path_scite_search1: inputs/data/scite/technology diffusion impact on uk growth-2024-02-23.csv
-  path_scite_search2: inputs/data/scite/technology diffusion impact on uk productivity-2024-02-23.csv
+  path_scite_core_references: early_health_auto_lit_analysis/RQ3.2(a)/inputs/data/scite/reducing anxiety in patients with chronic diseases through test result communication-2024-05-24.csv
+  path_scite_search1: early_health_auto_lit_analysis/RQ3.2(a)/inputs/data/scite/communication of health test results for patients with chronic diseases-2024-05-24.csv
+  # path_scite_search2: early_health_auto_lit_analysis/RQ4.1(a)/inputs/data/scite/communicating risk information for cancer-2024-05-23.csv
   citation_threshold: 5 # remove papers with fewer than N citations
-  n_most_relevant_papers: 1000
-  path_cleaned_data_w_embeddings: inputs/data/embeddings/scite_embeddings.parquet
+  n_most_relevant_papers: 1500
+  path_cleaned_data_w_embeddings: early_health_auto_lit_analysis/RQ3.2(a)/inputs/data/embeddings/scite_embeddings.parquet
   hdsbscan_min_cluster_size: 30
   tfidf_ngram_min: 1
   tfidf_ngram_max: 3
-  dir_topic_model: outputs/models/bertopic_abstracts_model
-  path_probs: outputs/data/bertopic_abstracts_model_probs.npy
-  path_topics: outputs/data/bertopic_abstracts_model_topics.pkl
-  path_repr_docs: outputs/data/bertopic_abstracts_representative_docs.pkl
-  path_summaries: outputs/data/abstracts_cluster_summaries.json
-  path_summaries_cleaned: outputs/data/abstracts_cluster_summaries_cleaned.csv
+  dir_topic_model: early_health_auto_lit_analysis/RQ3.2(a)/outputs/models/bertopic_abstracts_model
+  path_probs: early_health_auto_lit_analysis/RQ3.2(a)/outputs/data/bertopic_abstracts_model_probs.npy
+  path_topics: early_health_auto_lit_analysis/RQ3.2(a)/outputs/data/bertopic_abstracts_model_topics.pkl
+  path_repr_docs: early_health_auto_lit_analysis/RQ3.2(a)/outputs/data/bertopic_abstracts_representative_docs.pkl
+  path_summaries: early_health_auto_lit_analysis/RQ3.2(a)/outputs/data/abstracts_cluster_summaries.json
+  path_summaries_cleaned: early_health_auto_lit_analysis/RQ3.2(a)/outputs/data/abstracts_cluster_summaries_cleaned.csv
   cluster_colours: tableau20
+  recluster_dir_topic_model: early_health_auto_lit_analysis/RQ4.1(b)/recluster_price/outputs/models/bertopic_abstracts_model
+  recluster_path_probs: early_health_auto_lit_analysis/RQ4.1(b)/recluster_price/outputs/data/bertopic_abstracts_model_probs.npy
+  recluster_path_topics: early_health_auto_lit_analysis/RQ4.1(b)/recluster_price/outputs/data/bertopic_abstracts_model_topics.pkl
+  recluster_path_repr_docs: early_health_auto_lit_analysis/RQ4.1(b)/recluster_price/outputs/data/bertopic_abstracts_representative_docs.pkl
+  recluster_path_summaries: early_health_auto_lit_analysis/RQ4.1(b)/recluster_price/outputs/data/abstracts_cluster_summaries.json
+  recluster_path_summaries_cleaned: early_health_auto_lit_analysis/RQ4.1(b)/recluster_price/outputs/data/abstracts_cluster_summaries_cleaned.csv
+  recluster_path_cleaned_data_w_embeddings: early_health_auto_lit_analysis/RQ4.1(b)/recluster_price/inputs/data/embeddings/scite_embeddings.parquet
+  topics_to_recluster: [1]
 
 summarization_pipeline:
   gpt_model: gpt-4-turbo
diff --git a/dsp_ai_eval/getters/scite.py b/dsp_ai_eval/getters/scite.py
index 1bc410b..bb59ebd 100644
--- a/dsp_ai_eval/getters/scite.py
+++ b/dsp_ai_eval/getters/scite.py
@@ -61,24 +61,12 @@ def get_abstracts(n_abstracts=N_MOST_RELEVANT_PAPERS):
         "wider",
         n_abstracts,
     )
-    scite_wider_abstracts2 = read_scite_abstracts(
-        f"{rq_prefix}/" + config["abstracts_pipeline"]["path_scite_search2"],
-        "wider",
-        n_abstracts,
-    )
-    scite_wider_abstracts3 = read_scite_abstracts(
-        f"{rq_prefix}/" + config["abstracts_pipeline"]["path_scite_search3"],
-        "wider",
-        n_abstracts,
-    )
+    # scite_wider_abstracts2 = read_scite_abstracts(
+    #     config["abstracts_pipeline"]["path_scite_search2"], "wider", n_abstracts
+    # )
 
     scite_abstracts = pd.concat(
-        [
-            scite_main_abstracts,
-            scite_wider_abstracts1,
-            scite_wider_abstracts2,
-            scite_wider_abstracts3,
-        ]
+        [scite_main_abstracts, scite_wider_abstracts1]  # , scite_wider_abstracts2
     )
 
     return scite_abstracts
diff --git a/dsp_ai_eval/pipeline/generate_themes_with_gpt/ask_gpt_for_themes.py b/dsp_ai_eval/pipeline/generate_themes_with_gpt/ask_gpt_for_themes.py
index 753da9c..1b376db 100644
--- a/dsp_ai_eval/pipeline/generate_themes_with_gpt/ask_gpt_for_themes.py
+++ b/dsp_ai_eval/pipeline/generate_themes_with_gpt/ask_gpt_for_themes.py
@@ -28,12 +28,12 @@
 GPT_MODEL = "gpt-3.5-turbo"
 TEMPS = [0, 0.25, 0.5, 1]
-RQ = config["gpt_themes_pipeline"]["RQ"]
+RQ = config["gpt_themes_pipeline"]["research_question"]
 SYSTEM_MESSAGE = "You are a helpful research assistant. Given a research question, you provide a summary of the key topics in academic research on that topic."
 N_SAMPLES = 50
 
 # output
-FILENAME = "inputs/data/gpt/gpt_themes_repeats.jsonl"
+FILENAME = "early_health_auto_lit_analysis/RQ1/inputs/data/gpt/gpt_themes_repeats.jsonl"
 OUT_FILE = PROJECT_DIR / FILENAME
 
 rq_prefix: str = config["rq_prefix"]
diff --git a/dsp_ai_eval/pipeline/process_abstracts/embed_scite_abstracts.py b/dsp_ai_eval/pipeline/process_abstracts/embed_scite_abstracts.py
index 01a5c4e..1bd8014 100644
--- a/dsp_ai_eval/pipeline/process_abstracts/embed_scite_abstracts.py
+++ b/dsp_ai_eval/pipeline/process_abstracts/embed_scite_abstracts.py
@@ -24,6 +24,32 @@ def first_non_nan(series: pd.Series) -> Union[pd.Series, np.nan]:
     return series.dropna().iloc[0] if not series.dropna().empty else np.nan
 
 
+def create_agg_dict(df):
+    """
+    Create an aggregation dictionary for a dataframe.
+
+    This function creates an aggregation dictionary for a dataframe,
+    where each existing column in the dataframe is associated with the
+    'first_non_nan' function. This dictionary can be used in aggregation
+    operations, such as 'groupby.agg()'.
+
+    Parameters:
+        df (pandas.DataFrame): The dataframe for which to create the aggregation dictionary.
+
+    Returns:
+        dict: The aggregation dictionary.
+    """
+
+    # Create a list of all existing columns in the dataframe
+    existing_columns = [col for col in df.columns]
+
+    # Create an aggregation dictionary where each column is associated with the 'first_non_nan' function
+    agg_dict = {col: first_non_nan for col in existing_columns}
+
+    # Return the aggregation dictionary
+    return agg_dict
+
+
 if __name__ == "__main__":
     scite_abstracts = get_abstracts(n_abstracts=N_MOST_RELEVANT_PAPERS)
     logging.info(f"Total number of abstracts: {len(scite_abstracts)}")
@@ -38,28 +64,8 @@
         f"Number of abstracts remaining after dropping duplicates: {len(scite_abstracts)}"
     )
 
-    agg_dict = {
-        "date": first_non_nan,
-        "title": first_non_nan,
-        "doi": first_non_nan,
-        "authors": first_non_nan,
-        "journal": first_non_nan,
-        "short_journal": first_non_nan,
-        "volume": first_non_nan,
-        "year": first_non_nan,
-        "publisher": first_non_nan,
-        "issue": first_non_nan,
-        "page": first_non_nan,
-        "abstract": first_non_nan,
-        "category": first_non_nan,
-        "pmid": first_non_nan,
-        "issns": first_non_nan,
-        "supporting_cites": first_non_nan,
-        "contrasting_cites": first_non_nan,
-        "mentioning_cites": first_non_nan,
-        "total_cites": first_non_nan,
-        "scite_report_link": first_non_nan,
-    }
+    # Create an aggregation dictionary for the dataframe
+    agg_dict = create_agg_dict(scite_abstracts)
 
     # Group by 'doi' and aggregate
     scite_abstracts = (
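For reference, the `create_agg_dict` helper added above feeds the `groupby('doi').agg(...)` deduplication step that follows it. A minimal, self-contained sketch of the same pattern (the sample frame and column names are illustrative, not taken from the real scite data):

```python
import numpy as np
import pandas as pd

def first_non_nan(series: pd.Series):
    # Keep the first non-null value in each group, or NaN if the group is all-null.
    return series.dropna().iloc[0] if not series.dropna().empty else np.nan

df = pd.DataFrame(
    {
        "doi": ["10.1/x", "10.1/x", "10.2/y"],
        "title": [np.nan, "Recovered title", "Other paper"],
        "total_cites": [12, np.nan, 3],
    }
)

# One aggregator per column, excluding the grouping key itself.
agg_dict = {col: first_non_nan for col in df.columns if col != "doi"}

deduplicated = df.groupby("doi").agg(agg_dict).reset_index()
# -> one row per DOI, with the first non-null value kept for every column
```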
diff --git a/dsp_ai_eval/pipeline/process_abstracts/plot_abstract_clusters.py b/dsp_ai_eval/pipeline/process_abstracts/plot_abstract_clusters.py
index b0d538a..0696233 100644
--- a/dsp_ai_eval/pipeline/process_abstracts/plot_abstract_clusters.py
+++ b/dsp_ai_eval/pipeline/process_abstracts/plot_abstract_clusters.py
@@ -132,11 +132,11 @@ def create_chart(
     # Save the chart
     if save:
         filename = f"scite_abstracts{filename_suffix}.html"
-        plot.save(PROJECT_DIR / f"outputs/figures/{filename}")
+        plot.save(PROJECT_DIR / f"outputs/RQ3.2(a)/figures/{filename}")
         viz_save.save(
             plot,
             f"scite_abstracts{filename_suffix}",
-            PROJECT_DIR / "outputs/figures",
+            PROJECT_DIR / "outputs/RQ3.2(a)/figures",
             save_png=True,
         )
diff --git a/dsp_ai_eval/utils/clustering_utils.py b/dsp_ai_eval/utils/clustering_utils.py
index 3510b50..6ef5378 100644
--- a/dsp_ai_eval/utils/clustering_utils.py
+++ b/dsp_ai_eval/utils/clustering_utils.py
@@ -17,12 +17,23 @@
 import re
 
 import pandas as pd
-from tqdm import tqdm
-from time import sleep
-from datetime import date
-from typing import List
+import re
+import spacy
+from sentence_transformers import SentenceTransformer
+from sklearn.feature_extraction.text import CountVectorizer
+import time
+from umap import UMAP
+
+from dsp_ai_eval import PROJECT_DIR, logging, config
+from dsp_ai_eval.utils import utils
+
+from langfuse.callback import CallbackHandler
+
+langfuse_handler = CallbackHandler()
+
+load_dotenv()
 
-from dsp_ai_eval import logger, config
+OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
 
 GPT_MODEL = config["summarization_pipeline"]["gpt_model"]
@@ -64,7 +75,13 @@ def create_new_topic_model(
     calculate_probabilities=False,
     embedding_model=config["embedding_model"],
 ):
-    logger.info("Initialising BERTopic model...")
+    # Check if 'en_core_web_sm' is installed, if not, download it
+    try:
+        spacy.load("en_core_web_sm")
+    except OSError:
+        print("Model 'en_core_web_sm' not found. Downloading...")
+        os.system("python -m spacy download en_core_web_sm")
+
     sentence_model = SentenceTransformer(embedding_model)
     umap_model = UMAP(
         n_neighbors=15,
@@ -250,11 +267,11 @@ def get_summaries(
     )
 
     # Generate the summary
-    output = (llm_chain | parser).invoke(
+    summary_result = llm_chain.invoke(
         {"texts": texts, "keywords": keywords},
         config={"callbacks": [langfuse_handler]},
     )
-    # output = parser.invoke(summary_result, config={"callbacks": [langfuse_handler]})
+    output = parser.invoke(summary_result, config={"callbacks": [langfuse_handler]})
 
     summaries[topic] = {
         "Name:": output.name,

From 7bceda21842685f9694a5530f4d7e322927bec41 Mon Sep 17 00:00:00 2001
From: henrywo1fe
Date: Wed, 14 Aug 2024 16:00:47 +0100
Subject: [PATCH 2/5] Added dotenv import

---
 dsp_ai_eval/utils/clustering_utils.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/dsp_ai_eval/utils/clustering_utils.py b/dsp_ai_eval/utils/clustering_utils.py
index 6ef5378..de334ee 100644
--- a/dsp_ai_eval/utils/clustering_utils.py
+++ b/dsp_ai_eval/utils/clustering_utils.py
@@ -23,6 +23,7 @@
 from sklearn.feature_extraction.text import CountVectorizer
 import time
 from umap import UMAP
+from dotenv import load_dotenv
 
 from dsp_ai_eval import PROJECT_DIR, logging, config
 from dsp_ai_eval.utils import utils

From 5e491720ac702eba03498d0a78b3874c7d744c83 Mon Sep 17 00:00:00 2001
From: henrywo1fe
Date: Tue, 20 Aug 2024 10:39:27 +0100
Subject: [PATCH 3/5] Added research question.

---
 dsp_ai_eval/config/base.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dsp_ai_eval/config/base.yaml b/dsp_ai_eval/config/base.yaml
index fffcec8..0aadc90 100644
--- a/dsp_ai_eval/config/base.yaml
+++ b/dsp_ai_eval/config/base.yaml
@@ -1,7 +1,7 @@
 embedding_model: all-miniLM-L6-v2
 seed: 42
-rq_prefix: /
-RQ:
+rq_prefix: andromeda
+RQ: How to identify misinformation?
 
 oa_abstracts_pipeline:
   path_raw_data: inputs/openalex/data/works_raw.parquet

From 572a55564fdbdd4feeb79305b17d06e3bbf92322 Mon Sep 17 00:00:00 2001
From: henrywo1fe
Date: Thu, 10 Oct 2024 10:27:46 +0100
Subject: [PATCH 4/5] Edits to make Rosie's pipeline work

---
 dsp_ai_eval/config/base.yaml               | 36 +++++++++----------
 dsp_ai_eval/getters/scite.py               | 16 ++++-----
 .../pipeline/process_abstracts/README.md   |  7 ++++
 .../clean_cluster_summaries.py             |  4 +--
 .../process_abstracts/cluster_abstracts.py |  4 +--
 .../cluster_summarization_pipeline.py      |  2 +-
 .../embed_scite_abstracts.py               |  4 +--
 .../plot_abstract_clusters.py              | 16 ++++---
 dsp_ai_eval/utils/clustering_utils.py      |  7 +++-
 9 files changed, 56 insertions(+), 40 deletions(-)

diff --git a/dsp_ai_eval/config/base.yaml b/dsp_ai_eval/config/base.yaml
index 0aadc90..42b5d1d 100644
--- a/dsp_ai_eval/config/base.yaml
+++ b/dsp_ai_eval/config/base.yaml
@@ -1,7 +1,7 @@
 embedding_model: all-miniLM-L6-v2
 seed: 42
-rq_prefix: andromeda
-RQ: How to identify misinformation?
+rq_prefix: "GSK_test"
+RQ: Theoretical basis for behavioural boosts and behavioural boosting
 
 oa_abstracts_pipeline:
   path_raw_data: inputs/openalex/data/works_raw.parquet
@@ -49,29 +49,29 @@ gpt_themes_pipeline:
   cluster_colours: tableau20
 
 abstracts_pipeline:
-  path_scite_core_references: early_health_auto_lit_analysis/RQ3.2(a)/inputs/data/scite/reducing anxiety in patients with chronic diseases through test result communication-2024-05-24.csv
-  path_scite_search1: early_health_auto_lit_analysis/RQ3.2(a)/inputs/data/scite/communication of health test results for patients with chronic diseases-2024-05-24.csv
+  path_scite_core_references: GSK_vaccine_uptake/rosies_version/1.1/inputs/data/scite/behavioural barriers to adult vaccination uptake in the uk-2024-08-13.csv
+  path_scite_search1: GSK_vaccine_uptake/rosies_version/1.1/inputs/data/scite/social motivators for adult vaccination in the uk-2024-08-13.csv
   # path_scite_search2: early_health_auto_lit_analysis/RQ4.1(a)/inputs/data/scite/communicating risk information for cancer-2024-05-23.csv
   citation_threshold: 5 # remove papers with fewer than N citations
   n_most_relevant_papers: 1500
-  path_cleaned_data_w_embeddings: early_health_auto_lit_analysis/RQ3.2(a)/inputs/data/embeddings/scite_embeddings.parquet
+  path_cleaned_data_w_embeddings: GSK_vaccine_uptake/rosies_version/1.1/inputs/data/embeddings/scite_embeddings.parquet
   hdsbscan_min_cluster_size: 30
   tfidf_ngram_min: 1
   tfidf_ngram_max: 3
-  dir_topic_model: early_health_auto_lit_analysis/RQ3.2(a)/outputs/models/bertopic_abstracts_model
-  path_probs: early_health_auto_lit_analysis/RQ3.2(a)/outputs/data/bertopic_abstracts_model_probs.npy
-  path_topics: early_health_auto_lit_analysis/RQ3.2(a)/outputs/data/bertopic_abstracts_model_topics.pkl
-  path_repr_docs: early_health_auto_lit_analysis/RQ3.2(a)/outputs/data/bertopic_abstracts_representative_docs.pkl
-  path_summaries: early_health_auto_lit_analysis/RQ3.2(a)/outputs/data/abstracts_cluster_summaries.json
-  path_summaries_cleaned: early_health_auto_lit_analysis/RQ3.2(a)/outputs/data/abstracts_cluster_summaries_cleaned.csv
+  dir_topic_model: GSK_vaccine_uptake/rosies_version/1.1/outputs/models/bertopic_abstracts_model
+  path_probs: GSK_vaccine_uptake/rosies_version/1.1/outputs/data/bertopic_abstracts_model_probs.npy
+  path_topics: GSK_vaccine_uptake/rosies_version/1.1/outputs/data/bertopic_abstracts_model_topics.pkl
+  path_repr_docs: GSK_vaccine_uptake/rosies_version/1.1/outputs/data/bertopic_abstracts_representative_docs.pkl
+  path_summaries: GSK_vaccine_uptake/rosies_version/1.1/outputs/data/abstracts_cluster_summaries.json
+  path_summaries_cleaned: GSK_vaccine_uptake/rosies_version/1.1/outputs/data/abstracts_cluster_summaries_cleaned.csv
   cluster_colours: tableau20
-  recluster_dir_topic_model: early_health_auto_lit_analysis/RQ4.1(b)/recluster_price/outputs/models/bertopic_abstracts_model
-  recluster_path_probs: early_health_auto_lit_analysis/RQ4.1(b)/recluster_price/outputs/data/bertopic_abstracts_model_probs.npy
-  recluster_path_topics: early_health_auto_lit_analysis/RQ4.1(b)/recluster_price/outputs/data/bertopic_abstracts_model_topics.pkl
-  recluster_path_repr_docs: early_health_auto_lit_analysis/RQ4.1(b)/recluster_price/outputs/data/bertopic_abstracts_representative_docs.pkl
-  recluster_path_summaries: early_health_auto_lit_analysis/RQ4.1(b)/recluster_price/outputs/data/abstracts_cluster_summaries.json
-  recluster_path_summaries_cleaned: early_health_auto_lit_analysis/RQ4.1(b)/recluster_price/outputs/data/abstracts_cluster_summaries_cleaned.csv
-  recluster_path_cleaned_data_w_embeddings: early_health_auto_lit_analysis/RQ4.1(b)/recluster_price/inputs/data/embeddings/scite_embeddings.parquet
+  recluster_dir_topic_model: GSK_vaccine_uptake/rosies_version/1.1/recluster_price/outputs/models/bertopic_abstracts_model
+  recluster_path_probs: GSK_vaccine_uptake/rosies_version/1.1/recluster_price/outputs/data/bertopic_abstracts_model_probs.npy
+  recluster_path_topics: GSK_vaccine_uptake/rosies_version/1.1/recluster_price/outputs/data/bertopic_abstracts_model_topics.pkl
+  recluster_path_repr_docs: GSK_vaccine_uptake/rosies_version/1.1/recluster_price/outputs/data/bertopic_abstracts_representative_docs.pkl
+  recluster_path_summaries: GSK_vaccine_uptake/rosies_version/1.1/recluster_price/outputs/data/abstracts_cluster_summaries.json
+  recluster_path_summaries_cleaned: GSK_vaccine_uptake/rosies_version/1.1/recluster_price/outputs/data/abstracts_cluster_summaries_cleaned.csv
+  recluster_path_cleaned_data_w_embeddings: GSK_vaccine_uptake/rosies_version/1.1/recluster_price/inputs/data/embeddings/scite_embeddings.parquet
   topics_to_recluster: [1]
 
 summarization_pipeline:
diff --git a/dsp_ai_eval/getters/scite.py b/dsp_ai_eval/getters/scite.py
index bb59ebd..e563bc9 100644
--- a/dsp_ai_eval/getters/scite.py
+++ b/dsp_ai_eval/getters/scite.py
@@ -52,12 +52,12 @@ def read_scite_abstracts(
 
 def get_abstracts(n_abstracts=N_MOST_RELEVANT_PAPERS):
     scite_main_abstracts = read_scite_abstracts(
-        f"{rq_prefix}/" + config["abstracts_pipeline"]["path_scite_core_references"],
+        config["abstracts_pipeline"]["path_scite_core_references"],
         "main",
         n_abstracts,
     )
     scite_wider_abstracts1 = read_scite_abstracts(
-        f"{rq_prefix}/" + config["abstracts_pipeline"]["path_scite_search1"],
+        config["abstracts_pipeline"]["path_scite_search1"],
         "wider",
         n_abstracts,
     )
@@ -74,7 +74,7 @@ def get_abstracts(n_abstracts=N_MOST_RELEVANT_PAPERS):
 
 def get_scite_df_w_embeddings():
     filename = config["abstracts_pipeline"]["path_cleaned_data_w_embeddings"]
-    return load_s3_data(S3_BUCKET, f"{rq_prefix}/{filename}")
+    return load_s3_data(S3_BUCKET, f"{filename}")
 
 
 def get_topic_model():
@@ -92,24 +92,24 @@ def get_topic_model():
 
 def get_topics():
     filemame = config["abstracts_pipeline"]["path_topics"]
-    return load_s3_data(S3_BUCKET, f"{rq_prefix}/{filemame}")
+    return load_s3_data(S3_BUCKET, f"{filemame}")
 
 
 def get_probs():
     filename = config["abstracts_pipeline"]["path_probs"]
-    return load_s3_data(S3_BUCKET, f"{rq_prefix}/{filename}")
+    return load_s3_data(S3_BUCKET, f"{filename}")
 
 
 def get_representative_docs():
     filename = config["abstracts_pipeline"]["path_repr_docs"]
-    return load_s3_data(S3_BUCKET, f"{rq_prefix}/{filename}")
+    return load_s3_data(S3_BUCKET, f"{filename}")
 
 
 def get_cluster_summaries():
     filename = config["abstracts_pipeline"]["path_summaries"]
-    return load_s3_data(S3_BUCKET, f"{rq_prefix}/{filename}")
+    return load_s3_data(S3_BUCKET, f"{filename}")
 
 
 def get_cluster_summaries_clean():
     filename = config["abstracts_pipeline"]["path_summaries_cleaned"]
-    return load_s3_data(S3_BUCKET, f"{rq_prefix}/{filename}")
+    return load_s3_data(S3_BUCKET, f"{filename}")
diff --git a/dsp_ai_eval/pipeline/process_abstracts/README.md b/dsp_ai_eval/pipeline/process_abstracts/README.md
index 0291a76..870b7fd 100644
--- a/dsp_ai_eval/pipeline/process_abstracts/README.md
+++ b/dsp_ai_eval/pipeline/process_abstracts/README.md
@@ -22,3 +22,10 @@ Note that the final step, `plot_abstract_clusters.py`, saves plots **locally** r
 4. `clean_cluster_summaries.py`: do some minor cleaning on the GPT-generated cluster summaries. (This is in a separate script from `cluster_summarization_pipeline.py` just so that if we want to modify the cleaning steps, we don't have to regenerate the summaries, as doing so comes with a cost.)
 
 5. `plot_abstract_clusters.py`: visualize the clusters that we have created!
+
+CHECKLIST
+
+- Update file paths in base.yaml
+- Update output path for figs in plot_abstract_clusters.py and add corresponding output file in finder
+- Update number of args in get_abstracts in scite.py, and in embed_scite_abstracts (?)
+- Put data from scite in S3
diff --git a/dsp_ai_eval/pipeline/process_abstracts/clean_cluster_summaries.py b/dsp_ai_eval/pipeline/process_abstracts/clean_cluster_summaries.py
index cce8083..536cf4f 100644
--- a/dsp_ai_eval/pipeline/process_abstracts/clean_cluster_summaries.py
+++ b/dsp_ai_eval/pipeline/process_abstracts/clean_cluster_summaries.py
@@ -11,6 +11,4 @@
 
     cluster_summaries_cleaned = clean_cluster_summaries(cluster_summaries)
 
-    save_to_s3(
-        S3_BUCKET, cluster_summaries_cleaned, f"{rq_prefix}/{CLUSTER_SUMMARIES_OUTPATH}"
-    )
+    save_to_s3(S3_BUCKET, cluster_summaries_cleaned, f"{CLUSTER_SUMMARIES_OUTPATH}")
diff --git a/dsp_ai_eval/pipeline/process_abstracts/cluster_abstracts.py b/dsp_ai_eval/pipeline/process_abstracts/cluster_abstracts.py
index f07bf39..1da9edd 100644
--- a/dsp_ai_eval/pipeline/process_abstracts/cluster_abstracts.py
+++ b/dsp_ai_eval/pipeline/process_abstracts/cluster_abstracts.py
@@ -10,8 +10,8 @@
 
 rq_prefix = config["rq_prefix"]
 
-TOPICS_OUTPATH = f'{rq_prefix}/{config["abstracts_pipeline"]["path_topics"]}'
-PROBS_OUTPATH = f'{rq_prefix}/{config["abstracts_pipeline"]["path_probs"]}'
+TOPICS_OUTPATH = f'{config["abstracts_pipeline"]["path_topics"]}'
+PROBS_OUTPATH = f'{config["abstracts_pipeline"]["path_probs"]}'
 MODEL_OUTPATH = config["abstracts_pipeline"]["dir_topic_model"]
 REPRESENTATIVE_DOCS_OUTPATH = (
     f'{rq_prefix}/{config["abstracts_pipeline"]["path_repr_docs"]}'
diff --git a/dsp_ai_eval/pipeline/process_abstracts/cluster_summarization_pipeline.py b/dsp_ai_eval/pipeline/process_abstracts/cluster_summarization_pipeline.py
index 0397303..12bdca2 100644
--- a/dsp_ai_eval/pipeline/process_abstracts/cluster_summarization_pipeline.py
+++ b/dsp_ai_eval/pipeline/process_abstracts/cluster_summarization_pipeline.py
@@ -33,4 +33,4 @@
         text_col="title_abstract",
     )
 
-    save_to_s3(S3_BUCKET, summaries, f"{rq_prefix}/{SUMMARIES_OUTPATH}")
+    save_to_s3(S3_BUCKET, summaries, f"{SUMMARIES_OUTPATH}")
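The checklist item "Update number of args in get_abstracts in scite.py" above refers to adding or removing `read_scite_abstracts` calls when the number of scite export files changes. A sketch of re-enabling a second wider search, assuming the commented-out `path_scite_search2` key is restored in `base.yaml` (`read_scite_abstracts`, `config`, and `N_MOST_RELEVANT_PAPERS` are the module-level names shown in the diffs above):

```python
import pandas as pd

def get_abstracts(n_abstracts=N_MOST_RELEVANT_PAPERS):
    # Core references plus two wider keyword searches, concatenated.
    scite_main_abstracts = read_scite_abstracts(
        config["abstracts_pipeline"]["path_scite_core_references"], "main", n_abstracts
    )
    scite_wider_abstracts1 = read_scite_abstracts(
        config["abstracts_pipeline"]["path_scite_search1"], "wider", n_abstracts
    )
    scite_wider_abstracts2 = read_scite_abstracts(
        config["abstracts_pipeline"]["path_scite_search2"], "wider", n_abstracts
    )
    return pd.concat(
        [scite_main_abstracts, scite_wider_abstracts1, scite_wider_abstracts2]
    )
```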
diff --git a/dsp_ai_eval/pipeline/process_abstracts/embed_scite_abstracts.py b/dsp_ai_eval/pipeline/process_abstracts/embed_scite_abstracts.py
index 1bd8014..7ac7271 100644
--- a/dsp_ai_eval/pipeline/process_abstracts/embed_scite_abstracts.py
+++ b/dsp_ai_eval/pipeline/process_abstracts/embed_scite_abstracts.py
@@ -14,13 +14,13 @@
 
 OUT_PATH = config["abstracts_pipeline"]["path_cleaned_data_w_embeddings"]
 rq_prefix = config["rq_prefix"]
-S3_KEY = f"{rq_prefix}/{OUT_PATH}"
+S3_KEY = f"{OUT_PATH}"
 
 CITATION_THRESHOLD = config["abstracts_pipeline"]["citation_threshold"]
 N_MOST_RELEVANT_PAPERS = config["abstracts_pipeline"]["n_most_relevant_papers"]
 
 
-def first_non_nan(series: pd.Series) -> Union[pd.Series, np.nan]:
+def first_non_nan(series: pd.Series) -> Union[pd.Series, float]:
     return series.dropna().iloc[0] if not series.dropna().empty else np.nan
diff --git a/dsp_ai_eval/pipeline/process_abstracts/plot_abstract_clusters.py b/dsp_ai_eval/pipeline/process_abstracts/plot_abstract_clusters.py
index 0696233..b081c13 100644
--- a/dsp_ai_eval/pipeline/process_abstracts/plot_abstract_clusters.py
+++ b/dsp_ai_eval/pipeline/process_abstracts/plot_abstract_clusters.py
@@ -120,9 +120,15 @@ def create_chart(
             "y:Q", axis=alt.Axis(ticks=False, labels=False, title=None, grid=False)
         ),
         size=size_encode,
-        color=alt.Color("topic_name:N", legend=None).scale(
-            scheme=config["abstracts_pipeline"]["cluster_colours"]
-        ),
+        color=alt.Color(
+            "topic_name:N",
+            legend=alt.Legend(
+                title="Topic Names",
+                titleFontSize=12,
+                labelFontSize=12,
+                labelPadding=100,
+            ),
+        ).scale(scheme=config["abstracts_pipeline"]["cluster_colours"]),
         opacity=opacity_condition,
         tooltip=tooltip_fields,
     ).mark_circle()
@@ -132,11 +138,11 @@ def create_chart(
     # Save the chart
     if save:
         filename = f"scite_abstracts{filename_suffix}.html"
-        plot.save(PROJECT_DIR / f"outputs/RQ3.2(a)/figures/{filename}")
+        plot.save(PROJECT_DIR / f"outputs/GSK/1.1/figures/{filename}")
         viz_save.save(
             plot,
             f"scite_abstracts{filename_suffix}",
-            PROJECT_DIR / "outputs/RQ3.2(a)/figures",
+            PROJECT_DIR / "outputs/GSK/1.1/figures",
             save_png=True,
         )
diff --git a/dsp_ai_eval/utils/clustering_utils.py b/dsp_ai_eval/utils/clustering_utils.py
index de334ee..499ddba 100644
--- a/dsp_ai_eval/utils/clustering_utils.py
+++ b/dsp_ai_eval/utils/clustering_utils.py
@@ -24,12 +24,17 @@
 import time
 from umap import UMAP
 from dotenv import load_dotenv
-
+import os
+from typing import List
 from dsp_ai_eval import PROJECT_DIR, logging, config
 from dsp_ai_eval.utils import utils
+from tqdm import tqdm
+from datetime import date
+from time import sleep
 
 from langfuse.callback import CallbackHandler
 
+logger = logging.getLogger(__name__)
 langfuse_handler = CallbackHandler()
 
 load_dotenv()
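One note on the `first_non_nan` annotation change in the patch above: `np.nan` is a float value, not a type, so `Union[pd.Series, np.nan]` is rejected by the `typing` module, whereas `Union[pd.Series, float]` is valid (NaN itself is a float). A minimal sketch of the corrected signature:

```python
from typing import Union

import numpy as np
import pandas as pd

def first_non_nan(series: pd.Series) -> Union[pd.Series, float]:
    # `float` covers the missing-value case, because np.nan is a float;
    # putting np.nan itself inside Union[...] raises a TypeError when the
    # annotation is evaluated.
    return series.dropna().iloc[0] if not series.dropna().empty else np.nan
```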

From be54719465bf99dc70d864e0420a338adb77c167 Mon Sep 17 00:00:00 2001
From: henrywo1fe
Date: Thu, 10 Oct 2024 17:43:50 +0100
Subject: [PATCH 5/5] Bug fixes to make the pipeline run smoothly

---
 dsp_ai_eval/pipeline/openalex/__init__.py       |  4 +++-
 dsp_ai_eval/pipeline/openalex/clustering.py     | 10 +++++++++-
 .../pipeline/openalex/plot_abstract_clusters.py |  6 ++++--
 dsp_ai_eval/pipeline/openalex/works.py          | 10 +++++---
 4 files changed, 23 insertions(+), 7 deletions(-)

diff --git a/dsp_ai_eval/pipeline/openalex/__init__.py b/dsp_ai_eval/pipeline/openalex/__init__.py
index 7f04a4c..b08b5c0 100644
--- a/dsp_ai_eval/pipeline/openalex/__init__.py
+++ b/dsp_ai_eval/pipeline/openalex/__init__.py
@@ -15,7 +15,9 @@
 
 @app.command()
-def run_pipeline(config: Annotated[Optional[Path], typer.Option()] = None):
+def run_pipeline(config=config):
+    # config: Annotated[Optional[Path], typer.Option()] = None
+    config = eval(config)
     works.run_pipeline(config=config)
     clustering.run_pipeline(config=config)
diff --git a/dsp_ai_eval/pipeline/openalex/clustering.py b/dsp_ai_eval/pipeline/openalex/clustering.py
index 41eb5f6..eb478cb 100644
--- a/dsp_ai_eval/pipeline/openalex/clustering.py
+++ b/dsp_ai_eval/pipeline/openalex/clustering.py
@@ -11,6 +11,7 @@
 
 @app.command()
 def cluster_abstracts(config: Annotated[Optional[Path], typer.Option()] = None):
+    # config = eval(config)
     from dsp_ai_eval.pipeline.openalex import cluster_abstracts
 
     cluster_abstracts.run_pipeline(config=config)
@@ -18,6 +19,7 @@
 
 @app.command()
 def summarise_clusters(config: Annotated[Optional[Path], typer.Option()] = None):
+    # config = eval(config)
     from dsp_ai_eval.pipeline.openalex import cluster_summarization
     from dsp_ai_eval.pipeline.openalex import clean_cluster_summaries
 
@@ -27,6 +29,7 @@
 
 @app.command()
 def create_plots(config: Annotated[Optional[Path], typer.Option()] = None):
+    # config = eval(config)
     from dsp_ai_eval.pipeline.openalex import plot_abstract_clusters
 
     if config is None:
@@ -37,6 +40,7 @@
 
 @app.command()
 def recluster(config: Annotated[Optional[Path], typer.Option()] = None):
+    # config = eval(config)
     import questionary
     from dsp_ai_eval.getters.openalex import get_cluster_summaries_clean
     from dsp_ai_eval.pipeline.openalex import reclustering
@@ -68,7 +72,11 @@
 
 @app.command()
-def run_pipeline(config: Annotated[Optional[Path], typer.Option()] = None):
+def run_pipeline(
+    config=base_config,
+    # config: Annotated[Optional[Path], typer.Option()] = None
+):
+    # config = eval(config)
     cluster_abstracts(config=config)
     summarise_clusters(config=config)
     create_plots(config=config)
diff --git a/dsp_ai_eval/pipeline/openalex/plot_abstract_clusters.py b/dsp_ai_eval/pipeline/openalex/plot_abstract_clusters.py
index d879b96..53d72b8 100644
--- a/dsp_ai_eval/pipeline/openalex/plot_abstract_clusters.py
+++ b/dsp_ai_eval/pipeline/openalex/plot_abstract_clusters.py
@@ -153,7 +153,7 @@ def create_chart(
     # Save the chart
     if save:
         filename = f"openalex_abstracts{filename_suffix}.html"
-        plot.save(PROJECT_DIR / f"outputs/figures/{filename}")
+        plot.save(PROJECT_DIR / f"outputs/figures/{filename}", format="html")
         viz_save.save(
             plot,
             f"openalex_abstracts{filename_suffix}",
@@ -222,7 +222,9 @@
         df_vis,
         f"{config['rq_prefix']}/{config['oa_abstracts_pipeline']['path_vis_data']}",
     )
-
+    df_vis = df_vis[
+        ["topic", "topic_name", "total_cites", "doc", "x", "y"]
+    ]  # Select specific cols to avoid JSON error
     create_chart(
         df_vis, scale_by_citations=False, filename_suffix="", add_topic_legend=True
     )
diff --git a/dsp_ai_eval/pipeline/openalex/works.py b/dsp_ai_eval/pipeline/openalex/works.py
index 80a1634..d36b92e 100644
--- a/dsp_ai_eval/pipeline/openalex/works.py
+++ b/dsp_ai_eval/pipeline/openalex/works.py
@@ -9,7 +9,7 @@
 
 app = typer.Typer()
 
-user = "solomon.yu@nesta.org.uk"  # use a separate config file
+user = "henry.nurick@bi.team"  # use a separate config file
 RQ = config["RQ"]
 rq_prefix = config["rq_prefix"]
 OUTPATH = config["oa_abstracts_pipeline"]["path_raw_data"]
@@ -43,10 +43,12 @@ def get(
 
 @app.command()
 def process(
-    config: Annotated[Optional[Path], typer.Option()] = None,
+    config=config,
+    # config: Annotated[Optional[Path], typer.Option()] = None,
     openalex_rmin: int = 10,
     bm25_topk: int = 1000,
 ):
+    # config = eval(config)
     import pandas as pd
     from dsp_ai_eval.pipeline.openalex.utils import (
         filter_relevance_score,
@@ -83,8 +85,10 @@
     min_cites_count(filtered)
 
 
+@app.command()
 def run_pipeline(
-    config: Annotated[Optional[Path], typer.Option()] = None,
+    config=config,
+    # config: Annotated[Optional[Path], typer.Option()] = None,
 ):
     get(config=config)
     process(config=config)
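The `config = eval(config)` lines left commented through patch 5 hint at the underlying issue these commands were working around: typer passes option values in as strings. If the CLI option is re-enabled later, a safer pattern is to treat the option as a path and load YAML rather than `eval`-ing a string. A sketch under the assumption that PyYAML is available; this is not part of the patch series, and the fallback import mirrors the repo's existing `from dsp_ai_eval import config`:

```python
from pathlib import Path
from typing import Optional

import typer
import yaml

app = typer.Typer()

@app.command()
def run_pipeline(config_path: Optional[Path] = typer.Option(None)):
    # Load the run configuration from a YAML file instead of eval()-ing a string.
    if config_path is not None:
        with open(config_path) as f:
            config = yaml.safe_load(f)
    else:
        from dsp_ai_eval import config  # fall back to the package default
    # ... hand `config` to the downstream pipeline steps as before
```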