Exploration branch #20

Draft · wants to merge 5 commits into base: dev
3 changes: 3 additions & 0 deletions .gitignore
@@ -127,3 +127,6 @@ target/

# NPM
node_modules/

# data
data/
9 changes: 6 additions & 3 deletions README.md
@@ -8,6 +8,7 @@
- Setup the conda environment
- Configure `pre-commit`
- Make sure you have a `.env` file with the following keys:

```
OPENAI_API_KEY = 'YOUR-KEY-HERE'
```
@@ -20,21 +21,23 @@ OPENAI_API_KEY = 'YOUR-KEY-HERE'
- `generate_themes_with_gpt/`: pipeline for obtaining repeated GPT answers to the research question.
- `process_abstracts/`: pipeline for performing text clustering on research abstracts
- `process_gpt_summaries/`: pipeline for performing text clustering on the summaries obtained with the `generate_themes_with_gpt/` pipeline

## How to update the pipeline to run with new data

At the moment the workflow is not fully reproducible - that work is forthcoming!

To update the pipeline to work with new research abstracts:

- Download your own research abstracts and upload to the s3 bucket
- Update relevant paths to your data in `dsp_ai_eval/config/base.yaml`
- Update the getters in `dsp_ai_eval/getters/scite.py` (you may have more or fewer data files to be concatenated than in the previous iteration of this project)
- Update the data deduplication steps in `dsp_ai_eval/pipeline/process_abstracts/embed_scite_abstracts.py`.

If you wish to repeat the experiment prompting GPT repeatedly for answers to a research question:
- Update the RQ in `dsp_ai_eval/config/base.yaml`

- Update the 'research_question' in `dsp_ai_eval/config/base.yaml`
- Update the filepaths under `gpt_themes_pipeline` in `dsp_ai_eval/config/base.yaml` as desired
- You should now be able to run `dsp_ai_eval/pipeline/generate_themes_with_gpt/ask_gpt+for_themes.py`
- You should now be able to run `dsp_ai_eval/pipeline/generate_themes_with_gpt/ask_gpt_for_themes.py`
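
Reviewer note on the getter-update step above: a minimal sketch of what `get_abstracts` in `dsp_ai_eval/getters/scite.py` could look like once the number of scite CSV exports changes. The filenames and the exact reading logic here are assumptions for illustration, not the repo's real API:

```python
import pandas as pd

# Illustrative paths only - substitute the exports for your own research question.
SCITE_FILES = [
    ("inputs/data/scite/core_references.csv", "main"),
    ("inputs/data/scite/wider_search_1.csv", "wider"),
    ("inputs/data/scite/wider_search_2.csv", "wider"),
]

def get_abstracts(n_abstracts: int = 1000) -> pd.DataFrame:
    """Read and concatenate however many scite exports this iteration uses."""
    frames = [
        pd.read_csv(path).assign(category=category)
        for path, category in SCITE_FILES
    ]
    return pd.concat(frames).head(n_abstracts)
```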

## Contributor guidelines

53 changes: 31 additions & 22 deletions dsp_ai_eval/config/base.yaml
@@ -1,7 +1,7 @@
embedding_model: all-miniLM-L6-v2
seed: 42
rq_prefix: <project_name>/<research_question_number>
RQ: <Research question>
rq_prefix: "GSK_test"
RQ: Theoretical basis for behavioural boosts and behavioural boosting

oa_abstracts_pipeline:
path_raw_data: inputs/openalex/data/works_raw.parquet
@@ -32,38 +32,47 @@ oa_reclustering_pipeline:
path_vis_data: outputs/reclustering/data/visualization_data.parquet

gpt_themes_pipeline:
path_raw_data: inputs/data/gpt/gpt_themes_repeats.jsonl
path_cleaned_data: inputs/data/gpt/gpt_themes_repeats_cleaned.csv
path_cleaned_data_w_embeddings: inputs/data/gpt/gpt_themes_repeats_cleaned_embeddings.csv
research_question: What health and disease information relating to cancer do the public have high interest in?
path_raw_data: early_health_auto_lit_analysis/RQ1/inputs/data/gpt/gpt_themes_repeats.jsonl
path_cleaned_data: early_health_auto_lit_analysis/RQ1/inputs/data/gpt/gpt_themes_repeats_cleaned.csv
path_cleaned_data_w_embeddings: early_health_auto_lit_analysis/RQ1/inputs/data/gpt/gpt_themes_repeats_cleaned_embeddings.csv
hdsbscan_min_cluster_size: 20
n_topics: 12
tfidf_ngram_min: 1
tfidf_ngram_max: 3
dir_topic_model: outputs/models/bertopic_gpt_themes_model
path_probs: outputs/data/bertopic_gpt_themes_model_probs.npy
path_topics: outputs/data/bertopic_gpt_themes_model_topics.pkl
path_repr_docs: outputs/data/bertopic_gpt_themes_representative_docs.pkl
path_summaries: outputs/data/gpt_theme_cluster_summaries.json
path_summaries_cleaned: outputs/data/gpt_theme_cluster_summaries_cleaned.csv
dir_topic_model: early_health_auto_lit_analysis/RQ1/outputs/models/bertopic_gpt_themes_model
path_probs: early_health_auto_lit_analysis/RQ1/outputs/data/bertopic_gpt_themes_model_probs.npy
path_topics: early_health_auto_lit_analysis/RQ1/outputs/data/bertopic_gpt_themes_model_topics.pkl
path_repr_docs: early_health_auto_lit_analysis/RQ1/outputs/data/bertopic_gpt_themes_representative_docs.pkl
path_summaries: early_health_auto_lit_analysis/RQ1/outputs/data/gpt_theme_cluster_summaries.json
path_summaries_cleaned: early_health_auto_lit_analysis/RQ1/outputs/data/gpt_theme_cluster_summaries_cleaned.csv
cluster_colours: tableau20

abstracts_pipeline:
path_scite_core_references: inputs/data/scite/How_does_technology_diffusion_impact_UK_growth_and_productivity_3F.csv
path_scite_search1: inputs/data/scite/technology diffusion impact on uk growth-2024-02-23.csv
path_scite_search2: inputs/data/scite/technology diffusion impact on uk productivity-2024-02-23.csv
path_scite_core_references: GSK_vaccine_uptake/rosies_version/1.1/inputs/data/scite/behavioural barriers to adult vaccination uptake in the uk-2024-08-13.csv
path_scite_search1: GSK_vaccine_uptake/rosies_version/1.1/inputs/data/scite/social motivators for adult vaccination in the uk-2024-08-13.csv
# path_scite_search2: early_health_auto_lit_analysis/RQ4.1(a)/inputs/data/scite/communicating risk information for cancer-2024-05-23.csv
citation_threshold: 5 # remove papers with fewer than N citations
n_most_relevant_papers: 1000
path_cleaned_data_w_embeddings: inputs/data/embeddings/scite_embeddings.parquet
n_most_relevant_papers: 1500
path_cleaned_data_w_embeddings: GSK_vaccine_uptake/rosies_version/1.1/inputs/data/embeddings/scite_embeddings.parquet
hdsbscan_min_cluster_size: 30
tfidf_ngram_min: 1
tfidf_ngram_max: 3
dir_topic_model: outputs/models/bertopic_abstracts_model
path_probs: outputs/data/bertopic_abstracts_model_probs.npy
path_topics: outputs/data/bertopic_abstracts_model_topics.pkl
path_repr_docs: outputs/data/bertopic_abstracts_representative_docs.pkl
path_summaries: outputs/data/abstracts_cluster_summaries.json
path_summaries_cleaned: outputs/data/abstracts_cluster_summaries_cleaned.csv
dir_topic_model: GSK_vaccine_uptake/rosies_version/1.1/outputs/models/bertopic_abstracts_model
path_probs: GSK_vaccine_uptake/rosies_version/1.1/outputs/data/bertopic_abstracts_model_probs.npy
path_topics: GSK_vaccine_uptake/rosies_version/1.1/outputs/data/bertopic_abstracts_model_topics.pkl
path_repr_docs: GSK_vaccine_uptake/rosies_version/1.1/outputs/data/bertopic_abstracts_representative_docs.pkl
path_summaries: GSK_vaccine_uptake/rosies_version/1.1/outputs/data/abstracts_cluster_summaries.json
path_summaries_cleaned: GSK_vaccine_uptake/rosies_version/1.1/outputs/data/abstracts_cluster_summaries_cleaned.csv
cluster_colours: tableau20
recluster_dir_topic_model: GSK_vaccine_uptake/rosies_version/1.1/recluster_price/outputs/models/bertopic_abstracts_model
recluster_path_probs: GSK_vaccine_uptake/rosies_version/1.1/recluster_price/outputs/data/bertopic_abstracts_model_probs.npy
recluster_path_topics: GSK_vaccine_uptake/rosies_version/1.1/recluster_price/outputs/data/bertopic_abstracts_model_topics.pkl
recluster_path_repr_docs: GSK_vaccine_uptake/rosies_version/1.1/recluster_price/outputs/data/bertopic_abstracts_representative_docs.pkl
recluster_path_summaries: GSK_vaccine_uptake/rosies_version/1.1/recluster_price/outputs/data/abstracts_cluster_summaries.json
recluster_path_summaries_cleaned: GSK_vaccine_uptake/rosies_version/1.1/recluster_price/outputs/data/abstracts_cluster_summaries_cleaned.csv
recluster_path_cleaned_data_w_embeddings: GSK_vaccine_uptake/rosies_version/1.1/recluster_price/inputs/data/embeddings/scite_embeddings.parquet
topics_to_recluster: [1]

summarization_pipeline:
gpt_model: gpt-4-turbo
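
Reviewer note on the path changes in this file: the revision bakes the full S3 prefix (for example `GSK_vaccine_uptake/rosies_version/1.1/`) into each YAML value instead of composing it from `rq_prefix` at runtime. A minimal sketch of the effect, assuming the config is loaded with PyYAML:

```python
import yaml

with open("dsp_ai_eval/config/base.yaml") as f:
    config = yaml.safe_load(f)

# Each value is now a complete S3 key, so callers no longer prepend rq_prefix:
key = config["abstracts_pipeline"]["path_cleaned_data_w_embeddings"]
# -> "GSK_vaccine_uptake/rosies_version/1.1/inputs/data/embeddings/scite_embeddings.parquet"
```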
36 changes: 12 additions & 24 deletions dsp_ai_eval/getters/scite.py
@@ -52,41 +52,29 @@ def read_scite_abstracts(

def get_abstracts(n_abstracts=N_MOST_RELEVANT_PAPERS):
scite_main_abstracts = read_scite_abstracts(
f"{rq_prefix}/" + config["abstracts_pipeline"]["path_scite_core_references"],
config["abstracts_pipeline"]["path_scite_core_references"],
"main",
n_abstracts,
)
scite_wider_abstracts1 = read_scite_abstracts(
f"{rq_prefix}/" + config["abstracts_pipeline"]["path_scite_search1"],
"wider",
n_abstracts,
)
scite_wider_abstracts2 = read_scite_abstracts(
f"{rq_prefix}/" + config["abstracts_pipeline"]["path_scite_search2"],
"wider",
n_abstracts,
)
scite_wider_abstracts3 = read_scite_abstracts(
f"{rq_prefix}/" + config["abstracts_pipeline"]["path_scite_search3"],
config["abstracts_pipeline"]["path_scite_search1"],
"wider",
n_abstracts,
)
# scite_wider_abstracts2 = read_scite_abstracts(
# config["abstracts_pipeline"]["path_scite_search2"], "wider", n_abstracts
# )

scite_abstracts = pd.concat(
[
scite_main_abstracts,
scite_wider_abstracts1,
scite_wider_abstracts2,
scite_wider_abstracts3,
]
[scite_main_abstracts, scite_wider_abstracts1] # , scite_wider_abstracts2
)

return scite_abstracts


def get_scite_df_w_embeddings():
filename = config["abstracts_pipeline"]["path_cleaned_data_w_embeddings"]
return load_s3_data(S3_BUCKET, f"{rq_prefix}/{filename}")
return load_s3_data(S3_BUCKET, f"{filename}")


def get_topic_model():
@@ -104,24 +92,24 @@ def get_topic_model():

def get_topics():
filemame = config["abstracts_pipeline"]["path_topics"]
return load_s3_data(S3_BUCKET, f"{rq_prefix}/{filemame}")
return load_s3_data(S3_BUCKET, f"{filemame}")


def get_probs():
filename = config["abstracts_pipeline"]["path_probs"]
return load_s3_data(S3_BUCKET, f"{rq_prefix}/{filename}")
return load_s3_data(S3_BUCKET, f"{filename}")


def get_representative_docs():
filename = config["abstracts_pipeline"]["path_repr_docs"]
return load_s3_data(S3_BUCKET, f"{rq_prefix}/{filename}")
return load_s3_data(S3_BUCKET, f"{filename}")


def get_cluster_summaries():
filename = config["abstracts_pipeline"]["path_summaries"]
return load_s3_data(S3_BUCKET, f"{rq_prefix}/{filename}")
return load_s3_data(S3_BUCKET, f"{filename}")


def get_cluster_summaries_clean():
filename = config["abstracts_pipeline"]["path_summaries_cleaned"]
return load_s3_data(S3_BUCKET, f"{rq_prefix}/{filename}")
return load_s3_data(S3_BUCKET, f"{filename}")
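
For context on the getters above: a sketch of what an S3 loader like `load_s3_data` might do with these keys, assuming boto3; the repo's actual helper may parse more formats and differ in signature:

```python
import io

import boto3
import pandas as pd

def load_s3_data(bucket: str, key: str):
    """Fetch an object from S3 and parse it by file extension (sketch only)."""
    body = boto3.client("s3").get_object(Bucket=bucket, Key=key)["Body"].read()
    if key.endswith(".parquet"):
        return pd.read_parquet(io.BytesIO(body))
    if key.endswith(".csv"):
        return pd.read_csv(io.BytesIO(body))
    raise ValueError(f"unhandled extension: {key}")
```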
@@ -28,12 +28,12 @@

GPT_MODEL = "gpt-3.5-turbo"
TEMPS = [0, 0.25, 0.5, 1]
RQ = config["gpt_themes_pipeline"]["RQ"]
RQ = config["gpt_themes_pipeline"]["research_question"]
SYSTEM_MESSAGE = "You are a helpful research assistant. Given a research question, you provide a summary of the key topics in academic research on that topic."
N_SAMPLES = 50

# output
FILENAME = "inputs/data/gpt/gpt_themes_repeats.jsonl"
FILENAME = "early_health_auto_lit_analysis/RQ1/inputs/data/gpt/gpt_themes_repeats.jsonl"
OUT_FILE = PROJECT_DIR / FILENAME
rq_prefix: str = config["rq_prefix"]

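
The constants in this file drive a repeated-prompting experiment: the same research question is sent `N_SAMPLES` times at each temperature, and the answers are written to a JSONL file. A hedged sketch of that loop, assuming the `openai>=1.0` client; the script's real batching and output handling may differ:

```python
import json

from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment
RQ = "What health and disease information relating to cancer do the public have high interest in?"
SYSTEM_MESSAGE = "You are a helpful research assistant. Given a research question, you provide a summary of the key topics in academic research on that topic."
TEMPS = [0, 0.25, 0.5, 1]
N_SAMPLES = 50

with open("gpt_themes_repeats.jsonl", "w") as f:
    for temp in TEMPS:
        for _ in range(N_SAMPLES):
            response = client.chat.completions.create(
                model="gpt-3.5-turbo",
                temperature=temp,
                messages=[
                    {"role": "system", "content": SYSTEM_MESSAGE},
                    {"role": "user", "content": RQ},
                ],
            )
            f.write(json.dumps({
                "temperature": temp,
                "answer": response.choices[0].message.content,
            }) + "\n")
```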
4 changes: 3 additions & 1 deletion dsp_ai_eval/pipeline/openalex/__init__.py
@@ -15,7 +15,9 @@


@app.command()
def run_pipeline(config: Annotated[Optional[Path], typer.Option()] = None):
def run_pipeline(config=config):
# config: Annotated[Optional[Path], typer.Option()] = None
config = eval(config)

works.run_pipeline(config=config)
clustering.run_pipeline(config=config)
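
Reviewer-side sketch of an alternative to `eval(config)` that keeps the Typer CLI option while loading the config from a file path; the default path and YAML loading are assumptions, since this module's default config object isn't shown in the diff:

```python
from pathlib import Path
from typing import Annotated, Optional

import typer
import yaml

app = typer.Typer()

@app.command()
def run_pipeline(config_path: Annotated[Optional[Path], typer.Option()] = None):
    # Fall back to the packaged defaults (path assumed) when no option is given.
    path = config_path or Path("dsp_ai_eval/config/base.yaml")
    config = yaml.safe_load(path.read_text())
    typer.echo(f"loaded {len(config)} top-level config keys")
```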
10 changes: 9 additions & 1 deletion dsp_ai_eval/pipeline/openalex/clustering.py
@@ -11,13 +11,15 @@

@app.command()
def cluster_abstracts(config: Annotated[Optional[Path], typer.Option()] = None):
# config = eval(config)
from dsp_ai_eval.pipeline.openalex import cluster_abstracts

cluster_abstracts.run_pipeline(config=config)


@app.command()
def summarise_clusters(config: Annotated[Optional[Path], typer.Option()] = None):
# config = eval(config)
from dsp_ai_eval.pipeline.openalex import cluster_summarization
from dsp_ai_eval.pipeline.openalex import clean_cluster_summaries

@@ -27,6 +29,7 @@ def summarise_clusters(config: Annotated[Optional[Path], typer.Option()] = None)

@app.command()
def create_plots(config: Annotated[Optional[Path], typer.Option()] = None):
# config = eval(config)
from dsp_ai_eval.pipeline.openalex import plot_abstract_clusters

if config is None:
@@ -37,6 +40,7 @@ def create_plots(config: Annotated[Optional[Path], typer.Option()] = None):

@app.command()
def recluster(config: Annotated[Optional[Path], typer.Option()] = None):
# config = eval(config)
import questionary
from dsp_ai_eval.getters.openalex import get_cluster_summaries_clean
from dsp_ai_eval.pipeline.openalex import reclustering
@@ -68,7 +72,11 @@


@app.command()
def run_pipeline(config: Annotated[Optional[Path], typer.Option()] = None):
def run_pipeline(
config=base_config,
# config: Annotated[Optional[Path], typer.Option()] = None
):
# config = eval(config)
cluster_abstracts(config=config)
summarise_clusters(config=config)
create_plots(config=config)
6 changes: 4 additions & 2 deletions dsp_ai_eval/pipeline/openalex/plot_abstract_clusters.py
@@ -153,7 +153,7 @@ def create_chart(
# Save the chart
if save:
filename = f"openalex_abstracts{filename_suffix}.html"
plot.save(PROJECT_DIR / f"outputs/figures/{filename}")
plot.save(PROJECT_DIR / f"outputs/figures/{filename}", format="html")
viz_save.save(
plot,
f"openalex_abstracts{filename_suffix}",
@@ -222,7 +222,9 @@ def run_pipeline(
df_vis,
f"{config['rq_prefix']}/{config['oa_abstracts_pipeline']['path_vis_data']}",
)

df_vis = df_vis[
["topic", "topic_name", "total_cites", "doc", "x", "y"]
] # Select specific cols to avoid JSON error
create_chart(
df_vis, scale_by_citations=False, filename_suffix="", add_topic_legend=True
)
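
The column selection added above likely works around non-scalar columns (for example embedding arrays) that cannot be serialized when Altair writes chart data to JSON. A toy reproduction under that assumption:

```python
import altair as alt
import numpy as np
import pandas as pd

df_vis = pd.DataFrame({
    "topic": [0, 1],
    "topic_name": ["boosts", "barriers"],
    "total_cites": [3, 5],
    "doc": ["abstract one", "abstract two"],
    "x": [0.1, 0.9],
    "y": [0.2, 0.8],
    "embedding": [np.zeros(3), np.ones(3)],  # ndarray column breaks JSON export
})

cols = ["topic", "topic_name", "total_cites", "doc", "x", "y"]
plot = alt.Chart(df_vis[cols]).mark_circle().encode(
    x="x", y="y", tooltip=["topic_name", "doc"]
)
plot.save("openalex_abstracts.html", format="html")
```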
10 changes: 7 additions & 3 deletions dsp_ai_eval/pipeline/openalex/works.py
@@ -9,7 +9,7 @@

app = typer.Typer()

user = "[email protected]" # use a separate config file
user = "[email protected]" # use a separate config file
RQ = config["RQ"]
rq_prefix = config["rq_prefix"]
OUTPATH = config["oa_abstracts_pipeline"]["path_raw_data"]
@@ -43,10 +43,12 @@ def get(

@app.command()
def process(
config: Annotated[Optional[Path], typer.Option()] = None,
config=config,
# config: Annotated[Optional[Path], typer.Option()] = None,
openalex_rmin: int = 10,
bm25_topk: int = 1000,
):
# config = eval(config)
import pandas as pd
from dsp_ai_eval.pipeline.openalex.utils import (
filter_relevance_score,
@@ -83,8 +85,10 @@ def process(
min_cites_count(filtered)


@app.command()
def run_pipeline(
config: Annotated[Optional[Path], typer.Option()] = None,
config=config,
# config: Annotated[Optional[Path], typer.Option()] = None,
):
get(config=config)
process(config=config)
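
On the `openalex_rmin` and `bm25_topk` parameters above: a sketch of BM25-style relevance filtering, assuming the `rank_bm25` package; the repo's `filter_relevance_score` helper may be implemented quite differently:

```python
from rank_bm25 import BM25Okapi

docs = [
    "behavioural barriers to adult vaccination uptake",
    "crop rotation and soil yields",
    "social motivators for adult vaccination",
]
query = "behavioural boosts and behavioural boosting".split()

bm25 = BM25Okapi([doc.split() for doc in docs])
scores = bm25.get_scores(query)

bm25_topk = 2  # keep only the top-k documents by BM25 score
top_idx = sorted(range(len(docs)), key=scores.__getitem__, reverse=True)[:bm25_topk]
top_docs = [docs[i] for i in top_idx]
```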
7 changes: 7 additions & 0 deletions dsp_ai_eval/pipeline/process_abstracts/README.md
@@ -22,3 +22,10 @@ Note that the final step, `plot_abstract_clusters.py`, saves plots **locally** r
4. `clean_cluster_summaries.py`: do some minor cleaning on the GPT-generated cluster summaries. (This is in a separate script from `cluster_summarization_pipeline.py` just so that if we want to modify the cleaning steps, we don't have to regenerate the summaries, as doing so comes with a cost.)

5. `plot_abstract_clusters.py`: visualize the clusters that we have created!

CHECKLIST

- Update file paths in base.yaml
- Update output path for figs in plot_abstract_clusters.py and add corresponding output file in finder
- Update number of args in get_abstracts in scite.py, and in embed_scite_abstracts (?)
- Put data from scite in S3
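
A small pre-flight sketch for the first and last checklist items, assuming boto3 and that the scite inputs sit under `abstracts_pipeline` keys beginning with `path_scite`; the bucket name is a placeholder:

```python
import boto3
import yaml

S3_BUCKET = "your-project-bucket"  # assumption: substitute the real bucket

with open("dsp_ai_eval/config/base.yaml") as f:
    config = yaml.safe_load(f)

s3 = boto3.client("s3")
for name, key in config["abstracts_pipeline"].items():
    if name.startswith("path_scite"):
        try:
            s3.head_object(Bucket=S3_BUCKET, Key=key)
            print(f"ok: {key}")
        except s3.exceptions.ClientError:
            print(f"missing from S3: {key}")
```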
@@ -11,6 +11,4 @@

cluster_summaries_cleaned = clean_cluster_summaries(cluster_summaries)

save_to_s3(
S3_BUCKET, cluster_summaries_cleaned, f"{rq_prefix}/{CLUSTER_SUMMARIES_OUTPATH}"
)
save_to_s3(S3_BUCKET, cluster_summaries_cleaned, f"{CLUSTER_SUMMARIES_OUTPATH}")
4 changes: 2 additions & 2 deletions dsp_ai_eval/pipeline/process_abstracts/cluster_abstracts.py
@@ -10,8 +10,8 @@

rq_prefix = config["rq_prefix"]

TOPICS_OUTPATH = f'{rq_prefix}/{config["abstracts_pipeline"]["path_topics"]}'
PROBS_OUTPATH = f'{rq_prefix}/{config["abstracts_pipeline"]["path_probs"]}'
TOPICS_OUTPATH = f'{config["abstracts_pipeline"]["path_topics"]}'
PROBS_OUTPATH = f'{config["abstracts_pipeline"]["path_probs"]}'
MODEL_OUTPATH = config["abstracts_pipeline"]["dir_topic_model"]
REPRESENTATIVE_DOCS_OUTPATH = (
f'{rq_prefix}/{config["abstracts_pipeline"]["path_repr_docs"]}'
@@ -33,4 +33,4 @@
text_col="title_abstract",
)

save_to_s3(S3_BUCKET, summaries, f"{rq_prefix}/{SUMMARIES_OUTPATH}")
save_to_s3(S3_BUCKET, summaries, f"{SUMMARIES_OUTPATH}")