From c2c32bbeb1e0bed857f2d3ee627847a6dbc80712 Mon Sep 17 00:00:00 2001
From: henrywo1fe
Date: Wed, 14 Aug 2024 15:12:39 +0100
Subject: [PATCH 1/5] Use old code to run Rosie's pipeline

---
 .gitignore                            |  3 ++
 README.md                             |  9 ++--
 dsp_ai_eval/config/base.yaml          | 49 ++++++++++--------
 dsp_ai_eval/getters/scite.py          | 20 ++------
 .../ask_gpt_for_themes.py             |  4 +-
 .../embed_scite_abstracts.py          | 50 +++++++++++--------
 .../plot_abstract_clusters.py         |  4 +-
 dsp_ai_eval/utils/clustering_utils.py | 33 +++++++++---
 8 files changed, 99 insertions(+), 73 deletions(-)

diff --git a/.gitignore b/.gitignore
index 1dc6ea2..cf82948 100644
--- a/.gitignore
+++ b/.gitignore
@@ -127,3 +127,6 @@ target/
 
 # NPM
 node_modules/
+
+# data
+data/
diff --git a/README.md b/README.md
index 4b51ae2..cd8e084 100644
--- a/README.md
+++ b/README.md
@@ -8,6 +8,7 @@
 - Setup the conda environment
 - Configure `pre-commit`
 - Make sure you have a `.env` file with the following keys:
+
 ```
 OPENAI_API_KEY = 'YOUR-KEY-HERE'
 ```
@@ -20,21 +21,23 @@ OPENAI_API_KEY = 'YOUR-KEY-HERE'
 - `generate_themes_with_gpt/`: pipeline for obtaining repeated GPT answers to the research question.
 - `process_abstracts/`: pipeline for performing text clustering on research abstracts
 - `process_gpt_summaries/`: pipeline for performing text clustering on the summaries obtained with the `generate_themes_with_gpt/` pipeline
-
+
 ## How to update the pipeline to run with new data
 
 At the moment the workflow is not fully reproducible - that work is forthcoming! To update the pipeline to work with new research abstracts:
+
 - Download your own research abstracts and upload to the s3 bucket
 - Update relevant paths to your data in `dsp_ai_eval/config/base.yaml`
 - Update the getters in `dsp_ai_eval/getters/scite.py` (you may have more or fewer data files to be concatenated than in the previous iteration of this project)
 - Update the data deduplication steps in `dsp_ai_eval/pipeline/process_abstracts/embed_scite_abstracts.py`.
 
 If you wish to repeat the experiment prompting GPT repeatedly for answers to a research question:
+
-- Update the RQ in `dsp_ai_eval/config/base.yaml`
+- Update the 'research_question' in `dsp_ai_eval/config/base.yaml`
 - Update the filepaths under `gpt_themes_pipeline` in `dsp_ai_eval/config/base.yaml` as desired
-- You should now be able to run `dsp_ai_eval/pipeline/generate_themes_with_gpt/ask_gpt+for_themes.py`
+- You should now be able to run `dsp_ai_eval/pipeline/generate_themes_with_gpt/ask_gpt_for_themes.py`
 
 ## Contributor guidelines
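The `base.yaml` changes below, together with the getter changes later in this series, move the S3 prefix handling around: older code prepended `rq_prefix` to each config path at read time, while these patches bake the full prefix into the config paths themselves. A minimal sketch of the two conventions (values are illustrative only):

```python
# Sketch: how a base.yaml path becomes an S3 key under each convention.
config = {
    "rq_prefix": "early_health_auto_lit_analysis/RQ1",  # illustrative value
    "gpt_themes_pipeline": {
        # full prefix baked into the path, as in this patch
        "path_raw_data": "early_health_auto_lit_analysis/RQ1/inputs/data/gpt/gpt_themes_repeats.jsonl",
    },
}

# Old convention: prefix applied at read time by the getter
key_old = f'{config["rq_prefix"]}/inputs/data/gpt/gpt_themes_repeats.jsonl'

# New convention (this series): the config value already is the full key
key_new = config["gpt_themes_pipeline"]["path_raw_data"]

assert key_old == key_new
```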
diff --git a/dsp_ai_eval/config/base.yaml b/dsp_ai_eval/config/base.yaml
index a932dca..fffcec8 100644
--- a/dsp_ai_eval/config/base.yaml
+++ b/dsp_ai_eval/config/base.yaml
@@ -32,38 +32,47 @@ oa_reclustering_pipeline:
   path_vis_data: outputs/reclustering/data/visualization_data.parquet
 
 gpt_themes_pipeline:
-  path_raw_data: inputs/data/gpt/gpt_themes_repeats.jsonl
-  path_cleaned_data: inputs/data/gpt/gpt_themes_repeats_cleaned.csv
-  path_cleaned_data_w_embeddings: inputs/data/gpt/gpt_themes_repeats_cleaned_embeddings.csv
+  research_question: What health and disease information relating to cancer do the public have high interest in?
+  path_raw_data: early_health_auto_lit_analysis/RQ1/inputs/data/gpt/gpt_themes_repeats.jsonl
+  path_cleaned_data: early_health_auto_lit_analysis/RQ1/inputs/data/gpt/gpt_themes_repeats_cleaned.csv
+  path_cleaned_data_w_embeddings: early_health_auto_lit_analysis/RQ1/inputs/data/gpt/gpt_themes_repeats_cleaned_embeddings.csv
   hdsbscan_min_cluster_size: 20
   n_topics: 12
   tfidf_ngram_min: 1
   tfidf_ngram_max: 3
-  dir_topic_model: outputs/models/bertopic_gpt_themes_model
-  path_probs: outputs/data/bertopic_gpt_themes_model_probs.npy
-  path_topics: outputs/data/bertopic_gpt_themes_model_topics.pkl
-  path_repr_docs: outputs/data/bertopic_gpt_themes_representative_docs.pkl
-  path_summaries: outputs/data/gpt_theme_cluster_summaries.json
-  path_summaries_cleaned: outputs/data/gpt_theme_cluster_summaries_cleaned.csv
+  dir_topic_model: early_health_auto_lit_analysis/RQ1/outputs/models/bertopic_gpt_themes_model
+  path_probs: early_health_auto_lit_analysis/RQ1/outputs/data/bertopic_gpt_themes_model_probs.npy
+  path_topics: early_health_auto_lit_analysis/RQ1/outputs/data/bertopic_gpt_themes_model_topics.pkl
+  path_repr_docs: early_health_auto_lit_analysis/RQ1/outputs/data/bertopic_gpt_themes_representative_docs.pkl
+  path_summaries: early_health_auto_lit_analysis/RQ1/outputs/data/gpt_theme_cluster_summaries.json
+  path_summaries_cleaned: early_health_auto_lit_analysis/RQ1/outputs/data/gpt_theme_cluster_summaries_cleaned.csv
   cluster_colours: tableau20
 
 abstracts_pipeline:
-  path_scite_core_references: inputs/data/scite/How_does_technology_diffusion_impact_UK_growth_and_productivity_3F.csv
-  path_scite_search1: inputs/data/scite/technology diffusion impact on uk growth-2024-02-23.csv
-  path_scite_search2: inputs/data/scite/technology diffusion impact on uk productivity-2024-02-23.csv
+  path_scite_core_references: early_health_auto_lit_analysis/RQ3.2(a)/inputs/data/scite/reducing anxiety in patients with chronic diseases through test result communication-2024-05-24.csv
+  path_scite_search1: early_health_auto_lit_analysis/RQ3.2(a)/inputs/data/scite/communication of health test results for patients with chronic diseases-2024-05-24.csv
+  # path_scite_search2: early_health_auto_lit_analysis/RQ4.1(a)/inputs/data/scite/communicating risk information for cancer-2024-05-23.csv
   citation_threshold: 5 # remove papers with fewer than N citations
-  n_most_relevant_papers: 1000
-  path_cleaned_data_w_embeddings: inputs/data/embeddings/scite_embeddings.parquet
+  n_most_relevant_papers: 1500
+  path_cleaned_data_w_embeddings: early_health_auto_lit_analysis/RQ3.2(a)/inputs/data/embeddings/scite_embeddings.parquet
   hdsbscan_min_cluster_size: 30
   tfidf_ngram_min: 1
   tfidf_ngram_max: 3
-  dir_topic_model: outputs/models/bertopic_abstracts_model
-  path_probs: outputs/data/bertopic_abstracts_model_probs.npy
-  path_topics: outputs/data/bertopic_abstracts_model_topics.pkl
-  path_repr_docs: outputs/data/bertopic_abstracts_representative_docs.pkl
-  path_summaries: outputs/data/abstracts_cluster_summaries.json
-  path_summaries_cleaned: outputs/data/abstracts_cluster_summaries_cleaned.csv
+  dir_topic_model: early_health_auto_lit_analysis/RQ3.2(a)/outputs/models/bertopic_abstracts_model
+  path_probs: early_health_auto_lit_analysis/RQ3.2(a)/outputs/data/bertopic_abstracts_model_probs.npy
+  path_topics: early_health_auto_lit_analysis/RQ3.2(a)/outputs/data/bertopic_abstracts_model_topics.pkl
+  path_repr_docs: early_health_auto_lit_analysis/RQ3.2(a)/outputs/data/bertopic_abstracts_representative_docs.pkl
+  path_summaries: early_health_auto_lit_analysis/RQ3.2(a)/outputs/data/abstracts_cluster_summaries.json
+  path_summaries_cleaned: early_health_auto_lit_analysis/RQ3.2(a)/outputs/data/abstracts_cluster_summaries_cleaned.csv
   cluster_colours: tableau20
+  recluster_dir_topic_model: early_health_auto_lit_analysis/RQ4.1(b)/recluster_price/outputs/models/bertopic_abstracts_model
+  recluster_path_probs: early_health_auto_lit_analysis/RQ4.1(b)/recluster_price/outputs/data/bertopic_abstracts_model_probs.npy
+  recluster_path_topics: early_health_auto_lit_analysis/RQ4.1(b)/recluster_price/outputs/data/bertopic_abstracts_model_topics.pkl
+  recluster_path_repr_docs: early_health_auto_lit_analysis/RQ4.1(b)/recluster_price/outputs/data/bertopic_abstracts_representative_docs.pkl
+  recluster_path_summaries: early_health_auto_lit_analysis/RQ4.1(b)/recluster_price/outputs/data/abstracts_cluster_summaries.json
+  recluster_path_summaries_cleaned: early_health_auto_lit_analysis/RQ4.1(b)/recluster_price/outputs/data/abstracts_cluster_summaries_cleaned.csv
+  recluster_path_cleaned_data_w_embeddings: early_health_auto_lit_analysis/RQ4.1(b)/recluster_price/inputs/data/embeddings/scite_embeddings.parquet
+  topics_to_recluster: [1]
 
 summarization_pipeline:
   gpt_model: gpt-4-turbo
diff --git a/dsp_ai_eval/getters/scite.py b/dsp_ai_eval/getters/scite.py
index 1bc410b..bb59ebd 100644
--- a/dsp_ai_eval/getters/scite.py
+++ b/dsp_ai_eval/getters/scite.py
@@ -61,24 +61,12 @@ def get_abstracts(n_abstracts=N_MOST_RELEVANT_PAPERS):
         "wider",
         n_abstracts,
     )
-    scite_wider_abstracts2 = read_scite_abstracts(
-        f"{rq_prefix}/" + config["abstracts_pipeline"]["path_scite_search2"],
-        "wider",
-        n_abstracts,
-    )
-    scite_wider_abstracts3 = read_scite_abstracts(
-        f"{rq_prefix}/" + config["abstracts_pipeline"]["path_scite_search3"],
-        "wider",
-        n_abstracts,
-    )
+    # scite_wider_abstracts2 = read_scite_abstracts(
+    #     config["abstracts_pipeline"]["path_scite_search2"], "wider", n_abstracts
+    # )
 
     scite_abstracts = pd.concat(
-        [
-            scite_main_abstracts,
-            scite_wider_abstracts1,
-            scite_wider_abstracts2,
-            scite_wider_abstracts3,
-        ]
+        [scite_main_abstracts, scite_wider_abstracts1]  # , scite_wider_abstracts2
     )
 
     return scite_abstracts
diff --git a/dsp_ai_eval/pipeline/generate_themes_with_gpt/ask_gpt_for_themes.py b/dsp_ai_eval/pipeline/generate_themes_with_gpt/ask_gpt_for_themes.py
index 753da9c..1b376db 100644
--- a/dsp_ai_eval/pipeline/generate_themes_with_gpt/ask_gpt_for_themes.py
+++ b/dsp_ai_eval/pipeline/generate_themes_with_gpt/ask_gpt_for_themes.py
@@ -28,12 +28,12 @@
 GPT_MODEL = "gpt-3.5-turbo"
 TEMPS = [0, 0.25, 0.5, 1]
-RQ = config["gpt_themes_pipeline"]["RQ"]
+RQ = config["gpt_themes_pipeline"]["research_question"]
 SYSTEM_MESSAGE = "You are a helpful research assistant. Given a research question, you provide a summary of the key topics in academic research on that topic."
 N_SAMPLES = 50
 
 # output
-FILENAME = "inputs/data/gpt/gpt_themes_repeats.jsonl"
+FILENAME = "early_health_auto_lit_analysis/RQ1/inputs/data/gpt/gpt_themes_repeats.jsonl"
 OUT_FILE = PROJECT_DIR / FILENAME
 
 rq_prefix: str = config["rq_prefix"]
diff --git a/dsp_ai_eval/pipeline/process_abstracts/embed_scite_abstracts.py b/dsp_ai_eval/pipeline/process_abstracts/embed_scite_abstracts.py
index 01a5c4e..1bd8014 100644
--- a/dsp_ai_eval/pipeline/process_abstracts/embed_scite_abstracts.py
+++ b/dsp_ai_eval/pipeline/process_abstracts/embed_scite_abstracts.py
@@ -24,6 +24,32 @@ def first_non_nan(series: pd.Series) -> Union[pd.Series, np.nan]:
     return series.dropna().iloc[0] if not series.dropna().empty else np.nan
 
 
+def create_agg_dict(df):
+    """
+    Create an aggregation dictionary for a dataframe.
+
+    This function creates an aggregation dictionary for a dataframe,
+    where each existing column in the dataframe is associated with the
+    'first_non_nan' function. This dictionary can be used in aggregation
+    operations, such as 'groupby.agg()'.
+
+    Parameters:
+        df (pandas.DataFrame): The dataframe for which to create the aggregation dictionary.
+
+    Returns:
+        dict: The aggregation dictionary.
+    """
+
+    # Create a list of all existing columns in the dataframe
+    existing_columns = [col for col in df.columns]
+
+    # Create an aggregation dictionary where each column is associated with the 'first_non_nan' function
+    agg_dict = {col: first_non_nan for col in existing_columns}
+
+    # Return the aggregation dictionary
+    return agg_dict
+
+
 if __name__ == "__main__":
     scite_abstracts = get_abstracts(n_abstracts=N_MOST_RELEVANT_PAPERS)
     logging.info(f"Total number of abstracts: {len(scite_abstracts)}")
@@ -38,28 +64,8 @@
         f"Number of abstracts remaining after dropping duplicates: {len(scite_abstracts)}"
     )
 
-    agg_dict = {
-        "date": first_non_nan,
-        "title": first_non_nan,
-        "doi": first_non_nan,
-        "authors": first_non_nan,
-        "journal": first_non_nan,
-        "short_journal": first_non_nan,
-        "volume": first_non_nan,
-        "year": first_non_nan,
-        "publisher": first_non_nan,
-        "issue": first_non_nan,
-        "page": first_non_nan,
-        "abstract": first_non_nan,
-        "category": first_non_nan,
-        "pmid": first_non_nan,
-        "issns": first_non_nan,
-        "supporting_cites": first_non_nan,
-        "contrasting_cites": first_non_nan,
-        "mentioning_cites": first_non_nan,
-        "total_cites": first_non_nan,
-        "scite_report_link": first_non_nan,
-    }
+    # Create an aggregation dictionary for the dataframe
+    agg_dict = create_agg_dict(scite_abstracts)
 
     # Group by 'doi' and aggregate
     scite_abstracts = (
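For reference, the `create_agg_dict` helper added above feeds the `groupby('doi').agg(...)` deduplication step that follows it. A minimal, self-contained sketch of the same pattern (the sample frame and column names are illustrative, not taken from the real scite data):

```python
import numpy as np
import pandas as pd

def first_non_nan(series: pd.Series):
    # Keep the first non-null value in each group, or NaN if the group is all-null.
    return series.dropna().iloc[0] if not series.dropna().empty else np.nan

df = pd.DataFrame(
    {
        "doi": ["10.1/x", "10.1/x", "10.2/y"],
        "title": [np.nan, "Recovered title", "Other paper"],
        "total_cites": [12, np.nan, 3],
    }
)

# One aggregator per column, excluding the grouping key itself.
agg_dict = {col: first_non_nan for col in df.columns if col != "doi"}

deduplicated = df.groupby("doi").agg(agg_dict).reset_index()
# -> one row per DOI, with the first non-null value kept for every column
```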
diff --git a/dsp_ai_eval/pipeline/process_abstracts/plot_abstract_clusters.py b/dsp_ai_eval/pipeline/process_abstracts/plot_abstract_clusters.py
index b0d538a..0696233 100644
--- a/dsp_ai_eval/pipeline/process_abstracts/plot_abstract_clusters.py
+++ b/dsp_ai_eval/pipeline/process_abstracts/plot_abstract_clusters.py
@@ -132,11 +132,11 @@ def create_chart(
     # Save the chart
     if save:
         filename = f"scite_abstracts{filename_suffix}.html"
-        plot.save(PROJECT_DIR / f"outputs/figures/{filename}")
+        plot.save(PROJECT_DIR / f"outputs/RQ3.2(a)/figures/{filename}")
         viz_save.save(
             plot,
             f"scite_abstracts{filename_suffix}",
-            PROJECT_DIR / "outputs/figures",
+            PROJECT_DIR / "outputs/RQ3.2(a)/figures",
             save_png=True,
         )
diff --git a/dsp_ai_eval/utils/clustering_utils.py b/dsp_ai_eval/utils/clustering_utils.py
index 3510b50..6ef5378 100644
--- a/dsp_ai_eval/utils/clustering_utils.py
+++ b/dsp_ai_eval/utils/clustering_utils.py
@@ -17,12 +17,23 @@
 import re
 
 import pandas as pd
-from tqdm import tqdm
-from time import sleep
-from datetime import date
-from typing import List
+import re
+import spacy
+from sentence_transformers import SentenceTransformer
+from sklearn.feature_extraction.text import CountVectorizer
+import time
+from umap import UMAP
+
+from dsp_ai_eval import PROJECT_DIR, logging, config
+from dsp_ai_eval.utils import utils
+
+from langfuse.callback import CallbackHandler
+
+langfuse_handler = CallbackHandler()
+
+load_dotenv()
 
-from dsp_ai_eval import logger, config
+OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
 
 GPT_MODEL = config["summarization_pipeline"]["gpt_model"]
@@ -64,7 +75,13 @@ def create_new_topic_model(
     calculate_probabilities=False,
     embedding_model=config["embedding_model"],
 ):
-    logger.info("Initialising BERTopic model...")
+    # Check if 'en_core_web_sm' is installed, if not, download it
+    try:
+        spacy.load("en_core_web_sm")
+    except OSError:
+        print("Model 'en_core_web_sm' not found. Downloading...")
+        os.system("python -m spacy download en_core_web_sm")
+
     sentence_model = SentenceTransformer(embedding_model)
     umap_model = UMAP(
         n_neighbors=15,
@@ -250,11 +267,11 @@ def get_summaries(
     )
 
     # Generate the summary
-    output = (llm_chain | parser).invoke(
+    summary_result = llm_chain.invoke(
         {"texts": texts, "keywords": keywords},
         config={"callbacks": [langfuse_handler]},
     )
-    # output = parser.invoke(summary_result, config={"callbacks": [langfuse_handler]})
+    output = parser.invoke(summary_result, config={"callbacks": [langfuse_handler]})
 
     summaries[topic] = {
         "Name:": output.name,

From 7bceda21842685f9694a5530f4d7e322927bec41 Mon Sep 17 00:00:00 2001
From: henrywo1fe
Date: Wed, 14 Aug 2024 16:00:47 +0100
Subject: [PATCH 2/5] Added dotenv import

---
 dsp_ai_eval/utils/clustering_utils.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/dsp_ai_eval/utils/clustering_utils.py b/dsp_ai_eval/utils/clustering_utils.py
index 6ef5378..de334ee 100644
--- a/dsp_ai_eval/utils/clustering_utils.py
+++ b/dsp_ai_eval/utils/clustering_utils.py
@@ -23,6 +23,7 @@
 from sklearn.feature_extraction.text import CountVectorizer
 import time
 from umap import UMAP
+from dotenv import load_dotenv
 
 from dsp_ai_eval import PROJECT_DIR, logging, config
 from dsp_ai_eval.utils import utils

From 5e491720ac702eba03498d0a78b3874c7d744c83 Mon Sep 17 00:00:00 2001
From: henrywo1fe
Date: Tue, 20 Aug 2024 10:39:27 +0100
Subject: [PATCH 3/5] Added research question.

---
 dsp_ai_eval/config/base.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dsp_ai_eval/config/base.yaml b/dsp_ai_eval/config/base.yaml
index fffcec8..0aadc90 100644
--- a/dsp_ai_eval/config/base.yaml
+++ b/dsp_ai_eval/config/base.yaml
@@ -1,7 +1,7 @@
 embedding_model: all-miniLM-L6-v2
 seed: 42
-rq_prefix: /
-RQ:
+rq_prefix: andromeda
+RQ: How to identify misinformation?
 
 oa_abstracts_pipeline:
   path_raw_data: inputs/openalex/data/works_raw.parquet

From 572a55564fdbdd4feeb79305b17d06e3bbf92322 Mon Sep 17 00:00:00 2001
From: henrywo1fe
Date: Thu, 10 Oct 2024 10:27:46 +0100
Subject: [PATCH 4/5] Edits to make Rosie's pipeline work

---
 dsp_ai_eval/config/base.yaml               | 36 +++++++++----------
 dsp_ai_eval/getters/scite.py               | 16 ++++-----
 .../pipeline/process_abstracts/README.md   |  7 ++++
 .../clean_cluster_summaries.py             |  4 +--
 .../process_abstracts/cluster_abstracts.py |  4 +--
 .../cluster_summarization_pipeline.py      |  2 +-
 .../embed_scite_abstracts.py               |  4 +--
 .../plot_abstract_clusters.py              | 16 ++++---
 dsp_ai_eval/utils/clustering_utils.py      |  7 +++-
 9 files changed, 56 insertions(+), 40 deletions(-)

diff --git a/dsp_ai_eval/config/base.yaml b/dsp_ai_eval/config/base.yaml
index 0aadc90..42b5d1d 100644
--- a/dsp_ai_eval/config/base.yaml
+++ b/dsp_ai_eval/config/base.yaml
@@ -1,7 +1,7 @@
 embedding_model: all-miniLM-L6-v2
 seed: 42
-rq_prefix: andromeda
-RQ: How to identify misinformation?
+rq_prefix: "GSK_test"
+RQ: Theoretical basis for behavioural boosts and behavioural boosting
 
 oa_abstracts_pipeline:
   path_raw_data: inputs/openalex/data/works_raw.parquet
@@ -49,29 +49,29 @@ gpt_themes_pipeline:
   cluster_colours: tableau20
 
 abstracts_pipeline:
-  path_scite_core_references: early_health_auto_lit_analysis/RQ3.2(a)/inputs/data/scite/reducing anxiety in patients with chronic diseases through test result communication-2024-05-24.csv
-  path_scite_search1: early_health_auto_lit_analysis/RQ3.2(a)/inputs/data/scite/communication of health test results for patients with chronic diseases-2024-05-24.csv
+  path_scite_core_references: GSK_vaccine_uptake/rosies_version/1.1/inputs/data/scite/behavioural barriers to adult vaccination uptake in the uk-2024-08-13.csv
+  path_scite_search1: GSK_vaccine_uptake/rosies_version/1.1/inputs/data/scite/social motivators for adult vaccination in the uk-2024-08-13.csv
   # path_scite_search2: early_health_auto_lit_analysis/RQ4.1(a)/inputs/data/scite/communicating risk information for cancer-2024-05-23.csv
   citation_threshold: 5 # remove papers with fewer than N citations
   n_most_relevant_papers: 1500
-  path_cleaned_data_w_embeddings: early_health_auto_lit_analysis/RQ3.2(a)/inputs/data/embeddings/scite_embeddings.parquet
+  path_cleaned_data_w_embeddings: GSK_vaccine_uptake/rosies_version/1.1/inputs/data/embeddings/scite_embeddings.parquet
   hdsbscan_min_cluster_size: 30
   tfidf_ngram_min: 1
   tfidf_ngram_max: 3
-  dir_topic_model: early_health_auto_lit_analysis/RQ3.2(a)/outputs/models/bertopic_abstracts_model
-  path_probs: early_health_auto_lit_analysis/RQ3.2(a)/outputs/data/bertopic_abstracts_model_probs.npy
-  path_topics: early_health_auto_lit_analysis/RQ3.2(a)/outputs/data/bertopic_abstracts_model_topics.pkl
-  path_repr_docs: early_health_auto_lit_analysis/RQ3.2(a)/outputs/data/bertopic_abstracts_representative_docs.pkl
-  path_summaries: early_health_auto_lit_analysis/RQ3.2(a)/outputs/data/abstracts_cluster_summaries.json
-  path_summaries_cleaned: early_health_auto_lit_analysis/RQ3.2(a)/outputs/data/abstracts_cluster_summaries_cleaned.csv
+  dir_topic_model: GSK_vaccine_uptake/rosies_version/1.1/outputs/models/bertopic_abstracts_model
+  path_probs: GSK_vaccine_uptake/rosies_version/1.1/outputs/data/bertopic_abstracts_model_probs.npy
+  path_topics: GSK_vaccine_uptake/rosies_version/1.1/outputs/data/bertopic_abstracts_model_topics.pkl
+  path_repr_docs: GSK_vaccine_uptake/rosies_version/1.1/outputs/data/bertopic_abstracts_representative_docs.pkl
+  path_summaries: GSK_vaccine_uptake/rosies_version/1.1/outputs/data/abstracts_cluster_summaries.json
+  path_summaries_cleaned: GSK_vaccine_uptake/rosies_version/1.1/outputs/data/abstracts_cluster_summaries_cleaned.csv
   cluster_colours: tableau20
-  recluster_dir_topic_model: early_health_auto_lit_analysis/RQ4.1(b)/recluster_price/outputs/models/bertopic_abstracts_model
-  recluster_path_probs: early_health_auto_lit_analysis/RQ4.1(b)/recluster_price/outputs/data/bertopic_abstracts_model_probs.npy
-  recluster_path_topics: early_health_auto_lit_analysis/RQ4.1(b)/recluster_price/outputs/data/bertopic_abstracts_model_topics.pkl
-  recluster_path_repr_docs: early_health_auto_lit_analysis/RQ4.1(b)/recluster_price/outputs/data/bertopic_abstracts_representative_docs.pkl
-  recluster_path_summaries: early_health_auto_lit_analysis/RQ4.1(b)/recluster_price/outputs/data/abstracts_cluster_summaries.json
-  recluster_path_summaries_cleaned: early_health_auto_lit_analysis/RQ4.1(b)/recluster_price/outputs/data/abstracts_cluster_summaries_cleaned.csv
-  recluster_path_cleaned_data_w_embeddings: early_health_auto_lit_analysis/RQ4.1(b)/recluster_price/inputs/data/embeddings/scite_embeddings.parquet
+  recluster_dir_topic_model: GSK_vaccine_uptake/rosies_version/1.1/recluster_price/outputs/models/bertopic_abstracts_model
+  recluster_path_probs: GSK_vaccine_uptake/rosies_version/1.1/recluster_price/outputs/data/bertopic_abstracts_model_probs.npy
+  recluster_path_topics: GSK_vaccine_uptake/rosies_version/1.1/recluster_price/outputs/data/bertopic_abstracts_model_topics.pkl
+  recluster_path_repr_docs: GSK_vaccine_uptake/rosies_version/1.1/recluster_price/outputs/data/bertopic_abstracts_representative_docs.pkl
+  recluster_path_summaries: GSK_vaccine_uptake/rosies_version/1.1/recluster_price/outputs/data/abstracts_cluster_summaries.json
+  recluster_path_summaries_cleaned: GSK_vaccine_uptake/rosies_version/1.1/recluster_price/outputs/data/abstracts_cluster_summaries_cleaned.csv
+  recluster_path_cleaned_data_w_embeddings: GSK_vaccine_uptake/rosies_version/1.1/recluster_price/inputs/data/embeddings/scite_embeddings.parquet
   topics_to_recluster: [1]
 
 summarization_pipeline:
diff --git a/dsp_ai_eval/getters/scite.py b/dsp_ai_eval/getters/scite.py
index bb59ebd..e563bc9 100644
--- a/dsp_ai_eval/getters/scite.py
+++ b/dsp_ai_eval/getters/scite.py
@@ -52,12 +52,12 @@ def read_scite_abstracts(
 
 def get_abstracts(n_abstracts=N_MOST_RELEVANT_PAPERS):
     scite_main_abstracts = read_scite_abstracts(
-        f"{rq_prefix}/" + config["abstracts_pipeline"]["path_scite_core_references"],
+        config["abstracts_pipeline"]["path_scite_core_references"],
         "main",
         n_abstracts,
     )
     scite_wider_abstracts1 = read_scite_abstracts(
-        f"{rq_prefix}/" + config["abstracts_pipeline"]["path_scite_search1"],
+        config["abstracts_pipeline"]["path_scite_search1"],
         "wider",
         n_abstracts,
     )
@@ -74,7 +74,7 @@ def get_abstracts(n_abstracts=N_MOST_RELEVANT_PAPERS):
 
 def get_scite_df_w_embeddings():
     filename = config["abstracts_pipeline"]["path_cleaned_data_w_embeddings"]
-    return load_s3_data(S3_BUCKET, f"{rq_prefix}/{filename}")
+    return load_s3_data(S3_BUCKET, f"{filename}")
 
 
 def get_topic_model():
@@ -92,24 +92,24 @@ def get_topic_model():
 
 def get_topics():
     filemame = config["abstracts_pipeline"]["path_topics"]
-    return load_s3_data(S3_BUCKET, f"{rq_prefix}/{filemame}")
+    return load_s3_data(S3_BUCKET, f"{filemame}")
 
 
 def get_probs():
     filename = config["abstracts_pipeline"]["path_probs"]
-    return load_s3_data(S3_BUCKET, f"{rq_prefix}/{filename}")
+    return load_s3_data(S3_BUCKET, f"{filename}")
 
 
 def get_representative_docs():
     filename = config["abstracts_pipeline"]["path_repr_docs"]
-    return load_s3_data(S3_BUCKET, f"{rq_prefix}/{filename}")
+    return load_s3_data(S3_BUCKET, f"{filename}")
 
 
 def get_cluster_summaries():
     filename = config["abstracts_pipeline"]["path_summaries"]
-    return load_s3_data(S3_BUCKET, f"{rq_prefix}/{filename}")
+    return load_s3_data(S3_BUCKET, f"{filename}")
 
 
 def get_cluster_summaries_clean():
     filename = config["abstracts_pipeline"]["path_summaries_cleaned"]
-    return load_s3_data(S3_BUCKET, f"{rq_prefix}/{filename}")
+    return load_s3_data(S3_BUCKET, f"{filename}")
diff --git a/dsp_ai_eval/pipeline/process_abstracts/README.md b/dsp_ai_eval/pipeline/process_abstracts/README.md
index 0291a76..870b7fd 100644
--- a/dsp_ai_eval/pipeline/process_abstracts/README.md
+++ b/dsp_ai_eval/pipeline/process_abstracts/README.md
@@ -22,3 +22,10 @@ Note that the final step, `plot_abstract_clusters.py`, saves plots **locally** r
 4. `clean_cluster_summaries.py`: do some minor cleaning on the GPT-generated cluster summaries. (This is in a separate script from `cluster_summarization_pipeline.py` just so that if we want to modify the cleaning steps, we don't have to regenerate the summaries, as doing so comes with a cost.)
 
 5. `plot_abstract_clusters.py`: visualize the clusters that we have created!
+
+CHECKLIST
+
+- Update file paths in base.yaml
+- Update output path for figs in plot_abstract_clusters.py and add corresponding output file in finder
+- Update number of args in get_abstracts in scite.py, and in embed_scite_abstracts (?)
+- Put data from scite in S3
diff --git a/dsp_ai_eval/pipeline/process_abstracts/clean_cluster_summaries.py b/dsp_ai_eval/pipeline/process_abstracts/clean_cluster_summaries.py
index cce8083..536cf4f 100644
--- a/dsp_ai_eval/pipeline/process_abstracts/clean_cluster_summaries.py
+++ b/dsp_ai_eval/pipeline/process_abstracts/clean_cluster_summaries.py
@@ -11,6 +11,4 @@
 
     cluster_summaries_cleaned = clean_cluster_summaries(cluster_summaries)
 
-    save_to_s3(
-        S3_BUCKET, cluster_summaries_cleaned, f"{rq_prefix}/{CLUSTER_SUMMARIES_OUTPATH}"
-    )
+    save_to_s3(S3_BUCKET, cluster_summaries_cleaned, f"{CLUSTER_SUMMARIES_OUTPATH}")
diff --git a/dsp_ai_eval/pipeline/process_abstracts/cluster_abstracts.py b/dsp_ai_eval/pipeline/process_abstracts/cluster_abstracts.py
index f07bf39..1da9edd 100644
--- a/dsp_ai_eval/pipeline/process_abstracts/cluster_abstracts.py
+++ b/dsp_ai_eval/pipeline/process_abstracts/cluster_abstracts.py
@@ -10,8 +10,8 @@
 
 rq_prefix = config["rq_prefix"]
 
-TOPICS_OUTPATH = f'{rq_prefix}/{config["abstracts_pipeline"]["path_topics"]}'
-PROBS_OUTPATH = f'{rq_prefix}/{config["abstracts_pipeline"]["path_probs"]}'
+TOPICS_OUTPATH = f'{config["abstracts_pipeline"]["path_topics"]}'
+PROBS_OUTPATH = f'{config["abstracts_pipeline"]["path_probs"]}'
 MODEL_OUTPATH = config["abstracts_pipeline"]["dir_topic_model"]
 REPRESENTATIVE_DOCS_OUTPATH = (
     f'{rq_prefix}/{config["abstracts_pipeline"]["path_repr_docs"]}'
diff --git a/dsp_ai_eval/pipeline/process_abstracts/cluster_summarization_pipeline.py b/dsp_ai_eval/pipeline/process_abstracts/cluster_summarization_pipeline.py
index 0397303..12bdca2 100644
--- a/dsp_ai_eval/pipeline/process_abstracts/cluster_summarization_pipeline.py
+++ b/dsp_ai_eval/pipeline/process_abstracts/cluster_summarization_pipeline.py
@@ -33,4 +33,4 @@
         text_col="title_abstract",
     )
 
-    save_to_s3(S3_BUCKET, summaries, f"{rq_prefix}/{SUMMARIES_OUTPATH}")
+    save_to_s3(S3_BUCKET, summaries, f"{SUMMARIES_OUTPATH}")
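The checklist item "Update number of args in get_abstracts in scite.py" above refers to adding or removing `read_scite_abstracts` calls when the number of scite export files changes. A sketch of re-enabling a second wider search, assuming the commented-out `path_scite_search2` key is restored in `base.yaml` (`read_scite_abstracts`, `config`, and `N_MOST_RELEVANT_PAPERS` are the module-level names shown in the diffs above):

```python
import pandas as pd

def get_abstracts(n_abstracts=N_MOST_RELEVANT_PAPERS):
    # Core references plus two wider keyword searches, concatenated.
    scite_main_abstracts = read_scite_abstracts(
        config["abstracts_pipeline"]["path_scite_core_references"], "main", n_abstracts
    )
    scite_wider_abstracts1 = read_scite_abstracts(
        config["abstracts_pipeline"]["path_scite_search1"], "wider", n_abstracts
    )
    scite_wider_abstracts2 = read_scite_abstracts(
        config["abstracts_pipeline"]["path_scite_search2"], "wider", n_abstracts
    )
    return pd.concat(
        [scite_main_abstracts, scite_wider_abstracts1, scite_wider_abstracts2]
    )
```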
diff --git a/dsp_ai_eval/pipeline/process_abstracts/embed_scite_abstracts.py b/dsp_ai_eval/pipeline/process_abstracts/embed_scite_abstracts.py
index 1bd8014..7ac7271 100644
--- a/dsp_ai_eval/pipeline/process_abstracts/embed_scite_abstracts.py
+++ b/dsp_ai_eval/pipeline/process_abstracts/embed_scite_abstracts.py
@@ -14,13 +14,13 @@
 
 OUT_PATH = config["abstracts_pipeline"]["path_cleaned_data_w_embeddings"]
 rq_prefix = config["rq_prefix"]
-S3_KEY = f"{rq_prefix}/{OUT_PATH}"
+S3_KEY = f"{OUT_PATH}"
 
 CITATION_THRESHOLD = config["abstracts_pipeline"]["citation_threshold"]
 N_MOST_RELEVANT_PAPERS = config["abstracts_pipeline"]["n_most_relevant_papers"]
 
 
-def first_non_nan(series: pd.Series) -> Union[pd.Series, np.nan]:
+def first_non_nan(series: pd.Series) -> Union[pd.Series, float]:
     return series.dropna().iloc[0] if not series.dropna().empty else np.nan
diff --git a/dsp_ai_eval/pipeline/process_abstracts/plot_abstract_clusters.py b/dsp_ai_eval/pipeline/process_abstracts/plot_abstract_clusters.py
index 0696233..b081c13 100644
--- a/dsp_ai_eval/pipeline/process_abstracts/plot_abstract_clusters.py
+++ b/dsp_ai_eval/pipeline/process_abstracts/plot_abstract_clusters.py
@@ -120,9 +120,15 @@ def create_chart(
             "y:Q", axis=alt.Axis(ticks=False, labels=False, title=None, grid=False)
         ),
         size=size_encode,
-        color=alt.Color("topic_name:N", legend=None).scale(
-            scheme=config["abstracts_pipeline"]["cluster_colours"]
-        ),
+        color=alt.Color(
+            "topic_name:N",
+            legend=alt.Legend(
+                title="Topic Names",
+                titleFontSize=12,
+                labelFontSize=12,
+                labelPadding=100,
+            ),
+        ).scale(scheme=config["abstracts_pipeline"]["cluster_colours"]),
         opacity=opacity_condition,
         tooltip=tooltip_fields,
     ).mark_circle()
@@ -132,11 +138,11 @@ def create_chart(
     # Save the chart
     if save:
         filename = f"scite_abstracts{filename_suffix}.html"
-        plot.save(PROJECT_DIR / f"outputs/RQ3.2(a)/figures/{filename}")
+        plot.save(PROJECT_DIR / f"outputs/GSK/1.1/figures/{filename}")
         viz_save.save(
             plot,
             f"scite_abstracts{filename_suffix}",
-            PROJECT_DIR / "outputs/RQ3.2(a)/figures",
+            PROJECT_DIR / "outputs/GSK/1.1/figures",
             save_png=True,
         )
diff --git a/dsp_ai_eval/utils/clustering_utils.py b/dsp_ai_eval/utils/clustering_utils.py
index de334ee..499ddba 100644
--- a/dsp_ai_eval/utils/clustering_utils.py
+++ b/dsp_ai_eval/utils/clustering_utils.py
@@ -24,12 +24,17 @@
 import time
 from umap import UMAP
 from dotenv import load_dotenv
-
+import os
+from typing import List
 from dsp_ai_eval import PROJECT_DIR, logging, config
 from dsp_ai_eval.utils import utils
+from tqdm import tqdm
+from datetime import date
+from time import sleep
 
 from langfuse.callback import CallbackHandler
 
+logger = logging.getLogger(__name__)
 langfuse_handler = CallbackHandler()
 
 load_dotenv()
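One note on the `first_non_nan` annotation change in the patch above: `np.nan` is a float value, not a type, so `Union[pd.Series, np.nan]` is rejected by the `typing` module, whereas `Union[pd.Series, float]` is valid (NaN itself is a float). A minimal sketch of the corrected signature:

```python
from typing import Union

import numpy as np
import pandas as pd

def first_non_nan(series: pd.Series) -> Union[pd.Series, float]:
    # `float` covers the missing-value case, because np.nan is a float;
    # putting np.nan itself inside Union[...] raises a TypeError when the
    # annotation is evaluated.
    return series.dropna().iloc[0] if not series.dropna().empty else np.nan
```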

From be54719465bf99dc70d864e0420a338adb77c167 Mon Sep 17 00:00:00 2001
From: henrywo1fe
Date: Thu, 10 Oct 2024 17:43:50 +0100
Subject: [PATCH 5/5] Bug fixes to make the pipeline run smoothly

---
 dsp_ai_eval/pipeline/openalex/__init__.py       |  4 +++-
 dsp_ai_eval/pipeline/openalex/clustering.py     | 10 +++++++++-
 .../pipeline/openalex/plot_abstract_clusters.py |  6 ++++--
 dsp_ai_eval/pipeline/openalex/works.py          | 10 +++++---
 4 files changed, 23 insertions(+), 7 deletions(-)

diff --git a/dsp_ai_eval/pipeline/openalex/__init__.py b/dsp_ai_eval/pipeline/openalex/__init__.py
index 7f04a4c..b08b5c0 100644
--- a/dsp_ai_eval/pipeline/openalex/__init__.py
+++ b/dsp_ai_eval/pipeline/openalex/__init__.py
@@ -15,7 +15,9 @@
 
 @app.command()
-def run_pipeline(config: Annotated[Optional[Path], typer.Option()] = None):
+def run_pipeline(config=config):
+    # config: Annotated[Optional[Path], typer.Option()] = None
+    config = eval(config)
     works.run_pipeline(config=config)
     clustering.run_pipeline(config=config)
diff --git a/dsp_ai_eval/pipeline/openalex/clustering.py b/dsp_ai_eval/pipeline/openalex/clustering.py
index 41eb5f6..eb478cb 100644
--- a/dsp_ai_eval/pipeline/openalex/clustering.py
+++ b/dsp_ai_eval/pipeline/openalex/clustering.py
@@ -11,6 +11,7 @@
 
 @app.command()
 def cluster_abstracts(config: Annotated[Optional[Path], typer.Option()] = None):
+    # config = eval(config)
     from dsp_ai_eval.pipeline.openalex import cluster_abstracts
 
     cluster_abstracts.run_pipeline(config=config)
@@ -18,6 +19,7 @@
 
 @app.command()
 def summarise_clusters(config: Annotated[Optional[Path], typer.Option()] = None):
+    # config = eval(config)
     from dsp_ai_eval.pipeline.openalex import cluster_summarization
     from dsp_ai_eval.pipeline.openalex import clean_cluster_summaries
 
@@ -27,6 +29,7 @@
 
 @app.command()
 def create_plots(config: Annotated[Optional[Path], typer.Option()] = None):
+    # config = eval(config)
     from dsp_ai_eval.pipeline.openalex import plot_abstract_clusters
 
     if config is None:
@@ -37,6 +40,7 @@
 
 @app.command()
 def recluster(config: Annotated[Optional[Path], typer.Option()] = None):
+    # config = eval(config)
     import questionary
     from dsp_ai_eval.getters.openalex import get_cluster_summaries_clean
     from dsp_ai_eval.pipeline.openalex import reclustering
@@ -68,7 +72,11 @@
 
 @app.command()
-def run_pipeline(config: Annotated[Optional[Path], typer.Option()] = None):
+def run_pipeline(
+    config=base_config,
+    # config: Annotated[Optional[Path], typer.Option()] = None
+):
+    # config = eval(config)
     cluster_abstracts(config=config)
     summarise_clusters(config=config)
     create_plots(config=config)
diff --git a/dsp_ai_eval/pipeline/openalex/plot_abstract_clusters.py b/dsp_ai_eval/pipeline/openalex/plot_abstract_clusters.py
index d879b96..53d72b8 100644
--- a/dsp_ai_eval/pipeline/openalex/plot_abstract_clusters.py
+++ b/dsp_ai_eval/pipeline/openalex/plot_abstract_clusters.py
@@ -153,7 +153,7 @@ def create_chart(
     # Save the chart
     if save:
         filename = f"openalex_abstracts{filename_suffix}.html"
-        plot.save(PROJECT_DIR / f"outputs/figures/{filename}")
+        plot.save(PROJECT_DIR / f"outputs/figures/{filename}", format="html")
         viz_save.save(
             plot,
             f"openalex_abstracts{filename_suffix}",
@@ -222,7 +222,9 @@
         df_vis,
         f"{config['rq_prefix']}/{config['oa_abstracts_pipeline']['path_vis_data']}",
     )
-
+    df_vis = df_vis[
+        ["topic", "topic_name", "total_cites", "doc", "x", "y"]
+    ]  # Select specific cols to avoid JSON error
     create_chart(
         df_vis, scale_by_citations=False, filename_suffix="", add_topic_legend=True
     )
diff --git a/dsp_ai_eval/pipeline/openalex/works.py b/dsp_ai_eval/pipeline/openalex/works.py
index 80a1634..d36b92e 100644
--- a/dsp_ai_eval/pipeline/openalex/works.py
+++ b/dsp_ai_eval/pipeline/openalex/works.py
@@ -9,7 +9,7 @@
 
 app = typer.Typer()
 
-user = "solomon.yu@nesta.org.uk"  # use a separate config file
+user = "henry.nurick@bi.team"  # use a separate config file
 RQ = config["RQ"]
 rq_prefix = config["rq_prefix"]
 OUTPATH = config["oa_abstracts_pipeline"]["path_raw_data"]
@@ -43,10 +43,12 @@ def get(
 
 @app.command()
 def process(
-    config: Annotated[Optional[Path], typer.Option()] = None,
+    config=config,
+    # config: Annotated[Optional[Path], typer.Option()] = None,
     openalex_rmin: int = 10,
     bm25_topk: int = 1000,
 ):
+    # config = eval(config)
     import pandas as pd
     from dsp_ai_eval.pipeline.openalex.utils import (
         filter_relevance_score,
@@ -83,8 +85,10 @@
     min_cites_count(filtered)
 
 
+@app.command()
 def run_pipeline(
-    config: Annotated[Optional[Path], typer.Option()] = None,
+    config=config,
+    # config: Annotated[Optional[Path], typer.Option()] = None,
 ):
     get(config=config)
     process(config=config)
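The `config = eval(config)` lines left commented through patch 5 hint at the underlying issue these commands were working around: typer passes option values in as strings. If the CLI option is re-enabled later, a safer pattern is to treat the option as a path and load YAML rather than `eval`-ing a string. A sketch under the assumption that PyYAML is available; this is not part of the patch series, and the fallback import mirrors the repo's existing `from dsp_ai_eval import config`:

```python
from pathlib import Path
from typing import Optional

import typer
import yaml

app = typer.Typer()

@app.command()
def run_pipeline(config_path: Optional[Path] = typer.Option(None)):
    # Load the run configuration from a YAML file instead of eval()-ing a string.
    if config_path is not None:
        with open(config_path) as f:
            config = yaml.safe_load(f)
    else:
        from dsp_ai_eval import config  # fall back to the package default
    # ... hand `config` to the downstream pipeline steps as before
```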