Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Annotation retrieval and textualization #383

Merged
merged 23 commits into from
Nov 25, 2024
Merged
Show file tree
Hide file tree
Changes from 17 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
6242743
Fill annotation store when transferring a result to the llm page.
JuliaS92 Nov 21, 2024
238203c
Link out to selected uniprot id instead of querying for the gene name
JuliaS92 Nov 21, 2024
c02f0be
Introduce constants for extracted fields and create text representati…
JuliaS92 Nov 21, 2024
9d920fe
change analysis helper to fit
JuliaS92 Nov 21, 2024
9506c0e
Add element to LLM page to display retrieved data
JuliaS92 Nov 21, 2024
94d52bd
fix tests as far as feasible
JuliaS92 Nov 21, 2024
7fb9233
extract and document getting regulated features
JuliaS92 Nov 22, 2024
d95f07d
Tests for get_regulated_features
JuliaS92 Nov 22, 2024
8d3cad5
Add documentation and tests for get_uniprot_data
JuliaS92 Nov 22, 2024
166aa99
Todos for large numbers of proteins.
JuliaS92 Nov 22, 2024
4c38218
Extract and document display of retrieved information
JuliaS92 Nov 22, 2024
e7cc17e
Fix tests for display_proteins
JuliaS92 Nov 22, 2024
f61649f
move uniprot utils tests
JuliaS92 Nov 22, 2024
bcef330
Refactor uniprot_utils (function names, docstrings, ordering)
JuliaS92 Nov 22, 2024
6e42f58
Add interface to LLM page to select uniprot information
JuliaS92 Nov 22, 2024
56367d9
Fix broken tests
JuliaS92 Nov 22, 2024
baeab65
Add tests for the two remaining public functions in uniprot utils.
JuliaS92 Nov 22, 2024
4ede179
Add important TODO
JuliaS92 Nov 22, 2024
01ecbbd
Simplify parsing of the protein name
JuliaS92 Nov 25, 2024
e9f228a
ClassNameCapital
JuliaS92 Nov 25, 2024
adb223e
Add quick fixes and todos addressing comments on https://github.com/M…
JuliaS92 Nov 25, 2024
5dc8f2c
Implement quick wins or add TODOs according to PR comments https://gi…
JuliaS92 Nov 25, 2024
d34cadc
TODOs from PR conversation
JuliaS92 Nov 25, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions alphastats/gui/pages/05_Analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
from alphastats.gui.utils.analysis_helper import (
display_analysis_result_with_buttons,
gather_parameters_and_do_analysis,
gather_uniprot_data,
get_regulated_features,
)
from alphastats.gui.utils.ui_helper import (
StateKeys,
Expand Down Expand Up @@ -92,6 +94,10 @@ def show_start_llm_button(analysis_method: str) -> None:
if StateKeys.LLM_INTEGRATION in st.session_state:
del st.session_state[StateKeys.LLM_INTEGRATION]
st.session_state[StateKeys.LLM_INPUT] = (analysis_object, parameters)
with st.spinner("Retrieving uniprot data on regulated features ..."):
regulated_features = get_regulated_features(analysis_object)
# TODO: Add confirmation prompt if an excessive number of proteins is to be looked up.
gather_uniprot_data(regulated_features)

st.toast("LLM analysis created!", icon="✅")
st.page_link("pages/06_LLM.py", label="=> Go to LLM page..")
Expand Down
59 changes: 58 additions & 1 deletion alphastats/gui/pages/06_LLM.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,21 @@
import streamlit as st
from openai import AuthenticationError

from alphastats.dataset.keys import Cols
from alphastats.dataset.plotting import plotly_object
from alphastats.gui.utils.analysis_helper import (
display_figure,
)
from alphastats.gui.utils.llm_helper import (
get_display_available_uniprot_info,
get_display_proteins_html,
llm_connection_test,
set_api_key,
)
from alphastats.gui.utils.ui_helper import StateKeys, init_session_state, sidebar_info
from alphastats.llm.llm_integration import LLMIntegration, Models
from alphastats.llm.prompts import get_initial_prompt, get_system_message
from alphastats.llm.uniprot_utils import ExtractedFields, format_uniprot_annotation
from alphastats.plots.plot_utils import PlotlyObject

OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://localhost:11434")
Expand Down Expand Up @@ -101,7 +104,7 @@ def llm_config():
with c1:
regulated_genes_df = volcano_plot.res[volcano_plot.res["label"] != ""]
regulated_genes_dict = dict(
zip(regulated_genes_df["label"], regulated_genes_df["color"].tolist())
zip(regulated_genes_df[Cols.INDEX], regulated_genes_df["color"].tolist())
)

if not regulated_genes_dict:
Expand Down Expand Up @@ -130,6 +133,60 @@ def llm_config():
unsafe_allow_html=True,
)

st.markdown("##### Select which information from Uniprot to supply to the LLM")
JuliaS92 marked this conversation as resolved.
Show resolved Hide resolved
c1, c2, c3 = st.columns((1, 1, 5))
default_fields = [
ExtractedFields.GENE,
ExtractedFields.NAME,
ExtractedFields.FUNCTIONCOMM,
]
# TODO: Turn the list of selected fields into a single list in the session state and make sure the setting is persistent across page navigation.
JuliaS92 marked this conversation as resolved.
Show resolved Hide resolved
with c1:
if st.button("Select all"):
for field in ExtractedFields.get_values():
st.session_state["EX" + field] = True
JuliaS92 marked this conversation as resolved.
Show resolved Hide resolved
with c2:
if st.button("Select none"):
for field in ExtractedFields.get_values():
st.session_state["EX" + field] = False
with c3:
if st.button("Recommended selection"):
for field in ExtractedFields.get_values():
st.session_state["EX" + field] = field in default_fields
c1, c2 = st.columns((1, 3))
with c1, st.expander("Show options"):
selected_fields = [
st.checkbox(
field,
value=field in default_fields
if "EX" + field not in st.session_state
else None,
key="EX" + field,
)
for field in ExtractedFields.get_values()
]
with c2, st.expander("Show preview", expanded=True):
preview_feature = st.selectbox(
"Feature id", options=list(regulated_genes_dict.keys())
)
st.markdown(
format_uniprot_annotation(
st.session_state[StateKeys.ANNOTATION_STORE][preview_feature],
fields=[
field
for field, selected in zip(
ExtractedFields.get_values(), selected_fields
)
if selected
],
)
)

with st.expander("View all available uniprot data"):
st.json(
get_display_available_uniprot_info(list(regulated_genes_dict.keys())),
expanded=False,
)

st.markdown("##### Prompts generated based on analysis input")

Expand Down
44 changes: 44 additions & 0 deletions alphastats/gui/utils/analysis_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import pandas as pd
import streamlit as st

from alphastats.dataset.keys import Cols
from alphastats.gui.utils.analysis import (
ANALYSIS_OPTIONS,
PlottingOptions,
Expand All @@ -13,6 +14,7 @@
StateKeys,
show_button_download_df,
)
from alphastats.llm.uniprot_utils import get_annotations_for_feature
from alphastats.plots.plot_utils import PlotlyObject


Expand Down Expand Up @@ -197,3 +199,45 @@ def gather_parameters_and_do_analysis(

else:
raise ValueError(f"Analysis method {analysis_method} not found.")


def gather_uniprot_data(features: list) -> None:
    """
    Fill the annotation store in the session state with UniProt data.

    Each feature that is not yet a key of the annotation store is looked up
    via `get_annotations_for_feature` and the result is stored under that
    feature. Features already present in the store are left untouched.

    Args:
        features (list): Features for which UniProt data should be available.
    Returns:
        None
    """
    annotation_store = st.session_state[StateKeys.ANNOTATION_STORE]
    missing_features = (
        feature for feature in features if feature not in annotation_store
    )
    # Lazily evaluated, so a feature occurring twice is only looked up once.
    for feature in missing_features:
        # TODO: Add some kind of rate limitation to avoid being locked out by uniprot
        annotation_store[feature] = get_annotations_for_feature(feature)


def get_regulated_features(analysis_object: PlotlyObject) -> list:
    """
    Collect the features of an analysis result that carry a non-empty label.

    The function pairs the `Cols.INDEX` entries of `analysis_object.res` with
    the entries of its "label" column and keeps every feature whose label is
    not the empty string. It is specifically designed to work with volcano plots.

    Args:
        analysis_object (PlotlyObject): An object containing analysis results,
            including feature indices and labels.
    Returns:
        list: The features whose label is non-empty, in result order.
    """
    # TODO: add a method to the AbstractAnalysis class to retrieve regulated features upon analysis to store in the session state. This function here only works for volcano plots.
    results = analysis_object.res
    regulated_features = []
    for feature, label in zip(results[Cols.INDEX], results["label"]):
        if label != "":
            regulated_features.append(feature)
    return regulated_features
32 changes: 30 additions & 2 deletions alphastats/gui/utils/llm_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

from alphastats.gui.utils.ui_helper import StateKeys
from alphastats.llm.llm_integration import LLMIntegration
from alphastats.llm.uniprot_utils import format_uniprot_annotation


def get_display_proteins_html(protein_ids: List[str], is_upregulated: True) -> str:
Expand All @@ -16,11 +17,11 @@ def get_display_proteins_html(protein_ids: List[str], is_upregulated: True) -> s
is_upregulated (bool): whether the proteins are up- or down-regulated.
"""

uniprot_url = "https://www.uniprot.org/uniprotkb?query="
uniprot_url = "https://www.uniprot.org/uniprotkb/"

color = "green" if is_upregulated else "red"
protein_ids_html = "".join(
f'<a href = {uniprot_url + protein}><li style="color: {color};">{protein}</li></a>'
f'<a href = {uniprot_url + st.session_state[StateKeys.ANNOTATION_STORE][protein].get("primaryAccession",protein)}><li style="color: {color};">{st.session_state[StateKeys.DATASET]._feature_to_repr_map[protein]}</li></a>'
JuliaS92 marked this conversation as resolved.
Show resolved Hide resolved
for protein in protein_ids
)

Expand Down Expand Up @@ -80,3 +81,30 @@ def llm_connection_test(

except Exception as e:
return str(e)


def get_display_available_uniprot_info(regulated_features: list) -> dict:
    """
    Retrieves and formats UniProt information for a list of regulated features.

    Note: The information is retrieved from the `annotation_store` in the `session_state`, which is filled when the LLM analysis is set up from the analysis page.

    Args:
        regulated_features (list): A list of features for which UniProt information is to be retrieved.
    Returns:
        dict: A dictionary where each key is a feature representation and the value is another dictionary
            containing the 'protein ids' and 'generated text' with formatted UniProt information or an error message.
    """
    text_repr = {}
    for feature in regulated_features:
        try:
            text = format_uniprot_annotation(
                st.session_state[StateKeys.ANNOTATION_STORE][feature]
            )
        except Exception as e:
            # Deliberately best effort: one feature that cannot be formatted
            # must not prevent the others from being displayed. Store the
            # message as a string (not the exception object) so the value is
            # the error message promised in the docstring.
            text = str(e)
        text_repr[st.session_state[StateKeys.DATASET]._feature_to_repr_map[feature]] = {
            "protein ids": feature,
            "generated text": text,
        }
    return text_repr
4 changes: 4 additions & 0 deletions alphastats/gui/utils/ui_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,9 @@ def init_session_state() -> None:
if StateKeys.LLM_INTEGRATION not in st.session_state:
st.session_state[StateKeys.LLM_INTEGRATION] = {}

if StateKeys.ANNOTATION_STORE not in st.session_state:
st.session_state[StateKeys.ANNOTATION_STORE] = {}


class StateKeys(metaclass=ConstantsClass):
USER_SESSION_ID = "user_session_id"
Expand All @@ -132,5 +135,6 @@ class StateKeys(metaclass=ConstantsClass):
MODEL_NAME = "model_name"
LLM_INPUT = "llm_input"
LLM_INTEGRATION = "llm_integration"
ANNOTATION_STORE = "annotation_store"

ORGANISM = "organism" # TODO this is essentially a constant
Loading
Loading