diff --git a/helm/robusta/Chart.yaml b/helm/robusta/Chart.yaml
index ff42d8ea7..61669cf74 100644
--- a/helm/robusta/Chart.yaml
+++ b/helm/robusta/Chart.yaml
@@ -13,4 +13,8 @@ dependencies:
 - name: kube-prometheus-stack
   version: 55.7.0
   condition: enablePrometheusStack
-  repository: "https://prometheus-community.github.io/helm-charts"
\ No newline at end of file
+  repository: "https://prometheus-community.github.io/helm-charts"
+- name: holmes
+  version: 0.0.2
+  condition: enableHolmesGPT
+  repository: "https://robusta-charts.storage.googleapis.com"
diff --git a/helm/robusta/templates/runner.yaml b/helm/robusta/templates/runner.yaml
index b54a181e2..769e2b4da 100644
--- a/helm/robusta/templates/runner.yaml
+++ b/helm/robusta/templates/runner.yaml
@@ -81,6 +81,10 @@ spec:
         - name: DISABLE_HELM_MONITORING
           value: "True"
         {{- end }}
+        {{- if .Values.enableHolmesGPT }}
+        - name: HOLMES_ENABLED
+          value: "True"
+        {{- end }}
         {{- if .Values.scaleAlertsProcessing }}
         - name: ALERTS_WORKERS_POOL
           value: "True"
diff --git a/helm/robusta/values.yaml b/helm/robusta/values.yaml
index 598e4cf5c..774aaf3a9 100644
--- a/helm/robusta/values.yaml
+++ b/helm/robusta/values.yaml
@@ -21,6 +21,8 @@ global:
 
 automountServiceAccountToken: true
 
+enableHolmesGPT: false
+
 # see https://docs.robusta.dev/master/user-guide/configuration.html#global-config and https://docs.robusta.dev/master/configuration/additional-settings.html#global-config
 globalConfig:
   check_prometheus_flags: true
@@ -73,7 +75,6 @@ lightActions:
 - node_dmesg_enricher
 - status_enricher
 - popeye_scan
-- krr_scan
 - handle_alertmanager_event
 - drain
 - cordon
@@ -497,7 +498,8 @@ platformPlaybooks:
       - "robusta_ui_sink"
 
 # Any playbook name listed here will be disabled
-disabledPlaybooks: []
+disabledPlaybooks:
+  - WeeklyKRRScan
 
 image:
   registry: us-central1-docker.pkg.dev/genuine-flight-317411/devel
diff --git a/src/robusta/api/__init__.py b/src/robusta/api/__init__.py
index 3d311f02a..418d8765a 100644
--- a/src/robusta/api/__init__.py
+++ b/src/robusta/api/__init__.py
@@ -12,6 +12,7 @@
 from robusta.core.discovery.resource_names import ResourceNameLister
 from robusta.core.model.base_params import (
     ActionParams,
+    AIInvestigateParams,
     AlertResourceGraphEnricherParams,
     BashParams,
     ChartValuesFormat,
@@ -184,6 +185,7 @@
 )
 from robusta.core.reporting.custom_rendering import RendererType, charts_style, render_value
 from robusta.core.reporting.finding_subjects import KubeObjFindingSubject, PodFindingSubject
+from robusta.core.reporting.holmes import HolmesRequest, HolmesResult, HolmesResultsBlock
 from robusta.core.schedule.model import (
     DynamicDelayRepeat,
     FixedDelayRepeat,
@@ -296,6 +298,7 @@
 )
 from robusta.integrations.prometheus.utils import (
     AlertManagerDiscovery,
+    HolmesDiscovery,
     PrometheusDiscovery,
     ServiceDiscovery,
     get_prometheus_connect,
diff --git a/src/robusta/core/model/base_params.py b/src/robusta/core/model/base_params.py
index 33101e26e..b344f800c 100644
--- a/src/robusta/core/model/base_params.py
+++ b/src/robusta/core/model/base_params.py
@@ -1,6 +1,6 @@
 import logging
 from enum import Enum, auto
-from typing import Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional, Union
 
 from pydantic import BaseModel, SecretStr, validator
 
@@ -21,6 +21,7 @@ class ChartValuesFormat(Enum):
     def __str__(self):
         return self.name
 
+
 class ResourceChartItemType(Enum):
     """
     Item selection for Alert resource enricher
@@ -70,6 +71,45 @@ def post_initialization(self):
         pass
 
 
+class ResourceInfo(BaseModel):
+    name: str
+    namespace: Optional[str]
+    kind: str
+    node: Optional[str]
+    container: Optional[str]
+
+
+class HolmesParams(ActionParams):
+
+    holmes_url: Optional[str]
+
+    @validator("holmes_url", allow_reuse=True)
+    def validate_protocol(cls, v):
+        if v and not v.startswith("http"):  # if the user configured a url without http(s)
+            v = f"http://{v}"
+            logging.info(f"Adding protocol to holmes_url: {v}")
+        return v
+
+
+class AIInvestigateParams(HolmesParams):
+    """
+    :var resource: The resource related to this investigation. A resource has a `name` and `kind`, and may have a `namespace` and `node`
+    :var investigation_type: The type of investigation: Issue/Service/Cluster/Custom
+    :var runbooks: List of human-readable recommended runbooks that Holmes can use for the investigation.
+    :var ask: Override question to ask Holmes
+    :var context: Additional information that can assist with the investigation
+
+    :example ask: What are all the issues in my cluster right now?
+    :example runbooks: ["Try to get the pod logs and find errors", "get the pod yaml and check if there are finalizers"]
+    """
+
+    resource: Optional[ResourceInfo]
+    investigation_type: str
+    runbooks: Optional[List[str]]
+    ask: Optional[str]
+    context: Optional[Dict[str, Any]]
+
+
 class PodRunningParams(ActionParams):
     """
     :var custom_annotations: custom annotations to be used for the running pod/job
@@ -340,7 +380,17 @@ class OomKillParams(OOMGraphEnricherParams):
     container_memory_graph: Optional[bool] = False
     node_memory_graph: Optional[bool] = False
 
-    def __init__(self, attach_logs: Optional[bool] = False, container_memory_graph: Optional[bool] = False,
-                 node_memory_graph: Optional[bool] = False, **kwargs):
-        super().__init__(attach_logs=attach_logs, container_memory_graph=container_memory_graph,
-                         node_memory_graph=node_memory_graph, resource_type=ResourceChartResourceType.Memory.name, **kwargs)
+    def __init__(
+        self,
+        attach_logs: Optional[bool] = False,
+        container_memory_graph: Optional[bool] = False,
+        node_memory_graph: Optional[bool] = False,
+        **kwargs,
+    ):
+        super().__init__(
+            attach_logs=attach_logs,
+            container_memory_graph=container_memory_graph,
+            node_memory_graph=node_memory_graph,
+            resource_type=ResourceChartResourceType.Memory.name,
+            **kwargs,
+        )
diff --git a/src/robusta/core/model/env_vars.py b/src/robusta/core/model/env_vars.py
index ba5d0229c..03df4c224 100644
--- a/src/robusta/core/model/env_vars.py
+++ b/src/robusta/core/model/env_vars.py
@@ -125,3 +125,5 @@ def load_bool(env_var, default: bool):
 
 POD_WAIT_RETRIES = int(os.environ.get("POD_WAIT_RETRIES", 10))
 POD_WAIT_RETRIES_SECONDS = int(os.environ.get("POD_WAIT_RETRIES_SECONDS", 5))
+
+HOLMES_ENABLED = load_bool("HOLMES_ENABLED", False)
diff --git a/src/robusta/core/playbooks/internal/__init__.py b/src/robusta/core/playbooks/internal/__init__.py
new file mode 100644
index 000000000..e69de29bb
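For context, a rough usage sketch of the new params model (not part of this change); the in-cluster address and resource names below are placeholders, and it only needs robusta itself to be importable:

from robusta.core.model.base_params import AIInvestigateParams, ResourceInfo

# Hypothetical values: the service address and resource are illustrative, not taken from this PR.
params = AIInvestigateParams(
    holmes_url="holmes.robusta.svc.cluster.local:5050",
    investigation_type="issue",
    resource=ResourceInfo(name="payments-api", namespace="prod", kind="Deployment"),
    runbooks=["Check the pod logs for recent errors"],
    context={"issue_type": "KubeDeploymentReplicasMismatch", "source": "prometheus"},
)

# The holmes_url validator prepends a protocol when one is missing.
assert params.holmes_url == "http://holmes.robusta.svc.cluster.local:5050"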
diff --git a/src/robusta/core/playbooks/internal/ai_integration.py b/src/robusta/core/playbooks/internal/ai_integration.py
new file mode 100644
index 000000000..f4a4b4f84
--- /dev/null
+++ b/src/robusta/core/playbooks/internal/ai_integration.py
@@ -0,0 +1,64 @@
+import json
+import logging
+
+import requests
+
+from robusta.core.model.base_params import AIInvestigateParams
+from robusta.core.model.events import ExecutionBaseEvent
+from robusta.core.playbooks.actions_registry import action
+from robusta.core.reporting import Finding, FindingSubject
+from robusta.core.reporting.base import EnrichmentType
+from robusta.core.reporting.consts import FindingSubjectType, FindingType
+from robusta.core.reporting.holmes import HolmesRequest, HolmesResult, HolmesResultsBlock
+from robusta.integrations.prometheus.utils import HolmesDiscovery
+
+
+@action
+def ask_holmes(event: ExecutionBaseEvent, params: AIInvestigateParams):
+    holmes_url = HolmesDiscovery.find_holmes_url(params.holmes_url)
+    if not holmes_url:
+        logging.error("Holmes url not found")
+        return
+
+    try:
+        issue_name = params.context.get("issue_type", "unknown health issue")
+        holmes_req = HolmesRequest(
+            source=params.context.get("source", "unknown source"),
+            title=f"{issue_name}",
+            description="",
+            subject=params.resource.dict() if params.resource else None,
+            context=params.context if params.context else None,
+            include_tool_calls=True,
+            include_tool_call_results=True,
+        )
+        result = requests.post(f"{holmes_url}/api/investigate", data=holmes_req.json())
+        result.raise_for_status()
+
+        holmes_result = HolmesResult(**json.loads(result.text))
+        title_suffix = (
+            f" on {params.resource.name}"
+            if params.resource.name and params.resource.name.lower() != "unresolved"
+            else ""
+        )
+
+        finding = Finding(
+            title=f"AI Analysis of {issue_name}{title_suffix}",
+            aggregation_key="HolmesInvestigationResult",
+            subject=FindingSubject(
+                name=params.resource.name,
+                namespace=params.resource.namespace,
+                subject_type=FindingSubjectType.from_kind(params.resource.kind),
+                node=params.resource.node,
+                container=params.resource.container,
+            ),
+            finding_type=FindingType.AI_ANALYSIS,
+            failure=False,
+        )
+        finding.add_enrichment(
+            [HolmesResultsBlock(holmes_result=holmes_result)], enrichment_type=EnrichmentType.ai_analysis
+        )
+
+        event.add_finding(finding)
+
+    except Exception:
+        logging.exception("Failed to get holmes analysis")
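For reference, a standalone sketch of the same HTTP round trip ask_holmes performs, handy for poking a Holmes deployment manually; the service address and issue details are made up, only the /api/investigate path and the request/response models come from the code above:

import json

import requests

from robusta.core.reporting.holmes import HolmesRequest, HolmesResult

holmes_url = "http://holmes.robusta.svc.cluster.local:5050"  # assumed in-cluster address
req = HolmesRequest(
    source="prometheus",
    title="KubePodCrashLooping",
    description="",
    subject={"name": "payments-api-7d9c", "namespace": "prod", "kind": "Pod"},
    context={"issue_type": "KubePodCrashLooping", "source": "prometheus"},
    include_tool_calls=True,
    include_tool_call_results=True,
)

resp = requests.post(f"{holmes_url}/api/investigate", data=req.json())
resp.raise_for_status()

result = HolmesResult(**json.loads(resp.text))
print(result.analysis)
for tool in result.tool_calls or []:
    print(f"{tool.tool_name}: {tool.description}")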
diff --git a/src/robusta/core/reporting/base.py b/src/robusta/core/reporting/base.py
index 8105e332e..a406921da 100644
--- a/src/robusta/core/reporting/base.py
+++ b/src/robusta/core/reporting/base.py
@@ -95,6 +95,7 @@ class VideoLink(BaseModel):
 
 class EnrichmentType(Enum):
     graph = "graph"
+    ai_analysis = "ai_analysis"
     node_info = "node_info"
     container_info = "container_info"
     k8s_events = "k8s_events"
diff --git a/src/robusta/core/reporting/callbacks.py b/src/robusta/core/reporting/callbacks.py
index 09fc3c399..89f60500a 100644
--- a/src/robusta/core/reporting/callbacks.py
+++ b/src/robusta/core/reporting/callbacks.py
@@ -27,7 +27,7 @@ def create_for_func(
         if not signing_key:
             raise Exception("Cannot create callback request with no signing key. Configure signing_key in globalConfig")
 
-        action_params = {} if choice.action_params is None else choice.action_params.dict()
+        action_params = {} if choice.action_params is None else choice.action_params.dict(exclude_defaults=True)
         if choice.kubernetes_object:
             action_params["kind"] = choice.kubernetes_object.kind
             action_params["name"] = choice.kubernetes_object.metadata.name
diff --git a/src/robusta/core/reporting/consts.py b/src/robusta/core/reporting/consts.py
index 11e4a6b85..c895657b1 100644
--- a/src/robusta/core/reporting/consts.py
+++ b/src/robusta/core/reporting/consts.py
@@ -8,6 +8,7 @@ class FindingType(Enum):
     CONF_CHANGE = "configuration_change"
     HEALTH_CHECK = "health_check"
     REPORT = "report"
+    AI_ANALYSIS = "ai_analysis"
 
     @classmethod
     def from_type(cls, finding_type: str) -> "FindingType":
diff --git a/src/robusta/core/reporting/holmes.py b/src/robusta/core/reporting/holmes.py
new file mode 100644
index 000000000..364ee3bc4
--- /dev/null
+++ b/src/robusta/core/reporting/holmes.py
@@ -0,0 +1,40 @@
+from typing import Any, Dict, List, Optional
+
+from pydantic import BaseModel
+
+from robusta.core.reporting import BaseBlock
+
+
+class HolmesRequest(BaseModel):
+    source: str  # "prometheus" etc
+    title: str
+    description: str
+    subject: dict
+    context: Dict[str, Any]
+    include_tool_calls: bool = False
+    include_tool_call_results: bool = False
+
+
+class ToolCallResult(BaseModel):
+    tool_name: str
+    description: str
+    result: str
+
+
+class HolmesResult(BaseModel):
+    tool_calls: Optional[List[ToolCallResult]] = None
+    analysis: Optional[str] = None
+
+
+class HolmesResultsBlock(BaseBlock):
+    holmes_result: Optional[HolmesResult]
+
+    def __init__(
+        self,
+        holmes_result: Optional[HolmesResult] = None,
+        **kwargs,
+    ):
+        super().__init__(
+            holmes_result=holmes_result,
+            **kwargs,
+        )
diff --git a/src/robusta/core/sinks/robusta/dal/model_conversion.py b/src/robusta/core/sinks/robusta/dal/model_conversion.py
index 0b11ad97f..3377a4813 100644
--- a/src/robusta/core/sinks/robusta/dal/model_conversion.py
+++ b/src/robusta/core/sinks/robusta/dal/model_conversion.py
@@ -3,7 +3,7 @@
 import logging
 import uuid
 from datetime import datetime
-from typing import Any, Dict
+from typing import Any, Dict, List
 
 from robusta.core.model.env_vars import ENABLE_GRAPH_BLOCK
 from robusta.core.reporting import (
@@ -21,8 +21,9 @@
     PrometheusBlock,
     TableBlock,
 )
-from robusta.core.reporting.blocks import GraphBlock, EmptyFileBlock
+from robusta.core.reporting.blocks import EmptyFileBlock, GraphBlock
 from robusta.core.reporting.callbacks import ExternalActionRequestBuilder
+from robusta.core.reporting.holmes import HolmesResultsBlock
 from robusta.core.sinks.transformer import Transformer
 from robusta.utils.parsing import datetime_to_db_str
 
@@ -68,7 +69,7 @@ def to_finding_json(account_id: str, cluster_id: str, finding: Finding):
     @staticmethod
     def get_file_type(filename: str):
         last_dot_idx = filename.rindex(".")
-        return filename[last_dot_idx + 1:]
+        return filename[last_dot_idx + 1 :]
 
     @staticmethod
     def get_file_object(block: FileBlock):
@@ -85,6 +86,22 @@ def get_empty_file_object(block: EmptyFileBlock):
             "data": "",
         }
 
+    @staticmethod
+    def add_ai_analysis_data(structured_data: List[Dict], block: HolmesResultsBlock):
+        structured_data.append(
+            {
+                "type": "markdown",
+                "metadata": {"type": "ai_investigation_result"},
+                "data": Transformer.to_github_markdown(block.holmes_result.analysis),
+            }
+        )
+        for tool_call in block.holmes_result.tool_calls:
+            file_block = FileBlock(f"{tool_call.description}.txt", tool_call.result.encode())
+            file_block.zip()
+            data_obj = ModelConversion.get_file_object(file_block)
+            data_obj["metadata"] = {"description": tool_call.description, "tool_name": tool_call.tool_name}
+            structured_data.append(data_obj)
+
     @staticmethod
     def to_evidence_json(
         account_id: str,
@@ -110,7 +127,12 @@
             elif isinstance(block, GraphBlock):
                 if ENABLE_GRAPH_BLOCK:
                     structured_data.append(
-                        {"type": "prometheus", "data": block.graph_data.dict(), "metadata": block.graph_data.metadata, "version": 1.0}
+                        {
+                            "type": "prometheus",
+                            "data": block.graph_data.dict(),
+                            "metadata": block.graph_data.metadata,
+                            "version": 1.0,
+                        }
                     )
                 else:
                     if block.is_text_file():
@@ -122,6 +144,8 @@
                 if block.is_text_file():
                     block.zip()
                 structured_data.append(ModelConversion.get_file_object(block))
+            elif isinstance(block, HolmesResultsBlock):
+                ModelConversion.add_ai_analysis_data(structured_data, block)
             elif isinstance(block, HeaderBlock):
                 structured_data.append({"type": "header", "data": block.text})
             elif isinstance(block, ListBlock):
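A small sketch of the evidence entries add_ai_analysis_data builds for the platform sink (not part of this change); the analysis text and tool output are invented:

from robusta.core.reporting.holmes import HolmesResult, HolmesResultsBlock, ToolCallResult
from robusta.core.sinks.robusta.dal.model_conversion import ModelConversion

block = HolmesResultsBlock(
    holmes_result=HolmesResult(
        analysis="The pod was OOMKilled because its memory limit is lower than its working set.",
        tool_calls=[
            ToolCallResult(tool_name="kubectl_describe", description="describe pod payments-api-7d9c", result="..."),
        ],
    )
)

structured_data = []
ModelConversion.add_ai_analysis_data(structured_data, block)
# structured_data now holds one markdown entry with the analysis, plus one zipped file
# entry per tool call, tagged with the tool name and description.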
diff --git a/src/robusta/integrations/kubernetes/custom_models.py b/src/robusta/integrations/kubernetes/custom_models.py
index e65c46e5e..64becb0f2 100644
--- a/src/robusta/integrations/kubernetes/custom_models.py
+++ b/src/robusta/integrations/kubernetes/custom_models.py
@@ -537,6 +537,8 @@ def run_simple_job_spec(
             pod = job.get_single_pod()
             return pod.get_logs() or ""
         finally:
+            if job and not pod:
+                pod = job.get_single_pod()
             if pod and finalizers:
                 try:
                     # must use patch, since the pod revision changed at this point
                     body = {"metadata": {"$deleteFromPrimitiveList/finalizers": finalizers}}
diff --git a/src/robusta/integrations/prometheus/utils.py b/src/robusta/integrations/prometheus/utils.py
index a0d985462..4b49dfc2c 100644
--- a/src/robusta/integrations/prometheus/utils.py
+++ b/src/robusta/integrations/prometheus/utils.py
@@ -176,3 +176,15 @@ def find_alert_manager_url(cls) -> Optional[str]:
             ],
             error_msg="Alert manager url could not be found. Add 'alertmanager_url' under global_config",
         )
+
+
+class HolmesDiscovery(ServiceDiscovery):
+    @classmethod
+    def find_holmes_url(cls, holmes_url: str) -> Optional[str]:
+        if holmes_url:
+            return holmes_url
+
+        return super().find_url(
+            selectors=["app=holmes"],
+            error_msg="Holmes url could not be found.",
+        )
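A quick sketch of the intended fallback order (not part of this change); the explicit URL is a placeholder, and the second call only resolves inside a cluster that has a service labelled app=holmes:

from robusta.integrations.prometheus.utils import HolmesDiscovery

# 1. An explicitly configured holmes_url is returned untouched.
url = HolmesDiscovery.find_holmes_url("http://holmes.robusta.svc.cluster.local:5050")

# 2. Otherwise the cluster is searched for a service matching the app=holmes selector.
url = HolmesDiscovery.find_holmes_url(None)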
diff --git a/src/robusta/integrations/slack/sender.py b/src/robusta/integrations/slack/sender.py
index 13c67a61c..9f4390e5d 100644
--- a/src/robusta/integrations/slack/sender.py
+++ b/src/robusta/integrations/slack/sender.py
@@ -11,8 +11,10 @@
 from slack_sdk import WebClient
 from slack_sdk.errors import SlackApiError
 
-from robusta.core.model.env_vars import ADDITIONAL_CERTIFICATE, SLACK_TABLE_COLUMNS_LIMIT
-from robusta.core.reporting.base import Emojis, Finding, FindingStatus
+from robusta.core.model.base_params import AIInvestigateParams, ResourceInfo
+from robusta.core.model.env_vars import ADDITIONAL_CERTIFICATE, HOLMES_ENABLED, SLACK_TABLE_COLUMNS_LIMIT
+from robusta.core.playbooks.internal.ai_integration import ask_holmes
+from robusta.core.reporting.base import Emojis, EnrichmentType, Finding, FindingStatus
 from robusta.core.reporting.blocks import (
     BaseBlock,
     CallbackBlock,
@@ -29,7 +31,8 @@
     TableBlock,
 )
 from robusta.core.reporting.callbacks import ExternalActionRequestBuilder
-from robusta.core.reporting.consts import EnrichmentAnnotation, FindingSource, SlackAnnotations
+from robusta.core.reporting.consts import EnrichmentAnnotation, FindingSource, FindingType, SlackAnnotations
+from robusta.core.reporting.holmes import HolmesResultsBlock, ToolCallResult
 from robusta.core.reporting.utils import add_pngs_for_all_svgs
 from robusta.core.sinks.common import ChannelTransformer
 from robusta.core.sinks.sink_base import KeyT
@@ -299,6 +302,32 @@ def __send_blocks_to_slack(
                 f"error sending message to slack\ne={e}\ntext={message}\nchannel={channel}\nblocks={*output_blocks,}\nattachment_blocks={*attachment_blocks,}"
             )
 
+    def __create_holmes_callback(self, finding: Finding) -> CallbackBlock:
+        resource = ResourceInfo(
+            name=finding.subject.name if finding.subject.name else "",
+            namespace=finding.subject.namespace,
+            kind=finding.subject.subject_type.name if finding.subject.subject_type.name else "",
+            node=finding.subject.node,
+            container=finding.subject.container,
+        )
+
+        context: Dict[str, Any] = {
+            "robusta_issue_id": str(finding.id),
+            "issue_type": finding.aggregation_key,
+            "source": finding.source.name,
+        }
+
+        return CallbackBlock(
+            {
+                "Ask Holmes": CallbackChoice(
+                    action=ask_holmes,
+                    action_params=AIInvestigateParams(
+                        resource=resource, investigation_type="issue", ask="Why is this alert firing?", context=context
+                    ),
+                )
+            }
+        )
+
     def __create_finding_header(self, finding: Finding, status: FindingStatus, platform_enabled: bool) -> MarkdownBlock:
         title = finding.title.removeprefix("[RESOLVED] ")
         sev = finding.severity
@@ -341,6 +370,93 @@
         return LinksBlock(links=links)
 
+    def __send_tool_usage(self, parent_thread: str, slack_channel: str, tool_calls: List[ToolCallResult]) -> None:
+        if not tool_calls:
+            return
+
+        text = "*AI used info from alert and the following tools:*"
+        for tool in tool_calls:
+            file_response = self.slack_client.files_upload_v2(content=tool.result, title=f"{tool.description}")
+            permalink = file_response["file"]["permalink"]
+            text += f"\n• `<{permalink}|{tool.description}>`"
+
+        self.slack_client.chat_postMessage(
+            channel=slack_channel,
+            thread_ts=parent_thread,
+            text=text,
+            blocks=[
+                {
+                    "type": "section",
+                    "text": {"type": "mrkdwn", "text": text},
+                }
+            ],
+        )
+
+    def send_holmes_analysis(
+        self,
+        finding: Finding,
+        slack_channel: str,
+        platform_enabled: bool,
+        thread_ts: str = None,
+    ):
+        title = finding.title
+        if platform_enabled:
+            title = f"<{finding.get_investigate_uri(self.account_id, self.cluster_name)}|*{title}*>"
+
+        ai_enrichments = [
+            enrichment for enrichment in finding.enrichments if enrichment.enrichment_type == EnrichmentType.ai_analysis
+        ]
+
+        if not ai_enrichments:
+            logging.warning(f"No matching ai enrichments found for id: {finding.id} - {title}")
+            return
+
+        ai_analysis_blocks = [block for block in ai_enrichments[0].blocks if isinstance(block, HolmesResultsBlock)]
+        if not ai_analysis_blocks:
+            logging.warning(f"No matching ai blocks found for id: {finding.id} - {title}")
+            return
+
+        ai_result = ai_analysis_blocks[0].holmes_result
+
+        blocks = [
+            {
+                "type": "section",
+                "text": {
+                    "type": "mrkdwn",
+                    "text": f":robot_face: {ai_result.analysis}",
+                },
+            }
+        ]
+
+        try:
+            if thread_ts:
+                kwargs = {"thread_ts": thread_ts}
+            else:
+                kwargs = {}
+            resp = self.slack_client.chat_postMessage(
+                channel=slack_channel,
+                text=title,
+                attachments=[
+                    {
+                        "color": "#c852ff",  # AI purple
+                        "blocks": blocks,
+                    }
+                ],
+                display_as_bot=True,
+                unfurl_links=False,
+                unfurl_media=False,
+                **kwargs,
+            )
+            # We will need channel ids for future message updates
+            self.channel_name_to_id[slack_channel] = resp["channel"]
+            if not thread_ts:  # if we're not in a threaded message, get the new message thread id
+                thread_ts = resp["ts"]
+
+            self.__send_tool_usage(thread_ts, slack_channel, ai_result.tool_calls)
+
+        except Exception:
+            logging.exception("error sending message to slack")  # TODO fix error
+
     def send_finding_to_slack(
         self,
         finding: Finding,
@@ -351,6 +467,19 @@
         blocks: List[BaseBlock] = []
         attachment_blocks: List[BaseBlock] = []
 
+        slack_channel = ChannelTransformer.template(
+            sink_params.channel_override,
+            sink_params.slack_channel,
+            self.cluster_name,
+            finding.subject.labels,
+            finding.subject.annotations,
+        )
+
+        if finding.finding_type == FindingType.AI_ANALYSIS:
+            # holmes analysis message needs special handling
+            self.send_holmes_analysis(finding, slack_channel, platform_enabled, thread_ts)
+            return ""  # [arik] Looks like the return value here is not used, needs to be removed
+
         status: FindingStatus = (
             FindingStatus.RESOLVED if finding.title.startswith("[RESOLVED]") else FindingStatus.FIRING
         )
@@ -360,6 +489,9 @@
         if platform_enabled:
             blocks.append(self.__create_links(finding))
 
+        if HOLMES_ENABLED:
+            blocks.append(self.__create_holmes_callback(finding))
+
         blocks.append(MarkdownBlock(text=f"*Source:* `{self.cluster_name}`"))
         if finding.description:
             if finding.source == FindingSource.PROMETHEUS:
@@ -395,13 +527,7 @@
                 sink_params,
                 unfurl,
                 status,
-                ChannelTransformer.template(
-                    sink_params.channel_override,
-                    sink_params.slack_channel,
-                    self.cluster_name,
-                    finding.subject.labels,
-                    finding.subject.annotations,
-                ),
+                slack_channel,
                 thread_ts=thread_ts,
             )
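Taken together, a sketch of the payload behind the new "Ask Holmes" button the Slack sink adds when HOLMES_ENABLED is set; the names mirror __create_holmes_callback above, while the finding values are invented:

from robusta.core.model.base_params import AIInvestigateParams, ResourceInfo
from robusta.core.playbooks.internal.ai_integration import ask_holmes
from robusta.core.reporting.blocks import CallbackBlock, CallbackChoice

callback = CallbackBlock(
    {
        "Ask Holmes": CallbackChoice(
            action=ask_holmes,
            action_params=AIInvestigateParams(
                resource=ResourceInfo(name="payments-api-7d9c", namespace="prod", kind="Pod"),
                investigation_type="issue",
                ask="Why is this alert firing?",
                context={
                    "robusta_issue_id": "00000000-0000-0000-0000-000000000000",
                    "issue_type": "KubePodCrashLooping",
                    "source": "PROMETHEUS",
                },
            ),
        )
    }
)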