diff --git a/api/rule_tmpl.go b/api/rule_tmpl.go
index 6e076cb..2c863b1 100644
--- a/api/rule_tmpl.go
+++ b/api/rule_tmpl.go
@@ -10,8 +10,8 @@ import (
 type RuleTmplController struct{}
 
 /*
- 规则模版 API
- /api/w8t/ruleTmpl
+规则模版 API
+/api/w8t/ruleTmpl
 */
 func (rtc RuleTmplController) API(gin *gin.RouterGroup) {
 	ruleTmplA := gin.Group("ruleTmpl")
@@ -23,6 +23,7 @@ func (rtc RuleTmplController) API(gin *gin.RouterGroup) {
 	)
 	{
 		ruleTmplA.POST("ruleTmplCreate", rtc.Create)
+		ruleTmplA.POST("ruleTmplUpdate", rtc.Update)
 		ruleTmplA.POST("ruleTmplDelete", rtc.Delete)
 	}
 
@@ -46,6 +47,15 @@ func (rtc RuleTmplController) Create(ctx *gin.Context) {
 	})
 }
 
+func (rtc RuleTmplController) Update(ctx *gin.Context) {
+	r := new(models.RuleTemplate)
+	BindJson(ctx, r)
+
+	Service(ctx, func() (interface{}, interface{}) {
+		return services.RuleTmplService.Update(r)
+	})
+}
+
 func (rtc RuleTmplController) Delete(ctx *gin.Context) {
 	r := new(models.RuleTemplateQuery)
 	BindJson(ctx, r)
diff --git a/api/rule_tmpl_group.go b/api/rule_tmpl_group.go
index b8df93c..d37d04e 100644
--- a/api/rule_tmpl_group.go
+++ b/api/rule_tmpl_group.go
@@ -10,8 +10,8 @@ import (
 type RuleTmplGroupController struct{}
 
 /*
- 规则模版组 API
- /api/w8t/ruleTmplGroup
+规则模版组 API
+/api/w8t/ruleTmplGroup
 */
 func (rtg RuleTmplGroupController) API(gin *gin.RouterGroup) {
 	ruleTmplGroupA := gin.Group("ruleTmplGroup")
@@ -23,6 +23,7 @@ func (rtg RuleTmplGroupController) API(gin *gin.RouterGroup) {
 	)
 	{
 		ruleTmplGroupA.POST("ruleTmplGroupCreate", rtg.Create)
+		ruleTmplGroupA.POST("ruleTmplGroupUpdate", rtg.Update)
 		ruleTmplGroupA.POST("ruleTmplGroupDelete", rtg.Delete)
 	}
 
@@ -46,6 +47,15 @@ func (rtg RuleTmplGroupController) Create(ctx *gin.Context) {
 	})
 }
 
+func (rtg RuleTmplGroupController) Update(ctx *gin.Context) {
+	r := new(models.RuleTemplateGroup)
+	BindJson(ctx, r)
+
+	Service(ctx, func() (interface{}, interface{}) {
+		return services.RuleTmplGroupService.Update(r)
+	})
+}
+
 func (rtg RuleTmplGroupController) Delete(ctx *gin.Context) {
 	r := new(models.RuleTemplateGroupQuery)
 	BindJson(ctx, r)
diff --git a/deploy/sql/rule_template_groups.sql b/deploy/sql/rule_template_groups.sql
index a866e79..0365028 100644
--- a/deploy/sql/rule_template_groups.sql
+++ b/deploy/sql/rule_template_groups.sql
@@ -1,7 +1,17 @@
 use watchalert;
-INSERT ignore INTO `rule_template_groups` (`name`, `description`) VALUES ('Node', '服务器节点监控指标');
-INSERT ignore INTO `rule_template_groups` (`name`, `description`) VALUES ('ApiSix', 'APISix网关监控指标');
-INSERT ignore INTO `rule_template_groups` (`name`, `description`) VALUES ('Elasticsearch', 'Elasticsearch监控告警指标');
-INSERT ignore INTO `rule_template_groups` (`name`, `description`) VALUES ('Kafka', 'Kafka监控告警指标');
-INSERT ignore INTO `rule_template_groups` (`name`, `description`) VALUES ('Kubernetes', 'Kubernetes监控告警指标');
-INSERT ignore INTO `rule_template_groups` (`name`, `description`) VALUES ('Docker', 'Docker监控告警指标');
+INSERT ignore INTO watchalert.rule_template_groups (name,`number`,description,`type`) VALUES
+	('APISIX',0,'APISIX指标监控','Metrics'),
+	('APISIX日志监控',0,'APISIX日志监控','Logs'),
+	('Docker',0,'Docker容器监控','Metrics'),
+	('ElasticSearch',0,'ElasticSearch资源监控','Metrics'),
+	('ETCD',0,'ETCD','Metrics'),
+	('Jaeger',0,'Jaeger链路监控','Traces'),
+	('Kafka',0,'Kafka监控','Metrics'),
+	('Kubernetes',0,'Kubernetes事件监控','Events'),
+	('KubernetesMetric',0,'Kubernetes指标监控','Metrics'),
+	('MongoDB',0,'MongoDB监控','Metrics');
+INSERT ignore INTO watchalert.rule_template_groups (name,`number`,description,`type`) VALUES
+
('MySQL',0,'MySQL资源监控','Metrics'), + ('Node节点监控',0,'Node服务器监控','Metrics'), + ('Redis',0,'Redis资源监控','Metrics'), + ('RocketMQ',0,'RocketMQ监控','Metrics'); diff --git a/deploy/sql/rule_templates.sql b/deploy/sql/rule_templates.sql index 4f6c91c..8bc0865 100644 --- a/deploy/sql/rule_templates.sql +++ b/deploy/sql/rule_templates.sql @@ -1,39 +1,89 @@ use watchalert; -INSERT ignore INTO `rule_templates` (`rule_name`, `datasource_type`, `prometheus_config`, `severity`, `eval_interval`, `for_duration`, `annotations`, `rule_group_name`) VALUES ('Exporter异常', 'Prometheus', '{\"promQL\":\"up == 0\"}', 0, 30, 60, '节点: ${instance} , Exporter 异常, 请及时处理!', 'Node'); -INSERT ignore INTO `rule_templates` (`rule_name`, `datasource_type`, `prometheus_config`, `severity`, `eval_interval`, `for_duration`, `annotations`, `rule_group_name`) VALUES ('节点分区使用率大于80%', 'Prometheus', '{\"promQL\":\"(node_filesystem_size_bytes{fstype=~\\\"ext.?|xfs\\\",mountpoint=~\\\"/|/rootfs\\\"}-node_filesystem_free_bytes{fstype=~\\\"ext.?|xfs\\\",mountpoint=~\\\"/|/rootfs\\\"}) *100/(node_filesystem_avail_bytes {fstype=~\\\"ext.?|xfs\\\",mountpoint=~\\\"/|/rootfs\\\"}+(node_filesystem_size_bytes{fstype=~\\\"ext.?|xfs\\\",mountpoint=~\\\"/|/rootfs\\\"}-node_filesystem_free_bytes{fstype=~\\\"ext.?|xfs\\\",mountpoint=~\\\"/|/rootfs\\\"})) \\u003e 80\"}', 0, 30, 180, '节点分区使用率大于 80%', 'Node'); -INSERT ignore INTO `rule_templates` (`rule_name`, `datasource_type`, `prometheus_config`, `severity`, `eval_interval`, `for_duration`, `annotations`, `rule_group_name`) VALUES ('节点CPU使用率大于80%', 'Prometheus', '{\"promQL\":\"100 - (avg(irate(node_cpu_seconds_total{mode=\\\"idle\\\"}[5m])) by (instance,tags) * 100) \\u003e 80\"}', 0, 30, 180, '节点 CPU 使用率大于 80% ,持续时间 2m', 'Node'); -INSERT ignore INTO `rule_templates` (`rule_name`, `datasource_type`, `prometheus_config`, `severity`, `eval_interval`, `for_duration`, `annotations`, `rule_group_name`) VALUES ('节点内存使用率超过80%', 'Prometheus', '{\"promQL\":\"(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 \\u003c 20) * on(instance,tags) group_left (nodename) node_uname_info{nodename=~\\\".+\\\"}\"}', 0, 10, 180, '节点内存使用率超过 80%, 持续时间 2m', 'Node'); -INSERT ignore INTO `rule_templates` (`rule_name`, `datasource_type`, `prometheus_config`, `severity`, `eval_interval`, `for_duration`, `annotations`, `rule_group_name`) VALUES ('APISixEtcdUnreachable', 'Prometheus', '{\"promQL\":\"absent(apisix_etcd_reachable{job=\\\"Prod-ali-kubernetes-Apisix\\\"}) == 1\"}', 0, 10, 60, '生产环境APISix etcd服务器无法访问。配置更改可能无法生效。', 'ApiSix'); -INSERT ignore INTO `rule_templates` (`rule_name`, `datasource_type`, `prometheus_config`, `severity`, `eval_interval`, `for_duration`, `annotations`, `rule_group_name`) VALUES ('APISixHighRequestRate', 'Prometheus', '{\"promQL\":\"rate(apisix_http_requests_total{job=\\\"Prod-ali-kubernetes-Apisix\\\"}[5m]) \\u003e 1000\"}', 0, 30, 300, '生产环境APISix服务正在经历高请求速率。每秒超过1000个请求。', 'ApiSix'); -INSERT ignore INTO `rule_templates` (`rule_name`, `datasource_type`, `prometheus_config`, `severity`, `eval_interval`, `for_duration`, `annotations`, `rule_group_name`) VALUES ('APISixNginxMetricErrors', 'Prometheus', '{\"promQL\":\"apisix_nginx_metric_errors_total{job=\\\"Prod-ali-kubernetes-Apisix\\\"} \\u003e 0\"}', 0, 10, 60, '生产环境在APISix中收集nginx指标时发生了错误。', 'ApiSix'); -INSERT ignore INTO `rule_templates` (`rule_name`, `datasource_type`, `prometheus_config`, `severity`, `eval_interval`, `for_duration`, `annotations`, `rule_group_name`) VALUES ('APISixNodeInfoMissing', 'Prometheus', 
'{\"promQL\":\"apisix_nginx_metric_errors_total{job=\\\"Prod-ali-kubernetes-Apisix\\\"} \\u003e 0\"}', 0, 10, 60, '生产环境APISix节点信息丢失。这可能表示节点或监控设置存在问题。', 'ApiSix'); -INSERT ignore INTO `rule_templates` (`rule_name`, `datasource_type`, `prometheus_config`, `severity`, `eval_interval`, `for_duration`, `annotations`, `rule_group_name`) VALUES ('APISixNode小于3', 'Prometheus', '{\"promQL\":\"sum(apisix_node_info{job=\\\"Prod-ali-kubernetes-Apisix\\\"}) \\u003c 3\"}', 0, 10, 60, '生产环境APISix节点小于3台性能有下降。', 'ApiSix'); -INSERT ignore INTO `rule_templates` (`rule_name`, `datasource_type`, `prometheus_config`, `severity`, `eval_interval`, `for_duration`, `annotations`, `rule_group_name`) VALUES ('APISix活跃连接接近峰值', 'Prometheus', '{\"promQL\":\"sum(apisix_nginx_http_current_connections{state=\\\"active\\\",job=\\\"Prod-ali-kubernetes-Apisix\\\"})by (job) \\u003e= 1258272\"}', 0, 10, 60, 'APISix活跃连接接近峰值尽快扩容,当前活跃数量「${value}」', 'ApiSix'); -INSERT ignore INTO `rule_templates` (`rule_name`, `datasource_type`, `prometheus_config`, `severity`, `eval_interval`, `for_duration`, `annotations`, `rule_group_name`) VALUES ('ApisixEtcdClusterStatus', 'Prometheus', '{\"promQL\":\"sum(etcd_server_has_leader{job=\\\"Prod-ali-kubernetes-apisix-etcd\\\"}) != 3\"}', 0, 10, 180, '检测到生产Apisix-Etcd集群节点数量不满足预期(应为3个节点),可能存在节点退出或Leader丢失情况,当前剩余节点「${value}」', 'ApiSix'); -INSERT ignore INTO `rule_templates` (`rule_name`, `datasource_type`, `prometheus_config`, `severity`, `eval_interval`, `for_duration`, `annotations`, `rule_group_name`) VALUES ('ApisixEtcdleader', 'Prometheus', '{\"promQL\":\"sum(etcd_server_is_leader{job=\\\"Prod-ali-kubernetes-apisix-etcd\\\"}) != 1\"}', 0, 10, 180, '生产Apisix-Etcd集群存在脑裂(应为1个主节点).', 'ApiSix'); -INSERT ignore INTO `rule_templates` (`rule_name`, `datasource_type`, `prometheus_config`, `severity`, `eval_interval`, `for_duration`, `annotations`, `rule_group_name`) VALUES ('ApisixEtcdUp', 'Prometheus', '{\"promQL\":\"absent(etcd_server_has_leader{job=\\\"Prod-ali-kubernetes-apisix-etcd\\\"}) == 1\"}', 0, 10, 60, 'Etcd节点宕机', 'ApiSix'); -INSERT ignore INTO `rule_templates` (`rule_name`, `datasource_type`, `prometheus_config`, `severity`, `eval_interval`, `for_duration`, `annotations`, `rule_group_name`) VALUES ('Apisix Etcd leader变更频繁', 'Prometheus', '{\"promQL\":\"changes(etcd_server_leader_changes_seen_total{job=\\\"Prod-ali-kubernetes-apisix-etcd\\\"}[1h]) \\u003e 5\"}', 0, 30, 180, '最近1小时内Etcd集群主节点切换次数较多,可能由网络问题、OOM或升级等原因导致,具体次数为「${value}」', 'ApiSix'); -INSERT ignore INTO `rule_templates` (`rule_name`, `datasource_type`, `prometheus_config`, `severity`, `eval_interval`, `for_duration`, `annotations`, `rule_group_name`) VALUES ('ApisixEtcdDbSize接近容量限制', 'Prometheus', '{\"promQL\":\"etcd_mvcc_db_total_size_in_bytes / (1024 * 1024) \\u003e etcd_server_quota_backend_bytes / (1024 * 1024) * 0.8\"}', 0, 10, 180, 'Etcd后端数据库${instance}接近容量限制80%,当前值当前使用率百分比「${value}%」', 'ApiSix'); -INSERT ignore INTO `rule_templates` (`rule_name`, `datasource_type`, `prometheus_config`, `severity`, `eval_interval`, `for_duration`, `annotations`, `rule_group_name`) VALUES ('ApisixEtcdBackend提交延迟异常', 'Prometheus', '{\"promQL\":\"histogram_quantile(0.99, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket{job=\\\"Prod-ali-kubernetes-apisix-etcd\\\"}[5m])) by (instance, le)) \\u003e 0.25\"}', 0, 30, 180, 'Etcd后端提交操作的延迟较长${instance},可能导致磁盘读写异常当前值为「${value}」.', 'ApiSix'); -INSERT ignore INTO `rule_templates` (`rule_name`, `datasource_type`, `prometheus_config`, `severity`, `eval_interval`, `for_duration`, `annotations`, 
`rule_group_name`) VALUES ('ApisixEtcdRaft议问题', 'Prometheus', '{\"promQL\":\"rate(etcd_server_proposals_failed_total{job=\\\"Prod-ali-kubernetes-apisix-etcd\\\"}[1m]) \\u003e 0 or etcd_server_proposals_pending{job=\\\"Prod-ali-kubernetes-apisix-etcd\\\"} \\u003e 0 or etcd_server_proposals_committed_total{job=\\\"Prod-ali-kubernetes-apisix-etcd\\\"} - etcd_server_proposals_applied_total{job=\\\"Prod-ali-kubernetes-apisix-etcdtcd\\\"} \\u003e 0\"}', 0, 15, 180, '检测到Etcd集群中Raft协议存在问题, ${instance}可能出现提交失败、积压或提交与应用的差值大于0的情况,当前值为「${value}」.', 'ApiSix'); -INSERT ignore INTO `rule_templates` (`rule_name`, `datasource_type`, `prometheus_config`, `severity`, `eval_interval`, `for_duration`, `annotations`, `rule_group_name`) VALUES ('Elasticsearch 堆使用率过高', 'Prometheus', '{\"promQL\":\"(elasticsearch_jvm_memory_used_bytes{area=\\\"heap\\\"} / elasticsearch_jvm_memory_max_bytes{area=\\\"heap\\\"}) * 100 \\u003e 80\"}', 0, 10, 180, 'The heap usage is over 80% 当前使用率: ${value}', 'Elasticsearch'); -INSERT ignore INTO `rule_templates` (`rule_name`, `datasource_type`, `prometheus_config`, `severity`, `eval_interval`, `for_duration`, `annotations`, `rule_group_name`) VALUES ('Elasticsearch 磁盘空间不足', 'Prometheus', '{\"promQL\":\"elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_bytes * 100 \\u003c 20\"}', 10, 0, 180, 'The disk usage is over 80% 当前使用率: ${value}', 'Elasticsearch'); -INSERT ignore INTO `rule_templates` (`rule_name`, `datasource_type`, `prometheus_config`, `severity`, `eval_interval`, `for_duration`, `annotations`, `rule_group_name`) VALUES ('Elasticsearch 集群红色', 'Prometheus', '{\"promQL\":\"elasticsearch_cluster_health_status{color=\\\"red\\\"} == 1\"}', 0, 10, 180, 'Elastic Cluster Red status 当前值: ${value}', 'Elasticsearch'); -INSERT ignore INTO `rule_templates` (`rule_name`, `datasource_type`, `prometheus_config`, `severity`, `eval_interval`, `for_duration`, `annotations`, `rule_group_name`) VALUES ('Elasticsearch 集群黄色', 'Prometheus', '{\"promQL\":\"elasticsearch_cluster_health_status{color=\\\"yellow\\\"} == 1\"}', 0, 30, 180, 'Elastic Cluster Yellow status 当前值: ${value}', 'Elasticsearch'); -INSERT ignore INTO `rule_templates` (`rule_name`, `datasource_type`, `prometheus_config`, `severity`, `eval_interval`, `for_duration`, `annotations`, `rule_group_name`) VALUES ('Elasticsearch 节点状态', 'Prometheus', '{\"promQL\":\"elasticsearch_node_stats_up \\u003c 1\"}', 0, 10, 180, 'Elasticsearch 节点故障: ${value} 请及时处理!', 'Elasticsearch'); -INSERT ignore INTO `rule_templates` (`rule_name`, `datasource_type`, `prometheus_config`, `severity`, `eval_interval`, `for_duration`, `annotations`, `rule_group_name`) VALUES ('Elasticsearch 重新定位分片的时间过长', 'Prometheus', '{\"promQL\":\"elasticsearch_cluster_health_relocating_shards \\u003e 0\"}', 0, 60, 600, 'Elasticsearch已经重新定位碎片10分钟了, 当前值: ${value}', 'Elasticsearch'); -INSERT ignore INTO `rule_templates` (`rule_name`, `datasource_type`, `prometheus_config`, `severity`, `eval_interval`, `for_duration`, `annotations`, `rule_group_name`) VALUES ('Elasticsearch 初始化分片时间过长', 'Prometheus', '{\"promQL\":\"elasticsearch_cluster_health_initializing_shards \\u003e 0\"}', 0, 60, 600, 'Elasticsearch已经初始化碎片10分钟了, 当前值:${value}', 'Elasticsearch'); -INSERT ignore INTO `rule_templates` (`rule_name`, `datasource_type`, `prometheus_config`, `severity`, `eval_interval`, `for_duration`, `annotations`, `rule_group_name`) VALUES ('Elasticsearch 集群中缺少节点', 'Prometheus', '{\"promQL\":\"elasticsearch_cluster_health_number_of_nodes \\u003c 3\"}', 0, 10, 180, 
'Elasticsearch集群中节点丢失 当前集群节点数: ${value}', 'Elasticsearch'); -INSERT ignore INTO `rule_templates` (`rule_name`, `datasource_type`, `prometheus_config`, `severity`, `eval_interval`, `for_duration`, `annotations`, `rule_group_name`) VALUES ('Elasticsearch 慢查询告警', 'Prometheus', '{\"promQL\":\"irate(elasticsearch_indices_search_query_time_seconds[5m]) \\u003e 0.2\"}', 0, 10, 60, 'Elasticsearch集群出现慢查询超过200ms', 'Elasticsearch'); -INSERT ignore INTO `rule_templates` (`rule_name`, `datasource_type`, `prometheus_config`, `severity`, `eval_interval`, `for_duration`, `annotations`, `rule_group_name`) VALUES ('Kafka Node Inconsistent Numbers', 'Prometheus', '{\"promQL\":\"kafka_brokers{job=\\\"test-kafka-cluster\\\"} \\u003c 3\"}', 0, 10, 60, 'kafka集群: ${instance} Kafka集群节点数不符合预期三节点', 'Kafka'); -INSERT ignore INTO `rule_templates` (`rule_name`, `datasource_type`, `prometheus_config`, `severity`, `eval_interval`, `for_duration`, `annotations`, `rule_group_name`) VALUES ('Kafka 集群 Topic xxx副本同步小于 3', 'Prometheus', '{\"promQL\":\"sum(kafka_topic_partition_in_sync_replica{job=\\\"Flink-Kafka-Cluster\\\",topic=\\\"xxx\\\"}) by(topic,job) \\u003c 3\"}', 0, 10, 60, 'kafka集群: ${instance} Kafka 集群 Topic ${topic} 副本小于 3', 'Kafka'); -INSERT ignore INTO `rule_templates` (`rule_name`, `datasource_type`, `prometheus_config`, `severity`, `eval_interval`, `for_duration`, `annotations`, `rule_group_name`) VALUES ('集群 Pod 副本崩溃导致重启', 'Prometheus', '{\"promQL\":\"sum(changes(kube_pod_container_status_restarts_total{namespace=\\\"zprod\\\"}[10m]) \\u003e= 1) by (pod,instance,env)\"}', 0, 10, 60, '集群环境: ${env} Pod: ${pod} 重启事件次数: ${value}', 'Kubernetes'); -INSERT ignore INTO `rule_templates` (`rule_name`, `datasource_type`, `prometheus_config`, `severity`, `eval_interval`, `for_duration`, `annotations`, `rule_group_name`) VALUES ('Kubernetes 节点尚未准备好', 'Prometheus', '{\"promQL\":\"kube_node_status_condition{condition=\\\"Ready\\\",status=\\\"true\\\"} == 0\"}', 0, 30, 300, 'Kubernetes Node not ready, instance: ${instance}', 'Kubernetes'); -INSERT ignore INTO `rule_templates` (`rule_name`, `datasource_type`, `prometheus_config`, `severity`, `eval_interval`, `for_duration`, `annotations`, `rule_group_name`) VALUES ('Kubernetes 客户端证书下周到期', 'Prometheus', '{\"promQL\":\"apiserver_client_certificate_expiration_seconds_count{job=\\\"apiserver\\\"} \\u003e 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job=\\\"apiserver\\\"}[5m]))) \\u003c 7*24*60*60\"}', 0, 30, 300, 'Kubernetes client certificate expires next week, instance: ${instance}', 'Kubernetes'); -INSERT ignore INTO `rule_templates` (`rule_name`, `datasource_type`, `prometheus_config`, `severity`, `eval_interval`, `for_duration`, `annotations`, `rule_group_name`) VALUES ('Kubernetes 卷磁盘空间不足', 'Prometheus', '{\"promQL\":\"kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes * 100 \\u003c 10\"}', 0, 30, 300, 'Kubernetes Volume out of disk space, instance: ${instance}', 'Kubernetes'); -INSERT ignore INTO `rule_templates` (`rule_name`, `datasource_type`, `prometheus_config`, `severity`, `eval_interval`, `for_duration`, `annotations`, `rule_group_name`) VALUES ('Kubernetes HPA 指标不可用', 'Prometheus', '{\"promQL\":\"kube_horizontalpodautoscaler_status_condition{status=\\\"false\\\", condition=\\\"ScalingActive\\\"} == 1\"}', 0, 30, 300, 'Kubernetes HPA metrics unavailability, instance: ${instance}', 'Kubernetes'); -INSERT ignore INTO `rule_templates` (`rule_name`, `datasource_type`, 
`prometheus_config`, `severity`, `eval_interval`, `for_duration`, `annotations`, `rule_group_name`) VALUES ('Kubernetes API 服务器错误率很高', 'Prometheus', '{\"promQL\":\"sum(rate(apiserver_request_total{job=\\\"apiserver\\\",code=~\\\"^(?:5..)$\\\"}[1m])) / sum(rate(apiserver_request_total{job=\\\"apiserver\\\"}[1m])) * 100 \\u003e 3\"}', 0, 30, 300, 'Kubernetes API server errors, instance: ${instance}', 'Kubernetes'); -INSERT ignore INTO `rule_templates` (`rule_name`, `datasource_type`, `prometheus_config`, `severity`, `eval_interval`, `for_duration`, `annotations`, `rule_group_name`) VALUES ('Kubernetes 客户端证书即将过期', 'Prometheus', '{\"promQL\":\"apiserver_client_certificate_expiration_seconds_count{job=\\\"apiserver\\\"} \\u003e 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job=\\\"apiserver\\\"}[5m]))) \\u003c 24*60*60\"}', 0, 30, 300, 'Kubernetes client certificate expires soon, instance: ${instance}', 'Kubernetes'); -INSERT ignore INTO `rule_templates` (`rule_name`, `datasource_type`, `prometheus_config`, `severity`, `eval_interval`, `for_duration`, `annotations`, `rule_group_name`) VALUES ('容器CPU利用率超过80%', 'Prometheus', '{\"promQL\":\"(sum(rate(container_cpu_usage_seconds_total{container!=\\\"\\\"}[5m])) by (pod, container) / sum(container_spec_cpu_quota{container!=\\\"\\\"}/container_spec_cpu_period{container!=\\\"\\\"}) by (pod, container) * 100) \\u003e 80\"}', 0, 10, 180, 'Container High CPU utilization is above 80%, ${instance}', 'Docker'); -INSERT ignore INTO `rule_templates` (`rule_name`, `datasource_type`, `prometheus_config`, `severity`, `eval_interval`, `for_duration`, `annotations`, `rule_group_name`) VALUES ('容器内存使用率超过80%', 'Prometheus', '{\"promQL\":\"(sum(container_memory_working_set_bytes{name!=\\\"\\\"}) BY (instance, name) / sum(container_spec_memory_limit_bytes \\u003e 0) BY (instance, name) * 100) \\u003e 80\"}', 0, 10, 180, 'Container High Memory usage is above 80%, ${instance}', 'Docker'); -INSERT ignore INTO `rule_templates` (`rule_name`, `datasource_type`, `prometheus_config`, `severity`, `eval_interval`, `for_duration`, `annotations`, `rule_group_name`) VALUES ('容器卷使用率超过80%', 'Prometheus', '{\"promQL\":\"(1 - (sum(container_fs_inodes_free{name!=\\\"\\\"}) BY (instance) / sum(container_fs_inodes_total) BY (instance))) * 100 \\u003e 80\"}', 0, 30, 300, 'Container Volume usage is above 80%, ${instance}', 'Docker'); +INSERT ignore INTO watchalert.rule_templates (rule_group_name,rule_name,datasource_type,severity,prometheus_config,ali_cloud_sls_config,loki_config,eval_interval,for_duration,annotations,`type`,repeat_notice_interval,description,labels,effective_time,jaeger_config,kubernetes_config,elastic_search_config) VALUES + ('APISIX','APISIX ETCD DB Total Size过大','Prometheus',NULL,'{"promQL":"(etcd_mvcc_db_total_size_in_bytes / etcd_server_quota_backend_bytes) * 100 ","annotations":"实例:${instance},ETCD 数据库过大可能导致查询和写入操作的延迟增加,影响 APISIX 的性能。 数据库过大还可能增加 ETCD 集群的负载,导致资源消耗增加,甚至可能触发 OOM(内存溢出)等问题。 ","forDuration":60,"rules":[{"severity":"P0","expr":"\u003e80"}]}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}',10,0,NULL,'Metrics',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"","reason":"","value":0,"filter":null,"scope":0}','{"index":"","scope":0,"filter":null}'), + ('APISIX','APISIX ETCD 
leader变更频繁','Prometheus',NULL,'{"promQL":"changes(etcd_server_leader_changes_seen_total{}[1h])","annotations":"实例:${instance},频繁的 leader 变更会导致配置更新延迟和性能下降。 可能引发 ETCD 的可用性问题,从而影响 APISIX 的配置同步和运行稳定性。 可能是由于网络抖动、节点故障或资源瓶颈等问题导致的,需要进一步排查。","forDuration":60,"rules":[{"severity":"P0","expr":"\u003e5"}]}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}',10,0,NULL,'Metrics',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"","reason":"","value":0,"filter":null,"scope":0}','{"index":"","scope":0,"filter":null}'), + ('APISIX','APISIX ETCD 缺失leader','Prometheus',NULL,'{"promQL":"sum(etcd_server_is_leader{}) ","annotations":"实例:${instance},当缺少 leader 时,ETCD 集群将无法处理写操作,这会导致 APISIX 无法进行配置更新。 配置无法更新可能导致流量调度、插件配置等关键功能失效,直接影响业务的运行。 这种情况通常是由于 ETCD 集群的严重故障或节点失联导致的,需及时修复。","forDuration":60,"rules":[{"severity":"P0","expr":"!=1"}]}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}',10,0,NULL,'Metrics',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"","reason":"","value":0,"filter":null,"scope":0}','{"index":"","scope":0,"filter":null}'), + ('APISIX','APISIX ETCD连接状态','Prometheus',NULL,'{"promQL":"apisix_etcd_reachable ","annotations":"实例:${instance},APISIX 无法从 Etcd 读取或写入配置信息,可能导致路由规则更新失败、服务发现异常等问题,进而影响整个 API 网关的正常运行。","forDuration":60,"rules":[{"severity":"P0","expr":"!=1"}]}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}',10,0,NULL,'Metrics',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"","reason":"","value":0,"filter":null,"scope":0}','{"index":"","scope":0,"filter":null}'), + ('APISIX','APISIX 每秒请求量过高','Prometheus',NULL,'{"promQL":"increase(apisix_http_requests_total{}[1m]) / 60","annotations":"实例:${instance},生产环境APISix服务正在经历高请求速率。每秒超过1000个请求。","forDuration":60,"rules":[{"severity":"P0","expr":"\u003e1000"}]}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}',10,0,NULL,'Metrics',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"","reason":"","value":0,"filter":null,"scope":0}','{"index":"","scope":0,"filter":null}'), + ('APISIX','APISIX 活跃连接接近峰值','Prometheus',NULL,'{"promQL":"sum(apisix_nginx_http_current_connections{state=\"active\"}) ","annotations":"实例:${instance},连接数超过apisix工作节点的worker connections总数后,网关会停止工作,新的请求会被拒绝,导致业务受影响。","forDuration":60,"rules":[{"severity":"P0","expr":"\u003e135313"}]}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}',10,0,NULL,'Metrics',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"","reason":"","value":0,"filter":null,"scope":0}','{"index":"","scope":0,"filter":null}'), + 
('Docker','Docker ContainerKilled','Prometheus',NULL,'{"promQL":"time() - container_last_seen","annotations":"容器:${instance},被意外杀死,请注意排查!","forDuration":60,"rules":[{"severity":"P0","expr":"\u003e60"}]}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}',10,0,NULL,'Metrics',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"","reason":"","value":0,"filter":null,"scope":0}','{"index":"","scope":0,"filter":null}'), + ('Docker','Docker容器CPU使用率过高','Prometheus',NULL,'{"promQL":"(sum(rate(container_cpu_usage_seconds_total{container!=\"\"}[5m])) by (pod, container) / sum(container_spec_cpu_quota{container!=\"\"}/container_spec_cpu_period{container!=\"\"}) by (pod, container) * 100)","annotations":"容器:${instance},CPU使用率过高,当前:${value}%,请注意排查!","forDuration":60,"rules":[{"severity":"P0","expr":"\u003e80"}]}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}',10,0,NULL,'Metrics',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"","reason":"","value":0,"filter":null,"scope":0}','{"index":"","scope":0,"filter":null}'), + ('Docker','Docker容器内存使用率过高','Prometheus',NULL,'{"promQL":"(sum(container_memory_working_set_bytes{name!=\"\"}) BY (instance, name) / sum(container_spec_memory_limit_bytes \u003e 0) BY (instance, name) * 100)","annotations":"容器:${instance},内存使用率过高,当前:${value}%,请注意排查!","forDuration":60,"rules":[{"severity":"P0","expr":"\u003e80"}]}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}',10,0,NULL,'Metrics',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"","reason":"","value":0,"filter":null,"scope":0}','{"index":"","scope":0,"filter":null}'), + ('Docker','Docker容器存储使用率过高','Prometheus',NULL,'{"promQL":"(1 - (sum(container_fs_inodes_free{name!=\"\"}) BY (instance) / sum(container_fs_inodes_total) BY (instance))) * 100","annotations":"容器:${instance},存储使用率过高,当前:${value}%,请注意排查!","forDuration":60,"rules":[{"severity":"P0","expr":"\u003e80"}]}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}',10,0,NULL,'Metrics',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"","reason":"","value":0,"filter":null,"scope":0}','{"index":"","scope":0,"filter":null}'); +INSERT ignore INTO watchalert.rule_templates (rule_group_name,rule_name,datasource_type,severity,prometheus_config,ali_cloud_sls_config,loki_config,eval_interval,for_duration,annotations,`type`,repeat_notice_interval,description,labels,effective_time,jaeger_config,kubernetes_config,elastic_search_config) VALUES + ('ElasticSearch','ES慢查询告警','Prometheus',NULL,'{"promQL":"rate(elasticsearch_indices_search_query_time_seconds[5m]) ","annotations":"实例:${instance},当前:${value},慢查询占用较多计算和存储资源,可能导致其他查询任务响应变慢,影响系统的整体性能。 频繁的慢查询会增加 Elasticsearch 集群的负载,导致 CPU、内存和磁盘 I/O 
的压力增大,甚至导致集群不稳定。","forDuration":60,"rules":[{"severity":"P0","expr":"\u003e0.2"}]}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}',10,0,NULL,'Metrics',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"","reason":"","value":0,"filter":null,"scope":0}','{"index":"","scope":0,"filter":null}'), + ('ElasticSearch','ES节点CPU使用率过高','Prometheus',NULL,'{"promQL":"round(elasticsearch_process_cpu_percent{})","annotations":"实例:${instance},当前:${value},高 CPU 使用率可能是由于高查询或写入负载、复杂的查询操作、数据压缩或解压引起的,可能导致节点性能下降、响应时间变长,甚至引发故障。","forDuration":60,"rules":[{"severity":"P0","expr":"\u003e80"}]}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}',10,0,NULL,'Metrics',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"","reason":"","value":0,"filter":null,"scope":0}','{"index":"","scope":0,"filter":null}'), + ('ElasticSearch','ES节点堆内存使用率过高','Prometheus',NULL,'{"promQL":"round(((elasticsearch_jvm_memory_used_bytes{area=\"heap\"} / elasticsearch_jvm_memory_max_bytes{area=\"heap\"}) * 100 ) * 100) / 100","annotations":"实例:${instance},当前:${value},内存使用率过高可能导致频繁的垃圾回收 (GC),增加延迟,甚至导致“out of memory” (OOM) 错误,使节点宕机。","forDuration":60,"rules":[{"severity":"P0","expr":"\u003e80"}]}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}',10,0,NULL,'Metrics',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"","reason":"","value":0,"filter":null,"scope":0}','{"index":"","scope":0,"filter":null}'), + ('ElasticSearch','ES节点状态','Prometheus',NULL,'{"promQL":"elasticsearch_node_stats_up","annotations":"实例:${instance},当前:${value},节点状态不等于1可能是节点出现down机的情况,会影响到整个集群的稳定性。","forDuration":60,"rules":[{"severity":"P0","expr":"!=1"}]}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}',10,0,NULL,'Metrics',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"","reason":"","value":0,"filter":null,"scope":0}','{"index":"","scope":0,"filter":null}'), + ('ElasticSearch','ES节点磁盘使用率过高','Prometheus',NULL,'{"promQL":"round((100 - (elasticsearch_filesystem_data_available_bytes{}/elasticsearch_filesystem_data_size_bytes{} * 100)) * 100 ) / 100","annotations":"实例:${instance},当前:${value},磁盘空间不足会导致分片无法写入或重新分配,影响集群的正常运行,可能导致集群进入黄色或红色状态。超过 90% 时,集群可能会停止写入。","forDuration":60,"rules":[{"severity":"P0","expr":"\u003e80"}]}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}',10,0,NULL,'Metrics',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"","reason":"","value":0,"filter":null,"scope":0}','{"index":"","scope":0,"filter":null}'), + 
('ElasticSearch','ES集群写入QPS大于50','Prometheus',NULL,'{"promQL":"rate(elasticsearch_indices_indexing_index_total[5m])","annotations":"实例:${instance},当前:${value},写入 QPS 过高可能导致写入延迟增加、集群性能下降,甚至引发写入失败。大量写入请求可能导致分片分配不均、节点负载过高等问题。","forDuration":60,"rules":[{"severity":"P0","expr":"\u003e50"}]}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}',10,0,NULL,'Metrics',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"","reason":"","value":0,"filter":null,"scope":0}','{"index":"","scope":0,"filter":null}'), + ('ElasticSearch','ES集群查询QPS大于50','Prometheus',NULL,'{"promQL":"rate(elasticsearch_indices_search_query_total[5m])","annotations":"实例:${instance},当前:${value},查询 QPS 过高可能导致查询延迟增加、响应时间变长,影响查询性能。集群的查询负载增加,可能造成节点资源紧张。","forDuration":60,"rules":[{"severity":"P0","expr":"\u003e50"}]}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}',10,0,NULL,'Metrics',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"","reason":"","value":0,"filter":null,"scope":0}','{"index":"","scope":0,"filter":null}'), + ('ElasticSearch','ES集群状态为红色','Prometheus',NULL,'{"promQL":"elasticsearch_cluster_health_status{color=\"red\"} ","annotations":"实例:${instance},红色状态表示集群中有主分片和副本分片不可用,数据丢失或无法访问。此时,部分或全部数据不可用,可能影响业务的正常运行。","forDuration":60,"rules":[{"severity":"P0","expr":"==1"}]}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}',10,0,NULL,'Metrics',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"","reason":"","value":0,"filter":null,"scope":0}','{"index":"","scope":0,"filter":null}'), + ('ElasticSearch','ES集群状态为黄色','Prometheus',NULL,'{"promQL":"elasticsearch_cluster_health_status{color=\"yellow\"}","annotations":"实例:${instance},黄色状态表示集群中的主分片可用,但副本分片有问题,通常是由于资源不足、分片分配失败等原因。虽然数据可以读取,但副本的不可用会降低数据冗余性和容灾能力。","forDuration":60,"rules":[{"severity":"P0","expr":"==1"}]}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}',10,0,NULL,'Metrics',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"","reason":"","value":0,"filter":null,"scope":0}','{"index":"","scope":0,"filter":null}'), + ('ETCD','Etcd大量失败的 GRPC 请求','Prometheus',NULL,'{"promQL":"sum(rate(grpc_server_handled_total{grpc_code!=\"OK\"}[1m])) BY (grpc_service, grpc_method) / sum(rate(grpc_server_handled_total[1m])) BY (grpc_service, grpc_method)","annotations":"Etcd 中检测到超过 1% 的 GRPC 
请求失败,当前:${value}%","forDuration":60,"rules":[{"severity":"P0","expr":"\u003e0.01"}]}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}',10,0,NULL,'Metrics',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"","reason":"","value":0,"filter":null,"scope":0}','{"index":"","scope":0,"filter":null}'); +INSERT ignore INTO watchalert.rule_templates (rule_group_name,rule_name,datasource_type,severity,prometheus_config,ali_cloud_sls_config,loki_config,eval_interval,for_duration,annotations,`type`,repeat_notice_interval,description,labels,effective_time,jaeger_config,kubernetes_config,elastic_search_config) VALUES + ('ETCD','Etcd大量失败的 HTTP 请求','Prometheus',NULL,'{"promQL":"sum(rate(etcd_http_failed_total[1m])) BY (method) / sum(rate(etcd_http_received_total[1m])) BY (method)","annotations":"Etcd 中检测到超过 1% 的 HTTP 故障,当前:${value}%","forDuration":60,"rules":[{"severity":"P0","expr":"\u003e0.01"}]}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}',10,0,NULL,'Metrics',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"","reason":"","value":0,"filter":null,"scope":0}','{"index":"","scope":0,"filter":null}'), + ('ETCD','Etcd成员不足','Prometheus',NULL,'{"promQL":"count(etcd_server_id) % 2","annotations":"Etcd 集群应具有奇数个成员,当前成员个数:${value}","forDuration":60,"rules":[{"severity":"P0","expr":"==0"}]}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}',10,0,NULL,'Metrics',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"","reason":"","value":0,"filter":null,"scope":0}','{"index":"","scope":0,"filter":null}'), + ('ETCD','Etcd无Leader','Prometheus',NULL,'{"promQL":"etcd_server_has_leader","annotations":"Etcd 集群没有领导者","forDuration":60,"rules":[{"severity":"P0","expr":"==0"}]}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}',10,0,NULL,'Metrics',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"","reason":"","value":0,"filter":null,"scope":0}','{"index":"","scope":0,"filter":null}'), + ('ETCD','Etcd频繁发生领导者变更','Prometheus',NULL,'{"promQL":"increase(etcd_server_leader_changes_seen_total[10m])","annotations":"Etcd 领导者在 10 分钟内变更超过 2 次。","forDuration":60,"rules":[{"severity":"P0","expr":"\u003e2"}]}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}',10,0,NULL,'Metrics',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"","reason":"","value":0,"filter":null,"scope":0}','{"index":"","scope":0,"filter":null}'), + 
('Kafka','KafkaTopic偏移量过低','Prometheus',NULL,'{"promQL":"rate(kafka_topic_partition_current_offset{topic!=\"\"}[2m])","annotations":"Kafka集群偏移量异常,当前:${value}。可能存在数据异常问题,请观察!","forDuration":60,"rules":[{"severity":"P0","expr":"\u003c 1"}]}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}',10,0,NULL,'Metrics',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"","reason":"","value":0,"filter":null,"scope":0}','{"index":"","scope":0,"filter":null}'), + ('Kafka','KafkaTopic副本丢失','Prometheus',NULL,'{"promQL":"count(kafka_topic_partition_under_replicated_partition{topic!=\"\",partition=\"0\"}==0) by(topic,job)","annotations":"Kafka集群节点缺失,当前:${value}个副本。当一个分区的某个副本所在的 broker 宕机、网络不可达、数据损坏或由于其他原因导致副本不可用时,就会发生副本丢失。","forDuration":60,"rules":[{"severity":"P0","expr":"\u003c3"}]}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}',10,0,NULL,'Metrics',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"","reason":"","value":0,"filter":null,"scope":0}','{"index":"","scope":0,"filter":null}'), + ('Kafka','Kafka节点离线告警','Prometheus',NULL,'{"promQL":"kafka_brokers","annotations":"Kafka集群节点缺失,当前:${value}个节点。Kafka 集群中的某个节点下线,可能会导致部分分区数据无法读取或写入,并可能导致消费者或生产者请求失败。","forDuration":60,"rules":[{"severity":"P0","expr":"\u003c3"}]}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}',10,0,NULL,'Metrics',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"","reason":"","value":0,"filter":null,"scope":0}','{"index":"","scope":0,"filter":null}'), + ('KubernetesMetric','KubernetesAPI 服务器延迟','Prometheus',NULL,'{"promQL":"histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{verb!~\"(?:CONNECT|WATCHLIST|WATCH|PROXY)\"} [10m])) WITHOUT (subresource))","annotations":"Kubernetes API 服务器对于 ${verb} ${resource} 的第 99 个百分位延迟为 ${value} 秒。","forDuration":60,"rules":[{"severity":"P0","expr":"\u003e1"}]}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}',10,0,NULL,'Metrics',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"","reason":"","value":0,"filter":null,"scope":0}','{"index":"","scope":0,"filter":null}'), + ('Kubernetes','KubernetesHPA获取外部指标失败','KubernetesEvent',NULL,'{"promQL":"","annotations":"","forDuration":0,"rules":null}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}',10,0,NULL,'Events',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"HPA","reason":"FailedGetExternalMetric","value":1,"filter":null,"scope":10}','{"index":"","scope":0,"filter":null}'), + 
('Kubernetes','KubernetesHPA获取资源指标失败','KubernetesEvent',NULL,'{"promQL":"","annotations":"","forDuration":0,"rules":null}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}',10,0,NULL,'Events',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"HPA","reason":"FailedGetResourceMetric","value":1,"filter":null,"scope":10}','{"index":"","scope":0,"filter":null}'); +INSERT ignore INTO watchalert.rule_templates (rule_group_name,rule_name,datasource_type,severity,prometheus_config,ali_cloud_sls_config,loki_config,eval_interval,for_duration,annotations,`type`,repeat_notice_interval,description,labels,effective_time,jaeger_config,kubernetes_config,elastic_search_config) VALUES + ('Kubernetes','KubernetesHPA调整副本数失败','KubernetesEvent',NULL,'{"promQL":"","annotations":"","forDuration":0,"rules":null}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}',10,0,NULL,'Events',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"HPA","reason":"FailedRescale","value":1,"filter":null,"scope":10}','{"index":"","scope":0,"filter":null}'), + ('Kubernetes','KubernetesNode内存压力过大','KubernetesEvent',NULL,'{"promQL":"","annotations":"","forDuration":0,"rules":null}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}',10,0,NULL,'Events',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"Nodes","reason":"NodeUnderMemoryPressure","value":1,"filter":null,"scope":10}','{"index":"","scope":0,"filter":null}'), + ('Kubernetes','KubernetesNode磁盘压力过大','KubernetesEvent',NULL,'{"promQL":"","annotations":"","forDuration":0,"rules":null}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}',10,0,NULL,'Events',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"Nodes","reason":"NodeUnderDiskPressure","value":1,"filter":null,"scope":10}','{"index":"","scope":0,"filter":null}'), + ('Kubernetes','KubernetesNode节点不可用','KubernetesEvent',NULL,'{"promQL":"","annotations":"","forDuration":0,"rules":null}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}',10,0,NULL,'Events',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"Nodes","reason":"NodeNotReady","value":1,"filter":null,"scope":10}','{"index":"","scope":0,"filter":null}'), + 
('Kubernetes','KubernetesPod不健康','KubernetesEvent',NULL,'{"promQL":"","annotations":"","forDuration":0,"rules":null}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}',10,0,NULL,'Events',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"Pods","reason":"Unhealthy","value":1,"filter":null,"scope":10}','{"index":"","scope":0,"filter":null}'), + ('Kubernetes','KubernetesPod反覆崩溃','KubernetesEvent',NULL,'{"promQL":"","annotations":"","forDuration":0,"rules":null}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}',10,0,NULL,'Events',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"Pods","reason":"CrashLoopBackOff","value":1,"filter":null,"scope":10}','{"index":"","scope":0,"filter":null}'), + ('Kubernetes','KubernetesPod启动失败','KubernetesEvent',NULL,'{"promQL":"","annotations":"","forDuration":0,"rules":null}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}',10,0,NULL,'Events',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"Pods","reason":"Failed","value":1,"filter":null,"scope":10}','{"index":"","scope":0,"filter":null}'), + ('Kubernetes','KubernetesPod挂载卷失败','KubernetesEvent',NULL,'{"promQL":"","annotations":"","forDuration":0,"rules":null}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}',10,0,NULL,'Events',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"Pods","reason":"FailedMount","value":1,"filter":null,"scope":10}','{"index":"","scope":0,"filter":null}'), + ('Kubernetes','KubernetesPod调度失败','KubernetesEvent',NULL,'{"promQL":"","annotations":"","forDuration":0,"rules":null}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}',10,0,NULL,'Events',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"Pods","reason":"FailedScheduling","value":1,"filter":null,"scope":10}','{"index":"","scope":0,"filter":null}'), + ('Kubernetes','KubernetesPod超出运行期限','KubernetesEvent',NULL,'{"promQL":"","annotations":"","forDuration":0,"rules":null}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}',10,0,NULL,'Events',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"Pods","reason":"DeadlineExceeded","value":1,"filter":null,"scope":10}','{"index":"","scope":0,"filter":null}'); +INSERT ignore INTO watchalert.rule_templates 
(rule_group_name,rule_name,datasource_type,severity,prometheus_config,ali_cloud_sls_config,loki_config,eval_interval,for_duration,annotations,`type`,repeat_notice_interval,description,labels,effective_time,jaeger_config,kubernetes_config,elastic_search_config) VALUES + ('Kubernetes','KubernetesPod附加卷到节点失败','KubernetesEvent',NULL,'{"promQL":"","annotations":"","forDuration":0,"rules":null}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}',10,0,NULL,'Events',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"Pods","reason":"FailedAttachVolume","value":1,"filter":null,"scope":10}','{"index":"","scope":0,"filter":null}'), + ('Kubernetes','KubernetesPV/PVC绑定失败','KubernetesEvent',NULL,'{"promQL":"","annotations":"","forDuration":0,"rules":null}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}',10,0,NULL,'Events',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"PVC/PV","reason":"FailedBinding","value":1,"filter":null,"scope":10}','{"index":"","scope":0,"filter":null}'), + ('KubernetesMetric','Kubernetes客户端证书下周到期','Prometheus',NULL,'{"promQL":"apiserver_client_certificate_expiration_seconds_count{job=\"apiserver\"} \u003e 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job=\"apiserver\"}[5m])))","annotations":"K8s集群证书下周到期,请立即处理。","forDuration":60,"rules":[{"severity":"P0","expr":"\u003c604800"}]}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}',10,0,NULL,'Metrics',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"","reason":"","value":0,"filter":null,"scope":0}','{"index":"","scope":0,"filter":null}'), + ('KubernetesMetric','Kubernetes节点网络不可用','Prometheus',NULL,'{"promQL":"kube_node_status_condition{condition=\"NetworkUnavailable\",status=\"true\"}","annotations":"节点 ${instance},存在 NetworkUnavailable 情况","forDuration":60,"rules":[{"severity":"P0","expr":"==1"}]}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}',10,0,NULL,'Metrics',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"","reason":"","value":0,"filter":null,"scope":0}','{"index":"","scope":0,"filter":null}'), + ('KubernetesMetric','Kubernetes节点超出 pod 容量','Prometheus',NULL,'{"promQL":"sum by (node) ((kube_pod_status_phase{phase=\"Running\"} == 1) + on(uid, instance) group_left(node) (0 * kube_pod_info{pod_template_hash=\"\"})) / sum by (node) (kube_node_status_allocatable{resource=\"pods\"}) * 100 ","annotations":"节点 ${instance},超出 pod 
容量","forDuration":60,"rules":[{"severity":"P0","expr":"\u003e110"}]}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}',10,0,NULL,'Metrics',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"","reason":"","value":0,"filter":null,"scope":0}','{"index":"","scope":0,"filter":null}'), + ('MongoDB','MongoDB副本成员复制存在延迟','Prometheus',NULL,'{"promQL":"mongodb_mongod_replset_member_optime_date{state=\"PRIMARY\"} - on (set) group_right mongodb_mongod_replset_member_optime_date{state=\"SECONDARY\"}","annotations":"MongoDB实例:${instance} ,复制延迟过高,当前:${value}。","forDuration":60,"rules":[{"severity":"P0","expr":"\u003e10"}]}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}',10,0,NULL,'Metrics',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"","reason":"","value":0,"filter":null,"scope":0}','{"index":"","scope":0,"filter":null}'), + ('MongoDB','MongoDB副本成员状态异常','Prometheus',NULL,'{"promQL":"mongodb_mongod_replset_member_health","annotations":"实例:${instance} 不健康,如果副本集中有节点健康状态异常,可能会导致数据同步延迟、读写操作失败,甚至可能引发服务不可用的问题。","forDuration":60,"rules":[{"severity":"P0","expr":"!=1"}]}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}',10,0,NULL,'Metrics',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"","reason":"","value":0,"filter":null,"scope":0}','{"index":"","scope":0,"filter":null}'), + ('MongoDB','MongoDB节点宕机','Prometheus',NULL,'{"promQL":"mongodb_up","annotations":"MongoDB实例:${instance} ,节点宕机。","forDuration":60,"rules":[{"severity":"P0","expr":"==0"}]}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}',10,0,NULL,'Metrics',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"","reason":"","value":0,"filter":null,"scope":0}','{"index":"","scope":0,"filter":null}'), + ('MongoDB','MongoDB连接数过多','Prometheus',NULL,'{"promQL":"round((max(mongodb_connections{state=\"current\"}) by (instance,job) / sum(mongodb_connections{state=~\"current|available\"}) by (instance,job) * 100) * 100) / 100","annotations":"实例:${instance} 连接数过多,当连接数达到上限时,新的客户端连接请求将被拒绝,进而影响应用程序的正常运行,可能引发严重的系统不可用问题。","forDuration":60,"rules":[{"severity":"P0","expr":"\u003e50"}]}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}',10,0,NULL,'Metrics',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"","reason":"","value":0,"filter":null,"scope":0}','{"index":"","scope":0,"filter":null}'), + ('MySQL','MySQL存在慢查询','Prometheus',NULL,'{"promQL":"increase(mysql_global_status_slow_queries[1m])","annotations":"MySQL实例:${instance},存在慢查询,当前条数:${value}。慢查询通常表明数据库中存在未优化的 SQL 
操作,可能导致查询性能下降,影响整体响应时间。需要通过优化查询语句、添加索引、调整表结构或优化数据库配置来解决慢查询问题。","forDuration":60,"rules":[{"severity":"P0","expr":"\u003e0"}]}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}',10,0,NULL,'Metrics',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"","reason":"","value":0,"filter":null,"scope":0}','{"index":"","scope":0,"filter":null}'); +INSERT ignore INTO watchalert.rule_templates (rule_group_name,rule_name,datasource_type,severity,prometheus_config,ali_cloud_sls_config,loki_config,eval_interval,for_duration,annotations,`type`,repeat_notice_interval,description,labels,effective_time,jaeger_config,kubernetes_config,elastic_search_config) VALUES + ('MySQL','MySQL宕机','Prometheus',NULL,'{"promQL":"mysql_up","annotations":"MySQL实例:${instance},意外宕机。","forDuration":60,"rules":[{"severity":"P0","expr":"==0"}]}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}',10,0,NULL,'Metrics',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"","reason":"","value":0,"filter":null,"scope":0}','{"index":"","scope":0,"filter":null}'), + ('MySQL','MySQL实例发生重启','Prometheus',NULL,'{"promQL":"mysql_global_status_uptime","annotations":"MySQL实例:${instance},发生重启。","forDuration":60,"rules":[{"severity":"P0","expr":"\u003c60"}]}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}',10,0,NULL,'Metrics',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"","reason":"","value":0,"filter":null,"scope":0}','{"index":"","scope":0,"filter":null}'), + ('MySQL','MySQL连接数过多','Prometheus',NULL,'{"promQL":"max_over_time(mysql_global_status_threads_connected[1m]) / mysql_global_variables_max_connections * 100","annotations":"MySQL实例:${instance},连接数过多,当前:${value}。连接数接近上限意味着数据库负载很高,可能导致连接请求被拒绝,影响业务操作。 连接数耗尽时,新请求会被拒绝,导致数据库不可用。","forDuration":60,"rules":[{"severity":"P0","expr":"\u003e80"}]}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}',10,0,NULL,'Metrics',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"","reason":"","value":0,"filter":null,"scope":0}','{"index":"","scope":0,"filter":null}'), + ('MySQL','MySQL高线程运行','Prometheus',NULL,'{"promQL":"max_over_time(mysql_global_status_threads_running[1m]) / mysql_global_variables_max_connections * 100 
","annotations":"实例:${instance},当前:${value},高线程数可能导致数据库负载增加,需要检查数据库的并发操作、慢查询、事务等待等问题,优化线程池配置和查询效率。","forDuration":60,"rules":[{"severity":"P0","expr":"\u003e60"}]}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}',10,0,NULL,'Metrics',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"","reason":"","value":0,"filter":null,"scope":0}','{"index":"","scope":0,"filter":null}'), + ('KubernetesMetric','Pod CPU使用率过高','Prometheus',NULL,'{"promQL":"round((sum(rate(container_cpu_usage_seconds_total{namespace=\"kube-system\",container!~\"^POD$\"}[5m])) by (namespace,pod,container) / sum(kube_pod_container_resource_limits{namespace=\"kube-system\",resource=\"cpu\"}) by (container,namespace,pod) * 100) * 100) / 100\n","annotations":"Namespace:${namespace}\\nPod:${pod}\\nCPU使用率过高,当前:${value}%","forDuration":60,"rules":[{"severity":"P0","expr":"\u003e80"}]}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}',10,0,NULL,'Metrics',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"","reason":"","value":0,"filter":null,"scope":0}','{"index":"","scope":0,"filter":null}'), + ('KubernetesMetric','Pod 内存使用率过高','Prometheus',NULL,'{"promQL":"round(((avg by (namespace,pod,container) (container_memory_usage_bytes{namespace=\"kube-system\",container!~\"^POD$\"}) / avg by (namespace,pod,container) (container_spec_memory_limit_bytes{namespace=\"kube-system\",container!=\"\"})) * 100) * 100) / 100","annotations":"Namespace:${namespace}\\nPod:${pod}\\n内存使用率过高,当前:${value}%","forDuration":60,"rules":[{"severity":"P0","expr":"\u003e80"}]}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}',10,0,NULL,'Metrics',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"","reason":"","value":0,"filter":null,"scope":0}','{"index":"","scope":0,"filter":null}'), + ('Redis','Redis主节点过多','Prometheus',NULL,'{"promQL":"count(redis_instance_info{role=\"master\"})","annotations":"Redis实例:${instance},集群中有太多节点标记为主节点。!","forDuration":60,"rules":[{"severity":"P0","expr":"\u003e1"}]}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}',10,0,NULL,'Metrics',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"","reason":"","value":0,"filter":null,"scope":0}','{"index":"","scope":0,"filter":null}'), + 
('Redis','Redis实例宕机','Prometheus',NULL,'{"promQL":"redis_up","annotations":"Redis实例:${instance},发生宕机,请跟进排查!","forDuration":60,"rules":[{"severity":"P0","expr":"==0"}]}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}',10,0,NULL,'Metrics',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"","reason":"","value":0,"filter":null,"scope":0}','{"index":"","scope":0,"filter":null}'), + ('Redis','Redis拒绝连接','Prometheus',NULL,'{"promQL":"increase(redis_rejected_connections_total[1m])","annotations":"Redis实例:${instance},一些与 Redis 的连接已被拒绝。","forDuration":60,"rules":[{"severity":"P0","expr":"\u003e0"}]}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}',10,0,NULL,'Metrics',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"","reason":"","value":0,"filter":null,"scope":0}','{"index":"","scope":0,"filter":null}'), + ('Redis','Redis系统内存不足','Prometheus',NULL,'{"promQL":"redis_memory_used_bytes / redis_total_system_memory_bytes * 100","annotations":"Redis实例:${instance},系统内存不足(\u003e 80%),当前:${value}%。","forDuration":60,"rules":[{"severity":"P0","expr":"\u003e80"}]}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}',10,0,NULL,'Metrics',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"","reason":"","value":0,"filter":null,"scope":0}','{"index":"","scope":0,"filter":null}'); +INSERT ignore INTO watchalert.rule_templates (rule_group_name,rule_name,datasource_type,severity,prometheus_config,ali_cloud_sls_config,loki_config,eval_interval,for_duration,annotations,`type`,repeat_notice_interval,description,labels,effective_time,jaeger_config,kubernetes_config,elastic_search_config) VALUES + ('Redis','Redis缺少主节点','Prometheus',NULL,'{"promQL":"(count(redis_instance_info{role=\"master\"}) or vector(0))","annotations":"Redis实例:${instance},集群没有标记为主节点!","forDuration":60,"rules":[{"severity":"P0","expr":"\u003c1"}]}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}',10,0,NULL,'Metrics',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"","reason":"","value":0,"filter":null,"scope":0}','{"index":"","scope":0,"filter":null}'), + ('Redis','Redis连接数过多','Prometheus',NULL,'{"promQL":"redis_connected_clients / redis_config_maxclients * 100","annotations":"Redis实例:${instance},连接数不足(使用率 \u003e 
90%),当前:${value}%。","forDuration":60,"rules":[{"severity":"P0","expr":"\u003e80"}]}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}',10,0,NULL,'Metrics',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"","reason":"","value":0,"filter":null,"scope":0}','{"index":"","scope":0,"filter":null}'), + ('RocketMQ','RocketMQ Broker5分钟内QPS增长量过高','Prometheus',NULL,'{"promQL":"round(increase(rocketmq_broker_qps[5m]) * 100) / 100","annotations":"实例:${instance},QPS 的突然增长通常意味着突发的业务流量或异常的请求量。 如果增长过快且超出系统处理能力,可能会导致请求积压,响应时间增加,甚至服务不可用。 持续的高 QPS 可能导致资源耗尽,如 CPU、内存和 I/O 负载增高。","forDuration":60,"rules":[{"severity":"P0","expr":"\u003e80"}]}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}',10,0,NULL,'Metrics',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"","reason":"","value":0,"filter":null,"scope":0}','{"index":"","scope":0,"filter":null}'), + ('RocketMQ','RocketMQ Broker5分钟内TPS增长量过高','Prometheus',NULL,'{"promQL":"round(increase(rocketmq_broker_tps[5m]) * 100) / 100","annotations":"实例:${instance},TPS 的快速增长通常意味着大量的消息发送或消费请求。这可能导致 Broker 的压力增加。 当 TPS 增长过快时,Broker 可能面临资源瓶颈(例如内存、CPU、磁盘 I/O),导致处理效率下降。 突增的 TPS 可能由于异常业务流量或系统故障,需进一步排查原因。","forDuration":60,"rules":[{"severity":"P0","expr":"\u003e80"}]}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}',10,0,NULL,'Metrics',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"","reason":"","value":0,"filter":null,"scope":0}','{"index":"","scope":0,"filter":null}'), + ('RocketMQ','RocketMQ磁盘使用率过高','Prometheus',NULL,'{"promQL":"sum(rocketmq_brokeruntime_commitlog_disk_ratio) by (brokerIP) * 100","annotations":"实例:${instance},磁盘空间不足可能导致消息无法写入,导致消息积压,甚至服务中断。 当磁盘使用率过高时,文件系统性能也可能下降,影响读写效率。","forDuration":60,"rules":[{"severity":"P0","expr":"\u003e80"}]}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}',10,0,NULL,'Metrics',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"","reason":"","value":0,"filter":null,"scope":0}','{"index":"","scope":0,"filter":null}'), + ('Jaeger','Trace链路存在异常状态码','Jaeger',NULL,'{"promQL":"","annotations":"","forDuration":0,"rules":null}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}',10,0,NULL,'Traces',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":10,"tags":"%7B\"http.status_code\"%3A\"5.%2A%3F\"%7D"}','{"resource":"","reason":"","value":0,"filter":null,"scope":0}','{"index":"","scope":0,"filter":null}'), + 
('服务日志监控','接口耗时大于300ms','Loki',NULL,'{"promQL":"","annotations":"","forDuration":0,"rules":null}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"{app=\"apisix\"} !~ `socket.io` | json | upstream_response_time \u003e 0.3 | upstream_response_time != \"\"","logScope":10,"evalCondition":{"type":"count","operator":"\u003e=","queryValue":0,"value":1}}',10,0,NULL,'Logs',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"","reason":"","value":0,"filter":null,"scope":0}','{"index":"","scope":0,"filter":null}'), + ('服务日志监控','服务ERROR错误日志「占比超过1%」','Loki',NULL,'{"promQL":"","annotations":"","forDuration":0,"rules":null}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"sum(count_over_time({namespace=\"prod\"} |~ `ERROR`[10m])) / sum(count_over_time({namespace=\"prod\"}[10m])) * 100 \u003e 1","logScope":10,"evalCondition":{"type":"count","operator":"\u003e=","queryValue":0,"value":1}}',10,0,NULL,'Logs',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"","reason":"","value":0,"filter":null,"scope":0}','{"index":"","scope":0,"filter":null}'), + ('服务日志监控','服务ERROR错误日志「增长率超过2%」','Loki',NULL,'{"promQL":"","annotations":"","forDuration":0,"rules":null}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"sum(rate({namespace=\"prod\"} |~ `level` |~ `ERROR`[5m])) by (app) \u003e 2","logScope":10,"evalCondition":{"type":"count","operator":"\u003e=","queryValue":0,"value":1}}',10,0,NULL,'Logs',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"","reason":"","value":0,"filter":null,"scope":0}','{"index":"","scope":0,"filter":null}'), + ('Node节点监控','服务器CPU使用率监控','Prometheus',NULL,'{"promQL":"round(100 - (avg(irate(node_cpu_seconds_total{mode=\"idle\",}[5m])) by (instance) * 100))","annotations":"节点:${instance},CPU使用率过高,当前:${value}%,高 CPU 使用率意味着服务器可能接近处理能力上限,影响性能,导致应用程序响应变慢或服务中断!","forDuration":60,"rules":[{"severity":"P0","expr":"\u003e80"},{"severity":"P1","expr":"\u003e75"},{"severity":"P2","expr":"\u003e70"}]}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}',10,0,NULL,'Metrics',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"","reason":"","value":0,"filter":null,"scope":0}','{"index":"","scope":0,"filter":null}'); +INSERT ignore INTO watchalert.rule_templates (rule_group_name,rule_name,datasource_type,severity,prometheus_config,ali_cloud_sls_config,loki_config,eval_interval,for_duration,annotations,`type`,repeat_notice_interval,description,labels,effective_time,jaeger_config,kubernetes_config,elastic_search_config) VALUES + ('Node节点监控','服务器TCP连接数监控','Prometheus',NULL,'{"promQL":"sum(label_replace(max(node_sockstat_TCP_alloc{}) by 
(instance),\"host_ip\",\"$1\",\"instance\",\"(.*):.*\"))by(instance)","annotations":"节点:${instance},TCP连接数过高,当前:${value}%,请尽快处理!","forDuration":60,"rules":[{"severity":"P0","expr":"\u003e500"}]}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}',10,0,NULL,'Metrics',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"","reason":"","value":0,"filter":null,"scope":0}','{"index":"","scope":0,"filter":null}'), + ('Node节点监控','服务器入网络吞吐量异常监控','Prometheus',NULL,'{"promQL":"(sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 \u003e 100) * on(instance) group_left (nodename) node_uname_info{nodename=~\".+\"}\n","annotations":"节点:${instance},主机网络接口可能接收了太多数据(\u003e 100 MB/s),当前:${value}%,请尽快处理!","forDuration":60,"rules":[{"severity":"P0","expr":"\u003e100"}]}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}',10,0,NULL,'Metrics',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"","reason":"","value":0,"filter":null,"scope":0}','{"index":"","scope":0,"filter":null}'), + ('Node节点监控','服务器内存使用率监控','Prometheus',NULL,'{"promQL":"round(100 - (node_memory_MemAvailable_bytes{} / node_memory_MemTotal_bytes{} * 100))","annotations":"节点:${instance},内存使用率过高,当前:${value}%,内存紧张可能导致系统频繁进行内存交换(swap),进而引起性能下降。如果内存耗尽,应用程序可能崩溃或无法分配更多资源。","forDuration":60,"rules":[{"severity":"P0","expr":"\u003e80"},{"severity":"P1","expr":"\u003e75"},{"severity":"P2","expr":"\u003e70"}]}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}',10,0,NULL,'Metrics',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"","reason":"","value":0,"filter":null,"scope":0}','{"index":"","scope":0,"filter":null}'), + ('Node节点监控','服务器出网络吞吐量异常监控','Prometheus',NULL,'{"promQL":"(sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 \u003e 100) * on(instance) group_left (nodename) node_uname_info{nodename=~\".+\"}","annotations":"节点:${instance},主机网络接口可能发送了太多数据(\u003e 100 MB/s),当前:${value}%,请尽快处理!","forDuration":60,"rules":[{"severity":"P0","expr":"\u003e100"}]}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}',10,0,NULL,'Metrics',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"","reason":"","value":0,"filter":null,"scope":0}','{"index":"","scope":0,"filter":null}'), + ('Node节点监控','服务器文件系统设备错误','Prometheus',NULL,'{"promQL":"node_filesystem_device_error","annotations":"节点:${instance},${mountpoint} 
文件系统出现设备错误,请尽快处理!","forDuration":60,"rules":[{"severity":"P0","expr":"==1"}]}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}',10,0,NULL,'Metrics',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"","reason":"","value":0,"filter":null,"scope":0}','{"index":"","scope":0,"filter":null}'), + ('Node节点监控','服务器时钟偏差','Prometheus',NULL,'{"promQL":"((node_timex_offset_seconds \u003e 0.05 and deriv(node_timex_offset_seconds[5m]) \u003e= 0) or (node_timex_offset_seconds \u003c -0.05 and deriv(node_timex_offset_seconds[5m]) \u003c= 0)) * on(instance) group_left (nodename) node_uname_info{nodename=~\".+\"}\n","annotations":"节点:${instance},检测到时钟偏差,请尽快处理!","forDuration":60,"rules":[{"severity":"P0","expr":"\u003e=0"}]}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}',10,0,NULL,'Metrics',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"","reason":"","value":0,"filter":null,"scope":0}','{"index":"","scope":0,"filter":null}'), + ('Node节点监控','服务器状态异常','Prometheus',NULL,'{"promQL":"up{}","annotations":"实例:${instance},节点状态可能是exporter运行状态异常,也能是服务器宕机,当此告警发生后,需要进一步排查来确保节点的状态。","forDuration":60,"rules":[{"severity":"P0","expr":"==0"}]}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}',10,0,NULL,'Metrics',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"","reason":"","value":0,"filter":null,"scope":0}','{"index":"","scope":0,"filter":null}'), + ('Node节点监控','服务器磁盘IO写流量过大','Prometheus',NULL,'{"promQL":"(sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 \u003e 100) * on(instance) group_left (nodename) node_uname_info{nodename=~\".+\"}\n","annotations":"节点:${instance},磁盘可能写入了太多数据 (\u003e 100 MB/s),当前:${value}%,请尽快处理!","forDuration":60,"rules":[{"severity":"P0","expr":"\u003e100"}]}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}',10,0,NULL,'Metrics',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"","reason":"","value":0,"filter":null,"scope":0}','{"index":"","scope":0,"filter":null}'), + ('Node节点监控','服务器磁盘IO读流量过大','Prometheus',NULL,'{"promQL":"(sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 \u003e 100) * on(instance) group_left (nodename) node_uname_info{nodename=~\".+\"}","annotations":"节点:${instance},磁盘可能读取了太多数据 (\u003e 100 
MB/s),当前:${value}%,请尽快处理!","forDuration":60,"rules":[{"severity":"P0","expr":"\u003e100"}]}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}',10,0,NULL,'Metrics',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"","reason":"","value":0,"filter":null,"scope":0}','{"index":"","scope":0,"filter":null}'), + ('Node节点监控','服务器磁盘使用率监控','Prometheus',NULL,'{"promQL":"round(max((node_filesystem_size_bytes{fstype=~\"ext.?|xfs\",}-node_filesystem_free_bytes{fstype=~\"ext.?|xfs\",}) *100/(node_filesystem_avail_bytes {fstype=~\"ext.?|xfs\",}+(node_filesystem_size_bytes{fstype=~\"ext.?|xfs\",}-node_filesystem_free_bytes{fstype=~\"ext.?|xfs\",})))by(ecs_cname,instance))","annotations":"节点:${instance},磁盘使用率过高,当前:${value}%,磁盘空间不足会导致文件无法写入、新日志无法记录,甚至可能使服务无法正常运行。尤其在数据增长迅速的系统中,这需要重点监控。","forDuration":60,"rules":[{"severity":"P0","expr":"\u003e85"},{"severity":"P1","expr":"\u003e80"},{"severity":"P2","expr":"\u003e75"}]}','{"project":"","logstore":"","logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}','{"logQL":"","logScope":0,"evalCondition":{"type":"","operator":"","queryValue":0,"value":0}}',10,0,NULL,'Metrics',0,'',NULL,'{"week":null,"startTime":0,"endTime":0}','{"service":"","scope":0,"tags":""}','{"resource":"","reason":"","value":0,"filter":null,"scope":0}','{"index":"","scope":0,"filter":null}'); diff --git a/internal/models/rule_template.go b/internal/models/rule_template.go index cd80db8..e4d40e4 100644 --- a/internal/models/rule_template.go +++ b/internal/models/rule_template.go @@ -1,31 +1,39 @@ package models type RuleTemplateGroup struct { - Name string `json:"name"` + Name string `json:"name" gorm:"type:varchar(255);not null"` Number int `json:"number"` + Type string `json:"type"` Description string `json:"description"` } type RuleTemplateGroupQuery struct { Name string `json:"name" form:"name"` + Type string `json:"type" form:"type"` Description string `json:"description" form:"description"` Query string `json:"query" form:"query"` } type RuleTemplate struct { - RuleGroupName string `json:"ruleGroupName"` - RuleName string `json:"ruleName"` - DatasourceType string `json:"datasourceType"` - Severity int64 `json:"severity"` - PrometheusConfig PrometheusConfig `json:"prometheusConfig" gorm:"prometheusConfig;serializer:json"` - AliCloudSLSConfig AliCloudSLSConfig `json:"alicloudSLSConfig" gorm:"alicloudSLSConfig;serializer:json"` - LokiConfig LokiConfig `json:"lokiConfig" gorm:"lokiConfig;serializer:json"` - EvalInterval int64 `json:"evalInterval"` - ForDuration int64 `json:"forDuration"` - Annotations string `json:"annotations"` + Type string `json:"type"` + RuleGroupName string `json:"ruleGroupName"` + RuleName string `json:"ruleName" gorm:"type:varchar(255);not null"` + DatasourceType string `json:"datasourceType"` + EvalInterval int64 `json:"evalInterval"` + ForDuration int64 `json:"forDuration"` + RepeatNoticeInterval int64 `json:"repeatNoticeInterval"` + Description string `json:"description"` + EffectiveTime EffectiveTime `json:"effectiveTime" gorm:"effectiveTime;serializer:json"` + PrometheusConfig PrometheusConfig `json:"prometheusConfig" gorm:"prometheusConfig;serializer:json"` + AliCloudSLSConfig AliCloudSLSConfig `json:"alicloudSLSConfig" gorm:"alicloudSLSConfig;serializer:json"` + LokiConfig LokiConfig `json:"lokiConfig" 
gorm:"lokiConfig;serializer:json"` + JaegerConfig JaegerConfig `json:"jaegerConfig" gorm:"JaegerConfig;serializer:json"` + KubernetesConfig KubernetesConfig `json:"kubernetesConfig" gorm:"kubernetesConfig;serializer:json"` + ElasticSearchConfig ElasticSearchConfig `json:"elasticSearchConfig" gorm:"elasticSearchConfig;serializer:json"` } type RuleTemplateQuery struct { + Type string `json:"type" form:"type"` RuleGroupName string `json:"ruleGroupName" form:"ruleGroupName"` RuleName string `json:"ruleName" form:"ruleName"` DatasourceType string `json:"datasourceType" form:"datasourceType"` diff --git a/internal/models/user_permissions.go b/internal/models/user_permissions.go index 6c2b1f4..4cb7e1d 100644 --- a/internal/models/user_permissions.go +++ b/internal/models/user_permissions.go @@ -195,6 +195,10 @@ func PermissionsInfo() map[string]UserPermissions { Key: "创建规则模版", API: "/api/w8t/ruleTmpl/ruleTmplCreate", }, + "ruleTmplUpdate": { + Key: "更新规则模版", + API: "/api/w8t/ruleTmpl/ruleTmplUpdate", + }, "ruleTmplDelete": { Key: "删除规则模版", API: "/api/w8t/ruleTmpl/ruleTmplDelete", @@ -203,6 +207,10 @@ func PermissionsInfo() map[string]UserPermissions { Key: "创建规则模版组", API: "/api/w8t/ruleTmplGroup/ruleTmplGroupCreate", }, + "ruleTmplGroupUpdate": { + Key: "更新规则模版组", + API: "/api/w8t/ruleTmplGroup/ruleTmplGroupUpdate", + }, "ruleTmplGroupDelete": { Key: "删除规则模版组", API: "/api/w8t/ruleTmplGroup/ruleTmplGroupDelete", diff --git a/internal/repo/rule_tmpl.go b/internal/repo/rule_tmpl.go index 9c2e88c..f0a2413 100644 --- a/internal/repo/rule_tmpl.go +++ b/internal/repo/rule_tmpl.go @@ -13,6 +13,7 @@ type ( InterRuleTmplRepo interface { List(r models.RuleTemplateQuery) ([]models.RuleTemplate, error) Create(r models.RuleTemplate) error + Update(r models.RuleTemplate) error Delete(r models.RuleTemplateQuery) error } ) @@ -29,6 +30,7 @@ func newRuleTmplInterface(db *gorm.DB, g InterGormDBCli) InterRuleTmplRepo { func (rt RuleTmplRepo) List(r models.RuleTemplateQuery) ([]models.RuleTemplate, error) { var data []models.RuleTemplate db := rt.db.Model(&models.RuleTemplate{}).Where("rule_group_name = ?", r.RuleGroupName) + db.Where("type = ?", r.Type) if r.Query != "" { db.Where("rule_name LIKE ? 
OR datasource_type LIKE ?", "%"+r.Query+"%", "%"+r.Query+"%") @@ -51,6 +53,22 @@ func (rt RuleTmplRepo) Create(r models.RuleTemplate) error { return nil } +func (rt RuleTmplRepo) Update(r models.RuleTemplate) error { + u := Updates{ + Table: models.RuleTemplate{}, + Where: map[string]interface{}{ + "rule_name = ?": r.RuleName, + }, + Updates: r, + } + err := rt.g.Updates(u) + if err != nil { + return err + } + + return nil +} + func (rt RuleTmplRepo) Delete(r models.RuleTemplateQuery) error { d := Delete{ Table: models.RuleTemplate{}, diff --git a/internal/repo/rule_tmpl_group.go b/internal/repo/rule_tmpl_group.go index 900ae41..30d2b1f 100644 --- a/internal/repo/rule_tmpl_group.go +++ b/internal/repo/rule_tmpl_group.go @@ -13,6 +13,7 @@ type ( InterRuleTmplGroupRepo interface { List(r models.RuleTemplateGroupQuery) ([]models.RuleTemplateGroup, error) Create(r models.RuleTemplateGroup) error + Update(r models.RuleTemplateGroup) error Delete(r models.RuleTemplateGroupQuery) error } ) @@ -29,6 +30,7 @@ func newRuleTmplGroupInterface(db *gorm.DB, g InterGormDBCli) InterRuleTmplGroup func (rtg RuleTmplGroupRepo) List(r models.RuleTemplateGroupQuery) ([]models.RuleTemplateGroup, error) { var data []models.RuleTemplateGroup db := rtg.db.Model(&models.RuleTemplateGroup{}) + db.Where("type = ?", r.Type) if r.Query != "" { db.Where("name LIKE ? OR description LIKE ?", "%"+r.Query+"%", "%"+r.Query+"%") @@ -40,7 +42,9 @@ func (rtg RuleTmplGroupRepo) List(r models.RuleTemplateGroupQuery) ([]models.Rul for k, v := range data { var ruleCount int64 - rtg.db.Model(&models.RuleTemplate{}).Where("rule_group_name = ?", v.Name).Count(&ruleCount) + rtdb := rtg.db.Model(&models.RuleTemplate{}) + rtdb.Where("type = ?", r.Type) + rtdb.Where("rule_group_name = ?", v.Name).Count(&ruleCount) data[k].Number = int(ruleCount) } @@ -56,6 +60,22 @@ func (rtg RuleTmplGroupRepo) Create(r models.RuleTemplateGroup) error { return nil } +func (rtg RuleTmplGroupRepo) Update(r models.RuleTemplateGroup) error { + u := Updates{ + Table: models.RuleTemplateGroup{}, + Where: map[string]interface{}{ + "name = ?": r.Name, + }, + Updates: r, + } + err := rtg.g.Updates(u) + if err != nil { + return err + } + + return nil +} + func (rtg RuleTmplGroupRepo) Delete(r models.RuleTemplateGroupQuery) error { d := Delete{ Table: &models.RuleTemplateGroup{}, diff --git a/internal/services/rule_tmpl.go b/internal/services/rule_tmpl.go index 6bc5c08..d064b53 100644 --- a/internal/services/rule_tmpl.go +++ b/internal/services/rule_tmpl.go @@ -12,6 +12,7 @@ type ruleTmplService struct { type InterRuleTmplService interface { List(req interface{}) (interface{}, interface{}) Create(req interface{}) (interface{}, interface{}) + Update(req interface{}) (interface{}, interface{}) Delete(req interface{}) (interface{}, interface{}) } @@ -41,6 +42,16 @@ func (rt ruleTmplService) Create(req interface{}) (interface{}, interface{}) { return nil, nil } +func (rt ruleTmplService) Update(req interface{}) (interface{}, interface{}) { + r := req.(*models.RuleTemplate) + err := rt.ctx.DB.RuleTmpl().Update(*r) + if err != nil { + return nil, err + } + + return nil, nil +} + func (rt ruleTmplService) Delete(req interface{}) (interface{}, interface{}) { r := req.(*models.RuleTemplateQuery) err := rt.ctx.DB.RuleTmpl().Delete(*r) diff --git a/internal/services/rule_tmpl_group.go b/internal/services/rule_tmpl_group.go index a4d492f..4b622e0 100644 --- a/internal/services/rule_tmpl_group.go +++ b/internal/services/rule_tmpl_group.go @@ -12,6 +12,7 @@ type 
ruleTmplGroupService struct { type InterRuleTmplGroupService interface { List(req interface{}) (interface{}, interface{}) Create(req interface{}) (interface{}, interface{}) + Update(req interface{}) (interface{}, interface{}) Delete(req interface{}) (interface{}, interface{}) } @@ -41,6 +42,16 @@ func (rtg ruleTmplGroupService) Create(req interface{}) (interface{}, interface{ return nil, nil } +func (rtg ruleTmplGroupService) Update(req interface{}) (interface{}, interface{}) { + r := req.(*models.RuleTemplateGroup) + err := rtg.ctx.DB.RuleTmplGroup().Update(*r) + if err != nil { + return nil, err + } + + return nil, nil +} + func (rtg ruleTmplGroupService) Delete(req interface{}) (interface{}, interface{}) { r := req.(*models.RuleTemplateGroupQuery) err := rtg.ctx.DB.RuleTmplGroup().Delete(*r)
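
A minimal client-side sketch of the new ruleTmplUpdate route introduced in this diff. The URL path and JSON field names follow the RuleTmplController routes and the RuleTemplate model's json tags above; the listen address, port, and Authorization header are placeholder assumptions and depend on the actual deployment. Since RuleTmplRepo.Update matches rows on rule_name, the body reuses a rule name seeded by rule_templates.sql.

package main

import (
	"bytes"
	"fmt"
	"net/http"
)

func main() {
	// JSON keys mirror the RuleTemplate model's json tags from the diff above.
	body := []byte(`{
		"type": "Metrics",
		"ruleGroupName": "Node节点监控",
		"ruleName": "服务器CPU使用率监控",
		"datasourceType": "Prometheus",
		"evalInterval": 10,
		"forDuration": 60,
		"description": "updated via ruleTmplUpdate"
	}`)

	// Host, port, and auth header are assumptions, not values taken from the diff.
	req, err := http.NewRequest(http.MethodPost,
		"http://localhost:9001/api/w8t/ruleTmpl/ruleTmplUpdate",
		bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("Authorization", "Bearer <token>")

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	fmt.Println("ruleTmplUpdate response status:", resp.Status)
}

The ruleTmplGroupUpdate route works the same way, posting a RuleTemplateGroup body ("name", "type", "description") to /api/w8t/ruleTmplGroup/ruleTmplGroupUpdate, with the repo matching on the group's name.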