From 3fabbb5033d3923e80324817708899412b7069e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B8=A0=E7=A3=8A?= Date: Fri, 19 Apr 2024 17:50:19 +0800 Subject: [PATCH 01/30] add more rca scene.py --- common/scene.py | 12 +- handler/checker/check_handler.py | 40 ++- handler/checker/check_task.py | 5 +- handler/checker/step/data_size.py | 14 +- handler/checker/step/get_system_parameter.py | 10 +- handler/checker/step/sql.py | 23 +- handler/checker/step/ssh.py | 11 +- handler/checker/step/stepbase.py | 7 +- .../information_schema_tables_two_data.yaml | 12 + handler/gather/gather_log.py | 3 + handler/rca/rca_handler.py | 2 + handler/rca/scene/ddl_disk_full_scene.py | 8 +- handler/rca/scene/disconnection_scene.py | 2 +- handler/rca/scene/log_error_scene.py | 338 ++++++++++++++++++ handler/rca/scene/major_hold_scene.py | 2 +- 15 files changed, 439 insertions(+), 50 deletions(-) create mode 100644 handler/checker/tasks/observer/table/information_schema_tables_two_data.yaml create mode 100644 handler/rca/scene/log_error_scene.py diff --git a/common/scene.py b/common/scene.py index 044445a0..735d38b0 100644 --- a/common/scene.py +++ b/common/scene.py @@ -17,7 +17,8 @@ """ from common.ssh import SshHelper from common.tool import StringUtils -from common.command import get_observer_version, get_obproxy_version +from common.command import get_observer_version, get_obproxy_version, get_observer_version_by_sql + def filter_by_version(scene, cluster, stdio=None): try: @@ -59,14 +60,19 @@ def filter_by_version(scene, cluster, stdio=None): stdio.exception("filter_by_version Exception : {0}".format(e)) raise Exception("filter_by_version Exception : {0}".format(e)) -def get_version(nodes, type, stdio=None): +def get_version(nodes, type,cluster, stdio=None): try: if len(nodes) < 1: raise Exception("input nodes is empty, please check your config") node = nodes[0] ssh = SshHelper(True, node.get("ip"), node.get("ssh_username"), node.get("ssh_password"), node.get("ssh_port"), node.get("ssh_key_file"), node) + version = "" if type == "observer": - version = get_observer_version(True, ssh, nodes[0]["home_path"], stdio) + try: + version = get_observer_version_by_sql(cluster,stdio) + except Exception as e: + stdio.warn("get observer version by sql fail, use node ssher to get. 
Exception:{0}".format(e)) + version = get_observer_version(True, ssh, nodes[0]["home_path"], stdio) elif type == "obproxy": version = get_obproxy_version(True, ssh, nodes[0]["home_path"], stdio) return version diff --git a/handler/checker/check_handler.py b/handler/checker/check_handler.py index 5d4152b8..c7477a52 100644 --- a/handler/checker/check_handler.py +++ b/handler/checker/check_handler.py @@ -18,6 +18,9 @@ import os import yaml + +from common.ob_connector import OBConnector +from common.ssh import SshHelper from handler.checker.check_exception import CheckException from handler.checker.check_report import TaskReport, CheckReport, CheckrReportException from handler.checker.check_task import TaskBase @@ -87,6 +90,41 @@ def __init__(self, context, check_target_type="observer"): # input_param self.options=self.context.options + # add ssher + new_node=[] + for node in self.nodes: + # add ssher + ssher = None + try: + ssher = SshHelper(True, node.get("ip"), + node.get("ssh_username"), + node.get("ssh_password"), + node.get("ssh_port"), + node.get("ssh_key_file"), + node) + except Exception as e: + self.stdio.warn("StepBase get SshHelper fail on{0} ,Exception: {1}".format(node.get("ip"), e)) + node["ssher"] = ssher + new_node.append(node) + self.nodes=new_node + self.version=get_version(self.nodes, self.check_target_type,self.cluster, self.stdio) + + # add OBConnector + obConnector = None + try: + if self.cluster is not None: + obConnector=OBConnector(ip=self.cluster.get("db_host"), + port=self.cluster.get("db_port"), + username=self.cluster.get("tenant_sys").get("user"), + password=self.cluster.get("tenant_sys").get("password"), + stdio=self.stdio, + timeout=10000) + except Exception as e: + self.stdio.warn("obConnector init error. Error info is {0}".format(e)) + finally: + self.context.set_variable('check_obConnector', obConnector) + + def handle(self): try: package_name = None @@ -173,7 +211,7 @@ def execute_one(self, task_name): # Verify if the version is within a reasonable range report = TaskReport(self.context,task_name) if not self.ignore_version: - version = get_version(self.nodes, self.check_target_type, self.stdio) + version = self.version if version: self.cluster["version"] = version self.stdio.verbose("cluster.version is {0}".format(self.cluster["version"])) diff --git a/handler/checker/check_task.py b/handler/checker/check_task.py index bcd357b4..9605bfcf 100644 --- a/handler/checker/check_task.py +++ b/handler/checker/check_task.py @@ -15,7 +15,6 @@ @file: check_task.py @desc: """ - from handler.checker.check_exception import StepResultFailException, \ StepExecuteFailException, StepResultFalseException, TaskException from handler.checker.step.stepbase import StepBase @@ -40,13 +39,15 @@ def execute(self): self.stdio.verbose("task_base execute") steps_nu = filter_by_version(self.task, self.cluster, self.stdio) if steps_nu < 0: - self.stdio.warn("Unadapted by version. SKIP") + self.stdio.warn("{0} Unadapted by version. SKIP".format(self.task['name'])) self.report.add("Unadapted by version. 
SKIP", "warning") return "Unadapted by version.SKIP" self.stdio.verbose("filter_by_version is return {0}".format(steps_nu)) if len(self.nodes) == 0: raise Exception("node is not exist") + for node in self.nodes: + self.stdio.verbose("run task in node: {0}".format(StringUtils.node_cut_passwd_for_log(node))) steps = self.task[steps_nu] nu = 1 diff --git a/handler/checker/step/data_size.py b/handler/checker/step/data_size.py index c93a3e9d..b8d0ff0c 100644 --- a/handler/checker/step/data_size.py +++ b/handler/checker/step/data_size.py @@ -34,18 +34,14 @@ def __init__(self,context, step, node, task_variable_dict): self.task_variable_dict = task_variable_dict try: - is_ssh = True - self.ssh_helper = SshHelper(is_ssh, node.get("ip"), - node.get("ssh_username"), - node.get("ssh_password"), - node.get("ssh_port"), - node.get("ssh_key_file"), - node) + self.ssh_helper=self.node["ssher"] + if self.ssh_helper is None: + raise Exception("self.ssh_helper is None.") except Exception as e: self.stdio.error( - "GetSystemParameterHandler ssh init fail . Please check the NODES conf Exception : {0} .".format(e)) + "DataSizeHandler ssh init fail . Please check the NODES conf Exception : {0} .".format(e)) raise Exception( - "GetSystemParameterHandler ssh init fail . Please check the NODES conf Exception : {0} .".format(e)) + "DataSizeHandler ssh init fail . Please check the NODES conf Exception : {0} .".format(e)) # step report self.parameter = [] diff --git a/handler/checker/step/get_system_parameter.py b/handler/checker/step/get_system_parameter.py index 8ecbc6d4..af3341c1 100644 --- a/handler/checker/step/get_system_parameter.py +++ b/handler/checker/step/get_system_parameter.py @@ -34,13 +34,9 @@ def __init__(self,context, step, node, task_variable_dict): self.task_variable_dict = task_variable_dict try: - is_ssh = True - self.ssh_helper = SshHelper(is_ssh, node.get("ip"), - node.get("ssh_username"), - node.get("ssh_password"), - node.get("ssh_port"), - node.get("ssh_key_file"), - node) + self.ssh_helper=self.node["ssher"] + if self.ssh_helper is None: + raise Exception("self.ssh_helper is None.") except Exception as e: self.stdio.error( "GetSystemParameterHandler ssh init fail . Please check the NODES conf Exception : {0} .".format(e)) diff --git a/handler/checker/step/sql.py b/handler/checker/step/sql.py index 7b44d86d..c55c50a7 100644 --- a/handler/checker/step/sql.py +++ b/handler/checker/step/sql.py @@ -23,24 +23,21 @@ class StepSQLHandler: - def __init__(self,context, step, ob_cluster, task_variable_dict): + def __init__(self,context, step, task_variable_dict): try: self.context = context self.stdio = context.stdio - self.ob_cluster = ob_cluster - self.ob_cluster_name = ob_cluster.get("cluster_name") + self.ob_cluster = self.context.cluster_config + self.ob_cluster_name = self.ob_cluster.get("cluster_name") self.tenant_mode = None self.sys_database = None self.database = None - self.ob_connector = OBConnector(ip=ob_cluster.get("db_host"), - port=ob_cluster.get("db_port"), - username=ob_cluster.get("tenant_sys").get("user"), - password=ob_cluster.get("tenant_sys").get("password"), - stdio=self.stdio, - timeout=10000) + self.ob_connector=self.context.get_variable('check_obConnector') + if self.ob_connector is None: + raise Exception("self.ob_connector is None.") except Exception as e: - self.stdio.error("StepSQLHandler init fail. Please check the OBCLUSTER conf. OBCLUSTER: {0} Exception : {1} .".format(ob_cluster,e)) - raise Exception("StepSQLHandler init fail. Please check the OBCLUSTER conf. 
OBCLUSTER: {0} Exception : {1} .".format(ob_cluster,e)) + self.stdio.error("StepSQLHandler init fail. Please check the OBCLUSTER conf. Exception : {0} .".format(e)) + raise Exception("StepSQLHandler init fail. Please check the OBCLUSTER conf. Exception : {0} .".format(e)) self.task_variable_dict = task_variable_dict self.enable_dump_db = False self.trace_id = None @@ -73,8 +70,8 @@ def execute(self): self.stdio.verbose("sql execute update task_variable_dict: {0} = {1}".format(self.step["result"]["set_value"], Util.convert_to_number(data))) self.task_variable_dict[self.step["result"]["set_value"]] = Util.convert_to_number(data) except Exception as e: - self.stdio.error("StepSQLHandler execute Exception: {0}".format(e).strip()) - raise StepExecuteFailException("StepSQLHandler execute Exception: {0}".format(e).strip()) + self.stdio.error("StepSQLHandler execute Exception: {0}".format(e)) + raise StepExecuteFailException("StepSQLHandler execute Exception: {0}".format(e)) def update_step_variable_dict(self): return self.task_variable_dict diff --git a/handler/checker/step/ssh.py b/handler/checker/step/ssh.py index 963cd19f..282477e2 100644 --- a/handler/checker/step/ssh.py +++ b/handler/checker/step/ssh.py @@ -18,7 +18,6 @@ from handler.checker.check_exception import StepExecuteFailException from handler.checker.check_report import TaskReport -from common.ssh import SshHelper from common.tool import StringUtils from common.tool import Util @@ -32,13 +31,9 @@ def __init__(self,context, step, node, task_variable_dict): self.step = step self.node = node try: - is_ssh = True - self.ssh_helper = SshHelper(is_ssh, node.get("ip"), - node.get("ssh_username"), - node.get("ssh_password"), - node.get("ssh_port"), - node.get("ssh_key_file"), - node) + self.ssh_helper=self.node["ssher"] + if self.ssh_helper is None: + raise Exception("self.ssh_helper is None.") except Exception as e: self.stdio.error( "SshHandler init fail. Please check the NODES conf. node: {0}. 
Exception : {1} .".format(node, e))
diff --git a/handler/checker/step/stepbase.py b/handler/checker/step/stepbase.py
index a3351211..3ec68d8e 100644
--- a/handler/checker/step/stepbase.py
+++ b/handler/checker/step/stepbase.py
@@ -49,9 +49,8 @@ def execute(self, report):
                     self.task_variable_dict["remote_ip"] = \
                         docker.from_env().containers.get(self.node["container_name"]).attrs['NetworkSettings']['Networks']['bridge']["IPAddress"]
-            for key in self.node:
-                self.task_variable_dict["remote_{0}".format(key)] = self.node[key]
-
+            for node in self.node:
+                self.task_variable_dict["remote_{0}".format(node)] = self.node[node]
             if "type" not in self.step:
                 raise StepExecuteFailException("Missing field :type")
             if self.step["type"] == "get_system_parameter":
@@ -59,7 +58,7 @@ def execute(self, report):
             elif self.step["type"] == "ssh":
                 handler = SshHandler(self.context, self.step, self.node, self.task_variable_dict)
             elif self.step["type"] == "sql":
-                handler = StepSQLHandler(self.context, self.step, self.cluster, self.task_variable_dict)
+                handler = StepSQLHandler(self.context, self.step, task_variable_dict=self.task_variable_dict)
             elif self.step["type"] == "data_size":
                 handler = DataSizeHandler(self.context, self.step, self.cluster, self.task_variable_dict)
             else:
diff --git a/handler/checker/tasks/observer/table/information_schema_tables_two_data.yaml b/handler/checker/tasks/observer/table/information_schema_tables_two_data.yaml
new file mode 100644
index 00000000..f6bce02c
--- /dev/null
+++ b/handler/checker/tasks/observer/table/information_schema_tables_two_data.yaml
@@ -0,0 +1,12 @@
+info: 'A table has two records in information_schema.tables. KBA-50000056'
+task:
+  - version: "[4.0.0.0,*]"
+    steps:
+      - type: sql
+        sql: 'select count(0) from oceanbase.__all_virtual_table_stat where table_id = partition_id and (tenant_id,table_id) in (select tenant_id, table_id from oceanbase.__all_virtual_table where part_level != 0);'
+        result:
+          set_value: err_table_count
+          # report_type: warning
+          verify_type: equal
+          verify: 0
+          err_msg: 'Found table(s) with two records in information_schema.tables. The number of affected tables (err_table_count) is: #{err_table_count}. Get more information with "select * from oceanbase.__all_virtual_table_stat where table_id = partition_id and (tenant_id,table_id) in (select tenant_id, table_id from oceanbase.__all_virtual_table where part_level != 0);". You can fix it with "delete from __all_table_stat where table_id=partition_id and table_id=${partition table table_id};" and "delete from __all_column_stat where table_id=partition_id and table_id=${partition table table_id};".'
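For readers unfamiliar with the checker task format, the sql step above runs the query, binds the count to err_table_count, and reports err_msg whenever the equal-type verify value (0) is not matched. A rough standalone sketch of that flow, assuming a plain pymysql connection to the sys tenant (the run_check helper and connection details are illustrative, not obdiag's API):

```python
# Hypothetical sketch (not obdiag's API): run the task's SQL, bind the count
# to err_table_count, and report when it differs from the verify value (0).
import pymysql

def run_check(conn_conf, sql, expected):
    conn = pymysql.connect(**conn_conf)
    try:
        with conn.cursor() as cursor:
            cursor.execute(sql)
            (err_table_count,) = cursor.fetchone()
    finally:
        conn.close()
    # verify_type: equal -> the check passes only when the value matches.
    if err_table_count != expected:
        print("err_table_count is: {0}".format(err_table_count))

SQL = ("select count(0) from oceanbase.__all_virtual_table_stat "
       "where table_id = partition_id and (tenant_id,table_id) in "
       "(select tenant_id, table_id from oceanbase.__all_virtual_table "
       "where part_level != 0);")

# Connection details are placeholders for a sys-tenant connection.
run_check({"host": "127.0.0.1", "port": 2881, "user": "root@sys"}, SQL, 0)
```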
diff --git a/handler/gather/gather_log.py b/handler/gather/gather_log.py index ee640f4f..d5a1174f 100644 --- a/handler/gather/gather_log.py +++ b/handler/gather/gather_log.py @@ -35,6 +35,7 @@ class GatherLogHandler(BaseShellHandler): def __init__(self, context, gather_pack_dir='./', is_scene=False): super(GatherLogHandler, self).__init__() + self.pack_dir_this_command = "" self.context = context self.stdio = context.stdio self.is_ssh = True @@ -168,6 +169,7 @@ def handle_from_node(node): summary_tuples = self.__get_overall_summary(gather_tuples, self.zip_encrypt) self.stdio.print(summary_tuples) + self.pack_dir_this_command=pack_dir_this_command # Persist the summary results to a file FileUtil.write_append(os.path.join(pack_dir_this_command, "result_summary.txt"), summary_tuples) last_info = "For result details, please run cmd \033[32m' cat {0} '\033[0m\n".format(os.path.join(pack_dir_this_command, "result_summary.txt")) @@ -333,6 +335,7 @@ def __pharse_log(self, ssh_helper, home_path, log_name, gather_path): """ log_path = os.path.join(home_path, "log") if self.grep_options is not None: + grep_cmd="" if type(self.grep_options) == str: grep_cmd = "grep -e '{grep_options}' {log_dir}/{log_name} >> {gather_path}/{log_name} ".format( grep_options=self.grep_options, diff --git a/handler/rca/rca_handler.py b/handler/rca/rca_handler.py index 92f423b3..3a004891 100644 --- a/handler/rca/rca_handler.py +++ b/handler/rca/rca_handler.py @@ -293,6 +293,8 @@ def add_record(self, record): def add_suggest(self, suggest): self.suggest += suggest + def suggest_is_empty(self): + return self.suggest == "The suggest: " def export_suggest(self): return self.suggest diff --git a/handler/rca/scene/ddl_disk_full_scene.py b/handler/rca/scene/ddl_disk_full_scene.py index 89095f80..b45db4ac 100644 --- a/handler/rca/scene/ddl_disk_full_scene.py +++ b/handler/rca/scene/ddl_disk_full_scene.py @@ -12,7 +12,7 @@ """ @time: 2024/04/01 -@file: ddl_disk_full.py +@file: ddl_disk_full_scene.py @desc: """ import re @@ -69,6 +69,9 @@ def init(self, context): tenant_data = self.ob_connector.execute_sql( "select tenant_id from oceanbase.__all_tenant where tenant_name = '{0}';".format(tenant_name)) + if len(tenant_data) == 0: + raise RCAInitException( + "can not find tenant id by tenant name: {0}. Please check the tenant name.".format(tenant_name)) self.tenant_id = tenant_data[0][0] if self.tenant_id is None: raise RCAInitException( @@ -76,6 +79,9 @@ def init(self, context): table_id_data = self.ob_connector.execute_sql( "select table_id from oceanbase.__all_virtual_table where table_name = '{0}';".format(table_name)) + if len(table_id_data) == 0: + raise RCAInitException( + "can not find table id by table name: {0}. 
Please check the table name.".format(table_name)) self.table_id = table_id_data[0][0] if self.table_id is None: raise RCAInitException( diff --git a/handler/rca/scene/disconnection_scene.py b/handler/rca/scene/disconnection_scene.py index 1ac6d6a2..754224a0 100644 --- a/handler/rca/scene/disconnection_scene.py +++ b/handler/rca/scene/disconnection_scene.py @@ -12,7 +12,7 @@ """ @time: 2024/03/11 -@file: disconnectionScene.py +@file: disconnection_scene.py @desc: """ import re diff --git a/handler/rca/scene/log_error_scene.py b/handler/rca/scene/log_error_scene.py new file mode 100644 index 00000000..0973db65 --- /dev/null +++ b/handler/rca/scene/log_error_scene.py @@ -0,0 +1,338 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -* +# Copyright (c) 2022 OceanBase +# OceanBase Diagnostic Tool is licensed under Mulan PSL v2. +# You can use this software according to the terms and conditions of the Mulan PSL v2. +# You may obtain a copy of Mulan PSL v2 at: +# http://license.coscl.org.cn/MulanPSL2 +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, +# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, +# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +# See the Mulan PSL v2 for more details. + +""" +@time: 2024/04/16 +@file: no_leader_scene.py +@desc: +""" +import os +import re + +from handler.rca.rca_exception import RCAInitException, RCAExecuteException +from handler.rca.rca_handler import RcaScene, RCA_ResultRecord +from common.tool import StringUtils + + +class LogErrorScene(RcaScene): + def __init__(self): + super().__init__() + self.all_tenant_election_leader_info = None + self.work_path = None + self.all_tenant_ids = None + + def init(self, context): + super().init(context) + ## observer version≥4.0.0.0 + observer_version = self.observer_version + if observer_version is None or len(observer_version.strip()) == 0: + raise RCAInitException("observer version is None. Please check the NODES conf.") + if not (observer_version == "4.0.0.0" or StringUtils.compare_versions_greater(observer_version, "4.0.0.0")): + self.stdio.error("observer version is {0}, which is less than 4.0.0.0.".format(observer_version)) + raise RCAInitException("observer version is {0}, which is less than 4.0.0.0.".format(observer_version)) + # if self.ob_connector is None: + # raise RCAInitException("ob_connector is None. Please check the NODES conf.") + self.verbose("observer version is {0}.".format(observer_version)) + if self.ob_connector is None: + raise RCAInitException("ob_connector is None. 
Please check the NODES conf.") + + def verbose(self, info): + self.stdio.verbose("[NoLeaderScene] {0}".format(info)) + + def execute(self): + try: + if self.observer_version >= '4.2.1.0': + self.execute_421() + return + # check Election leader + # get_all_tenant_id + self.verbose("start to get all tenant id...") + sql = "select tenant_id from oceanbase.__all_tenant;" + tenant_ids = self.ob_connector.execute_sql(sql) + if len(tenant_ids) <= 0: + raise RCAExecuteException("can not find any tenant id") + self.all_tenant_election_leader_info = {} + for tenant_id_data in tenant_ids: + record = RCA_ResultRecord() + try: + tenant_id_data = tenant_id_data[0] + + record.add_record("tenant_id:{0}.".format(tenant_id_data)) + self.execute_by_tenant_id(tenant_id_data, record) + except Exception as e: + self.verbose("check election leader error,tenant_id:{0},error:{1}".format(tenant_id_data, e)) + continue + finally: + if len(record.suggest) == 13: + record.add_suggest("no suggest") + self.Result.records.append(record) + except Exception as e: + self.stdio.error("NoLeaderScene execute Exception:{0}".format(e)) + + def execute_by_tenant_id(self, tenant_id, record): + try: + record.add_record("start step1") + election_leader_info = self.check_election_leader_by_tenant_id(tenant_id) + self.verbose("election_leader_info:{0}".format(election_leader_info)) + record.add_record("election_leader_info:{0}".format(election_leader_info)) + if election_leader_info == "": + self.verbose("can not find any election leader,tenant_id:{0}".format(tenant_id)) + record.add_record("election_leader_info is null") + record.add_suggest("can not find any election leader,tenant_id:{0}. Please check it.".format(tenant_id)) + return + record.add_record("start step2") + step_next_tag = True + ls_ids = self.ob_connector.execute_sql( + "select distinct (ls_id) from oceanbase.__all_virtual_log_stat where tenant_id={0};".format(tenant_id)) + if ls_ids is None or len(ls_ids) <= 0: + self.stdio.warn("not found log about election_leader. tenant_id: {0}".format(tenant_id)) + record.add_suggest( + "not found log on oceanbase.__all_virtual_log_stat. tenant_id: {0}".format(tenant_id)) + return + + for ls_id in ls_ids: + ls_id = ls_id[0] + leader_ls_id_bool = self.ob_connector.execute_sql( + 'select count(0) from oceanbase.__all_virtual_log_stat where role="LEADER" and tenant_id={0} and ls_id="{1}";'.format( + tenant_id, ls_id)) + leader_ls_id_bool = leader_ls_id_bool[0] + if leader_ls_id_bool <= 0: + record.add_record( + "tenant_id: {0}, ls_id: {1} on oceanbase.__all_virtual_log_stat no LEADER".format(tenant_id, + ls_id)) + record.add_suggest( + "tenant_id: {0}, ls_id: {1} on oceanbase.__all_virtual_log_stat no LEADER".format(tenant_id, + ls_id)) + self.stdio.warn( + "tenant_id: {0}, ls_id: {1} on oceanbase.__all_virtual_log_stat no LEADER".format(tenant_id, + ls_id)) + step_next_tag = False + + if step_next_tag is False: + self.verbose("step_next_tag is false") + return + record.add_record("start step3") + return + + + except Exception as e: + self.stdio.warn("execute_by_tenant_id:{0} Exception:{1}".format(tenant_id, e)) + + def execute_421(self): + try: + self.stdio.print("start execute_421") + if self.ob_connector is None: + self.stdio.error("ob_connector is None. 
please check conf") + return + # get data from __all_virtual_ha_diagnose + sql = "select * from oceanbase.__all_virtual_ha_diagnose;" + cursor = self.ob_connector.execute_sql_return_cursor_dictionary(sql) + diagnose_data = cursor.fetchall() + if diagnose_data is None or len(diagnose_data) <= 0: + self.stdio.warn("not found data on oceanbase.__all_virtual_ha_diagnose") + return + # get all tenant_id + tenant_ids = [] + for data in diagnose_data: + tenant_ids.append(data["tenant_id"]) + self.verbose("tenant_ids:{0}".format(tenant_ids)) + # step1 + ### tenant_diagnose_data: [tenant_id] diagnose_data + tenant_diagnose_data = {} + for data in diagnose_data: + if tenant_diagnose_data.get(data["tenant_id"]) is None: + tenant_diagnose_data[data["tenant_id"]] = [data] + else: + tenant_data = tenant_diagnose_data.get(data["tenant_id"]) + tenant_data.append(data) + tenant_diagnose_data[data["tenant_id"]] = tenant_data + self.verbose("tenant_diagnose_data:{0}".format(tenant_diagnose_data)) + self.stdio.start_loading("no_leader scene start analyzing...") + for tenant_id in tenant_diagnose_data: + record_one_tenant=self.execute_421_no_leader_by_tenant_id(tenant_id, tenant_diagnose_data[tenant_id]) + self.Result.records.append(record_one_tenant) + self.stdio.stop_loading('no_leader scene end') + return + + except Exception as e: + raise RCAExecuteException("execute_421 execute error: {0}".format(e)) + + def execute_421_no_leader_by_tenant_id(self, tenant_id,diagnose_data): + record = RCA_ResultRecord() + try: + self.stdio.verbose("start execute_421_no_leader_by_tenant_id") + record.add_record("tenant_id: {0}.".format(tenant_id)) + leader_nu={} + record.add_record("start step1") + for diagnose_data_by_tenant_id in diagnose_data: + if diagnose_data_by_tenant_id["election_role"].upper() == "LEADER": + leader_nu[diagnose_data_by_tenant_id["ls_id"]] = leader_nu.get( + diagnose_data_by_tenant_id["ls_id"], 0) + 1 + else: + leader_nu[diagnose_data_by_tenant_id["ls_id"]] = leader_nu.get( + diagnose_data_by_tenant_id["ls_id"], 0) + record.add_record("all ls_id:{0}".format(list(leader_nu.keys()))) + self.verbose("all ls_id:{0}".format(list(leader_nu.keys()))) + scene_1_tag=True + for ls_id in leader_nu: + record.add_record("on ls_id: {1} ".format(tenant_id, ls_id)) + self.verbose("on tenant_id: {0}, ls_id: {1} ".format(tenant_id, ls_id)) + if leader_nu[ls_id] > 1: + self.stdio.warn("the leader number > 1") + record.add_record("the ls_id's leader number > 1") + record.add_suggest( + "tenant_id: {0}, ls_id: {1} .the ls_id's leader number > 1".format(tenant_id, ls_id)) + scene_1_tag = False + continue + elif leader_nu[ls_id] == 0: + self.stdio.warn( + "the leader number = 0,The election layer is unable to select a new owner, and a common problem in this scenario is that the message delay is too large. You can continue to troubleshoot the problem of message delay or backlog in the log") + record.add_suggest( + "tenant_id: {0}, ls_id: {1} .the leader number = 0. The election layer is unable to select a new owner, and a common problem in this scenario is that the message delay is too large. You can continue to troubleshoot the problem of message delay or backlog in the log".format( + tenant_id, ls_id)) + scene_1_tag = False + continue + else: + ## Normal + self.verbose("Normal. The ls_id's leader number = 1") + record.add_record("Normal. 
The ls_id's leader number = 1") + + if scene_1_tag is False: + self.verbose("scene_1 is check") + return record + + ## scene 2 + record.add_record("start step2") + scene_2_tag = True + for tenant_diagnose_data_by_tenant_id in diagnose_data: + ls_id = tenant_diagnose_data_by_tenant_id["ls_id"] + record.add_record("on ls_id: {1} ".format(tenant_id, ls_id)) + if tenant_diagnose_data_by_tenant_id["election_role"].upper() == "LEADER" and \ + tenant_diagnose_data_by_tenant_id["palf_role"].upper() != "LEADER" and \ + tenant_diagnose_data_by_tenant_id["palf_state"].upper() != "ACTIVE": + self.stdio.warn( + "tenant_id: {0}, ls_id: {1} on oceanbase.__all_virtual_ha_diagnose election_role is LEADER but palf_role is {2} and palf_state is {3}".format( + tenant_id, + ls_id, + tenant_diagnose_data_by_tenant_id["palf_role"], + tenant_diagnose_data_by_tenant_id["palf_state"])) + record.add_record( + "tenant_id: {0}, ls_id: {1} on oceanbase.__all_virtual_ha_diagnose election_role is LEADER but palf_role is {2} and palf_state is {3}".format( + tenant_id, + ls_id, + tenant_diagnose_data_by_tenant_id["palf_role"], + tenant_diagnose_data_by_tenant_id["palf_state"])) + record.add_suggest( + "tenant_id: {0}, ls_id: {1} on oceanbase.__all_virtual_ha_diagnose election_role is LEADER but palf_role is {2} and palf_state is {3}. The newly elected leader failed to take office in the palf layer, and the palf_state can be used to determine at which stage the palf failed to take office.".format( + tenant_id, + ls_id, + tenant_diagnose_data_by_tenant_id["palf_role"], + tenant_diagnose_data_by_tenant_id["palf_state"])) + scene_2_tag = False + else: + self.verbose( + "tenant_id: {0}, ls_id: {1} on oceanbase.__all_virtual_ha_diagnose election_role is LEADER , palf_role is {2} and palf_state is {3}".format( + tenant_id, + ls_id, + tenant_diagnose_data_by_tenant_id["palf_role"], + tenant_diagnose_data_by_tenant_id["palf_state"])) + record.add_record("Normal. Unable to find a replica where both election_role and palf_role are leaders, but log_handler_role is follower") + continue + if scene_2_tag is False: + self.verbose("scene_2 is check") + return + ## scene 3 + record.add_record("start step3") + + for tenant_diagnose_data_by_tenant_id in diagnose_data: + record.add_record( + "tenant_id: {0}, ls_id: {1} ".format(tenant_diagnose_data_by_tenant_id["tenant_id"], + tenant_diagnose_data_by_tenant_id["ls_id"])) + if tenant_diagnose_data_by_tenant_id["election_role"].upper() == "LEADER" and \ + tenant_diagnose_data_by_tenant_id["palf_role"].upper() == "LEADER" and \ + tenant_diagnose_data_by_tenant_id["log_handler_role"].upper() == "follower": + record.add_record("election_role:LEADER , palf_role: LEADER, log_handler_role: follower") + log_handler_takeover_state = tenant_diagnose_data_by_tenant_id[ + "log_handler_takeover_state"].lower() + record.add_record("log_handler_takeover_state: {0}".format(log_handler_takeover_state)) + if log_handler_takeover_state == "wait_replay_done": + record.add_suggest( + "Previous stuck waiting for replay steps. 
Please check the issue about replay") + elif log_handler_takeover_state == "unknown": + record.add_suggest( + "Please check observe whether the remaining log streams of this tenant also have the issue of log handler failure in taking over") + elif log_handler_takeover_state == "wait_rc_handler_done": + log_handler_takeover_log_type = tenant_diagnose_data_by_tenant_id[ + "log_handler_takeover_log_type"] + record.add_record( + "log_handler_takeover_log_type: {0}".format(log_handler_takeover_log_type)) + record.add_suggest( + "log_handler_takeover_log_type is {0}. Please report oceanbase's community".format( + log_handler_takeover_log_type)) + else: + record.add_record("Normal.Unable to find a replica where the selection_role is a leader, but the palf_role and palf_state are not leaders or active, respectively") + + if record.suggest_is_empty(): + record.add_suggest("Normal. Not find the reason of the log handler failure in taking over.") + except Exception as e: + raise RCAExecuteException("tenant_id: {0}. execute_421_no_leader_by_tenant_id execute error: {1}".format(tenant_id,e)) + finally: + + return record + + + + + def check_election_leader_by_tenant_id(self, tenant_id): + try: + self.stdio.verbose("start check_election_leader_by_tenant_id") + self.gather_log.set_parameters("scope", "election") + self.gather_log.grep("T{0}_.*dump proposer info".format(tenant_id)) + self.work_path = self.store_dir + logs_name = self.gather_log.execute() + if len(logs_name) == 0: + self.stdio.warn( + "check_election_leader_by_tenant_id not found log about election_leader. tenant_id: {0}".format( + tenant_id)) + return "" + self.stdio.verbose( + "check_election_leader_by_tenant_id tenant_id: {0}, logs_name:{1}".format(tenant_id, logs_name)) + for name in logs_name: + self.stdio.verbose("read the log file: {0}".format(name)) + with open(name, 'rb') as file: + file.seek(0, os.SEEK_END) + file_length = file.tell() + file.seek(max(file_length - 1024, 0), 0) + lines = file.readlines() + last_line = lines[-1].decode().strip() + pattern = r'addr:"([^"]+)"' + match = re.search(pattern, last_line) + if match: + return match.group(1) + else: + return "" + except Exception as e: + raise RCAExecuteException( + "check_election_leader_by_tenant_id: {1}. execute error: {0}".format(e, tenant_id)) + + def export_result(self): + super().export_result() + + def get_scene_info(self): + + return {"name": "log_error", + "info_en": "Troubleshooting log related issues. 
Currently supported scenes: no_leader.", + "info_cn": '日志相关问题排查。目前支持:无主场景。', + } + + +log_error = LogErrorScene() \ No newline at end of file diff --git a/handler/rca/scene/major_hold_scene.py b/handler/rca/scene/major_hold_scene.py index dee2ad5a..7ebfb93d 100644 --- a/handler/rca/scene/major_hold_scene.py +++ b/handler/rca/scene/major_hold_scene.py @@ -12,7 +12,7 @@ """ @time: 2024/1/2 -@file: major_hold.py +@file: major_hold_scene.py @desc: """ import json From 54a44f1fb93248e6234ad949e41ec6dd52a9643f Mon Sep 17 00:00:00 2001 From: Teingi Date: Mon, 22 Apr 2024 20:05:15 +0800 Subject: [PATCH 02/30] update README.md --- README-CN.md | 57 ++++++++++++++++++++++++++-------------------------- README.md | 55 +++++++++++++++++++++++++------------------------- 2 files changed, 57 insertions(+), 55 deletions(-) diff --git a/README-CN.md b/README-CN.md index d14ecbcc..32e1cc0c 100644 --- a/README-CN.md +++ b/README-CN.md @@ -62,49 +62,50 @@ obdiag config -h -u [-p password] [-P port] ``` # obdiag 功能 +- 一键集群巡检 +- 一键诊断分析 +- 一键根因分析 +- 一键信息采集 -## obdiag 巡检功能 -- [一键巡检](./docs/check.md) -## obdiag 一键场景化信息采集功能 -- [一键场景化信息采集](./docs/gather_scene.md) +# 参与贡献 -## obdiag 一键信息采集功能 +obdiag 期望构建一个开放的社区,我们欢迎任何形式的贡献,您可以: -- [一键收集OB日志](./docs/gather_ob_log.md) -- [一键收集AWR报告](./docs/gather_awr.md) -- [一键收集主机信息](./docs/gather_sysstat.md) -- [一键收集slog/clog日志](./docs/gather_admin.md) -- [一键收集火焰图信息](./docs/gather_perf.md) -- [一键收集OB堆栈信息](./docs/gather_ob_stack.md) -- [一键收集并行SQL的执行详情信息](./docs/gather_sql_plan_monitor.md) -- [一键收集OBPROXY日志](./docs/gather_obproxy_log.md) -- [一键收集AWR报告](./docs/gather_awr.md) -- [一键收集全部诊断信息](./docs/gather_all.md) +- 向我们提出一个[issue](https://github.com/oceanbase/oceanbase-diagnostic-tool/issues)。 +- 提交请求。 + +- 通过 [Issues](https://github.com/oceanbase/oceanbase-diagnostic-tool/issues) 提交 bug。 +- 通过 [Discussion](https://github.com/oceanbase/oceanbase-diagnostic-tool/discussions) 参与或发起讨论。 +- 通过 [Pull requests](https://github.com/oceanbase/oceanbase-diagnostic-tool/pulls) 提交问题修复或者功能特性。 +- 加入 obdiag [SIG(兴趣小组)](https://oceanbase.yuque.com/org-wiki-obtech-vh7w9r/imzr6c/ixh8wb9d356at3hm) -## obdiag 一键分析功能说明 -- [一键分析OB日志](./docs/analyze_ob_log.md) -- [一键全链路诊断](./docs/analyze_flt_trace.md) +# Roadmap Ahead + +| 版本 | 迭代周期 | Function | +|---------|---------------|---------| +|1.6.0| 2024.01|
 • 场景化信息采集<br> • 场景化根因分析 |
|2.0.0|2024.03| • context改造,场景化扩展能力增强<br> • 支持在线更新巡检、采集的task<br> • 根因分析二期 |
|2.1.0|2024.04| • 根因分析场景扩展<br> • 新增 tabledump 采集 |
|2.2.0|2024.05| • 根因分析场景扩展 |
|2.3.0|2024.06| • 根因分析场景扩展<br> • 支持 SQL 诊断 |
|2.4.0|2024.07| • 根因分析场景扩展<br> • 适配两款内核的诊断工具 |
|2.5.0|2024.08| • 根因分析场景扩展<br> • 支持 OMS 诊断 |
|3.0.0|2024.09| • 根因分析场景扩展<br> • 服务化改造 |
|3.1.0|2024.10| • 根因分析场景扩展<br> • 支持巡检报告比对 |
|3.2.0|2024.11| • 根因分析场景扩展<br> • SQL 诊断二期,支持SQL问题的根因分析 |
|3.3.0|2024.12| • AI 化探索 |

# 许可证

OceanBase Diagnostic Tool 使用 [MulanPSL - 2.0](http://license.coscl.org.cn/MulanPSL2) 许可证。
您可以免费复制及使用源代码。当您修改或分发源代码时,请遵守木兰协议。
-
-## 贡献
-
-我们热烈欢迎并高度赞赏您的贡献。您可以通过以下几种方式做出贡献:
-
-- 向我们提出一个[issue](https://github.com/oceanbase/oceanbase-diagnostic-tool/issues)。
-- 提交请求。
-
 ## 支持

 如果您在使用 obdiag 时遇到任何问题,欢迎联系我们寻求帮助:

 - [GitHub Issue](https://github.com/oceanbase/oceanbase-diagnostic-tool/issues)
 - [官方网站](https://www.oceanbase.com/docs/obdiag-cn)
+- obdiag SIG 微信号: obdiagsig
\ No newline at end of file
diff --git a/README.md b/README.md
index f5164359..ed373e1a 100644
--- a/README.md
+++ b/README.md
@@ -64,48 +64,49 @@ obdiag config -h <db_host> -u <sys_user> [-p password] [-P port]
 ```
 # obdiag Functions
+- One-click cluster inspection
+- One-click diagnostic analysis
+- One-click root cause analysis
+- One-click information collection

-## obdiag check Fuctions
-- [check](./docs/check.md)
+For more details, please refer to [Official docs](https://www.oceanbase.com/docs/obdiag-cn)

-## obdiag gather scene Fuctions
-- [gather scene](./docs/gather_scene.md)
+# Join the Contributing Community

-## obdiag gather Fuctions
+obdiag envisions an open community. We welcome your contributions in any form:

-- [gather log](./docs/gather_ob_log.md)
-- [gather sysstat](./docs/gather_sysstat.md)
-- [gather slog/clog](./docs/gather_admin.md)
-- [gather perf](./docs/gather_perf.md)
-- [gather obstack](./docs/gather_ob_stack.md)
-- [gather sql plan monitor](./docs/gather_sql_plan_monitor.md)
-- [gather obproxy_log](./docs/gather_obproxy_log.md)
-- [gather awr](./docs/gather_awr.md)
-- [gather all](./docs/gather_all.md)
+- Report bugs through [Issues](https://github.com/oceanbase/oceanbase-diagnostic-tool/issues).
+- Participate in or initiate discussions via [Discussion](https://github.com/oceanbase/oceanbase-diagnostic-tool/discussions).
+- Contribute bug fixes or new features through [Pull requests](https://github.com/oceanbase/oceanbase-diagnostic-tool/pulls).
+- Join the obdiag [SIG(Special Interest Group)](https://oceanbase.yuque.com/org-wiki-obtech-vh7w9r/imzr6c/ixh8wb9d356at3hm)

-## obdiag analyze Fuctions
-- [analyze log](./docs/analyze_ob_log.md)
-- [analyze flt trace log](./docs/analyze_flt_trace.md)

-## obdiag rca Fuctions
-- [rca](./docs/rca.md)
+
+
+# Roadmap Ahead
+
+| Version | Date | Function |
+|---------|---------------|---------|
|1.6.0| 2024.01| • Scenario based fault information collection<br> • Scenario based root cause analysis |
|2.0.0|2024.03| • Context Transformation, Enhanced Scene Expansion Capabilities<br> • Support online updating of inspection and gather tasks<br> • Root Cause Analysis Phase II Transformation |
|2.1.0|2024.04| • Root Cause Analysis Scenario Expansion<br> • Gather tabledump |
|2.2.0|2024.05| • Root Cause Analysis Scenario Expansion |
|2.3.0|2024.06| • Root Cause Analysis Scenario Expansion<br> • Support SQL Diagnosis |
|2.4.0|2024.07| • Root Cause Analysis Scenario Expansion<br> • Adapting Two Additional Kernel Diagnostic Tools |
|2.5.0|2024.08| • Root Cause Analysis Scenario Expansion<br> • Support OMS Diagnosis |
|3.0.0|2024.09| • Root Cause Analysis Scenario Expansion<br> • Service-ification Transformation |
|3.1.0|2024.10| • Root Cause Analysis Scenario Expansion<br> • Supporting Comparative Functionality for Patrol Inspection Reports |
|3.2.0|2024.11| • Root Cause Analysis Scenario Expansion<br> • SQL Diagnosis Phase II, Supporting Root Cause Analysis for SQL problems |
|3.3.0|2024.12| • AI for obdiag |

# Licensing

OceanBase Diagnostic Tool is under MulanPSL - 2.0 license. You can freely copy and use the source code. When you modify or distribute the source code, please obey the MulanPSL - 2.0 license.

# Support

In case you have any problems when using obdiag, feel free to reach out for help:

- [GitHub Issue](https://github.com/oceanbase/oceanbase-diagnostic-tool/issues)
- [Official Website](https://www.oceanbase.com/docs/obdiag-cn)
- obdiag SIG WeChat: obdiagsig

From 742cebc7ea30a917250919b6aa7e7e1c8e930105 Mon Sep 17 00:00:00 2001
From: Teingi
Date: Mon, 22 Apr 2024 20:06:28 +0800
Subject: [PATCH 03/30] update README.md

---
 README-CN.md | 1 -
 README.md    | 1 -
 2 files changed, 2 deletions(-)

diff --git a/README-CN.md b/README-CN.md
index 32e1cc0c..69bd71c0 100644
--- a/README-CN.md
+++ b/README-CN.md
@@ -78,7 +78,6 @@ obdiag 期望构建一个开放的社区,我们欢迎任何形式的贡献,
 - 通过 [Issues](https://github.com/oceanbase/oceanbase-diagnostic-tool/issues) 提交 bug。
 - 通过 [Discussion](https://github.com/oceanbase/oceanbase-diagnostic-tool/discussions) 参与或发起讨论。
 - 通过 [Pull requests](https://github.com/oceanbase/oceanbase-diagnostic-tool/pulls) 提交问题修复或者功能特性。
-- 加入 obdiag [SIG(兴趣小组)](https://oceanbase.yuque.com/org-wiki-obtech-vh7w9r/imzr6c/ixh8wb9d356at3hm)

 # Roadmap Ahead
diff --git a/README.md b/README.md
index ed373e1a..811be1f2 100644
--- a/README.md
+++ b/README.md
@@ -78,7 +78,6 @@ obdiag envisions an open community. We welcome your contributions in any form:
 - Report bugs through [Issues](https://github.com/oceanbase/oceanbase-diagnostic-tool/issues).
 - Participate in or initiate discussions via [Discussion](https://github.com/oceanbase/oceanbase-diagnostic-tool/discussions).
 - Contribute bug fixes or new features through [Pull requests](https://github.com/oceanbase/oceanbase-diagnostic-tool/pulls).
-- Join the obdiag [SIG(Special Interest Group)](https://oceanbase.yuque.com/org-wiki-obtech-vh7w9r/imzr6c/ixh8wb9d356at3hm)

From b4183acb93f3da2418661ffe40e583ff04eb6e Mon Sep 17 00:00:00 2001
From: Teingi
Date: Mon, 22 Apr 2024 20:08:38 +0800
Subject: [PATCH 04/30] update README.md

---
 README-CN.md | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/README-CN.md b/README-CN.md
index 69bd71c0..71135b50 100644
--- a/README-CN.md
+++ b/README-CN.md
@@ -71,10 +71,6 @@ obdiag config -h <db_host> -u <sys_user> [-p password] [-P port]
 # 参与贡献

 obdiag 期望构建一个开放的社区,我们欢迎任何形式的贡献,您可以:
-
-- 向我们提出一个[issue](https://github.com/oceanbase/oceanbase-diagnostic-tool/issues)。
-- 提交请求。
-
 - 通过 [Issues](https://github.com/oceanbase/oceanbase-diagnostic-tool/issues) 提交 bug。
 - 通过 [Discussion](https://github.com/oceanbase/oceanbase-diagnostic-tool/discussions) 参与或发起讨论。
 - 通过 [Pull requests](https://github.com/oceanbase/oceanbase-diagnostic-tool/pulls) 提交问题修复或者功能特性。

From 74f4a10afc5ae81501d9eadcbd879b6112f6887b Mon Sep 17 00:00:00 2001
From: Teingi
Date: Mon, 22 Apr 2024 20:10:36 +0800
Subject: [PATCH 05/30] update README.md

---
 README-CN.md | 2 +-
 README.md    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/README-CN.md b/README-CN.md
index 71135b50..c1df09de 100644
--- a/README-CN.md
+++ b/README-CN.md
@@ -77,7 +77,7 @@ obdiag 期望构建一个开放的社区,我们欢迎任何形式的贡献,
 # Roadmap Ahead

-| 版本 | 迭代周期 | Function |
+| 版本 | 迭代周期 | 功能点 |
 |---------|---------------|---------|
 |1.6.0| 2024.01| • 场景化信息采集<br> • 场景化根因分析 |
 |2.0.0|2024.03| • context改造,场景化扩展能力增强<br> • 支持在线更新巡检、采集的task<br> • 根因分析二期 |
diff --git a/README.md b/README.md
index 811be1f2..c2b372ba 100644
--- a/README.md
+++ b/README.md
@@ -83,7 +83,7 @@ obdiag envisions an open community. We welcome your contributions in any form:
 # Roadmap Ahead

-| Version | Date | Function |
+| Version | Iteration Period | Function |
 |---------|---------------|---------|
 |1.6.0| 2024.01| • Scenario based fault information collection<br> • Scenario based root cause analysis |
 |2.0.0|2024.03| • Context Transformation, Enhanced Scene Expansion Capabilities<br> • Support online updating of inspection and gather tasks<br> • Root Cause Analysis Phase II Transformation
| From 47aa6259aae6e126f0d5eb74d43917988ec360e8 Mon Sep 17 00:00:00 2001 From: Teingi Date: Tue, 23 Apr 2024 19:15:04 +0800 Subject: [PATCH 06/30] update README --- README-CN.md | 6 ------ README.md | 10 +--------- 2 files changed, 1 insertion(+), 15 deletions(-) diff --git a/README-CN.md b/README-CN.md index c1df09de..f790fc7e 100644 --- a/README-CN.md +++ b/README-CN.md @@ -41,12 +41,6 @@ sh /usr/local/oceanbase-diagnostic-tool/init.sh ``` ## 方式二:源码安装 -源码编译环境确保有如下依赖 -- gcc -- wget -- python-devel -- mysql-devel - 源码安装需要在python >= 3.8的环境下进行 ```shell diff --git a/README.md b/README.md index c2b372ba..54e4394c 100644 --- a/README.md +++ b/README.md @@ -41,15 +41,7 @@ sh /usr/local/oceanbase-diagnostic-tool/init.sh ``` ## Method 2: Install obdiag by using the source code - -Before you install obdiag by using the source code, make sure that you have installed these dependencies: - -- gcc -- wget -- python-devel -- mysql-devel - -To install obdiag on Python3.8, run these commands: +To install obdiag on Python >= 3.8, run these commands: ```shell pip3 install -r requirements3.txt From 44e0c65e08f4f47838cc066df406ef5533f3e346 Mon Sep 17 00:00:00 2001 From: Teingi Date: Wed, 24 Apr 2024 14:46:04 +0800 Subject: [PATCH 07/30] close #177 --- rpm/oceanbase-diagnostic-tool.spec | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/rpm/oceanbase-diagnostic-tool.spec b/rpm/oceanbase-diagnostic-tool.spec index c5749ecc..f97038f5 100644 --- a/rpm/oceanbase-diagnostic-tool.spec +++ b/rpm/oceanbase-diagnostic-tool.spec @@ -80,5 +80,6 @@ find /usr/local/oceanbase-diagnostic-tool/obdiag -type f -exec chmod 644 {} \; ln -sf /usr/local/oceanbase-diagnostic-tool/obdiag /usr/bin/obdiag chmod +x /usr/local/oceanbase-diagnostic-tool/obdiag cp -rf /usr/local/oceanbase-diagnostic-tool/init_obdiag_cmd.sh /etc/profile.d/obdiag.sh +sh /usr/local/oceanbase-diagnostic-tool/init.sh echo -e 'Please execute the following command to init obdiag:\n' -echo -e '\033[32m source /usr/local/oceanbase-diagnostic-tool/init.sh \n \033[0m' +echo -e '\033[32m source ~/.bashrc \n \033[0m' From 58e5571207014f7be96434901ebf382a16e140a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B8=A0=E7=A3=8A?= Date: Wed, 24 Apr 2024 17:19:06 +0800 Subject: [PATCH 08/30] update check cases --- cmd.py | 2 + .../ob_enable_plan_cache_bad_version.yaml | 2 +- .../tasks/observer/cluster/sys_log_level.yaml | 23 +++++++++ .../checker/tasks/observer/system/aio.yaml | 4 +- .../tasks/observer/system/clock_source.yaml | 11 ++++ .../observer/system/dependent_software.yaml | 9 +--- .../tasks/observer/system/parameter.yaml | 50 +++++++++---------- .../system/parameter_ip_local_port_range.yaml | 6 +-- .../observer/system/parameter_tcp_rmem.yaml | 6 +-- .../observer/system/parameter_tcp_wmem.yaml | 6 +-- .../observer/system/ulimit_parameter.yaml | 8 +-- handler/gather/gather_obproxy_log.py | 2 +- handler/rca/rca_handler.py | 9 ++++ handler/rca/scene/log_error_scene.py | 5 +- 14 files changed, 89 insertions(+), 54 deletions(-) create mode 100644 handler/checker/tasks/observer/cluster/sys_log_level.yaml create mode 100644 handler/checker/tasks/observer/system/clock_source.yaml diff --git a/cmd.py b/cmd.py index 986dec43..0c0fcfb1 100644 --- a/cmd.py +++ b/cmd.py @@ -322,6 +322,8 @@ def do_command(self): return False cmd = '%s %s' % (self.prev_cmd, base) ROOT_IO.track_limit += 1 + if "main.py" in cmd: + telemetry.work_tag=False telemetry.push_cmd_info("cmd: {0}. 
args:{1}".format(cmd,args)) return self.commands[base].init(cmd, args).do_command() diff --git a/handler/checker/tasks/observer/cluster/ob_enable_plan_cache_bad_version.yaml b/handler/checker/tasks/observer/cluster/ob_enable_plan_cache_bad_version.yaml index 13b957e8..77e6d8a3 100644 --- a/handler/checker/tasks/observer/cluster/ob_enable_plan_cache_bad_version.yaml +++ b/handler/checker/tasks/observer/cluster/ob_enable_plan_cache_bad_version.yaml @@ -6,6 +6,6 @@ task: sql: 'select name from oceanbase.__all_virtual_tenant_parameter_stat where name like "%ob_enable_plan_cache%" and value like "%true%";' result: set_value: ob_enable_plan_cache - verify: '[ -z "ob_enable_plan_cache" ]' + verify: '[ -z "$ob_enable_plan_cache" ]' err_msg: 'On this version, ob_enable_plan_cache suggestion to close' diff --git a/handler/checker/tasks/observer/cluster/sys_log_level.yaml b/handler/checker/tasks/observer/cluster/sys_log_level.yaml new file mode 100644 index 00000000..1d67338c --- /dev/null +++ b/handler/checker/tasks/observer/cluster/sys_log_level.yaml @@ -0,0 +1,23 @@ +info: "Check sys_log_level ." +task: + - version: "[4.0.0.0,*]" + steps: + - type: sql + sql: 'SELECT value FROM oceanbase.__all_virtual_sys_parameter_stat where name like "%syslog_level%";' + result: + set_value: sys_log_level + verify_type: equal + report_type: warning + verify: '[[ ${sys_log_level} == "WDIAG"]]' + err_msg: "on 4.x, the recommended value for sys_log_level is WDIAG" + + - version: "[3.0.0,4.0.0.0)" + steps: + - type: sql + sql: 'SELECT value FROM oceanbase.__all_virtual_sys_parameter_stat where name like "%syslog_level%";' + result: + set_value: sys_log_level + verify_type: equal + report_type: warning + verify: '[[ ${sys_log_level} == "INFO"]]' + err_msg: "on 3.x, the recommended value for sys_log_level is INFO" diff --git a/handler/checker/tasks/observer/system/aio.yaml b/handler/checker/tasks/observer/system/aio.yaml index 6ad6adeb..d72b3369 100644 --- a/handler/checker/tasks/observer/system/aio.yaml +++ b/handler/checker/tasks/observer/system/aio.yaml @@ -11,11 +11,11 @@ task: set_value: aio_max_nr report_type: warning verify: "[ ${aio_max_nr} -ge 1048576 ]" - err_msg: 'fs.aio-max-nr : #{aio_max_nr} is a non recommended value, recommended value need >1048576' + err_msg: 'fs.aio-max-nr : #{aio_max_nr} . recommended: >1048576' - type: get_system_parameter parameter: fs.aio-nr result: set_value: aio_nr report_type: warning verify: "[ $((aio_max_nr - aio_nr)) -ge $((20000*${observer_nu})) ]" - err_msg: 'fs.aio-nr : #{aio_nr} is a non recommended value, recommended value need aio-max-nr - aio-nr>20000 * observer_num' + err_msg: 'fs.aio-nr : #{aio_nr} . recommended: aio-max-nr - aio-nr>20000 * observer_num' diff --git a/handler/checker/tasks/observer/system/clock_source.yaml b/handler/checker/tasks/observer/system/clock_source.yaml new file mode 100644 index 00000000..e494f5d9 --- /dev/null +++ b/handler/checker/tasks/observer/system/clock_source.yaml @@ -0,0 +1,11 @@ +info: 'Check the type of clock_source is tsc' +task: + - steps: + - type: ssh + ssh: "cat /sys/devices/system/clocksource/clocksource0/current_clocksource" + result: + set_value: clock_source + verify: "[ \"${clock_source}\" == \"tsc\" ]" + err_msg: 'clock_source: #{clock_source}. recommended: tsc. 
A non-tsc clock source can cause uneven CPU utilization and low TPS under stress testing.'


diff --git a/handler/checker/tasks/observer/system/dependent_software.yaml b/handler/checker/tasks/observer/system/dependent_software.yaml
index 5c0d64ae..c65c5996 100644
--- a/handler/checker/tasks/observer/system/dependent_software.yaml
+++ b/handler/checker/tasks/observer/system/dependent_software.yaml
@@ -35,11 +35,4 @@ task:
         set_value: transparent_hugepage_switch
         report_type: warning
         verify: '[ -n "${transparent_hugepage_switch}" ]'
-        err_msg: 'transparent_hugepage need retrun "[never]". Now , it is null.'
-      - type: ssh
-        ssh: "python --version 2>&1 | awk '{print $2}' | cut -d'.' -f1,2"
-        result:
-          set_value: python_version
-          report_type: warning
-          verify: '[ "2.7" == ${python_version} ]'
-        err_msg: 'python version need retrun 2.7.x'
\ No newline at end of file
+        err_msg: 'transparent_hugepage needs to return "[never]". Now, it is null.'
\ No newline at end of file
diff --git a/handler/checker/tasks/observer/system/parameter.yaml b/handler/checker/tasks/observer/system/parameter.yaml
index 6cb80c7e..5dc3bd3e 100644
--- a/handler/checker/tasks/observer/system/parameter.yaml
+++ b/handler/checker/tasks/observer/system/parameter.yaml
@@ -10,7 +10,7 @@ task:
        report_type: warning
        verify_type: between
        verify: "[2048,16384]"
        err_msg: 'net.core.somaxconn : #{parameter}. recommended: 2048 ≤ value ≤ 16384.'
      - type: get_system_parameter
        parameter: net.core.netdev_max_backlog
        result:
          set_value: parameter
          report_type: warning
          verify_type: between
          verify: "[ 500 ,10000 ]"
          err_msg: 'net.core.netdev_max_backlog: #{parameter}. recommended: 500 ≤ value ≤ 10000.'
      - type: get_system_parameter
        parameter: net.core.rmem_default
        result:
          set_value: parameter
          report_type: warning
          verify_type: between
          verify: "[ 65536 ,16777216 ]"
          err_msg: 'net.core.rmem_default: #{parameter}. recommended: 65536 ≤ value ≤ 16777216.'
      - type: get_system_parameter
        parameter: net.core.wmem_default
        result:
          set_value: parameter
          report_type: warning
          verify_type: between
          verify: "[ 65536,16777216 ]"
          err_msg: 'net.core.wmem_default: #{parameter}. recommended: 65536 ≤ value ≤ 16777216.'
      - type: get_system_parameter
        parameter: net.core.rmem_max
        result:
          set_value: parameter
          report_type: warning
          verify_type: between
          verify: "[ 8388608 ,16777216 ]"
          err_msg: 'net.core.rmem_max : #{parameter}. recommended: 8388608 ≤ value ≤ 16777216.'
      - type: get_system_parameter
        parameter: net.core.wmem_max
        result:
          set_value: parameter
          report_type: warning
          verify_type: between
          verify: "[ 8388608,16777216 ]"
          err_msg: 'net.core.wmem_max: #{parameter}. recommended: 8388608 ≤ value ≤ 16777216.'
      - type: get_system_parameter
        parameter: net.ipv4.ip_forward
        result:
          set_value: parameter
          report_type: warning
          verify_type: equal
          verify: 0
          err_msg: 'net.ipv4.ip_forward : #{parameter}. recommended: 0.'
      - type: get_system_parameter
        parameter: net.ipv4.tcp_tw_recycle
        result:
          set_value: parameter
          report_type: warning
          verify_type: equal
          verify: 1
          err_msg: 'net.ipv4.tcp_tw_recycle : #{parameter}. recommended: 1.'
      - type: get_system_parameter
        parameter: net.ipv4.conf.default.rp_filter
        result:
          set_value: parameter
          report_type: warning
          verify_type: equal
          verify: 1
          err_msg: 'net.ipv4.conf.default.rp_filter : #{parameter}. recommended: 1.'
      - type: get_system_parameter
        parameter: net.ipv4.conf.default.accept_source_route
        result:
          set_value: parameter
          report_type: warning
          verify_type: equal
          verify: 0
          err_msg: 'net.ipv4.conf.default.accept_source_route: #{parameter}. recommended: 0.'
      - type: get_system_parameter
        parameter: net.ipv4.tcp_syncookies
        result:
          set_value: parameter
          report_type: warning
          verify_type: equal
          verify: 1
          err_msg: 'net.ipv4.tcp_syncookies: #{parameter}. recommended: 1.'
      - type: get_system_parameter
        parameter: net.ipv4.tcp_max_syn_backlog
        result:
          set_value: parameter
          report_type: warning
          verify_type: between
          verify: "[1024,16384]"
          err_msg: 'net.ipv4.tcp_max_syn_backlog : #{parameter}. recommended: 1024 ≤ value ≤ 16384.'
      - type: get_system_parameter
        parameter: net.ipv4.tcp_fin_timeout
        result:
          set_value: parameter
          report_type: warning
          verify: "[ 15 -le ${parameter} ] && [ ${parameter} -le 60 ]"
          err_msg: 'net.ipv4.tcp_fin_timeout : #{parameter}. recommended: 15 ≤ value ≤ 60.'
      - type: get_system_parameter
        parameter: net.ipv4.tcp_tw_reuse
        result:
          set_value: parameter
          report_type: warning
          verify: "[ ${parameter} -eq 1 ]"
          err_msg: 'net.ipv4.tcp_tw_reuse: #{parameter}. recommended: 1.'
      - type: get_system_parameter
        parameter: net.ipv4.tcp_slow_start_after_idle
        result:
          set_value: parameter
          report_type: warning
          verify: "[ ${parameter} -eq 0 ]"
          err_msg: 'net.ipv4.tcp_slow_start_after_idle: #{parameter}. recommended: 0.'
      - type: get_system_parameter
        parameter: vm.swappiness
        result:
          set_value: parameter
          report_type: warning
          verify: "[ ${parameter} -eq 0 ]"
          err_msg: 'vm.swappiness : #{parameter}. recommended: 0.'
      - type: get_system_parameter
        parameter: vm.min_free_kbytes
        result:
          set_value: parameter
          report_type: warning
          verify: "[ 32768 -le ${parameter} ] && [ ${parameter} -le 2097152 ]"
          err_msg: 'vm.min_free_kbytes : #{parameter}. recommended: 32768 ≤ value ≤ 2097152.'
      - type: get_system_parameter
        parameter: vm.max_map_count
        result:
          set_value: parameter
          report_type: warning
          verify: "[ 327680 -le ${parameter} ] && [ ${parameter} -le 1000000 ]"
          err_msg: 'vm.max_map_count : #{parameter}. recommended: 327680 ≤ value ≤ 1000000.'
      - type: get_system_parameter
        parameter: vm.overcommit_memory
        result:
          set_value: parameter
          report_type: warning
          verify: "[ ${parameter} -eq 0 ]"
          err_msg: 'vm.overcommit_memory : #{parameter}. recommended: 0.'
      - type: get_system_parameter
        parameter: vm.nr_hugepages
        result:
          set_value: parameter
          report_type: warning
          verify: "[ ${parameter} -eq 0 ]"
          err_msg: 'vm.nr_hugepages : #{parameter}. recommended: 0.'
      - type: get_system_parameter
        parameter: fs.aio-max-nr
        result:
          set_value: parameter
          report_type: warning
          verify: "[ 1048576 -le ${parameter} ]"
          err_msg: 'fs.aio-max-nr : #{parameter}. recommended: ≥ 1048576.'
      - type: get_system_parameter
        parameter: kernel.numa_balancing
        result:
          set_value: parameter
          report_type: warning
          verify: "[ ${parameter} -eq 0 ]"
          err_msg: 'kernel.numa_balancing : #{parameter}. recommended: 0.'
      - type: get_system_parameter
        parameter: vm.zone_reclaim_mode
        result:
          set_value: parameter
          report_type: warning
          verify: "[ ${parameter} -eq 0 ]"
          err_msg: 'vm.zone_reclaim_mode : #{parameter}. recommended: 0.'
      - type: get_system_parameter
        parameter: fs.file-max
        result:
          set_value: parameter
          report_type: warning
          verify: "[ 6573688 -le ${parameter} ]"
          err_msg: 'fs.file-max: #{parameter}. recommended: ≥ 6573688.'
      - type: get_system_parameter
        parameter: fs.pipe-user-pages-soft
        result:
          set_value: parameter
          report_type: warning
          verify: "[ 0 -eq ${parameter} ]"
          err_msg: 'fs.pipe-user-pages-soft : #{parameter}. recommended: 0.'
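Conceptually, each get_system_parameter step above reads a kernel parameter and compares it with a recommended range. A minimal standalone sketch of the same idea against /proc/sys, with the ranges copied from the err_msg lines above (the helper is illustrative, not obdiag's implementation):

```python
# Read a sysctl value from /proc/sys and check it against a recommended range,
# mirroring what a get_system_parameter step with verify_type: between does.
def read_sysctl(name: str) -> int:
    path = "/proc/sys/" + name.replace(".", "/")
    with open(path) as f:
        return int(f.read().split()[0])

# Ranges copied from the err_msg lines in parameter.yaml above.
RECOMMENDED = {
    "vm.min_free_kbytes": (32768, 2097152),
    "vm.max_map_count": (327680, 1000000),
    "vm.overcommit_memory": (0, 0),  # equal-style check as a degenerate range
}

for name, (low, high) in RECOMMENDED.items():
    value = read_sysctl(name)
    if not low <= value <= high:
        print(f"{name} : {value}. recommended: {low} <= value <= {high}")
```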
diff --git a/handler/checker/tasks/observer/system/parameter_ip_local_port_range.yaml b/handler/checker/tasks/observer/system/parameter_ip_local_port_range.yaml
index 9a83b86a..29e41c99 100644
--- a/handler/checker/tasks/observer/system/parameter_ip_local_port_range.yaml
+++ b/handler/checker/tasks/observer/system/parameter_ip_local_port_range.yaml
@@ -8,7 +8,7 @@ task:
       result:
         set_value: ip_local_port_range
         verify: '[[ -n "$ip_local_port_range" && "$ip_local_port_range" != "-1" ]]'
-        err_msg: "ip_local_port_range is #{ip_local_port_range} . Please check net.ipv4.ip_local_port_range on your node"
+        err_msg: "ip_local_port_range : #{ip_local_port_range}. Please check net.ipv4.ip_local_port_range on your node"
     - type: ssh
       ssh: "echo \"#{ip_local_port_range}\" | awk '{print $1}'"
       result:
@@ -16,7 +16,7 @@ task:
         report_type: warning
         verify_type: equal
         verify: 3500
-        err_msg: 'ip_local_port_range_min : #{ip_local_port_range_min} is a non recommended value, recommended value is 3500'
+        err_msg: 'ip_local_port_range_min : #{ip_local_port_range_min}. recommended: 3500'
     - type: ssh
       ssh: "echo \"#{ip_local_port_range}\" | awk '{print $2}'"
       result:
@@ -24,4 +24,4 @@ task:
         report_type: warning
         verify_type: equal
         verify: 65535
-        err_msg: 'ip_local_port_range_max : #{ip_local_port_range_max} is a non recommended value, recommended value is 65535'
\ No newline at end of file
+        err_msg: 'ip_local_port_range_max : #{ip_local_port_range_max}. recommended: 65535'
\ No newline at end of file
diff --git a/handler/checker/tasks/observer/system/parameter_tcp_rmem.yaml b/handler/checker/tasks/observer/system/parameter_tcp_rmem.yaml
index 2cd140cd..9bc4a880 100644
--- a/handler/checker/tasks/observer/system/parameter_tcp_rmem.yaml
+++ b/handler/checker/tasks/observer/system/parameter_tcp_rmem.yaml
@@ -16,7 +16,7 @@ task:
         report_type: warning
         verify_type: between
         verify: "[4096,8192]"
-        err_msg: 'net.ipv4.tcp_rmem_min : #{tcp_rmem_min} is a non recommended value, recommended value is 4096 ≤ min ≤ 8192'
+        err_msg: 'net.ipv4.tcp_rmem_min : #{tcp_rmem_min}. recommended: 4096 ≤ min ≤ 8192'
     - type: ssh
       ssh: "echo \"#{tcp_rmem}\" | awk '{print $2}'"
       result:
@@ -24,7 +24,7 @@ task:
         report_type: warning
         verify_type: between
         verify: "[65536,131072]"
-        err_msg: 'net.ipv4.tcp_rmem_default : #{tcp_rmem_default}. net.ipv4.tcp_rmem_default from net.ipv4.tcp_rmem. It is a non recommended value, recommended value :is 65536 ≤ default≤ 131072'
+        err_msg: 'net.ipv4.tcp_rmem_default : #{tcp_rmem_default}. net.ipv4.tcp_rmem_default from net.ipv4.tcp_rmem. recommended: 65536 ≤ default ≤ 131072'
     - type: ssh
       ssh: "echo \"#{tcp_rmem}\" | awk '{print $3}'"
       result:
@@ -32,4 +32,4 @@ task:
         report_type: warning
         verify_type: between
         verify: "[8388608,16777216]"
-        err_msg: 'net.ipv4.tcp_rmem_max : #{tcp_rmem_max}. net.ipv4.tcp_rmem_max from net.ipv4.tcp_rmem. It is a non recommended value, recommended value is 65536 ≤ max≤ 131072'
+        err_msg: 'net.ipv4.tcp_rmem_max : #{tcp_rmem_max}. net.ipv4.tcp_rmem_max from net.ipv4.tcp_rmem. recommended: 8388608 ≤ max ≤ 16777216'
diff --git a/handler/checker/tasks/observer/system/parameter_tcp_wmem.yaml b/handler/checker/tasks/observer/system/parameter_tcp_wmem.yaml
index 76eadaa9..9ed4a2fc 100644
--- a/handler/checker/tasks/observer/system/parameter_tcp_wmem.yaml
+++ b/handler/checker/tasks/observer/system/parameter_tcp_wmem.yaml
@@ -14,7 +14,7 @@ task:
         report_type: warning
         verify_type: between
         verify: "[4096,8192]"
-        err_msg: 'net.ipv4.tcp_wmem_min : #{tcp_wmem_min} is a non recommended value, recommended value is 4096 ≤ min ≤ 8192'
+        err_msg: 'net.ipv4.tcp_wmem_min : #{tcp_wmem_min}. recommended: 4096 ≤ min ≤ 8192'
     - type: ssh
       ssh: "echo \"#{tcp_wmem}\" | awk '{print $2}'"
       result:
@@ -22,7 +22,7 @@ task:
         report_type: warning
         verify_type: between
         verify: "[65536,131072]"
-        err_msg: 'net.ipv4.tcp_wmem_default : #{tcp_wmem_default} is a non recommended value, recommended value :is 65536 ≤ default≤ 131072'
+        err_msg: 'net.ipv4.tcp_wmem_default : #{tcp_wmem_default}. recommended: 65536 ≤ default ≤ 131072'
     - type: ssh
       ssh: "echo \"#{tcp_wmem}\" | awk '{print $3}'"
       result:
@@ -30,4 +30,4 @@ task:
         report_type: warning
         verify_type: between
         verify: "[8388608,16777216]"
-        err_msg: 'net.ipv4.tcp_wmem_max : #{tcp_wmem_max} is a non recommended value, recommended value is 65536 ≤ max≤ 131072'
+        err_msg: 'net.ipv4.tcp_wmem_max : #{tcp_wmem_max}. recommended: 8388608 ≤ max ≤ 16777216'
diff --git a/handler/checker/tasks/observer/system/ulimit_parameter.yaml b/handler/checker/tasks/observer/system/ulimit_parameter.yaml
index 08eb0ea3..778259c5 100644
--- a/handler/checker/tasks/observer/system/ulimit_parameter.yaml
+++ b/handler/checker/tasks/observer/system/ulimit_parameter.yaml
@@ -8,21 +8,21 @@ task:
         set_value: parameter
         report_type: warning
         verify: "[ 'unlimited' == ${parameter} ]"
-        err_msg: 'On ip : #{remote_ip}, ulimit -c is #{parameter} . It is a non recommended value, and the recommended value is unlimited. Please refer to the official website document for the configuration method'
+        err_msg: 'On ip : #{remote_ip}, ulimit -c is #{parameter}. recommended: unlimited.'
     - type: ssh
       ssh: ulimit -u
       result:
         set_value: parameter
         report_type: warning
         verify: "[ '655360' == ${parameter} ]"
-        err_msg: 'On ip : #{remote_ip}, ulimit -u is #{parameter} . It is a non recommended value, and the recommended value is 655360. Please refer to the official website document for the configuration method'
+        err_msg: 'On ip : #{remote_ip}, ulimit -u is #{parameter}. recommended: 655360.'
     - type: ssh
       ssh: ulimit -s
       result:
         set_value: parameter
         report_type: warning
         verify: "[ 'unlimited' == ${parameter} ]"
-        err_msg: 'On ip : #{remote_ip}, ulimit -s is #{parameter} . It is a non recommended value, and the recommended value is unlimited. Please refer to the official website document for the configuration method'
+        err_msg: 'On ip : #{remote_ip}, ulimit -s is #{parameter}. recommended: unlimited.'
     - type: ssh
       ssh: ulimit -n
       result:
@@ -30,4 +30,4 @@ task:
         report_type: warning
         verify_type: equal
         verify: 655350
-        err_msg: 'On ip : #{remote_ip}, ulimit -n is #{parameter} . It is a non recommended value, and the recommended value is unlimited. Please refer to the official website document for the configuration method'
+        err_msg: 'On ip : #{remote_ip}, ulimit -n is #{parameter}. recommended: 655350.'
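The tcp_rmem and tcp_wmem checks above split the three-field kernel setting (min, default, max) with awk and range-check each field against its own bounds. A minimal local sketch of the same logic follows, assuming /proc/sys is readable on the node; it is illustrative only (check_tcp_mem is a hypothetical helper, and the real steps pipe the echoed value through awk over SSH):

def check_tcp_mem(path, ranges):
    # e.g. /proc/sys/net/ipv4/tcp_rmem holds three integers: min, default, max
    with open(path) as f:
        values = [int(x) for x in f.read().split()]
    for name, value, (low, high) in zip(("min", "default", "max"), values, ranges):
        if not low <= value <= high:
            print("{0} {1}: {2}. recommended: {3} <= value <= {4}".format(path, name, value, low, high))

check_tcp_mem("/proc/sys/net/ipv4/tcp_rmem", [(4096, 8192), (65536, 131072), (8388608, 16777216)])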
diff --git a/handler/gather/gather_obproxy_log.py b/handler/gather/gather_obproxy_log.py
index 603097e0..dcf0b769 100644
--- a/handler/gather/gather_obproxy_log.py
+++ b/handler/gather/gather_obproxy_log.py
@@ -249,7 +249,7 @@ def __get_log_name(self, ssh_helper, node):
             self.scope == "obproxy_limit" or self.scope == "obproxy_slow" or self.scope == "obproxy_diagnosis" or self.scope == "obproxy_error":
             get_obproxy_log = "ls -1 -F %s/*%s.*log* | awk -F '/' '{print $NF}'" % (log_path, self.scope)
         else:
-            get_obproxy_log = "ls -1 -F %s/obproxy.*log* %s/obproxy_error.*log* %s/obproxy_stat.*log* %s/obproxy_digest.*log* %s/obproxy_limit.*log* %s/obproxy_slow.*log* | awk -F '/' '{print $NF}'" % (log_path, log_path, log_path, log_path, log_path, log_path)
+            get_obproxy_log = "ls -1 -F %s/obproxy.*log* %s/obproxy_error.*log* %s/obproxy_stat.*log* %s/obproxy_digest.*log* %s/obproxy_limit.*log* %s/obproxy_slow.*log* %s/obproxy_diagnosis.*log* | awk -F '/' '{print $NF}'" % (log_path, log_path, log_path, log_path, log_path, log_path, log_path)
         if self.is_ssh:
             log_files = SshClient(self.stdio).run(ssh_helper, get_obproxy_log).strip()
         else:
diff --git a/handler/rca/rca_handler.py b/handler/rca/rca_handler.py
index 3a004891..94d4d017 100644
--- a/handler/rca/rca_handler.py
+++ b/handler/rca/rca_handler.py
@@ -252,6 +252,15 @@ def get_scene_info(self):
         raise Exception("rca ({0}) scene.get_scene_info() undefined".format(type(self).__name__))
     def export_result(self):
         return self.Result.export()
+    def get_all_tenants_id(self):
+        try:
+            if self.ob_connector is None:
+                raise Exception("ob_connector is None")
+            # execute_sql returns all rows; collect the first column of every row
+            all_tenant_id_data = self.ob_connector.execute_sql("select tenant_id from oceanbase.__all_tenant;")
+            return [row[0] for row in all_tenant_id_data]
+        except Exception as e:
+            raise Exception("get_all_tenants_id execute failed. Exception: {0}".format(e))
+

 class Result:
diff --git a/handler/rca/scene/log_error_scene.py b/handler/rca/scene/log_error_scene.py
index 0973db65..39ab8929 100644
--- a/handler/rca/scene/log_error_scene.py
+++ b/handler/rca/scene/log_error_scene.py
@@ -12,7 +12,7 @@
 """
 @time: 2024/04/16
-@file: no_leader_scene.py
+@file: log_error_scene.py
 @desc:
 """
 import os
@@ -39,8 +39,6 @@ def init(self, context):
         if not (observer_version == "4.0.0.0" or StringUtils.compare_versions_greater(observer_version, "4.0.0.0")):
             self.stdio.error("observer version is {0}, which is less than 4.0.0.0.".format(observer_version))
             raise RCAInitException("observer version is {0}, which is less than 4.0.0.0.".format(observer_version))
-        # if self.ob_connector is None:
-        #     raise RCAInitException("ob_connector is None. Please check the NODES conf.")
         self.verbose("observer version is {0}.".format(observer_version))
         if self.ob_connector is None:
             raise RCAInitException("ob_connector is None. Please check the NODES conf.")
@@ -120,7 +118,6 @@ def execute_by_tenant_id(self, tenant_id, record):
             if step_next_tag is False:
                 self.verbose("step_next_tag is false")
                 return
-            record.add_record("start step3")
         return

From 7089ec3a2e03e6fb095d37bfc37b53dc97cd1615 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=B8=A0=E7=A3=8A?=
Date: Wed, 24 Apr 2024 18:06:38 +0800
Subject: [PATCH 09/30] add check's cases about avx2

---
 .../tasks/observer/system/instruction_set_avx2.yaml | 11 +++++++++++
 1 file changed, 11 insertions(+)
 create mode 100644 handler/checker/tasks/observer/system/instruction_set_avx2.yaml

diff --git a/handler/checker/tasks/observer/system/instruction_set_avx2.yaml b/handler/checker/tasks/observer/system/instruction_set_avx2.yaml
new file mode 100644
index 00000000..a17a1e00
--- /dev/null
+++ b/handler/checker/tasks/observer/system/instruction_set_avx2.yaml
@@ -0,0 +1,11 @@
+info: 'Check the flags of cpu'
+task:
+  - steps:
+      - type: ssh
+        ssh: "lscpu |grep Flags"
+        result:
+          set_value: cpu_flags
+          verify: " [[ $cpu_flags == *avx2* ]] "
+          err_msg: 'cpu_flags: #{cpu_flags}. recommended: the cpu flags contain avx2'
+
+
From c81e94b6f8801a24f57a36180c8916ca53846257 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=B8=A0=E7=A3=8A?=
Date: Wed, 24 Apr 2024 20:52:47 +0800
Subject: [PATCH 10/30] add check's cases about core_pattern tenant_number

---
 .../tasks/observer/cluster/tenant_number.yaml | 19 +++++++++++++++++++
 .../tasks/observer/system/core_pattern.yaml   | 11 +++++++++++
 2 files changed, 30 insertions(+)
 create mode 100644 handler/checker/tasks/observer/cluster/tenant_number.yaml
 create mode 100644 handler/checker/tasks/observer/system/core_pattern.yaml

diff --git a/handler/checker/tasks/observer/cluster/tenant_number.yaml b/handler/checker/tasks/observer/cluster/tenant_number.yaml
new file mode 100644
index 00000000..3c725440
--- /dev/null
+++ b/handler/checker/tasks/observer/cluster/tenant_number.yaml
@@ -0,0 +1,19 @@
+info: "Check the number of tenants"
+task:
+  - version: "[4.0.0.0,*]"
+    steps:
+      - type: sql
+        sql: 'select count(0)/2 from oceanbase.__all_tenant where tenant_id>1000;'
+        result:
+          set_value: tenant_nu
+          verify_type: max
+          verify: 100
+          err_msg: "The number of tenants: #{tenant_nu}. recommended: tenant_nu < 100"
+      - type: sql
+        sql: 'select count(0)/2 from oceanbase.__all_tenant where tenant_id>1000;'
+        result:
+          set_value: tenant_nu
+          verify_type: max
+          report_type: warning
+          verify: 50
+          err_msg: "The number of tenants: #{tenant_nu}. recommended: tenant_nu < 50"
\ No newline at end of file
diff --git a/handler/checker/tasks/observer/system/core_pattern.yaml b/handler/checker/tasks/observer/system/core_pattern.yaml
new file mode 100644
index 00000000..ba0e3f9a
--- /dev/null
+++ b/handler/checker/tasks/observer/system/core_pattern.yaml
@@ -0,0 +1,11 @@
+info: 'Check kernel.core_pattern'
+task:
+  - steps:
+      - type: get_system_parameter
+        parameter: kernel.core_pattern
+        result:
+          set_value: core_pattern
+          verify: "[[ $core_pattern == *" "* ]]"
+          err_msg: 'clock_source: #{clock_source}. recommended: tsc. 
Uneven CPU utilization during pressure testing resulted in low TPS during pressure testing' + + From f91d7515d486568c067dc3a9a72c96782adc49c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B8=A0=E7=A3=8A?= Date: Thu, 25 Apr 2024 10:22:53 +0800 Subject: [PATCH 11/30] fix check yaml --- handler/checker/tasks/observer/cluster/sys_log_level.yaml | 6 ++---- handler/checker/tasks/observer/system/core_pattern.yaml | 4 +--- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/handler/checker/tasks/observer/cluster/sys_log_level.yaml b/handler/checker/tasks/observer/cluster/sys_log_level.yaml index 1d67338c..1063d34e 100644 --- a/handler/checker/tasks/observer/cluster/sys_log_level.yaml +++ b/handler/checker/tasks/observer/cluster/sys_log_level.yaml @@ -6,9 +6,8 @@ task: sql: 'SELECT value FROM oceanbase.__all_virtual_sys_parameter_stat where name like "%syslog_level%";' result: set_value: sys_log_level - verify_type: equal report_type: warning - verify: '[[ ${sys_log_level} == "WDIAG"]]' + verify: '[[ $sys_log_level == "WDIAG" ]]' err_msg: "on 4.x, the recommended value for sys_log_level is WDIAG" - version: "[3.0.0,4.0.0.0)" @@ -17,7 +16,6 @@ task: sql: 'SELECT value FROM oceanbase.__all_virtual_sys_parameter_stat where name like "%syslog_level%";' result: set_value: sys_log_level - verify_type: equal report_type: warning - verify: '[[ ${sys_log_level} == "INFO"]]' + verify: '[[ $sys_log_level == "INFO" ]]' err_msg: "on 3.x, the recommended value for sys_log_level is INFO" diff --git a/handler/checker/tasks/observer/system/core_pattern.yaml b/handler/checker/tasks/observer/system/core_pattern.yaml index ba0e3f9a..a226347f 100644 --- a/handler/checker/tasks/observer/system/core_pattern.yaml +++ b/handler/checker/tasks/observer/system/core_pattern.yaml @@ -5,7 +5,5 @@ task: parameter: kernel.core_pattern result: set_value: core_pattern - verify: "[[ $core_pattern == *" "* ]]" + verify: '[[ $core_pattern == *" "* ]]' err_msg: 'clock_source: #{clock_source}. recommended: tsc. Uneven CPU utilization during pressure testing resulted in low TPS during pressure testing' - - From c4940d3d8a763635b3e0ac5d3c03c50961795007 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B8=A0=E7=A3=8A?= Date: Thu, 25 Apr 2024 11:54:26 +0800 Subject: [PATCH 12/30] update clock_source.yaml --- handler/checker/tasks/observer/system/clock_source.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/handler/checker/tasks/observer/system/clock_source.yaml b/handler/checker/tasks/observer/system/clock_source.yaml index e494f5d9..ce9950a2 100644 --- a/handler/checker/tasks/observer/system/clock_source.yaml +++ b/handler/checker/tasks/observer/system/clock_source.yaml @@ -5,7 +5,7 @@ task: ssh: "cat /sys/devices/system/clocksource/clocksource0/current_clocksource" result: set_value: clock_source - verify: "[ \"${clock_source}\" == \"tsc\" ]" + verify: "[[ \"${clock_source}\" == \"tsc\" || \"${clock_source}\" == \"arch_sys_counter\" || \"${clock_source}\" == \"kvm-clock\" ]]" err_msg: 'clock_source: #{clock_source}. recommended: tsc. 
Uneven CPU utilization during pressure testing resulted in low TPS during pressure testing' From 9db5d05774aa14109863f535f98e278438abe93d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B8=A0=E7=A3=8A?= Date: Thu, 25 Apr 2024 14:45:36 +0800 Subject: [PATCH 13/30] update build_before --- handler/checker/tasks/observer_check_package.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/handler/checker/tasks/observer_check_package.yaml b/handler/checker/tasks/observer_check_package.yaml index f0a6b356..4b8f132b 100644 --- a/handler/checker/tasks/observer_check_package.yaml +++ b/handler/checker/tasks/observer_check_package.yaml @@ -2,7 +2,7 @@ ad: info_en: "Test and inspection tasks" info_cn: "测试巡检任务" tasks: - - system.parameter + - system.* build_before: info_en: "Deployment environment check" info_cn: "部署环境检查" From 41b1b3aaad3963ccbda4019a18a7fb14aad5cb78 Mon Sep 17 00:00:00 2001 From: Teingi Date: Thu, 25 Apr 2024 15:14:09 +0800 Subject: [PATCH 14/30] update README --- README-CN.md | 22 +++++++++++----------- README.md | 22 +++++++++++----------- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/README-CN.md b/README-CN.md index f790fc7e..fe99b285 100644 --- a/README-CN.md +++ b/README-CN.md @@ -1,18 +1,18 @@

- + license - - license + + license pyversions - - stars + + stars - - forks + + forks Chinese doc @@ -65,9 +65,9 @@ obdiag config -h -u [-p password] [-P port] # 参与贡献 obdiag 期望构建一个开放的社区,我们欢迎任何形式的贡献,您可以: -- 通过 [Issues](https://github.com/oceanbase/oceanbase-diagnostic-tool/issues) 提交 bug。 -- 通过 [Discussion](https://github.com/oceanbase/oceanbase-diagnostic-tool/discussions) 参与或发起讨论。 -- 通过 [Pull requests](https://github.com/oceanbase/oceanbase-diagnostic-tool/pulls) 提交问题修复或者功能特性。 +- 通过 [Issues](https://github.com/oceanbase/obdiag/issues) 提交 bug。 +- 通过 [Discussion](https://github.com/oceanbase/obdiag/discussions) 参与或发起讨论。 +- 通过 [Pull requests](https://github.com/oceanbase/obdiag/pulls) 提交问题修复或者功能特性。 # Roadmap Ahead @@ -95,6 +95,6 @@ OceanBase Diagnostic Tool 使用 [MulanPSL - 2.0](http://license.coscl.org.cn/Mu 如果您在使用 OceanBase LogProxy 时遇到任何问题,欢迎联系我们寻求帮助: -- [GitHub Issue](https://github.com/oceanbase/oceanbase-diagnostic-tool/issues) +- [GitHub Issue](https://github.com/oceanbase/obdiag/issues) - [官方网站](https://www.oceanbase.com/docs/obdiag-cn) - obdiag SIG 微信号: obdiagsig \ No newline at end of file diff --git a/README.md b/README.md index 54e4394c..376402e1 100644 --- a/README.md +++ b/README.md @@ -1,20 +1,20 @@ English | [中文版](README-CN.md)

- + license - - license + + license pyversions - - stars + + stars - - forks + + forks Chinese doc @@ -67,9 +67,9 @@ For more details, please refer to [Official docs](https://www.oceanbase.com/docs obdiag envisions an open community. We welcome your contributions in any form: -- Report bugs through [Issues](https://github.com/oceanbase/oceanbase-diagnostic-tool/issues). -- Participate in or initiate discussions via [Discussion](https://github.com/oceanbase/oceanbase-diagnostic-tool/discussions). -- Contribute bug fixes or new features through [Pull requests](https://github.com/oceanbase/oceanbase-diagnostic-tool/pulls). +- Report bugs through [Issues](https://github.com/oceanbase/obdiag/issues). +- Participate in or initiate discussions via [Discussion](https://github.com/oceanbase/obdiag/discussions). +- Contribute bug fixes or new features through [Pull requests](https://github.com/oceanbase/obdiag/pulls). @@ -97,7 +97,7 @@ distribute the source code, please obey the MulanPubL - 2.0 license. In case you have any problems when using obdiag, welcome reach out for help: -- [GitHub Issue](https://github.com/oceanbase/oceanbase-diagnostic-tool/issues) +- [GitHub Issue](https://github.com/oceanbase/obdiag/issues) - [Official Website](https://www.oceanbase.com/docs/obdiag-cn) - obdiag SIG WeChat: obdiagsig From b42efafd5f25cd69089f27c22effdf93079e110c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B8=A0=E7=A3=8A?= Date: Thu, 25 Apr 2024 18:07:51 +0800 Subject: [PATCH 15/30] test check --- handler/checker/check_handler.py | 7 ++++- handler/checker/check_task.py | 30 +++++++++++++++++--- handler/checker/step/data_size.py | 2 +- handler/checker/step/get_system_parameter.py | 2 +- handler/checker/step/sql.py | 4 +-- handler/checker/step/ssh.py | 2 +- handler/checker/step/stepbase.py | 11 +++---- 7 files changed, 43 insertions(+), 15 deletions(-) diff --git a/handler/checker/check_handler.py b/handler/checker/check_handler.py index c7477a52..7bd249f3 100644 --- a/handler/checker/check_handler.py +++ b/handler/checker/check_handler.py @@ -17,6 +17,8 @@ """ import os +import time + import yaml from common.ob_connector import OBConnector @@ -229,6 +231,7 @@ def execute_one(self, task_name): raise CheckException("execute_one Exception : {0}".format(e)) def execute(self): + start_time = time.time() try: self.stdio.verbose( "execute_all_tasks. the number of tasks is {0} ,tasks is {1}".format(len(self.tasks.keys()), @@ -244,4 +247,6 @@ def execute(self): except CheckrReportException as e: self.stdio.error("Report error :{0}".format(e)) except Exception as e: - self.stdio.error("Internal error :{0}".format(e)) \ No newline at end of file + self.stdio.error("Internal error :{0}".format(e)) + end_time = time.time() + print("Total cost time is {0} s".format((end_time - start_time))) \ No newline at end of file diff --git a/handler/checker/check_task.py b/handler/checker/check_task.py index 9605bfcf..b8db28e8 100644 --- a/handler/checker/check_task.py +++ b/handler/checker/check_task.py @@ -15,6 +15,9 @@ @file: check_task.py @desc: """ +import threading + +from common.ob_connector import OBConnector from handler.checker.check_exception import StepResultFailException, \ StepExecuteFailException, StepResultFalseException, TaskException from handler.checker.step.stepbase import StepBase @@ -39,15 +42,30 @@ def execute(self): self.stdio.verbose("task_base execute") steps_nu = filter_by_version(self.task, self.cluster, self.stdio) if steps_nu < 0: - self.stdio.warn("{0} Unadapted by version. 
SKIP".format(self.task['name'])) + self.stdio.warn("Unadapted by version. SKIP") self.report.add("Unadapted by version. SKIP", "warning") return "Unadapted by version.SKIP" self.stdio.verbose("filter_by_version is return {0}".format(steps_nu)) if len(self.nodes) == 0: raise Exception("node is not exist") - + # TODO: 这里的逻辑需要优化,如果一个节点执行失败了,那么后续的步骤就不会被执行了。 + work_threads = [] for node in self.nodes: + obConnector = OBConnector(ip=self.cluster.get("db_host"), + port=self.cluster.get("db_port"), + username=self.cluster.get("tenant_sys").get("user"), + password=self.cluster.get("tenant_sys").get("password"), + stdio=self.stdio, + timeout=10000) + t = threading.Thread(target=self.execute_one_node, args=(steps_nu,node,obConnector)) + work_threads.append(t) + t.start() + for t in work_threads: + t.join() + self.stdio.verbose("task execute end") + def execute_one_node(self,steps_nu,node,obConnector): + try: self.stdio.verbose("run task in node: {0}".format(StringUtils.node_cut_passwd_for_log(node))) steps = self.task[steps_nu] nu = 1 @@ -56,7 +74,7 @@ def execute(self): self.stdio.verbose("step nu: {0}".format(nu)) if len(self.cluster) == 0: raise Exception("cluster is not exist") - step_run = StepBase(self.context, step, node, self.cluster, self.task_variable_dict) + step_run = StepBase(self.context, step, node, self.cluster, self.task_variable_dict,obConnector) self.stdio.verbose("step nu: {0} initted, to execute".format(nu)) step_run.execute(self.report) self.task_variable_dict = step_run.update_task_variable_dict() @@ -78,4 +96,8 @@ def execute(self): self.stdio.verbose("step nu: {0} execute end ".format(nu)) nu = nu + 1 - self.stdio.verbose("task execute end") + except Exception as e: + self.stdio.error("TaskBase execute Exception: {0}".format(e)) + raise e + + diff --git a/handler/checker/step/data_size.py b/handler/checker/step/data_size.py index b8d0ff0c..66dc2ff9 100644 --- a/handler/checker/step/data_size.py +++ b/handler/checker/step/data_size.py @@ -23,7 +23,7 @@ class DataSizeHandler: - def __init__(self,context, step, node, task_variable_dict): + def __init__(self,context, step, node, task_variable_dict,obConnector): self.context = context self.stdio = context.stdio self.stdio.verbose("init DataSizeHandler") diff --git a/handler/checker/step/get_system_parameter.py b/handler/checker/step/get_system_parameter.py index af3341c1..29713ee3 100644 --- a/handler/checker/step/get_system_parameter.py +++ b/handler/checker/step/get_system_parameter.py @@ -23,7 +23,7 @@ class GetSystemParameterHandler: - def __init__(self,context, step, node, task_variable_dict): + def __init__(self,context, step, node, task_variable_dict,obConnector): self.context = context self.stdio = context.stdio self.stdio.verbose("init GetSystemParameterHandler") diff --git a/handler/checker/step/sql.py b/handler/checker/step/sql.py index c55c50a7..7d846626 100644 --- a/handler/checker/step/sql.py +++ b/handler/checker/step/sql.py @@ -23,7 +23,7 @@ class StepSQLHandler: - def __init__(self,context, step, task_variable_dict): + def __init__(self,context, step, task_variable_dict,obConnector): try: self.context = context self.stdio = context.stdio @@ -32,7 +32,7 @@ def __init__(self,context, step, task_variable_dict): self.tenant_mode = None self.sys_database = None self.database = None - self.ob_connector=self.context.get_variable('check_obConnector') + self.ob_connector=obConnector if self.ob_connector is None: raise Exception("self.ob_connector is None.") except Exception as e: diff --git a/handler/checker/step/ssh.py 
b/handler/checker/step/ssh.py index 282477e2..4e4c6368 100644 --- a/handler/checker/step/ssh.py +++ b/handler/checker/step/ssh.py @@ -23,7 +23,7 @@ class SshHandler: - def __init__(self,context, step, node, task_variable_dict): + def __init__(self,context, step, node, task_variable_dict,obConnector): self.context = context self.stdio = context.stdio self.ssh_report_value = None diff --git a/handler/checker/step/stepbase.py b/handler/checker/step/stepbase.py index 3ec68d8e..aa479251 100644 --- a/handler/checker/step/stepbase.py +++ b/handler/checker/step/stepbase.py @@ -27,7 +27,7 @@ class StepBase(object): - def __init__(self, context, step, node, cluster, task_variable_dict): + def __init__(self, context, step, node, cluster, task_variable_dict,obConnector): self.context = context self.stdio = context.stdio self.step = step @@ -35,6 +35,7 @@ def __init__(self, context, step, node, cluster, task_variable_dict): self.cluster = cluster self.task_variable_dict = {} self.task_variable_dict = task_variable_dict + self.obConnector=obConnector def execute(self, report): no_cluster_name_msg = "(Please set ob_cluster_name or obproxy_cluster_name)" @@ -54,13 +55,13 @@ def execute(self, report): if "type" not in self.step: raise StepExecuteFailException("Missing field :type") if self.step["type"] == "get_system_parameter": - handler = GetSystemParameterHandler(self.context, self.step, self.node, self.task_variable_dict) + handler = GetSystemParameterHandler(self.context, self.step, self.node, self.task_variable_dict,self.obConnector) elif self.step["type"] == "ssh": - handler = SshHandler(self.context, self.step, self.node, self.task_variable_dict) + handler = SshHandler(self.context, self.step, self.node, self.task_variable_dict,self.obConnector) elif self.step["type"] == "sql": - handler = StepSQLHandler(self.context, self.step, task_variable_dict=self.task_variable_dict) + handler = StepSQLHandler(self.context, self.step, task_variable_dict=self.task_variable_dict,obConnector=self.obConnector) elif self.step["type"] == "data_size": - handler = DataSizeHandler(self.context, self.step, self.cluster, self.task_variable_dict) + handler = DataSizeHandler(self.context, self.step, self.cluster, self.task_variable_dict,self.obConnector) else: raise StepExecuteFailException("the type not support: {0}".format(self.step["type"])) self.stdio.verbose("task execute and result") From 5fa5fef2ce0c8fa8706796e25c2b3a82d7ff96d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B8=A0=E7=A3=8A?= Date: Thu, 25 Apr 2024 20:55:23 +0800 Subject: [PATCH 16/30] test check --- handler/checker/check_handler.py | 51 +++++++++++++++---- handler/checker/step/sql.py | 6 ++- .../sysbench_free_test_memory_limit.yaml | 2 +- 3 files changed, 46 insertions(+), 13 deletions(-) diff --git a/handler/checker/check_handler.py b/handler/checker/check_handler.py index 7bd249f3..88622e4d 100644 --- a/handler/checker/check_handler.py +++ b/handler/checker/check_handler.py @@ -17,6 +17,7 @@ """ import os +import queue import time import yaml @@ -111,20 +112,15 @@ def __init__(self, context, check_target_type="observer"): self.nodes=new_node self.version=get_version(self.nodes, self.check_target_type,self.cluster, self.stdio) - # add OBConnector - obConnector = None + # add OBConnectorPool + OBConnectorPool = None try: - if self.cluster is not None: - obConnector=OBConnector(ip=self.cluster.get("db_host"), - port=self.cluster.get("db_port"), - username=self.cluster.get("tenant_sys").get("user"), - 
password=self.cluster.get("tenant_sys").get("password"), - stdio=self.stdio, - timeout=10000) + OBConnectorPool=checkOBConnectorPool(context,2,self.cluster) + except Exception as e: self.stdio.warn("obConnector init error. Error info is {0}".format(e)) finally: - self.context.set_variable('check_obConnector', obConnector) + self.context.set_variable('check_obConnector_pool', OBConnectorPool) def handle(self): @@ -249,4 +245,37 @@ def execute(self): except Exception as e: self.stdio.error("Internal error :{0}".format(e)) end_time = time.time() - print("Total cost time is {0} s".format((end_time - start_time))) \ No newline at end of file + print("Total cost time is {0} s".format((end_time - start_time))) +class checkOBConnectorPool: + def __init__(self,context, max_size, cluster): + self.max_size = max_size + self.cluster=cluster + self.connections = queue.Queue(maxsize=max_size) + self.stdio=context.stdio + + def get_connection(self): + try: + if self.connections.qsize() == 0: + if self.connections.qsize() < self.max_size: + conn = OBConnector( + ip=self.cluster.get("db_host"), + port=self.cluster.get("db_port"), + username=self.cluster.get("tenant_sys").get("user"), + password=self.cluster.get("tenant_sys").get("password"), + stdio=self.stdio, + timeout=10000 + ) + self.connections.put(conn) + else: + conn = self.connections.get() + else: + conn = self.connections.get() + return conn + except Exception as e: + return None + + def release_connection(self, conn): + if conn is not None: + self.connections.put(conn) + return + diff --git a/handler/checker/step/sql.py b/handler/checker/step/sql.py index 7d846626..bf1311c8 100644 --- a/handler/checker/step/sql.py +++ b/handler/checker/step/sql.py @@ -32,7 +32,9 @@ def __init__(self,context, step, task_variable_dict,obConnector): self.tenant_mode = None self.sys_database = None self.database = None - self.ob_connector=obConnector + self.ob_connector_pool=self.context.get_variable('check_obConnector_pool',None) + if self.ob_connector_pool is not None: + self.ob_connector=self.ob_connector_pool.get_connection() if self.ob_connector is None: raise Exception("self.ob_connector is None.") except Exception as e: @@ -72,6 +74,8 @@ def execute(self): except Exception as e: self.stdio.error("StepSQLHandler execute Exception: {0}".format(e)) raise StepExecuteFailException("StepSQLHandler execute Exception: {0}".format(e)) + finally: + self.ob_connector_pool.release_connection(self.ob_connector) def update_step_variable_dict(self): return self.task_variable_dict diff --git a/handler/checker/tasks/observer/sysbench/sysbench_free_test_memory_limit.yaml b/handler/checker/tasks/observer/sysbench/sysbench_free_test_memory_limit.yaml index 661d93b7..63412cb6 100644 --- a/handler/checker/tasks/observer/sysbench/sysbench_free_test_memory_limit.yaml +++ b/handler/checker/tasks/observer/sysbench/sysbench_free_test_memory_limit.yaml @@ -30,7 +30,7 @@ task: set_value: result verify_type: between verify: "[80,100]" - err_msg: 'memory_limit/os_memory is #{result}%,is not between 80 and 100' + err_msg: 'memory_limit: #{memory_limit}. os_memory: #{os_memory}. 
memory_limit/os_memory is #{result}%,is not between 80% and 100%' # memory_size - type: sql From 7300bac98ad7319da37c533db64e1741cf4bad56 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B8=A0=E7=A3=8A?= Date: Sun, 28 Apr 2024 16:32:14 +0800 Subject: [PATCH 17/30] test check --- common/ob_connector.py | 14 ++++++- handler/checker/check_handler.py | 38 ++++++++++--------- handler/checker/check_task.py | 12 ++---- handler/checker/step/data_size.py | 2 +- handler/checker/step/get_system_parameter.py | 2 +- handler/checker/step/sql.py | 2 +- handler/checker/step/ssh.py | 2 +- handler/checker/step/stepbase.py | 8 ++-- .../tasks/observer/cluster/deadlocks.yaml | 3 +- .../tasks/observer/system/core_pattern.yaml | 4 +- 10 files changed, 48 insertions(+), 39 deletions(-) diff --git a/common/ob_connector.py b/common/ob_connector.py index 223eb915..b548b89e 100644 --- a/common/ob_connector.py +++ b/common/ob_connector.py @@ -20,7 +20,7 @@ class OBConnector(object): - def __init__(self, ip, port, username, password=None, database=None, stdio=None, timeout=10,): + def __init__(self, ip, port, username, password=None, database=None, stdio=None, timeout=30,): self.ip = str(ip) self.port = int(port) self.username = str(username) @@ -50,6 +50,18 @@ def _connect_db(self): self.stdio.verbose("connect databse ...") except mysql.Error as e: self.stdio.error("connect OB: {0}:{1} with user {2} failed, error:{3}".format(self.ip, self.port, self.username, e)) + return + try: + ob_trx_timeout=self.timeout*1000000 + self.execute_sql("SET SESSION ob_trx_timeout={0};".format(ob_trx_timeout)) + except Exception as e: + self.stdio.warn("set ob_trx_timeout failed, error:{0}".format(e)) + try: + ob_query_timeout=self.timeout*1000000 + self.execute_sql("SET SESSION ob_query_timeout={0};".format(ob_query_timeout)) + except Exception as e: + self.stdio.warn("set ob_query_timeout failed, error:{0}".format(e)) + def execute_sql(self, sql): if self.conn is None: diff --git a/handler/checker/check_handler.py b/handler/checker/check_handler.py index 88622e4d..ca967621 100644 --- a/handler/checker/check_handler.py +++ b/handler/checker/check_handler.py @@ -33,7 +33,6 @@ from common.tool import YamlUtils from common.tool import StringUtils - class CheckHandler: def __init__(self, context, check_target_type="observer"): @@ -113,8 +112,8 @@ def __init__(self, context, check_target_type="observer"): self.version=get_version(self.nodes, self.check_target_type,self.cluster, self.stdio) # add OBConnectorPool - OBConnectorPool = None try: + global OBConnectorPool OBConnectorPool=checkOBConnectorPool(context,2,self.cluster) except Exception as e: @@ -252,29 +251,32 @@ def __init__(self,context, max_size, cluster): self.cluster=cluster self.connections = queue.Queue(maxsize=max_size) self.stdio=context.stdio + self.stdio.verbose("obConnectorPool init success!") + try: + for i in range(max_size): + conn = OBConnector( + ip=self.cluster.get("db_host"), + port=self.cluster.get("db_port"), + username=self.cluster.get("tenant_sys").get("user"), + password=self.cluster.get("tenant_sys").get("password"), + stdio=self.stdio, + timeout=10000 + ) + self.connections.put(conn) + self.stdio.verbose("obConnectorPool init success!") + except Exception as e: + self.stdio.error("obConnectorPool init fail! 
err:".format(e)) + def get_connection(self): try: - if self.connections.qsize() == 0: - if self.connections.qsize() < self.max_size: - conn = OBConnector( - ip=self.cluster.get("db_host"), - port=self.cluster.get("db_port"), - username=self.cluster.get("tenant_sys").get("user"), - password=self.cluster.get("tenant_sys").get("password"), - stdio=self.stdio, - timeout=10000 - ) - self.connections.put(conn) - else: - conn = self.connections.get() - else: - conn = self.connections.get() - return conn + return self.connections.get() except Exception as e: + self.stdio.error("get connection fail! err:".format(e)) return None def release_connection(self, conn): + if conn is not None: self.connections.put(conn) return diff --git a/handler/checker/check_task.py b/handler/checker/check_task.py index b8db28e8..77e04310 100644 --- a/handler/checker/check_task.py +++ b/handler/checker/check_task.py @@ -51,20 +51,14 @@ def execute(self): # TODO: 这里的逻辑需要优化,如果一个节点执行失败了,那么后续的步骤就不会被执行了。 work_threads = [] for node in self.nodes: - obConnector = OBConnector(ip=self.cluster.get("db_host"), - port=self.cluster.get("db_port"), - username=self.cluster.get("tenant_sys").get("user"), - password=self.cluster.get("tenant_sys").get("password"), - stdio=self.stdio, - timeout=10000) - t = threading.Thread(target=self.execute_one_node, args=(steps_nu,node,obConnector)) + t = threading.Thread(target=self.execute_one_node, args=(steps_nu,node)) work_threads.append(t) t.start() for t in work_threads: t.join() self.stdio.verbose("task execute end") - def execute_one_node(self,steps_nu,node,obConnector): + def execute_one_node(self,steps_nu,node): try: self.stdio.verbose("run task in node: {0}".format(StringUtils.node_cut_passwd_for_log(node))) steps = self.task[steps_nu] @@ -74,7 +68,7 @@ def execute_one_node(self,steps_nu,node,obConnector): self.stdio.verbose("step nu: {0}".format(nu)) if len(self.cluster) == 0: raise Exception("cluster is not exist") - step_run = StepBase(self.context, step, node, self.cluster, self.task_variable_dict,obConnector) + step_run = StepBase(self.context, step, node, self.cluster, self.task_variable_dict) self.stdio.verbose("step nu: {0} initted, to execute".format(nu)) step_run.execute(self.report) self.task_variable_dict = step_run.update_task_variable_dict() diff --git a/handler/checker/step/data_size.py b/handler/checker/step/data_size.py index 66dc2ff9..b8d0ff0c 100644 --- a/handler/checker/step/data_size.py +++ b/handler/checker/step/data_size.py @@ -23,7 +23,7 @@ class DataSizeHandler: - def __init__(self,context, step, node, task_variable_dict,obConnector): + def __init__(self,context, step, node, task_variable_dict): self.context = context self.stdio = context.stdio self.stdio.verbose("init DataSizeHandler") diff --git a/handler/checker/step/get_system_parameter.py b/handler/checker/step/get_system_parameter.py index 29713ee3..af3341c1 100644 --- a/handler/checker/step/get_system_parameter.py +++ b/handler/checker/step/get_system_parameter.py @@ -23,7 +23,7 @@ class GetSystemParameterHandler: - def __init__(self,context, step, node, task_variable_dict,obConnector): + def __init__(self,context, step, node, task_variable_dict): self.context = context self.stdio = context.stdio self.stdio.verbose("init GetSystemParameterHandler") diff --git a/handler/checker/step/sql.py b/handler/checker/step/sql.py index bf1311c8..6c0759ae 100644 --- a/handler/checker/step/sql.py +++ b/handler/checker/step/sql.py @@ -23,7 +23,7 @@ class StepSQLHandler: - def __init__(self,context, step, 
task_variable_dict,obConnector): + def __init__(self,context, step, task_variable_dict): try: self.context = context self.stdio = context.stdio diff --git a/handler/checker/step/ssh.py b/handler/checker/step/ssh.py index 4e4c6368..282477e2 100644 --- a/handler/checker/step/ssh.py +++ b/handler/checker/step/ssh.py @@ -23,7 +23,7 @@ class SshHandler: - def __init__(self,context, step, node, task_variable_dict,obConnector): + def __init__(self,context, step, node, task_variable_dict): self.context = context self.stdio = context.stdio self.ssh_report_value = None diff --git a/handler/checker/step/stepbase.py b/handler/checker/step/stepbase.py index aa479251..9eca253f 100644 --- a/handler/checker/step/stepbase.py +++ b/handler/checker/step/stepbase.py @@ -55,13 +55,13 @@ def execute(self, report): if "type" not in self.step: raise StepExecuteFailException("Missing field :type") if self.step["type"] == "get_system_parameter": - handler = GetSystemParameterHandler(self.context, self.step, self.node, self.task_variable_dict,self.obConnector) + handler = GetSystemParameterHandler(self.context, self.step, self.node, self.task_variable_dict) elif self.step["type"] == "ssh": - handler = SshHandler(self.context, self.step, self.node, self.task_variable_dict,self.obConnector) + handler = SshHandler(self.context, self.step, self.node, self.task_variable_dict) elif self.step["type"] == "sql": - handler = StepSQLHandler(self.context, self.step, task_variable_dict=self.task_variable_dict,obConnector=self.obConnector) + handler = StepSQLHandler(self.context, self.step, task_variable_dict=self.task_variable_dict) elif self.step["type"] == "data_size": - handler = DataSizeHandler(self.context, self.step, self.cluster, self.task_variable_dict,self.obConnector) + handler = DataSizeHandler(self.context, self.step, self.cluster, self.task_variable_dict) else: raise StepExecuteFailException("the type not support: {0}".format(self.step["type"])) self.stdio.verbose("task execute and result") diff --git a/handler/checker/tasks/observer/cluster/deadlocks.yaml b/handler/checker/tasks/observer/cluster/deadlocks.yaml index 201359da..690d0227 100644 --- a/handler/checker/tasks/observer/cluster/deadlocks.yaml +++ b/handler/checker/tasks/observer/cluster/deadlocks.yaml @@ -1,6 +1,7 @@ info: "Check whether there is a deadlock." task: - version: "[4.0.0.0,*]" + execute_type: "one_node" steps: - type: sql sql: 'select count(0) from oceanbase.DBA_OB_DEADLOCK_EVENT_HISTORY;' @@ -9,4 +10,4 @@ task: verify_type: equal report_type: warning verify: 0 - err_msg: "There is a deadlock." \ No newline at end of file + err_msg: "There is a deadlock. Please check on the oceanbase.DBA_OB_DEADLOCK_EVENT_HISTORY" \ No newline at end of file diff --git a/handler/checker/tasks/observer/system/core_pattern.yaml b/handler/checker/tasks/observer/system/core_pattern.yaml index a226347f..a708a49a 100644 --- a/handler/checker/tasks/observer/system/core_pattern.yaml +++ b/handler/checker/tasks/observer/system/core_pattern.yaml @@ -5,5 +5,5 @@ task: parameter: kernel.core_pattern result: set_value: core_pattern - verify: '[[ $core_pattern == *" "* ]]' - err_msg: 'clock_source: #{clock_source}. recommended: tsc. 
Uneven CPU utilization during pressure testing resulted in low TPS during pressure testing' + verify: '[[ $core_pattern != *" "* ]]' + err_msg: 'kernel.core_pattern: #{core_pattern} , is not recommended for configuring functions other than the specified core path' From 6590e26ca45ac36d9d02175284c08f38f1b94c90 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B8=A0=E7=A3=8A?= Date: Sun, 28 Apr 2024 17:06:45 +0800 Subject: [PATCH 18/30] update check --- conf/inner_config.yml | 1 - handler/checker/check_handler.py | 5 ++--- handler/checker/check_task.py | 1 - handler/checker/step/stepbase.py | 3 +-- 4 files changed, 3 insertions(+), 7 deletions(-) diff --git a/conf/inner_config.yml b/conf/inner_config.yml index 8c1e0845..f4bf9245 100644 --- a/conf/inner_config.yml +++ b/conf/inner_config.yml @@ -17,7 +17,6 @@ check: report: report_path: "./check_report/" export_type: table - package_file: "~/.obdiag/check/check_package.yaml" tasks_base_path: "~/.obdiag/check/tasks/" gather: scenes_base_path: "~/.obdiag/gather/tasks" diff --git a/handler/checker/check_handler.py b/handler/checker/check_handler.py index ca967621..cc5fdac5 100644 --- a/handler/checker/check_handler.py +++ b/handler/checker/check_handler.py @@ -113,13 +113,12 @@ def __init__(self, context, check_target_type="observer"): # add OBConnectorPool try: - global OBConnectorPool - OBConnectorPool=checkOBConnectorPool(context,2,self.cluster) + obConnectorPool=checkOBConnectorPool(context,2,self.cluster) except Exception as e: self.stdio.warn("obConnector init error. Error info is {0}".format(e)) finally: - self.context.set_variable('check_obConnector_pool', OBConnectorPool) + self.context.set_variable('check_obConnector_pool', obConnectorPool) def handle(self): diff --git a/handler/checker/check_task.py b/handler/checker/check_task.py index 77e04310..9a4476a4 100644 --- a/handler/checker/check_task.py +++ b/handler/checker/check_task.py @@ -71,7 +71,6 @@ def execute_one_node(self,steps_nu,node): step_run = StepBase(self.context, step, node, self.cluster, self.task_variable_dict) self.stdio.verbose("step nu: {0} initted, to execute".format(nu)) step_run.execute(self.report) - self.task_variable_dict = step_run.update_task_variable_dict() if "report_type" in step["result"] and step["result"]["report_type"] == "execution": self.stdio.verbose("report_type stop this step") return diff --git a/handler/checker/step/stepbase.py b/handler/checker/step/stepbase.py index 9eca253f..3ec68d8e 100644 --- a/handler/checker/step/stepbase.py +++ b/handler/checker/step/stepbase.py @@ -27,7 +27,7 @@ class StepBase(object): - def __init__(self, context, step, node, cluster, task_variable_dict,obConnector): + def __init__(self, context, step, node, cluster, task_variable_dict): self.context = context self.stdio = context.stdio self.step = step @@ -35,7 +35,6 @@ def __init__(self, context, step, node, cluster, task_variable_dict,obConnector) self.cluster = cluster self.task_variable_dict = {} self.task_variable_dict = task_variable_dict - self.obConnector=obConnector def execute(self, report): no_cluster_name_msg = "(Please set ob_cluster_name or obproxy_cluster_name)" From 1fcabe416058642baf2d7766f7150449ab23c6c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B8=A0=E7=A3=8A?= Date: Sun, 28 Apr 2024 18:01:15 +0800 Subject: [PATCH 19/30] add check case --- handler/checker/check_handler.py | 2 +- handler/checker/step/sql.py | 3 ++- .../checker/tasks/observer/bugs/bug_182.yaml | 17 +++++++++++++++++ 3 files changed, 20 insertions(+), 2 deletions(-) create mode 100644 
handler/checker/tasks/observer/bugs/bug_182.yaml

diff --git a/handler/checker/check_handler.py b/handler/checker/check_handler.py
index cc5fdac5..81f46cc9 100644
--- a/handler/checker/check_handler.py
+++ b/handler/checker/check_handler.py
@@ -113,7 +113,7 @@ def __init__(self, context, check_target_type="observer"):

         # add OBConnectorPool
         try:
-            obConnectorPool=checkOBConnectorPool(context,2,self.cluster)
+            obConnectorPool=checkOBConnectorPool(context,3,self.cluster)

         except Exception as e:
             self.stdio.warn("obConnector init error. Error info is {0}".format(e))
diff --git a/handler/checker/step/sql.py b/handler/checker/step/sql.py
index 6c0759ae..b7194bdc 100644
--- a/handler/checker/step/sql.py
+++ b/handler/checker/step/sql.py
@@ -61,8 +61,9 @@ def execute(self):
             if data is None:
                 self.stdio.warn("sql result is None: {0}".format(self.step["sql"]))
             self.stdio.verbose("execute_sql result:{0}".format(data))
-            if len(data) == 0:
+            if data is None or len(data) == 0:
                 self.stdio.warn("sql result is None: {0}".format(self.step["sql"]))
+                data = ""
             else:
                 data = data[0][0]
             if data is None:
diff --git a/handler/checker/tasks/observer/bugs/bug_182.yaml b/handler/checker/tasks/observer/bugs/bug_182.yaml
new file mode 100644
index 00000000..5745b6cd
--- /dev/null
+++ b/handler/checker/tasks/observer/bugs/bug_182.yaml
@@ -0,0 +1,17 @@
+info: "OB has been upgraded to version 4.2.1, and DDL on some partition tables fails with error code -4109 and error message: Server state or role not the same as expected. github issue #182"
+task:
+  - version: "[4.0.0.0,*]"
+    steps:
+      - type: sql
+        sql: 'select tenant_id, table_id, table_name, database_id, table_type, load_type, def_type, rowkey_column_num, index_column_num, max_used_column_id, autoinc_column_id, auto_increment, read_only, rowkey_split_pos, compress_func_name, expire_condition, is_use_bloomfilter, comment, block_size, collation_type, data_table_id, index_status, tablegroup_id, progressive_merge_num, index_type, part_level, part_func_type, part_func_expr, part_num, sub_part_func_type, sub_part_func_expr, sub_part_num, schema_version, view_definition, view_check_option, view_is_updatable, index_using_type, parser_name, index_attributes_set, tablet_size, pctfree, partition_status, partition_schema_version, session_id, pk_comment, sess_active_time, row_store_type, store_format, duplicate_scope, progressive_merge_round, storage_format_version, table_mode, encryption, tablespace_id, sub_part_template_flags, dop, character_set_client, collation_connection, auto_part_size, auto_part, association_table_id, tablet_id, max_dependency_version, define_user_id, transition_point, b_transition_point, interval_range, b_interval_range, object_status, table_flags, truncate_version, 0 as is_deleted from OCEANBASE.__all_table
+EXCEPT select
+ t1.tenant_id, t1.table_id, t1.table_name, t1.database_id, t1.table_type, t1.load_type, t1.def_type, t1.rowkey_column_num, t1.index_column_num, t1.max_used_column_id, t1.autoinc_column_id, t1.auto_increment, t1.read_only, t1.rowkey_split_pos, t1.compress_func_name, t1.expire_condition, t1.is_use_bloomfilter, t1.comment, t1.block_size, t1.collation_type, t1.data_table_id, t1.index_status, t1.tablegroup_id, t1.progressive_merge_num, t1.index_type, t1.part_level, t1.part_func_type, t1.part_func_expr, t1.part_num, t1.sub_part_func_type, t1.sub_part_func_expr, t1.sub_part_num, t1.schema_version, t1.view_definition, t1.view_check_option, t1.view_is_updatable, t1.index_using_type, t1.parser_name, t1.index_attributes_set, t1.tablet_size, 
t1.pctfree, t1.partition_status, t1.partition_schema_version, t1.session_id, t1.pk_comment, t1.sess_active_time, t1.row_store_type, t1.store_format, t1.duplicate_scope, t1.progressive_merge_round, t1.storage_format_version, t1.table_mode, t1.encryption, t1.tablespace_id, t1.sub_part_template_flags, t1.dop, t1.character_set_client, t1.collation_connection, t1.auto_part_size, t1.auto_part, t1.association_table_id, t1.tablet_id, t1.max_dependency_version, t1.define_user_id, t1.transition_point, t1.b_transition_point, t1.interval_range, t1.b_interval_range, t1.object_status, t1.table_flags, t1.truncate_version, t1.is_deleted + from + OCEANBASE.__all_table_history t1 + inner join (select t2.table_id,max(t2.schema_version) as schema_version from OCEANBASE.__all_table_history t2 group by t2.table_id)as t3 + on t1.table_id = t3.table_id and t1.schema_version = t3.schema_version and t1.is_deleted = 0;' + result: + set_value: error_table + report_type: warning + verify: '[ -z "$error_table" ]' + err_msg: "Some partition tables are inconsistent. Please get bug's on https://github.com/oceanbase/obdiag/issues/182" From 2ffbeb8178089f3fb6e5688b3be429b8b3dcc137 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B8=A0=E7=A3=8A?= Date: Sun, 28 Apr 2024 20:15:51 +0800 Subject: [PATCH 20/30] fix case --- handler/checker/check_handler.py | 4 +--- handler/checker/tasks/observer/cluster/deadlocks.yaml | 1 - .../observer/table/information_schema_tables_two_data.yaml | 3 +-- 3 files changed, 2 insertions(+), 6 deletions(-) diff --git a/handler/checker/check_handler.py b/handler/checker/check_handler.py index 81f46cc9..f225832c 100644 --- a/handler/checker/check_handler.py +++ b/handler/checker/check_handler.py @@ -225,7 +225,6 @@ def execute_one(self, task_name): raise CheckException("execute_one Exception : {0}".format(e)) def execute(self): - start_time = time.time() try: self.stdio.verbose( "execute_all_tasks. the number of tasks is {0} ,tasks is {1}".format(len(self.tasks.keys()), @@ -242,8 +241,7 @@ def execute(self): self.stdio.error("Report error :{0}".format(e)) except Exception as e: self.stdio.error("Internal error :{0}".format(e)) - end_time = time.time() - print("Total cost time is {0} s".format((end_time - start_time))) + class checkOBConnectorPool: def __init__(self,context, max_size, cluster): self.max_size = max_size diff --git a/handler/checker/tasks/observer/cluster/deadlocks.yaml b/handler/checker/tasks/observer/cluster/deadlocks.yaml index 690d0227..e05c7034 100644 --- a/handler/checker/tasks/observer/cluster/deadlocks.yaml +++ b/handler/checker/tasks/observer/cluster/deadlocks.yaml @@ -1,7 +1,6 @@ info: "Check whether there is a deadlock." task: - version: "[4.0.0.0,*]" - execute_type: "one_node" steps: - type: sql sql: 'select count(0) from oceanbase.DBA_OB_DEADLOCK_EVENT_HISTORY;' diff --git a/handler/checker/tasks/observer/table/information_schema_tables_two_data.yaml b/handler/checker/tasks/observer/table/information_schema_tables_two_data.yaml index f6bce02c..76d450b7 100644 --- a/handler/checker/tasks/observer/table/information_schema_tables_two_data.yaml +++ b/handler/checker/tasks/observer/table/information_schema_tables_two_data.yaml @@ -1,4 +1,4 @@ -info: 'A table found two records in information_schema.tables. KBA-50000056' +info: 'A table found two records in information_schema.tables.' 
task:
   - version: "[4.0.0.0,*]"
     steps:
@@ -6,7 +6,6 @@ task:
         sql: 'select count(0) from oceanbase.__all_virtual_table_stat where table_id = partition_id and (tenant_id,table_id) in (select tenant_id, table_id from oceanbase.__all_virtual_table where part_level != 0);'
         result:
           set_value: err_table_count
-          # report_type: warning
           verify_type: equal
           verify: 0
           err_msg: 'Find have table found two records in information_schema.tables. the number of err_table_count is : #{err_table_count}. Please get more info by "select * from oceanbase.__all_virtual_table_stat where table_id = partition_id and (tenant_id,table_id) in (select tenant_id, table_id from oceanbase.__all_virtual_table where part_level != 0);". And you can by "delete from __all_table_stat where table_id=partition_id and table_id=${partition table table_id};" and "delete from __all_column_stat where table_id=partition_id and table_id=${partition table table_id};" to fix it.'

From 65132b43cf3abdf93739bf3eaa219b398cb77bd0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=B8=A0=E7=A3=8A?=
Date: Mon, 29 Apr 2024 17:42:01 +0800
Subject: [PATCH 21/30] support ash

---
 cmd.py                              |  32 +++++
 common/ssh.py                       |  19 ++-
 conf/inner_config.yml               |   1 +
 config.py                           |   1 +
 core.py                             |  10 +-
 handler/gather/gather_ash_report.py | 193 ++++++++++++++++++++++
 6 files changed, 251 insertions(+), 5 deletions(-)
 create mode 100644 handler/gather/gather_ash_report.py

diff --git a/cmd.py b/cmd.py
index 0c0fcfb1..071face0 100644
--- a/cmd.py
+++ b/cmd.py
@@ -562,6 +562,38 @@ def _do_command(self, obdiag):
         return obdiag.gather_function('gather_scenes_run', self.opts)


+class ObdiagGatherAshReportRunCommand(ObdiagOriginCommand):
+
+    def __init__(self):
+        super(ObdiagGatherAshReportRunCommand, self).__init__('run', 'gather ash report')
+        self.parser.add_option('--trace_id', type='string', help="The TRACE.ID of the SQL to be sampled, if left blank or filled with NULL, indicates that TRACE.ID is not restricted.")
+        self.parser.add_option('--sql_id', type='string', help="The SQL.ID, if left blank or filled with NULL, indicates that SQL.ID is not restricted.")
+        #WAIT_CLASS
+        self.parser.add_option('--wait_class', type='string',
+                               help='Event types to be sampled.')
+        self.parser.add_option('--report_type', type='string',
+                               help='Report type.', default='TEXT')
+        self.parser.add_option('--from', type='string',
+                               help="specify the start of the time range. format: 'yyyy-mm-dd hh:mm:ss'")
+        self.parser.add_option('--to', type='string',
+                               help="specify the end of the time range. format: 'yyyy-mm-dd hh:mm:ss'")
+        self.parser.add_option('--since', type='string',
+                               help="specify a time range of 'n' [d]ays, 'n' [h]ours or 'n' [m]inutes before now. format: <n><m|h|d>. 
example: 1h.", + default='30m') + self.parser.add_option('--store_dir', type='string', + help='the dir to store gather result, current dir by default.', default='./') + + self.parser.add_option('-c', type='string', help='obdiag custom config', + default=os.path.expanduser('~/.obdiag/config.yml')) + + def init(self, cmd, args): + super(ObdiagGatherAshReportRunCommand, self).init(cmd, args) + return self + + def _do_command(self, obdiag): + return obdiag.gather_function('gather_ash_report', self.opts) + + class ObdiagAnalyzeLogCommand(ObdiagOriginCommand): def __init__(self): diff --git a/common/ssh.py b/common/ssh.py index 9c18b6b2..f58b8994 100644 --- a/common/ssh.py +++ b/common/ssh.py @@ -811,7 +811,14 @@ def file_uploader(self, local_dir, remote_dir, stdio=None): except: stdio.exception("") stdio.verbose('Failed to get %s' % remote_dir) - +# TODO ENV_DISABLE_RSA_ALGORITHMS need get by context.inner_context +ENV_DISABLE_RSA_ALGORITHMS=0 +def dis_rsa_algorithms(state=0): + """ + Disable RSA algorithms in OpenSSH server. + """ + global ENV_DISABLE_RSA_ALGORITHMS + ENV_DISABLE_RSA_ALGORITHMS=state class SshHelper(object): def __init__(self, is_ssh=None, host_ip=None, username=None, password=None, ssh_port=None, key_file=None, node=None, stdio=None): @@ -851,17 +858,21 @@ def __init__(self, is_ssh=None, host_ip=None, username=None, password=None, ssh_ return if self.is_ssh: + self._disabled_rsa_algorithms=None + DISABLED_ALGORITHMS = dict(pubkeys=["rsa-sha2-512", "rsa-sha2-256"]) + if ENV_DISABLE_RSA_ALGORITHMS == 1: + self._disabled_rsa_algorithms = DISABLED_ALGORITHMS self.ssh_type = "remote" if len(self.key_file) > 0: try: self._ssh_fd = paramiko.SSHClient() self._ssh_fd.set_missing_host_key_policy(paramiko.client.AutoAddPolicy()) self._ssh_fd.load_system_host_keys() - self._ssh_fd.connect(hostname=host_ip, username=username, key_filename=self.key_file, port=ssh_port) + self._ssh_fd.connect(hostname=host_ip, username=username, key_filename=self.key_file, port=ssh_port,disabled_algorithms=self._disabled_rsa_algorithms) except AuthenticationException: self.password = input("Authentication failed, Input {0}@{1} password:\n".format(username, host_ip)) self.need_password = True - self._ssh_fd.connect(hostname=host_ip, username=username, password=password, port=ssh_port) + self._ssh_fd.connect(hostname=host_ip, username=username, password=password, port=ssh_port,disabled_algorithms=self._disabled_rsa_algorithms) except Exception as e: raise OBDIAGSSHConnException("ssh {0}@{1}: failed, exception:{2}".format(username, host_ip, e)) else: @@ -869,7 +880,7 @@ def __init__(self, is_ssh=None, host_ip=None, username=None, password=None, ssh_ self._ssh_fd.set_missing_host_key_policy(paramiko.client.AutoAddPolicy()) self._ssh_fd.load_system_host_keys() self.need_password = True - self._ssh_fd.connect(hostname=host_ip, username=username, password=password, port=ssh_port) + self._ssh_fd.connect(hostname=host_ip, username=username, password=password, port=ssh_port,disabled_algorithms=self._disabled_rsa_algorithms) def ssh_exec_cmd(self, cmd): if self.ssh_type == "docker": diff --git a/conf/inner_config.yml b/conf/inner_config.yml index f4bf9245..9b3fdcd1 100644 --- a/conf/inner_config.yml +++ b/conf/inner_config.yml @@ -4,6 +4,7 @@ obdiag: config_backup_dir: ~/.obdiag/backup_conf file_number_limit: 20 file_size_limit: 2G + dis_rsa_algorithms: 0 logger: log_dir: ~/.obdiag/log log_filename: obdiag.log diff --git a/config.py b/config.py index fb01eb8d..d79fcb9c 100644 --- a/config.py +++ b/config.py @@ -64,6 
+64,7 @@ 'config_backup_dir': '~/.obdiag/backup_conf', 'file_number_limit': 20, 'file_size_limit': '2G', + 'dis_rsa_algorithms':0, }, 'logger': { 'log_dir': '~/.obdiag/log', diff --git a/core.py b/core.py index b5d7dcf3..ef8c41c8 100644 --- a/core.py +++ b/core.py @@ -21,9 +21,10 @@ from optparse import Values from copy import copy +from handler.gather.gather_ash_report import GatherAshReportHandler from handler.rca.rca_handler import RCAHandler from handler.rca.rca_list import RcaScenesListHandler -from common.ssh import SshClient, SshConfig +from common.ssh import SshClient, SshConfig, dis_rsa_algorithms from context import HandlerContextNamespace, HandlerContext from config import ConfigManager, InnerConfigManager from err import CheckStatus, SUG_SSH_FAILED @@ -67,6 +68,10 @@ def __init__(self, stdio=None, config_path=os.path.expanduser('~/.obdiag/config. "basic") is not None and self.inner_config_manager.config.get("obdiag").get("basic").get( "telemetry") is not None and self.inner_config_manager.config.get("obdiag").get("basic").get("telemetry") is False: telemetry.work_tag = False + if self.inner_config_manager.config.get("obdiag") is not None and self.inner_config_manager.config.get("obdiag").get( + "basic") is not None and self.inner_config_manager.config.get("obdiag").get("basic").get("dis_rsa_algorithms") is not None : + disable_rsa_algorithms=self.inner_config_manager.config.get("obdiag").get("basic").get("dis_rsa_algorithms") + dis_rsa_algorithms(disable_rsa_algorithms) def fork(self, cmds=None, options=None, stdio=None): new_obdiag = copy(self) @@ -236,6 +241,9 @@ def gather_function(self, function_type, opt): elif function_type == 'gather_scenes_run': handler = GatherSceneHandler(self.context) return handler.handle() + elif function_type == 'gather_ash_report': + handler =GatherAshReportHandler(self.context) + return handler.handle() else: self._call_stdio('error', 'Not support gather function: {0}'.format(function_type)) return False diff --git a/handler/gather/gather_ash_report.py b/handler/gather/gather_ash_report.py new file mode 100644 index 00000000..03135441 --- /dev/null +++ b/handler/gather/gather_ash_report.py @@ -0,0 +1,193 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -* +# Copyright (c) 2022 OceanBase +# OceanBase Diagnostic Tool is licensed under Mulan PSL v2. +# You can use this software according to the terms and conditions of the Mulan PSL v2. +# You may obtain a copy of Mulan PSL v2 at: +# http://license.coscl.org.cn/MulanPSL2 +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, +# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, +# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +# See the Mulan PSL v2 for more details. 
+ +""" +@time: 2024/4/28 +@file: gather_ash_report.py +@desc: +""" +import datetime +import os + +from common.ob_connector import OBConnector +from common.obdiag_exception import OBDIAGFormatException, OBDIAGException +from common.tool import DirectoryUtil, TimeUtils, Util +from stdio import SafeStdio +from colorama import Fore, Style + + +class GatherAshReportHandler(SafeStdio): + def __init__(self, context, gather_pack_dir='./'): + super().__init__() + self.report_type = None + self.wait_class = None + self.sql_id = None + self.ash_report_file_name = None + self.from_time_str = None + self.to_time_str = None + self.ash_sql = None + self.trace_id = None + self.context = context + self.stdio = self.context.stdio + self.gather_pack_dir = gather_pack_dir + if self.context.get_variable("gather_timestamp", None) : + self.gather_timestamp=self.context.get_variable("gather_timestamp") + else: + self.gather_timestamp = TimeUtils.get_current_us_timestamp() + self.cluster = self.context.cluster_config + try: + self.obconn = OBConnector( + ip=self.cluster.get("db_host"), + port=self.cluster.get("db_port"), + username=self.cluster.get("tenant_sys").get("user"), + password=self.cluster.get("tenant_sys").get("password"), + stdio=self.stdio, + timeout=10000 + ) + except Exception as e: + self.stdio.error("Failed to connect to database: {0}".format(e)) + raise OBDIAGFormatException("Failed to connect to database: {0}".format(e)) + + + def handle(self): + if not self.init_option(): + self.stdio.error('init option failed') + return False + if not self.init_config(): + self.stdio.error('init config failed') + return False + self.__init_variables() + self.__init_report_path() + self.__init_task_names() + self.execute() + self.__print_result() + + def execute(self): + try: + self.stdio.verbose("execute_tasks. 
the number of tasks is {0} ,tasks is {1}".format(len(self.yaml_tasks.keys()), self.yaml_tasks.keys())) + + except Exception as e: + self.stdio.error("Internal error :{0}".format(e)) + + + def __init_report_path(self): + try: + self.report_path = os.path.join(self.gather_pack_dir, "gather_pack_{0}".format(TimeUtils.timestamp_to_filename_time(self.gather_timestamp), self.stdio)) + self.stdio.verbose("Use {0} as pack dir.".format(self.report_path)) + DirectoryUtil.mkdir(path=self.report_path, stdio=self.stdio) + except Exception as e: + self.stdio.error("init_report_path failed, error:{0}".format(e)) + + def __init_variables(self): + try: + self.variables = { + "observer_data_dir": self.ob_nodes[0].get("home_path") if self.ob_nodes and self.ob_nodes[0].get("home_path") else "", + "obproxy_data_dir": self.obproxy_nodes[0].get("home_path") if self.obproxy_nodes and self.obproxy_nodes[0].get("home_path") else "", + "from_time": self.from_time_str, + "to_time": self.to_time_str + } + self.stdio.verbose("gather scene variables: {0}".format(self.variables)) + except Exception as e: + self.stdio.error("init gather scene variables failed, error: {0}".format(e)) + + + def init_option(self): + options = self.context.options + from_option = Util.get_option(options, 'from') + to_option = Util.get_option(options, 'to') + store_dir_option = Util.get_option(options, 'store_dir',"./") + trace_id_option = Util.get_option(options, 'trace_id') + sql_id_option = Util.get_option(options, 'sql_id') + report_type_option = Util.get_option(options, 'report_type') + wait_class_option = Util.get_option(options, 'wait_class') + + if from_option is not None and to_option is not None: + try: + from_timestamp = TimeUtils.parse_time_str(from_option) + to_timestamp = TimeUtils.parse_time_str(to_option) + self.from_time_str = from_option + self.to_time_str = to_option + except OBDIAGFormatException: + self.stdio.exception('Error: Datetime is invalid. Must be in format yyyy-mm-dd hh:mm:ss. 
from_datetime={0}, to_datetime={1}'.format(from_option, to_option)) + return False + if to_timestamp <= from_timestamp: + self.stdio.exception('Error: from datetime is larger than to datetime, please check.') + return False + elif (from_option is None or to_option is None) and since_option is not None: + now_time = datetime.datetime.now() + self.to_time_str = (now_time + datetime.timedelta(minutes=1)).strftime('%Y-%m-%d %H:%M:%S') + self.from_time_str = (now_time - datetime.timedelta(seconds=TimeUtils.parse_time_length_to_sec(since_option))).strftime('%Y-%m-%d %H:%M:%S') + self.stdio.print('gather from_time: {0}, to_time: {1}'.format(self.from_time_str, self.to_time_str)) + else: + self.stdio.warn('No time option provided, default processing is based on the last 30 minutes') + now_time = datetime.datetime.now() + self.to_time_str = (now_time + datetime.timedelta(minutes=1)).strftime('%Y-%m-%d %H:%M:%S') + if since_option: + self.from_time_str = (now_time - datetime.timedelta(seconds=TimeUtils.parse_time_length_to_sec(since_option))).strftime('%Y-%m-%d %H:%M:%S') + else: + self.from_time_str = (now_time - datetime.timedelta(minutes=30)).strftime('%Y-%m-%d %H:%M:%S') + self.stdio.print('gather from_time: {0}, to_time: {1}'.format(self.from_time_str, self.to_time_str)) + if store_dir_option: + if not os.path.exists(os.path.abspath(store_dir_option)): + self.stdio.warn('warn: args --store_dir [{0}] incorrect: No such directory, Now create it'.format(os.path.abspath(store_dir_option))) + os.makedirs(os.path.abspath(store_dir_option)) + self.gather_pack_dir = os.path.abspath(store_dir_option) + self.ash_sql="CALL DBMS_WORKLOAD_REPOSITORY.ASH_REPORT( '{0}', '{1}'".format(self.from_time_str, self.to_time_str) + if sql_id_option: + self.sql_id = sql_id_option + self.ash_sql = self.ash_sql + ", '{0}'".format(self.sql_id) + else: + self.ash_sql = self.ash_sql + ", NULL" + if trace_id_option: + self.trace_id = trace_id_option + self.ash_sql = self.ash_sql + ", '{0}'".format(self.trace_id) + else: + self.ash_sql = self.ash_sql + ", NULL" + if wait_class_option: + self.wait_class = wait_class_option + self.ash_sql = self.ash_sql + ", '{0}'".format(self.wait_class) + else: + self.ash_sql = self.ash_sql + ", NULL" + if report_type_option: + self.report_type = report_type_option + self.ash_sql = self.ash_sql + ", '{0}'".format(self.report_type) + else: + self.ash_sql = self.ash_sql + ", NULL" + + try: + self.ash_sql = self.ash_sql + ");" + self.stdio.verbose("ash_sql: {0}".format(self.ash_sql)) + + ash_report_data=self.obconn.execute_sql(self.ash_sql) + if not ash_report_data or len(ash_report_data)==0: + self.stdio.error("ash report data is empty") + raise OBDIAGException("ash report data is empty") + ash_report=ash_report_data[0] + + # save ash_report_data + self.ash_report_file_name="ash_report_{0}.txt".format(TimeUtils.timestamp_to_filename_time(self.gather_timestamp)) + + with open(self.report_path + "/"+self.ash_report_file_name, 'w') as f: + f.write(ash_report) + except Exception as e: + self.stdio.error("ash report gather failed, error message: {0}".format(e)) + return False + + return True + + def __print_result(self): + self.stdio.print(Fore.YELLOW + "\nGather scene results stored in this directory: {0}\n".format(self.report_path + "/"+self.ash_report_file_name) + Style.RESET_ALL) + + + + + From 4d9210469d5307d497bf109501b5a5d2b2fcacf8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B8=A0=E7=A3=8A?= Date: Mon, 29 Apr 2024 20:39:20 +0800 Subject: [PATCH 22/30] support ash --- cmd.py | 23 +++-- 
common/ob_connector.py | 9 ++ handler/gather/gather_ash_report.py | 136 +++++++++++++--------------- init_obdiag_cmd.sh | 2 +- 4 files changed, 82 insertions(+), 88 deletions(-) diff --git a/cmd.py b/cmd.py index 071face0..3e8c9a0f 100644 --- a/cmd.py +++ b/cmd.py @@ -562,24 +562,23 @@ def _do_command(self, obdiag): return obdiag.gather_function('gather_scenes_run', self.opts) -class ObdiagGatherAshReportRunCommand(ObdiagOriginCommand): +class ObdiagGatherAshReportCommand(ObdiagOriginCommand): def __init__(self): - super(ObdiagGatherAshReportRunCommand, self).__init__('run', 'gather ash report') - self.parser.add_option('--trace_id', type='string', help="The TRACE.ID of the SQL to be sampled, if left blank or filled with NULL, indicates that TRACE.ID is not restricted.") - self.parser.add_option('--sql_id', type='string', help="The SQL.ID, if left blank or filled with NULL, indicates that SQL.ID is not restricted.") - #WAIT_CLASS + super(ObdiagGatherAshReportCommand, self).__init__('ash', 'gather ash report') + self.parser.add_option('--trace_id', type='string', + help="The TRACE.ID of the SQL to be sampled, if left blank or filled with NULL, indicates that TRACE.ID is not restricted.") + self.parser.add_option('--sql_id', type='string', + help="The SQL.ID, if left blank or filled with NULL, indicates that SQL.ID is not restricted.") + # WAIT_CLASS self.parser.add_option('--wait_class', type='string', - help='Report type.',default='TEXT') - self.parser.add_option('--report_type', type='string', help='Event types to be sampled.') + self.parser.add_option('--report_type', type='string', + help='Report type, currently only supports text type.', default='TEXT') self.parser.add_option('--from', type='string', help="specify the start of the time range. format: 'yyyy-mm-dd hh:mm:ss'") self.parser.add_option('--to', type='string', help="specify the end of the time range. format: 'yyyy-mm-dd hh:mm:ss'") - self.parser.add_option('--since', type='string', - help="Specify time range that from 'n' [d]ays, 'n' [h]ours or 'n' [m]inutes. before to now. format: . example: 1h.", - default='30m') self.parser.add_option('--store_dir', type='string', help='the dir to store gather result, current dir by default.', default='./') @@ -587,7 +586,7 @@ def __init__(self): default=os.path.expanduser('~/.obdiag/config.yml')) def init(self, cmd, args): - super(ObdiagGatherAshReportRunCommand, self).init(cmd, args) + super(ObdiagGatherAshReportCommand, self).init(cmd, args) return self def _do_command(self, obdiag): @@ -600,7 +599,6 @@ def __init__(self): super(ObdiagAnalyzeLogCommand, self).__init__('log', 'Analyze oceanbase log from online observer machines or offline oceanbase log files') self.parser.add_option('--from', type='string', help="specify the start of the time range. format: 'yyyy-mm-dd hh:mm:ss'") self.parser.add_option('--to', type='string', help="specify the end of the time range. format: 'yyyy-mm-dd hh:mm:ss'") - self.parser.add_option('--since', type='string', help="Specify time range that from 'n' [d]ays, 'n' [h]ours or 'n' [m]inutes. before to now. format: . 
example: 1h.", default='30m') self.parser.add_option('--scope', type='string', help="log type constrains, choices=[observer, election, rootservice, all]", default='all') self.parser.add_option('--grep', action="append", type='string', help="specify keywords constrain") self.parser.add_option('--log_level', type='string', help="oceanbase logs greater than or equal to this level will be analyze, choices=[DEBUG, TRACE, INFO, WDIAG, WARN, EDIAG, ERROR]") @@ -745,6 +743,7 @@ def __init__(self): self.register_command(ObdiagGatherAwrCommand()) self.register_command(ObdiagGatherObproxyLogCommand()) self.register_command(ObdiagGatherSceneCommand()) + self.register_command(ObdiagGatherAshReportCommand()) class ObdiagGatherSceneCommand(MajorCommand): diff --git a/common/ob_connector.py b/common/ob_connector.py index b548b89e..e002c9e4 100644 --- a/common/ob_connector.py +++ b/common/ob_connector.py @@ -114,3 +114,12 @@ def execute_sql_pretty(self, sql): ret = from_db_cursor(cursor) cursor.close() return ret + def callproc(self, procname, args=()): + if self.conn is None: + self._connect_db() + else: + self.conn.ping(reconnect=True) + cursor = self.conn.cursor() + cursor.callproc(procname, args) + ret = cursor.fetchall() + return ret diff --git a/handler/gather/gather_ash_report.py b/handler/gather/gather_ash_report.py index 03135441..6bb6bcf3 100644 --- a/handler/gather/gather_ash_report.py +++ b/handler/gather/gather_ash_report.py @@ -39,45 +39,58 @@ def __init__(self, context, gather_pack_dir='./'): self.context = context self.stdio = self.context.stdio self.gather_pack_dir = gather_pack_dir - if self.context.get_variable("gather_timestamp", None) : - self.gather_timestamp=self.context.get_variable("gather_timestamp") + if self.context.get_variable("gather_timestamp", None): + self.gather_timestamp = self.context.get_variable("gather_timestamp") else: self.gather_timestamp = TimeUtils.get_current_us_timestamp() self.cluster = self.context.cluster_config try: self.obconn = OBConnector( - ip=self.cluster.get("db_host"), - port=self.cluster.get("db_port"), - username=self.cluster.get("tenant_sys").get("user"), - password=self.cluster.get("tenant_sys").get("password"), - stdio=self.stdio, - timeout=10000 - ) + ip=self.cluster.get("db_host"), + port=self.cluster.get("db_port"), + username=self.cluster.get("tenant_sys").get("user"), + password=self.cluster.get("tenant_sys").get("password"), + stdio=self.stdio, + timeout=10000, + database="oceanbase" + ) except Exception as e: self.stdio.error("Failed to connect to database: {0}".format(e)) raise OBDIAGFormatException("Failed to connect to database: {0}".format(e)) - def handle(self): if not self.init_option(): self.stdio.error('init option failed') return False - if not self.init_config(): - self.stdio.error('init config failed') - return False - self.__init_variables() self.__init_report_path() - self.__init_task_names() self.execute() self.__print_result() def execute(self): try: - self.stdio.verbose("execute_tasks. 
the number of tasks is {0} ,tasks is {1}".format(len(self.yaml_tasks.keys()), self.yaml_tasks.keys())) + ash_report_arg = (self.from_time_str, self.to_time_str, self.sql_id, self.trace_id, self.wait_class, self.report_type) + self.stdio.verbose("ash report arg: {0}".format(ash_report_arg)) + ash_report_data = self.obconn.callproc("DBMS_WORKLOAD_REPOSITORY.ASH_REPORT", args=ash_report_arg) + if not ash_report_data or len(ash_report_data) == 0: + self.stdio.error("ash report data is empty") + raise OBDIAGException("ash report data is empty") + ash_report = ash_report_data[0][0] + if len(ash_report) > 1: + self.stdio.verbose("ash report: \n{0}".format(ash_report)) + else: + raise OBDIAGException("ash report data is empty") - except Exception as e: - self.stdio.error("Internal error :{0}".format(e)) + # save ash_report_data + self.ash_report_file_name = "ash_report_{0}.txt".format( + TimeUtils.timestamp_to_filename_time(self.gather_timestamp)) + self.ash_report_file_name=os.path.join(self.report_path, "result_summary.txt") + self.stdio.verbose("save ash report file name: {0}".format(self.ash_report_file_name)) + + with open(self.ash_report_file_name, 'w+') as f: + f.write(ash_report) + except Exception as e: + self.stdio.error("ash report gather failed, error message: {0}".format(e)) def __init_report_path(self): try: @@ -87,29 +100,19 @@ def __init_report_path(self): except Exception as e: self.stdio.error("init_report_path failed, error:{0}".format(e)) - def __init_variables(self): - try: - self.variables = { - "observer_data_dir": self.ob_nodes[0].get("home_path") if self.ob_nodes and self.ob_nodes[0].get("home_path") else "", - "obproxy_data_dir": self.obproxy_nodes[0].get("home_path") if self.obproxy_nodes and self.obproxy_nodes[0].get("home_path") else "", - "from_time": self.from_time_str, - "to_time": self.to_time_str - } - self.stdio.verbose("gather scene variables: {0}".format(self.variables)) - except Exception as e: - self.stdio.error("init gather scene variables failed, error: {0}".format(e)) def init_option(self): options = self.context.options from_option = Util.get_option(options, 'from') to_option = Util.get_option(options, 'to') - store_dir_option = Util.get_option(options, 'store_dir',"./") trace_id_option = Util.get_option(options, 'trace_id') sql_id_option = Util.get_option(options, 'sql_id') report_type_option = Util.get_option(options, 'report_type') wait_class_option = Util.get_option(options, 'wait_class') + store_dir_option = Util.get_option(options, 'store_dir' ) + since_option = "30m" if from_option is not None and to_option is not None: try: from_timestamp = TimeUtils.parse_time_str(from_option) @@ -117,77 +120,60 @@ def init_option(self): self.from_time_str = from_option self.to_time_str = to_option except OBDIAGFormatException: - self.stdio.exception('Error: Datetime is invalid. Must be in format yyyy-mm-dd hh:mm:ss. from_datetime={0}, to_datetime={1}'.format(from_option, to_option)) + self.stdio.exception( + 'Error: Datetime is invalid. Must be in format yyyy-mm-dd hh:mm:ss. 
from_datetime={0}, to_datetime={1}'.format( + from_option, to_option)) return False if to_timestamp <= from_timestamp: self.stdio.exception('Error: from datetime is larger than to datetime, please check.') return False - elif (from_option is None or to_option is None) and since_option is not None: + elif (from_option is None or to_option is None): now_time = datetime.datetime.now() - self.to_time_str = (now_time + datetime.timedelta(minutes=1)).strftime('%Y-%m-%d %H:%M:%S') - self.from_time_str = (now_time - datetime.timedelta(seconds=TimeUtils.parse_time_length_to_sec(since_option))).strftime('%Y-%m-%d %H:%M:%S') + self.to_time_str = (now_time + datetime.timedelta(minutes=0)).strftime('%Y-%m-%d %H:%M:%S') + self.from_time_str = (now_time - datetime.timedelta( + seconds=TimeUtils.parse_time_length_to_sec(since_option))).strftime('%Y-%m-%d %H:%M:%S') self.stdio.print('gather from_time: {0}, to_time: {1}'.format(self.from_time_str, self.to_time_str)) else: self.stdio.warn('No time option provided, default processing is based on the last 30 minutes') now_time = datetime.datetime.now() self.to_time_str = (now_time + datetime.timedelta(minutes=1)).strftime('%Y-%m-%d %H:%M:%S') - if since_option: - self.from_time_str = (now_time - datetime.timedelta(seconds=TimeUtils.parse_time_length_to_sec(since_option))).strftime('%Y-%m-%d %H:%M:%S') - else: - self.from_time_str = (now_time - datetime.timedelta(minutes=30)).strftime('%Y-%m-%d %H:%M:%S') + self.from_time_str = (now_time - datetime.timedelta(minutes=30)).strftime('%Y-%m-%d %H:%M:%S') self.stdio.print('gather from_time: {0}, to_time: {1}'.format(self.from_time_str, self.to_time_str)) if store_dir_option: if not os.path.exists(os.path.abspath(store_dir_option)): - self.stdio.warn('warn: args --store_dir [{0}] incorrect: No such directory, Now create it'.format(os.path.abspath(store_dir_option))) + self.stdio.warn('warn: args --store_dir [{0}] incorrect: No such directory, Now create it'.format( + os.path.abspath(store_dir_option))) os.makedirs(os.path.abspath(store_dir_option)) self.gather_pack_dir = os.path.abspath(store_dir_option) - self.ash_sql="CALL DBMS_WORKLOAD_REPOSITORY.ASH_REPORT( '{0}', '{1}'".format(self.from_time_str, self.to_time_str) if sql_id_option: self.sql_id = sql_id_option - self.ash_sql = self.ash_sql + ", '{0}'".format(self.sql_id) else: - self.ash_sql = self.ash_sql + ", NULL" + self.sql_id = None if trace_id_option: self.trace_id = trace_id_option - self.ash_sql = self.ash_sql + ", '{0}'".format(self.trace_id) else: - self.ash_sql = self.ash_sql + ", NULL" + self.trace_id = None + + if report_type_option: + self.report_type = report_type_option.strip() + if report_type_option.upper() != "TEXT": + self.stdio.error("Invalid argument for report type, Now just support TEXT") + return False + else: + self.report_type = None if wait_class_option: self.wait_class = wait_class_option - self.ash_sql = self.ash_sql + ", '{0}'".format(self.wait_class) else: - self.ash_sql = self.ash_sql + ", NULL" - if report_type_option: - self.report_type = report_type_option - self.ash_sql = self.ash_sql + ", '{0}'".format(self.report_type) + self.wait_class = None + if store_dir_option: + self.gather_pack_dir = store_dir_option else: - self.ash_sql = self.ash_sql + ", NULL" - - try: - self.ash_sql = self.ash_sql + ");" - self.stdio.verbose("ash_sql: {0}".format(self.ash_sql)) - - ash_report_data=self.obconn.execute_sql(self.ash_sql) - if not ash_report_data or len(ash_report_data)==0: - self.stdio.error("ash report data is empty") - raise 
OBDIAGException("ash report data is empty") - ash_report=ash_report_data[0] - - # save ash_report_data - self.ash_report_file_name="ash_report_{0}.txt".format(TimeUtils.timestamp_to_filename_time(self.gather_timestamp)) - - with open(self.report_path + "/"+self.ash_report_file_name, 'w') as f: - f.write(ash_report) - except Exception as e: - self.stdio.error("ash report gather failed, error message: {0}".format(e)) - return False + self.gather_pack_dir = "./" + self.stdio.print("from_time: {0}, to_time: {1}, sql_id: {2}, trace_id: {3}, report_type: {4}, wait_class: {5}, store_dir: {6}".format(self.from_time_str, self.to_time_str, self.sql_id, self.trace_id, self.report_type, self.wait_class,self.gather_pack_dir)) return True def __print_result(self): - self.stdio.print(Fore.YELLOW + "\nGather scene results stored in this directory: {0}\n".format(self.report_path + "/"+self.ash_report_file_name) + Style.RESET_ALL) - - - - - + self.stdio.print(Fore.YELLOW + "\nGather scene results stored in this directory: {0}".format( + self.ash_report_file_name) + Style.RESET_ALL) + self.stdio.print("") \ No newline at end of file diff --git a/init_obdiag_cmd.sh b/init_obdiag_cmd.sh index e9ea3d49..c896ecca 100644 --- a/init_obdiag_cmd.sh +++ b/init_obdiag_cmd.sh @@ -12,7 +12,7 @@ _obdiag_completion() { case "${COMP_WORDS[1]}" in gather) if [ "$COMP_CWORD" -eq 2 ]; then - type_list="log clog slog plan_monitor stack perf sysstat obproxy_log all scene" + type_list="log clog slog plan_monitor stack perf sysstat obproxy_log all scene ash" elif [ "${COMP_WORDS[2]}" = "scene" ] && [ "$COMP_CWORD" -eq 3 ]; then type_list="list run" fi From ac776ab295606371c141c926e0b166e3284b0f2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B8=A0=E7=A3=8A?= Date: Mon, 29 Apr 2024 20:47:30 +0800 Subject: [PATCH 23/30] support ash --- cmd.py | 2 +- handler/gather/gather_ash_report.py | 14 +++++++++----- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/cmd.py b/cmd.py index 3e8c9a0f..69282bf1 100644 --- a/cmd.py +++ b/cmd.py @@ -565,7 +565,7 @@ def _do_command(self, obdiag): class ObdiagGatherAshReportCommand(ObdiagOriginCommand): def __init__(self): - super(ObdiagGatherAshReportCommand, self).__init__('ash', 'gather ash report') + super(ObdiagGatherAshReportCommand, self).__init__('ash', 'Gather ash report') self.parser.add_option('--trace_id', type='string', help="The TRACE.ID of the SQL to be sampled, if left blank or filled with NULL, indicates that TRACE.ID is not restricted.") self.parser.add_option('--sql_id', type='string', diff --git a/handler/gather/gather_ash_report.py b/handler/gather/gather_ash_report.py index 6bb6bcf3..5e96b58f 100644 --- a/handler/gather/gather_ash_report.py +++ b/handler/gather/gather_ash_report.py @@ -28,6 +28,7 @@ class GatherAshReportHandler(SafeStdio): def __init__(self, context, gather_pack_dir='./'): super().__init__() + self.result_summary_file_name = None self.report_type = None self.wait_class = None self.sql_id = None @@ -83,12 +84,15 @@ def execute(self): # save ash_report_data self.ash_report_file_name = "ash_report_{0}.txt".format( TimeUtils.timestamp_to_filename_time(self.gather_timestamp)) - self.ash_report_file_name=os.path.join(self.report_path, "result_summary.txt") - - self.stdio.verbose("save ash report file name: {0}".format(self.ash_report_file_name)) + self.ash_report_file_name=os.path.join(self.report_path, self.ash_report_file_name) with open(self.ash_report_file_name, 'w+') as f: f.write(ash_report) + self.stdio.print("save ash report file name:"+ Fore.YELLOW 
+"{0}".format(self.ash_report_file_name)+Style.RESET_ALL) + self.result_summary_file_name = os.path.join(self.report_path, "result_summary.txt") + with open(self.ash_report_file_name, 'w+') as f: + f.write(self.ash_report_file_name) + except Exception as e: self.stdio.error("ash report gather failed, error message: {0}".format(e)) @@ -174,6 +178,6 @@ def init_option(self): return True def __print_result(self): - self.stdio.print(Fore.YELLOW + "\nGather scene results stored in this directory: {0}".format( - self.ash_report_file_name) + Style.RESET_ALL) + self.stdio.print(Fore.YELLOW + "\nGather ash_report results stored in this directory: {0}".format( + self.report_path) + Style.RESET_ALL) self.stdio.print("") \ No newline at end of file From ef08820033a6be1aff4f0bfde00517d764f85ed2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B8=A0=E7=A3=8A?= Date: Tue, 30 Apr 2024 10:05:15 +0800 Subject: [PATCH 24/30] support ash --- cmd.py | 1 + 1 file changed, 1 insertion(+) diff --git a/cmd.py b/cmd.py index 69282bf1..d3935ac8 100644 --- a/cmd.py +++ b/cmd.py @@ -605,6 +605,7 @@ def __init__(self): self.parser.add_option('--files', action="append", type='string', help="specify files") self.parser.add_option('--store_dir', type='string', help='the dir to store gather result, current dir by default.', default='./') self.parser.add_option('-c', type='string', help='obdiag custom config', default=os.path.expanduser('~/.obdiag/config.yml')) + self.parser.add_option('--since', type='string',help="Specify time range that from 'n' [d]ays, 'n' [h]ours or 'n' [m]inutes. before to now. format: . example: 1h.",default='30m') def init(self, cmd, args): super(ObdiagAnalyzeLogCommand, self).init(cmd, args) From 1cfd7c187a89f09a83cb60a17498ef5909b0ce3e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B8=A0=E7=A3=8A?= Date: Tue, 30 Apr 2024 10:25:43 +0800 Subject: [PATCH 25/30] ash's observer version must greater than 4.0.0.0 --- handler/gather/gather_ash_report.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/handler/gather/gather_ash_report.py b/handler/gather/gather_ash_report.py index 5e96b58f..6f0d2977 100644 --- a/handler/gather/gather_ash_report.py +++ b/handler/gather/gather_ash_report.py @@ -18,9 +18,10 @@ import datetime import os +from common.command import get_observer_version, get_observer_version_by_sql from common.ob_connector import OBConnector from common.obdiag_exception import OBDIAGFormatException, OBDIAGException -from common.tool import DirectoryUtil, TimeUtils, Util +from common.tool import DirectoryUtil, TimeUtils, Util, StringUtils from stdio import SafeStdio from colorama import Fore, Style @@ -45,6 +46,8 @@ def __init__(self, context, gather_pack_dir='./'): else: self.gather_timestamp = TimeUtils.get_current_us_timestamp() self.cluster = self.context.cluster_config + + self.observer_nodes = self.context.cluster_config.get("servers") try: self.obconn = OBConnector( ip=self.cluster.get("db_host"), @@ -60,12 +63,31 @@ def __init__(self, context, gather_pack_dir='./'): raise OBDIAGFormatException("Failed to connect to database: {0}".format(e)) def handle(self): + if not self.version_check(): + self.stdio.error('version check failed') + return False if not self.init_option(): self.stdio.error('init option failed') return False self.__init_report_path() self.execute() self.__print_result() + def version_check(self): + observer_version = "" + try: + observer_version = get_observer_version_by_sql(self.ob_cluster, self.stdio) + except Exception as e: + if 
len(self.observer_nodes) > 0: + observer_version = get_observer_version(True, self.observer_nodes[0]["ssher"], + self.observer_nodes[0]["home_path"],self.stdio) + else: + self.stdio.warn("RCAHandler Failed to get observer version:{0}".format(e)) + self.stdio.verbose("RCAHandler.init get observer version: {0}".format(observer_version)) + + if not (observer_version == "4.0.0.0" or StringUtils.compare_versions_greater(observer_version, "4.0.0.0")): + self.stdio.error("observer version: {0}, must greater than 4.0.0.0".format(observer_version)) + return False + return True def execute(self): try: From e0b3271b2dda7c82b95459b9ea41e7564802df33 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B8=A0=E7=A3=8A?= Date: Tue, 30 Apr 2024 10:28:57 +0800 Subject: [PATCH 26/30] update format --- common/ob_connector.py | 1 + 1 file changed, 1 insertion(+) diff --git a/common/ob_connector.py b/common/ob_connector.py index e002c9e4..948644a5 100644 --- a/common/ob_connector.py +++ b/common/ob_connector.py @@ -114,6 +114,7 @@ def execute_sql_pretty(self, sql): ret = from_db_cursor(cursor) cursor.close() return ret + def callproc(self, procname, args=()): if self.conn is None: self._connect_db() From fa4108b7574ff631d2b70c51d3e03059a64080c0 Mon Sep 17 00:00:00 2001 From: Teingi Date: Tue, 30 Apr 2024 15:51:03 +0800 Subject: [PATCH 27/30] update README --- README-CN.md | 2 +- README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README-CN.md b/README-CN.md index fe99b285..bbd70cd2 100644 --- a/README-CN.md +++ b/README-CN.md @@ -75,7 +75,7 @@ obdiag 期望构建一个开放的社区,我们欢迎任何形式的贡献, |---------|---------------|---------| |1.6.0| 2024.01|
| |2.0.0|2024.03|
  • Context Transformation, Enhanced Scene Expansion Capabilities
  • Support online updating of inspection and gather tasks
  • Root Cause Analysis Phase II
| -|2.1.0|2024.04|
  • Root Cause Analysis Scenario Expansion
  • Gather tabledump
| +|2.1.0|2024.04|
  • Root Cause Analysis Scenario Expansion
  • Gather ash report
| |2.2.0|2024.05|
  • Root Cause Analysis Scenario Expansion
| |2.3.0|2024.06|
  • Root Cause Analysis Scenario Expansion
  • Support SQL Diagnosis
| |2.4.0|2024.07|
  • Root Cause Analysis Scenario Expansion
  • Adapting Two Additional Kernel Diagnostic Tools
| diff --git a/README.md b/README.md index 376402e1..203fa922 100644 --- a/README.md +++ b/README.md @@ -79,7 +79,7 @@ obdiag envisions an open community. We welcome your contributions in any form: |---------|---------------|---------| |1.6.0| 2024.01|
  • Scenario based fault information collection
  • Scenario based root cause analysis
| |2.0.0|2024.03|
  • Context Transformation, Enhanced Scene Expansion Capabilities
  • Support online updating of inspection and gather tasks
  • Root Cause Analysis Phase II Transformation
| -|2.1.0|2024.04|
  • Root Cause Analysis Scenario Expansion
  • Gather tabledump
| +|2.1.0|2024.04|
  • Root Cause Analysis Scenario Expansion
  • Gather ash report
| |2.2.0|2024.05|
  • Root Cause Analysis Scenario Expansion
| |2.3.0|2024.06|
  • Root Cause Analysis Scenario Expansion
  • Support SQL Diagnosis
| |2.4.0|2024.07|
  • Root Cause Analysis Scenario Expansion
  • Adapting Two Additional Kernel Diagnostic Tools
| From 93cf3c9310092a080f853deced600869a20c5ab6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B8=A0=E7=A3=8A?= Date: Mon, 6 May 2024 10:29:04 +0800 Subject: [PATCH 28/30] fix ash --- handler/gather/gather_ash_report.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/handler/gather/gather_ash_report.py b/handler/gather/gather_ash_report.py index 6f0d2977..c445f263 100644 --- a/handler/gather/gather_ash_report.py +++ b/handler/gather/gather_ash_report.py @@ -21,6 +21,7 @@ from common.command import get_observer_version, get_observer_version_by_sql from common.ob_connector import OBConnector from common.obdiag_exception import OBDIAGFormatException, OBDIAGException +from common.ssh import SshHelper from common.tool import DirectoryUtil, TimeUtils, Util, StringUtils from stdio import SafeStdio from colorama import Fore, Style @@ -41,6 +42,7 @@ def __init__(self, context, gather_pack_dir='./'): self.context = context self.stdio = self.context.stdio self.gather_pack_dir = gather_pack_dir + self.ob_cluster = self.context.cluster_config if self.context.get_variable("gather_timestamp", None): self.gather_timestamp = self.context.get_variable("gather_timestamp") else: @@ -78,7 +80,8 @@ def version_check(self): observer_version = get_observer_version_by_sql(self.ob_cluster, self.stdio) except Exception as e: if len(self.observer_nodes) > 0: - observer_version = get_observer_version(True, self.observer_nodes[0]["ssher"], + ssher=SshHelper(self.observer_nodes[0]["ip"], self.observer_nodes[0]["ssh_port"], self.observer_nodes[0]["ssh_username"], self.observer_nodes[0]["ssh_password"]) + observer_version = get_observer_version(True, ssher, self.observer_nodes[0]["home_path"],self.stdio) else: self.stdio.warn("RCAHandler Failed to get observer version:{0}".format(e)) @@ -110,9 +113,9 @@ def execute(self): with open(self.ash_report_file_name, 'w+') as f: f.write(ash_report) - self.stdio.print("save ash report file name:"+ Fore.YELLOW +"{0}".format(self.ash_report_file_name)+Style.RESET_ALL) + self.stdio.print("save ash report file name: "+ Fore.YELLOW +"{0}".format(self.ash_report_file_name)+Style.RESET_ALL) self.result_summary_file_name = os.path.join(self.report_path, "result_summary.txt") - with open(self.ash_report_file_name, 'w+') as f: + with open(self.result_summary_file_name, 'w+') as f: f.write(self.ash_report_file_name) except Exception as e: From f5ab7cff610a8b35e9afe1acb19c668d357eb66a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B8=A0=E7=A3=8A?= Date: Tue, 7 May 2024 16:26:29 +0800 Subject: [PATCH 29/30] fix rca --- handler/rca/rca_handler.py | 7 ++- handler/rca/scene/lock_conflict_scene.py | 2 +- handler/rca/scene/major_hold_scene.py | 73 ++++++++++++++---------- 3 files changed, 51 insertions(+), 31 deletions(-) diff --git a/handler/rca/rca_handler.py b/handler/rca/rca_handler.py index 94d4d017..6574c6b0 100644 --- a/handler/rca/rca_handler.py +++ b/handler/rca/rca_handler.py @@ -293,15 +293,20 @@ def export(self): class RCA_ResultRecord: - def __init__(self): + def __init__(self,stdio=None): self.records = [] self.suggest = "The suggest: " + self.stdio = stdio def add_record(self, record): self.records.append(record) + if self.stdio is not None: + self.stdio.verbose("add record: {0}".format(record)) def add_suggest(self, suggest): self.suggest += suggest + if self.stdio is not None: + self.stdio.verbose("add suggest: {0}".format(suggest)) def suggest_is_empty(self): return self.suggest == "The suggest: " diff --git a/handler/rca/scene/lock_conflict_scene.py 
b/handler/rca/scene/lock_conflict_scene.py index 6e0ab71a..85206fc6 100644 --- a/handler/rca/scene/lock_conflict_scene.py +++ b/handler/rca/scene/lock_conflict_scene.py @@ -73,7 +73,7 @@ def __execute_4_2(self): trans_record.add_record("wait_lock_trans_id is {0}".format(wait_lock_trans_id)) session_datas = cursor_by_trans_id.fetchall() trans_record.add_record( - "get SESSION_ID by wait_lock_trans_id:{0}. get data:{0}".format(trans_id, session_datas)) + "get SESSION_ID by wait_lock_trans_id:{0}. get data:{1}".format(trans_id, session_datas)) if len(session_datas) != 1: trans_record.add_suggest("wait_lock_session_id is not get. The holding lock trans_id is {0}. You can resolve lock conflicts by killing this locked session, but this may cause business exceptions. Please use with caution.".format(trans_id)) continue diff --git a/handler/rca/scene/major_hold_scene.py b/handler/rca/scene/major_hold_scene.py index 7ebfb93d..55fed00c 100644 --- a/handler/rca/scene/major_hold_scene.py +++ b/handler/rca/scene/major_hold_scene.py @@ -52,7 +52,7 @@ def execute(self): COMPACTING_data = self.ob_connector.execute_sql( 'select * from oceanbase.CDB_OB_MAJOR_COMPACTION where IS_ERROR="YES";') if len(COMPACTING_data) == 0: - first_record.add_record("CDB_OB_MAJOR_COMPACTION is not exist IS_ERROR='YES'") + first_record.add_record("ON CDB_OB_MAJOR_COMPACTION WHERE IS_ERROR='YES', data is not exist") else: need_tag = True CDB_OB_MAJOR_COMPACTION_err_tenant_ids = [] @@ -177,22 +177,26 @@ def execute(self): sql = "select * from oceanbase.GV$OB_COMPACTION_PROGRESS where TENANT_ID='{0}' and COMPACTION_SCN='{1}';".format( err_tenant_id, global_broadcast_scn) - OB_COMPACTION_PROGRESS_data_global_broadcast_scn = self.ob_connector.execute_sql(sql) - file_name = "{0}/rca_major_hold_{1}_OB_COMPACTION_PROGRESS_data_global_broadcast_scn".format( + cursor = self.ob_connector.execute_sql_return_cursor_dictionary(sql) + OB_COMPACTION_PROGRESS_data_global_broadcast_scn_data = cursor.fetchall() + OB_COMPACTION_PROGRESS_data_global_broadcast_scn_json_data = json.dumps(OB_COMPACTION_PROGRESS_data_global_broadcast_scn_data, cls=DateTimeEncoder) + file_name = "{0}/rca_major_hold_{1}_OB_COMPACTION_PROGRESS_data_global_broadcast_scn.json".format( self.local_path, err_tenant_id) - with open(file_name, 'w') as f: - f.write(str(OB_COMPACTION_PROGRESS_data_global_broadcast_scn)) + with open(file_name, 'w+') as f: + f.write(str(OB_COMPACTION_PROGRESS_data_global_broadcast_scn_json_data)) tenant_record.add_record( "tenant_id:{0} OB_COMPACTION_PROGRESS_data_global_broadcast_scn save on {1}".format(err_tenant_id, file_name)) sql = "select * from oceanbase.GV$OB_COMPACTION_PROGRESS where TENANT_ID='{0}' and COMPACTION_SCN='{1}';".format( err_tenant_id, last_scn) - OB_COMPACTION_PROGRESS_data_last_scn = self.ob_connector.execute_sql(sql) - file_name = "{0}/rca_major_hold_{1}_OB_COMPACTION_PROGRESS_data_last_scn".format( + cursor = self.ob_connector.execute_sql_return_cursor_dictionary(sql) + OB_COMPACTION_PROGRESS_data_last_scn_data = cursor.fetchall() + OB_COMPACTION_PROGRESS_data_last_scn_json_data = json.dumps(OB_COMPACTION_PROGRESS_data_last_scn_data, cls=DateTimeEncoder) + file_name = "{0}/rca_major_hold_{1}_OB_COMPACTION_PROGRESS_data_last_scn.json".format( self.local_path, err_tenant_id) - with open(file_name, 'w') as f: - f.write(str(OB_COMPACTION_PROGRESS_data_last_scn)) + with open(file_name, 'w+') as f: + f.write(str(OB_COMPACTION_PROGRESS_data_last_scn_json_data)) tenant_record.add_record( "tenant_id:{0} 
OB_COMPACTION_PROGRESS_data_last_scn save on {1}".format(err_tenant_id, file_name)) @@ -233,19 +237,31 @@ def execute(self): try: cursor = self.ob_connector.execute_sql_return_cursor_dictionary( 'select * from oceanbase.GV$OB_COMPACTION_SUGGESTIONS where tenant_id="{0}";'.format(err_tenant_id)) - columns = [column[0] for column in cursor.description] OB_COMPACTION_SUGGESTIONS_data = cursor.fetchall() OB_COMPACTION_SUGGESTIONS_info = json.dumps(OB_COMPACTION_SUGGESTIONS_data, cls=DateTimeEncoder) - file_name = "{0}/rca_major_hold_{1}_OB_COMPACTION_SUGGESTIONS_info".format( + file_name = "{0}/rca_major_hold_{1}_OB_COMPACTION_SUGGESTIONS_info.json".format( self.local_path, err_tenant_id) - with open(file_name, 'w') as f: + with open(file_name, 'w+') as f: f.write(str(OB_COMPACTION_SUGGESTIONS_info)) tenant_record.add_record( "tenant_id:{0} OB_COMPACTION_PROGRESS_data_last_scn save on {1}".format(err_tenant_id, file_name)) except Exception as e: - self.stdio.warn("MajorHoldScene execute 5 exception: {0}".format(e)) + self.stdio.error("MajorHoldScene execute 5 exception: {0}".format(e)) + #6 + try: + # get oceanbase.__all_virtual_dag_warning_history status="RETRYED" type like "%MERGE%" + cursor=self.ob_connector.execute_sql_return_cursor_dictionary('SELECT * FROM oceanbase.__all_virtual_dag_warning_history WHERE tenant_id="{0}" AND status="RETRYED" AND type like "%MERGE%";'.format(err_tenant_id)) + __all_virtual_dag_warning_history_data = cursor.fetchall() + file_name = "{0}/rca_major_hold_{0}_all_virtual_dag_warning_history.json".format( + self.local_path, err_tenant_id) + __all_virtual_dag_warning_history_json_data = json.dumps(__all_virtual_dag_warning_history_data, cls=DateTimeEncoder) + with open(file_name, 'w+') as f: + f.write(str(__all_virtual_dag_warning_history_json_data)) + tenant_record.add_record("tenant_id:{0} all_virtual_dag_warning_history save on {1}".format(err_tenant_id,file_name)) + except Exception as e: + self.stdio.error("MajorHoldScene execute 6 exception: {0}".format(e)) tenant_record.add_suggest("send the {0} to the oceanbase community".format(self.local_path)) self.Result.records.append(tenant_record) @@ -255,14 +271,14 @@ def get_info__all_virtual_compaction_diagnose_info(self, tenant_record): "SELECT * FROM oceanbase.__all_virtual_compaction_diagnose_info WHERE IS_ERROR = 'NO' OR IS_SUSPENDED = 'NO';") if len(COMPACTING_datas) == 0: tenant_record.add_record( - "sql:select * from oceanbase.__all_virtual_compaction_diagnose_info; no data") + "ON oceanbase.__all_virtual_compaction_diagnose_info. No data WHERE IS_ERROR = 'NO' OR IS_SUSPENDED = 'NO';") return else: tenant_record.add_record( "sql:select * from oceanbase.CDB_OB_MAJOR_COMPACTION where status=COMPACTING; " "result:{0}".format(str(COMPACTING_datas))) for index, COMPACTING_data in COMPACTING_datas: - self.diagnose_info_switch(COMPACTING_data) + self.diagnose_info_switch(COMPACTING_data,tenant_record) except Exception as e: raise RCAExecuteException( "MajorHoldScene execute get_info__all_virtual_compaction_diagnose_info exception: {0}".format(e)) @@ -288,7 +304,7 @@ def diagnose_info_switch(self, sql_data, tenant_record): log_name = "/tmp/rca_major_hold_schedule_medium_failed_{1}_{2}_{0}.txt".format(tenant_id, svr_ip, svr_port) tenant_record.add_record( - "diagnose_info type is 'schedule medium failed'. time is {0},observer is {1}:{2},the log is {3}".format( + "diagnose_info type: 'schedule medium failed'. 
time is {0},observer is {1}:{2},the log is {3}".format( create_time, svr_ip, svr_port, log_name)) ssh_helper.ssh_exec_cmd( 'grep "schedule_medium_failed" {1}/log/observer.log* |grep -P "\[\d+\]" -m 1 -o >{0}'.format(log_name, @@ -309,12 +325,12 @@ def diagnose_info_switch(self, sql_data, tenant_record): table_id, tenant_id))[0][7] if compaction_scn > global_broadcast_scn: tenant_record.add_record( - "diagnose_info type is error_no. error_no: {0}, err_trace: {1} , table_id:{2}, tenant_id:{3}, compaction_scn: {4}, global_broadcast_scn: {5}. compaction_scn>global_broadcast_scn".format( + "diagnose_info type: error_no. error_no: {0}, err_trace: {1} , table_id:{2}, tenant_id:{3}, compaction_scn: {4}, global_broadcast_scn: {5}. compaction_scn>global_broadcast_scn".format( err_no, err_trace, table_id, tenant_id, compaction_scn, global_broadcast_scn)) return else: tenant_record.add_record( - "diagnose_info type is error_no. error_no: {0}, err_trace:{1}, table_id:{2}, tenant_id:{3}, compaction_scn: {4}, global_broadcast_scn: {5}. compaction_scn /tmp/{0}'.format(log_name, err_trace)) ssh_helper.download(log_name, local_path=self.local_path) @@ -354,7 +370,7 @@ def diagnose_info_switch(self, sql_data, tenant_record): cursor = self.ob_connector.execute_sql_return_cursor_dictionary( "select * from oceanbase.__all_virtual_ls_info where tenant_id='{0}' and ls_id='{1}';".format(tenant_id, ls_id)) - columns = [column[0] for column in cursor.description] + all_virtual_ls_info_data = cursor.fetchall() self.all_virtual_ls_info = json.dumps(all_virtual_ls_info_data, cls=DateTimeEncoder) tenant_record.add_record( @@ -363,7 +379,7 @@ def diagnose_info_switch(self, sql_data, tenant_record): "result:{0}".format(str(self.all_virtual_ls_info))) return elif "memtable can not create dag successfully" in diagnose_info: - tenant_record.add_record("diagnose_info type is memtable can not create dag successfully.") + tenant_record.add_record("diagnose_info type: memtable can not create dag successfully.") global_broadcast_scn = self.ob_connector.execute_sql( "select * from oceanbase.CDB_OB_MAJOR_COMPACTION where TENANT_ID='{0}';".format(tenant_id))[0][3] @@ -372,14 +388,14 @@ def diagnose_info_switch(self, sql_data, tenant_record): table_id, tenant_id))[0][7] if compaction_scn > global_broadcast_scn: tenant_record.add_record( - "diagnose_info type is memtable can not create dag successfully. table_id:{0}, tenant_id:{1}, compaction_scn: {2}, global_broadcast_scn: {3}. compaction_scn>global_broadcast_scn".format( + "diagnose_info type: memtable can not create dag successfully. table_id:{0}, tenant_id:{1}, compaction_scn: {2}, global_broadcast_scn: {3}. 
compaction_scn>global_broadcast_scn".format( table_id, tenant_id, compaction_scn, global_broadcast_scn)) return else: cursor = self.ob_connector.execute_sql_return_cursor_dictionary( "select * from oceanbase.__all_virtual_dag_scheduler where svr_ip='{0}' and svr_port='{1}' and tenant_id='{2}';".format( svr_ip, svr_port, tenant_id)) - columns = [column[0] for column in cursor.description] + all_virtual_ls_info_data = cursor.fetchall() self.all_virtual_ls_info = json.dumps(all_virtual_ls_info_data, cls=DateTimeEncoder) tenant_record.add_record( @@ -390,11 +406,11 @@ def diagnose_info_switch(self, sql_data, tenant_record): return elif "medium wait for freeze" in diagnose_info or "major wait for freeze" in diagnose_info: - tenant_record.add_record("diagnose_info type is medium wait for freeze or major wait for freeze.") + tenant_record.add_record("diagnose_info type: medium wait for freeze or major wait for freeze.") cursor = self.ob_connector.execute_sql_return_cursor_dictionary( "select * from oceanbase.__all_virtual_dag_scheduler where svr_ip='{0}' and svr_port='{1}' and tenant_id='{2}';".format( svr_ip, svr_port, tenant_id)) - columns = [column[0] for column in cursor.description] + all_virtual_ls_info_data = cursor.fetchall() self.all_virtual_ls_info = json.dumps(all_virtual_ls_info_data, cls=DateTimeEncoder) tenant_record.add_record( @@ -404,11 +420,10 @@ def diagnose_info_switch(self, sql_data, tenant_record): "result:{0}".format(str(self.all_virtual_ls_info))) return elif "major not schedule for long time" in diagnose_info: - tenant_record.add_record("diagnose_info type is major not schedule for long time") + tenant_record.add_record("diagnose_info type: ‘major not schedule for long time’") cursor = self.ob_connector.execute_sql_return_cursor_dictionary( "select * from oceanbase.__all_virtual_tablet_compaction_info where svr_ip='{0}' and svr_port='{1}' and tenant_id='{2}' and ls_id='{3}' and tablet_id='{4}';".format( svr_ip, svr_port, tenant_id, ls_id, table_id)) - columns = [column[0] for column in cursor.description] all_virtual_ls_info_data = cursor.fetchall() all_virtual_tablet_compaction_info = json.dumps(all_virtual_ls_info_data, cls=DateTimeEncoder) tenant_record.add_record( @@ -429,7 +444,7 @@ def diagnose_info_switch(self, sql_data, tenant_record): svr_ip, svr_port) tenant_record.add_record( - "diagnose_info type is 'major not schedule for long time'. time is {0},observer is {1}:{2},the log is {3}".format( + "diagnose_info type: 'major not schedule for long time'. 
time is {0},observer is {1}:{2},the log is {3}".format( create_time, svr_ip, svr_port, log_name)) thread_id = ssh_helper.ssh_exec_cmd( 'cat {0}/log/observer.log* |grep "MediumLoo" -m 1 |grep -P "\[\d+\]" -m 1 -o | grep -oP "\d+"'.format( @@ -441,7 +456,7 @@ def diagnose_info_switch(self, sql_data, tenant_record): ssh_helper.ssh_exec_cmd("rm -rf {0}".format(log_name)) else: - tenant_record.add_record("diagnose_info type is Unknown.") + tenant_record.add_record("diagnose_info type: Unknown.") def export_result(self): return self.Result.export() From 51a86fc0202b1664efccf76a625b24b4f0c725ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B8=A0=E7=A3=8A?= Date: Tue, 7 May 2024 18:03:58 +0800 Subject: [PATCH 30/30] fix rca --- handler/rca/scene/lock_conflict_scene.py | 62 +++++++++++++++++------- handler/rca/scene/major_hold_scene.py | 2 +- 2 files changed, 46 insertions(+), 18 deletions(-) diff --git a/handler/rca/scene/lock_conflict_scene.py b/handler/rca/scene/lock_conflict_scene.py index 85206fc6..97024ad4 100644 --- a/handler/rca/scene/lock_conflict_scene.py +++ b/handler/rca/scene/lock_conflict_scene.py @@ -15,18 +15,22 @@ @file: lock_conflict_scene.py @desc: """ +import json + from handler.rca.rca_exception import RCAInitException, RCANotNeedExecuteException from handler.rca.rca_handler import RcaScene, RCA_ResultRecord -from common.tool import StringUtils +from common.tool import StringUtils, DateTimeEncoder class LockConflictScene(RcaScene): def __init__(self): super().__init__() + self.local_path = "" def init(self, context): try: super().init(context) + self.local_path = context.get_variable('store_dir') if self.observer_version is None or len(self.observer_version.strip()) == 0 or self.observer_version == "": raise Exception("observer version is None. Please check the NODES conf.") except Exception as e: @@ -65,27 +69,51 @@ def __execute_4_2(self): continue else: trans_id = OB_LOCKS_data['ID1'] - trans_record.add_record("holding lock trans_id is {0}".format(trans_id)) + trans_record.add_record("get holding_lock trans_id:{0}".format(trans_id)) + holding_lock_session_id=trans_id + self.stdio.verbose("get holding lock SESSION_ID by trans_id:{0}".format(trans_id)) + cursor_by_trans_id = self.ob_connector.execute_sql_return_cursor_dictionary( + 'select * from oceanbase.V$OB_TRANSACTION_PARTICIPANTS where TX_ID="{0}";'.format(holding_lock_session_id)) + holding_lock_session_id_datas = cursor_by_trans_id.fetchall() + holding_lock_session_id = "not get" + if len(holding_lock_session_id_datas) == 1: + holding_lock_session_id=holding_lock_session_id_datas[0].get("SESSION_ID") + trans_record.add_record("get holding_lock_session_id:{0}".format(holding_lock_session_id)) + wait_lock_trans_id=OB_LOCKS_data['TRANS_ID'] cursor_by_trans_id = self.ob_connector.execute_sql_return_cursor_dictionary( 'select * from oceanbase.V$OB_TRANSACTION_PARTICIPANTS where TX_ID="{0}";'.format(wait_lock_trans_id)) - self.stdio.verbose("get SESSION_ID by trans_id:{0}".format(trans_id)) trans_record.add_record("wait_lock_trans_id is {0}".format(wait_lock_trans_id)) - session_datas = cursor_by_trans_id.fetchall() - trans_record.add_record( - "get SESSION_ID by wait_lock_trans_id:{0}. get data:{1}".format(trans_id, session_datas)) - if len(session_datas) != 1: - trans_record.add_suggest("wait_lock_session_id is not get. The holding lock trans_id is {0}. You can resolve lock conflicts by killing this locked session, but this may cause business exceptions. 
Please use with caution.".format(trans_id)) - continue - if session_datas[0].get("SESSION_ID") is not None: - trans_record.add_record("get SESSION_ID:{0}".format(session_datas[0].get("SESSION_ID"))) - trans_record.add_suggest("Sessions corresponding to lock transactions. The ID is {0}, " - "which may be a lock conflict issue.You can be accessed through kill " - "session to rollback the corresponding transaction with ID. Please " - "note that this will result in corresponding transaction regression! " - "".format(session_datas[0].get("SESSION_ID"))) + wait_lock_session_datas = cursor_by_trans_id.fetchall() + wait_lock_session_id="not get" + if len(wait_lock_session_datas) == 1: + wait_lock_session_id=wait_lock_session_datas[0].get("SESSION_ID") + trans_record.add_record("get wait_lock_session_id:{0}".format(wait_lock_session_datas[0].get("SESSION_ID"))) + self.stdio.verbose("get sql_info by holding_lock_session_id:{0}".format(holding_lock_session_id)) + # check SQL_AUDIT switch + sql_info="not find" + + cursor_check_switch = self.ob_connector.execute_sql_return_cursor_dictionary("SHOW PARAMETERS LIKE '%enable_sql_audit%';") + audit_switch_value = cursor_check_switch.fetchone().get('value') + if audit_switch_value.strip().upper() == "TRUE": + holding_lock_sql_info_cursor=self.ob_connector.execute_sql_return_cursor_dictionary( + 'SELECT * FROM oceanbase.v$OB_SQL_AUDIT where SID="{0}";'.format(holding_lock_session_id)) + holding_lock_sql_info= holding_lock_sql_info_cursor.fetchall() + if len(holding_lock_sql_info)==0: + trans_record.add_record("holding_lock_session_id: {0}; not find sql_info on v$OB_SQL_AUDIT".format(holding_lock_session_id)) + else: + holding_lock_sql_info_json_data = json.dumps(holding_lock_sql_info, cls=DateTimeEncoder) + file_name = "{0}/rca_holding_lock_sql_info_{1}.json".format( self.local_path, holding_lock_session_id) + with open(file_name, 'w+') as f: + f.write(str(holding_lock_sql_info_json_data)) + trans_record.add_record( + "holding_lock_session_id: {0}. holding_lock_sql_info save on {1}".format(holding_lock_session_id, + file_name)) + sql_info="save on {0}".format(file_name) else: - trans_record.add_record("wait_lock_session_id is not get. The holding lock trans_id is {0}. You can resolve lock conflicts by killing this locked session, but this may cause business exceptions. Please use with caution.".format(trans_id)) + self.stdio.verbose("SQL_AUDIT switch is False") + trans_record.add_record("SQL_AUDIT switch is False. can't get sql_info") + trans_record.add_suggest("holding_lock_session_id: {0}; wait_lock_session_id : {1}, sql_info: {2}. Lock conflicts can be ended by killing holding_lock_session_id or wait_lock_session_id".format(holding_lock_session_id,wait_lock_session_id,sql_info)) except Exception as e: trans_record.add_record("get SESSION_ID panic. OB_LOCKS_data:{0} error: {1}".format(OB_LOCKS_data, e)) diff --git a/handler/rca/scene/major_hold_scene.py b/handler/rca/scene/major_hold_scene.py index 55fed00c..e72ecb14 100644 --- a/handler/rca/scene/major_hold_scene.py +++ b/handler/rca/scene/major_hold_scene.py @@ -31,7 +31,7 @@ def __init__(self): def init(self, context): try: super().init(context) - self.local_path = context.get_variable('result_path') + self.local_path = context.get_variable('store_dir') if self.observer_version is None: raise Exception("obproxy version is None. Please check the NODES conf.")
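
A minimal sketch of how the new callproc path invokes DBMS_WORKLOAD_REPOSITORY.ASH_REPORT, mirroring GatherAshReportHandler.execute(); the host, port, credentials, and time window are placeholders, assuming a reachable sys tenant:

    import pymysql

    # Placeholder connection settings; obdiag reads the real values from its
    # config (db_host, db_port, tenant_sys user/password) and connects to the
    # "oceanbase" database, as OBConnector does.
    conn = pymysql.connect(host="127.0.0.1", port=2881, user="root@sys",
                           password="", database="oceanbase")
    try:
        cursor = conn.cursor()
        # Argument order matches the handler: from_time, to_time, sql_id,
        # trace_id, wait_class, report_type. None maps to NULL, meaning
        # "unrestricted" for sql_id/trace_id/wait_class.
        cursor.callproc("DBMS_WORKLOAD_REPOSITORY.ASH_REPORT",
                        ("2024-04-30 10:00:00", "2024-04-30 10:30:00",
                         None, None, None, "TEXT"))
        rows = cursor.fetchall()
        if rows:
            # The report body sits in the first column of the first row,
            # indexed as ash_report_data[0][0] in the handler above.
            print(rows[0][0])
    finally:
        conn.close()

Likewise, the dis_rsa_algorithms inner-config switch maps to paramiko's disabled_algorithms connect parameter; a sketch of the resulting call, with the hostname, username, and key path as placeholders:

    import paramiko

    client = paramiko.SSHClient()
    client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    # With dis_rsa_algorithms: 1, SshHelper passes this dict so paramiko
    # never offers the rsa-sha2-* public-key algorithms during auth.
    client.connect(hostname="192.168.1.10", port=22, username="admin",
                   key_filename="/home/admin/.ssh/id_rsa",
                   disabled_algorithms=dict(pubkeys=["rsa-sha2-512", "rsa-sha2-256"]))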