diff --git a/cmd.py b/cmd.py index 986dec43..0c0fcfb1 100644 --- a/cmd.py +++ b/cmd.py @@ -322,6 +322,8 @@ def do_command(self): return False cmd = '%s %s' % (self.prev_cmd, base) ROOT_IO.track_limit += 1 + if "main.py" in cmd: + telemetry.work_tag=False telemetry.push_cmd_info("cmd: {0}. args:{1}".format(cmd,args)) return self.commands[base].init(cmd, args).do_command() diff --git a/common/ob_connector.py b/common/ob_connector.py index 223eb915..b548b89e 100644 --- a/common/ob_connector.py +++ b/common/ob_connector.py @@ -20,7 +20,7 @@ class OBConnector(object): - def __init__(self, ip, port, username, password=None, database=None, stdio=None, timeout=10,): + def __init__(self, ip, port, username, password=None, database=None, stdio=None, timeout=30,): self.ip = str(ip) self.port = int(port) self.username = str(username) @@ -50,6 +50,18 @@ def _connect_db(self): self.stdio.verbose("connect databse ...") except mysql.Error as e: self.stdio.error("connect OB: {0}:{1} with user {2} failed, error:{3}".format(self.ip, self.port, self.username, e)) + return + try: + ob_trx_timeout=self.timeout*1000000 + self.execute_sql("SET SESSION ob_trx_timeout={0};".format(ob_trx_timeout)) + except Exception as e: + self.stdio.warn("set ob_trx_timeout failed, error:{0}".format(e)) + try: + ob_query_timeout=self.timeout*1000000 + self.execute_sql("SET SESSION ob_query_timeout={0};".format(ob_query_timeout)) + except Exception as e: + self.stdio.warn("set ob_query_timeout failed, error:{0}".format(e)) + def execute_sql(self, sql): if self.conn is None: diff --git a/common/scene.py b/common/scene.py index 044445a0..735d38b0 100644 --- a/common/scene.py +++ b/common/scene.py @@ -17,7 +17,8 @@ """ from common.ssh import SshHelper from common.tool import StringUtils -from common.command import get_observer_version, get_obproxy_version +from common.command import get_observer_version, get_obproxy_version, get_observer_version_by_sql + def filter_by_version(scene, cluster, stdio=None): try: @@ -59,14 +60,19 @@ def filter_by_version(scene, cluster, stdio=None): stdio.exception("filter_by_version Exception : {0}".format(e)) raise Exception("filter_by_version Exception : {0}".format(e)) -def get_version(nodes, type, stdio=None): +def get_version(nodes, type,cluster, stdio=None): try: if len(nodes) < 1: raise Exception("input nodes is empty, please check your config") node = nodes[0] ssh = SshHelper(True, node.get("ip"), node.get("ssh_username"), node.get("ssh_password"), node.get("ssh_port"), node.get("ssh_key_file"), node) + version = "" if type == "observer": - version = get_observer_version(True, ssh, nodes[0]["home_path"], stdio) + try: + version = get_observer_version_by_sql(cluster,stdio) + except Exception as e: + stdio.warn("get observer version by sql fail, use node ssher to get. 
Exception:{0}".format(e)) + version = get_observer_version(True, ssh, nodes[0]["home_path"], stdio) elif type == "obproxy": version = get_obproxy_version(True, ssh, nodes[0]["home_path"], stdio) return version diff --git a/conf/inner_config.yml b/conf/inner_config.yml index 8c1e0845..f4bf9245 100644 --- a/conf/inner_config.yml +++ b/conf/inner_config.yml @@ -17,7 +17,6 @@ check: report: report_path: "./check_report/" export_type: table - package_file: "~/.obdiag/check/check_package.yaml" tasks_base_path: "~/.obdiag/check/tasks/" gather: scenes_base_path: "~/.obdiag/gather/tasks" diff --git a/handler/checker/check_handler.py b/handler/checker/check_handler.py index 5d4152b8..f225832c 100644 --- a/handler/checker/check_handler.py +++ b/handler/checker/check_handler.py @@ -17,7 +17,13 @@ """ import os +import queue +import time + import yaml + +from common.ob_connector import OBConnector +from common.ssh import SshHelper from handler.checker.check_exception import CheckException from handler.checker.check_report import TaskReport, CheckReport, CheckrReportException from handler.checker.check_task import TaskBase @@ -27,7 +33,6 @@ from common.tool import YamlUtils from common.tool import StringUtils - class CheckHandler: def __init__(self, context, check_target_type="observer"): @@ -87,6 +92,35 @@ def __init__(self, context, check_target_type="observer"): # input_param self.options=self.context.options + # add ssher + new_node=[] + for node in self.nodes: + # add ssher + ssher = None + try: + ssher = SshHelper(True, node.get("ip"), + node.get("ssh_username"), + node.get("ssh_password"), + node.get("ssh_port"), + node.get("ssh_key_file"), + node) + except Exception as e: + self.stdio.warn("StepBase get SshHelper fail on{0} ,Exception: {1}".format(node.get("ip"), e)) + node["ssher"] = ssher + new_node.append(node) + self.nodes=new_node + self.version=get_version(self.nodes, self.check_target_type,self.cluster, self.stdio) + + # add OBConnectorPool + try: + obConnectorPool=checkOBConnectorPool(context,3,self.cluster) + + except Exception as e: + self.stdio.warn("obConnector init error. 
+        finally:
+            self.context.set_variable('check_obConnector_pool', obConnectorPool)
+
     def handle(self):
         try:
             package_name = None
@@ -173,7 +207,7 @@ def execute_one(self, task_name):
         # Verify if the version is within a reasonable range
         report = TaskReport(self.context,task_name)
         if not self.ignore_version:
-            version = get_version(self.nodes, self.check_target_type, self.stdio)
+            version = self.version
             if version:
                 self.cluster["version"] = version
                 self.stdio.verbose("cluster.version is {0}".format(self.cluster["version"]))
@@ -206,4 +240,41 @@ def execute(self):
         except CheckrReportException as e:
             self.stdio.error("Report error :{0}".format(e))
         except Exception as e:
-            self.stdio.error("Internal error :{0}".format(e))
\ No newline at end of file
+            self.stdio.error("Internal error :{0}".format(e))
+
+class checkOBConnectorPool:
+    def __init__(self, context, max_size, cluster):
+        self.max_size = max_size
+        self.cluster = cluster
+        self.connections = queue.Queue(maxsize=max_size)
+        self.stdio = context.stdio
+        try:
+            for i in range(max_size):
+                conn = OBConnector(
+                    ip=self.cluster.get("db_host"),
+                    port=self.cluster.get("db_port"),
+                    username=self.cluster.get("tenant_sys").get("user"),
+                    password=self.cluster.get("tenant_sys").get("password"),
+                    stdio=self.stdio,
+                    timeout=10000
+                )
+                self.connections.put(conn)
+            self.stdio.verbose("obConnectorPool init success!")
+        except Exception as e:
+            self.stdio.error("obConnectorPool init fail! err: {0}".format(e))
+
+    def get_connection(self):
+        try:
+            return self.connections.get()
+        except Exception as e:
+            self.stdio.error("get connection fail! err: {0}".format(e))
+            return None
+
+    def release_connection(self, conn):
+        if conn is not None:
+            self.connections.put(conn)
+        return
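Note: checkOBConnectorPool above is a plain queue.Queue-based pool, and StepSQLHandler (further down in this patch) is its consumer. A minimal sketch of the get/release contract, assuming only the method names introduced here — the SimpleConnectorPool class and make_conn factory below are illustrative, not part of the patch:

    import queue

    class SimpleConnectorPool:
        # Illustrative stand-in for checkOBConnectorPool: fixed-size, blocking.
        def __init__(self, factory, max_size=3):
            self.connections = queue.Queue(maxsize=max_size)
            for _ in range(max_size):
                self.connections.put(factory())

        def get_connection(self):
            return self.connections.get()  # blocks until a slot is free

        def release_connection(self, conn):
            if conn is not None:
                self.connections.put(conn)

    def make_conn():
        return object()  # stands in for an OBConnector instance

    pool = SimpleConnectorPool(make_conn, max_size=3)
    conn = pool.get_connection()
    try:
        pass  # run the step's SQL with conn
    finally:
        # Releasing in `finally` matters: a step that raises would otherwise
        # leak its slot and later SQL steps would block on get_connection().
        pool.release_connection(conn)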
self.stdio.error("TaskBase execute Exception: {0}".format(e)) + raise e + + diff --git a/handler/checker/step/data_size.py b/handler/checker/step/data_size.py index c93a3e9d..b8d0ff0c 100644 --- a/handler/checker/step/data_size.py +++ b/handler/checker/step/data_size.py @@ -34,18 +34,14 @@ def __init__(self,context, step, node, task_variable_dict): self.task_variable_dict = task_variable_dict try: - is_ssh = True - self.ssh_helper = SshHelper(is_ssh, node.get("ip"), - node.get("ssh_username"), - node.get("ssh_password"), - node.get("ssh_port"), - node.get("ssh_key_file"), - node) + self.ssh_helper=self.node["ssher"] + if self.ssh_helper is None: + raise Exception("self.ssh_helper is None.") except Exception as e: self.stdio.error( - "GetSystemParameterHandler ssh init fail . Please check the NODES conf Exception : {0} .".format(e)) + "DataSizeHandler ssh init fail . Please check the NODES conf Exception : {0} .".format(e)) raise Exception( - "GetSystemParameterHandler ssh init fail . Please check the NODES conf Exception : {0} .".format(e)) + "DataSizeHandler ssh init fail . Please check the NODES conf Exception : {0} .".format(e)) # step report self.parameter = [] diff --git a/handler/checker/step/get_system_parameter.py b/handler/checker/step/get_system_parameter.py index 8ecbc6d4..af3341c1 100644 --- a/handler/checker/step/get_system_parameter.py +++ b/handler/checker/step/get_system_parameter.py @@ -34,13 +34,9 @@ def __init__(self,context, step, node, task_variable_dict): self.task_variable_dict = task_variable_dict try: - is_ssh = True - self.ssh_helper = SshHelper(is_ssh, node.get("ip"), - node.get("ssh_username"), - node.get("ssh_password"), - node.get("ssh_port"), - node.get("ssh_key_file"), - node) + self.ssh_helper=self.node["ssher"] + if self.ssh_helper is None: + raise Exception("self.ssh_helper is None.") except Exception as e: self.stdio.error( "GetSystemParameterHandler ssh init fail . Please check the NODES conf Exception : {0} .".format(e)) diff --git a/handler/checker/step/sql.py b/handler/checker/step/sql.py index 7b44d86d..b7194bdc 100644 --- a/handler/checker/step/sql.py +++ b/handler/checker/step/sql.py @@ -23,24 +23,23 @@ class StepSQLHandler: - def __init__(self,context, step, ob_cluster, task_variable_dict): + def __init__(self,context, step, task_variable_dict): try: self.context = context self.stdio = context.stdio - self.ob_cluster = ob_cluster - self.ob_cluster_name = ob_cluster.get("cluster_name") + self.ob_cluster = self.context.cluster_config + self.ob_cluster_name = self.ob_cluster.get("cluster_name") self.tenant_mode = None self.sys_database = None self.database = None - self.ob_connector = OBConnector(ip=ob_cluster.get("db_host"), - port=ob_cluster.get("db_port"), - username=ob_cluster.get("tenant_sys").get("user"), - password=ob_cluster.get("tenant_sys").get("password"), - stdio=self.stdio, - timeout=10000) + self.ob_connector_pool=self.context.get_variable('check_obConnector_pool',None) + if self.ob_connector_pool is not None: + self.ob_connector=self.ob_connector_pool.get_connection() + if self.ob_connector is None: + raise Exception("self.ob_connector is None.") except Exception as e: - self.stdio.error("StepSQLHandler init fail. Please check the OBCLUSTER conf. OBCLUSTER: {0} Exception : {1} .".format(ob_cluster,e)) - raise Exception("StepSQLHandler init fail. Please check the OBCLUSTER conf. OBCLUSTER: {0} Exception : {1} .".format(ob_cluster,e)) + self.stdio.error("StepSQLHandler init fail. Please check the OBCLUSTER conf. 
diff --git a/handler/checker/step/sql.py b/handler/checker/step/sql.py
index 7b44d86d..b7194bdc 100644
--- a/handler/checker/step/sql.py
+++ b/handler/checker/step/sql.py
@@ -23,24 +23,23 @@
 class StepSQLHandler:
-    def __init__(self,context, step, ob_cluster, task_variable_dict):
+    def __init__(self, context, step, task_variable_dict):
         try:
             self.context = context
             self.stdio = context.stdio
-            self.ob_cluster = ob_cluster
-            self.ob_cluster_name = ob_cluster.get("cluster_name")
+            self.ob_cluster = self.context.cluster_config
+            self.ob_cluster_name = self.ob_cluster.get("cluster_name")
             self.tenant_mode = None
             self.sys_database = None
             self.database = None
-            self.ob_connector = OBConnector(ip=ob_cluster.get("db_host"),
-                                            port=ob_cluster.get("db_port"),
-                                            username=ob_cluster.get("tenant_sys").get("user"),
-                                            password=ob_cluster.get("tenant_sys").get("password"),
-                                            stdio=self.stdio,
-                                            timeout=10000)
+            self.ob_connector = None
+            self.ob_connector_pool = self.context.get_variable('check_obConnector_pool', None)
+            if self.ob_connector_pool is not None:
+                self.ob_connector = self.ob_connector_pool.get_connection()
+            if self.ob_connector is None:
+                raise Exception("self.ob_connector is None.")
         except Exception as e:
-            self.stdio.error("StepSQLHandler init fail. Please check the OBCLUSTER conf. OBCLUSTER: {0} Exception : {1} .".format(ob_cluster,e))
-            raise Exception("StepSQLHandler init fail. Please check the OBCLUSTER conf. OBCLUSTER: {0} Exception : {1} .".format(ob_cluster,e))
+            self.stdio.error("StepSQLHandler init fail. Please check the OBCLUSTER conf. Exception : {0} .".format(e))
+            raise Exception("StepSQLHandler init fail. Please check the OBCLUSTER conf. Exception : {0} .".format(e))
         self.task_variable_dict = task_variable_dict
         self.enable_dump_db = False
         self.trace_id = None
@@ -62,8 +61,9 @@ def execute(self):
             if data is None:
                 self.stdio.warn("sql result is None: {0}".format(self.step["sql"]))
             self.stdio.verbose("execute_sql result:{0}".format(data))
-            if len(data) == 0:
+            if data is None or len(data) == 0:
                 self.stdio.warn("sql result is None: {0}".format(self.step["sql"]))
+                data = ""
             else:
                 data = data[0][0]
                 if data is None:
@@ -73,8 +73,10 @@
                 self.stdio.verbose("sql execute update task_variable_dict: {0} = {1}".format(self.step["result"]["set_value"], Util.convert_to_number(data)))
                 self.task_variable_dict[self.step["result"]["set_value"]] = Util.convert_to_number(data)
         except Exception as e:
-            self.stdio.error("StepSQLHandler execute Exception: {0}".format(e).strip())
-            raise StepExecuteFailException("StepSQLHandler execute Exception: {0}".format(e).strip())
+            self.stdio.error("StepSQLHandler execute Exception: {0}".format(e))
+            raise StepExecuteFailException("StepSQLHandler execute Exception: {0}".format(e))
+        finally:
+            if self.ob_connector_pool is not None:
+                self.ob_connector_pool.release_connection(self.ob_connector)
 
     def update_step_variable_dict(self):
         return self.task_variable_dict
diff --git a/handler/checker/step/ssh.py b/handler/checker/step/ssh.py
index 963cd19f..282477e2 100644
--- a/handler/checker/step/ssh.py
+++ b/handler/checker/step/ssh.py
@@ -18,7 +18,6 @@
 
 from handler.checker.check_exception import StepExecuteFailException
 from handler.checker.check_report import TaskReport
-from common.ssh import SshHelper
 from common.tool import StringUtils
 from common.tool import Util
 
@@ -32,13 +31,9 @@ def __init__(self,context, step, node, task_variable_dict):
         self.step = step
         self.node = node
         try:
-            is_ssh = True
-            self.ssh_helper = SshHelper(is_ssh, node.get("ip"),
-                                        node.get("ssh_username"),
-                                        node.get("ssh_password"),
-                                        node.get("ssh_port"),
-                                        node.get("ssh_key_file"),
-                                        node)
+            self.ssh_helper = self.node["ssher"]
+            if self.ssh_helper is None:
+                raise Exception("self.ssh_helper is None.")
         except Exception as e:
             self.stdio.error(
                 "SshHandler init fail. Please check the NODES conf. node: {0}. 
Exception : {1} .".format(node, e)) diff --git a/handler/checker/step/stepbase.py b/handler/checker/step/stepbase.py index a3351211..3ec68d8e 100644 --- a/handler/checker/step/stepbase.py +++ b/handler/checker/step/stepbase.py @@ -49,9 +49,8 @@ def execute(self, report): self.task_variable_dict["remote_ip"] = \ docker.from_env().containers.get(self.node["container_name"]).attrs['NetworkSettings']['Networks'][ 'bridge']["IPAddress"] - for key in self.node: - self.task_variable_dict["remote_{0}".format(key)] = self.node[key] - + for node in self.node: + self.task_variable_dict["remote_{0}".format(node)] = self.node[node] if "type" not in self.step: raise StepExecuteFailException("Missing field :type") if self.step["type"] == "get_system_parameter": @@ -59,7 +58,7 @@ def execute(self, report): elif self.step["type"] == "ssh": handler = SshHandler(self.context, self.step, self.node, self.task_variable_dict) elif self.step["type"] == "sql": - handler = StepSQLHandler(self.context, self.step, self.cluster, self.task_variable_dict) + handler = StepSQLHandler(self.context, self.step, task_variable_dict=self.task_variable_dict) elif self.step["type"] == "data_size": handler = DataSizeHandler(self.context, self.step, self.cluster, self.task_variable_dict) else: diff --git a/handler/checker/tasks/observer/bugs/bug_182.yaml b/handler/checker/tasks/observer/bugs/bug_182.yaml new file mode 100644 index 00000000..5745b6cd --- /dev/null +++ b/handler/checker/tasks/observer/bugs/bug_182.yaml @@ -0,0 +1,17 @@ +info: "OB has been upgraded to version 4.2.1, and some partition tables are executing DDL with error code -4109 and error message: Server state or role not the same as expected. github issue #182" +task: + - version: "[4.0.0.0,*]" + steps: + - type: sql + sql: 'select tenant_id, table_id, table_name, database_id, table_type, load_type, def_type, rowkey_column_num, index_column_num, max_used_column_id, autoinc_column_id, auto_increment, read_only, rowkey_split_pos, compress_func_name, expire_condition, is_use_bloomfilter, comment, block_size, collation_type, data_table_id, index_status, tablegroup_id, progressive_merge_num, index_type, part_level, part_func_type, part_func_expr, part_num, sub_part_func_type, sub_part_func_expr, sub_part_num, schema_version, view_definition, view_check_option, view_is_updatable, index_using_type, parser_name, index_attributes_set, tablet_size, pctfree, partition_status, partition_schema_version, session_id, pk_comment, sess_active_time, row_store_type, store_format, duplicate_scope, progressive_merge_round, storage_format_version, table_mode, encryption, tablespace_id, sub_part_template_flags, dop, character_set_client, collation_connection, auto_part_size, auto_part, association_table_id, tablet_id, max_dependency_version, define_user_id, transition_point, b_transition_point, interval_range, b_interval_range, object_status, table_flags, truncate_version, 0 as is_deleted from OCEANBASE.__all_table +EXCEPT select + t1.tenant_id, t1.table_id, t1.table_name, t1.database_id, t1.table_type, t1.load_type, t1.def_type, t1.rowkey_column_num, t1.index_column_num, t1.max_used_column_id, t1.autoinc_column_id, t1.auto_increment, t1.read_only, t1.rowkey_split_pos, t1.compress_func_name, t1.expire_condition, t1.is_use_bloomfilter, t1.comment, t1.block_size, t1.collation_type, t1.data_table_id, t1.index_status, t1.tablegroup_id, t1.progressive_merge_num, t1.index_type, t1.part_level, t1.part_func_type, t1.part_func_expr, t1.part_num, t1.sub_part_func_type, t1.sub_part_func_expr, 
t1.sub_part_num, t1.schema_version, t1.view_definition, t1.view_check_option, t1.view_is_updatable, t1.index_using_type, t1.parser_name, t1.index_attributes_set, t1.tablet_size, t1.pctfree, t1.partition_status, t1.partition_schema_version, t1.session_id, t1.pk_comment, t1.sess_active_time, t1.row_store_type, t1.store_format, t1.duplicate_scope, t1.progressive_merge_round, t1.storage_format_version, t1.table_mode, t1.encryption, t1.tablespace_id, t1.sub_part_template_flags, t1.dop, t1.character_set_client, t1.collation_connection, t1.auto_part_size, t1.auto_part, t1.association_table_id, t1.tablet_id, t1.max_dependency_version, t1.define_user_id, t1.transition_point, t1.b_transition_point, t1.interval_range, t1.b_interval_range, t1.object_status, t1.table_flags, t1.truncate_version, t1.is_deleted + from + OCEANBASE.__all_table_history t1 + inner join (select t2.table_id,max(t2.schema_version) as schema_version from OCEANBASE.__all_table_history t2 group by t2.table_id)as t3 + on t1.table_id = t3.table_id and t1.schema_version = t3.schema_version and t1.is_deleted = 0;' + result: + set_value: error_table + report_type: warning + verify: '[ -z "$error_table" ]' + err_msg: "Some partition tables are inconsistent. Please get bug's on https://github.com/oceanbase/obdiag/issues/182" diff --git a/handler/checker/tasks/observer/cluster/deadlocks.yaml b/handler/checker/tasks/observer/cluster/deadlocks.yaml index 201359da..e05c7034 100644 --- a/handler/checker/tasks/observer/cluster/deadlocks.yaml +++ b/handler/checker/tasks/observer/cluster/deadlocks.yaml @@ -9,4 +9,4 @@ task: verify_type: equal report_type: warning verify: 0 - err_msg: "There is a deadlock." \ No newline at end of file + err_msg: "There is a deadlock. Please check on the oceanbase.DBA_OB_DEADLOCK_EVENT_HISTORY" \ No newline at end of file diff --git a/handler/checker/tasks/observer/cluster/ob_enable_plan_cache_bad_version.yaml b/handler/checker/tasks/observer/cluster/ob_enable_plan_cache_bad_version.yaml index 13b957e8..77e6d8a3 100644 --- a/handler/checker/tasks/observer/cluster/ob_enable_plan_cache_bad_version.yaml +++ b/handler/checker/tasks/observer/cluster/ob_enable_plan_cache_bad_version.yaml @@ -6,6 +6,6 @@ task: sql: 'select name from oceanbase.__all_virtual_tenant_parameter_stat where name like "%ob_enable_plan_cache%" and value like "%true%";' result: set_value: ob_enable_plan_cache - verify: '[ -z "ob_enable_plan_cache" ]' + verify: '[ -z "$ob_enable_plan_cache" ]' err_msg: 'On this version, ob_enable_plan_cache suggestion to close' diff --git a/handler/checker/tasks/observer/cluster/sys_log_level.yaml b/handler/checker/tasks/observer/cluster/sys_log_level.yaml new file mode 100644 index 00000000..1063d34e --- /dev/null +++ b/handler/checker/tasks/observer/cluster/sys_log_level.yaml @@ -0,0 +1,21 @@ +info: "Check sys_log_level ." 
+task:
+  - version: "[4.0.0.0,*]"
+    steps:
+      - type: sql
+        sql: 'SELECT value FROM oceanbase.__all_virtual_sys_parameter_stat where name like "%syslog_level%";'
+        result:
+          set_value: sys_log_level
+          report_type: warning
+          verify: '[[ $sys_log_level == "WDIAG" ]]'
+          err_msg: "on 4.x, the recommended value for sys_log_level is WDIAG"
+
+  - version: "[3.0.0,4.0.0.0)"
+    steps:
+      - type: sql
+        sql: 'SELECT value FROM oceanbase.__all_virtual_sys_parameter_stat where name like "%syslog_level%";'
+        result:
+          set_value: sys_log_level
+          report_type: warning
+          verify: '[[ $sys_log_level == "INFO" ]]'
+          err_msg: "on 3.x, the recommended value for sys_log_level is INFO"
diff --git a/handler/checker/tasks/observer/cluster/tenant_number.yaml b/handler/checker/tasks/observer/cluster/tenant_number.yaml
new file mode 100644
index 00000000..3c725440
--- /dev/null
+++ b/handler/checker/tasks/observer/cluster/tenant_number.yaml
@@ -0,0 +1,19 @@
+info: "Check the number of tenant"
+task:
+  - version: "[4.0.0.0,*]"
+    steps:
+      - type: sql
+        sql: 'select count(0)/2 from oceanbase.__all_tenant where tenant_id>1000;'
+        result:
+          set_value: tenant_nu
+          verify_type: max
+          verify: 100
+          err_msg: "The number of tenants: #{tenant_nu}. recommended: tenant_nu<100"
+      - type: sql
+        sql: 'select count(0)/2 from oceanbase.__all_tenant where tenant_id>1000;'
+        result:
+          set_value: tenant_nu
+          verify_type: max
+          report_type: warning
+          verify: 50
+          err_msg: "The number of tenants: #{tenant_nu}. recommended: tenant_nu<50"
\ No newline at end of file
diff --git a/handler/checker/tasks/observer/sysbench/sysbench_free_test_memory_limit.yaml b/handler/checker/tasks/observer/sysbench/sysbench_free_test_memory_limit.yaml
index 661d93b7..63412cb6 100644
--- a/handler/checker/tasks/observer/sysbench/sysbench_free_test_memory_limit.yaml
+++ b/handler/checker/tasks/observer/sysbench/sysbench_free_test_memory_limit.yaml
@@ -30,7 +30,7 @@ task:
           set_value: result
           verify_type: between
           verify: "[80,100]"
-          err_msg: 'memory_limit/os_memory is #{result}%,is not between 80 and 100'
+          err_msg: 'memory_limit: #{memory_limit}. os_memory: #{os_memory}. memory_limit/os_memory is #{result}%, is not between 80% and 100%'
 
 # memory_size
       - type: sql
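Note: several tasks in this patch pair verify_type: between with a "[low,high]" bound string. A sketch of the comparison those fields describe; the parsing here is an assumption for illustration, not lifted from obdiag's verifier:

    import re

    def verify_between(value, bounds):
        # Accepts bounds like "[80,100]" or "[ 500 ,10000 ]" and checks low <= value <= high.
        m = re.match(r"\[\s*(\d+)\s*,\s*(\d+)\s*\]", bounds)
        if m is None:
            raise ValueError("bad bounds: {0}".format(bounds))
        low, high = int(m.group(1)), int(m.group(2))
        return low <= int(value) <= high

    assert verify_between(90, "[80,100]")
    assert not verify_between(40, "[ 500 ,10000 ]")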
diff --git a/handler/checker/tasks/observer/system/aio.yaml b/handler/checker/tasks/observer/system/aio.yaml
index 6ad6adeb..d72b3369 100644
--- a/handler/checker/tasks/observer/system/aio.yaml
+++ b/handler/checker/tasks/observer/system/aio.yaml
@@ -11,11 +11,11 @@ task:
           set_value: aio_max_nr
           report_type: warning
           verify: "[ ${aio_max_nr} -ge 1048576 ]"
-          err_msg: 'fs.aio-max-nr : #{aio_max_nr} is a non recommended value, recommended value need >1048576'
+          err_msg: 'fs.aio-max-nr : #{aio_max_nr}. recommended: >1048576'
       - type: get_system_parameter
         parameter: fs.aio-nr
         result:
           set_value: aio_nr
           report_type: warning
           verify: "[ $((aio_max_nr - aio_nr)) -ge $((20000*${observer_nu})) ]"
-          err_msg: 'fs.aio-nr : #{aio_nr} is a non recommended value, recommended value need aio-max-nr - aio-nr>20000 * observer_num'
+          err_msg: 'fs.aio-nr : #{aio_nr}. recommended: aio-max-nr - aio-nr > 20000 * observer_num'
diff --git a/handler/checker/tasks/observer/system/clock_source.yaml b/handler/checker/tasks/observer/system/clock_source.yaml
new file mode 100644
index 00000000..ce9950a2
--- /dev/null
+++ b/handler/checker/tasks/observer/system/clock_source.yaml
@@ -0,0 +1,11 @@
+info: 'Check the type of clock_source is tsc'
+task:
+  - steps:
+      - type: ssh
+        ssh: "cat /sys/devices/system/clocksource/clocksource0/current_clocksource"
+        result:
+          set_value: clock_source
+          verify: "[[ \"${clock_source}\" == \"tsc\" || \"${clock_source}\" == \"arch_sys_counter\" || \"${clock_source}\" == \"kvm-clock\" ]]"
+          err_msg: 'clock_source: #{clock_source}. recommended: tsc. A non-tsc clock source can cause uneven CPU utilization and low TPS under stress testing'
diff --git a/handler/checker/tasks/observer/system/core_pattern.yaml b/handler/checker/tasks/observer/system/core_pattern.yaml
new file mode 100644
index 00000000..a708a49a
--- /dev/null
+++ b/handler/checker/tasks/observer/system/core_pattern.yaml
@@ -0,0 +1,9 @@
+info: 'Check kernel.core_pattern'
+task:
+  - steps:
+      - type: get_system_parameter
+        parameter: kernel.core_pattern
+        result:
+          set_value: core_pattern
+          verify: '[[ $core_pattern != *" "* ]]'
+          err_msg: 'kernel.core_pattern: #{core_pattern}. Configuring anything other than a plain core-file path is not recommended'
diff --git a/handler/checker/tasks/observer/system/dependent_software.yaml b/handler/checker/tasks/observer/system/dependent_software.yaml
index 5c0d64ae..c65c5996 100644
--- a/handler/checker/tasks/observer/system/dependent_software.yaml
+++ b/handler/checker/tasks/observer/system/dependent_software.yaml
@@ -35,11 +35,4 @@ task:
           set_value: transparent_hugepage_switch
           report_type: warning
           verify: '[ -n "${transparent_hugepage_switch}" ]'
-          err_msg: 'transparent_hugepage need retrun "[never]". Now , it is null.'
-      - type: ssh
-        ssh: "python --version 2>&1 | awk '{print $2}' | cut -d'.' -f1,2"
-        result:
-          set_value: python_version
-          report_type: warning
-          verify: '[ "2.7" == ${python_version} ]'
-          err_msg: 'python version need retrun 2.7.x'
\ No newline at end of file
+          err_msg: 'transparent_hugepage needs to return "[never]". Now it is null.'
\ No newline at end of file
diff --git a/handler/checker/tasks/observer/system/instruction_set_avx2.yaml b/handler/checker/tasks/observer/system/instruction_set_avx2.yaml
new file mode 100644
index 00000000..a17a1e00
--- /dev/null
+++ b/handler/checker/tasks/observer/system/instruction_set_avx2.yaml
@@ -0,0 +1,11 @@
+info: 'Check the flags of cpu for avx2'
+task:
+  - steps:
+      - type: ssh
+        ssh: "lscpu |grep Flags"
+        result:
+          set_value: cpu_flags
+          verify: " [[ $cpu_flags == *avx2* ]] "
+          err_msg: 'cpu flags: #{cpu_flags}. recommended: the avx2 instruction set should be present'
diff --git a/handler/checker/tasks/observer/system/parameter.yaml b/handler/checker/tasks/observer/system/parameter.yaml
index 6cb80c7e..5dc3bd3e 100644
--- a/handler/checker/tasks/observer/system/parameter.yaml
+++ b/handler/checker/tasks/observer/system/parameter.yaml
@@ -10,7 +10,7 @@ task:
           report_type: warning
           verify_type: between
           verify: "[2048,16384]"
-          err_msg: 'net.core.somaxconn : #{parameter} , which is not recommended. Set it within the range of 2048 ≤ value ≤ 16384'
+          err_msg: 'net.core.somaxconn : #{parameter}. recommended: 2048 ≤ value ≤ 16384.'
- type: get_system_parameter parameter: net.core.netdev_max_backlog result: @@ -18,7 +18,7 @@ task: report_type: warning verify_type: between verify: "[ 500 ,10000 ]" - err_msg: 'net.core.netdev_max_backlog : #{parameter} , which is not recommended. Set it within the range of 500 ≤ value ≤ 10000' + err_msg: 'net.core.netdev_max_backlog: #{parameter}. recommended: 500 ≤ value ≤ 10000.' - type: get_system_parameter parameter: net.core.rmem_default result: @@ -26,7 +26,7 @@ task: report_type: warning verify_type: between verify: "[ 65536 ,16777216 ]" - err_msg: 'net.core.rmem_default : #{parameter} , which is not recommended. Set it within the range of 65536 ≤ value ≤ 16777216' + err_msg: 'net.core.rmem_default: #{parameter}. recommended: 65536 ≤ value ≤ 16777216.' - type: get_system_parameter parameter: net.core.wmem_default result: @@ -34,7 +34,7 @@ task: report_type: warning verify_type: between verify: "[ 65536,16777216 ]" - err_msg: 'net.core.wmem_default : #{parameter} , which is not recommended. Set it within the range of 65536 ≤ value ≤ 16777216' + err_msg: 'net.core.wmem_default: #{parameter}. recommended: 65536 ≤ value ≤ 16777216.' - type: get_system_parameter parameter: net.core.rmem_max result: @@ -42,7 +42,7 @@ task: report_type: warning verify_type: between verify: "[ 8388608 ,16777216 ]" - err_msg: 'net.core.rmem_max : #{parameter} , which is not recommended. Set it within the range of 8388608 ≤ value ≤ 16777216' + err_msg: 'net.core.rmem_max : #{parameter}. recommended: 8388608 ≤ value ≤ 16777216.' - type: get_system_parameter parameter: net.core.wmem_max result: @@ -50,7 +50,7 @@ task: report_type: warning verify_type: between verify: "[ 8388608,16777216 ]" - err_msg: 'net.core.wmem_max : #{parameter} , which is not recommended. Set it within the range of 8388608 ≤ value ≤ 16777216' + err_msg: 'net.core.wmem_max: #{parameter}. recommended: 8388608 ≤ value ≤ 16777216.' - type: get_system_parameter parameter: net.ipv4.ip_forward result: @@ -58,7 +58,7 @@ task: report_type: warning verify_type: equal verify: 0 - err_msg: 'net.ipv4.ip_forward : #{parameter} , which is not recommended. Set it within the range of 0' + err_msg: 'net.ipv4.ip_forward : #{parameter}. recommended: 0.' - type: get_system_parameter parameter: net.ipv4.tcp_tw_recycle result: @@ -66,7 +66,7 @@ task: report_type: warning verify_type: equal verify: 1 - err_msg: 'net.ipv4.tcp_tw_recycle : #{parameter} , which is not recommended. Set it within the range of 1' + err_msg: 'net.ipv4.tcp_tw_recycle : #{parameter}. recommended: 1.' - type: get_system_parameter parameter: net.ipv4.conf.default.rp_filter @@ -75,7 +75,7 @@ task: report_type: warning verify_type: equal verify: 1 - err_msg: 'net.ipv4.conf.default.rp_filter : #{parameter} , which is not recommended. Set it within the range of 1' + err_msg: 'net.ipv4.conf.default.rp_filter : #{parameter}. recommended: 1.' - type: get_system_parameter parameter: net.ipv4.conf.default.accept_source_route result: @@ -83,7 +83,7 @@ task: report_type: warning verify_type: equal verify: 0 - err_msg: 'net.ipv4.conf.default.accept_source_route : #{parameter} , which is not recommended. Set it within the range of 0' + err_msg: 'net.ipv4.conf.default.accept_source_route: #{parameter}. recommended: 0.' - type: get_system_parameter parameter: net.ipv4.tcp_syncookies result: @@ -91,7 +91,7 @@ task: report_type: warning verify_type: equal verify: 1 - err_msg: 'net.ipv4.tcp_syncookies : #{parameter} , which is not recommended. 
Set it within the range of 1' + err_msg: 'net.ipv4.tcp_syncookies: #{parameter}. recommended: 1.' - type: get_system_parameter parameter: net.ipv4.tcp_max_syn_backlog result: @@ -99,97 +99,97 @@ task: report_type: warning verify_type: between verify: "[1024,16384]" - err_msg: 'net.ipv4.tcp_max_syn_backlog : #{parameter} , which is not recommended. Set it within the range of 1024 ≤ value ≤ 16384' + err_msg: 'net.ipv4.tcp_max_syn_backlog : #{parameter}. recommended: 1024 ≤ value ≤ 16384.' - type: get_system_parameter parameter: net.ipv4.tcp_fin_timeout result: set_value: parameter report_type: warning verify: "[ 15 -le ${parameter} ] && [ ${parameter} -le 60 ]" - err_msg: 'net.ipv4.tcp_fin_timeout : #{parameter} , which is not recommended. Set it within the range of 15 ≤ value ≤ 60' + err_msg: 'net.ipv4.tcp_fin_timeout : #{parameter}. recommended: 15 ≤ value ≤ 60.' - type: get_system_parameter parameter: net.ipv4.tcp_tw_reuse result: set_value: parameter report_type: warning verify: "[ ${parameter} -eq 1 ]" - err_msg: 'net.ipv4.tcp_tw_reuse : #{parameter} , which is not recommended. Set it within the range of 1' + err_msg: 'net.ipv4.tcp_tw_reuse: #{parameter}. recommended: 1' - type: get_system_parameter parameter: net.ipv4.tcp_slow_start_after_idle result: set_value: parameter report_type: warning verify: "[ ${parameter} -eq 0 ]" - err_msg: 'net.ipv4.tcp_slow_start_after_idle : #{parameter} , which is not recommended. Set it within the range of 0' + err_msg: 'net.ipv4.tcp_slow_start_after_idle: #{parameter}. recommended: 0.' - type: get_system_parameter parameter: vm.swappiness result: set_value: parameter report_type: warning verify: "[ ${parameter} -eq 0 ]" - err_msg: 'vm.swappiness : #{parameter} , which is not recommended. Set it within the range of 0' + err_msg: 'vm.swappiness : #{parameter}. recommended: 0' - type: get_system_parameter parameter: vm.min_free_kbytes result: set_value: parameter report_type: warning verify: "[ 32768 -le ${parameter} ] && [ ${parameter} -le 2097152 ]" - err_msg: 'vm.min_free_kbytes : #{parameter} , which is not recommended. Set it within the range of 32768 ≤ value ≤ 2097152' + err_msg: 'vm.min_free_kbytes : #{parameter}. recommended: 32768 ≤ value ≤ 2097152.' - type: get_system_parameter parameter: vm.max_map_count result: set_value: parameter report_type: warning verify: "[ 327680 -le ${parameter} ] && [ ${parameter} -le 1000000 ]" - err_msg: 'vm.max_map_count : #{parameter} , which is not recommended.Unreasonable vm.max_map_count configuration may cause serious memory leaks. Set it within the range of 327680 ≤ value ≤ 1000000 ' + err_msg: 'vm.max_map_count : #{parameter}. recommended:327680 ≤ value ≤ 1000000.' - type: get_system_parameter parameter: vm.overcommit_memory result: set_value: parameter report_type: warning verify: "[ ${parameter} -eq 0 ]" - err_msg: 'vm.overcommit_memory : #{parameter} , which is not recommended. Set it within the range of 0 ' + err_msg: 'vm.overcommit_memory : #{parameter}. recommended: 0.' - type: get_system_parameter parameter: vm.nr_hugepages result: set_value: parameter report_type: warning verify: "[ ${parameter} -eq 0 ]" - err_msg: 'vm.nr_hugepages : #{parameter} , which is not recommended. Set it within the range of 0 ' + err_msg: 'vm.nr_hugepages : #{parameter}. 
recommended: 0' - type: get_system_parameter parameter: fs.aio-max-nr result: set_value: parameter report_type: warning verify: "[ 1048576 -le ${parameter} ]" - err_msg: 'fs.aio-max-nr : #{parameter} is a non recommended value, recommended value : #{parameter} is ≥ 1048576' + err_msg: 'fs.aio-max-nr : #{parameter}. recommended: #{parameter} is ≥ 1048576.' - type: get_system_parameter parameter: kernel.numa_balancing result: set_value: parameter report_type: warning verify: "[ ${parameter} -eq 0 ]" - err_msg: 'kernel.numa_balancing : #{parameter} , which is not recommended. Set it within the range of 0 ' + err_msg: 'kernel.numa_balancing : #{parameter}. recommended: 0.' - type: get_system_parameter parameter: vm.zone_reclaim_mode result: set_value: parameter report_type: warning verify: "[ ${parameter} -eq 0 ]" - err_msg: 'vm.zone_reclaim_mode : #{parameter} , which is not recommended. Set it within the range of 0 ' + err_msg: 'vm.zone_reclaim_mode : #{parameter}. recommended: 0.' - type: get_system_parameter parameter: fs.file-max result: set_value: parameter report_type: warning verify: "[ 6573688 -le ${parameter} ]" - err_msg: 'fs.file-max: #{parameter} is a non recommended value, recommended value : #{parameter} is ≥ 6573688' + err_msg: 'fs.file-max: #{parameter}. recommended: #{parameter} is ≥ 6573688.' - type: get_system_parameter parameter: fs.pipe-user-pages-soft result: set_value: parameter report_type: warning verify: "[ 0 -eq ${parameter} ]" - err_msg: 'fs.pipe-user-pages-soft : #{parameter} is a non recommended value, recommended value is 0' + err_msg: 'fs.pipe-user-pages-soft : #{parameter}. recommended: 0.' diff --git a/handler/checker/tasks/observer/system/parameter_ip_local_port_range.yaml b/handler/checker/tasks/observer/system/parameter_ip_local_port_range.yaml index 9a83b86a..29e41c99 100644 --- a/handler/checker/tasks/observer/system/parameter_ip_local_port_range.yaml +++ b/handler/checker/tasks/observer/system/parameter_ip_local_port_range.yaml @@ -8,7 +8,7 @@ task: result: set_value: ip_local_port_range verify: '[[ -n "$ip_local_port_range" && "$ip_local_port_range" != "-1" ]]' - err_msg: "ip_local_port_range is #{ip_local_port_range} . Please check net.ipv4.ip_local_port_range on your node" + err_msg: "ip_local_port_range : #{ip_local_port_range} . Please check net.ipv4.ip_local_port_range on your node" - type: ssh ssh: "echo \"#{ip_local_port_range}\" | awk '{print $1}'" result: @@ -16,7 +16,7 @@ task: report_type: warning verify_type: equal verify: 3500 - err_msg: 'ip_local_port_range_min : #{ip_local_port_range_min} is a non recommended value, recommended value is 3500' + err_msg: 'ip_local_port_range_min : #{ip_local_port_range_min}. recommended: 3500' - type: ssh ssh: "echo \"#{ip_local_port_range}\" | awk '{print $2}'" result: @@ -24,4 +24,4 @@ task: report_type: warning verify_type: equal verify: 65535 - err_msg: 'ip_local_port_range_max : #{ip_local_port_range_max} is a non recommended value, recommended value is 65535' \ No newline at end of file + err_msg: 'ip_local_port_range_max : #{ip_local_port_range_max}. 
recommended: 65535'
\ No newline at end of file
diff --git a/handler/checker/tasks/observer/system/parameter_tcp_rmem.yaml b/handler/checker/tasks/observer/system/parameter_tcp_rmem.yaml
index 2cd140cd..9bc4a880 100644
--- a/handler/checker/tasks/observer/system/parameter_tcp_rmem.yaml
+++ b/handler/checker/tasks/observer/system/parameter_tcp_rmem.yaml
@@ -16,7 +16,7 @@ task:
           report_type: warning
           verify_type: between
           verify: "[4096,8192]"
-          err_msg: 'net.ipv4.tcp_rmem_min : #{tcp_rmem_min} is a non recommended value, recommended value is 4096 ≤ min ≤ 8192'
+          err_msg: 'net.ipv4.tcp_rmem_min : #{tcp_rmem_min}. recommended: 4096 ≤ min ≤ 8192'
       - type: ssh
         ssh: "echo \"#{tcp_rmem}\" | awk '{print $2}'"
         result:
@@ -24,7 +24,7 @@
           report_type: warning
           verify_type: between
           verify: "[65536,131072]"
-          err_msg: 'net.ipv4.tcp_rmem_default : #{tcp_rmem_default}. net.ipv4.tcp_rmem_default from net.ipv4.tcp_rmem. It is a non recommended value, recommended value :is 65536 ≤ default≤ 131072'
+          err_msg: 'net.ipv4.tcp_rmem_default : #{tcp_rmem_default}. net.ipv4.tcp_rmem_default from net.ipv4.tcp_rmem. recommended: 65536 ≤ default ≤ 131072'
       - type: ssh
         ssh: "echo \"#{tcp_rmem}\" | awk '{print $3}'"
         result:
@@ -32,4 +32,4 @@
           report_type: warning
           verify_type: between
           verify: "[8388608,16777216]"
-          err_msg: 'net.ipv4.tcp_rmem_max : #{tcp_rmem_max}. net.ipv4.tcp_rmem_max from net.ipv4.tcp_rmem. It is a non recommended value, recommended value is 65536 ≤ max≤ 131072'
+          err_msg: 'net.ipv4.tcp_rmem_max : #{tcp_rmem_max}. net.ipv4.tcp_rmem_max from net.ipv4.tcp_rmem. recommended: 8388608 ≤ max ≤ 16777216'
diff --git a/handler/checker/tasks/observer/system/parameter_tcp_wmem.yaml b/handler/checker/tasks/observer/system/parameter_tcp_wmem.yaml
index 76eadaa9..9ed4a2fc 100644
--- a/handler/checker/tasks/observer/system/parameter_tcp_wmem.yaml
+++ b/handler/checker/tasks/observer/system/parameter_tcp_wmem.yaml
@@ -14,7 +14,7 @@ task:
           report_type: warning
           verify_type: between
           verify: "[4096,8192]"
-          err_msg: 'net.ipv4.tcp_wmem_min : #{tcp_wmem_min} is a non recommended value, recommended value is 4096 ≤ min ≤ 8192'
+          err_msg: 'net.ipv4.tcp_wmem_min : #{tcp_wmem_min}. recommended: 4096 ≤ min ≤ 8192'
       - type: ssh
         ssh: "echo \"#{tcp_wmem}\" | awk '{print $2}'"
         result:
@@ -22,7 +22,7 @@
           report_type: warning
           verify_type: between
           verify: "[65536,131072]"
-          err_msg: 'net.ipv4.tcp_wmem_default : #{tcp_wmem_default} is a non recommended value, recommended value :is 65536 ≤ default≤ 131072'
+          err_msg: 'net.ipv4.tcp_wmem_default : #{tcp_wmem_default}. recommended: 65536 ≤ default ≤ 131072'
       - type: ssh
         ssh: "echo \"#{tcp_wmem}\" | awk '{print $3}'"
         result:
@@ -30,4 +30,4 @@
           report_type: warning
           verify_type: between
           verify: "[8388608,16777216]"
-          err_msg: 'net.ipv4.tcp_wmem_max : #{tcp_wmem_max} is a non recommended value, recommended value is 65536 ≤ max≤ 131072'
+          err_msg: 'net.ipv4.tcp_wmem_max : #{tcp_wmem_max}. recommended: 8388608 ≤ max ≤ 16777216'
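Note: the tcp_rmem/tcp_wmem tasks above split a sysctl triple ("min default max") with awk '{print $1}' and friends. The same extraction in Python, for reference (the sample value is a common default, used only as an example):

    tcp_rmem = "4096 87380 6291456"  # e.g. output of `sysctl -n net.ipv4.tcp_rmem`
    rmem_min, rmem_default, rmem_max = (int(v) for v in tcp_rmem.split())
    assert rmem_min <= rmem_default <= rmem_max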
diff --git a/handler/checker/tasks/observer/system/ulimit_parameter.yaml b/handler/checker/tasks/observer/system/ulimit_parameter.yaml
index 08eb0ea3..778259c5 100644
--- a/handler/checker/tasks/observer/system/ulimit_parameter.yaml
+++ b/handler/checker/tasks/observer/system/ulimit_parameter.yaml
@@ -8,21 +8,21 @@ task:
           set_value: parameter
           report_type: warning
           verify: "[ 'unlimited' == ${parameter} ]"
-          err_msg: 'On ip : #{remote_ip}, ulimit -c is #{parameter} . It is a non recommended value, and the recommended value is unlimited. Please refer to the official website document for the configuration method'
+          err_msg: 'On ip : #{remote_ip}, ulimit -c is #{parameter}. recommended: unlimited.'
       - type: ssh
         ssh: ulimit -u
         result:
           set_value: parameter
           report_type: warning
           verify: "[ '655360' == ${parameter} ]"
-          err_msg: 'On ip : #{remote_ip}, ulimit -u is #{parameter} . It is a non recommended value, and the recommended value is 655360. Please refer to the official website document for the configuration method'
+          err_msg: 'On ip : #{remote_ip}, ulimit -u is #{parameter}. recommended: 655360.'
       - type: ssh
         ssh: ulimit -s
         result:
           set_value: parameter
           report_type: warning
           verify: "[ 'unlimited' == ${parameter} ]"
-          err_msg: 'On ip : #{remote_ip}, ulimit -s is #{parameter} . It is a non recommended value, and the recommended value is unlimited. Please refer to the official website document for the configuration method'
+          err_msg: 'On ip : #{remote_ip}, ulimit -s is #{parameter}. recommended: unlimited.'
       - type: ssh
         ssh: ulimit -n
         result:
           set_value: parameter
           report_type: warning
           verify_type: equal
           verify: 655350
-          err_msg: 'On ip : #{remote_ip}, ulimit -n is #{parameter} . It is a non recommended value, and the recommended value is unlimited. Please refer to the official website document for the configuration method'
+          err_msg: 'On ip : #{remote_ip}, ulimit -n is #{parameter}. recommended: 655350.'
diff --git a/handler/checker/tasks/observer/table/information_schema_tables_two_data.yaml b/handler/checker/tasks/observer/table/information_schema_tables_two_data.yaml
new file mode 100644
index 00000000..76d450b7
--- /dev/null
+++ b/handler/checker/tasks/observer/table/information_schema_tables_two_data.yaml
@@ -0,0 +1,11 @@
+info: 'Check whether any table has two records in information_schema.tables.'
+task:
+  - version: "[4.0.0.0,*]"
+    steps:
+      - type: sql
+        sql: 'select count(0) from oceanbase.__all_virtual_table_stat where table_id = partition_id and (tenant_id,table_id) in (select tenant_id, table_id from oceanbase.__all_virtual_table where part_level != 0);'
+        result:
+          set_value: err_table_count
+          verify_type: equal
+          verify: 0
+          err_msg: 'Found tables with two records in information_schema.tables. err_table_count is #{err_table_count}. Get more info by "select * from oceanbase.__all_virtual_table_stat where table_id = partition_id and (tenant_id,table_id) in (select tenant_id, table_id from oceanbase.__all_virtual_table where part_level != 0);". You can fix it by "delete from __all_table_stat where table_id=partition_id and table_id=${partition table table_id};" and "delete from __all_column_stat where table_id=partition_id and table_id=${partition table table_id};".'
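Note: err_msg strings throughout these tasks interpolate #{name} placeholders from the step's task_variable_dict. A sketch of that substitution; the regex approach is an assumption about the reporter, not copied from it:

    import re

    def render_err_msg(template, variables):
        # Replace each #{name} with its value from task_variable_dict;
        # unknown names are left as-is so a bad template stays visible in the report.
        return re.sub(r"#\{(\w+)\}",
                      lambda m: str(variables.get(m.group(1), m.group(0))),
                      template)

    print(render_err_msg("The number of tenants: #{tenant_nu}. recommended: tenant_nu<100",
                         {"tenant_nu": 120}))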
diff --git a/handler/checker/tasks/observer_check_package.yaml b/handler/checker/tasks/observer_check_package.yaml index f0a6b356..4b8f132b 100644 --- a/handler/checker/tasks/observer_check_package.yaml +++ b/handler/checker/tasks/observer_check_package.yaml @@ -2,7 +2,7 @@ ad: info_en: "Test and inspection tasks" info_cn: "测试巡检任务" tasks: - - system.parameter + - system.* build_before: info_en: "Deployment environment check" info_cn: "部署环境检查" diff --git a/handler/gather/gather_log.py b/handler/gather/gather_log.py index ee640f4f..d5a1174f 100644 --- a/handler/gather/gather_log.py +++ b/handler/gather/gather_log.py @@ -35,6 +35,7 @@ class GatherLogHandler(BaseShellHandler): def __init__(self, context, gather_pack_dir='./', is_scene=False): super(GatherLogHandler, self).__init__() + self.pack_dir_this_command = "" self.context = context self.stdio = context.stdio self.is_ssh = True @@ -168,6 +169,7 @@ def handle_from_node(node): summary_tuples = self.__get_overall_summary(gather_tuples, self.zip_encrypt) self.stdio.print(summary_tuples) + self.pack_dir_this_command=pack_dir_this_command # Persist the summary results to a file FileUtil.write_append(os.path.join(pack_dir_this_command, "result_summary.txt"), summary_tuples) last_info = "For result details, please run cmd \033[32m' cat {0} '\033[0m\n".format(os.path.join(pack_dir_this_command, "result_summary.txt")) @@ -333,6 +335,7 @@ def __pharse_log(self, ssh_helper, home_path, log_name, gather_path): """ log_path = os.path.join(home_path, "log") if self.grep_options is not None: + grep_cmd="" if type(self.grep_options) == str: grep_cmd = "grep -e '{grep_options}' {log_dir}/{log_name} >> {gather_path}/{log_name} ".format( grep_options=self.grep_options, diff --git a/handler/gather/gather_obproxy_log.py b/handler/gather/gather_obproxy_log.py index 603097e0..dcf0b769 100644 --- a/handler/gather/gather_obproxy_log.py +++ b/handler/gather/gather_obproxy_log.py @@ -249,7 +249,7 @@ def __get_log_name(self, ssh_helper, node): self.scope == "obproxy_limit" or self.scope == "obproxy_slow" or self.scope == "obproxy_diagnosis" or self.scope == "obproxy_error": get_obproxy_log = "ls -1 -F %s/*%s.*log* | awk -F '/' '{print $NF}'" % (log_path, self.scope) else: - get_obproxy_log = "ls -1 -F %s/obproxy.*log* %s/obproxy_error.*log* %s/obproxy_stat.*log* %s/obproxy_digest.*log* %s/obproxy_limit.*log* %s/obproxy_slow.*log* | awk -F '/' '{print $NF}'" % (log_path, log_path, log_path, log_path, log_path, log_path) + get_obproxy_log = "ls -1 -F %s/obproxy.*log* %s/obproxy_error.*log* %s/obproxy_stat.*log* %s/obproxy_digest.*log* %s/obproxy_limit.*log* %s/obproxy_slow.*log* %s/obproxy_diagnosis.*log*| awk -F '/' '{print $NF}'" % (log_path, log_path, log_path, log_path, log_path, log_path,log_path) if self.is_ssh: log_files = SshClient(self.stdio).run(ssh_helper, get_obproxy_log).strip() else: diff --git a/handler/rca/rca_handler.py b/handler/rca/rca_handler.py index 92f423b3..94d4d017 100644 --- a/handler/rca/rca_handler.py +++ b/handler/rca/rca_handler.py @@ -252,6 +252,15 @@ def get_scene_info(self): raise Exception("rca ({0}) scene.get_scene_info() undefined".format(type(self).__name__)) def export_result(self): return self.Result.export() + def get_all_tenants_id(self): + try: + if self.ob_connector is None: + raise Exception("ob_connector is None") + all_tenant_id_data=self.ob_connector.execute_sql("select tenant_id from oceanbase.__all_tenant;")[0] + return all_tenant_id_data + except Exception as e: + raise Exception("run rca's get_all_tenants_id. 
Exception: {0}".format(e)) + class Result: @@ -293,6 +302,8 @@ def add_record(self, record): def add_suggest(self, suggest): self.suggest += suggest + def suggest_is_empty(self): + return self.suggest == "The suggest: " def export_suggest(self): return self.suggest diff --git a/handler/rca/scene/ddl_disk_full_scene.py b/handler/rca/scene/ddl_disk_full_scene.py index 89095f80..b45db4ac 100644 --- a/handler/rca/scene/ddl_disk_full_scene.py +++ b/handler/rca/scene/ddl_disk_full_scene.py @@ -12,7 +12,7 @@ """ @time: 2024/04/01 -@file: ddl_disk_full.py +@file: ddl_disk_full_scene.py @desc: """ import re @@ -69,6 +69,9 @@ def init(self, context): tenant_data = self.ob_connector.execute_sql( "select tenant_id from oceanbase.__all_tenant where tenant_name = '{0}';".format(tenant_name)) + if len(tenant_data) == 0: + raise RCAInitException( + "can not find tenant id by tenant name: {0}. Please check the tenant name.".format(tenant_name)) self.tenant_id = tenant_data[0][0] if self.tenant_id is None: raise RCAInitException( @@ -76,6 +79,9 @@ def init(self, context): table_id_data = self.ob_connector.execute_sql( "select table_id from oceanbase.__all_virtual_table where table_name = '{0}';".format(table_name)) + if len(table_id_data) == 0: + raise RCAInitException( + "can not find table id by table name: {0}. Please check the table name.".format(table_name)) self.table_id = table_id_data[0][0] if self.table_id is None: raise RCAInitException( diff --git a/handler/rca/scene/disconnection_scene.py b/handler/rca/scene/disconnection_scene.py index 1ac6d6a2..754224a0 100644 --- a/handler/rca/scene/disconnection_scene.py +++ b/handler/rca/scene/disconnection_scene.py @@ -12,7 +12,7 @@ """ @time: 2024/03/11 -@file: disconnectionScene.py +@file: disconnection_scene.py @desc: """ import re diff --git a/handler/rca/scene/log_error_scene.py b/handler/rca/scene/log_error_scene.py new file mode 100644 index 00000000..39ab8929 --- /dev/null +++ b/handler/rca/scene/log_error_scene.py @@ -0,0 +1,335 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -* +# Copyright (c) 2022 OceanBase +# OceanBase Diagnostic Tool is licensed under Mulan PSL v2. +# You can use this software according to the terms and conditions of the Mulan PSL v2. +# You may obtain a copy of Mulan PSL v2 at: +# http://license.coscl.org.cn/MulanPSL2 +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, +# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, +# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +# See the Mulan PSL v2 for more details. + +""" +@time: 2024/04/16 +@file: log_error_scene.py +@desc: +""" +import os +import re + +from handler.rca.rca_exception import RCAInitException, RCAExecuteException +from handler.rca.rca_handler import RcaScene, RCA_ResultRecord +from common.tool import StringUtils + + +class LogErrorScene(RcaScene): + def __init__(self): + super().__init__() + self.all_tenant_election_leader_info = None + self.work_path = None + self.all_tenant_ids = None + + def init(self, context): + super().init(context) + ## observer version≥4.0.0.0 + observer_version = self.observer_version + if observer_version is None or len(observer_version.strip()) == 0: + raise RCAInitException("observer version is None. 
Please check the NODES conf.") + if not (observer_version == "4.0.0.0" or StringUtils.compare_versions_greater(observer_version, "4.0.0.0")): + self.stdio.error("observer version is {0}, which is less than 4.0.0.0.".format(observer_version)) + raise RCAInitException("observer version is {0}, which is less than 4.0.0.0.".format(observer_version)) + self.verbose("observer version is {0}.".format(observer_version)) + if self.ob_connector is None: + raise RCAInitException("ob_connector is None. Please check the NODES conf.") + + def verbose(self, info): + self.stdio.verbose("[NoLeaderScene] {0}".format(info)) + + def execute(self): + try: + if self.observer_version >= '4.2.1.0': + self.execute_421() + return + # check Election leader + # get_all_tenant_id + self.verbose("start to get all tenant id...") + sql = "select tenant_id from oceanbase.__all_tenant;" + tenant_ids = self.ob_connector.execute_sql(sql) + if len(tenant_ids) <= 0: + raise RCAExecuteException("can not find any tenant id") + self.all_tenant_election_leader_info = {} + for tenant_id_data in tenant_ids: + record = RCA_ResultRecord() + try: + tenant_id_data = tenant_id_data[0] + + record.add_record("tenant_id:{0}.".format(tenant_id_data)) + self.execute_by_tenant_id(tenant_id_data, record) + except Exception as e: + self.verbose("check election leader error,tenant_id:{0},error:{1}".format(tenant_id_data, e)) + continue + finally: + if len(record.suggest) == 13: + record.add_suggest("no suggest") + self.Result.records.append(record) + except Exception as e: + self.stdio.error("NoLeaderScene execute Exception:{0}".format(e)) + + def execute_by_tenant_id(self, tenant_id, record): + try: + record.add_record("start step1") + election_leader_info = self.check_election_leader_by_tenant_id(tenant_id) + self.verbose("election_leader_info:{0}".format(election_leader_info)) + record.add_record("election_leader_info:{0}".format(election_leader_info)) + if election_leader_info == "": + self.verbose("can not find any election leader,tenant_id:{0}".format(tenant_id)) + record.add_record("election_leader_info is null") + record.add_suggest("can not find any election leader,tenant_id:{0}. Please check it.".format(tenant_id)) + return + record.add_record("start step2") + step_next_tag = True + ls_ids = self.ob_connector.execute_sql( + "select distinct (ls_id) from oceanbase.__all_virtual_log_stat where tenant_id={0};".format(tenant_id)) + if ls_ids is None or len(ls_ids) <= 0: + self.stdio.warn("not found log about election_leader. tenant_id: {0}".format(tenant_id)) + record.add_suggest( + "not found log on oceanbase.__all_virtual_log_stat. 
tenant_id: {0}".format(tenant_id)) + return + + for ls_id in ls_ids: + ls_id = ls_id[0] + leader_ls_id_bool = self.ob_connector.execute_sql( + 'select count(0) from oceanbase.__all_virtual_log_stat where role="LEADER" and tenant_id={0} and ls_id="{1}";'.format( + tenant_id, ls_id)) + leader_ls_id_bool = leader_ls_id_bool[0] + if leader_ls_id_bool <= 0: + record.add_record( + "tenant_id: {0}, ls_id: {1} on oceanbase.__all_virtual_log_stat no LEADER".format(tenant_id, + ls_id)) + record.add_suggest( + "tenant_id: {0}, ls_id: {1} on oceanbase.__all_virtual_log_stat no LEADER".format(tenant_id, + ls_id)) + self.stdio.warn( + "tenant_id: {0}, ls_id: {1} on oceanbase.__all_virtual_log_stat no LEADER".format(tenant_id, + ls_id)) + step_next_tag = False + + if step_next_tag is False: + self.verbose("step_next_tag is false") + return + return + + + except Exception as e: + self.stdio.warn("execute_by_tenant_id:{0} Exception:{1}".format(tenant_id, e)) + + def execute_421(self): + try: + self.stdio.print("start execute_421") + if self.ob_connector is None: + self.stdio.error("ob_connector is None. please check conf") + return + # get data from __all_virtual_ha_diagnose + sql = "select * from oceanbase.__all_virtual_ha_diagnose;" + cursor = self.ob_connector.execute_sql_return_cursor_dictionary(sql) + diagnose_data = cursor.fetchall() + if diagnose_data is None or len(diagnose_data) <= 0: + self.stdio.warn("not found data on oceanbase.__all_virtual_ha_diagnose") + return + # get all tenant_id + tenant_ids = [] + for data in diagnose_data: + tenant_ids.append(data["tenant_id"]) + self.verbose("tenant_ids:{0}".format(tenant_ids)) + # step1 + ### tenant_diagnose_data: [tenant_id] diagnose_data + tenant_diagnose_data = {} + for data in diagnose_data: + if tenant_diagnose_data.get(data["tenant_id"]) is None: + tenant_diagnose_data[data["tenant_id"]] = [data] + else: + tenant_data = tenant_diagnose_data.get(data["tenant_id"]) + tenant_data.append(data) + tenant_diagnose_data[data["tenant_id"]] = tenant_data + self.verbose("tenant_diagnose_data:{0}".format(tenant_diagnose_data)) + self.stdio.start_loading("no_leader scene start analyzing...") + for tenant_id in tenant_diagnose_data: + record_one_tenant=self.execute_421_no_leader_by_tenant_id(tenant_id, tenant_diagnose_data[tenant_id]) + self.Result.records.append(record_one_tenant) + self.stdio.stop_loading('no_leader scene end') + return + + except Exception as e: + raise RCAExecuteException("execute_421 execute error: {0}".format(e)) + + def execute_421_no_leader_by_tenant_id(self, tenant_id,diagnose_data): + record = RCA_ResultRecord() + try: + self.stdio.verbose("start execute_421_no_leader_by_tenant_id") + record.add_record("tenant_id: {0}.".format(tenant_id)) + leader_nu={} + record.add_record("start step1") + for diagnose_data_by_tenant_id in diagnose_data: + if diagnose_data_by_tenant_id["election_role"].upper() == "LEADER": + leader_nu[diagnose_data_by_tenant_id["ls_id"]] = leader_nu.get( + diagnose_data_by_tenant_id["ls_id"], 0) + 1 + else: + leader_nu[diagnose_data_by_tenant_id["ls_id"]] = leader_nu.get( + diagnose_data_by_tenant_id["ls_id"], 0) + record.add_record("all ls_id:{0}".format(list(leader_nu.keys()))) + self.verbose("all ls_id:{0}".format(list(leader_nu.keys()))) + scene_1_tag=True + for ls_id in leader_nu: + record.add_record("on ls_id: {1} ".format(tenant_id, ls_id)) + self.verbose("on tenant_id: {0}, ls_id: {1} ".format(tenant_id, ls_id)) + if leader_nu[ls_id] > 1: + self.stdio.warn("the leader number > 1") + record.add_record("the 
ls_id's leader number > 1") + record.add_suggest( + "tenant_id: {0}, ls_id: {1} .the ls_id's leader number > 1".format(tenant_id, ls_id)) + scene_1_tag = False + continue + elif leader_nu[ls_id] == 0: + self.stdio.warn( + "the leader number = 0,The election layer is unable to select a new owner, and a common problem in this scenario is that the message delay is too large. You can continue to troubleshoot the problem of message delay or backlog in the log") + record.add_suggest( + "tenant_id: {0}, ls_id: {1} .the leader number = 0. The election layer is unable to select a new owner, and a common problem in this scenario is that the message delay is too large. You can continue to troubleshoot the problem of message delay or backlog in the log".format( + tenant_id, ls_id)) + scene_1_tag = False + continue + else: + ## Normal + self.verbose("Normal. The ls_id's leader number = 1") + record.add_record("Normal. The ls_id's leader number = 1") + + if scene_1_tag is False: + self.verbose("scene_1 is check") + return record + + ## scene 2 + record.add_record("start step2") + scene_2_tag = True + for tenant_diagnose_data_by_tenant_id in diagnose_data: + ls_id = tenant_diagnose_data_by_tenant_id["ls_id"] + record.add_record("on ls_id: {1} ".format(tenant_id, ls_id)) + if tenant_diagnose_data_by_tenant_id["election_role"].upper() == "LEADER" and \ + tenant_diagnose_data_by_tenant_id["palf_role"].upper() != "LEADER" and \ + tenant_diagnose_data_by_tenant_id["palf_state"].upper() != "ACTIVE": + self.stdio.warn( + "tenant_id: {0}, ls_id: {1} on oceanbase.__all_virtual_ha_diagnose election_role is LEADER but palf_role is {2} and palf_state is {3}".format( + tenant_id, + ls_id, + tenant_diagnose_data_by_tenant_id["palf_role"], + tenant_diagnose_data_by_tenant_id["palf_state"])) + record.add_record( + "tenant_id: {0}, ls_id: {1} on oceanbase.__all_virtual_ha_diagnose election_role is LEADER but palf_role is {2} and palf_state is {3}".format( + tenant_id, + ls_id, + tenant_diagnose_data_by_tenant_id["palf_role"], + tenant_diagnose_data_by_tenant_id["palf_state"])) + record.add_suggest( + "tenant_id: {0}, ls_id: {1} on oceanbase.__all_virtual_ha_diagnose election_role is LEADER but palf_role is {2} and palf_state is {3}. The newly elected leader failed to take office in the palf layer, and the palf_state can be used to determine at which stage the palf failed to take office.".format( + tenant_id, + ls_id, + tenant_diagnose_data_by_tenant_id["palf_role"], + tenant_diagnose_data_by_tenant_id["palf_state"])) + scene_2_tag = False + else: + self.verbose( + "tenant_id: {0}, ls_id: {1} on oceanbase.__all_virtual_ha_diagnose election_role is LEADER , palf_role is {2} and palf_state is {3}".format( + tenant_id, + ls_id, + tenant_diagnose_data_by_tenant_id["palf_role"], + tenant_diagnose_data_by_tenant_id["palf_state"])) + record.add_record("Normal. 
Unable to find a replica where both election_role and palf_role are leaders, but log_handler_role is follower") + continue + if scene_2_tag is False: + self.verbose("scene_2 is check") + return + ## scene 3 + record.add_record("start step3") + + for tenant_diagnose_data_by_tenant_id in diagnose_data: + record.add_record( + "tenant_id: {0}, ls_id: {1} ".format(tenant_diagnose_data_by_tenant_id["tenant_id"], + tenant_diagnose_data_by_tenant_id["ls_id"])) + if tenant_diagnose_data_by_tenant_id["election_role"].upper() == "LEADER" and \ + tenant_diagnose_data_by_tenant_id["palf_role"].upper() == "LEADER" and \ + tenant_diagnose_data_by_tenant_id["log_handler_role"].upper() == "follower": + record.add_record("election_role:LEADER , palf_role: LEADER, log_handler_role: follower") + log_handler_takeover_state = tenant_diagnose_data_by_tenant_id[ + "log_handler_takeover_state"].lower() + record.add_record("log_handler_takeover_state: {0}".format(log_handler_takeover_state)) + if log_handler_takeover_state == "wait_replay_done": + record.add_suggest( + "Previous stuck waiting for replay steps. Please check the issue about replay") + elif log_handler_takeover_state == "unknown": + record.add_suggest( + "Please check observe whether the remaining log streams of this tenant also have the issue of log handler failure in taking over") + elif log_handler_takeover_state == "wait_rc_handler_done": + log_handler_takeover_log_type = tenant_diagnose_data_by_tenant_id[ + "log_handler_takeover_log_type"] + record.add_record( + "log_handler_takeover_log_type: {0}".format(log_handler_takeover_log_type)) + record.add_suggest( + "log_handler_takeover_log_type is {0}. Please report oceanbase's community".format( + log_handler_takeover_log_type)) + else: + record.add_record("Normal.Unable to find a replica where the selection_role is a leader, but the palf_role and palf_state are not leaders or active, respectively") + + if record.suggest_is_empty(): + record.add_suggest("Normal. Not find the reason of the log handler failure in taking over.") + except Exception as e: + raise RCAExecuteException("tenant_id: {0}. execute_421_no_leader_by_tenant_id execute error: {1}".format(tenant_id,e)) + finally: + + return record + + + + + def check_election_leader_by_tenant_id(self, tenant_id): + try: + self.stdio.verbose("start check_election_leader_by_tenant_id") + self.gather_log.set_parameters("scope", "election") + self.gather_log.grep("T{0}_.*dump proposer info".format(tenant_id)) + self.work_path = self.store_dir + logs_name = self.gather_log.execute() + if len(logs_name) == 0: + self.stdio.warn( + "check_election_leader_by_tenant_id not found log about election_leader. tenant_id: {0}".format( + tenant_id)) + return "" + self.stdio.verbose( + "check_election_leader_by_tenant_id tenant_id: {0}, logs_name:{1}".format(tenant_id, logs_name)) + for name in logs_name: + self.stdio.verbose("read the log file: {0}".format(name)) + with open(name, 'rb') as file: + file.seek(0, os.SEEK_END) + file_length = file.tell() + file.seek(max(file_length - 1024, 0), 0) + lines = file.readlines() + last_line = lines[-1].decode().strip() + pattern = r'addr:"([^"]+)"' + match = re.search(pattern, last_line) + if match: + return match.group(1) + else: + return "" + except Exception as e: + raise RCAExecuteException( + "check_election_leader_by_tenant_id: {1}. 
execute error: {0}".format(e, tenant_id)) + + def export_result(self): + super().export_result() + + def get_scene_info(self): + + return {"name": "log_error", + "info_en": "Troubleshooting log related issues. Currently supported scenes: no_leader.", + "info_cn": '日志相关问题排查。目前支持:无主场景。', + } + + +log_error = LogErrorScene() \ No newline at end of file diff --git a/handler/rca/scene/major_hold_scene.py b/handler/rca/scene/major_hold_scene.py index dee2ad5a..7ebfb93d 100644 --- a/handler/rca/scene/major_hold_scene.py +++ b/handler/rca/scene/major_hold_scene.py @@ -12,7 +12,7 @@ """ @time: 2024/1/2 -@file: major_hold.py +@file: major_hold_scene.py @desc: """ import json