From ba5148b4204be3c27af510c7254709ad5aa396eb Mon Sep 17 00:00:00 2001
From: Teingi
%s" % (data[1]) logger.debug("table schema: {0}".format(schemas)) except Exception as e: @@ -531,20 +559,26 @@ def __report(self, s): def tenant_mode_detected(self): try: - data = self.ob_connector.execute_sql("select version();") - logger.info("Detected mySQL mode successful, Database version : %s " % ("%s" % data[0])) - ob_version = data[0] - version_info = re.findall(r'OceanBase(_)?(.CE)?-v(.+)', ob_version[0]) - version = version_info[0][2] - if int(version[0]) >= 4: - self.sql_audit_name = "gv$ob_sql_audit" - self.plan_explain_name = "gv$ob_plan_cache_plan_explain" + data = self.db_connector.execute_sql("show variables like 'version_comment'") + ob_version = "3.0.0.0" + for row in data: + ob_version = row[1] + logger.info("Detected mySQL mode successful, Database version :{0} ".format(ob_version)) + version_pattern = r'(?:OceanBase(_CE)?\s+)?(\d+\.\d+\.\d+\.\d+)' + matched_version = re.search(version_pattern, ob_version) + if matched_version: + version = matched_version.group(2) + if int(version[0]) >= 4: + self.sql_audit_name = "gv$ob_sql_audit" + self.plan_explain_name = "gv$ob_plan_cache_plan_explain" + else: + self.sql_audit_name = "gv$sql_audit" + self.plan_explain_name = "gv$plan_cache_plan_explain" + self.ob_major_version = int(version[0]) + self.tenant_mode = "mysql" + self.sys_database = "oceanbase" else: - self.sql_audit_name = "gv$sql_audit" - self.plan_explain_name = "gv$plan_cache_plan_explain" - self.ob_major_version = int(version[0]) - self.tenant_mode = "mysql" - self.sys_database = "oceanbase" + logger.warn("Failed to match ob version") except: data = self.ob_connector.execute_sql("select SUBSTR(BANNER, 11, 100) from V$VERSION;") logger.info("Detectedo oracle mode successful, Database version : %s " % ("%s" % data[0])) diff --git a/handler/gather/gather_scenes.py b/handler/gather/gather_scenes.py new file mode 100644 index 00000000..a350502f --- /dev/null +++ b/handler/gather/gather_scenes.py @@ -0,0 +1,200 @@ +#!/usr/bin/env python +# -*- 
coding: UTF-8 -* +# Copyright (c) 2022 OceanBase +# OceanBase Diagnostic Tool is licensed under Mulan PSL v2. +# You can use this software according to the terms and conditions of the Mulan PSL v2. +# You may obtain a copy of Mulan PSL v2 at: +# http://license.coscl.org.cn/MulanPSL2 +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, +# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, +# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +# See the Mulan PSL v2 for more details. + +""" +@time: 2024/01/04 +@file: gather_scene_handler.py +@desc: +""" + +import os +import re +import uuid +import datetime +from common.logger import logger +from handler.gather.scenes.base import SceneBase +from utils.utils import display_trace +from common.obdiag_exception import OBDIAGFormatException +from utils.time_utils import parse_time_str +from utils.time_utils import parse_time_length_to_sec +from utils.time_utils import timestamp_to_filename_time +from utils.utils import display_trace +from handler.gather.scenes.list import GatherScenesListHandler +from utils.file_utils import mkdir_if_not_exist +from utils.string_utils import parse_custom_env_string +from common.scene import get_obproxy_and_ob_version +from colorama import Fore, Style + + +class GatherSceneHandler: + + def __init__(self, obproxy_cluster, obproxy_nodes, ob_cluster, ob_nodes, gather_pack_dir, gather_timestamp, tasks_base_path="./handler/gather/tasks/", task_type="observer"): + self.is_ssh = True + self.report = None + self.gather_timestamp = gather_timestamp + self.gather_pack_dir = gather_pack_dir + self.report_path = None + self.yaml_tasks = {} + self.code_tasks = [] + self.env = {} + self.scene = None + self.obproxy_cluster = obproxy_cluster + self.obproxy_nodes = obproxy_nodes + self.cluster = ob_cluster + self.ob_nodes = ob_nodes + self.tasks_base_path = tasks_base_path + self.task_type = task_type + self.variables = {} + + def handle(self, args): + if 
not self.__check_valid_and_parse_args(args): + return + self.__init_variables() + self.__init_report_path() + self.__init_task_names() + self.execute() + self.__print_result() + + def execute(self): + try: + logger.info("execute_tasks. the number of tasks is {0} ,tasks is {1}".format(len(self.yaml_tasks.keys()), self.yaml_tasks.keys())) + for key, value in zip(self.yaml_tasks.keys(), self.yaml_tasks.values()): + self.__execute_yaml_task_one(key, value) + for task in self.code_tasks: + self.__execute_code_task_one(task) + except Exception as e: + logger.error("Internal error :{0}".format(e)) + finally: + display_trace(uuid.uuid3(uuid.NAMESPACE_DNS, str(os.getpid()))) + + # execute yaml task + def __execute_yaml_task_one(self, task_name, task_data): + try: + logger.info("execute tasks is {0}".format(task_name)) + task_type = self.__get_task_type(task_name) + version = get_obproxy_and_ob_version(self.obproxy_nodes, self.ob_nodes, self.task_type) + if version: + self.cluster["version"] = re.findall(r'\d+\.\d+\.\d+\.\d+', version)[0] + logger.info("cluster.version is {0}".format(self.cluster["version"])) + task = SceneBase(scene=task_data["task"], obproxy_nodes=self.obproxy_nodes, ob_nodes=self.ob_nodes, cluster=self.cluster, report_dir=self.report_path, args=self.args, env=self.env, scene_variable_dict=self.variables, task_type=task_type) + logger.info("{0} execute!".format(task_name)) + task.execute() + logger.info("execute tasks end : {0}".format(task_name)) + else: + logger.error("can't get version") + except Exception as e: + logger.error("__execute_yaml_task_one Exception : {0}".format(e)) + + # execute code task + def __execute_code_task_one(self, task_name): + try: + logger.info("execute tasks is {0}".format(task_name)) + scene = {"name": task_name} + task = SceneBase(scene=scene, obproxy_nodes=self.obproxy_nodes, ob_nodes=self.ob_nodes, cluster=self.cluster, report_dir=self.report_path, args=self.args, env=self.env, mode='code', task_type=task_name) + 
logger.info("{0} execute!".format(task_name)) + task.execute() + logger.info("execute tasks end : {0}".format(task_name)) + except Exception as e: + logger.error("__execute_code_task_one Exception : {0}".format(e)) + + def __init_task_names(self): + if self.scene: + new = re.sub(r'\{|\}', '', self.scene) + items = re.split(r'[;,]', new) + scene = GatherScenesListHandler(self.tasks_base_path) + for item in items: + yaml_task_data = scene.get_one_yaml_task(item) + is_code_task = scene.is_code_task(item) + if is_code_task: + self.code_tasks.append(item) + else: + if yaml_task_data: + self.yaml_tasks[item] = yaml_task_data + else: + logger.error("Invalid Task :{0}".format(item)) + else: + logger.error("get task name failed") + + def __init_report_path(self): + try: + self.report_path = os.path.join(self.gather_pack_dir, "gather_pack_{0}".format(timestamp_to_filename_time(self.gather_timestamp))) + logger.info("Use {0} as pack dir.".format(self.report_path)) + mkdir_if_not_exist(self.report_path) + except Exception as e: + logger.error("init_report_path failed, error:{0}".format(e)) + + def __init_variables(self): + try: + + self.variables = { + "observer_data_dir": self.ob_nodes[0].get("home_path") if self.ob_nodes and self.ob_nodes[0].get("home_path") else "", + "obproxy_data_dir": self.obproxy_nodes[0].get("home_path") if self.obproxy_nodes and self.obproxy_nodes[0].get("home_path") else "", + "from_time": self.from_time_str, + "to_time": self.to_time_str + } + logger.info("gather scene variables: {0}".format(self.variables)) + except Exception as e: + logger.error("init gather scene variables failed, error: {0}".format(e)) + + def __get_task_type(self, s): + trimmed_str = s.strip() + if '.' in trimmed_str: + parts = trimmed_str.split('.', 1) + return parts[0] + else: + return None + + def __check_valid_and_parse_args(self, args): + """ + check whether command args are valid. 
If invalid, stop processing and print the error to the user + :param args: command args + :return: boolean. True if valid, False if invalid. + """ + self.args = args + # 1: to timestamp must be larger than from timestamp, and be valid + if getattr(args, "from") is not None and getattr(args, "to") is not None: + try: + from_timestamp = parse_time_str(getattr(args, "from")) + to_timestamp = parse_time_str(getattr(args, "to")) + self.from_time_str = getattr(args, "from") + self.to_time_str = getattr(args, "to") + except OBDIAGFormatException: + logger.error("Error: Datetime is invalid. Must be in format yyyy-mm-dd hh:mm:ss. from_datetime={0}, to_datetime={1}".format(getattr(args, "from"), getattr(args, "to"))) + return False + if to_timestamp <= from_timestamp: + logger.error("Error: from datetime is larger than to datetime, please check.") + return False + else: + now_time = datetime.datetime.now() + self.to_time_str = (now_time + datetime.timedelta(minutes=1)).strftime('%Y-%m-%d %H:%M:%S') + if args.since is not None: + self.from_time_str = (now_time - datetime.timedelta( + seconds=parse_time_length_to_sec(args.since))).strftime('%Y-%m-%d %H:%M:%S') + else: + self.from_time_str = (now_time - datetime.timedelta(minutes=30)).strftime('%Y-%m-%d %H:%M:%S') + # 2: store_dir must exist, else create directory. 
+ if getattr(args, "store_dir") is not None: + if not os.path.exists(os.path.abspath(getattr(args, "store_dir"))): + logger.warn("Error: args --store_dir [{0}] incorrect: No such directory, Now create it".format(os.path.abspath(getattr(args, "store_dir")))) + os.makedirs(os.path.abspath(getattr(args, "store_dir"))) + self.gather_pack_dir = os.path.abspath(getattr(args, "store_dir")) + if getattr(args, "scene") is not None: + self.scene = ' '.join(getattr(args, "scene")) + else: + return False + if getattr(args, "env") is not None: + env_dict = parse_custom_env_string(getattr(args, "env")) + self.env = env_dict + return True + + def __print_result(self): + print(Fore.YELLOW + "\nGather scene results stored in this directory: {0}\n".format(self.report_path) + Style.RESET_ALL) diff --git a/handler/gather/gather_sysstat.py b/handler/gather/gather_sysstat.py index c8a2c264..39c6167d 100644 --- a/handler/gather/gather_sysstat.py +++ b/handler/gather/gather_sysstat.py @@ -16,7 +16,6 @@ @desc: """ import os -import threading import time import datetime @@ -24,10 +23,9 @@ import uuid from common.logger import logger -from common.obdiag_exception import OBDIAGInvalidArgs from common.constant import const from common.command import LocalClient, SshClient -from common.command import get_file_size, download_file, mkdir, zip_dir, delete_file_force +from common.command import get_file_size, download_file, mkdir, zip_dir from handler.base_shell_handler import BaseShellHandler from utils.file_utils import mkdir_if_not_exist, size_format, write_result_append_to_file, parse_size from utils.shell_utils import SshHelper @@ -36,7 +34,7 @@ class GatherOsInfoHandler(BaseShellHandler): - def __init__(self, nodes, gather_pack_dir, gather_timestamp, common_config): + def __init__(self, nodes, gather_pack_dir, gather_timestamp=None, common_config=None, is_scene=False): super(GatherOsInfoHandler, self).__init__(nodes) for node in nodes: if node.get("ssh_type") == "docker": @@ -46,9 +44,10 @@ 
def __init__(self, nodes, gather_pack_dir, gather_timestamp, common_config): self.gather_timestamp = gather_timestamp self.local_stored_path = gather_pack_dir self.remote_stored_path = None + self.is_scene = is_scene self.config_path = const.DEFAULT_CONFIG_PATH if common_config is None: - self.file_size_limit = 2 * 1024 * 1024 + self.file_size_limit = 2 * 1024 * 1024 * 1024 else: self.file_size_limit = int(parse_size(common_config["file_size_limit"])) @@ -57,13 +56,10 @@ def handle(self, args): if not self.__check_valid_args(args): return - # if user indicates the store_dir, use it, otherwise use the dir in the config(default) - if args.store_dir is not None: - self.local_stored_path = os.path.abspath(args.store_dir) - - pack_dir_this_command = os.path.join(self.local_stored_path, - "gather_pack_{0}".format(timestamp_to_filename_time( - self.gather_timestamp))) + if self.is_scene: + pack_dir_this_command = self.local_stored_path + else: + pack_dir_this_command = os.path.join(self.local_stored_path,"gather_pack_{0}".format(timestamp_to_filename_time(self.gather_timestamp))) logger.info("Use {0} as pack dir.".format(pack_dir_this_command)) gather_tuples = [] @@ -160,9 +156,14 @@ def __gather_dmesg_current_info(self, ssh_helper, gather_path): def __gather_dmesg_boot_info(self, ssh_helper, dir_path): try: - dmesg_cmd = 'cp --force /var/log/dmesg {dir_path}/dmesg.boot'.format(dir_path=dir_path) - logger.info("gather dmesg boot info on server {0}, run cmd = [{1}]".format(ssh_helper.get_name(), dmesg_cmd)) - SshClient().run(ssh_helper, dmesg_cmd) if self.is_ssh else LocalClient().run(dmesg_cmd) + file_exit_cmd = "ls -l {file_path} 2>/dev/null".format(file_path="/var/log/dmesg") + file_exit = SshClient().run(ssh_helper, file_exit_cmd) if self.is_ssh else LocalClient().run(file_exit_cmd) + if file_exit: + dmesg_cmd = 'cp --force /var/log/dmesg {dir_path}/dmesg.boot'.format(dir_path=dir_path) + logger.info("gather dmesg boot info on server {0}, run cmd = 
[{1}]".format(ssh_helper.get_name(), dmesg_cmd)) + SshClient().run(ssh_helper, dmesg_cmd) if self.is_ssh else LocalClient().run(dmesg_cmd) + else: + logger.warn("the file /var/log/dmesg on server {0} not found ".format(ssh_helper.get_name())) except: logger.error("Failed to gather the /var/log/dmesg on server {0}".format(ssh_helper.get_name())) @@ -184,19 +185,19 @@ def __gather_mem_info(self, ssh_helper, gather_path): except: logger.error("Failed to gather memory info use tsar on server {0}".format(ssh_helper.get_name())) - @staticmethod - def __check_valid_args(args): + + def __check_valid_args(self, args): """ chech whether command args are valid. If invalid, stop processing and print the error to the user :param args: command args :return: boolean. True if valid, False if invalid. """ - # 1: store_dir must exist, else return "No such file or directory". + # 1: store_dir must exist, else create directory. if getattr(args, "store_dir") is not None: if not os.path.exists(os.path.abspath(getattr(args, "store_dir"))): - logger.error("Error: args --store_dir [{0}] incorrect: No such directory." - .format(os.path.abspath(getattr(args, "store_dir")))) - return False + logger.warn("Error: args --store_dir [{0}] incorrect: No such directory, Now create it".format(os.path.abspath(getattr(args, "store_dir")))) + os.makedirs(os.path.abspath(getattr(args, "store_dir"))) + self.local_stored_path = os.path.abspath(getattr(args, "store_dir")) return True @staticmethod diff --git a/handler/gather/scenes/__init__.py b/handler/gather/scenes/__init__.py new file mode 100644 index 00000000..40982804 --- /dev/null +++ b/handler/gather/scenes/__init__.py @@ -0,0 +1,17 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -* +# Copyright (c) 2022 OceanBase +# OceanBase Diagnostic Tool is licensed under Mulan PSL v2. +# You can use this software according to the terms and conditions of the Mulan PSL v2. 
+# You may obtain a copy of Mulan PSL v2 at: +# http://license.coscl.org.cn/MulanPSL2 +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, +# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, +# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +# See the Mulan PSL v2 for more details. + +""" +@time: 2023/12/26 +@file: __init__.py +@desc: +""" \ No newline at end of file diff --git a/handler/gather/scenes/base.py b/handler/gather/scenes/base.py new file mode 100644 index 00000000..a8e15a54 --- /dev/null +++ b/handler/gather/scenes/base.py @@ -0,0 +1,104 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -* +# Copyright (c) 2022 OceanBase +# OceanBase Diagnostic Tool is licensed under Mulan PSL v2. +# You can use this software according to the terms and conditions of the Mulan PSL v2. +# You may obtain a copy of Mulan PSL v2 at: +# http://license.coscl.org.cn/MulanPSL2 +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, +# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, +# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +# See the Mulan PSL v2 for more details. 
+ +""" +@time: 2024/1/10 +@file: base.py +@desc: +""" + +from common.logger import logger +from common.scene import filter_by_version +from handler.gather.step.base import Base +from utils.utils import node_cut_passwd_for_log +from handler.gather.scenes.sql_problem import SQLProblemScene +from handler.gather.scenes.cpu_high import CPUHighScene + + +class SceneBase(object): + def __init__(self, scene, obproxy_nodes, ob_nodes, cluster, report_dir=None, scene_variable_dict={}, args=None, env={}, mode="yaml", task_type="observer"): + self.scene_variable_dict = scene_variable_dict + self.scene = scene + self.cluster = cluster + self.ob_nodes = ob_nodes + self.obproxy_nodes = obproxy_nodes + self.report_dir = report_dir + self.args = args + self.mode = mode + self.env = env + self.task_type = task_type + + def execute(self): + try: + if self.mode == "yaml": + if self.task_type == "observer": + self.__execute_yaml_mode(self.ob_nodes) + elif self.task_type == "obproxy": + self.__execute_yaml_mode(self.obproxy_nodes) + elif self.task_type == "other": + self.__execute_yaml_mode(self.ob_nodes) + self.__execute_yaml_mode(self.obproxy_nodes) + elif self.mode == "code": + self.__execute_code_mode() + else: + logger.error("Unsupported mode. SKIP") + raise Exception("Unsupported mode. SKIP") + except Exception as e: + raise Exception("execute failed, error: {0}".format(e)) + + def __execute_yaml_mode(self, nodes): + steps_nu = filter_by_version(self.scene, self.cluster) + if steps_nu < 0: + logger.warning("Unadapted by version. 
SKIP") + return "Unadapted by version.SKIP" + logger.info("filter_by_version is return {0}".format(steps_nu)) + if len(nodes)==0: + logger.error("node is not exist") + return + node_number = 0 + for node in nodes: + logger.info("run scene in node: {0}".format(node_cut_passwd_for_log(node))) + steps = self.scene[steps_nu] + nu = 1 + node_number = node_number + 1 + for step in steps["steps"]: + try: + logger.debug("step nu: {0}".format(nu)) + if len(self.cluster)==0: + logger.error("cluster is not exist") + return + step_run = Base(step, node, self.cluster, self.report_dir, self.scene_variable_dict, self.args, self.env, node_number) + logger.info("step nu: {0} initted, to execute".format(nu)) + step_run.execute() + self.scene_variable_dict = step_run.update_task_variable_dict() + except Exception as e: + logger.error("SceneBase execute Exception: {0}".format(e)) + return + logger.info("step nu: {0} execute end ".format(nu)) + nu = nu + 1 + logger.info("scene execute end") + + def __execute_code_mode(self): + if self.scene["name"] == "observer.perf_sql" or self.scene["name"] == "observer.sql_err": + scene = SQLProblemScene(self.scene["name"], self.ob_nodes, self.obproxy_nodes, self.cluster, self.report_dir, self.scene_variable_dict, self.args, self.env) + elif self.scene["name"] == "observer.cpu_high": + scene = CPUHighScene(self.ob_nodes, self.cluster, self.report_dir, self.scene_variable_dict, self.args, self.env) + else: + logger.error("unsupported hard code scene {0}".format(self.scene["name"])) + return + try: + logger.info("hard code scene {0} execute start".format(self.scene["name"])) + scene.execute() + logger.info("hard code scene {0} execute end".format(self.scene["name"])) + except Exception as e: + logger.error("hard code scene execute failed, error :{0}".format(e)) + diff --git a/handler/gather/scenes/cpu_high.py b/handler/gather/scenes/cpu_high.py new file mode 100644 index 00000000..ae7fce0e --- /dev/null +++ b/handler/gather/scenes/cpu_high.py @@ -0,0 
+1,76 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -* +# Copyright (c) 2022 OceanBase +# OceanBase Diagnostic Tool is licensed under Mulan PSL v2. +# You can use this software according to the terms and conditions of the Mulan PSL v2. +# You may obtain a copy of Mulan PSL v2 at: +# http://license.coscl.org.cn/MulanPSL2 +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, +# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, +# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +# See the Mulan PSL v2 for more details. + +""" +@time: 2024/01/17 +@file: cpu_high.py +@desc: +""" +import os +from utils.shell_utils import SshHelper +from common.logger import logger +from handler.gather.gather_obstack2 import GatherObstack2Handler +from handler.gather.gather_perf import GatherPerfHandler +from utils.parser_utils import ParserAction + +class CPUHighScene(object): + def __init__(self, nodes, cluster, report_path, task_variable_dict=None, args=None, env={}): + if task_variable_dict is None: + self.task_variable_dict = {} + else: + self.task_variable_dict = task_variable_dict + self.nodes = nodes + self.cluster = cluster + self.report_path = report_path + self.args = args + self.env = env + self.is_ssh = True + + def execute(self): + self.__gather_obstack() + self.__gather_perf() + self.__gather_current_clocksource() + + def __gather_obstack(self): + logger.info("gather obstack start") + obstack = GatherObstack2Handler(nodes=self.nodes, gather_pack_dir=self.report_path, is_scene=True) + obstack.handle(self.args) + logger.info("gather obstack end") + + def __gather_perf(self): + logger.info("gather perf start") + perf = GatherPerfHandler(nodes=self.nodes, gather_pack_dir=self.report_path, is_scene=True) + self.args = ParserAction.add_attribute_to_namespace(self.args, 'scope', "all") + perf.handle(self.args) + logger.info("gather perf end") + + def __gather_current_clocksource(self): + try: + logger.info("gather 
current_clocksource start") + for node in self.nodes: + ssh_helper = SshHelper(self.is_ssh, node.get("ip"), node.get("user"), node.get("password"), node.get("port"), node.get("private_key"), node) + cmd = 'cat /sys/devices/system/clocksource/clocksource0/current_clocksource' + logger.info("gather current_clocksource, run cmd = [{0}]".format(cmd)) + result = ssh_helper.ssh_exec_cmd(cmd) + file_path = os.path.join(self.report_path, "current_clocksource_{ip}_result.txt".format(ip=str(node.get("ip")).replace('.', '_'))) + self.report(file_path, cmd, result) + logger.info("gather current_clocksource end") + except Exception as e: + logger.error("SshHandler init fail. Please check the node conf. Exception : {0} .".format(e)) + + def report(self, file_path, command, data): + try: + with open(file_path, 'a', encoding='utf-8') as f: + f.write('\n\n' + 'shell > ' + command + '\n') + f.write(data + '\n') + except Exception as e: + logger.error("report sql result to file: {0} failed, error: ".format(file_path)) diff --git a/handler/gather/scenes/list.py b/handler/gather/scenes/list.py new file mode 100644 index 00000000..86dbf2d4 --- /dev/null +++ b/handler/gather/scenes/list.py @@ -0,0 +1,127 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -* +# Copyright (c) 2022 OceanBase +# OceanBase Diagnostic Tool is licensed under Mulan PSL v2. +# You can use this software according to the terms and conditions of the Mulan PSL v2. +# You may obtain a copy of Mulan PSL v2 at: +# http://license.coscl.org.cn/MulanPSL2 +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, +# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, +# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +# See the Mulan PSL v2 for more details. 
+ +""" +@time: 2024/01/10 +@file: list.py +@desc: +""" + +import os +from common.logger import logger +from utils.yaml_utils import read_yaml_data +from handler.gather.scenes.register import hardcode_scene_list +from utils.print_utils import print_scene, print_title + +class GatherScenesListHandler: + def __init__(self, yaml_tasks_base_path="./handler/gather/tasks/"): + self.observer_tasks = {} + self.obproxy_tasks = {} + self.other_tasks = {} + self.yaml_tasks_base_path = yaml_tasks_base_path + base_path = os.path.expanduser(yaml_tasks_base_path) + if os.path.exists(base_path): + self.yaml_tasks_base_path = base_path + else: + logger.error("Failed to find yaml task path: {0}".format(base_path)) + + def handle(self, args): + logger.debug("list gather scene") + self.get_all_yaml_tasks() + self.get_all_code_tasks() + logger.debug("len of observer_tasks: {0}; len of observer_tasks: {1}; len of observer_tasks: {2};".format(len(self.observer_tasks), len(self.obproxy_tasks), len(self.other_tasks))) + if (len(self.observer_tasks) + len(self.obproxy_tasks) + len(self.other_tasks)) == 0: + logger.error("Failed to find any tasks") + else: + self.print_scene_data() + + def get_all_yaml_tasks(self): + try: + current_path = self.yaml_tasks_base_path + for root, dirs, files in os.walk(current_path): + for file in files: + if file.endswith('.yaml'): + folder_name = os.path.basename(root) + task_name = "{}.{}".format(folder_name, file.split('.')[0]) + task_data = read_yaml_data(os.path.join(root, file)) + task_data["name"] = task_name + if folder_name == "observer": + self.observer_tasks[task_name] = task_data + elif folder_name == "obproxy": + self.obproxy_tasks[task_name] = task_data + else: + self.other_tasks[task_name] = task_data + except Exception as e: + logger.error("get all yaml task failed, error: ", e) + + def get_all_code_tasks(self): + try: + for scene in hardcode_scene_list: + if "observer" in scene.name: + self.observer_tasks[scene.name] = 
self.__get_hardcode_task(scene) + elif "obproxy" in scene.name: + self.obproxy_tasks[scene.name] = self.__get_hardcode_task(scene) + else: + self.other_tasks[scene.name] = self.__get_hardcode_task(scene) + except Exception as e: + logger.error("get all hard code task failed, error: ", e) + + def __get_hardcode_task(self, scene): + return {"name": scene.name, "command": scene.command, "info_en": scene.info_en, "info_cn": scene.info_cn,} + + def get_one_yaml_task(self, name): + try: + task_data = None + current_path = self.yaml_tasks_base_path + for root, dirs, files in os.walk(current_path): + for file in files: + if file.endswith('.yaml'): + folder_name = os.path.basename(root) + task_name = "{}.{}".format(folder_name, file.split('.')[0]) + if name == task_name: + task_data = read_yaml_data(os.path.join(root, file)) + task_data["name"] = task_name + return task_data + except Exception as e: + logger.error("get one yaml task failed, error: ", e) + + def is_code_task(self, name): + try: + for scene in hardcode_scene_list: + if scene.name == name: + return True + return False + except Exception as e: + logger.error("get one code task failed, error: ", e) + return False + + def print_scene_data(self): + sorted_observer_tasks_dict = {} + sorted_obproxy_tasks_dict = {} + sorted_other_tasks_dict = {} + if self.other_tasks: + sorted_other_tasks = sorted(self.other_tasks.items(), key=lambda x: x[0]) + sorted_other_tasks_dict = {k: v for k, v in sorted_other_tasks} + print_title("Other Problem Gather Scenes") + print_scene(sorted_other_tasks_dict) + if self.obproxy_tasks: + sorted_obproxy_tasks = sorted(self.obproxy_tasks.items(), key=lambda x: x[0]) + sorted_obproxy_tasks_dict = {k: v for k, v in sorted_obproxy_tasks} + print_title("Obproxy Problem Gather Scenes") + print_scene(sorted_obproxy_tasks_dict) + if self.observer_tasks: + sorted_observer_tasks = sorted(self.observer_tasks.items(), key=lambda x: x[0]) + sorted_observer_tasks_dict = {k: v for k, v in 
sorted_observer_tasks} + print_title("Observer Problem Gather Scenes") + print_scene(sorted_observer_tasks_dict) + + \ No newline at end of file diff --git a/handler/gather/scenes/register.py b/handler/gather/scenes/register.py new file mode 100644 index 00000000..9ae68fc7 --- /dev/null +++ b/handler/gather/scenes/register.py @@ -0,0 +1,46 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -* +# Copyright (c) 2022 OceanBase +# OceanBase Diagnostic Tool is licensed under Mulan PSL v2. +# You can use this software according to the terms and conditions of the Mulan PSL v2. +# You may obtain a copy of Mulan PSL v2 at: +# http://license.coscl.org.cn/MulanPSL2 +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, +# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, +# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +# See the Mulan PSL v2 for more details. + +""" +@time: 2024/01/10 +@file: register.py +@desc: +""" + +from dataclasses import dataclass + +@dataclass +class RegisteredHardCodeScene: + name: str + command: str + info_en: str + info_cn: str + +# 对于不适合通过yaml编排的复杂场景可以用这个类注册,注册后通过代码实现采集逻辑 +db_connect = '-hxx -Pxx -uxx -pxx -Dxx' +trace_id = 'xx' + +hardcode_scene_list = [ + RegisteredHardCodeScene( + 'observer.perf_sql', + f'''obdiag gather scene run --scene=observer.perf_sql --env "{{db_connect='{db_connect}', trace_id='{trace_id}'}}"''', + '[SQL performance problem]', + '[SQL性能问题]' + ), + RegisteredHardCodeScene( + 'observer.sql_err', + f'''obdiag gather scene run --scene=observer.sql_err --env "{{db_connect='{db_connect}', trace_id='{trace_id}'}}"''', + '[SQL execution error]', + '[SQL 执行出错]' + ), + RegisteredHardCodeScene('observer.cpu_high', 'obdiag gather scene run --scene=observer.cpu_high', '[High CPU]', '[CPU高]'), +] diff --git a/handler/gather/scenes/sql_problem.py b/handler/gather/scenes/sql_problem.py new file mode 100644 index 00000000..a80bf33b --- /dev/null +++ b/handler/gather/scenes/sql_problem.py 
@@ -0,0 +1,103 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -* +# Copyright (c) 2022 OceanBase +# OceanBase Diagnostic Tool is licensed under Mulan PSL v2. +# You can use this software according to the terms and conditions of the Mulan PSL v2. +# You may obtain a copy of Mulan PSL v2 at: +# http://license.coscl.org.cn/MulanPSL2 +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, +# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, +# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +# See the Mulan PSL v2 for more details. + +""" +@time: 2024/01/17 +@file: sql_problem.py +@desc: +""" + +from common.logger import logger +from utils.parser_utils import ParserAction +from handler.gather.gather_log import GatherLogHandler +from handler.gather.gather_obproxy_log import GatherObProxyLogHandler +from handler.gather.gather_plan_monitor import GatherPlanMonitorHandler +from utils.string_utils import parse_mysql_cli_connection_string + + +class SQLProblemScene(object): + def __init__(self, scene_name, ob_nodes, obproxy_nodes, cluster, report_path, task_variable_dict=None, args=None, env={}): + if task_variable_dict is None: + self.task_variable_dict = {} + else: + self.task_variable_dict = task_variable_dict + self.ob_nodes = ob_nodes + self.obproxy_nodes = obproxy_nodes + self.cluster = cluster + self.report_path = report_path + self.args = args + self.env = env + self.is_ssh = True + self.scene_name = scene_name + self.db_conn = {} + self.trace_id = "FAKE_TRACE_ID" + + def execute(self): + self.__parse_env() + self.__gather_log() + self.__gather_obproxy_log() + self.__gather_sql_info() + + def __gather_log(self): + try: + logger.info("gather observer log start") + handler = GatherLogHandler(nodes=self.ob_nodes, gather_pack_dir=self.report_path, is_scene=True) + self.args = ParserAction.add_attribute_to_namespace(self.args, 'grep', "") + handler.handle(self.args) + logger.info("gather observer log end") + except 
Exception as e: + logger.error("gather observer log failed, error: {0}".format(e)) + raise Exception("gather observer log failed, error: {0}".format(e)) + + def __gather_obproxy_log(self): + try: + logger.info("gather obproxy log start") + handler = GatherObProxyLogHandler(nodes=self.obproxy_nodes, gather_pack_dir=self.report_path, is_scene=True) + if self.scene_name: + if self.scene_name == "observer.sql_err": + self.args = ParserAction.add_attribute_to_namespace(self.args, 'grep', None) + elif self.scene_name == "observer.perf_sql": + self.args = ParserAction.add_attribute_to_namespace(self.args, 'grep', self.trace_id) + else: + logger.warn("unsupported scene {0}".format(self.scene_name)) + return + self.args = ParserAction.add_attribute_to_namespace(self.args, 'scope', "all") + self.args = ParserAction.add_attribute_to_namespace(self.args, 'encrypt', "false") + handler.handle(self.args) + logger.info("gather obproxy log end") + else: + logger.warn("scene is None") + return + except Exception as e: + logger.error("gather obproxy log failed, error: {0}".format(e)) + raise Exception("gather obproxy log failed, error: {0}".format(e)) + + def __gather_sql_info(self): + try: + logger.info("gather sql info start") + handler = GatherPlanMonitorHandler(ob_cluster=self.cluster, gather_pack_dir=self.report_path, db_conn=self.db_conn, is_scene=True) + self.args = ParserAction.add_attribute_to_namespace(self.args, 'trace_id', self.trace_id) + handler.handle(self.args) + logger.info("gather sql info end") + except Exception as e: + logger.error("gather sql info failed, error: {0}".format(e)) + raise Exception("gather sql info failed, error: {0}".format(e)) + + def report(self): + pass + + def __parse_env(self): + cli_connection_string = self.env.get("db_connect") + self.db_conn = parse_mysql_cli_connection_string(cli_connection_string) + trace_id = self.env.get("trace_id") + if trace_id: + self.trace_id = self.env.get("trace_id") diff --git a/handler/gather/step/__init__.py 
b/handler/gather/step/__init__.py new file mode 100644 index 00000000..4dea303d --- /dev/null +++ b/handler/gather/step/__init__.py @@ -0,0 +1,17 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -* +# Copyright (c) 2022 OceanBase +# OceanBase Diagnostic Tool is licensed under Mulan PSL v2. +# You can use this software according to the terms and conditions of the Mulan PSL v2. +# You may obtain a copy of Mulan PSL v2 at: +# http://license.coscl.org.cn/MulanPSL2 +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, +# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, +# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +# See the Mulan PSL v2 for more details. + +""" +@time: 2024/01/05 +@file: __init__.py +@desc: +""" \ No newline at end of file diff --git a/handler/gather/step/base.py b/handler/gather/step/base.py new file mode 100644 index 00000000..0a2a9f10 --- /dev/null +++ b/handler/gather/step/base.py @@ -0,0 +1,99 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -* +# Copyright (c) 2022 OceanBase +# OceanBase Diagnostic Tool is licensed under Mulan PSL v2. +# You can use this software according to the terms and conditions of the Mulan PSL v2. +# You may obtain a copy of Mulan PSL v2 at: +# http://license.coscl.org.cn/MulanPSL2 +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, +# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, +# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +# See the Mulan PSL v2 for more details. 
+ +""" +@time: 2024/01/05 +@file: base.py +@desc: +""" +import docker +from handler.gather.step.ssh import SshHandler +from handler.gather.step.sql import StepSQLHandler +from common.logger import logger +from handler.gather.gather_log import GatherLogHandler +from handler.gather.gather_obproxy_log import GatherObProxyLogHandler +from handler.gather.gather_sysstat import GatherOsInfoHandler +from utils.parser_utils import ParserAction + +class Base(object): + def __init__(self, step, node, cluster, report_path, task_variable_dict=None, args=None, env={}, node_number = 1): + if task_variable_dict is None: + self.task_variable_dict = {} + else: + self.task_variable_dict = task_variable_dict + self.step = step + self.node = node + self.cluster = cluster + self.report_path = report_path + self.args = args + self.env = env + self.node_number = node_number + + def execute(self): + logger.debug("step: {0}".format(self.step)) + no_cluster_name_msg="(Please set ob_cluster_name or obproxy_cluster_name)" + try: + if "ip" in self.node: + self.task_variable_dict["remote_ip"] = self.node["ip"] + elif "ssh_type" in self.node and self.node["ssh_type"]=="docker": + logger.debug("execute ssh_type is docker") + self.task_variable_dict["remote_ip"] = docker.from_env().containers.get(self.node["container_name"]).attrs['NetworkSettings']['Networks']['bridge']["IPAddress"] + self.task_variable_dict["remote_home_path"] = self.node["home_path"] + + if "type" not in self.step: + logger.error("Missing field :type") + if (self.node_number > 1) and self.step.get("global") and (self.step.get("global") == "true"): + logger.info("step sets the value of the global is true and it is processing the {0} node, skipping gather".format(self.node_number)) + else: + if self.step["type"] == "ssh": + handler = SshHandler(self.step, self.node, self.report_path, self.task_variable_dict) + handler.execute() + elif self.step["type"] == "sql": + handler = StepSQLHandler(self.step, self.cluster, self.report_path, 
self.task_variable_dict) + handler.execute() + elif self.step["type"] == "log": + if self.node.get("host_type") and self.node.get("host_type") == "OBSERVER": + handler = GatherLogHandler(nodes=[self.node], gather_pack_dir=self.report_path, is_scene=True) + if self.step.get("grep") is None or len(self.step.get("grep")) == 0: + self.args = ParserAction.add_attribute_to_namespace(self.args, 'grep', None) + else: + self.args = ParserAction.add_attribute_to_namespace(self.args, 'grep', self.step.get("grep")) + handler.handle(self.args) + else: + logger.info("node host_type is {0} not OBSERVER, skipping gather log".format(self.node.get("host_type"))) + elif self.step["type"] == "obproxy_log": + if self.node.get("host_type") and self.node.get("host_type") == "OBPROXY": + handler = GatherObProxyLogHandler(nodes=[self.node], gather_pack_dir=self.report_path, is_scene=True) + if self.step.get("grep") is None or len(self.step.get("grep")) == 0: + self.args = ParserAction.add_attribute_to_namespace(self.args, 'grep', None) + else: + self.args = ParserAction.add_attribute_to_namespace(self.args, 'grep', self.step.get("grep")) + self.args = ParserAction.add_attribute_to_namespace(self.args, 'scope', 'all') + self.args = ParserAction.add_attribute_to_namespace(self.args, 'encrypt', 'false') + handler.handle(self.args) + else: + logger.info("node host_type is {0} not OBPROXY, skipping gather log".format(self.node.get("host_type"))) + elif self.step["type"] == "sysstat": + handler = GatherOsInfoHandler(nodes=[self.node], gather_pack_dir=self.report_path, is_scene=True) + handler.handle(self.args) + else: + logger.error("the type not support: {0}" .format(self.step["type"])) + except Exception as e: + logger.error("StepBase handler.execute fail, error: {0}".format(e)) + if self.step["type"] == "sql": + logger.error("[cluster:{0}] {1}]".format(self.cluster.get("ob_cluster_name") or self.cluster.get("obproxy_cluster_name") or no_cluster_name_msg, e)) + else: + logger.error("[{0}:{1}] 
{2}]".format(self.node.get("ssh_type") or "", self.node.get("container_name") or self.task_variable_dict.get("remote_ip") or "", e)) + logger.error("StepBase handler.execute fail, error: {0}".format(e)) + + def update_task_variable_dict(self): + return self.task_variable_dict diff --git a/handler/gather/step/sql.py b/handler/gather/step/sql.py new file mode 100644 index 00000000..f9d89f41 --- /dev/null +++ b/handler/gather/step/sql.py @@ -0,0 +1,74 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -* +# Copyright (c) 2022 OceanBase +# OceanBase Diagnostic Tool is licensed under Mulan PSL v2. +# You can use this software according to the terms and conditions of the Mulan PSL v2. +# You may obtain a copy of Mulan PSL v2 at: +# http://license.coscl.org.cn/MulanPSL2 +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, +# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, +# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +# See the Mulan PSL v2 for more details. + +""" +@time: 2024/01/04 +@file: sql.py +@desc: +""" +import os +from common.logger import logger +from common.ob_connector import OBConnector +from tabulate import tabulate +from utils.utils import build_str_on_expr_by_dict_2, convert_to_number + + +class StepSQLHandler: + def __init__(self, step, ob_cluster, report_path, task_variable_dict): + try: + self.ob_cluster = ob_cluster + self.ob_cluster_name = ob_cluster.get("cluster_name") + self.tenant_mode = None + self.sys_database = None + self.database = None + self.ob_connector = OBConnector(ip=ob_cluster.get("db_host"), + port=ob_cluster.get("db_port"), + username=ob_cluster.get("tenant_sys").get("user"), + password=ob_cluster.get("tenant_sys").get("password"), + timeout=10000) + except Exception as e: + logger.error("StepSQLHandler init fail. Please check the OBCLUSTER conf. 
OBCLUSTER: {0} Exception : {1} .".format(ob_cluster,e)) + self.task_variable_dict = task_variable_dict + self.enable_dump_db = False + self.enable_fast_dump = False + self.ob_major_version = None + self.step = step + self.report_path = report_path + self.report_file_path = os.path.join(self.report_path, "sql_result.txt") + + def execute(self): + try: + if "sql" not in self.step: + logger.error("StepSQLHandler execute sql is not set") + return + sql = build_str_on_expr_by_dict_2(self.step["sql"], self.task_variable_dict) + logger.info("StepSQLHandler execute: {0}".format(sql)) + columns, data = self.ob_connector.execute_sql_return_columns_and_data(sql) + if data is None or len(data) == 0: + logger.warning("excute sql: {0}, result is None".format(sql)) + else: + self.report(sql, columns, data) + except Exception as e: + logger.error("StepSQLHandler execute Exception: {0}".format(e).strip()) + + def update_step_variable_dict(self): + return self.task_variable_dict + + def report(self, sql, column_names, data): + try: + table_data = [list(row) for row in data] + formatted_table = tabulate(table_data, headers=column_names, tablefmt="grid") + with open(self.report_file_path, 'a', encoding='utf-8') as f: + f.write('\n\n' + 'obclient > ' + sql + '\n') + f.write(formatted_table) + except Exception as e: + logger.error("report sql result to file: {0} failed, error: ".format(self.report_file_path)) \ No newline at end of file diff --git a/handler/gather/step/ssh.py b/handler/gather/step/ssh.py new file mode 100644 index 00000000..41ec26f2 --- /dev/null +++ b/handler/gather/step/ssh.py @@ -0,0 +1,68 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -* +# Copyright (c) 2022 OceanBase +# OceanBase Diagnostic Tool is licensed under Mulan PSL v2. +# You can use this software according to the terms and conditions of the Mulan PSL v2. 
+# You may obtain a copy of Mulan PSL v2 at: +# http://license.coscl.org.cn/MulanPSL2 +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, +# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, +# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +# See the Mulan PSL v2 for more details. + +""" +@time: 2024/01/04 +@file: ssh.py +@desc: +""" +import os +from utils.shell_utils import SshHelper +from common.logger import logger +from utils.utils import build_str_on_expr_by_dict_2 + + +class SshHandler: + def __init__(self, step, node, report_path, task_variable_dict): + self.ssh_report_value = None + self.parameters = None + self.step = step + self.node = node + self.report_path = report_path + try: + is_ssh = True + self.ssh_helper = SshHelper(is_ssh, node.get("ip"), node.get("user"), node.get("password"), node.get("port"), node.get("private_key"), node) + except Exception as e: + logger.error("SshHandler init fail. Please check the NODES conf. node: {0}. 
Exception : {1} .".format(node, e)) + self.task_variable_dict = task_variable_dict + self.parameter = [] + self.report_file_path = os.path.join(self.report_path, "shell_result.txt") + + def execute(self): + try: + if "ssh" not in self.step: + logger.error("SshHandler execute ssh is not set") + return + ssh_cmd = build_str_on_expr_by_dict_2(self.step["ssh"], self.task_variable_dict) + logger.info("step SshHandler execute :{0} ".format(ssh_cmd)) + ssh_report_value = self.ssh_helper.ssh_exec_cmd(ssh_cmd) + if ssh_report_value is None: + ssh_report_value = "" + if len(ssh_report_value) > 0: + ssh_report_value = ssh_report_value.strip() + self.report(ssh_cmd, ssh_report_value) + except Exception as e: + logger.error("ssh execute Exception:{0}".format(e).strip()) + finally: + self.ssh_helper.ssh_close() + logger.debug("gather step SshHandler ssh_report_value:{0}".format(ssh_report_value)) + + def update_step_variable_dict(self): + return self.task_variable_dict + + def report(self, command, data): + try: + with open(self.report_file_path, 'a', encoding='utf-8') as f: + f.write('\n\n' + 'shell > ' + command + '\n') + f.write(data + '\n') + except Exception as e: + logger.error("report sql result to file: {0} failed, error: ".format(self.report_file_path)) diff --git a/handler/gather/tasks/obproxy/restart.yaml b/handler/gather/tasks/obproxy/restart.yaml new file mode 100644 index 00000000..65725159 --- /dev/null +++ b/handler/gather/tasks/obproxy/restart.yaml @@ -0,0 +1,18 @@ +info_en: "[obproxy restart]" +info_cn: "[obproxy无故重启]" +command: obdiag gather scene run --scene=obproxy.restart +task: + - version: "[2.0.0.0, *]" + steps: + - type: ssh + ssh: "ps -ef | grep obproxy" + global: false + - type: ssh + ssh: "cat /proc/sys/kernel/core_pattern" + global: false + - type: ssh + ssh: "ls -lhrt ${obproxy_data_dir}" + global: false + - type: obproxy_log + grep: "" + global: false diff --git a/handler/gather/tasks/observer/backup.yaml 
b/handler/gather/tasks/observer/backup.yaml new file mode 100644 index 00000000..f34162b0 --- /dev/null +++ b/handler/gather/tasks/observer/backup.yaml @@ -0,0 +1,107 @@ +info_en: "[backup problem]" +info_cn: "[数据备份问题]" +command: obdiag gather scene run --scene=observer.backup +task: + - version: "[2.0.0.0, 4.0.0.0]" + steps: + - type: sql + sql: "show variables like 'version_comment';" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.v$ob_cluster" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.__all_zone WHERE name='idc';" + global: true + - type: sql + sql: "SELECT id,svr_ip,svr_port,zone,inner_port,with_rootserver,status,start_service_time,last_offline_time FROM oceanbase.__all_server;" + global: true + - type: sql + sql: "SELECT zone, concat(svr_ip, ':', svr_port) observer, cpu_capacity, cpu_total, cpu_assigned, cpu_assigned_percent, mem_capacity, mem_total, mem_assigned, mem_assigned_percent, unit_Num, round(`load`, 2) `load`, round(cpu_weight, 2) cpu_weight, round(memory_weight, 2) mem_weight, leader_count FROM oceanbase.__all_virtual_server_stat ORDER BY zone,svr_ip;" + global: true + - type: sql + sql: "select * from oceanbase.__all_virtual_backup_task;" + global: true + - type: sql + sql: "select * from oceanbase.__all_virtual_backup_info;" + global: true + - type: sql + sql: "select * from oceanbase.__all_virtual_sys_task_status where comment like '%backup%';" + global: true + - type: sql + sql: "select count(*),status from oceanbase.__all_virtual_pg_backup_task group by status;" + global: true + - type: sql + sql: "select svr_ip, log_archive_status, count(*) from oceanbase.__all_virtual_pg_backup_log_archive_status group by svr_ip, log_archive_status;" + global: true + - type: sql + sql: "select * from oceanbase.__all_rootservice_event_history where gmt_create > ${from_time} and gmt_create < ${to_time} order by gmt_create desc;" + global: true + - type: sql + sql: "select b.* from 
oceanbase.__all_virtual_pg_backup_log_archive_status a,oceanbase.__all_virtual_pg_log_archive_stat b where a.table_id=b.table_id and a.partition_id=b.partition_id order by log_archive_cur_ts limit 5;" + global: true + - type: log + global: false + grep: "" + - type: sysstat + global: false + sysstat: "" + - version: "[4.0.0.0, *]" + steps: + - type: sql + sql: "show variables like 'version_comment';" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_ZONES;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_SERVERS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.GV$OB_SERVERS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_UNIT_CONFIGS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_RESOURCE_POOLS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_TENANTS;" + global: true + - type: sql + sql: "SELECT c.TENANT_ID, e.TENANT_NAME, concat(c.NAME, ': ', d.NAME) `pool:conf`,concat(c.UNIT_COUNT, ' unit: ', d.min_cpu, 'C/', ROUND(d.MEMORY_SIZE/1024/1024/1024,0), 'G') unit_info FROM oceanbase.DBA_OB_RESOURCE_POOLS c, oceanbase.DBA_OB_UNIT_CONFIGS d, oceanbase.DBA_OB_TENANTS e WHERE c.UNIT_CONFIG_ID=d.UNIT_CONFIG_ID AND c.TENANT_ID=e.TENANT_ID AND c.TENANT_ID>1000 ORDER BY c.TENANT_ID;" + global: true + - type: sql + sql: "SELECT a.TENANT_NAME,a.TENANT_ID,b.SVR_IP FROM oceanbase.DBA_OB_TENANTS a, oceanbase.GV$OB_UNITS b WHERE a.TENANT_ID=b.TENANT_ID;" + global: true + - type: sql + sql: "show parameters like '%backup%';" + global: true + - type: sql + sql: "show parameters like '%ha_low_thread_score%';" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.CDB_OB_BACKUP_PARAMETER" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.CDB_OB_BACKUP_JOBS limit 20;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_ROOTSERVICE_EVENT_HISTORY WHERE module='backup_data' AND event ='start_backup_data';" + global: true + - type: sql + sql: 
"SELECT * FROM oceanbase.CDB_OB_BACKUP_TASKS limit 20;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.__all_virtual_backup_schedule_task limit 20" + global: true + - type: sql + sql: "SELECT * from oceanbase.CDB_OB_BACKUP_JOB_HISTORY where STATUS = 'FAILED' limit 20;" + global: true + - type: log + global: false + grep: "" + - type: sysstat + global: false + sysstat: "" diff --git a/handler/gather/tasks/observer/backup_clean.yaml b/handler/gather/tasks/observer/backup_clean.yaml new file mode 100644 index 00000000..daa03ed3 --- /dev/null +++ b/handler/gather/tasks/observer/backup_clean.yaml @@ -0,0 +1,125 @@ +info_en: "[backup clean]" +info_cn: "[备份清理问题]" +command: obdiag gather scene run --scene=observer.backup_clean +task: + - version: "[2.0.0.0, 4.0.0.0]" + steps: + - type: sql + sql: "show variables like 'version_comment';" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.v$ob_cluster" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.__all_zone WHERE name='idc';" + global: true + - type: sql + sql: "SELECT id,svr_ip,svr_port,zone,inner_port,with_rootserver,status,start_service_time,last_offline_time FROM oceanbase.__all_server;" + global: true + - type: sql + sql: "SELECT zone, concat(svr_ip, ':', svr_port) observer, cpu_capacity, cpu_total, cpu_assigned, cpu_assigned_percent, mem_capacity, mem_total, mem_assigned, mem_assigned_percent, unit_Num, round(`load`, 2) `load`, round(cpu_weight, 2) cpu_weight, round(memory_weight, 2) mem_weight, leader_count FROM oceanbase.__all_virtual_server_stat ORDER BY zone,svr_ip;" + global: true + - type: sql + sql: "show parameters like '%backup_dest%';" + global: true + - type: sql + sql: "show parameters like '%auto_delete_expired_backup%';" + global: true + - type: sql + sql: "show parameters like '%backup_recovery_window%';" + global: true + - type: sql + sql: "select * from oceanbase.__all_tenant_backup_info;" + global: true + - type: sql + sql: "select * from 
oceanbase.__all_virtual_backup_clean_info;" + global: true + - type: sql + sql: "select * from oceanbase.__all_virtual_sys_task_status where comment like '%backup%';" + global: true + - type: sql + sql: "select * from oceanbase.CDB_OB_BACKUP_SET_DETAILS order by START_TIME asc limit 1;" + global: true + - type: sql + sql: "select * from oceanbase.__all_backup_task_clean_history where gmt_create > ${from_time} and gmt_create < ${to_time} order by gmt_create desc;" + global: true + - type: sql + sql: "select * from oceanbase.__all_backup_clean_info_history where gmt_create > ${from_time} and gmt_create < ${to_time} order by gmt_create desc;" + global: true + - type: sql + sql: "select * from oceanbase.__all_rootservice_event_history where module='backup_clean' and gmt_create > ${from_time} and gmt_create < ${to_time} order by gmt_create desc;" + global: true + - type: log + grep: "" + global: false + - type: sysstat + sysstat: "" + global: false + - version: "[4.0.0.0, *]" + steps: + - type: sql + sql: "show variables like 'version_comment';" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_ZONES;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_SERVERS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.GV$OB_SERVERS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_UNIT_CONFIGS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_RESOURCE_POOLS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_TENANTS;" + global: true + - type: sql + sql: "SELECT c.TENANT_ID, e.TENANT_NAME, concat(c.NAME, ': ', d.NAME) `pool:conf`,concat(c.UNIT_COUNT, ' unit: ', d.min_cpu, 'C/', ROUND(d.MEMORY_SIZE/1024/1024/1024,0), 'G') unit_info FROM oceanbase.DBA_OB_RESOURCE_POOLS c, oceanbase.DBA_OB_UNIT_CONFIGS d, oceanbase.DBA_OB_TENANTS e WHERE c.UNIT_CONFIG_ID=d.UNIT_CONFIG_ID AND c.TENANT_ID=e.TENANT_ID AND c.TENANT_ID>1000 ORDER BY c.TENANT_ID;" + global: true + - type: sql + sql: 
"SELECT a.TENANT_NAME,a.TENANT_ID,b.SVR_IP FROM oceanbase.DBA_OB_TENANTS a, oceanbase.GV$OB_UNITS b WHERE a.TENANT_ID=b.TENANT_ID;" + global: true + - type: sql + sql: "show parameters like '%backup%';" + global: true + - type: sql + sql: "select * from oceanbase.CDB_OB_BACKUP_JOB_HISTORY" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_ROOTSERVICE_EVENT_HISTORY WHERE module='backup_data' AND event ='start_backup_data';" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.CDB_OB_BACKUP_TASKS limit 20;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.__all_virtual_backup_schedule_task limit 20" + global: true + - type: sql + sql: "SELECT * from oceanbase.CDB_OB_BACKUP_JOB_HISTORY where STATUS = 'FAILED' limit 20;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.CDB_OB_BACKUP_DELETE_POLICY;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.CDB_OB_BACKUP_DELETE_JOBS limit 20" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.CDB_OB_BACKUP_DELETE_TASKS limit 20" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.CDB_OB_BACKUP_DELETE_TASK_HISTORY limit 20" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.CDB_OB_BACKUP_DELETE_JOB_HISTORY limit 20" + global: true + - type: log + global: false + grep: "" + - type: sysstat + global: false + sysstat: "" diff --git a/handler/gather/tasks/observer/clog_disk_full.yaml b/handler/gather/tasks/observer/clog_disk_full.yaml new file mode 100644 index 00000000..b2715dc9 --- /dev/null +++ b/handler/gather/tasks/observer/clog_disk_full.yaml @@ -0,0 +1,89 @@ +info_en: "[clog disk full]" +info_cn: "[clog盘满]" +command: obdiag gather scene run --scene=observer.clog_disk_full +task: + - version: "[3.0.0.0, 4.0.0.0]" + steps: + - type: sql + sql: "show variables like 'version_comment';" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.v$ob_cluster" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.__all_zone WHERE 
name='idc';" + global: true + - type: sql + sql: "SELECT id,svr_ip,svr_port,zone,inner_port,with_rootserver,status,start_service_time,last_offline_time FROM oceanbase.__all_server;" + global: true + - type: sql + sql: "SELECT zone, concat(svr_ip, ':', svr_port) observer, cpu_capacity, cpu_total, cpu_assigned, cpu_assigned_percent, mem_capacity, mem_total, mem_assigned, mem_assigned_percent, unit_Num, round(`load`, 2) `load`, round(cpu_weight, 2) cpu_weight, round(memory_weight, 2) mem_weight, leader_count FROM oceanbase.__all_virtual_server_stat ORDER BY zone,svr_ip;" + global: true + - type: sql + sql: "show parameters like '%clog_disk_usage_limit_percentage%';" + global: true + - type: sql + sql: "show parameters like '%clog_expire_days%';" + global: true + - type: sql + sql: "show parameters like '%backup_log_archive_option%';" + global: true + - type: sql + sql: "select * from oceanbase.__all_virtual_tenant_memstore_info where (active_memstore_used > major_freeze_trigger) or (total_memstore_used > memstore_limit);" + global: true + - type: sql + sql: "select svr_ip,total_size / 1024 / 1024 / 1024 total_G,free_size / 1024 / 1024 / 1024 free_G,(total_size - free_size) / 1024 / 1024 / 1024 used_G,(total_size - free_size) / total_size used_percentage FROM oceanbase.__all_virtual_disk_stat; " + global: true + - type: log + global: false + grep: "" + - version: "[4.0.0.0, *]" + steps: + - type: sql + sql: "show variables like 'version_comment';" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_ZONES;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_SERVERS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.GV$OB_SERVERS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_UNIT_CONFIGS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_RESOURCE_POOLS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_TENANTS;" + global: true + - type: sql + sql: "SELECT 
c.TENANT_ID, e.TENANT_NAME, concat(c.NAME, ': ', d.NAME) `pool:conf`,concat(c.UNIT_COUNT, ' unit: ', d.min_cpu, 'C/', ROUND(d.MEMORY_SIZE/1024/1024/1024,0), 'G') unit_info FROM oceanbase.DBA_OB_RESOURCE_POOLS c, oceanbase.DBA_OB_UNIT_CONFIGS d, oceanbase.DBA_OB_TENANTS e WHERE c.UNIT_CONFIG_ID=d.UNIT_CONFIG_ID AND c.TENANT_ID=e.TENANT_ID AND c.TENANT_ID>1000 ORDER BY c.TENANT_ID;" + global: true + - type: sql + sql: "SELECT a.TENANT_NAME,a.TENANT_ID,b.SVR_IP FROM oceanbase.DBA_OB_TENANTS a, oceanbase.GV$OB_UNITS b WHERE a.TENANT_ID=b.TENANT_ID;" + global: true + - type: sql + sql: "show parameters like '%clog%';" + global: true + - type: sql + sql: "show parameters like '%backup%';" + global: true + - type: sql + sql: "select tenant_id, svr_ip, svr_port, LOG_DISK_IN_USE/1024/1024/1024 LOG_DISK_IN_USE_G, LOG_DISK_SIZE/1024/1024/1024 LOG_DISK_SIZE_G, LOG_DISK_IN_USE*100/LOG_DISK_SIZE LOG_DISK_USED_PERCENTAGE from oceanbase.gv$ob_units;" + global: true + - type: sql + sql: "select TENANT_ID, LS_ID, SVR_IP, ROLE , (end_lsn-base_lsn)/1024/1024 from oceanbase.gv$ob_log_stat;" + global: true + - type: sql + sql: "(select value1, value2 from oceanbase.DBA_OB_ROOTSERVICE_EVENT_HISTORY where event like '%add_ls%') except (select value1, value2 from oceanbase.DBA_OB_SERVER_EVENT_HISTORY where module like 'storage_ha' and event like '%finish_complete%');" + global: true + - type: sql + sql: "select * from oceanbase.DBA_OB_SERVER_EVENT_HISTORY where event like '%migrat%' and name6 like '%fail%' and value6=1;" + global: true + - type: log + global: false + grep: "" diff --git a/handler/gather/tasks/observer/compaction.yaml b/handler/gather/tasks/observer/compaction.yaml new file mode 100644 index 00000000..a4e123a0 --- /dev/null +++ b/handler/gather/tasks/observer/compaction.yaml @@ -0,0 +1,134 @@ +info_en: "[compaction]" +info_cn: "[合并问题]" +command: obdiag gather scene run --scene=observer.compaction +task: + - version: "[2.0.0.0, 4.0.0.0]" + steps: + - type: sql + sql: "show 
variables like 'version_comment';" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.v$ob_cluster" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.__all_zone WHERE name='idc';" + global: true + - type: sql + sql: "SELECT id,svr_ip,svr_port,zone,inner_port,with_rootserver,status,start_service_time,last_offline_time FROM oceanbase.__all_server;" + global: true + - type: sql + sql: "SELECT zone, concat(svr_ip, ':', svr_port) observer, cpu_capacity, cpu_total, cpu_assigned, cpu_assigned_percent, mem_capacity, mem_total, mem_assigned, mem_assigned_percent, unit_Num, round(`load`, 2) `load`, round(cpu_weight, 2) cpu_weight, round(memory_weight, 2) mem_weight, leader_count FROM oceanbase.__all_virtual_server_stat ORDER BY zone,svr_ip;" + global: true + - type: sql + sql: "show parameters like '%enable_manual_merge%';" + global: true + - type: sql + sql: "show parameters like '%zone_merge_concurrency%';" + global: true + - type: sql + sql: "show parameters like '%zone_merge_order%';" + global: true + - type: sql + sql: "show parameters like '%enable_merge_by_turn%';" + global: true + - type: sql + sql: "show parameters like '%major_freeze_duty_time%';" + global: true + - type: sql + sql: "show parameters like '%enable_auto_leader_switch%';" + global: true + - type: sql + sql: "select * from oceanbase.__all_zone;" + global: true + - type: sql + sql: "select * from oceanbase.__all_virtual_replica_task;" + global: true + - type: sql + sql: "select * from oceanbase.__all_virtual_rebalance_task_stat;" + global: true + - type: sql + sql: "select * from oceanbase.__all_virtual_sys_task_status;" + global: true + - type: sql + sql: "select * from oceanbase.__all_virtual_partition_compaction_progress;" + global: true + - type: sql + sql: "select * from oceanbase.__all_freeze_schema_version where schema_version = -1;" + global: true + - type: sql + sql: "select tenant_id, table_id, partition_id from oceanbase.__all_virtual_partition_table group by 1,2,3 having 
min(role) = 2;" + global: true + - type: sql + sql: "SELECT count(*),svr_ip FROM oceanbase.__all_virtual_clog_stat WHERE is_in_sync = 0 AND is_offline = 0 AND replica_type != 16 group by svr_ip;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.__all_virtual_clog_stat WHERE is_in_sync = 0 AND is_offline = 0 AND replica_type != 16 limit 10;" + global: true + - type: sql + sql: "select * from oceanbase.__all_virtual_tenant_memstore_info where (active_memstore_used > major_freeze_trigger) or (total_memstore_used > memstore_limit);" + global: true + - type: sql + sql: "select * from oceanbase.__all_rootservice_event_history where module = 'daily_merge' and event like '%merge_error%' order by gmt_create desc limit 5;" + global: true + - type: sql + sql: "select * from oceanbase.__all_virtual_meta_table where data_version != (select value from oceanbase.__all_zone where name='global_broadcast_version') limit 10;" + global: true + - type: sql + sql: "SELECT count(1) FROM oceanbase.__all_virtual_trans_stat WHERE part_trans_action<=2 AND ctx_create_time < date_sub(now(), INTERVAL 600 SECOND) AND is_exiting != 1;" + global: true + - type: sql + sql: "SELECT count(1) FROM oceanbase.__all_virtual_trans_stat WHERE part_trans_action > 2 AND ctx_create_time < date_sub(now(), INTERVAL 600 SECOND) AND is_exiting != 1;" + global: true + - type: sql + sql: "(select table_id, partition_idx from oceanbase.__all_virtual_election_info group by table_id, partition_idx) except (select table_id, partition_idx from oceanbase.__all_virtual_election_info where role = 1)" + global: true + - type: sql + sql: "(select table_id, partition_idx from oceanbase.__all_virtual_clog_stat group by table_id, partition_idx) except (select table_id, partition_idx from oceanbase.__all_virtual_clog_stat where role = 'LEADER') ;" + global: true + - type: sql + sql: "SELECT svr_ip,total_size / 1024 / 1024 / 1024 total_G,free_size / 1024 / 1024 / 1024 free_G,(total_size - free_size) / 1024 / 1024 / 
1024 used_G,(total_size - free_size) / total_size used_percentage FROM oceanbase.__all_virtual_disk_stat;" + global: true + - type: log + grep: "" + global: false + - version: "[4.0.0.0, *]" + steps: + - type: sql + sql: "show variables like 'version_comment';" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_ZONES;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_SERVERS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.GV$OB_SERVERS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_UNIT_CONFIGS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_RESOURCE_POOLS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_TENANTS;" + global: true + - type: sql + sql: "SELECT c.TENANT_ID, e.TENANT_NAME, concat(c.NAME, ': ', d.NAME) `pool:conf`,concat(c.UNIT_COUNT, ' unit: ', d.min_cpu, 'C/', ROUND(d.MEMORY_SIZE/1024/1024/1024,0), 'G') unit_info FROM oceanbase.DBA_OB_RESOURCE_POOLS c, oceanbase.DBA_OB_UNIT_CONFIGS d, oceanbase.DBA_OB_TENANTS e WHERE c.UNIT_CONFIG_ID=d.UNIT_CONFIG_ID AND c.TENANT_ID=e.TENANT_ID AND c.TENANT_ID>1000 ORDER BY c.TENANT_ID;" + global: true + - type: sql + sql: "SELECT a.TENANT_NAME,a.TENANT_ID,b.SVR_IP FROM oceanbase.DBA_OB_TENANTS a, oceanbase.GV$OB_UNITS b WHERE a.TENANT_ID=b.TENANT_ID;" + global: true + - type: sql + sql: "show parameters like '%merge%';" + global: true + - type: sql + sql: "show parameters like 'memstore_limit_percentage';" + global: true + - type: sql + sql: "show parameters like 'freeze_trigger_percentage';" + global: true + - type: log + global: false + grep: "" diff --git a/handler/gather/tasks/observer/delay_of_primary_and_backup.yaml b/handler/gather/tasks/observer/delay_of_primary_and_backup.yaml new file mode 100644 index 00000000..ee2abcde --- /dev/null +++ b/handler/gather/tasks/observer/delay_of_primary_and_backup.yaml @@ -0,0 +1,143 @@ +info_en: "[delay of primary and backup]" +info_cn: "[主备库延迟]" +command: 
obdiag gather scene run --scene=observer.delay_of_primary_and_backup +task: + - version: "[2.0.0.0, 4.0.0.0]" + steps: + - type: sql + sql: "show variables like 'version_comment';" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.v$ob_cluster" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.__all_zone WHERE name='idc';" + global: true + - type: sql + sql: "SELECT id,svr_ip,svr_port,zone,inner_port,with_rootserver,status,start_service_time,last_offline_time FROM oceanbase.__all_server;" + global: true + - type: sql + sql: "SELECT zone, concat(svr_ip, ':', svr_port) observer, cpu_capacity, cpu_total, cpu_assigned, cpu_assigned_percent, mem_capacity, mem_total, mem_assigned, mem_assigned_percent, unit_Num, round(`load`, 2) `load`, round(cpu_weight, 2) cpu_weight, round(memory_weight, 2) mem_weight, leader_count FROM oceanbase.__all_virtual_server_stat ORDER BY zone,svr_ip;" + global: true + - type: sql + sql: "select * from oceanbase.__all_failover_info;" + global: true + - type: sql + sql: "select * from oceanbase.__all_freeze_schema_version ;" + global: true + - type: sql + sql: "select * from oceanbase.__all_virtual_sys_task_status;" + global: true + - type: sql + sql: "select * from oceanbase.__all_virtual_replica_task;" + global: true + - type: sql + sql: "select * from oceanbase.__all_virtual_rebalance_task_stat;" + global: true + - type: sql + sql: "select * from oceanbase.__all_unit where migrate_from_svr_ip !='';" + global: true + - type: sql + sql: "select * from oceanbase.__all_root_table where is_restore != 0;" + global: true + - type: sql + sql: "select * from oceanbase.__all_virtual_meta_table where is_restore != 0;" + global: true + - type: sql + sql: "select * from oceanbase.__all_core_table where table_name like '%schema_status%';" + global: true + - type: sql + sql: "SELECT TENANT_ID, COUNT(*) FROM oceanbase.__ALL_VIRTUAL_META_TABLE WHERE ROLE = 2 GROUP BY TENANT_ID;" + global: true + - type: sql + sql: "SELECT * FROM 
oceanbase.__all_virtual_clog_stat WHERE is_in_sync = 0 AND is_offline = 0 AND replica_type != 16 limit 10;" + global: true + - type: sql + sql: "select tenant_id, table_id, partition_id from oceanbase.__all_virtual_partition_table group by 1,2,3 having min(role) = 2;" + global: true + - type: sql + sql: "SELECT USEC_TO_TIME(CURRENT_SCN) AS CUR_PROCESS, NOW(6) - USEC_TO_TIME(CURRENT_SCN) AS DELAY FROM oceanbase.V$OB_CLUSTER;" + global: true + - type: sql + sql: "SELECT count(*),svr_ip FROM oceanbase.__all_virtual_clog_stat WHERE is_in_sync = 0 AND is_offline = 0 AND replica_type != 16 group by svr_ip;" + global: true + - type: sql + sql: "select * from oceanbase.__all_virtual_tenant_memstore_info where (active_memstore_used > major_freeze_trigger) or (total_memstore_used > memstore_limit);" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.__ALL_ROOTSERVICE_EVENT_HISTORY WHERE MODULE = 'BALANCER' AND EVENT LIKE '%ADD_REPLICA%' ORDER BY GMT_CREATE DESC LIMIT 100;" + global: true + - type: sql + sql: "SELECT count(1) FROM oceanbase.__all_virtual_trans_stat WHERE part_trans_action<=2 AND ctx_create_time < date_sub(now(), INTERVAL 600 SECOND) AND is_exiting != 1;" + global: true + - type: sql + sql: "SELECT count(1) FROM oceanbase.__all_virtual_trans_stat WHERE part_trans_action > 2 AND ctx_create_time < date_sub(now(), INTERVAL 600 SECOND) AND is_exiting != 1;" + global: true + - type: sql + sql: "(select table_id, partition_idx from oceanbase.__all_virtual_election_info group by table_id, partition_idx) except (select table_id, partition_idx from oceanbase.__all_virtual_election_info where role = 1)" + global: true + - type: sql + sql: "(select table_id, partition_idx from oceanbase.__all_virtual_clog_stat group by table_id, partition_idx) except (select table_id, partition_idx from oceanbase.__all_virtual_clog_stat where role = 'LEADER') ;" + global: true + - type: sql + sql: "SELECT svr_ip ,total_size / 1024 / 1024 / 1024 total_G,free_size / 1024 / 1024 / 
1024 free_G,(total_size - free_size) / 1024 / 1024 / 1024 used_G,(total_size - free_size) / total_size used_percentage FROM oceanbase.__all_virtual_disk_stat;" + global: true + - type: sql + sql: "select * from oceanbase.v$ob_cluster;" + global: true + - type: sql + sql: "SELECT TENANT_ID, COUNT(*) FROM oceanbase.__ALL_VIRTUAL_META_TABLE WHERE ROLE = 1 GROUP BY TENANT_ID;" + global: true + - type: sql + sql: "select tenant_id, table_id, partition_id from oceanbase.__all_virtual_partition_table group by 1,2,3 having min(role) = 2;" + global: true + - type: sql + sql: "(select table_id, partition_idx from oceanbase.__all_virtual_election_info group by table_id, partition_idx) except (select table_id, partition_idx from oceanbase.__all_virtual_election_info where role = 1) ;" + global: true + - type: sql + sql: "(select table_id, partition_idx from oceanbase.__all_virtual_clog_stat group by table_id, partition_idx) except (select table_id, partition_idx from oceanbase.__all_virtual_clog_stat where role = 'LEADER') ;" + global: true + - type: log + global: false + grep: "" + - version: "[4.0.0.0, *]" + steps: + - type: sql + sql: "show variables like 'version_comment';" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_ZONES;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_SERVERS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.GV$OB_SERVERS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_UNIT_CONFIGS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_RESOURCE_POOLS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_TENANTS;" + global: true + - type: sql + sql: "SELECT c.TENANT_ID, e.TENANT_NAME, concat(c.NAME, ': ', d.NAME) `pool:conf`,concat(c.UNIT_COUNT, ' unit: ', d.min_cpu, 'C/', ROUND(d.MEMORY_SIZE/1024/1024/1024,0), 'G') unit_info FROM oceanbase.DBA_OB_RESOURCE_POOLS c, oceanbase.DBA_OB_UNIT_CONFIGS d, oceanbase.DBA_OB_TENANTS e WHERE 
c.UNIT_CONFIG_ID=d.UNIT_CONFIG_ID AND c.TENANT_ID=e.TENANT_ID AND c.TENANT_ID>1000 ORDER BY c.TENANT_ID;" + global: true + - type: sql + sql: "SELECT a.TENANT_NAME,a.TENANT_ID,b.SVR_IP FROM oceanbase.DBA_OB_TENANTS a, oceanbase.GV$OB_UNITS b WHERE a.TENANT_ID=b.TENANT_ID;" + global: true + - type: sql + sql: "show parameters like '%backup%';" + global: true + - type: sql + sql: "SELECT TENANT_NAME, TENANT_ID, TENANT_ROLE, SCN_TO_TIMESTAMP(SYNC_SCN) FROM oceanbase.DBA_OB_TENANTS WHERE TENANT_NAME = 'standby_tenant';" + global: true + - type: sql + sql: "SELECT LS_ID, SCN_TO_TIMESTAMP(END_SCN) FROM oceanbase.GV$OB_LOG_STAT WHERE ROLE = 'LEADER';" + global: true + - type: log + global: false + grep: "" diff --git a/handler/gather/tasks/observer/log_archive.yaml b/handler/gather/tasks/observer/log_archive.yaml new file mode 100644 index 00000000..4cf66f0a --- /dev/null +++ b/handler/gather/tasks/observer/log_archive.yaml @@ -0,0 +1,110 @@ +info_en: "[log archive]" +info_cn: "[日志归档问题]" +command: obdiag gather scene run --scene=observer.log_archive +task: + - version: "[2.0.0.0, 4.0.0.0]" + steps: + - type: sql + sql: "show variables like 'version_comment';" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.v$ob_cluster" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.__all_zone WHERE name='idc';" + global: true + - type: sql + sql: "SELECT id,svr_ip,svr_port,zone,inner_port,with_rootserver,status,start_service_time,last_offline_time FROM oceanbase.__all_server;" + global: true + - type: sql + sql: "SELECT zone, concat(svr_ip, ':', svr_port) observer, cpu_capacity, cpu_total, cpu_assigned, cpu_assigned_percent, mem_capacity, mem_total, mem_assigned, mem_assigned_percent, unit_Num, round(`load`, 2) `load`, round(cpu_weight, 2) cpu_weight, round(memory_weight, 2) mem_weight, leader_count FROM oceanbase.__all_virtual_server_stat ORDER BY zone,svr_ip;" + global: true + - type: sql + sql: "select * from oceanbase.CDB_OB_BACKUP_ARCHIVELOG;" + global: 
true + - type: sql + sql: "select * from oceanbase.__all_virtual_sys_task_status where comment like '%backup%';" + global: true + - type: sql + sql: "select * from oceanbase.__all_virtual_pg_backup_log_archive_status order by log_archive_cur_ts limit 10;" + global: true + - type: sql + sql: "select svr_ip, log_archive_status, count(*) from oceanbase.__all_virtual_pg_backup_log_archive_status group by svr_ip, log_archive_status;" + global: true + - type: sql + sql: "select tenant_id, table_id, partition_id from oceanbase.__all_virtual_partition_table group by 1,2,3 having min(role) = 2;" + global: true + - type: sql + sql: "select count(*) ,DATE_FORMAT(gmt_create, '%Y-%c-%d') as date from oceanbase.__all_virtual_ddl_operation where ddl_stmt_str !='' group by date order by date limit 10;" + global: true + - type: sql + sql: "select * from oceanbase.__all_rootservice_event_history where gmt_create > ${from_time} and gmt_create < ${to_time} order by gmt_create desc;" + global: true + - type: sql + sql: "(select table_id, partition_idx from oceanbase.__all_virtual_election_info group by table_id, partition_idx) except (select table_id, partition_idx from oceanbase.__all_virtual_election_info where role = 1);" + global: true + - type: sql + sql: "(select table_id, partition_idx from oceanbase.__all_virtual_clog_stat group by table_id, partition_idx) except (select table_id, partition_idx from oceanbase.__all_virtual_clog_stat where role = 'LEADER');" + global: true + - type: sql + sql: "select b.* from oceanbase.__all_virtual_pg_backup_log_archive_status a,oceanbase.__all_virtual_pg_log_archive_stat b where a.table_id=b.table_id and a.partition_id=b.partition_id order by log_archive_cur_ts limit 5;" + global: true + - type: log + global: false + grep: "" + - type: sysstat + sysstat: "" + global: false + - version: "[4.0.0.0, *]" + steps: + - type: sql + sql: "show variables like 'version_comment';" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_ZONES;" + 
global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_SERVERS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.GV$OB_SERVERS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_UNIT_CONFIGS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_RESOURCE_POOLS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_TENANTS;" + global: true + - type: sql + sql: "SELECT c.TENANT_ID, e.TENANT_NAME, concat(c.NAME, ': ', d.NAME) `pool:conf`,concat(c.UNIT_COUNT, ' unit: ', d.min_cpu, 'C/', ROUND(d.MEMORY_SIZE/1024/1024/1024,0), 'G') unit_info FROM oceanbase.DBA_OB_RESOURCE_POOLS c, oceanbase.DBA_OB_UNIT_CONFIGS d, oceanbase.DBA_OB_TENANTS e WHERE c.UNIT_CONFIG_ID=d.UNIT_CONFIG_ID AND c.TENANT_ID=e.TENANT_ID AND c.TENANT_ID>1000 ORDER BY c.TENANT_ID;" + global: true + - type: sql + sql: "SELECT a.TENANT_NAME,a.TENANT_ID,b.SVR_IP FROM oceanbase.DBA_OB_TENANTS a, oceanbase.GV$OB_UNITS b WHERE a.TENANT_ID=b.TENANT_ID;" + global: true + - type: sql + sql: "show parameters like '%backup%';" + global: true + - type: sql + sql: "SHOW PARAMETERS LIKE 'log_archive_concurrency';" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.CDB_OB_ARCHIVE_DEST;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.CDB_OB_ARCHIVELOG_SUMMARY limit 20" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.CDB_OB_ARCHIVELOG limit 20" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.CDB_OB_ARCHIVELOG_PIECE_FILES limit 20" + global: true + - type: log + global: false + grep: "" + - type: sysstat + sysstat: "" + global: false diff --git a/handler/gather/tasks/observer/long_transaction.yaml b/handler/gather/tasks/observer/long_transaction.yaml new file mode 100644 index 00000000..70b83a9e --- /dev/null +++ b/handler/gather/tasks/observer/long_transaction.yaml @@ -0,0 +1,65 @@ +info_en: "[long transaction]" +info_cn: "[长事务]" +command: obdiag gather scene run --scene=observer.long_transaction 
+task: + - version: "[2.0.0.0, 4.0.0.0]" + steps: + - type: sql + sql: "show variables like 'version_comment';" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.v$ob_cluster" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.__all_zone WHERE name='idc';" + global: true + - type: sql + sql: "SELECT id,svr_ip,svr_port,zone,inner_port,with_rootserver,status,start_service_time,last_offline_time FROM oceanbase.__all_server;" + global: true + - type: sql + sql: "SELECT zone, concat(svr_ip, ':', svr_port) observer, cpu_capacity, cpu_total, cpu_assigned, cpu_assigned_percent, mem_capacity, mem_total, mem_assigned, mem_assigned_percent, unit_Num, round(`load`, 2) `load`, round(cpu_weight, 2) cpu_weight, round(memory_weight, 2) mem_weight, leader_count FROM oceanbase.__all_virtual_server_stat ORDER BY zone,svr_ip;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.__all_virtual_trans_stat WHERE part_trans_action<=2 AND ctx_create_time < date_sub(now(), INTERVAL 600 SECOND) AND is_exiting != 1;" + global: true + - type: log + grep: "" + global: false + - version: "[4.0.0.0, *]" + steps: + - type: sql + sql: "show variables like 'version_comment';" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_ZONES;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_SERVERS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.GV$OB_SERVERS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_UNIT_CONFIGS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_RESOURCE_POOLS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_TENANTS;" + global: true + - type: sql + sql: "SELECT c.TENANT_ID, e.TENANT_NAME, concat(c.NAME, ': ', d.NAME) `pool:conf`,concat(c.UNIT_COUNT, ' unit: ', d.min_cpu, 'C/', ROUND(d.MEMORY_SIZE/1024/1024/1024,0), 'G') unit_info FROM oceanbase.DBA_OB_RESOURCE_POOLS c, oceanbase.DBA_OB_UNIT_CONFIGS d, oceanbase.DBA_OB_TENANTS e WHERE 
c.UNIT_CONFIG_ID=d.UNIT_CONFIG_ID AND c.TENANT_ID=e.TENANT_ID AND c.TENANT_ID>1000 ORDER BY c.TENANT_ID;" + global: true + - type: sql + sql: "SELECT a.TENANT_NAME,a.TENANT_ID,b.SVR_IP FROM oceanbase.DBA_OB_TENANTS a, oceanbase.GV$OB_UNITS b WHERE a.TENANT_ID=b.TENANT_ID;" + global: true + - type: sql + sql: "select * from oceanbase.gv$ob_transaction_participants limit 100" + global: true + - type: sql + sql: "SELECT count(1) FROM oceanbase.GV$OB_TRANSACTION_PARTICIPANTS WHERE CTX_CREATE_TIME < date_sub(now(), INTERVAL 600 SECOND) AND STATE = 'INIT';" + global: true + - type: log + grep: "" + global: false diff --git a/handler/gather/tasks/observer/memory.yaml b/handler/gather/tasks/observer/memory.yaml new file mode 100644 index 00000000..69cb752c --- /dev/null +++ b/handler/gather/tasks/observer/memory.yaml @@ -0,0 +1,77 @@ +info_en: "[memory problem]" +info_cn: "[内存问题]" +command: obdiag gather scene run --scene=observer.memory +task: + - version: "[2.0.0.0, 4.0.0.0]" + steps: + - type: sql + sql: "show variables like 'version_comment';" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.v$ob_cluster" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.__all_zone WHERE name='idc';" + global: true + - type: sql + sql: "SELECT id,svr_ip,svr_port,zone,inner_port,with_rootserver,status,start_service_time,last_offline_time FROM oceanbase.__all_server;" + global: true + - type: sql + sql: "SELECT zone, concat(svr_ip, ':', svr_port) observer, cpu_capacity, cpu_total, cpu_assigned, cpu_assigned_percent, mem_capacity, mem_total, mem_assigned, mem_assigned_percent, unit_Num, round(`load`, 2) `load`, round(cpu_weight, 2) cpu_weight, round(memory_weight, 2) mem_weight, leader_count FROM oceanbase.__all_virtual_server_stat ORDER BY zone,svr_ip;" + global: true + - type: sql + sql: "select * from oceanbase.__all_virtual_tenant_memstore_info where (active_memstore_used > major_freeze_trigger) or (total_memstore_used > memstore_limit);" + global: true + - 
type: sql + sql: "SELECT * FROM oceanbase.__all_virtual_trans_stat WHERE part_trans_action<=2 AND ctx_create_time < date_sub(now(), INTERVAL 600 SECOND) AND is_exiting != 1;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.__all_virtual_trans_stat WHERE part_trans_action > 2 AND ctx_create_time < date_sub(now(), INTERVAL 600 SECOND) AND is_exiting != 1;" + global: true + - type: sql + sql: "SELECT table_id, partition_id, base_version, snapshot_version FROM oceanbase.__all_virtual_table_mgr WHERE table_type=0 except SELECT table_id, partition_idx, base_version, snapshot_version FROM oceanbase.__all_virtual_memstore_info limit 10;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.__all_virtual_tenant_memstore_allocator_info a,(select svr_ip,tenant_id from oceanbase.__all_virtual_tenant_memstore_info where (active_memstore_used > major_freeze_trigger)) b where a.svr_ip=b.svr_ip and a.tenant_id=b.tenant_id AND a.mt_is_frozen=1 ORDER BY mt_protection_clock limit 20;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.__all_virtual_tenant_memstore_allocator_info a,(select svr_ip,tenant_id from oceanbase.__all_virtual_tenant_memstore_info where (total_memstore_used > memstore_limit)) b where a.svr_ip=b.svr_ip and a.tenant_id=b.tenant_id AND a.mt_is_frozen=0 ORDER BY mt_protection_clock limit 20;" + global: true + - type: log + grep: "" + global: false + - version: "[4.0.0.0, *]" + steps: + - type: sql + sql: "show variables like 'version_comment';" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_ZONES;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_SERVERS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.GV$OB_SERVERS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_UNIT_CONFIGS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_RESOURCE_POOLS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_TENANTS;" + global: true + - 
type: sql + sql: "SELECT c.TENANT_ID, e.TENANT_NAME, concat(c.NAME, ': ', d.NAME) `pool:conf`,concat(c.UNIT_COUNT, ' unit: ', d.min_cpu, 'C/', ROUND(d.MEMORY_SIZE/1024/1024/1024,0), 'G') unit_info FROM oceanbase.DBA_OB_RESOURCE_POOLS c, oceanbase.DBA_OB_UNIT_CONFIGS d, oceanbase.DBA_OB_TENANTS e WHERE c.UNIT_CONFIG_ID=d.UNIT_CONFIG_ID AND c.TENANT_ID=e.TENANT_ID AND c.TENANT_ID>1000 ORDER BY c.TENANT_ID;" + global: true + - type: sql + sql: "SELECT a.TENANT_NAME,a.TENANT_ID,b.SVR_IP FROM oceanbase.DBA_OB_TENANTS a, oceanbase.GV$OB_UNITS b WHERE a.TENANT_ID=b.TENANT_ID;" + global: true + - type: sql + sql: "select * from oceanbase.GV$OB_MEMSTORE limit 5" + global: true + - type: log + grep: "" + global: false diff --git a/handler/gather/tasks/observer/recovery.yaml b/handler/gather/tasks/observer/recovery.yaml new file mode 100644 index 00000000..0f4802b3 --- /dev/null +++ b/handler/gather/tasks/observer/recovery.yaml @@ -0,0 +1,79 @@ +info_en: "[recovery]" +info_cn: "[数据恢复问题]" +command: obdiag gather scene run --scene=observer.recovery +task: + - version: "[2.0.0.0, 4.0.0.0]" + steps: + - type: sql + sql: "show variables like 'version_comment';" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.v$ob_cluster" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.__all_zone WHERE name='idc';" + global: true + - type: sql + sql: "SELECT id,svr_ip,svr_port,zone,inner_port,with_rootserver,status,start_service_time,last_offline_time FROM oceanbase.__all_server;" + global: true + - type: sql + sql: "SELECT zone, concat(svr_ip, ':', svr_port) observer, cpu_capacity, cpu_total, cpu_assigned, cpu_assigned_percent, mem_capacity, mem_total, mem_assigned, mem_assigned_percent, unit_Num, round(`load`, 2) `load`, round(cpu_weight, 2) cpu_weight, round(memory_weight, 2) mem_weight, leader_count FROM oceanbase.__all_virtual_server_stat ORDER BY zone,svr_ip;" + global: true + - type: sql + sql: "select * from oceanbase.__all_restore_info;" + global: true + - type: 
sql + sql: "select * from oceanbase.__all_rootservice_event_history where gmt_create > ${from_time} and gmt_create < ${to_time} order by gmt_create desc;" + global: true + - type: sql + sql: "select svr_ip,role, is_restore, count(*) from oceanbase.__all_root_table as a, (select value from oceanbase.__all_restore_info where name='tenant_id') as b where a.tenant_id=b.value group by role, is_restore, svr_ip order by svr_ip, is_restore;" + global: true + - type: sql + sql: "select svr_ip,role, is_restore, count(*) from oceanbase.__all_virtual_meta_table as a, (select value from oceanbase.__all_restore_info where name='tenant_id') as b where a.tenant_id=b.value group by role, is_restore, svr_ip order by svr_ip, is_restore;" + global: true + - type: log + grep: "" + global: false + - type: sysstat + sysstat: "" + - version: "[4.0.0.0, *]" + steps: + - type: sql + sql: "show variables like 'version_comment';" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_ZONES;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_SERVERS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.GV$OB_SERVERS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_UNIT_CONFIGS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_RESOURCE_POOLS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_TENANTS;" + global: true + - type: sql + sql: "SELECT c.TENANT_ID, e.TENANT_NAME, concat(c.NAME, ': ', d.NAME) `pool:conf`,concat(c.UNIT_COUNT, ' unit: ', d.min_cpu, 'C/', ROUND(d.MEMORY_SIZE/1024/1024/1024,0), 'G') unit_info FROM oceanbase.DBA_OB_RESOURCE_POOLS c, oceanbase.DBA_OB_UNIT_CONFIGS d, oceanbase.DBA_OB_TENANTS e WHERE c.UNIT_CONFIG_ID=d.UNIT_CONFIG_ID AND c.TENANT_ID=e.TENANT_ID AND c.TENANT_ID>1000 ORDER BY c.TENANT_ID;" + global: true + - type: sql + sql: "SELECT a.TENANT_NAME,a.TENANT_ID,b.SVR_IP FROM oceanbase.DBA_OB_TENANTS a, oceanbase.GV$OB_UNITS b WHERE a.TENANT_ID=b.TENANT_ID;" + global: 
true + - type: sql + sql: "SELECT * FROM oceanbase.CDB_OB_RESTORE_PROGRESS limit 20;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.CDB_OB_RESTORE_HISTORY limit 20;" + global: true + - type: log + grep: "" + global: false + - type: sysstat + sysstat: "" + global: false diff --git a/handler/gather/tasks/observer/restart.yaml b/handler/gather/tasks/observer/restart.yaml new file mode 100644 index 00000000..56195780 --- /dev/null +++ b/handler/gather/tasks/observer/restart.yaml @@ -0,0 +1,74 @@ +info_en: "[restart]" +info_cn: "[observer无故重启]" +command: obdiag gather scene run --scene=observer.restart +task: + - version: "[2.0.0.0, 4.0.0.0]" + steps: + - type: sql + sql: "show variables like 'version_comment';" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.v$ob_cluster" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.__all_zone WHERE name='idc';" + global: true + - type: sql + sql: "SELECT id,svr_ip,svr_port,zone,inner_port,with_rootserver,status,start_service_time,last_offline_time FROM oceanbase.__all_server;" + global: true + - type: sql + sql: "SELECT zone, concat(svr_ip, ':', svr_port) observer, cpu_capacity, cpu_total, cpu_assigned, cpu_assigned_percent, mem_capacity, mem_total, mem_assigned, mem_assigned_percent, unit_Num, round(`load`, 2) `load`, round(cpu_weight, 2) cpu_weight, round(memory_weight, 2) mem_weight, leader_count FROM oceanbase.__all_virtual_server_stat ORDER BY zone,svr_ip;" + global: true + - type: ssh + ssh: "ps -ef | grep observer" + global: false + - type: ssh + ssh: "cat /proc/sys/kernel/core_pattern" + global: false + - type: ssh + ssh: "ls -lhrt ${observer_data_dir}" + global: false + - type: log + grep: "" + global: false + - version: "[4.0.0.0, *]" + steps: + - type: sql + sql: "show variables like 'version_comment';" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_ZONES;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_SERVERS;" + global: true + - type: 
sql + sql: "SELECT * FROM oceanbase.GV$OB_SERVERS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_UNIT_CONFIGS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_RESOURCE_POOLS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_TENANTS;" + global: true + - type: sql + sql: "SELECT c.TENANT_ID, e.TENANT_NAME, concat(c.NAME, ': ', d.NAME) `pool:conf`,concat(c.UNIT_COUNT, ' unit: ', d.min_cpu, 'C/', ROUND(d.MEMORY_SIZE/1024/1024/1024,0), 'G') unit_info FROM oceanbase.DBA_OB_RESOURCE_POOLS c, oceanbase.DBA_OB_UNIT_CONFIGS d, oceanbase.DBA_OB_TENANTS e WHERE c.UNIT_CONFIG_ID=d.UNIT_CONFIG_ID AND c.TENANT_ID=e.TENANT_ID AND c.TENANT_ID>1000 ORDER BY c.TENANT_ID;" + global: true + - type: sql + sql: "SELECT a.TENANT_NAME,a.TENANT_ID,b.SVR_IP FROM oceanbase.DBA_OB_TENANTS a, oceanbase.GV$OB_UNITS b WHERE a.TENANT_ID=b.TENANT_ID;" + global: true + - type: ssh + ssh: "ps -ef | grep observer" + global: false + - type: ssh + ssh: "cat /proc/sys/kernel/core_pattern" + global: false + - type: ssh + ssh: "ls -lhrt ${observer_data_dir}" + global: false + - type: log + grep: "" + global: false diff --git a/handler/gather/tasks/observer/rootservice_switch.yaml b/handler/gather/tasks/observer/rootservice_switch.yaml new file mode 100644 index 00000000..6f81da3e --- /dev/null +++ b/handler/gather/tasks/observer/rootservice_switch.yaml @@ -0,0 +1,122 @@ +info_en: "[rootservice switch]" +info_cn: "[有主改选或者无主选举的切主]" +command: obdiag gather scene run --scene=observer.rootservice_switch +task: + - version: "[2.0.0.0, 4.0.0.0]" + steps: + - type: sql + sql: "show variables like 'version_comment';" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.v$ob_cluster" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.__all_zone WHERE name='idc';" + global: true + - type: sql + sql: "SELECT id,svr_ip,svr_port,zone,inner_port,with_rootserver,status,start_service_time,last_offline_time FROM oceanbase.__all_server;" + 
global: true + - type: sql + sql: "SELECT zone, concat(svr_ip, ':', svr_port) observer, cpu_capacity, cpu_total, cpu_assigned, cpu_assigned_percent, mem_capacity, mem_total, mem_assigned, mem_assigned_percent, unit_Num, round(`load`, 2) `load`, round(cpu_weight, 2) cpu_weight, round(memory_weight, 2) mem_weight, leader_count FROM oceanbase.__all_virtual_server_stat ORDER BY zone,svr_ip;" + global: true + - type: sql + sql: "show parameters like 'enable_auto_leader_switch';" + global: true + - type: sql + sql: "show parameters like 'enable_merge_by_turn';" + global: true + - type: sql + sql: "select count(*) from oceanbase.__all_virtual_database where primary_zone != '' group by tenant_id;" + global: true + - type: sql + sql: "select count(*) from oceanbase.__all_virtual_table where primary_zone != '' group by tenant_id;" + global: true + - type: sql + sql: "select count(*) from oceanbase.__all_virtual_tablegroup where primary_zone != '' group by tenant_id;" + global: true + - type: sql + sql: "select tenant_id, table_id, partition_id from oceanbase.__all_virtual_partition_table group by 1,2,3 having min(role) = 2;" + global: true + - type: sql + sql: "select count(*) from oceanbase.__all_virtual_rebalance_task_stat where task_type in ('ADD_REPLICA', 'MIGRATE_REPLICA', 'TYPE_TRANSFORM');" + global: true + - type: sql + sql: "select count(*) from oceanbase.__all_virtual_replica_task where cmd_type in ('ADD_REPLICA', 'MIGRATE_REPLICA', 'TYPE_TRANSFORM'); " + global: true + - type: sql + sql: "select * from oceanbase.__all_rootservice_event_history where gmt_create > ${from_time} and gmt_create < ${to_time} order by gmt_create desc;" + global: true + - type: sql + sql: "select * from oceanbase.__all_virtual_election_event_history where gmt_create > ${from_time} and gmt_create < ${to_time} order by gmt_create desc;" + global: true + - type: sql + sql: "(select table_id, partition_idx from oceanbase.__all_virtual_election_info group by table_id, partition_idx) except 
(select table_id, partition_idx from oceanbase.__all_virtual_election_info where role = 1) ;" + global: true + - type: sql + sql: "(select table_id, partition_idx from oceanbase.__all_virtual_clog_stat group by table_id, partition_idx) except (select table_id, partition_idx from oceanbase.__all_virtual_clog_stat where role = 'LEADER') ;" + global: true + - type: log + grep: "" + global: false + - type: sysstat + sysstat: "" + global: false + - version: "[4.0.0.0, *]" + steps: + - type: sql + sql: "show variables like 'version_comment';" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_ZONES;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_SERVERS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.GV$OB_SERVERS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_UNIT_CONFIGS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_RESOURCE_POOLS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_TENANTS;" + global: true + - type: sql + sql: "SELECT c.TENANT_ID, e.TENANT_NAME, concat(c.NAME, ': ', d.NAME) `pool:conf`,concat(c.UNIT_COUNT, ' unit: ', d.min_cpu, 'C/', ROUND(d.MEMORY_SIZE/1024/1024/1024,0), 'G') unit_info FROM oceanbase.DBA_OB_RESOURCE_POOLS c, oceanbase.DBA_OB_UNIT_CONFIGS d, oceanbase.DBA_OB_TENANTS e WHERE c.UNIT_CONFIG_ID=d.UNIT_CONFIG_ID AND c.TENANT_ID=e.TENANT_ID AND c.TENANT_ID>1000 ORDER BY c.TENANT_ID;" + global: true + - type: sql + sql: "SELECT a.TENANT_NAME,a.TENANT_ID,b.SVR_IP FROM oceanbase.DBA_OB_TENANTS a, oceanbase.GV$OB_UNITS b WHERE a.TENANT_ID=b.TENANT_ID;" + global: true + - type: sql + sql: "show parameters like '%switch%';" + global: true + - type: sql + sql: "select TIMESTAMP,MODULE,EVENT,VALUE1 tenant_id,VALUE2 ls_id,NAME3,VALUE3,NAME4,VALUE4,NAME5,VALUE5,NAME6,VALUE6 from oceanbase.DBA_OB_ROOTSERVICE_EVENT_HISTORY where module like '%disaster%' limit 20;" + global: true + - type: sql + sql: "(select value1, value2 from 
oceanbase.DBA_OB_ROOTSERVICE_EVENT_HISTORY where event like '%add_ls%') except (select value1, value2 from oceanbase.DBA_OB_SERVER_EVENT_HISTORY where module like 'storage_ha' and event like '%finish_complete%');" + global: true + - type: sql + sql: "select * from oceanbase.DBA_OB_SERVER_EVENT_HISTORY where event like '%migrat%' and name6 like '%fail%' and value6=1;" + global: true + - type: sql + sql: "select * from oceanbase.DBA_OB_SERVER_EVENT_HISTORY where module='FAILURE_DETECTOR' limit 10;" + global: true + - type: sql + sql: "select * from oceanbase.DBA_OB_SERVER_EVENT_HISTORY where module like '%ELECTION%' limit 10;" + global: true + - type: sql + sql: "select * from oceanbase.GV$OB_LOG_STAT where role='LEADER' limit 20;" + global: true + - type: sql + sql: "SELECT TENANT_NAME, TENANT_ID, TENANT_ROLE, STATUS, SWITCHOVER_STATUS FROM oceanbase.DBA_OB_TENANTS" + global: true + - type: log + grep: "" + global: false + - type: sysstat + sysstat: "" + global: false diff --git a/handler/gather/tasks/observer/suspend_transaction.yaml b/handler/gather/tasks/observer/suspend_transaction.yaml new file mode 100644 index 00000000..c8b09863 --- /dev/null +++ b/handler/gather/tasks/observer/suspend_transaction.yaml @@ -0,0 +1,62 @@ +info_en: "[suspend transaction]" +info_cn: "[悬挂事务]" +command: obdiag gather scene run --scene=observer.suspend_transaction +task: + - version: "[2.0.0.0, 4.0.0.0]" + steps: + - type: sql + sql: "show variables like 'version_comment';" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.v$ob_cluster" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.__all_zone WHERE name='idc';" + global: true + - type: sql + sql: "SELECT id,svr_ip,svr_port,zone,inner_port,with_rootserver,status,start_service_time,last_offline_time FROM oceanbase.__all_server;" + global: true + - type: sql + sql: "SELECT zone, concat(svr_ip, ':', svr_port) observer, cpu_capacity, cpu_total, cpu_assigned, cpu_assigned_percent, mem_capacity, mem_total, 
mem_assigned, mem_assigned_percent, unit_Num, round(`load`, 2) `load`, round(cpu_weight, 2) cpu_weight, round(memory_weight, 2) mem_weight, leader_count FROM oceanbase.__all_virtual_server_stat ORDER BY zone,svr_ip;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.__all_virtual_trans_stat WHERE part_trans_action > 2 AND ctx_create_time < date_sub(now(), INTERVAL 600 SECOND) AND is_exiting != 1;" + global: true + - type: log + grep: "" + global: false + - version: "[4.0.0.0, *]" + steps: + - type: sql + sql: "show variables like 'version_comment';" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_ZONES;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_SERVERS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.GV$OB_SERVERS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_UNIT_CONFIGS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_RESOURCE_POOLS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_TENANTS;" + global: true + - type: sql + sql: "SELECT c.TENANT_ID, e.TENANT_NAME, concat(c.NAME, ': ', d.NAME) `pool:conf`,concat(c.UNIT_COUNT, ' unit: ', d.min_cpu, 'C/', ROUND(d.MEMORY_SIZE/1024/1024/1024,0), 'G') unit_info FROM oceanbase.DBA_OB_RESOURCE_POOLS c, oceanbase.DBA_OB_UNIT_CONFIGS d, oceanbase.DBA_OB_TENANTS e WHERE c.UNIT_CONFIG_ID=d.UNIT_CONFIG_ID AND c.TENANT_ID=e.TENANT_ID AND c.TENANT_ID>1000 ORDER BY c.TENANT_ID;" + global: true + - type: sql + sql: "SELECT a.TENANT_NAME,a.TENANT_ID,b.SVR_IP FROM oceanbase.DBA_OB_TENANTS a, oceanbase.GV$OB_UNITS b WHERE a.TENANT_ID=b.TENANT_ID;" + global: true + - type: sql + sql: "SELECT count(1) FROM oceanbase.GV$OB_TRANSACTION_PARTICIPANTS WHERE CTX_CREATE_TIME < date_sub(now(), INTERVAL 600 SECOND) AND (STATE = 'PREPARE' OR STATE = 'REDO COMPLETE' OR STATE ='PRECOMMIT');" + global: true + - type: log + grep: "" + global: false diff --git a/handler/gather/tasks/observer/unit_data_imbalance.yaml 
b/handler/gather/tasks/observer/unit_data_imbalance.yaml new file mode 100644 index 00000000..40923da1 --- /dev/null +++ b/handler/gather/tasks/observer/unit_data_imbalance.yaml @@ -0,0 +1,137 @@ +info_en: "[unit data imbalance]" +info_cn: "[unit迁移/缩小 副本不均衡问题]" +command: obdiag gather scene run --scene=observer.unit_data_imbalance +task: + - version: "[2.0.0.0, 4.0.0.0]" + steps: + - type: sql + sql: "show variables like 'version_comment';" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.v$ob_cluster" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.__all_zone WHERE name='idc';" + global: true + - type: sql + sql: "SELECT id,svr_ip,svr_port,zone,inner_port,with_rootserver,status,start_service_time,last_offline_time FROM oceanbase.__all_server;" + global: true + - type: sql + sql: "SELECT zone, concat(svr_ip, ':', svr_port) observer, cpu_capacity, cpu_total, cpu_assigned, cpu_assigned_percent, mem_capacity, mem_total, mem_assigned, mem_assigned_percent, unit_Num, round(`load`, 2) `load`, round(cpu_weight, 2) cpu_weight, round(memory_weight, 2) mem_weight, leader_count FROM oceanbase.__all_virtual_server_stat ORDER BY zone,svr_ip;" + global: true + - type: sql + sql: "show parameters like '%data_disk_usage_limit_percentage%';" + global: true + - type: sql + sql: "show parameters like '%migration_disable_time%';" + global: true + - type: sql + sql: "show parameters like '%sys_bkgd_net_percentage%';" + global: true + - type: sql + sql: "show parameters like '%balancer_idle_time%';" + global: true + - type: sql + sql: "show parameters like '%server_data_copy_in_concurrency%';" + global: true + - type: sql + sql: "show parameters like '%server_data_copy_out_concurrency%';" + global: true + - type: sql + sql: "show parameters like '%data_copy_concurrency%';" + global: true + - type: sql + sql: "show parameters like '%server_permanent_offline_time%';" + global: true + - type: sql + sql: "show parameters like '%migrate_concurrency%';" + global: true 
+ - type: sql + sql: "show parameters like '%enable_rebalance%';" + global: true + - type: sql + sql: "show parameters like '%enable_rereplication%';" + global: true + - type: sql + sql: "show parameters like '%enable_auto_leader_switch%';" + global: true + - type: sql + sql: "select * from oceanbase.__all_unit;" + global: true + - type: sql + sql: "select * from oceanbase.__all_virtual_replica_task;" + global: true + - type: sql + sql: "select * from oceanbase.__all_virtual_rebalance_task_stat;" + global: true + - type: sql + sql: "select * from oceanbase.__all_virtual_sys_task_status;" + global: true + - type: sql + sql: "select * from oceanbase.__all_virtual_partition_migration_status;" + global: true + - type: sql + sql: "select * from oceanbase.__all_rootservice_job where gmt_create > ${from_time} and gmt_create < ${to_time} order by gmt_create desc;" + global: true + - type: sql + sql: "select * from oceanbase.__all_rootservice_event_history where gmt_create > ${from_time} and gmt_create < ${to_time} order by gmt_create desc;" + global: true + - type: sql + sql: "select svr_ip,total_size / 1024 / 1024 / 1024 total_G,free_size / 1024 / 1024 / 1024 free_G,(total_size - free_size) / 1024 / 1024 / 1024 used_G,(total_size - free_size) / total_size used_percentage FROM oceanbase.__all_virtual_disk_stat;" + global: true + - type: log + grep: "" + global: false + - version: "[4.0.0.0, *]" + steps: + - type: sql + sql: "show variables like 'version_comment';" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_ZONES;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_SERVERS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.GV$OB_SERVERS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_UNIT_CONFIGS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_RESOURCE_POOLS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_TENANTS;" + global: true + - type: sql + sql: 
"SELECT c.TENANT_ID, e.TENANT_NAME, concat(c.NAME, ': ', d.NAME) `pool:conf`,concat(c.UNIT_COUNT, ' unit: ', d.min_cpu, 'C/', ROUND(d.MEMORY_SIZE/1024/1024/1024,0), 'G') unit_info FROM oceanbase.DBA_OB_RESOURCE_POOLS c, oceanbase.DBA_OB_UNIT_CONFIGS d, oceanbase.DBA_OB_TENANTS e WHERE c.UNIT_CONFIG_ID=d.UNIT_CONFIG_ID AND c.TENANT_ID=e.TENANT_ID AND c.TENANT_ID>1000 ORDER BY c.TENANT_ID;" + global: true + - type: sql + sql: "SELECT a.TENANT_NAME,a.TENANT_ID,b.SVR_IP FROM oceanbase.DBA_OB_TENANTS a, oceanbase.GV$OB_UNITS b WHERE a.TENANT_ID=b.TENANT_ID;" + global: true + - type: sql + sql: "show parameters like '%data_disk_usage_limit_percentage%';" + global: true + - type: sql + sql: "show parameters like '%migration_disable_time%';" + global: true + - type: sql + sql: "show parameters like '%sys_bkgd_net_percentage%';" + global: true + - type: sql + sql: "show parameters like '%balancer_idle_time%';" + global: true + - type: sql + sql: "show parameters like '%server_permanent_offline_time%';" + global: true + - type: sql + sql: "show parameters like '%enable_rebalance%';" + global: true + - type: sql + sql: "show parameters like '%enable_rereplication%';" + global: true + - type: log + grep: "" + global: false diff --git a/handler/gather/tasks/observer/unknown.yaml b/handler/gather/tasks/observer/unknown.yaml new file mode 100644 index 00000000..9727f41e --- /dev/null +++ b/handler/gather/tasks/observer/unknown.yaml @@ -0,0 +1,74 @@ +info_en: "[unknown problem]" +info_cn: "[未能明确问题的场景]" +command: obdiag gather scene run --scene=observer.unknown +task: + - version: "[2.0.0.0, 4.0.0.0]" + steps: + - type: sql + sql: "show variables like 'version_comment';" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.v$ob_cluster" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.__all_zone WHERE name='idc';" + global: true + - type: sql + sql: "SELECT id,svr_ip,svr_port,zone,inner_port,with_rootserver,status,start_service_time,last_offline_time FROM 
oceanbase.__all_server;" + global: true + - type: sql + sql: "SELECT zone, concat(svr_ip, ':', svr_port) observer, cpu_capacity, cpu_total, cpu_assigned, cpu_assigned_percent, mem_capacity, mem_total, mem_assigned, mem_assigned_percent, unit_Num, round(`load`, 2) `load`, round(cpu_weight, 2) cpu_weight, round(memory_weight, 2) mem_weight, leader_count FROM oceanbase.__all_virtual_server_stat ORDER BY zone,svr_ip;" + global: true + - type: sql + sql: "show parameters" + global: true + - type: log + global: false + grep: "" + - type: sysstat + global: false + sysstat: "" + - version: "[4.0.0.0, *]" + steps: + - type: sql + sql: "show variables like 'version_comment';" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_ZONES;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_SERVERS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.GV$OB_SERVERS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_UNIT_CONFIGS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_RESOURCE_POOLS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_TENANTS;" + global: true + - type: sql + sql: "SELECT c.TENANT_ID, e.TENANT_NAME, concat(c.NAME, ': ', d.NAME) `pool:conf`,concat(c.UNIT_COUNT, ' unit: ', d.min_cpu, 'C/', ROUND(d.MEMORY_SIZE/1024/1024/1024,0), 'G') unit_info FROM oceanbase.DBA_OB_RESOURCE_POOLS c, oceanbase.DBA_OB_UNIT_CONFIGS d, oceanbase.DBA_OB_TENANTS e WHERE c.UNIT_CONFIG_ID=d.UNIT_CONFIG_ID AND c.TENANT_ID=e.TENANT_ID AND c.TENANT_ID>1000 ORDER BY c.TENANT_ID;" + global: true + - type: sql + sql: "SELECT a.TENANT_NAME,a.TENANT_ID,b.SVR_IP FROM oceanbase.DBA_OB_TENANTS a, oceanbase.GV$OB_UNITS b WHERE a.TENANT_ID=b.TENANT_ID;" + global: true + - type: sql + sql: "show parameters" + global: true + - type: log + global: false + grep: "" + - type: sysstat + global: false + sysstat: "" + - type: ssh + ssh: "ps -ef | grep observer" + global: false + - type: ssh + ssh: "cat 
/proc/sys/kernel/core_pattern" + global: false diff --git a/handler/gather/tasks/other/application_error.yaml b/handler/gather/tasks/other/application_error.yaml new file mode 100644 index 00000000..1d3d95c1 --- /dev/null +++ b/handler/gather/tasks/other/application_error.yaml @@ -0,0 +1,68 @@ +info_en: "[application error]" +info_cn: "[应用报错问题]" +command: obdiag gather scene run --scene=other.application_error +task: + - version: "[2.0.0.0, 4.0.0.0]" + steps: + - type: sql + sql: "show variables like 'version_comment';" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.v$ob_cluster" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.__all_zone WHERE name='idc';" + global: true + - type: sql + sql: "SELECT id,svr_ip,svr_port,zone,inner_port,with_rootserver,status,start_service_time,last_offline_time FROM oceanbase.__all_server;" + global: true + - type: sql + sql: "SELECT zone, concat(svr_ip, ':', svr_port) observer, cpu_capacity, cpu_total, cpu_assigned, cpu_assigned_percent, mem_capacity, mem_total, mem_assigned, mem_assigned_percent, unit_Num, round(`load`, 2) `load`, round(cpu_weight, 2) cpu_weight, round(memory_weight, 2) mem_weight, leader_count FROM oceanbase.__all_virtual_server_stat ORDER BY zone,svr_ip;" + global: true + - type: sql + sql: "show parameters" + global: true + - type: log + grep: "" + global: false + - type: obproxy_log + grep: "" + global: false + - version: "[4.0.0.0, *]" + steps: + - type: sql + sql: "show variables like 'version_comment';" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_ZONES;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_SERVERS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.GV$OB_SERVERS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_UNIT_CONFIGS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_RESOURCE_POOLS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_TENANTS;" + global: 
true + - type: sql + sql: "SELECT c.TENANT_ID, e.TENANT_NAME, concat(c.NAME, ': ', d.NAME) `pool:conf`,concat(c.UNIT_COUNT, ' unit: ', d.min_cpu, 'C/', ROUND(d.MEMORY_SIZE/1024/1024/1024,0), 'G') unit_info FROM oceanbase.DBA_OB_RESOURCE_POOLS c, oceanbase.DBA_OB_UNIT_CONFIGS d, oceanbase.DBA_OB_TENANTS e WHERE c.UNIT_CONFIG_ID=d.UNIT_CONFIG_ID AND c.TENANT_ID=e.TENANT_ID AND c.TENANT_ID>1000 ORDER BY c.TENANT_ID;" + global: true + - type: sql + sql: "SELECT a.TENANT_NAME,a.TENANT_ID,b.SVR_IP FROM oceanbase.DBA_OB_TENANTS a, oceanbase.GV$OB_UNITS b WHERE a.TENANT_ID=b.TENANT_ID;" + global: true + - type: sql + sql: "show parameters" + global: true + - type: log + grep: "" + global: false + - type: obproxy_log + grep: "" + global: false diff --git a/handler/rca/__init__.py b/handler/rca/__init__.py new file mode 100644 index 00000000..d85f698e --- /dev/null +++ b/handler/rca/__init__.py @@ -0,0 +1,18 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -* +# Copyright (c) 2022 OceanBase +# OceanBase Diagnostic Tool is licensed under Mulan PSL v2. +# You can use this software according to the terms and conditions of the Mulan PSL v2. +# You may obtain a copy of Mulan PSL v2 at: +# http://license.coscl.org.cn/MulanPSL2 +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, +# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, +# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +# See the Mulan PSL v2 for more details. + +""" +@time: 2023/12/22 +@file: __init__.py +@desc: +""" + diff --git a/handler/rca/rca_exception.py b/handler/rca/rca_exception.py new file mode 100644 index 00000000..7cfcff95 --- /dev/null +++ b/handler/rca/rca_exception.py @@ -0,0 +1,54 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -* +# Copyright (c) 2022 OceanBase +# OceanBase Diagnostic Tool is licensed under Mulan PSL v2. +# You can use this software according to the terms and conditions of the Mulan PSL v2. 
+# You may obtain a copy of Mulan PSL v2 at: +# http://license.coscl.org.cn/MulanPSL2 +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, +# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, +# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +# See the Mulan PSL v2 for more details. + +""" +@time: 2023/12/22 +@file: rca_exception.py +@desc: +""" +import pprint + + +# rce +class RCAInitException(Exception): + def __init__(self, msg=None, obj=None): + self.msg, self.obj = msg, obj + + def __repr__(self): + return '%s %s' % (self.msg, self.obj is not None and pprint.pformat(self.obj) or '') + + def __str__(self): + return repr(self) + + + +class RCAExecuteException(Exception): + def __init__(self, msg=None, obj=None): + self.msg, self.obj = msg, obj + + def __repr__(self): + return '%s %s' % (self.msg, self.obj is not None and pprint.pformat(self.obj) or '') + + def __str__(self): + return repr(self) + + + +class RCANotNeedExecuteException(Exception): + def __init__(self, msg=None, obj=None): + self.msg, self.obj = msg, obj + + def __repr__(self): + return '%s %s' % (self.msg, self.obj is not None and pprint.pformat(self.obj) or '') + + def __str__(self): + return repr(self) diff --git a/handler/rca/rca_handler.py b/handler/rca/rca_handler.py new file mode 100644 index 00000000..58045ce6 --- /dev/null +++ b/handler/rca/rca_handler.py @@ -0,0 +1,86 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -* +# Copyright (c) 2022 OceanBase +# OceanBase Diagnostic Tool is licensed under Mulan PSL v2. +# You can use this software according to the terms and conditions of the Mulan PSL v2. +# You may obtain a copy of Mulan PSL v2 at: +# http://license.coscl.org.cn/MulanPSL2 +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, +# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, +# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +# See the Mulan PSL v2 for more details. 
+ +""" +@time: 2023/12/22 +@file: rca_handler.py +@desc: +""" +import datetime + +from common.logger import logger +from handler.rca.rca_exception import RCANotNeedExecuteException +from handler.rca.rca_scene import rca_map +from utils.utils import node_cut_passwd_for_log + + +def scene_exist(scene_name): + if scene_name in rca_map: + return True + else: + return False + + +class RCAHandler: + + def __init__(self, cluster, nodes, obproxy_nodes, + result_path="./rca/"): + self.rca_scene_parameters = None + self.rca_scene = None + self.cluster = cluster + self.nodes = nodes + self.obproxy_nodes = obproxy_nodes + self.result_path = result_path + + # init input parameters + self.report = None + self.tasks = None + logger.debug("RCAHandler init.cluster:{0}, init.nodes:{1}, init.obproxy_nodes:{2}, init.result_path:{3}".format( + self.cluster.get( + "ob_cluster_name") or self.cluster.get( + "obproxy_cluster_name"), node_cut_passwd_for_log(self.nodes), node_cut_passwd_for_log(self.obproxy_nodes), self.result_path)) + + def get_result_path(self): + return self.result_path + + def handle(self, args): + if getattr(args, "parameters"): + self.rca_scene_parameters = getattr(args, "parameters", "")[0].strip() + if getattr(args, "result_path"): + self.result_path = getattr(args, "result_path", "./rca/")[0].strip() + + if getattr(args, "scene") and scene_exist(getattr(args, "scene")[0]): + self.rca_scene = rca_map[getattr(args, "scene")[0]] + self.result_path = "{0}/{1}_{2}".format(self.result_path, getattr(args, "scene")[0].strip(), + datetime.datetime.now().strftime('%Y%m%d%H%M%S')) + self.rca_scene.init(self.cluster, self.nodes, self.obproxy_nodes, + env=self.rca_scene_parameters, result_path=self.result_path) + + else: + raise Exception("rca_scene :{0} is not exist or not input".format(getattr(args, "scene", ""))) + + # get all tasks + def execute(self): + try: + self.rca_scene.execute() + except RCANotNeedExecuteException as e: + logger.warning("rca_scene.execute not need 
execute: {0}".format(e)) + pass + except Exception as e: + logger.error("rca_scene.execute err: {0}".format(e)) + raise Exception("rca_scene.execute err: {0}".format(e)) + try: + self.rca_scene.export_result() + except Exception as e: + logger.error("rca_scene.export_result err: {0}".format(e)) + raise Exception("rca_scene.export_result err: {0}".format(e)) + diff --git a/handler/rca/rca_list.py b/handler/rca/rca_list.py new file mode 100644 index 00000000..4d7ec6ad --- /dev/null +++ b/handler/rca/rca_list.py @@ -0,0 +1,61 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -* +# Copyright (c) 2022 OceanBase +# OceanBase Diagnostic Tool is licensed under Mulan PSL v2. +# You can use this software according to the terms and conditions of the Mulan PSL v2. +# You may obtain a copy of Mulan PSL v2 at: +# http://license.coscl.org.cn/MulanPSL2 +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, +# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, +# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +# See the Mulan PSL v2 for more details. 
+ +""" +@time: 2024/01/23 +@file: rca_list.py +@desc: +""" +from common.logger import logger +from dataclasses import dataclass +from utils.print_utils import print_scene, print_title + +@dataclass +class RegisteredScene: + name: str + command: str + info_en: str + info_cn: str + + +scene_list = [ + RegisteredScene( + 'major_hold', + 'obdiag rca run --scene=major_hold', + '[root cause analysis of major hold]', + '[针对卡合并场景的根因分析]' + ), + RegisteredScene( + 'disconnection', + 'obdiag rca run --scene=disconnection', + '[root cause analysis of disconnection]', + '[针对断链接场景的根因分析]' + ), + RegisteredScene('lock_conflict', 'obdiag rca run --scene=lock_conflict', '[root cause analysis of lock conflict]', '[针对锁冲突的根因分析]'), +] + + +class RcaScenesListHandler: + def handle(self, args): + logger.debug("list rca scenes") + scenes_map = self.__get_scenes() + self.__print_scenes_data(scenes_map) + + def __print_scenes_data(self,scenes): + print_title("Rca Scenes") + print_scene(scenes) + + def __get_scenes(self): + scenes_map = {} + for scene in scene_list: + scenes_map[scene.name]={"name": scene.name, "command": scene.command, "info_en": scene.info_en, "info_cn": scene.info_cn} + return scenes_map \ No newline at end of file diff --git a/handler/rca/rca_scene/__init__.py b/handler/rca/rca_scene/__init__.py new file mode 100644 index 00000000..90641263 --- /dev/null +++ b/handler/rca/rca_scene/__init__.py @@ -0,0 +1,26 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -* +# Copyright (c) 2022 OceanBase +# OceanBase Diagnostic Tool is licensed under Mulan PSL v2. +# You can use this software according to the terms and conditions of the Mulan PSL v2. +# You may obtain a copy of Mulan PSL v2 at: +# http://license.coscl.org.cn/MulanPSL2 +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, +# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, +# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +# See the Mulan PSL v2 for more details. 
+ +""" +@time: 2023/12/22 +@file: __init__.py +@desc: +""" +from handler.rca.rca_scene.disconnection_scene import DisconnectionScene +from handler.rca.rca_scene.lock_conflict_scene import LockConflictScene +from handler.rca.rca_scene.major_hold_scene import MajorHoldScene + +rca_map = {} +rca_map["major_hold"] = MajorHoldScene() +rca_map["lock_conflict"] = LockConflictScene() +rca_map["disconnection"] = DisconnectionScene() + diff --git a/handler/rca/rca_scene/disconnection_scene.py b/handler/rca/rca_scene/disconnection_scene.py new file mode 100644 index 00000000..f7f72c6f --- /dev/null +++ b/handler/rca/rca_scene/disconnection_scene.py @@ -0,0 +1,298 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -* +# Copyright (c) 2022 OceanBase +# OceanBase Diagnostic Tool is licensed under Mulan PSL v2. +# You can use this software according to the terms and conditions of the Mulan PSL v2. +# You may obtain a copy of Mulan PSL v2 at: +# http://license.coscl.org.cn/MulanPSL2 +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, +# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, +# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +# See the Mulan PSL v2 for more details. 
+ +""" +@time: 2023/12/22 +@file: disconnection_scene.py +@desc: +""" +import re +import time +import datetime + +from common.command import get_obproxy_version +from common.logger import logger +from handler.rca.rca_scene.scene_base import scene_base, Result, RCA_ResultRecord +from utils.shell_utils import SshHelper +from utils.version_utils import compare_versions_greater + + +class DisconnectionScene(scene_base): + def __init__(self): + super().__init__() + + def init(self, cluster, nodes, obproxy_nodes, env, result_path): + super().init(cluster, nodes, obproxy_nodes, env, result_path) + + for node in obproxy_nodes: + if "home_path" not in node or len(node["home_path"].strip()) == 0: + raise Exception("obproxy_node home_path is empty") + try: + is_ssh = True + ssh_helper = SshHelper(is_ssh, node.get("ip"), + node.get("user"), + node.get("password"), + node.get("port"), + node.get("private_key"), + node) + except Exception as e: + logger.error( + "SshHandler init fail. Please check the NODES conf. node: {0}. Exception : {1} .".format(node, e)) + raise Exception( + "SshHandler init fail. Please check the NODES conf node: {0} Exception : {1} .".format(node, e)) + obproxy_version = get_obproxy_version(True, ssh_helper, node.get("home_path")) + if obproxy_version is None: + raise Exception("obproxy version is None. Please check the NODES conf.") + + if not (obproxy_version == "4.2.2.0" or compare_versions_greater(obproxy_version, "4.2.2.0")): + raise Exception("obproxy version must be greater than 4.2.2.0. 
Please check the NODES conf.") + + def execute(self): + for node in self.obproxy_nodes: + self.__execute_obproxy_one_node(node) + logger.info("end disconnectionScene execute all nodes") + + def export_result(self): + return self.Result.export() + + def __execute_obproxy_one_node(self, node): + ssh = SshHelper(True, node.get("ip"), + node.get("user"), + node.get("password"), + node.get("port"), + node.get("private_key"), + node) + all_log = ssh.ssh_exec_cmd( + 'grep "CONNECTION](trace_type" -m 100 $(ls {0}/log/obproxy_diagnosis.log* | head -10 ) '.format( + node['home_path']) + ) + + log_list = all_log.strip().split('\n') + for line in log_list: + try: + record = RCA_ResultRecord() + record.add_record( + "node:{1} obproxy_diagnosis_log:{0}".format(line, node.get("ip"))) + log_check = DisconnectionLog(line, record) + suggest = log_check.execute() + record.add_suggest(suggest) + logger.debug("suggest:{0}".format(suggest)) + + # self.Result.suggest += "obproxy_diagnosis_log:{0}\nsuggest:{1}\n\n".format(line, suggest) + self.Result.records.append(record) + except Exception as e: + logger.warning("line in log_list is error, log: {0} ,err:{1}".format(line, e)) + continue + + +class DisconnectionLog: + def __init__(self, log, record): + self.record = record + logger.debug("DisconnectionLog base:{0}".format(log)) + if log is None or len(log.strip()) == 0: + logger.debug("log is None or len(log.strip()) == 0") + raise Exception("log is None or len(log.strip()) == 0") + + self.timeout_event = "" + try: + self.log = log + + pattern = re.compile( + r'trace_type="(.*?)".*' + r'cs_id:(\d+).*' + r'server_session_id:(\d+).*' + r'error_code:([-0-9]+).*' + r'error_msg:"(.*?)"' + + ) + + # 搜索日志条目 + matches = pattern.search(log) + + # 如果找到匹配项,则提取所需信息 + if matches: + trace_type = matches.group(1) + cs_id = matches.group(2) + server_session_id = matches.group(3) + error_code = matches.group(4) + error_msg = matches.group(5) + # 打印所需信息 + self.trace_type = trace_type + self.error_code = 
error_code + self.error_msg = error_msg + timeout_event_pattern = re.compile(r'timeout_event:"(.*?)".*') + timeout_event_matches = timeout_event_pattern.search(log) + if timeout_event_matches and self.trace_type == "TIMEOUT_TRACE": + timeout_event = timeout_event_matches.group(1) + self.error_msg = timeout_event + if self.trace_type == "SERVER_INTERNAL_TRACE": + self.trace_type = "PROXY_INTERNAL_TRACE" + record.add_record("cs_id:{0}, server_session_id:{1}".format(cs_id, server_session_id)) + + except Exception as e: + logger.error("DisconnectionLog err: {0}".format(e)) + + def execute(self): + # self.get_suggest() + try: + suggest = get_disconnectionSuggest(self.trace_type, self.error_code, self.error_msg, self.record) + return suggest + except Exception as e: + raise Exception("DisconnectionLog execute err: {0}".format(e)) + + +DisconnectionAllSuggest = { + "LOGIN_TRACE": { + "-4669": { + "does not exist": "Ensure the existence of the corresponding cluster, which can be confirmed by directly connecting to ObServer", + "cluster info is empty": "Directly connect to the Observer to execute the sql statement in the internal_sql field to confirm whether the cluster information returned by the Observer is empty", + }, + "-4043": { + "dummy entry is empty, please check if the tenant exists": "Ensure the existence of the corresponding tenant, which can be confirmed by directly connecting to ObServer" + }, + "-8205": { + "can not pass white list": "Confirm whether the ObProxy whitelist is configured correctly through OCP" + }, + "-1227": { + "Access denied": "Confirm if the ObServer whitelist is configured correctly" + }, + "-5059": { + "too many sessions": "You can adjust the global configuration client_max_connections of ObProxy to temporarily avoid it.", + "hold too many connections": "Need to contact the public cloud platform to adjust the connection limit for cloud tenants", + + }, + "-8004": { + "obproxy is configured to use ssl connection": "Modify the SSL protocol configuration 
enable_client_ssl, or use SSL protocol access", + + }, + + "-10021": { + "user proxyro is rejected while proxyro_check on": "Should not be used directly proxyro@sys Accessing databases", + "connection with cluster name and tenant name is rejected while cloud_full_user_name_check off": "Should not be used directly proxyro@sys Accessing databases", + "cluster name and tenant name is required while full_username_check on": "When non-cloud users turn off enable_full_user_name, ObProxy will restrict non-three-segment access", + + }, + "-10018": { + "fail to check observer version, proxyro@sys access denied, error resp": "The password for deploying proxyro by default is not a problem. If you manually change the password for proxyro user, please ensure that the configuration of the ObProxy startup parameter is correct", + "fail to check observer version, empty result": "You can confirm whether the server ip configured when the ObProxy was started is available by directly connecting to the ObServer.", + "fail to check observer version": "Directly connect to the Observer to execute the sql statement in the internal_sql field to confirm whether the cluster information returned by the Observer is empty", + "fail to check cluster info": "Directly connect to the Observer to execute the sql statement in the internal_sql field to confirm whether the cluster information returned by the Observer is empty", + "fail to init server state": "Directly connect to the Observer to execute the sql statement in the internal_sql field to confirm whether the cluster information returned by the Observer is empty", + + }, + "-10301": { + "fail to fetch root server list from config server " + "fail to fetch root server list from local": "You can manually pull the url of the config_server configured at startup to confirm whether the information returned by the config server is normal", + }, + + }, + "TIMEOUT_TRACE": { + "-10022": { + "CLIENT_DELETE_CLUSTER_RESOURCE": "You can temporarily avoid it 
by adjusting the obproxy cluster_ expire_time configuration. The default expiration time is one day, and the new request will reset the expiration time.", + "CLIENT_INTERNAL_CMD_TIMEOUT": "Unexpected timeout, requiring customer environment cooperation for diagnosis", + "CLIENT_CONNECT_TIMEOUT": "Unexpected timeout, requiring customer environment cooperation for diagnosis", + "CLIENT_NET_READ_TIMEOUT": "Modifying the observer net_read_timeout variable requires mainly modifying the global level configuration, which will not take effect on existing connections.", + "CLIENT_NET_WRITE_TIMEOUT": "Modifying the observer net_read_timeout variable requires mainly modifying the global level configuration, which will not take effect on existing connections.", + "CLIENT_WAIT_TIMEOUT": "Modify the observer wait_timeout variable to temporarily avoid it", + "SERVER_QUERY_TIMEOUT": "Modify the observer ob_query_timeout variable to temporarily avoid or modify the obproxy observer_query_timeout_delta configuration to avoid it", + "SERVER_TRX_TIMEOUT": "Modify the variable ob_trx_timeout to temporarily avoid it", + "SERVER_WAIT_TIMEOUT": "Modify the observer wait_timeout variable to temporarily avoid it", + }, + }, + "SERVER_VC_TRACE": { + "-10013": { + "Fail to build connection to observer": "Need the cooperation of the observer for diagnosis" + }, + "-10014": { + " received while proxy transferring request": "Need the cooperation of the observer for diagnosis" + }, + "-10016": { + " received while proxy reading response": "Need the cooperation of the observer for diagnosis" + } + }, + "CLIENT_VC_TRACE": { + "-10010": { + " received from client while obproxy reading request": "Need client cooperation for diagnosis", + }, + "-10011": { + " received from client while obproxy handling response": "Need client cooperation for diagnosis", + }, + "-10012": { + " received from client while obproxy transferring response": "Need client cooperation for diagnosis", + }, + }, + 
def get_disconnectionSuggest(trace_type, error_code, error_msg, record):
    """Look up a repair suggestion for an obproxy disconnection event.

    The lookup walks DisconnectionAllSuggest (trace_type -> error_code ->
    known message fragment -> suggestion) and matches the known fragments
    against *error_msg* by substring.

    Args:
        trace_type: category of the disconnection trace, e.g. "TIMEOUT_TRACE".
        error_code: obproxy error code as a string, e.g. "-10022".
        error_msg: raw error message taken from the obproxy log.
        record: result recorder; the trace_type / error_code being examined
            are appended to it as diagnostic breadcrumbs.

    Returns:
        The concatenated suggestions of every matching message fragment.
        When no fragment matches, returns a fallback text that lists every
        suggestion registered for the error code.

    Raises:
        Exception: when any argument is empty, or when trace_type/error_code
            is not present in DisconnectionAllSuggest.
    """
    if trace_type == "" or error_code == "" or error_msg == "":
        raise Exception(
            "not find the suggest. Please contact the community and upload the exception information.. trace_type:{0}, error_code:{1}, error_msg:{2}".format(
                trace_type, error_code, error_msg))
    record.add_record('trace_type:{0}'.format(trace_type))
    # Guard clauses replace the original nested if/else pyramid.
    suggestions_for_trace_type = DisconnectionAllSuggest.get(trace_type)
    if not suggestions_for_trace_type:
        raise Exception("the disconnection trace_type :{0} ,not support.".format(trace_type))
    record.add_record('error_code:{0}'.format(error_code))
    suggestions_for_error_code = suggestions_for_trace_type.get(error_code)
    if not suggestions_for_error_code:
        raise Exception("the disconnection error_code :{0} ,not support.".format(error_code))
    # Substring match: every known message fragment that occurs in error_msg
    # contributes its suggestion.
    suggest = ""
    for known_fragment, known_suggest in suggestions_for_error_code.items():
        if known_fragment in error_msg:
            logger.info(
                "find the suggest. trace_type:{0}, error_code:{1}, error_msg:{2}".format(trace_type, error_code,
                                                                                         error_msg))
            suggest += "\n"
            suggest += known_suggest
    if suggest.strip() != "":
        logger.info(
            "find the suggest. trace_type:{0}, error_code:{1}, error_msg:{2}, suggest:{3}".format(trace_type,
                                                                                                  error_code,
                                                                                                  error_msg,
                                                                                                  suggest.strip()))
        return suggest.strip()
    # Nothing matched: fall back to listing every suggestion known for this
    # error code so the operator still gets actionable hints.
    suggest = "not find the suggest. Please contact the community and upload the exception information.. trace_type:{0}, error_code:{1}, error_msg:{2}. The suggestions are as follows. You can try using the following suggestions or submit the logs to the Oceanbase community.".format(
        trace_type, error_code, error_msg)
    suggest += "\n"
    for fallback_suggest in suggestions_for_error_code.values():
        suggest += fallback_suggest + "\n"
    return suggest
class LockConflictScene(scene_base):
    """RCA scene that diagnoses lock conflicts on an OceanBase cluster.

    Dispatches on the observer version: 4.2.0.0 and later expose the
    GV$OB_LOCKS / V$OB_TRANSACTION_PARTICIPANTS views, while older versions
    only provide __all_virtual_lock_wait_stat.
    """

    def __init__(self):
        super().__init__()
        # Filled in by init(); kept None until then so a missed init() fails loudly.
        self.ob_connector = None
        self.observer_nodes = None
        self.ob_cluster = None
        self.observer_version = None
        self.default_node = None

    def init(self, cluster, nodes, obproxy_nodes, env, result_path):
        """Resolve the observer version over ssh and open a sys-tenant connection.

        Raises:
            RCAInitException: wrapping any failure during setup.
        """
        try:
            super().init(cluster, nodes, obproxy_nodes, env, result_path)
            # NOTE(review): assumes the base init() populates observer_nodes
            # with at least one node — confirm against scene_base.
            self.default_node = self.observer_nodes[0]

            ssh = SshHelper(True, self.default_node.get("ip"),
                            self.default_node.get("user"),
                            self.default_node.get("password"),
                            self.default_node.get("port"),
                            self.default_node.get("private_key"),
                            self.default_node)
            self.observer_version = get_observer_version(True, ssh, self.default_node["home_path"])

            self.ob_connector = OBConnector(ip=self.ob_cluster.get("db_host"),
                                            port=self.ob_cluster.get("db_port"),
                                            username=self.ob_cluster.get("tenant_sys").get("user"),
                                            password=self.ob_cluster.get("tenant_sys").get("password"),
                                            timeout=10000)

        except Exception as e:
            raise RCAInitException("LockConflictScene RCAInitException: ", e)

    def execute(self):
        """Run the version-appropriate diagnosis.

        Raises:
            Exception: when the observer version could not be resolved.
        """
        if self.observer_version is None or len(self.observer_version) == 0:
            raise Exception("observer version is None. Please check the NODES conf.")
        if self.observer_version == "4.2.0.0" or compare_versions_greater(self.observer_version, "4.2.0.0"):
            self.__execute_4_2()
        elif compare_versions_greater("4.2.0.0", self.observer_version):
            # FIX(review): the original compared against "4.2.2.0" here, which was
            # inconsistent with the ">= 4.2.0.0" branch above and left the final
            # else unreachable; "4.2.0.0" states the intended split explicitly.
            self.__execute_old()
        else:
            raise Exception("observer version is {0}. Not support".format(self.observer_version))

    def __execute_4_2(self):
        """Diagnosis path for observer >= 4.2.0.0, based on GV$OB_LOCKS."""
        first_record = RCA_ResultRecord()
        # Fetch at most 50 blocked transaction ("TX") locks.
        cursor = self.ob_connector.execute_sql_return_cursor_dictionary(
            'select * from oceanbase.GV$OB_LOCKS where BLOCK=1 and TYPE="TX" limit 50;')
        lock_rows = cursor.fetchall()
        if len(lock_rows) == 0:
            first_record.add_record("on GV$OB_LOCKS result is null")
            first_record.add_suggest("No block lock found. Not Need Execute")
            self.Result.records.append(first_record)
            raise RCANotNeedExecuteException("No block lock found.")
        first_record.add_record("by select * from oceanbase.GV$OB_LOCKS where BLOCK=1; the len is {0}".format(len(lock_rows)))
        for lock_row in lock_rows:
            # Each blocked lock gets its own record seeded with the shared preamble.
            trans_record = RCA_ResultRecord()
            trans_record.records.extend(first_record.records.copy())
            self.Result.records.append(trans_record)
            try:
                if lock_row.get('TRANS_ID') is None:
                    trans_record.add_record("trans_id is null")
                    trans_record.add_suggest("trans_id is null. can not do next")
                    continue
                trans_id = lock_row['TRANS_ID']
                trans_record.add_record("trans_id is {0}".format(trans_id))
                cursor_by_trans_id = self.ob_connector.execute_sql_return_cursor_dictionary(
                    'select * from oceanbase.V$OB_TRANSACTION_PARTICIPANTS where TX_ID="{0}";'.format(trans_id))
                session_datas = cursor_by_trans_id.fetchall()
                # FIX(review): the original format string used "{0}" twice, so the
                # fetched participant rows were never logged — now "{1}".
                trans_record.add_record(
                    "get SESSION_ID by trans_id:{0}. get data:{1}".format(trans_id, session_datas))
                if len(session_datas) != 1:
                    trans_record.add_suggest(
                        "get SESSION_ID by trans_id:{0}. Maybe the lock is not exist".format(trans_id))
                    continue
                if session_datas[0].get("SESSION_ID") is not None:
                    trans_record.add_record("get SESSION_ID:{0}".format(session_datas[0].get("SESSION_ID")))
                    trans_record.add_suggest("Sessions corresponding to lock transactions. The ID is {0}, "
                                             "which may be a lock conflict issue.You can be accessed through kill "
                                             "session_ Roll back the corresponding transaction with ID. Please "
                                             "note that this will result in corresponding transaction regression! "
                                             "".format(session_datas[0].get("SESSION_ID")))

            except Exception as e:
                trans_record.add_record("get SESSION_ID panic. OB_LOCKS_data:{0} error: {1}".format(lock_row, e))
                trans_record.add_suggest("get SESSION_ID panic. OB_LOCKS_data:{0} error: {1}".format(lock_row, e))

        return

    def __execute_old(self):
        """Diagnosis path for observer < 4.2.0.0, based on __all_virtual_lock_wait_stat."""
        first_record = RCA_ResultRecord()
        cursor = self.ob_connector.execute_sql_return_cursor_dictionary(
            "select * from oceanbase.__all_virtual_lock_wait_stat order by try_lock_times limit 50;")
        virtual_lock_wait_stat_datas = cursor.fetchall()
        if len(virtual_lock_wait_stat_datas) == 0:
            first_record.add_record("on __all_virtual_trans_stat result is null")
            first_record.add_suggest("No block lock found. Not Need Execute")
            self.Result.records.append(first_record)
            raise RCANotNeedExecuteException("No block lock found.")
        first_record.add_record(
            "by select * from oceanbase.__all_virtual_lock_wait_stat order by try_lock_times limit 50; the len is {0}".format(
                len(virtual_lock_wait_stat_datas)))

        for trans_lock_data in virtual_lock_wait_stat_datas:
            block_session_id = trans_lock_data["block_session_id"]
            trans_record = RCA_ResultRecord()
            trans_record.records.extend(first_record.records.copy())
            self.Result.records.append(trans_record)
            trans_record.add_record("block_data is {0}".format(trans_lock_data))
            trans_record.add_record("block_session_id is {0}".format(block_session_id))
            trans_record.add_suggest("Sessions corresponding to lock transactions. The ID is {0}, "
                                     "which may be a lock conflict issue.You can be accessed through kill "
                                     "session_Roll back the corresponding transaction with ID. Please "
                                     "note that this will result in corresponding transaction regression! "
                                     "".format(trans_lock_data.get("block_session_id")))

        return

    def export_result(self):
        """Export the accumulated RCA result via the shared Result container."""
        return self.Result.export()
+ +""" +@time: 2024/1/2 +@file: major_hold.py +@desc: +""" +import json +import re + +from common.command import get_observer_version +from common.logger import logger +from common.ob_connector import OBConnector +from handler.rca.rca_exception import RCAInitException, RCAExecuteException, RCANotNeedExecuteException +from handler.rca.rca_scene.scene_base import scene_base, Result, RCA_ResultRecord +from utils.shell_utils import SshHelper +from utils.time_utils import DateTimeEncoder +from utils.version_utils import compare_versions_greater + + +class MajorHoldScene(scene_base): + def __init__(self): + super().__init__() + self.local_path = None + self.ob_cluster = None + self.observer_nodes = [] + self.observer_version = "" + self.ob_connector = None + self.Result = Result() + + def init(self, cluster, nodes, obproxy_nodes, env, result_path): + try: + super().__init__() + self.Result.set_save_path(result_path) + self.ob_cluster = cluster + self.observer_nodes = nodes + self.local_path = result_path + node = self.observer_nodes[0] + ssh = SshHelper(True, node.get("ip"), + node.get("user"), + node.get("password"), + node.get("port"), + node.get("private_key"), + node) + self.observer_version = get_observer_version(True, ssh, node["home_path"]) + if self.observer_version is None: + raise Exception("obproxy version is None. Please check the NODES conf.") + + if not (self.observer_version == "4.0.0.0" or compare_versions_greater(self.observer_version, "4.0.0.0")): + raise Exception("observer version must be greater than 4.0.0.0. 
Please check the NODES conf.") + + self.ob_connector = OBConnector(ip=self.ob_cluster.get("db_host"), + port=self.ob_cluster.get("db_port"), + username=self.ob_cluster.get("tenant_sys").get("user"), + password=self.ob_cluster.get("tenant_sys").get("password"), + timeout=10000) + except Exception as e: + raise RCAInitException("MajorHoldScene RCAInitException: {0}".format(e)) + + def execute(self): + # 前置条件确认 + need_tag = False + first_record = RCA_ResultRecord() + err_tenant_ids = [] + # 合并任务是否有报错 + try: + COMPACTING_data = self.ob_connector.execute_sql( + 'select * from oceanbase.CDB_OB_MAJOR_COMPACTION where IS_ERROR="YES";') + if len(COMPACTING_data) == 0: + first_record.add_record("CDB_OB_MAJOR_COMPACTION is not exist IS_ERROR='YES'") + else: + need_tag = True + CDB_OB_MAJOR_COMPACTION_err_tenant_ids = [] + for data in COMPACTING_data: + CDB_OB_MAJOR_COMPACTION_err_tenant_ids.append(str(data[0])) + + first_record.add_record( + "CDB_OB_MAJOR_COMPACTION have IS_ERROR='YES',the tenant_ids are {0}".format(err_tenant_ids)) + err_tenant_ids.extend(CDB_OB_MAJOR_COMPACTION_err_tenant_ids) + + except Exception as e: + logger.warning("MajorHoldScene execute CDB_OB_MAJOR_COMPACTION panic: {0}".format(e)) + raise RCAExecuteException("MajorHoldScene execute CDB_OB_MAJOR_COMPACTION panic: {0}".format(e)) + # __all_virtual_compaction_diagnose_info里存在status=FAILED的记录 + try: + diagnose_data = self.ob_connector.execute_sql( + 'select * from oceanbase.__all_virtual_compaction_diagnose_info where status="FAILED";') + if len(diagnose_data) == 0: + first_record.add_record('__all_virtual_compaction_diagnose_info is not exist status="FAILED";') + else: + need_tag = True + __all_virtual_compaction_diagnose_info_err_tenant_ids = [] + for data in COMPACTING_data: + __all_virtual_compaction_diagnose_info_err_tenant_ids.append(str(data[0])) + + first_record.add_record( + "__all_virtual_compaction_diagnose_info have status='FAILED',the tenant is {0}".format( + 
__all_virtual_compaction_diagnose_info_err_tenant_ids)) + err_tenant_ids.extend(__all_virtual_compaction_diagnose_info_err_tenant_ids) + except Exception as e: + logger.error("MajorHoldScene execute CDB_OB_MAJOR_COMPACTION panic: {0}".format(e)) + raise RCAExecuteException("MajorHoldScene execute CDB_OB_MAJOR_COMPACTION panic: {0}".format(e)) + # GV$OB_COMPACTION_PROGRESS表中,根据上一次合并记录中的data_size/(estimated_finish_time-start_time)与当前合并版本记录中(data_size-unfinished_data_size)/(当前时间-start_time)相比,如果差距过大(当前合并比上一次合并慢很多,以5倍为指标) + try: + running_data = self.ob_connector.execute_sql( + "select * from oceanbase.GV$OB_COMPACTION_PROGRESS where STATUS <> 'FINISH' and START_TIME <= NOW() - INTERVAL 20 minute GROUP BY COMPACTION_SCN DESC;") + if len(running_data) == 0: + first_record.add_record('No merge tasks that have not ended beyond the expected time') + else: + + time_out_merge_err_tenant_ids = [] + need_tag = True + for data in running_data: + time_out_merge_err_tenant_ids.append(str(data[2])) + first_record.add_record( + "merge tasks that have not ended beyond the expected time,the tenant_id is {0}".format( + time_out_merge_err_tenant_ids)) + logger.info("merge tasks that have not ended beyond the expected time,the tenant_id is {0}".format( + time_out_merge_err_tenant_ids)) + err_tenant_ids.extend(time_out_merge_err_tenant_ids) + except Exception as e: + logger.error("MajorHoldScene execute GV$OB_COMPACTION_PROGRESS panic: {0}".format(e)) + raise RCAExecuteException("MajorHoldScene execute GV$OB_COMPACTION_PROGRESS panic: {0}".format(e)) + if not need_tag: + first_record.add_suggest("major merge abnormal situation not need execute") + self.Result.records.append(first_record) + raise RCANotNeedExecuteException("MajorHoldScene not need execute") + else: + err_tenant_ids = list(set(err_tenant_ids)) + first_record.add_suggest("some tenants need execute MajorHoldScene. 
:{0}".format(err_tenant_ids)) + logger.info("On CDB_OB_MAJOR_COMPACTION") + + # execute record need more + for err_tenant_id in err_tenant_ids: + tenant_record = RCA_ResultRecord() + first_record_records=first_record.records.copy() + tenant_record.records.extend(first_record_records) + logger.info("tenant_id is {0}".format(err_tenant_id)) + tenant_record.add_record("tenant_id is {0}".format(err_tenant_id)) + # 1 + try: + cursor = self.ob_connector.execute_sql_return_cursor_dictionary( + 'SELECT * FROM oceanbase.CDB_OB_MAJOR_COMPACTION WHERE TENANT_ID= "{0}" AND (IS_ERROR = "NO" OR IS_SUSPENDED = "NO");'.format( + err_tenant_id)) + OB_MAJOR_COMPACTION_data = cursor.fetchall() + if len(OB_MAJOR_COMPACTION_data) == 0: + tenant_record.add_record( + "on CDB_OB_MAJOR_COMPACTION where status='COMPACTING'; " + "result:{0} , need not next step".format(str(OB_MAJOR_COMPACTION_data))) + + else: + tenant_record.add_record( + "on CDB_OB_MAJOR_COMPACTION where status='COMPACTING';" + "result:{0}".format(str(OB_MAJOR_COMPACTION_data))) + + except Exception as e: + tenant_record.add_record("#1 on CDB_OB_MAJOR_COMPACTION get data failed") + logger.warning("MajorHoldScene execute exception: {0}".format(e)) + pass + # 2 + try: + compaction_diagnose_info = self.ob_connector.execute_sql( + 'SELECT * FROM oceanbase.__all_virtual_compaction_diagnose_info WHERE status="FAILED";') + + if len(compaction_diagnose_info) == 0: + tenant_record.add_record( + "on __all_virtual_compaction_diagnose_info no data status=FAILED") + else: + tenant_record.add_record( + "on __all_virtual_compaction_diagnose_info;" + "result:{0}".format(str(compaction_diagnose_info))) + + for COMPACTING_data in compaction_diagnose_info: + self.diagnose_info_switch(COMPACTING_data, tenant_record) + + except Exception as e: + tenant_record.add_record("#2&3 on __all_virtual_compaction_diagnose_info get data failed") + logger.warning("#2&3 MajorHoldScene execute exception: {0}".format(e)) + pass + + # 4 + try: + 
global_broadcast_scn = self.ob_connector.execute_sql( + "select * from oceanbase.CDB_OB_MAJOR_COMPACTION where TENANT_ID='{0}';".format(err_tenant_id))[ + 0][3] + tenant_record.add_record("global_broadcast_scn is {0}".format(global_broadcast_scn)) + last_scn = self.ob_connector.execute_sql( + "select LAST_SCN from oceanbase.CDB_OB_MAJOR_COMPACTION where TENANT_ID='{0}';".format( + err_tenant_id))[0] + tenant_record.add_record("last_scn is {0}".format(last_scn)) + + sql = "select * from oceanbase.GV$OB_COMPACTION_PROGRESS where TENANT_ID='{0}' and COMPACTION_SCN='{1}';".format( + err_tenant_id, global_broadcast_scn) + OB_COMPACTION_PROGRESS_data_global_broadcast_scn = self.ob_connector.execute_sql(sql) + file_name = "{0}/rca_major_hold_{1}_OB_COMPACTION_PROGRESS_data_global_broadcast_scn".format( + self.local_path, err_tenant_id) + with open(file_name, 'w') as f: + f.write(str(OB_COMPACTION_PROGRESS_data_global_broadcast_scn)) + tenant_record.add_record( + "tenant_id:{0} OB_COMPACTION_PROGRESS_data_global_broadcast_scn save on {1}".format(err_tenant_id, + file_name)) + + sql = "select * from oceanbase.GV$OB_COMPACTION_PROGRESS where TENANT_ID='{0}' and COMPACTION_SCN='{1}';".format( + err_tenant_id, last_scn) + OB_COMPACTION_PROGRESS_data_last_scn = self.ob_connector.execute_sql(sql) + file_name = "{0}/rca_major_hold_{1}_OB_COMPACTION_PROGRESS_data_last_scn".format( + self.local_path, err_tenant_id) + with open(file_name, 'w') as f: + f.write(str(OB_COMPACTION_PROGRESS_data_last_scn)) + tenant_record.add_record( + "tenant_id:{0} OB_COMPACTION_PROGRESS_data_last_scn save on {1}".format(err_tenant_id, + file_name)) + + sql = "select * from oceanbase.GV$OB_COMPACTION_PROGRESS where TENANT_ID='{0}' and STATUS<>'FINISH';".format( + err_tenant_id, global_broadcast_scn) + finish_data = self.ob_connector.execute_sql(sql) + if len(finish_data) == 0: + tenant_record.add_record("sql:{0},len of result is 0;result:{1}".format(sql, finish_data)) + sql = "select * from oceanbase. 
where TENANT_ID='{0}' and LS_ID=1".format(err_tenant_id) + svrs = self.ob_connector.execute_sql(sql) + svr_ip = svrs[0][4] + svr_port = svrs[0][5] + node = None + for observer_node in self.observer_nodes: + if observer_node["ip"] == svr_ip and observer_node["port"] == svr_port: + node = observer_node + if node == None: + logger.error( + "can not find ls_svr by TENANT_ID:{2} ip:{0},port:{1}".format(svr_ip, svr_port, + err_tenant_id)) + break + ssh_helper = SshHelper(True, node.get("ip"), + node.get("user"), + node.get("password"), + node.get("port"), + node.get("private_key"), + node) + log_name = "/tmp/major_hold_scene_4_major_merge_progress_checker_{0}.log".format(err_tenant_id) + ssh_helper.ssh_exec_cmd( + 'grep "major_merge_progress_checker" {0}/log/rootservice.log* | grep T{1} -m500 >{2}'.format( + node.get("home_path"), err_tenant_id, log_name)) + ssh_helper.download(log_name, self.local_path) + tenant_record.add_record("download {0} to {1}".format(log_name, self.local_path)) + ssh_helper.ssh_exec_cmd("rm -rf {0}".format(log_name)) + except Exception as e: + logger.error("MajorHoldScene execute 4 exception: {0}".format(e)) + raise RCAExecuteException("MajorHoldScene execute 4 exception: {0}".format(e)) + + # 5 + try: + cursor = self.ob_connector.execute_sql_return_cursor_dictionary( + 'select * from oceanbase.GV$OB_COMPACTION_SUGGESTIONS where tenant_id="{0}";'.format(err_tenant_id)) + columns = [column[0] for column in cursor.description] + OB_COMPACTION_SUGGESTIONS_data = cursor.fetchall() + OB_COMPACTION_SUGGESTIONS_info = json.dumps(OB_COMPACTION_SUGGESTIONS_data, cls=DateTimeEncoder) + file_name = "{0}/rca_major_hold_{1}_OB_COMPACTION_SUGGESTIONS_info".format( + self.local_path, err_tenant_id) + with open(file_name, 'w') as f: + f.write(str(OB_COMPACTION_SUGGESTIONS_info)) + tenant_record.add_record( + "tenant_id:{0} OB_COMPACTION_PROGRESS_data_last_scn save on {1}".format(err_tenant_id, + file_name)) + + except Exception as e: + 
logger.warning("MajorHoldScene execute 5 exception: {0}".format(e)) + tenant_record.add_suggest("send the {0} to the oceanbase community".format(self.local_path)) + self.Result.records.append(tenant_record) + + def get_info__all_virtual_compaction_diagnose_info(self, tenant_record): + try: + COMPACTING_datas = self.ob_connector.execute_sql( + "SELECT * FROM oceanbase.__all_virtual_compaction_diagnose_info WHERE IS_ERROR = 'NO' OR IS_SUSPENDED = 'NO';") + if len(COMPACTING_datas) == 0: + tenant_record.add_record( + "sql:select * from oceanbase.__all_virtual_compaction_diagnose_info; no data") + return + else: + tenant_record.add_record( + "sql:select * from oceanbase.CDB_OB_MAJOR_COMPACTION where status=COMPACTING; " + "result:{0}".format(str(COMPACTING_datas))) + for index, COMPACTING_data in COMPACTING_datas: + self.diagnose_info_switch(COMPACTING_data) + except Exception as e: + raise RCAExecuteException( + "MajorHoldScene execute get_info__all_virtual_compaction_diagnose_info exception: {0}".format(e)) + + def diagnose_info_switch(self, sql_data, tenant_record): + svr_ip = sql_data[0] + svr_port = sql_data[1] + tenant_id = sql_data[2] + ls_id = sql_data[4] + table_id = sql_data[5] + create_time = sql_data[7] + diagnose_info = sql_data[8] + if "schedule medium failed" in diagnose_info: + node = None + for observer_node in self.observer_nodes: + if svr_ip == node.get("ip"): + node = observer_node + if node is None: + raise RCAExecuteException("can not find observer node by ip:{0}, port:{1}".format(svr_ip, svr_port)) + ssh_helper = SshHelper(True, node.get("ip"), + node.get("user"), + node.get("password"), + node.get("port"), + node.get("private_key"), + node) + log_name = "/tmp/rca_major_hold_schedule_medium_failed_{1}_{2}_{0}.txt".format(tenant_id, svr_ip, + svr_port) + tenant_record.add_record( + "diagnose_info type is 'schedule medium failed'. 
time is {0},observer is {1}:{2},the log is {3}".format( + create_time, svr_ip, svr_port, log_name)) + ssh_helper.ssh_exec_cmd( + 'grep "schedule_medium_failed" {1}/log/observer.log* |grep -P "\[\d+\]" -m 1 -o >{0}'.format(log_name, + node.get( + "home_path"))) + ssh_helper.download(log_name, local_path=self.local_path) + tenant_record.add_record("download {0} to {1}".format(log_name, self.local_path)) + ssh_helper.ssh_exec_cmd("rm -rf {0}".format(log_name)) + return + elif "error_no=" in diagnose_info and "error_trace=" in diagnose_info: + err_no = re.search("\berror_no=(\d+)\b", diagnose_info).group(1) + err_trace = re.search("\berror_trace=(.+)\b", diagnose_info).group(1) + + global_broadcast_scn = self.ob_connector.execute_sql( + "select * from oceanbase.CDB_OB_MAJOR_COMPACTION where TENANT_ID='{0}';".format(tenant_id))[0][3] + compaction_scn = self.ob_connector.execute_sql( + "select * from oceanbase.__all_virtual_tablet_meta_table where tablet_id='{0}' and tenant_id='{1}';".format( + table_id, tenant_id))[0][7] + if compaction_scn > global_broadcast_scn: + tenant_record.add_record( + "diagnose_info type is error_no. error_no: {0}, err_trace: {1} , table_id:{2}, tenant_id:{3}, compaction_scn: {4}, global_broadcast_scn: {5}. compaction_scn>global_broadcast_scn".format( + err_no, err_trace, table_id, tenant_id, compaction_scn, global_broadcast_scn)) + return + else: + tenant_record.add_record( + "diagnose_info type is error_no. error_no: {0}, err_trace:{1}, table_id:{2}, tenant_id:{3}, compaction_scn: {4}, global_broadcast_scn: {5}. compaction_scn