From ba5148b4204be3c27af510c7254709ad5aa396eb Mon Sep 17 00:00:00 2001 From: Teingi Date: Fri, 2 Feb 2024 15:45:25 +0800 Subject: [PATCH 1/2] 1.6.0 release --- README-CN.md | 9 +- README.md | 9 +- clean_all_result.sh | 3 +- common/command.py | 6 +- common/constant.py | 13 +- common/ob_connector.py | 33 +- common/obdiag_exception.py | 2 +- common/scene.py | 100 ++++ conf/inner_config.yml | 6 +- docs/check.md | 6 +- docs/gather_all.md | 11 + docs/gather_awr.md | 4 +- docs/gather_ob_stack.md | 23 + docs/gather_obproxy_log.md | 4 +- docs/gather_scene.md | 177 +++++++ docs/rca.md | 30 ++ example/all-components.yaml | 6 +- handler/analyzer/analyze_flt_trace.py | 11 +- handler/analyzer/analyze_log.py | 13 +- handler/checker/check_handler.py | 50 +- handler/checker/check_task.py | 51 +- handler/checker/step/get_system_parameter.py | 14 +- handler/checker/step/sql.py | 2 +- handler/checker/step/stepbase.py | 34 +- .../observer/cluster/core_file_find.yaml | 2 +- .../observer/cluster/data_path_settings.yaml | 17 +- .../ob_enable_plan_cache_bad_version.yaml | 11 + .../observer/cluster/observer_not_active.yaml | 15 + ...mizer_better_inlist_costing_parmmeter.yaml | 20 + .../cluster/table_history_too_many.yaml | 12 + .../checker/tasks/observer/cpu/oversold.yaml | 14 +- .../observer/disk/clog_abnormal_file.yaml | 4 +- .../observer/disk/sstable_abnormal_file.yaml | 18 + .../system/dependent_software_swapon.yaml | 17 + .../tasks/observer/system/parameter.yaml | 78 +-- .../system/parameter_ip_local_port_range.yaml | 27 + .../observer/system/parameter_tcp_rmem.yaml | 35 ++ .../observer/system/parameter_tcp_wmem.yaml | 33 ++ .../tasks/observer/version/old_version.yaml | 13 + handler/gather/gather_awr.py | 34 +- handler/gather/gather_log.py | 36 +- handler/gather/gather_obadmin.py | 12 +- handler/gather/gather_obproxy_log.py | 30 +- handler/gather/gather_obstack2.py | 273 ++++++++++ handler/gather/gather_perf.py | 30 +- handler/gather/gather_plan_monitor.py | 82 ++- handler/gather/gather_scenes.py | 200 +++++++ handler/gather/gather_sysstat.py | 43 +- handler/gather/scenes/__init__.py | 17 + handler/gather/scenes/base.py | 104 ++++ handler/gather/scenes/cpu_high.py | 76 +++ handler/gather/scenes/list.py | 127 +++++ handler/gather/scenes/register.py | 46 ++ handler/gather/scenes/sql_problem.py | 103 ++++ handler/gather/step/__init__.py | 17 + handler/gather/step/base.py | 99 ++++ handler/gather/step/sql.py | 74 +++ handler/gather/step/ssh.py | 68 +++ handler/gather/tasks/obproxy/restart.yaml | 18 + handler/gather/tasks/observer/backup.yaml | 107 ++++ .../gather/tasks/observer/backup_clean.yaml | 125 +++++ .../gather/tasks/observer/clog_disk_full.yaml | 89 ++++ handler/gather/tasks/observer/compaction.yaml | 134 +++++ .../observer/delay_of_primary_and_backup.yaml | 143 +++++ .../gather/tasks/observer/log_archive.yaml | 110 ++++ .../tasks/observer/long_transaction.yaml | 65 +++ handler/gather/tasks/observer/memory.yaml | 77 +++ handler/gather/tasks/observer/recovery.yaml | 79 +++ handler/gather/tasks/observer/restart.yaml | 74 +++ .../tasks/observer/rootservice_switch.yaml | 122 +++++ .../tasks/observer/suspend_transaction.yaml | 62 +++ .../tasks/observer/unit_data_imbalance.yaml | 137 +++++ handler/gather/tasks/observer/unknown.yaml | 74 +++ .../gather/tasks/other/application_error.yaml | 68 +++ handler/rca/__init__.py | 18 + handler/rca/rca_exception.py | 54 ++ handler/rca/rca_handler.py | 86 ++++ handler/rca/rca_list.py | 61 +++ handler/rca/rca_scene/__init__.py | 26 + 
handler/rca/rca_scene/disconnection_scene.py | 298 +++++++++++ handler/rca/rca_scene/lock_conflict_scene.py | 148 ++++++ handler/rca/rca_scene/major_hold_scene.py | 487 ++++++++++++++++++ handler/rca/rca_scene/scene_base.py | 89 ++++ init.sh | 4 + init_obdiag_cmd.sh | 18 +- obdiag_client.py | 160 ++++-- obdiag_main.py | 45 ++ requirements3.txt | 3 +- rpm/build.sh | 2 +- rpm/oceanbase-diagnostic-tool.spec | 6 +- telemetry/telemetry.py | 16 +- utils/__init__.py | 1 - utils/file_utils.py | 9 +- utils/network_utils.py | 37 ++ utils/parser_utils.py | 61 ++- utils/print_utils.py | 38 ++ utils/shell_utils.py | 1 + utils/sql_utils.py | 60 +++ utils/string_utils.py | 95 ++++ utils/utils.py | 33 +- utils/yaml_utils.py | 19 +- 101 files changed, 5346 insertions(+), 417 deletions(-) create mode 100644 common/scene.py create mode 100644 docs/gather_ob_stack.md create mode 100644 docs/gather_scene.md create mode 100644 docs/rca.md create mode 100644 handler/checker/tasks/observer/cluster/ob_enable_plan_cache_bad_version.yaml create mode 100644 handler/checker/tasks/observer/cluster/observer_not_active.yaml create mode 100644 handler/checker/tasks/observer/cluster/optimizer_better_inlist_costing_parmmeter.yaml create mode 100644 handler/checker/tasks/observer/cluster/table_history_too_many.yaml create mode 100644 handler/checker/tasks/observer/disk/sstable_abnormal_file.yaml create mode 100644 handler/checker/tasks/observer/system/dependent_software_swapon.yaml create mode 100644 handler/checker/tasks/observer/system/parameter_ip_local_port_range.yaml create mode 100644 handler/checker/tasks/observer/system/parameter_tcp_rmem.yaml create mode 100644 handler/checker/tasks/observer/system/parameter_tcp_wmem.yaml create mode 100644 handler/checker/tasks/observer/version/old_version.yaml create mode 100644 handler/gather/gather_obstack2.py create mode 100644 handler/gather/gather_scenes.py create mode 100644 handler/gather/scenes/__init__.py create mode 100644 handler/gather/scenes/base.py create mode 100644 handler/gather/scenes/cpu_high.py create mode 100644 handler/gather/scenes/list.py create mode 100644 handler/gather/scenes/register.py create mode 100644 handler/gather/scenes/sql_problem.py create mode 100644 handler/gather/step/__init__.py create mode 100644 handler/gather/step/base.py create mode 100644 handler/gather/step/sql.py create mode 100644 handler/gather/step/ssh.py create mode 100644 handler/gather/tasks/obproxy/restart.yaml create mode 100644 handler/gather/tasks/observer/backup.yaml create mode 100644 handler/gather/tasks/observer/backup_clean.yaml create mode 100644 handler/gather/tasks/observer/clog_disk_full.yaml create mode 100644 handler/gather/tasks/observer/compaction.yaml create mode 100644 handler/gather/tasks/observer/delay_of_primary_and_backup.yaml create mode 100644 handler/gather/tasks/observer/log_archive.yaml create mode 100644 handler/gather/tasks/observer/long_transaction.yaml create mode 100644 handler/gather/tasks/observer/memory.yaml create mode 100644 handler/gather/tasks/observer/recovery.yaml create mode 100644 handler/gather/tasks/observer/restart.yaml create mode 100644 handler/gather/tasks/observer/rootservice_switch.yaml create mode 100644 handler/gather/tasks/observer/suspend_transaction.yaml create mode 100644 handler/gather/tasks/observer/unit_data_imbalance.yaml create mode 100644 handler/gather/tasks/observer/unknown.yaml create mode 100644 handler/gather/tasks/other/application_error.yaml create mode 100644 handler/rca/__init__.py create mode 100644 
handler/rca/rca_exception.py create mode 100644 handler/rca/rca_handler.py create mode 100644 handler/rca/rca_list.py create mode 100644 handler/rca/rca_scene/__init__.py create mode 100644 handler/rca/rca_scene/disconnection_scene.py create mode 100644 handler/rca/rca_scene/lock_conflict_scene.py create mode 100644 handler/rca/rca_scene/major_hold_scene.py create mode 100644 handler/rca/rca_scene/scene_base.py create mode 100644 utils/network_utils.py create mode 100644 utils/print_utils.py create mode 100644 utils/sql_utils.py create mode 100644 utils/string_utils.py diff --git a/README-CN.md b/README-CN.md index d4ca59df..a1f4f868 100644 --- a/README-CN.md +++ b/README-CN.md @@ -19,7 +19,7 @@

-# Oceanbase Diagnostic Tool (obdiag)
+# OceanBase Diagnostic Tool (obdiag)
 OceanBase Diagnostic Tool (obdiag) 是一款专门为 OceanBase 打造的敏捷诊断工具,功能包括诊断信息收集、分析、巡检,可以在OceanBase集群不同的部署模式下(OCP,OBD或用户根据文档手工部署)实现一键执行。
 # 安装 obdiag
@@ -60,6 +60,9 @@ obdiag config -h -u [-p password] [-P port]
 ## obdiag 巡检功能
 - [一键巡检](./docs/check.md)
+## obdiag 一键场景化信息采集功能
+- [一键场景化信息采集](./docs/gather_scene.md)
+
 ## obdiag 一键信息采集功能
 - [一键收集OB日志](./docs/gather_ob_log.md)
@@ -67,6 +70,7 @@ obdiag config -h -u [-p password] [-P port]
 - [一键收集主机信息](./docs/gather_sysstat.md)
 - [一键收集slog/clog日志](./docs/gather_admin.md)
 - [一键收集火焰图信息](./docs/gather_perf.md)
+- [一键收集OB堆栈信息](./docs/gather_ob_stack.md)
 - [一键收集并行SQL的执行详情信息](./docs/gather_sql_plan_monitor.md)
 - [一键收集OBPROXY日志](./docs/gather_obproxy_log.md)
 - [一键收集AWR报告](./docs/gather_awr.md)
@@ -76,6 +80,9 @@ obdiag config -h -u [-p password] [-P port]
 - [一键分析OB日志](./docs/analyze_ob_log.md)
 - [一键全链路诊断](./docs/analyze_flt_trace.md)
+## obdiag 一键场景化根因分析功能
+- [一键场景化根因分析](./docs/rca.md)
+
 # 许可证
 OceanBase Diagnostic Tool 使用 [MulanPSL - 2.0](http://license.coscl.org.cn/MulanPSL2) 许可证。
diff --git a/README.md b/README.md
index ec6bf14e..42ed1dd6 100644
--- a/README.md
+++ b/README.md
@@ -21,8 +21,7 @@ English | [中文版](README-CN.md)

-
-# Oceanbase Diagnostic Tool (obdiag)
+# OceanBase Diagnostic Tool (obdiag)
 OceanBase Diagnostic Tool (obdiag) is a quick diagnostic tool for open-source OceanBase software. Its features include gathering, analyzing, and checking OceanBase diagnostic information. It can be executed with one click in different deployment modes of OceanBase clusters (OCP, OBD, or manually deployed by users according to documentation).
 # Install obdiag
@@ -64,6 +63,9 @@ obdiag config -h -u [-p password] [-P port]
 ## obdiag check Functions
 - [check](./docs/check.md)
+## obdiag gather scene Functions
+- [gather scene](./docs/gather_scene.md)
+
 ## obdiag gather Functions
 - [gather log](./docs/gather_ob_log.md)
@@ -80,6 +82,9 @@ obdiag config -h -u [-p password] [-P port]
 - [analyze log](./docs/analyze_ob_log.md)
 - [analyze flt trace log](./docs/analyze_flt_trace.md)
+## obdiag rca Functions
+- [rca](./docs/rca.md)
+
 # Licensing
 OceanBase Database is under MulanPubL - 2.0 license. You can freely copy and use the source code. When you modify or distribute the source code, please obey the MulanPubL - 2.0 license.
diff --git a/clean_all_result.sh b/clean_all_result.sh
index 2a2cb659..cfc510d9 100755
--- a/clean_all_result.sh
+++ b/clean_all_result.sh
@@ -1,3 +1,4 @@
 rm -rf ./gather_pack_*
 rm -rf ./analyze_pack_*
-rm -rf ./analyze_flt_result*
\ No newline at end of file
+rm -rf ./analyze_flt_result*
+rm -rf ./check_report
\ No newline at end of file
diff --git a/common/command.py b/common/command.py
index d9971205..28080dac 100644
--- a/common/command.py
+++ b/common/command.py
@@ -336,7 +336,7 @@ def get_observer_version(is_ssh, ssh_helper, ob_install_dir):
         ob_version_info = SshClient().run_get_stderr(ssh_helper, cmd)
     else:
         ob_version_info = LocalClient().run_get_stderr(cmd)
-    logger.info("get observer version with LD_LIBRARY_PATH,cmd:{0}, result:{1}".format(cmd,ob_version_info))
+    logger.info("get observer version with LD_LIBRARY_PATH,cmd:{0}".format(cmd))
     if "REVISION" not in ob_version_info:
         raise Exception("Please check conf about observer,{0}".format(ob_version_info))
     ob_version = re.findall(r'[(]OceanBase.CE\s(.+?)[)]', ob_version_info)[0]
@@ -355,7 +355,7 @@ def get_obproxy_version(is_ssh, ssh_helper, obproxy_install_dir):
         obproxy_version_info = SshClient().run_get_stderr(ssh_helper, cmd)
     else:
         obproxy_version_info = LocalClient().run_get_stderr(cmd)
-    logger.info("get obproxy version, run cmd = [{0}] ".format(cmd))
+    logger.debug("get obproxy version, run cmd = [{0}] ".format(cmd))
     if obproxy_version_info is not None:
        ob_version = re.findall(r'[(]OceanBase.(.+?)[)]', obproxy_version_info)
        if len(ob_version) > 0:
@@ -367,7 +367,7 @@ def get_obproxy_version(is_ssh, ssh_helper, obproxy_install_dir):
             obproxy_version_info = SshClient().run_get_stderr(ssh_helper, cmd)
         else:
             obproxy_version_info = LocalClient().run_get_stderr(cmd)
-        logger.info("get obproxy version with LD_LIBRARY_PATH,cmd:{0}, result:{1}".format(cmd,obproxy_version_info))
+        logger.debug("get obproxy version with LD_LIBRARY_PATH,cmd:{0}, result:{1}".format(cmd,obproxy_version_info))
         if "REVISION" not in obproxy_version_info:
             raise Exception("Please check conf about proxy,{0}".format(obproxy_version_info))
         pattern = r"(\d+\.\d+\.\d+\.\d+)"
diff --git a/common/constant.py b/common/constant.py
index 1fb7801d..0837ae4e 100644
--- a/common/constant.py
+++ b/common/constant.py
@@ -91,8 +91,19 @@ def __setattr__(self, name, value):
         "tasks_base_path": "~/.obdiag/tasks/"
     }
 }
+
+const.OBDIAG_GATHER_DEFAULT_CONFIG = {
+    "gather": {
+        "cases_base_path": "~/.obdiag/gather/tasks"
+    }
+}
+
+const.OBDIAG_RCA_DEFAULT_CONFIG = {
+    "rca": {
+        "result_path": "./rca/",
+    }
+}
 const.OBDIAG_TELEMETRY_FILE_NAME = os.path.expanduser("~/.obdiag/.obdiag_telemetry.txt")
 const.TELEMETRY_CONTENT_REPORTER = "obdiag"
 const.TELEMETRY_URL = "openwebapi.oceanbase.com"
 const.TELEMETRY_PATH = "/api/web/oceanbase/report"
-
diff --git a/common/ob_connector.py b/common/ob_connector.py
index 6952c184..b2279651 100644
--- a/common/ob_connector.py
+++ b/common/ob_connector.py
@@ -37,15 +37,18 @@ def init(self):
             logger.exception(e)
 
     def _connect_db(self):
-        logger.debug("connect OB: {0}:{1} with user {2}".format(self.ip, self.port, self.username))
-        self.conn = mysql.connect(
-            host=self.ip,
-            port=self.port,
-            user=self.username,
-            passwd=self.password,
-            connect_timeout=30,
-        )
-        logger.debug("connect databse ...")
+        try:
+            logger.debug("connect OB: {0}:{1} with user {2}".format(self.ip, self.port, self.username))
+            self.conn = mysql.connect(
+                host=self.ip,
+                port=self.port,
+                user=self.username,
+                passwd=self.password,
+                connect_timeout=30,
+            )
+            logger.debug("connect database ...")
+        except mysql.Error as e:
+            logger.error("connect OB: {0}:{1} with user {2} failed, error:{3}".format(self.ip, self.port, self.username, e))
 
     def execute_sql(self, sql):
         if self.conn is None:
@@ -58,6 +61,18 @@ def execute_sql(self, sql):
         cursor.close()
         return ret
 
+    def execute_sql_return_columns_and_data(self, sql):
+        if self.conn is None:
+            self._connect_db()
+        else:
+            self.conn.ping(reconnect=True)
+        cursor = self.conn.cursor()
+        cursor.execute(sql)
+        column_names = [col[0] for col in cursor.description]
+        ret = cursor.fetchall()
+        cursor.close()
+        return column_names, ret
+
     def execute_sql_return_cursor_dictionary(self, sql):
         if self.conn is None:
             self._connect_db()
diff --git a/common/obdiag_exception.py b/common/obdiag_exception.py
index 5170c796..3779f2cc 100644
--- a/common/obdiag_exception.py
+++ b/common/obdiag_exception.py
@@ -118,4 +118,4 @@ def __repr__(self):
         return '%s %s' % (self.msg, self.obj is not None and pprint.pformat(self.obj) or '')
 
     def __str__(self):
-        return repr(self)
\ No newline at end of file
+        return repr(self)
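The `execute_sql_return_columns_and_data` helper added to ob_connector.py above returns the column names together with the row data, which is convenient for rendering tabular output. A minimal usage sketch follows; the class name `OBConnector` and its constructor arguments are assumptions inferred from the fields used in `_connect_db`, and the connection details are placeholders:

```python
# Hypothetical usage of the new helper; class name and constructor are assumptions.
from common.ob_connector import OBConnector

connector = OBConnector(ip="127.0.0.1", port=2881, username="root@sys", password="")

# The helper returns (column_names, rows), so a caller can print a table directly.
columns, rows = connector.execute_sql_return_columns_and_data(
    "select tenant_id, tenant_name from oceanbase.__all_tenant")
print(" | ".join(columns))
for row in rows:
    print(" | ".join(str(v) for v in row))
```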
diff --git a/common/scene.py b/common/scene.py
new file mode 100644
index 00000000..7e433ca8
--- /dev/null
+++ b/common/scene.py
@@ -0,0 +1,100 @@
+#!/usr/bin/env python
+# -*- coding: UTF-8 -*-
+# Copyright (c) 2022 OceanBase
+# OceanBase Diagnostic Tool is licensed under Mulan PSL v2.
+# You can use this software according to the terms and conditions of the Mulan PSL v2.
+# You may obtain a copy of Mulan PSL v2 at:
+#          http://license.coscl.org.cn/MulanPSL2
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
+# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
+# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
+# See the Mulan PSL v2 for more details.
+
+"""
+@time: 2024/01/16
+@file: scene.py
+@desc:
+"""
+
+
+from common.logger import logger
+from utils.shell_utils import SshHelper
+from utils.version_utils import compare_versions_greater
+from common.command import get_observer_version, get_obproxy_version
+
+def filter_by_version(scene, cluster):
+    try:
+        steps = scene
+        steps_nu = 0
+        # get observer version
+        if "version" not in cluster or cluster["version"] == "":
+            return steps_nu
+        for now_steps in steps:
+            if "version" in now_steps:
+                steps_versions = now_steps["version"]
+                if not isinstance(steps_versions, str):
+                    logger.error("filter_by_version steps_version Exception : {0}".format("the type of version is not string"))
+                    raise Exception("filter_by_version steps_version Exception : {0}".format("the type of version is not string"))
+                version_real = cluster["version"]
+                logger.info("version_real is {0} steps_versions is {1}".format(version_real, steps_versions))
+
+                steps_versions = steps_versions.replace(" ", "")
+                steps_versions = steps_versions[1:-1]
+                steps_versions_list = steps_versions.split(",")
+                minVersion = steps_versions_list[0]
+                maxVersion = steps_versions_list[1]
+                # min
+                if minVersion == "*":
+                    minVersion = "-1"
+                if maxVersion == "*":
+                    maxVersion = "999"
+                if compare_versions_greater(version_real, minVersion) and compare_versions_greater(maxVersion, version_real):
+                    break
+            else:
+                logger.info("not version in now_steps")
+                break
+            steps_nu = steps_nu + 1
+        if steps_nu > len(steps) - 1:
+            logger.warning("not version in this scene")
+            return -1
+        return steps_nu
+    except Exception as e:
+        logger.error("filter_by_version Exception : {0}".format(e))
+        raise Exception("filter_by_version Exception : {0}".format(e))
+
+def get_version(nodes, type):
+    try:
+        if len(nodes) < 1:
+            raise Exception("input nodes is empty, please check your config")
+        node = nodes[0]
+        ssh = SshHelper(True, node.get("ip"), node.get("user"), node.get("password"), node.get("port"), node.get("private_key"), node)
+        if type == "observer":
+            version = get_observer_version(True, ssh, nodes[0]["home_path"])
+        elif type == "obproxy":
+            version = get_obproxy_version(True, ssh, nodes[0]["home_path"])
+        else:
+            raise Exception("type is {0}. No func to get the version".format(type))
+        return version
+    except Exception as e:
+        logger.error("can't get version, Exception: {0}".format(e))
+        raise Exception("can't get version, Exception: {0}".format(e))
+
+def get_obproxy_and_ob_version(obproxy_nodes, nodes, type):
+    try:
+        if type == "observer" or type == "other":
+            if len(nodes) < 1:
+                raise Exception("input nodes is empty, please check your config")
+            node = nodes[0]
+            ssh = SshHelper(True, node.get("ip"), node.get("user"), node.get("password"), node.get("port"), node.get("private_key"), node)
+            version = get_observer_version(True, ssh, nodes[0]["home_path"])
+        elif type == "obproxy":
+            if len(obproxy_nodes) < 1:
+                raise Exception("input obproxy nodes is empty, please check your config")
+            node = obproxy_nodes[0]
+            ssh = SshHelper(True, node.get("ip"), node.get("user"), node.get("password"), node.get("port"), node.get("private_key"), node)
+            version = get_obproxy_version(True, ssh, node["home_path"])
+        else:
+            raise Exception("type is {0}. No func to get the version".format(type))
+        return version
+    except Exception as e:
+        logger.error("can't get version, Exception: {0}".format(e))
+        raise Exception("can't get version, Exception: {0}".format(e))
\ No newline at end of file
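`filter_by_version` in common/scene.py above picks the first step group whose `version` range covers the cluster version, treating `*` as an open bound, and returns -1 when no group matches. A small illustration of the matching rule (the data is made up but mirrors the shape of the task YAMLs in this patch):

```python
# Illustration only: version ranges are strings of the form "[min,max]".
from common.scene import filter_by_version

scene = [
    {"version": "[3.1.0,3.2.4]", "steps": []},
    {"version": "[4.0.0.0,*]", "steps": []},
]
cluster = {"version": "4.2.1.0"}

# 4.2.1.0 is outside [3.1.0,3.2.4] but inside [4.0.0.0,*], so index 1 is returned.
print(filter_by_version(scene, cluster))  # -> 1
```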
diff --git a/conf/inner_config.yml b/conf/inner_config.yml
index 9f5ccf06..b7bfb411 100644
--- a/conf/inner_config.yml
+++ b/conf/inner_config.yml
@@ -17,4 +17,8 @@ check:
   report_path: "./check_report/"
   export_type: table
   package_file: "~/.obdiag/check_package.yaml"
-  tasks_base_path: "~/.obdiag/tasks/"
\ No newline at end of file
+  tasks_base_path: "~/.obdiag/tasks/"
+gather:
+  scenes_base_path: "~/.obdiag/gather/tasks"
+rca:
+  result_path: "./rca/"
diff --git a/docs/check.md b/docs/check.md
index 17eb13b7..1d4d19e9 100644
--- a/docs/check.md
+++ b/docs/check.md
@@ -23,18 +23,18 @@ observer ->check_package.yaml
 obproxy ->obproxy_check_package.yaml
 
-Example1:
+Example:
 obdiag check --cases= ad
 obdiag check --obproxy_cases= proxy
 obdiag check --cases=ad --obproxy_cases=proxy
 ```
 
 ### 关联持久化参数:
-持久化参数主要是部分日常不会修改的参数,依赖于conf/config.yml
+持久化参数主要是部分日常不会修改的参数,依赖于conf/inner_config.yml
 
 若使用rpm方式进行安装,inner_config.yml位于
 ```shell script
-/user/local/oceanbase-diagnostic-tool/conf/config.yml
+/user/local/oceanbase-diagnostic-tool/conf/inner_config.yml
 ```
 
 check功能所关联的配置项在"CHECK"下,基本上的参数均无需变更或更改频率较低
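check.md 中提到的持久化配置即 conf/inner_config.yml 的 check 段,本次发布自带的默认值如下(与上面 inner_config.yml 补丁中的内容一致,仅作示意):

```yaml
check:
  report_path: "./check_report/"
  export_type: table
  package_file: "~/.obdiag/check_package.yaml"
  tasks_base_path: "~/.obdiag/tasks/"
```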
diff --git a/docs/gather_all.md b/docs/gather_all.md
index 55019c3e..a32a467d 100644
--- a/docs/gather_all.md
+++ b/docs/gather_all.md
@@ -74,6 +74,17 @@ Summary:
 | 192.168.2.12 | Completed | 42.152K | 6 s    | gather_pack_20220729170856/sysstat_192.168.2.12_20220729170856.zip |
 +----------------+-----------+---------+--------+----------------------------------------------------------------------+
 
+
+# observer当前的堆栈信息
+Summary:
++----------------+-----------+---------+--------+-----------------------------------------------------------------------+
+| Node           | Status    | Size    | Time   | PackPath                                                              |
++================+===========+=========+========+=======================================================================+
+| 192.168.2.11   | Completed | 22.693K | 13 s   | gather_pack_20220729170902/obstack2_192.168.2.11_20220729170902.zip   |
++----------------+-----------+---------+--------+-----------------------------------------------------------------------+
+| 192.168.2.12   | Completed | 19.902K | 13 s   | gather_pack_20220729170902/obstack2_192.168.2.12_20220729170902.zip   |
++----------------+-----------+---------+--------+-----------------------------------------------------------------------+
+
 Gather Perf Summary:
 +----------------+-----------+----------+--------+-------------------------------------------------------------------+
 | Node           | Status    | Size     | Time   | PackPath                                                          |
diff --git a/docs/gather_awr.md b/docs/gather_awr.md
index 3a44d4e2..ff5d59f3 100644
--- a/docs/gather_awr.md
+++ b/docs/gather_awr.md
@@ -23,7 +23,7 @@ optional arguments:
   --cluster_name cluster_name  cluster name.
 
-Example: obdiag gather awr --cluster_name demo1 --from 2022-06-16 18:25:00 --to 2022-06-16 18:30:00
+Example: obdiag gather awr --from 2022-06-16 18:25:00 --to 2022-06-16 18:30:00
 ```
 
@@ -33,6 +33,6 @@ Gather AWR Summary:
 +-----------+-----------+--------+--------+----------------------------------------------------------------------------------------+
 | Cluster   | Status    | Size   | Time   | PackPath                                                                               |
 +===========+===========+========+========+========================================================================================+
-| demo1     | Completed | 4.602M | 29 s   | gather_pack_20220627005659/OBAWR_obcluster_jingshun_20220625160100_20220625180100.html |
+| demo1     | Completed | 4.602M | 29 s   | gather_pack_20220627005659/OBAWR_obcluster_demo1_20220625160100_20220625180100.html    |
 +-----------+-----------+--------+--------+----------------------------------------------------------------------------------------+
 ```
\ No newline at end of file
diff --git a/docs/gather_ob_stack.md b/docs/gather_ob_stack.md
new file mode 100644
index 00000000..ebc0ddbb
--- /dev/null
+++ b/docs/gather_ob_stack.md
@@ -0,0 +1,23 @@
+## gather stack命令
+
+收集observer的堆栈信息
+```
+$ obdiag gather stack [-h]
+
+Example: obdiag gather stack
+```
+
+执行结果
+```shell script
+Example: obdiag gather stack
+
+Summary:
++----------------+-----------+---------+--------+-----------------------------------------------------------------------+
+| Node           | Status    | Size    | Time   | PackPath                                                              |
++================+===========+=========+========+=======================================================================+
+| 192.168.2.11   | Completed | 19.926K | 10 s   | gather_pack_20220729163951/obstack2_192.168.2.11_20220729163951.zip   |
++----------------+-----------+---------+--------+-----------------------------------------------------------------------+
+| 192.168.2.12   | Completed | 22.803K | 12 s   | gather_pack_20220729163951/obstack2_192.168.2.12_20220729163951.zip   |
++----------------+-----------+---------+--------+-----------------------------------------------------------------------+
+
+```
\ No newline at end of file
diff --git a/docs/gather_obproxy_log.md b/docs/gather_obproxy_log.md
index 8a535837..06548ea9 100644
--- a/docs/gather_obproxy_log.md
+++ b/docs/gather_obproxy_log.md
@@ -53,9 +53,9 @@ Gather ObProxy Log Summary:
 +----------------+-----------+----------+------------------+--------+--------------------------------------------------------------------------+
 | Node           | Status    | Size     | Password         | Time   | PackPath                                                                 |
 +================+===========+==========+==================+========+==========================================================================+
-| 192.168.2.11   | Completed | 36.762M  | **************** | 19 s   | gather_pack_20220701183246/obproxy_log_192.168.2.11_20220701183247.zip   |
+| 192.168.2.11   | Completed | 36.762M  | HYmVourcUyRNP8Om | 19 s   | gather_pack_20220701183246/obproxy_log_192.168.2.11_20220701183247.zip   |
 +----------------+-----------+----------+------------------+--------+--------------------------------------------------------------------------+
-| 192.168.2.12   | Completed | 638.200M | **************** | 718 s  | gather_pack_20220701183246/obproxy_log_192.168.2.12_20220701183918.zip   |
+| 192.168.2.12   | Completed | 638.200M | 1RicMaiLUUNfemnj | 718 s  | gather_pack_20220701183246/obproxy_log_192.168.2.12_20220701183918.zip   |
 +----------------+-----------+----------+------------------+--------+--------------------------------------------------------------------------+
 ```
 
diff --git a/docs/gather_scene.md b/docs/gather_scene.md
new file mode 100644
index 00000000..21c9adae
--- /dev/null
+++ b/docs/gather_scene.md
@@ -0,0 +1,177 @@
+## gather scenes 命令
+
+该命令可以一键将某些问题场景所需要的排查信息统一采集回来,解决分布式环境下各节点信息收集难的痛点
+
+## 查看当前支持的场景
+
+```shell script
+obdiag gather scene list
+```
+
+```bash
+obdiag gather scene list
+
+[Other Problem Gather Scenes]:
+------------------------------------------------------------------------------------------
+command info_en info_cn
+------------------------------------------------------------------------------------------
+obdiag gather scene run --scene=other.application_error [application error] [应用报错问题]
+------------------------------------------------------------------------------------------
+
+[Obproxy Problem Gather Scenes]:
+----------------------------------------------------------------------------------
+command info_en info_cn
+----------------------------------------------------------------------------------
+obdiag gather scene run --scene=obproxy.restart [obproxy restart] [obproxy无故重启]
+----------------------------------------------------------------------------------
+
+[Observer Problem Gather Scenes]:
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+command info_en info_cn
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+obdiag gather scene run --scene=observer.backup [backup problem] [数据备份问题]
+obdiag gather scene run --scene=observer.backup_clean [backup clean] [备份清理问题]
+obdiag gather scene run --scene=observer.clog_disk_full [clog disk full] [clog盘满]
+obdiag gather scene run --scene=observer.compaction [compaction] [合并问题]
+obdiag gather scene run --scene=observer.cpu_high [High CPU] [CPU高]
+obdiag gather scene run --scene=observer.delay_of_primary_and_backup [delay of primary and backup] [主备库延迟]
+obdiag gather scene run --scene=observer.log_archive [log archive] [日志归档问题]
+obdiag gather scene run --scene=observer.long_transaction [long transaction] [长事务]
+obdiag gather scene run --scene=observer.memory [memory problem] [内存问题]
+obdiag gather scene run --scene=observer.perf_sql --env "{db_connect: '-hxx -Pxx -uxx -pxx -Dxx', trace_id: 'xx'}" [SQL performance problem] [SQL性能问题]
+obdiag gather scene run --scene=observer.recovery [recovery] [数据恢复问题]
+obdiag gather scene run --scene=observer.restart [restart] [observer无故重启]
+obdiag gather scene run --scene=observer.rootservice_switch [rootservice switch] [有主改选或者无主选举的切主]
+obdiag gather scene run --scene=observer.sql_err --env "{db_connect: '-hxx -Pxx -uxx -pxx -Dxx', trace_id: 'xx'}" [SQL execution error] [SQL 执行出错]
+obdiag gather scene run --scene=observer.suspend_transaction [suspend transaction] [悬挂事务]
+obdiag gather scene run --scene=observer.unit_data_imbalance [unit data imbalance] [unit迁移/缩小 副本不均衡问题]
+obdiag gather scene run --scene=observer.unknown [unknown problem] [未能明确问题的场景]
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+```
+
+## 快速使用
+
+```shell script
+obdiag gather scene run --scene={SceneName}
+```
+### 关联动态可配参数:
+```shell script
+--scene={SceneName}
+
+SceneName 为需要执行采集的场景名称,可通过 obdiag gather scene list 获取
+
+
+Example:
+obdiag gather scene run --scene=observer.unknown
+```
+
+
+## task编写教程
+一个task表示一个独立的场景,可以理解为一个专业的,用yaml编写的,用obdiag识别的脚本文件。
+
+### 开始编写前
+编写前需要确定yaml需要放在哪
+
+可以先进入 ~/.obdiag/inner_config.yml 中 gather.scenes_base_path 所标识的目录,看下编写的采集场景是否属于已有的大类,若没有就创建一个文件夹用于声明这个大类
+
+例:
+
+
+```shell script
+#先进入${gather.scenes_base_path} ,并创建我们的示例文件test.yaml(以observer为测试目标)
+cd ~/.obdiag/gather/tasks/observer
+touch test.yaml
+```
+
+以上便完成了编写前的步骤
+
+
+### 开始编写
+开始编写就是开始编辑我们的test.yaml
+
+```yaml script
+# 首先需要声明下这个场景的作用,为了让大家看得懂
+
+info: "for test"
+```
+简单的内容已经结束,开始复杂的编写,注意细节
+
+#### task编写
+
+task的作用是声明场景采集执行的步骤,其基础结构是一个list
+
+为什么task是一个list?
+- 是为了兼容不同版本可能导致的步骤的不同
+
+task的一个元素的结构如下
+
+| 参数名 | 是否必填 | 说明 | 格式 |
+|---------|------|------|------|
+| version | 否 | 表示适用的版本,使用方式见下示例 | 用str的形式表示范围,需要完整的数字的版本号,3.x版本为三位,4.x版本为四位,如:[3.1.1,3.2.0],版本支持遵循左开右闭的原则 |
+| steps | 是 | 所执行步骤 | 为list结构 |
+
+如下就是一个示例
+
+```yaml script
+info: testinfo
+task:
+  - version: "[3.1.0,3.2.4]"
+    steps:
+      {steps_object}
+  - version: "[4.2.0.0,4.3.0.0]"
+    steps:
+      {steps_object}
+```
+steps又是一个list,用来表示具体的多个执行流程
+
+steps的一个元素的结构即单个流程,如下
+
+| 参数名 | 是否必填 | 说明 |
+|-----------|------|------|
+| type | 是 | 表示适用的执行类型,目前支持 ssh/sql/log/obproxy_log/sysstat,后续会持续增加支持的类型 |
+| {ssh/sql/log/obproxy_log/sysstat} | 是 | 根据所选的类型提供的参数,这块比较依赖代码里对执行类型的逻辑说明,本章节后续会对支持的类型进行详细的使用说明 |
+
+各种类型示例如下,"step:" 仅为一个标记,无实际作用
+
+
+##### ssh
+远程执行指令并获取对应的返回值
+```yaml
+step:
+  type: ssh
+  ssh: wc -l /proc/${task_OBServer_pid}/maps | awk '{print $1}'
+
+```
+##### sql
+执行sql并获取对应的值
+```yaml
+step:
+  type: sql
+  sql: select tenant_name from oceanbase.__all_tenant where tenant_id=${taskTenantId};
+```
+
+##### log
+收集observer的日志
+```yaml
+step:
+  type: log
+  grep: "" # 过滤字段
+```
+
+##### obproxy_log
+收集 obproxy 的日志
+```yaml
+step:
+  type: obproxy_log
+  grep: "" # 过滤字段
+```
+
+##### sysstat
+收集主机的信息
+```yaml
+step:
+  type: sysstat
+  sysstat: ""
+```
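+
+##### 完整示例
+
+将上述各类 step 组合起来,一个完整的 test.yaml 大致如下(其中的 SQL 与 grep 关键字仅为示意,请按实际排查场景替换):
+
+```yaml script
+info: "for test"
+task:
+  - version: "[4.0.0.0,*]"
+    steps:
+      - type: ssh
+        ssh: df -h
+      - type: sql
+        sql: select tenant_name from oceanbase.__all_tenant;
+      - type: log
+        grep: "ERROR"
+```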
\ No newline at end of file
diff --git a/docs/rca.md b/docs/rca.md
new file mode 100644
index 00000000..aeecfa36
--- /dev/null
+++ b/docs/rca.md
@@ -0,0 +1,30 @@
+## rca命令
+
+通过对一些已知问题的归纳分析,obdiag 将其沉淀为可一键执行的根因分析场景,帮助用户快速定位问题的根因
+
+
+## 快速使用
+
+```shell script
+obdiag rca run --scene={scene_name}
+obdiag rca list
+```
+### 关联动态可配参数:
+```shell script
+scene_name是需要执行的根因分析场景的名称,可以通过obdiag rca list获取
+
+```
+
+### 关联持久化参数:
+持久化参数主要是部分日常不会修改的参数,依赖于{obdiag安装目录}/conf/inner_config.yml
+
+若使用rpm方式进行安装,inner_config.yml位于
+```shell script
+/user/local/oceanbase-diagnostic-tool/conf/inner_config.yml
+```
+
+rca功能所关联的配置项在"rca"下,基本上的参数均无需变更或更改频率较低
+```yaml script
+rca:
+  result_path: "./rca/" # rca报告保存的地址
+```
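+
+### 执行示例
+
+先通过 list 查看当前支持的根因分析场景,再运行对应场景(场景名以 obdiag rca list 的实际输出为准,下面的 lock_conflict 仅为示意):
+
+```shell script
+obdiag rca list
+obdiag rca run --scene=lock_conflict
+```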
diff --git a/example/all-components.yaml b/example/all-components.yaml
index aeb39f58..b1ae9cab 100644
--- a/example/all-components.yaml
+++ b/example/all-components.yaml
@@ -1,6 +1,6 @@
 ocp:
   login:
-    url: http://192.168.1.100:8080
+    url: http://xx.xx.xx.xx:xx
     user: admin
     password: ''
 obcluster:
@@ -9,14 +9,14 @@ obcluster:
   db_port: 2881 # default 2881
   tenant_sys:
     user: root@sys # default root@sys
-    password: ''
+    password: ""
 servers:
   nodes:
     - ip: 192.168.1.1
     - ip: 192.168.1.2
     - ip: 192.168.1.3
   global:
-    ssh_username: admin # your username
+    ssh_username: '' # your username
     ssh_password: '' # password if need
     # ssh_port: 22 # your ssh port, default 22
     # ssh_key_file: "" # your ssh-key file path if need
diff --git a/handler/analyzer/analyze_flt_trace.py b/handler/analyzer/analyze_flt_trace.py
index 67bdbbb4..25e6667b 100644
--- a/handler/analyzer/analyze_flt_trace.py
+++ b/handler/analyzer/analyze_flt_trace.py
@@ -176,7 +176,7 @@ def __get_offline_log_file(self, ssh_helper, log_full_path, local_store_dir):
         """
         local_store_path = os.path.join(local_store_dir, self.flt_trace_id)
         log_name_list = self.__get_log_name_list_offline()
-        if self.flt_trace_id is not None:
+        if self.flt_trace_id is not None and (len(log_name_list) > 0):
             grep_cmd = "grep -e '{grep_args}' {log_file} > {local_store_path} ".format(
                 grep_args=self.flt_trace_id,
                 log_file=' '.join(log_name_list),
@@ -344,13 +344,12 @@ def __check_valid_and_parse_args(self, args):
             self.directly_analyze_files = True
             self.analyze_files_list = getattr(args, "files")
             self.is_ssh = False
+        # 2: store_dir must exist, else create directory.
         if getattr(args, "store_dir") is not None:
             if not os.path.exists(os.path.abspath(getattr(args, "store_dir"))):
-                logger.error("Error: args --store_dir [{0}] incorrect: No such directory."
-                             .format(os.path.abspath(getattr(args, "store_dir"))))
-                return False
-            else:
-                self.gather_pack_dir = os.path.abspath(getattr(args, "store_dir"))
+                logger.warn("args --store_dir [{0}]: No such directory. Creating it now.".format(os.path.abspath(getattr(args, "store_dir"))))
+                os.makedirs(os.path.abspath(getattr(args, "store_dir")))
+            self.gather_pack_dir = os.path.abspath(getattr(args, "store_dir"))
         if getattr(args, "flt_trace_id") is not None:
             self.flt_trace_id = getattr(args, "flt_trace_id")[0]
         if getattr(args, "top") is not None:
diff --git a/handler/analyzer/analyze_log.py b/handler/analyzer/analyze_log.py
index 419be702..0f756f7f 100644
--- a/handler/analyzer/analyze_log.py
+++ b/handler/analyzer/analyze_log.py
@@ -85,8 +85,7 @@ def handle_from_node(node):
             local_ip = get_localhost_inner_ip()
             node = self.nodes[0]
             node["ip"] = local_ip
-            for node in self.nodes:
-                handle_from_node(node)
+            handle_from_node(node)
         title, field_names, summary_list, summary_details_list = self.__get_overall_summary(analyze_tuples, self.directly_analyze_files)
         table = tabulate.tabulate(summary_list, headers=field_names, tablefmt="grid", showindex=False)
@@ -419,14 +418,12 @@ def __check_valid_and_parse_args(self, args):
                     seconds=parse_time_length_to_sec(args.since))).strftime('%Y-%m-%d %H:%M:%S')
         else:
             self.from_time_str = (now_time - datetime.timedelta(minutes=30)).strftime('%Y-%m-%d %H:%M:%S')
-        # store_dir must exist, else return "No such file or directory".
+        # 2: store_dir must exist, else create directory.
         if getattr(args, "store_dir") is not None:
             if not os.path.exists(os.path.abspath(getattr(args, "store_dir"))):
-                logger.error("Error: args --store_dir [{0}] incorrect: No such directory."
-                             .format(os.path.abspath(getattr(args, "store_dir"))))
-                return False
-            else:
-                self.gather_pack_dir = os.path.abspath(getattr(args, "store_dir"))
+                logger.warn("args --store_dir [{0}]: No such directory. Creating it now.".format(os.path.abspath(getattr(args, "store_dir"))))
+                os.makedirs(os.path.abspath(getattr(args, "store_dir")))
+            self.gather_pack_dir = os.path.abspath(getattr(args, "store_dir"))
 
         if getattr(args, "grep") is not None:
             self.grep_args = ' '.join(getattr(args, "grep"))
diff --git a/handler/checker/check_handler.py b/handler/checker/check_handler.py
index 0b27f26a..41fb0a05 100644
--- a/handler/checker/check_handler.py
+++ b/handler/checker/check_handler.py
@@ -23,7 +23,7 @@
 from handler.checker.check_exception import CheckException
 from handler.checker.check_report import TaskReport, CheckReport, CheckrReportException
 from handler.checker.check_task import TaskBase
-from common.command import get_observer_version, get_obproxy_version
+from common.scene import get_version
 import re
 from utils.utils import display_trace, node_cut_passwd_for_log
 from utils.yaml_utils import read_yaml_data
@@ -84,12 +84,7 @@ def __init__(self, ignore_version, cluster, nodes, export_report_path, export_re
         logger.info("tasks_base_path is " + self.tasks_base_path)
 
         # checker export_report_path
-        export_report_path = os.path.expanduser(export_report_path)
-        if not os.path.exists(export_report_path):
-            logger.warning("{0} not exists. mkdir it!".format(self.export_report_path))
-            os.mkdir(export_report_path)
-        self.export_report_path = export_report_path
-        logger.info("export_report_path is " + self.export_report_path)
+        self.export_report_path = export_report_path
 
     def handle(self, args):
         package_name = None
@@ -106,6 +101,14 @@ def handle(self, args):
             package_name = package_name[0]
         else:
             package_name = getattr(args, "cases")
+        if getattr(args, "report_path"):
+            self.export_report_path = getattr(args, "report_path")
+            logger.info("export_report_path change to " + self.export_report_path)
+        self.export_report_path = os.path.expanduser(self.export_report_path)
+        if not os.path.exists(self.export_report_path):
+            logger.warning("{0} not exists. mkdir it!".format(self.export_report_path))
+            os.mkdir(self.export_report_path)
+        logger.info("export_report_path is " + self.export_report_path)
         logger.info("package_name is {0}".format(package_name))
 
         # get package's by package_name
@@ -164,34 +167,19 @@ def execute_one(self, task_name):
             # Verify if the version is within a reasonable range
             report = TaskReport(task_name)
             if not self.ignore_version:
-                try:
-                    node = self.nodes[0]
-                    ssh = SshHelper(True, node.get("ip"),
-                                    node.get("user"),
-                                    node.get("password"),
-                                    node.get("port"),
-                                    node.get("private_key"),
-                                    node)
-
-                    if self.check_target_type == "observer":
-                        version = get_observer_version(True, ssh, self.nodes[0]["home_path"])
-                    elif self.check_target_type == "obproxy":
-                        version = get_obproxy_version(True, ssh, self.nodes[0]["home_path"])
-                    else:
-                        raise Exception(
-                            "check_target_type is {0} . 
No func to get the version".format(self.check_target_type)) + version = get_version(self.nodes, self.check_target_type) + if version: self.cluster["version"] = re.findall(r'\d+\.\d+\.\d+\.\d+', version)[0] logger.info("cluster.version is {0}".format(self.cluster["version"])) - except Exception as e: - logger.error("can't get version, Exception: {0}".format(e)) - raise Exception("can't get version, Exception: {0}".format(e)) + task = TaskBase(self.tasks[task_name]["task"], self.nodes, self.cluster, report) + logger.info("{0} execute!".format(task_name)) + task.execute() + logger.info("execute tasks end : {0}".format(task_name)) + return report + else: + logger.error("can't get version") else: logger.info("ignore version") - task = TaskBase(self.tasks[task_name]["task"], self.nodes, self.cluster, report) - logger.info("{0} execute!".format(task_name)) - task.execute() - logger.info("execute tasks end : {0}".format(task_name)) - return report except Exception as e: logger.error("execute_one Exception : {0}".format(e)) raise CheckException("execute_one Exception : {0}".format(e)) diff --git a/handler/checker/check_task.py b/handler/checker/check_task.py index 0907cbe2..77a873dc 100644 --- a/handler/checker/check_task.py +++ b/handler/checker/check_task.py @@ -21,7 +21,7 @@ StepExecuteFailException, StepResultFalseException, TaskException from handler.checker.step.stepbase import StepBase from utils.utils import node_cut_passwd_for_log -from utils.version_utils import compare_versions_greater +from common.scene import filter_by_version class TaskBase(object): @@ -37,7 +37,7 @@ def __init__(self, task, nodes, cluster, report, task_variable_dict=None): def execute(self): logger.info("task_base execute") - steps_nu = self.filter_by_version() + steps_nu = filter_by_version(self.task, self.cluster) if steps_nu < 0: logger.warning("Unadapted by version. SKIP") self.report.add("Unadapted by version. SKIP", "warning") @@ -55,7 +55,7 @@ def execute(self): if len(self.cluster)==0: raise Exception("cluster is not exist") step_run = StepBase(step, node, self.cluster, self.task_variable_dict) - logger.info("step nu: {0} initted, to execute".format(nu)) + logger.debug("step nu: {0} initted, to execute".format(nu)) step_run.execute(self.report) self.task_variable_dict = step_run.update_task_variable_dict() if "report_type" in step["result"] and step["result"]["report_type"] == "execution": @@ -74,47 +74,6 @@ def execute(self): logger.error("TaskBase execute Exception: {0}".format(e)) raise TaskException("TaskBase execute Exception: {0}".format(e)) - logger.info("step nu: {0} execute end ".format(nu)) + logger.debug("step nu: {0} execute end ".format(nu)) nu = nu + 1 - logger.info("task execute end") - - def filter_by_version(self): - try: - steps = self.task - steps_nu = 0 - # get observer version - if "version" not in self.cluster or self.cluster["version"] == "": - return steps_nu - for now_steps in steps: - # have version in task ? 
- if "version" in now_steps: - steps_versions = now_steps["version"] - if not isinstance(steps_versions, str): - raise TaskException("filter_by_version steps_version Exception : {0}".format("the type of version is not string")) - version_real = self.cluster["version"] - logger.info("version_int is {0} steps_versions is {1}".format(version_real, steps_versions)) - - steps_versions = steps_versions.replace(" ", "") - steps_versions = steps_versions[1:-1] - steps_versions_list = steps_versions.split(",") - minVersion = steps_versions_list[0] - maxVersion = steps_versions_list[1] - # min - if minVersion == "*": - minVersion = "-1" - if maxVersion == "*": - maxVersion = "999" - if compare_versions_greater(version_real, minVersion) and compare_versions_greater(maxVersion, - version_real): - break - else: - logger.info("not version in now_steps") - break - steps_nu = steps_nu + 1 - if steps_nu > len(steps) - 1: - logger.warning("not version in this task") - return -1 - return steps_nu - except Exception as e: - logger.error("filter_by_version Exception : {0}".format(e)) - raise TaskException("filter_by_version Exception : {0}".format(e)) + logger.info("task execute end") \ No newline at end of file diff --git a/handler/checker/step/get_system_parameter.py b/handler/checker/step/get_system_parameter.py index 19a5c7d3..d09d0188 100644 --- a/handler/checker/step/get_system_parameter.py +++ b/handler/checker/step/get_system_parameter.py @@ -52,7 +52,7 @@ def __init__(self, step, node, task_variable_dict): def get_parameter(self, parameter_name): try: - parameter_value = self.ssh_helper.ssh_exec_cmd("sysctl -n " + parameter_name) + parameter_value = self.ssh_helper.ssh_exec_cmd("sysctl -n " + parameter_name).strip() self.ssh_helper.ssh_close() except Exception as e: logger.warning( @@ -66,6 +66,18 @@ def execute(self): if "parameter" not in self.step: raise StepExecuteFailException("GetSystemParameterHandler execute parameter is not set") logger.info("GetSystemParameterHandler execute: {0}".format(self.step["parameter"])) + s = self.step["parameter"] + if '.' in s: + last_substring = s.rsplit('.', 1) + s = last_substring[len(last_substring) - 1] + else: + s = self.step["parameter"] + # SystemParameter exist? 
+ if self.ssh_helper.ssh_exec_cmd('find /proc/sys/ -name "{0}"'.format(s)) == "": + logger.warning("{0} is not exist".format(self.step["parameter"])) + if "result" in self.step and "set_value" in self.step["result"]: + self.task_variable_dict[self.step["result"]["set_value"]] = "" + return parameter_value = self.get_parameter(self.step["parameter"]) if "result" in self.step and "set_value" in self.step["result"]: diff --git a/handler/checker/step/sql.py b/handler/checker/step/sql.py index 5bd1828a..98b29bfe 100644 --- a/handler/checker/step/sql.py +++ b/handler/checker/step/sql.py @@ -65,7 +65,7 @@ def execute(self): data = data[0][0] if data is None: data = "" - logger.info("sql result:{0}".format(convert_to_number(data))) + logger.info("sql result:{0}".format(convert_to_number(str(data)))) if "result" in self.step and "set_value" in self.step["result"]: logger.info("sql execute update task_variable_dict: {0} = {1}".format(self.step["result"]["set_value"], convert_to_number(data))) self.task_variable_dict[self.step["result"]["set_value"]] = convert_to_number(data) diff --git a/handler/checker/step/stepbase.py b/handler/checker/step/stepbase.py index f7053e2f..fa3eccc3 100644 --- a/handler/checker/step/stepbase.py +++ b/handler/checker/step/stepbase.py @@ -25,6 +25,7 @@ from common.logger import logger import docker + class StepBase(object): def __init__(self, step, node, cluster, task_variable_dict): self.step = step @@ -34,17 +35,20 @@ def __init__(self, step, node, cluster, task_variable_dict): self.task_variable_dict = task_variable_dict def execute(self, report): - no_cluster_name_msg="(Please set ob_cluster_name or obproxy_cluster_name)" + no_cluster_name_msg = "(Please set ob_cluster_name or obproxy_cluster_name)" # execute and result try: # init task_variable_dict ## set remote_ip if "ip" in self.node: self.task_variable_dict["remote_ip"] = self.node["ip"] - elif "ssh_type" in self.node and self.node["ssh_type"]=="docker": + elif "ssh_type" in self.node and self.node["ssh_type"] == "docker": logger.debug("execute ssh_type is docker") - self.task_variable_dict["remote_ip"] = docker.from_env().containers.get(self.node["container_name"]).attrs['NetworkSettings']['Networks']['bridge']["IPAddress"] - self.task_variable_dict["remote_home_path"] = self.node["home_path"] + self.task_variable_dict["remote_ip"] = \ + docker.from_env().containers.get(self.node["container_name"]).attrs['NetworkSettings']['Networks'][ + 'bridge']["IPAddress"] + for key in self.node: + self.task_variable_dict["remote_{0}".format(key)]=self.node[key] if "type" not in self.step: raise StepExecuteFailException("Missing field :type") @@ -55,7 +59,7 @@ def execute(self, report): elif self.step["type"] == "sql": handler = StepSQLHandler(self.step, self.cluster, self.task_variable_dict) else: - raise StepExecuteFailException("the type not support: {0}" .format(self.step["type"])) + raise StepExecuteFailException("the type not support: {0}".format(self.step["type"])) logger.debug("task execute and result") handler.execute() except Exception as e: @@ -64,12 +68,18 @@ def execute(self, report): report.add("[cluster:{0}] {1}".format(self.cluster.get("ob_cluster_name") or self.cluster.get( "obproxy_cluster_name") or no_cluster_name_msg, e), "fail") else: - report.add("[{0}:{1}] {2}".format(self.node.get("ssh_type") or "", self.node.get("container_name") or self.task_variable_dict.get("remote_ip") or "",e), "fail") + report.add("[{0}:{1}] {2}".format(self.node.get("ssh_type") or "", + self.node.get("container_name") or 
self.task_variable_dict.get( + "remote_ip") or "", e), "fail") raise StepExecuteFailException("StepBase handler.execute fail {0}".format(e)) try: self.task_variable_dict = handler.update_step_variable_dict() logger.debug("self.task_variable_dict: {0}".format(self.task_variable_dict)) + if self.step["type"] == "get_system_parameter" and "result" in self.step and "set_value" in self.step[ + "result"] and self.task_variable_dict[self.step["result"]["set_value"]] == "": + return + if "result" in self.step: logger.debug("result execute ") result = CheckResult(self.step["result"], self.task_variable_dict) @@ -96,9 +106,11 @@ def execute(self, report): level = "warning" if self.step["type"] == "sql": report.add("[cluster:{0}] {1}".format(self.cluster.get("ob_cluster_name") or self.cluster.get( - "obproxy_cluster_name") or no_cluster_name_msg , resultException), level) + "obproxy_cluster_name") or no_cluster_name_msg, resultException), level) else: - report.add("[{0}:{1}] {2}".format(self.node.get("ssh_type") or "", self.node.get("container_name") or self.task_variable_dict.get("remote_ip") or "",resultException), level) + report.add("[{0}:{1}] {2}".format(self.node.get("ssh_type") or "", + self.node.get("container_name") or self.task_variable_dict.get( + "remote_ip") or "", resultException), level) if level == "critical": raise StepResultFailException(resultException) raise StepResultFalseException(resultException) @@ -108,9 +120,11 @@ def execute(self, report): logger.error("step_base ResultFailException:{0}".format(resultFailException)) if self.step["type"] == "sql": report.add("[cluster:{0}] {1}".format(self.cluster.get("ob_cluster_name") or self.cluster.get( - "obproxy_cluster_name") or no_cluster_name_msg , resultFailException), "fail") + "obproxy_cluster_name") or no_cluster_name_msg, resultFailException), "fail") else: - report.add("[{0}:{1}] {2}".format(self.node.get("ssh_type") or "", self.node.get("container_name") or self.task_variable_dict.get("remote_ip") or "",resultFailException), "fail") + report.add("[{0}:{1}] {2}".format(self.node.get("ssh_type") or "", + self.node.get("container_name") or self.task_variable_dict.get( + "remote_ip") or "", resultFailException), "fail") raise StepResultFailException(resultFailException) except Exception as e: diff --git a/handler/checker/tasks/observer/cluster/core_file_find.yaml b/handler/checker/tasks/observer/cluster/core_file_find.yaml index 508107cd..d8741e30 100644 --- a/handler/checker/tasks/observer/cluster/core_file_find.yaml +++ b/handler/checker/tasks/observer/cluster/core_file_find.yaml @@ -3,7 +3,7 @@ task: - version: "[4.0.0.0,*]" steps: - type: ssh - ssh: 'ls | grep "^core" | wc -l' + ssh: 'ls #{remote_home_path}| grep "^core" | wc -l' result: set_value: core_file_number verify_type: equal diff --git a/handler/checker/tasks/observer/cluster/data_path_settings.yaml b/handler/checker/tasks/observer/cluster/data_path_settings.yaml index 330323fc..a2570907 100644 --- a/handler/checker/tasks/observer/cluster/data_path_settings.yaml +++ b/handler/checker/tasks/observer/cluster/data_path_settings.yaml @@ -3,17 +3,21 @@ task: - version: "[4.0.0.0,*]" steps: - type: ssh - ssh: 'find #{remote_home_path} -name "sstable"' + ssh: 'find #{remote_data_dir} -name "sstable"' result: set_value: data_dir_path + verify: '[ -n "${data_dir_path}" ]' + err_msg: "data_dir_path is null . 
Please check your nodes.data_dir need absolute Path" - type: ssh ssh: "df -h #{data_dir_path} | grep '/' | awk '{print $6}'" result: set_value: data_dir_disk - type: ssh - ssh: 'find #{remote_home_path} -name "clog"' + ssh: 'find #{remote_redo_dir} -name "clog"' result: set_value: log_dir_path + verify: '[ -n "${log_dir_path}" ]' + err_msg: "log_dir_path is null . Please check your nodes.redo_dir need absolute Path" - type: ssh ssh: "df -h #{log_dir_path} | grep '/' | awk '{print $6}'" result: @@ -55,12 +59,3 @@ task: verify: '[ "${file_system}" == "XFS" ] || [ "${file_system}" == "ext4" ]' err_msg: "ip:#{remote_ip} ,file_system is not XFS or ext4." - - - - - - - - - diff --git a/handler/checker/tasks/observer/cluster/ob_enable_plan_cache_bad_version.yaml b/handler/checker/tasks/observer/cluster/ob_enable_plan_cache_bad_version.yaml new file mode 100644 index 00000000..13b957e8 --- /dev/null +++ b/handler/checker/tasks/observer/cluster/ob_enable_plan_cache_bad_version.yaml @@ -0,0 +1,11 @@ +info: "There are too many table histories for a tenant in the cluster, and when the machine restarts, the schema refresh will continue to report -4013, resulting in the inability to refresh the corresponding tenant's schema for a particular machine." +task: + - version: "[4.1.0.0,4.1.0.1]" + steps: + - type: sql + sql: 'select name from oceanbase.__all_virtual_tenant_parameter_stat where name like "%ob_enable_plan_cache%" and value like "%true%";' + result: + set_value: ob_enable_plan_cache + verify: '[ -z "ob_enable_plan_cache" ]' + err_msg: 'On this version, ob_enable_plan_cache suggestion to close' + diff --git a/handler/checker/tasks/observer/cluster/observer_not_active.yaml b/handler/checker/tasks/observer/cluster/observer_not_active.yaml new file mode 100644 index 00000000..c1ede6c8 --- /dev/null +++ b/handler/checker/tasks/observer/cluster/observer_not_active.yaml @@ -0,0 +1,15 @@ +info: 'Check whether there is any observer not in the ACTIVE state.' +task: + - version: "[4.0.0.0,*]" + steps: + - type: sql + sql: 'select count(0) from oceanbase.DBA_OB_SERVERS where STATUS<>"ACTIVE";' + result: + set_value: not_ACTIVE + verify_type: equal + verify: 0 + err_msg: 'There is #{not_ACTIVE} not_ACTIVE observer, please check as soon as possible.' + + + + diff --git a/handler/checker/tasks/observer/cluster/optimizer_better_inlist_costing_parmmeter.yaml b/handler/checker/tasks/observer/cluster/optimizer_better_inlist_costing_parmmeter.yaml new file mode 100644 index 00000000..804bbb80 --- /dev/null +++ b/handler/checker/tasks/observer/cluster/optimizer_better_inlist_costing_parmmeter.yaml @@ -0,0 +1,20 @@ +info: 'Check if the tag parameter for a specific version is enabled.' +task: + - version: "[4.1.0.0,4.1.0.2]" + steps: + - type: sql + sql: 'select name from oceanbase.__all_virtual_tenant_parameter_stat where name like "%_optimizer_better_inlist_costing%" and value like "%true%";' + result: + set_value: optimizer_better_inlist_costing + verify: '[ -z "$optimizer_better_inlist_costing" ]' + err_msg: '_optimizer_better_inlist_costing need close. Triggering this issue can lead to correctness issues, causing random errors or core issues.' + - version: "[4.2.0.0,4.2.0.0]" + steps: + - type: sql + sql: 'select name from oceanbase.__all_virtual_tenant_parameter_stat where name like "%_optimizer_better_inlist_costing%" and value like "%true%";' + result: + set_value: optimizer_better_inlist_costing + verify: '[ -z "$optimizer_better_inlist_costing" ]' + err_msg: '_optimizer_better_inlist_costing need close. 
Triggering this issue can lead to correctness issues, causing random errors or core issues.'
+
+
diff --git a/handler/checker/tasks/observer/cluster/table_history_too_many.yaml b/handler/checker/tasks/observer/cluster/table_history_too_many.yaml
new file mode 100644
index 00000000..2075e4e4
--- /dev/null
+++ b/handler/checker/tasks/observer/cluster/table_history_too_many.yaml
@@ -0,0 +1,12 @@
+info: "There are too many table histories for a tenant in the cluster, and when the machine restarts, the schema refresh will continue to report -4013, resulting in the inability to refresh the corresponding tenant's schema for a particular machine."
+task:
+  - version: "[4.1.0.1,4.1.0.2]"
+    steps:
+      - type: sql
+        sql: 'select table_name from oceanbase.__all_virtual_table_history group by 1 having count(*) > 4000000;'
+        result:
+          set_value: table_name
+          verify: '[ -z "$table_name" ]'
+          err_msg: "There are too many table histories for a tenant in the cluster, and when the machine restarts, the schema refresh will continue to report -4013, resulting in the inability to refresh the corresponding tenant's schema for a particular machine."
+
+
diff --git a/handler/checker/tasks/observer/cpu/oversold.yaml b/handler/checker/tasks/observer/cpu/oversold.yaml
index 05244ba3..5ada0f80 100644
--- a/handler/checker/tasks/observer/cpu/oversold.yaml
+++ b/handler/checker/tasks/observer/cpu/oversold.yaml
@@ -1,13 +1,11 @@
-info: 'Check whether there is any observer not in the ACTIVE state.'
+info: 'Check whether any observer has oversold CPU.'
 task:
   - version: "[4.0.0.0,*]"
     steps:
       - type: sql
-        sql: 'select count(0) from oceanbase.DBA_OB_SERVERS where STATUS<>"ACTIVE";'
+        sql: "SELECT GROUP_CONCAT(CONCAT(SVR_IP, ':', SVR_PORT) SEPARATOR ', ') AS IP_PORT_COMBINATIONS FROM oceanbase.GV$OB_SERVERS WHERE CPU_ASSIGNED > CPU_CAPACITY;"
         result:
-          set_value: not_ACTIVE
-          verify_type: equal
-          verify: 0
-          err_msg: 'There is #{not_ACTIVE} not_ACTIVE observer, please check as soon as possible.'
-
-
+          set_value: CPU_oversold
+          verify: '[ -z "$CPU_oversold" ]'
+          report_type: warning
+          err_msg: 'Some observers have oversold CPU: #{CPU_oversold}'
diff --git a/handler/checker/tasks/observer/disk/clog_abnormal_file.yaml b/handler/checker/tasks/observer/disk/clog_abnormal_file.yaml
index f08e736c..003160e6 100644
--- a/handler/checker/tasks/observer/disk/clog_abnormal_file.yaml
+++ b/handler/checker/tasks/observer/disk/clog_abnormal_file.yaml
@@ -4,9 +4,11 @@ task:
   - version: "[4.0.0.0,*]"
     steps:
       - type: ssh
-        ssh: 'find #{remote_home_path} -name "clog"'
+        ssh: 'find #{remote_redo_dir} -name "clog"'
         result:
           set_value: log_dir_path
+          verify: '[ -n "${log_dir_path}" ]'
+          err_msg: "log_dir_path is null. Please check that nodes.redo_dir is an absolute path"
       - type: ssh
         ssh: "find #{log_dir_path} -type f -name '*[^0-9]*' ! -name '*.tmp' ! -name '*.flashback' ! -name 'meta'"
         result:
diff --git a/handler/checker/tasks/observer/disk/sstable_abnormal_file.yaml b/handler/checker/tasks/observer/disk/sstable_abnormal_file.yaml
new file mode 100644
index 00000000..ea0d03db
--- /dev/null
+++ b/handler/checker/tasks/observer/disk/sstable_abnormal_file.yaml
@@ -0,0 +1,18 @@
+
+info: "Check if there are files in the sstable folder that do not belong to the observer"
+task:
+  - version: "[4.0.0.0,*]"
+    steps:
+      - type: ssh
+        ssh: 'find #{remote_data_dir} -name "sstable"'
+        result:
+          set_value: sstable_dir_path
+          verify: '[ -n "${sstable_dir_path}" ]'
+          err_msg: "sstable_dir_path is null. 
diff --git a/handler/checker/tasks/observer/disk/sstable_abnormal_file.yaml b/handler/checker/tasks/observer/disk/sstable_abnormal_file.yaml new file mode 100644 index 00000000..ea0d03db --- /dev/null +++ b/handler/checker/tasks/observer/disk/sstable_abnormal_file.yaml @@ -0,0 +1,18 @@ + +info: "Check if there are files in the sstable folder that do not belong to the observer" +task: + - version: "[4.0.0.0,*]" + steps: + - type: ssh + ssh: 'find #{remote_data_dir} -name "sstable"' + result: + set_value: sstable_dir_path + verify: '[ -n "${sstable_dir_path}" ]' + err_msg: "sstable_dir_path is null. Please check that nodes.data_dir is an absolute path" + - type: ssh + ssh: "find #{sstable_dir_path} -type f ! -name block_file" + result: + set_value: files + verify: '[ -z "${files}" ]' + err_msg: "Users are not allowed to modify or create files in this directory; doing so can cause observer startup failure. Files that need to be checked: #{files}" + diff --git a/handler/checker/tasks/observer/system/dependent_software_swapon.yaml b/handler/checker/tasks/observer/system/dependent_software_swapon.yaml new file mode 100644 index 00000000..16520ced --- /dev/null +++ b/handler/checker/tasks/observer/system/dependent_software_swapon.yaml @@ -0,0 +1,17 @@ +info: 'To detect dependent software, refer to: https://www.oceanbase.com/docs/enterprise-oceanbase-ocp-cn-1000000000125643' +task: + - steps: + - type: ssh + ssh: 'if command -v swapon &>/dev/null; then echo "exist"; fi' + result: + set_value: swapon_exist + report_type: execution + verify: '[ "exist" != "${swapon_exist}" ]' + err_msg: 'Not a problem by itself: swapon exists, so the swap configuration will be checked next.' + - type: ssh + ssh: 'swapon --summary | grep -q "^" && echo "used" || echo "not used"' + result: + set_value: swapon_switch + report_type: warning + verify: '[ "not used" == "${swapon_switch}" ]' + err_msg: 'swap needs to be disabled. Currently it is #{swapon_switch}.' \ No newline at end of file diff --git a/handler/checker/tasks/observer/system/parameter.yaml b/handler/checker/tasks/observer/system/parameter.yaml index bf9af594..34323636 100644 --- a/handler/checker/tasks/observer/system/parameter.yaml +++ b/handler/checker/tasks/observer/system/parameter.yaml @@ -141,7 +141,7 @@ task: set_value: parameter report_type: warning verify: "[ 327680 -le ${parameter} ] && [ ${parameter} -le 1000000 ]" - err_msg: 'vm.max_map_count : #{parameter} , which is not recommended. Set it within the range of 327680 ≤ value ≤ 1000000 ' + err_msg: 'vm.max_map_count : #{parameter} , which is not recommended. An unreasonable vm.max_map_count configuration may cause serious memory leaks. 
Set it within the range of 327680 ≤ value ≤ 1000000 ' - type: get_system_parameter parameter: vm.overcommit_memory result: @@ -191,81 +191,5 @@ task: report_type: warning verify: "[ 0 -eq ${parameter} ]" err_msg: 'fs.pipe-user-pages-soft : #{parameter} is a non recommended value, recommended value is 0' - - type: get_system_parameter - parameter: net.ipv4.tcp_rmem - result: - set_value: tcp_rmem - - type: ssh - ssh: "echo \"#{tcp_rmem}\" | awk '{print $1}'" - result: - set_value: tcp_rmem_min - report_type: warning - verify_type: between - verify: "[4096,8192]" - err_msg: 'net.ipv4.tcp_rmem_min : #{tcp_rmem_min} is a non recommended value, recommended value is 4096 ≤ min ≤ 8192' - - type: ssh - ssh: "echo \"#{tcp_rmem}\" | awk '{print $2}'" - result: - set_value: tcp_rmem_default - report_type: warning - verify_type: between - verify: "[65536,131072]" - err_msg: 'net.ipv4.tcp_rmem_default : #{tcp_rmem_default} is a non recommended value, recommended value :is 65536 ≤ default≤ 131072' - - type: ssh - ssh: "echo \"#{tcp_rmem}\" | awk '{print $3}'" - result: - set_value: tcp_rmem_max - report_type: warning - verify_type: between - verify: "[8388608,16777216]" - err_msg: 'net.ipv4.tcp_rmem_max : #{tcp_rmem_max} is a non recommended value, recommended value is 65536 ≤ max≤ 131072' - - type: get_system_parameter - parameter: net.ipv4.tcp_wmem - result: - set_value: tcp_rmem - - type: ssh - ssh: "echo \"#{tcp_rmem}\" | awk '{print $1}'" - result: - set_value: tcp_rmem_min - report_type: warning - verify_type: between - verify: "[4096,8192]" - err_msg: 'net.ipv4.tcp_wmem_min : #{tcp_rmem_min} is a non recommended value, recommended value is 4096 ≤ min ≤ 8192' - - type: ssh - ssh: "echo \"#{tcp_rmem}\" | awk '{print $2}'" - result: - set_value: tcp_rmem_default - report_type: warning - verify_type: between - verify: "[65536,131072]" - err_msg: 'net.ipv4.tcp_wmem_default : #{tcp_rmem_default} is a non recommended value, recommended value :is 65536 ≤ default≤ 131072' - - type: ssh - ssh: "echo \"#{tcp_rmem}\" | awk '{print $3}'" - result: - set_value: tcp_rmem_max - report_type: warning - verify_type: between - verify: "[8388608,16777216]" - err_msg: 'net.ipv4.tcp_wmem_max : #{tcp_rmem_max} is a non recommended value, recommended value is 65536 ≤ max≤ 131072' - - type: get_system_parameter - parameter: net.ipv4.ip_local_port_range - result: - set_value: ip_local_port_range - - type: ssh - ssh: "echo \"#{ip_local_port_range}\" | awk '{print $1}'" - result: - set_value: ip_local_port_range_min - report_type: warning - verify_type: equal - verify: 3500 - err_msg: 'ip_local_port_range_min : #{tcp_rmem_min} is a non recommended value, recommended value is 3500' - - type: ssh - ssh: "echo \"#{tcp_rmem}\" | awk '{print $2}'" - result: - set_value: ip_local_port_range_max - report_type: warning - verify_type: equal - verify: 65535 - err_msg: 'ip_local_port_range_max : #{ip_local_port_range_max} is a non recommended value, recommended value is 65535' diff --git a/handler/checker/tasks/observer/system/parameter_ip_local_port_range.yaml b/handler/checker/tasks/observer/system/parameter_ip_local_port_range.yaml new file mode 100644 index 00000000..9a83b86a --- /dev/null +++ b/handler/checker/tasks/observer/system/parameter_ip_local_port_range.yaml @@ -0,0 +1,27 @@ + +info: | + 'To detect kernel parameters, refer to: https://www.oceanbase.com/docs/enterprise-oceanbase-ocp-cn-1000000000125643' +task: + - steps: + - type: get_system_parameter + parameter: net.ipv4.ip_local_port_range + result: + set_value: 
ip_local_port_range + verify: '[[ -n "$ip_local_port_range" && "$ip_local_port_range" != "-1" ]]' + err_msg: "ip_local_port_range is #{ip_local_port_range}. Please check net.ipv4.ip_local_port_range on your node" + - type: ssh + ssh: "echo \"#{ip_local_port_range}\" | awk '{print $1}'" + result: + set_value: ip_local_port_range_min + report_type: warning + verify_type: equal + verify: 3500 + err_msg: 'ip_local_port_range_min : #{ip_local_port_range_min} is a non recommended value, recommended value is 3500' + - type: ssh + ssh: "echo \"#{ip_local_port_range}\" | awk '{print $2}'" + result: + set_value: ip_local_port_range_max + report_type: warning + verify_type: equal + verify: 65535 + err_msg: 'ip_local_port_range_max : #{ip_local_port_range_max} is a non recommended value, recommended value is 65535' \ No newline at end of file diff --git a/handler/checker/tasks/observer/system/parameter_tcp_rmem.yaml b/handler/checker/tasks/observer/system/parameter_tcp_rmem.yaml new file mode 100644 index 00000000..2cd140cd --- /dev/null +++ b/handler/checker/tasks/observer/system/parameter_tcp_rmem.yaml @@ -0,0 +1,35 @@ + +info: | + 'To detect kernel parameters, refer to: https://www.oceanbase.com/docs/enterprise-oceanbase-ocp-cn-1000000000125643' +task: + - steps: + - type: get_system_parameter + parameter: net.ipv4.tcp_rmem + result: + set_value: tcp_rmem + verify: '[[ -n "$tcp_rmem" && "$tcp_rmem" != "-1" ]]' + err_msg: "net.ipv4.tcp_rmem is #{tcp_rmem}. Please check net.ipv4.tcp_rmem on your node" + - type: ssh + ssh: "echo \"#{tcp_rmem}\" | awk '{print $1}'" + result: + set_value: tcp_rmem_min + report_type: warning + verify_type: between + verify: "[4096,8192]" + err_msg: 'net.ipv4.tcp_rmem_min : #{tcp_rmem_min} is a non recommended value, recommended value is 4096 ≤ min ≤ 8192' + - type: ssh + ssh: "echo \"#{tcp_rmem}\" | awk '{print $2}'" + result: + set_value: tcp_rmem_default + report_type: warning + verify_type: between + verify: "[65536,131072]" + err_msg: 'net.ipv4.tcp_rmem_default : #{tcp_rmem_default} (from net.ipv4.tcp_rmem) is a non recommended value, recommended value is 65536 ≤ default ≤ 131072' + - type: ssh + ssh: "echo \"#{tcp_rmem}\" | awk '{print $3}'" + result: + set_value: tcp_rmem_max + report_type: warning + verify_type: between + verify: "[8388608,16777216]" + err_msg: 'net.ipv4.tcp_rmem_max : #{tcp_rmem_max} (from net.ipv4.tcp_rmem) is a non recommended value, recommended value is 8388608 ≤ max ≤ 16777216'
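The `verify_type: between` steps above pass their bounds as a string such as "[4096,8192]". A small sketch of the assumed semantics (inclusive bounds parsed from the bracketed pair; an assumption about the checker's behavior, not its actual implementation):

```python
# Sketch of `verify_type: between` semantics, for illustration only.
def between(value, bounds):
    low, high = (int(part) for part in bounds.strip("[]").split(","))
    return low <= int(value) <= high

assert between("4096", "[4096,8192]")
assert not between("131073", "[65536,131072]")
# The corrected tcp_rmem_max message now agrees with its own bounds:
assert between("16777216", "[8388608,16777216]")
```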
diff --git a/handler/checker/tasks/observer/system/parameter_tcp_wmem.yaml b/handler/checker/tasks/observer/system/parameter_tcp_wmem.yaml new file mode 100644 index 00000000..76eadaa9 --- /dev/null +++ b/handler/checker/tasks/observer/system/parameter_tcp_wmem.yaml @@ -0,0 +1,33 @@ + +info: | + 'To detect kernel parameters, refer to: https://www.oceanbase.com/docs/enterprise-oceanbase-ocp-cn-1000000000125643' +task: + - steps: + - type: get_system_parameter + parameter: net.ipv4.tcp_wmem + result: + set_value: tcp_wmem + - type: ssh + ssh: "echo \"#{tcp_wmem}\" | awk '{print $1}'" + result: + set_value: tcp_wmem_min + report_type: warning + verify_type: between + verify: "[4096,8192]" + err_msg: 'net.ipv4.tcp_wmem_min : #{tcp_wmem_min} is a non recommended value, recommended value is 4096 ≤ min ≤ 8192' + - type: ssh + ssh: "echo \"#{tcp_wmem}\" | awk '{print $2}'" + result: + set_value: tcp_wmem_default + report_type: warning + verify_type: between + verify: "[65536,131072]" + err_msg: 'net.ipv4.tcp_wmem_default : #{tcp_wmem_default} is a non recommended value, recommended value is 65536 ≤ default ≤ 131072' + - type: ssh + ssh: "echo \"#{tcp_wmem}\" | awk '{print $3}'" + result: + set_value: tcp_wmem_max + report_type: warning + verify_type: between + verify: "[8388608,16777216]" + err_msg: 'net.ipv4.tcp_wmem_max : #{tcp_wmem_max} is a non recommended value, recommended value is 8388608 ≤ max ≤ 16777216' diff --git a/handler/checker/tasks/observer/version/old_version.yaml b/handler/checker/tasks/observer/version/old_version.yaml new file mode 100644 index 00000000..bcbc9b59 --- /dev/null +++ b/handler/checker/tasks/observer/version/old_version.yaml @@ -0,0 +1,13 @@ +info: 'Check the observer version. Some observer versions are no longer recommended.' +task: + - steps: + - type: ssh + ssh: 'export LD_LIBRARY_PATH=#{remote_home_path}/lib && #{remote_home_path}/bin/observer --version 2>&1 | grep "(OceanBase"' + result: + set_value: observer_version + verify: '[[ ! "#{observer_version}" == *"3.1."* ]] || [[ ! "#{observer_version}" == *"CE"* ]]' + err_msg: 'Starting from June 30, 2024, OceanBase Database Community Edition V3.1.x will no longer receive any bug fixes or version updates. Please upgrade as soon as possible' + + + +
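Each task above is gated by a `version` interval such as "[4.1.0.0,4.1.0.2]" or "[4.0.0.0,*]". A sketch of how such an interval can be matched against the cluster version follows; this is an assumption about the semantics for illustration, not the checker's actual version filter:

```python
# Sketch: match a dotted version against the "[low,high]" interval syntax,
# where "*" means no upper bound. Assumed semantics only.
def version_tuple(version):
    return tuple(int(part) for part in version.split("."))

def version_in_range(version, interval):
    low, high = interval.strip("[]").split(",")
    if version_tuple(version) < version_tuple(low):
        return False
    return high == "*" or version_tuple(version) <= version_tuple(high)

assert version_in_range("4.1.0.1", "[4.1.0.0,4.1.0.2]")
assert not version_in_range("4.2.0.0", "[4.1.0.0,4.1.0.2]")
assert version_in_range("4.3.0.0", "[4.0.0.0,*]")
```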
diff --git a/handler/gather/gather_awr.py b/handler/gather/gather_awr.py index cc391dae..92b7edf6 100644 --- a/handler/gather/gather_awr.py +++ b/handler/gather/gather_awr.py @@ -208,29 +208,6 @@ def __check_valid_and_parse_args(self, args): :param args: command args :return: boolean. True if valid, False if invalid. """ - if getattr(args, "cluster_name") is not None: - # 1: cluster_name must be must be provided, if not be valid - try: - self.cluster_name = getattr(args, "cluster_name")[0] - except OBDIAGArgsNotFoundException: - logger.error("Error: cluster_name must be must be provided") - return False - - try: - ocp_base_init = ocp_base.OcpBase(self.ocp_url, self.ocp_user, self.ocp_password) - ocp_base_init.check_ocp_site() - except Exception as e: - raise Exception("check login ocp failed, please check whether conf/config.yml is set correctly" .format(e)) - - # 2. get cluster id from ocp - try: - self.ob_cluster = ocp_cluster.ObCluster(self.ocp_url, self.auth, None) - self.cluster_id = self.ob_cluster.get_cluster_id_by_name(getattr(args, "cluster_name")) - except Exception as e: - logger.error("get cluster id from ocp failed, Exception:{0}, please check cluster_name".format(e)) - return False - # 3: to timestamp must be larger than from timestamp, otherwise be valid if getattr(args, "from") is not None and getattr(args, "to") is not None: try: self.from_time_str = getattr(args, "from") @@ -245,7 +222,7 @@ def __check_valid_and_parse_args(self, args): logger.error("Error: from datetime is larger than to datetime, please check.") return False elif (getattr(args, "from") is None or getattr(args, "to") is None) and args.since is not None: - # 3: the format of since must be 'n' + # the format of since must be 'n' try: since_to_seconds = parse_time_length_to_sec(args.since) except ValueError: @@ -257,8 +234,13 @@ def __check_valid_and_parse_args(self, args): since_to_seconds = 3600 self.from_time_str = (now_time - datetime.timedelta(seconds=since_to_seconds)).strftime('%Y-%m-%d %H:%M:%S') else: - logger.error( - "Invalid args, you need input since or from and to datetime, args={0}".format(args)) + logger.error("Invalid args: you need to input either since, or both from and to datetime. args={0}".format(args)) + # store_dir must exist, else create directory. + if getattr(args, "store_dir") is not None: + if not os.path.exists(os.path.abspath(getattr(args, "store_dir"))): + logger.warn("Error: args --store_dir [{0}] incorrect: No such directory, Now create it".format(os.path.abspath(getattr(args, "store_dir")))) + os.makedirs(os.path.abspath(getattr(args, "store_dir"))) + self.gather_pack_dir = os.path.abspath(getattr(args, "store_dir")) return True @staticmethod
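The `since` branch above relies on parse_time_length_to_sec from utils/time_utils.py. A hypothetical re-implementation for illustration, assuming "since" strings of the form `<n><unit>`; the real helper may accept more formats:

```python
# Hypothetical sketch of parse_time_length_to_sec (the real helper lives in
# utils/time_utils.py): "30m", "2h" or "1d" become a second count.
def parse_time_length_to_sec(since):
    units = {"s": 1, "m": 60, "h": 3600, "d": 86400}
    unit = since[-1].lower()
    if unit not in units or not since[:-1].isdigit():
        raise ValueError("invalid time length: {0}".format(since))
    return int(since[:-1]) * units[unit]

assert parse_time_length_to_sec("30m") == 1800
assert parse_time_length_to_sec("2h") == 7200
```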
diff --git a/handler/gather/gather_log.py b/handler/gather/gather_log.py index 535d11ce..6dd24fbe 100644 --- a/handler/gather/gather_log.py +++ b/handler/gather/gather_log.py @@ -41,7 +41,7 @@ class GatherLogHandler(BaseShellHandler): - def __init__(self, nodes, gather_pack_dir, gather_timestamp, common_config): + def __init__(self, nodes, gather_pack_dir, gather_timestamp=None, common_config=None, is_scene=False): super(GatherLogHandler, self).__init__(nodes) self.is_ssh = True self.gather_timestamp = gather_timestamp @@ -53,10 +53,11 @@ def __init__(self, nodes, gather_pack_dir, gather_timestamp, common_config): self.grep_args = None self.scope = None self.zip_encrypt = False + self.is_scene = is_scene self.config_path = const.DEFAULT_CONFIG_PATH if common_config is None: self.file_number_limit = 20 - self.file_size_limit = 2 * 1024 * 1024 + self.file_size_limit = 2 * 1024 * 1024 * 1024 else: self.file_number_limit = int(common_config["file_number_limit"]) self.file_size_limit = int(parse_size(common_config["file_size_limit"])) @@ -66,9 +67,11 @@ def handle(self, args): if not self.__check_valid_and_parse_args(args): return # example of the format of pack dir for this command: {gather_pack_dir}/gather_pack_20190610123344 - pack_dir_this_command = os.path.join(self.gather_pack_dir, - "gather_pack_{0}".format(timestamp_to_filename_time( - self.gather_timestamp))) + if self.is_scene: + pack_dir_this_command = self.gather_pack_dir + else: + pack_dir_this_command = os.path.join(self.gather_pack_dir, "gather_pack_{0}".format(timestamp_to_filename_time(self.gather_timestamp))) + mkdir_if_not_exist(pack_dir_this_command) logger.info("Use {0} as pack dir.".format(pack_dir_this_command)) gather_tuples = [] gather_pack_path_dict = {} @@ -126,7 +129,6 @@ def __handle_from_node(self, args, pack_dir_this_command, node): local_store_dir = "{0}/docker_{1}".format(pack_dir_this_command, node["container_name"]) else: local_store_dir = "{0}/{1}".format(pack_dir_this_command, remote_ip) - mkdir_if_not_exist(local_store_dir) try: ssh = SshHelper(self.is_ssh, remote_ip, remote_user, remote_password, remote_port, remote_private_key, node) except Exception as e: @@ -175,8 +177,8 @@ def __handle_log_list(self, ssh, node, resp): return log_list, resp elif len(log_list) <= 0: logger.warn( - "{0} The number of log files is {1}, No files found, " - "Please adjust the query limit".format(ip, len(log_list))) + "{0} The number of log files is {1}. The time range for file gathering is from {2} to {3}, and no eligible files were found. " + "Please adjust the query time range".format(ip, len(log_list), self.from_time_str, self.to_time_str)) resp["skip"] = True, resp["error"] = "No files found" return log_list, resp @@ -282,20 +284,18 @@ def __check_valid_and_parse_args(self, args): seconds=parse_time_length_to_sec(args.since))).strftime('%Y-%m-%d %H:%M:%S') else: self.from_time_str = (now_time - datetime.timedelta(minutes=30)).strftime('%Y-%m-%d %H:%M:%S') - # 2: store_dir must exist, else return "No such file or directory". + # 2: store_dir must exist, else create directory. if getattr(args, "store_dir") is not None: if not os.path.exists(os.path.abspath(getattr(args, "store_dir"))): - logger.error("Error: args --store_dir [{0}] incorrect: No such directory." .format(os.path.abspath(getattr(args, "store_dir")))) - return False - else: - self.gather_pack_dir = os.path.abspath(getattr(args, "store_dir")) + logger.warn("Error: args --store_dir [{0}] incorrect: No such directory, Now create it".format(os.path.abspath(getattr(args, "store_dir")))) + os.makedirs(os.path.abspath(getattr(args, "store_dir"))) + self.gather_pack_dir = os.path.abspath(getattr(args, "store_dir")) - if getattr(args, "grep") is not None: - self.grep_args = ' '.join(getattr(args, "grep")) - if getattr(args, "scope") is not None: + if hasattr(args, "grep") and args.grep is not None: + self.grep_args = getattr(args, "grep") + if hasattr(args, "scope") and args.scope is not None: self.scope = getattr(args, "scope")[0] - if getattr(args, "encrypt")[0] == "true": + if hasattr(args, "encrypt") and args.encrypt[0] == "true": self.zip_encrypt = True return True
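The new default file_size_limit of 2 * 1024 * 1024 * 1024 bytes matches what parse_size (utils/file_utils.py) should return for a configured "2G". A sketch of the assumed conversion; the accepted suffixes are an assumption:

```python
# Sketch of the parse_size conversion used for file_size_limit.
def parse_size(text):
    multipliers = {"K": 1024, "M": 1024 ** 2, "G": 1024 ** 3}
    text = str(text).strip().upper().rstrip("B")
    if text[-1] in multipliers:
        return int(float(text[:-1]) * multipliers[text[-1]])
    return int(text)

assert parse_size("2G") == 2 * 1024 * 1024 * 1024
assert parse_size("20M") == 20 * 1024 * 1024
```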
diff --git a/handler/gather/gather_obadmin.py b/handler/gather/gather_obadmin.py index be0e7972..16d42401 100644 --- a/handler/gather/gather_obadmin.py +++ b/handler/gather/gather_obadmin.py @@ -23,7 +23,7 @@ import tabulate from common.logger import logger -from common.obdiag_exception import OBDIAGInvalidArgs, OBDIAGFormatException +from common.obdiag_exception import OBDIAGFormatException from common.constant import const from common.command import LocalClient, SshClient, is_empty_dir from handler.base_shell_handler import BaseShellHandler @@ -267,14 +267,12 @@ def __check_valid_args(self, args): :param args: command args :return: boolean. True if valid, False if invalid. """ - # 1: store_dir must exist, else return "No such file or directory". + # 1: store_dir must exist, else create directory. if getattr(args, "store_dir") is not None: if not os.path.exists(os.path.abspath(getattr(args, "store_dir"))): - logger.error("Error: args --store_dir [{0}] incorrect: No such directory." - .format(os.path.abspath(getattr(args, "store_dir")))) - return False - else: - self.local_stored_path = os.path.abspath(getattr(args, "store_dir")) + logger.warn("Error: args --store_dir [{0}] incorrect: No such directory, Now create it".format(os.path.abspath(getattr(args, "store_dir")))) + os.makedirs(os.path.abspath(getattr(args, "store_dir"))) + self.local_stored_path = os.path.abspath(getattr(args, "store_dir")) if getattr(args, "encrypt")[0] == "true": self.zip_encrypt = True
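All of these gather handlers place their output under a gather_pack_{timestamp} directory inside the store dir (the gather_log comment above shows the shape: gather_pack_20190610123344). A sketch of that naming, assuming timestamp_to_filename_time (utils/time_utils.py) renders a filesystem-safe time string like this:

```python
# Sketch of the pack-directory naming convention; the exact rendering of
# timestamp_to_filename_time is an assumption for illustration.
import datetime
import os

def timestamp_to_filename_time(ts):
    return datetime.datetime.fromtimestamp(ts).strftime("%Y%m%d%H%M%S")

gather_pack_dir = "/tmp/obdiag"  # placeholder store dir
pack_dir_this_command = os.path.join(
    gather_pack_dir, "gather_pack_{0}".format(timestamp_to_filename_time(1706857525)))
print(pack_dir_this_command)  # e.g. /tmp/obdiag/gather_pack_20240202154525
```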
diff --git a/handler/gather/gather_obproxy_log.py b/handler/gather/gather_obproxy_log.py index 45b61984..f05bd050 100644 --- a/handler/gather/gather_obproxy_log.py +++ b/handler/gather/gather_obproxy_log.py @@ -17,7 +17,6 @@ """ import datetime import os -import threading import time import uuid @@ -26,7 +25,6 @@ from handler.base_shell_handler import BaseShellHandler from common.logger import logger from common.obdiag_exception import OBDIAGFormatException -from common.obdiag_exception import OBDIAGInvalidArgs from common.command import LocalClient, SshClient from common.constant import const from utils.file_utils import mkdir_if_not_exist, size_format, write_result_append_to_file, parse_size, show_file_size_tabulate @@ -42,7 +40,7 @@ class GatherObProxyLogHandler(BaseShellHandler): - def __init__(self, nodes, gather_pack_dir, gather_timestamp, common_config): + def __init__(self, nodes, gather_pack_dir, gather_timestamp=None, common_config=None, is_scene=False): super(GatherObProxyLogHandler, self).__init__(nodes) self.is_ssh = True self.gather_timestamp = gather_timestamp @@ -54,10 +52,11 @@ def __init__(self, nodes, gather_pack_dir, gather_timestamp, common_config): self.grep_args = None self.scope = None self.zip_encrypt = False + self.is_scene = is_scene self.config_path = const.DEFAULT_CONFIG_PATH if common_config is None: self.file_number_limit = 20 - self.file_size_limit = 2 * 1024 * 1024 + self.file_size_limit = 2 * 1024 * 1024 * 1024 else: self.file_number_limit = int(common_config["file_number_limit"]) self.file_size_limit = int(parse_size(common_config["file_size_limit"])) @@ -65,9 +64,10 @@ def handle(self, args): if not self.__check_valid_and_parse_args(args): return - pack_dir_this_command = os.path.join(self.gather_pack_dir, - "gather_pack_{0}".format(timestamp_to_filename_time( - self.gather_timestamp))) + if self.is_scene: + pack_dir_this_command = self.gather_pack_dir + else: + pack_dir_this_command = os.path.join(self.gather_pack_dir, "gather_pack_{0}".format(timestamp_to_filename_time(self.gather_timestamp))) logger.info("Use {0} as pack dir.".format(pack_dir_this_command)) gather_tuples = [] gather_pack_path_dict = {} @@ -166,8 +166,8 @@ def __handle_log_list(self, ssh, node, resp): return log_list, resp elif len(log_list) <= 0: logger.warn( - "{0} The number of log files is {1}, No files found, " - "Please adjust the query limit".format(ip, len(log_list))) + "{0} The number of log files is {1}. The time range for file gathering is from {2} to {3}, and no eligible files were found. " + "Please adjust the query time range".format(ip, len(log_list), self.from_time_str, self.to_time_str)) resp["skip"] = True, resp["error"] = "No files found" return log_list, resp @@ -190,7 +190,7 @@ def __get_log_name(self, ssh_helper, node): if log_files: log_name_list = get_logfile_name_list(self.is_ssh, ssh_helper, self.from_time_str, self.to_time_str, log_path, log_files) else: - logger.error("Unable to find the log file. Please provide the correct home_path config, the default is [/home/admin/obproxy]") + logger.error("Unable to find the log file. Please provide the correct home_path config and check that the obproxy log path {0} exists".format(log_path)) return log_name_list def __pharse_log(self, ssh_helper, home_path, log_name, gather_path): @@ -277,14 +277,12 @@ def __check_valid_and_parse_args(self, args): seconds=parse_time_length_to_sec(args.since))).strftime('%Y-%m-%d %H:%M:%S') else: self.from_time_str = (now_time - datetime.timedelta(minutes=30)).strftime('%Y-%m-%d %H:%M:%S') - # 2: store_dir must exist, else return "No such file or directory". + # 2: store_dir must exist, else create directory. if getattr(args, "store_dir") is not None: if not os.path.exists(os.path.abspath(getattr(args, "store_dir"))): - logger.error("Error: args --store_dir [{0}] incorrect: No such directory." - .format(os.path.abspath(getattr(args, "store_dir")))) - return False - else: - self.gather_pack_dir = os.path.abspath(getattr(args, "store_dir")) + logger.warn("Error: args --store_dir [{0}] incorrect: No such directory, Now create it".format(os.path.abspath(getattr(args, "store_dir")))) + os.makedirs(os.path.abspath(getattr(args, "store_dir"))) + self.gather_pack_dir = os.path.abspath(getattr(args, "store_dir")) if getattr(args, "grep") is not None: self.grep_args = ' '.join(getattr(args, "grep")) diff --git a/handler/gather/gather_obstack2.py b/handler/gather/gather_obstack2.py new file mode 100644 index 00000000..7be4c863 --- /dev/null +++ b/handler/gather/gather_obstack2.py @@ -0,0 +1,273 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -* +# Copyright (c) 2022 OceanBase +# OceanBase Diagnostic Tool is licensed under Mulan PSL v2. +# You can use this software according to the terms and conditions of the Mulan PSL v2. +# You may obtain a copy of Mulan PSL v2 at: +# http://license.coscl.org.cn/MulanPSL2 +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, +# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, +# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +# See the Mulan PSL v2 for more details. 
+ +""" +@time: 2020/7/19 +@file: gather_obstack2.py +@desc: +""" +import os +import re +import sys +import threading +import time +import datetime + +import tabulate +import uuid + +from common.command import download_file, is_empty_dir, is_support_arch, get_observer_version, get_observer_pid, mkdir, zip_dir, get_file_size, delete_file_force, is_empty_file, upload_file +from common.logger import logger +from common.obdiag_exception import OBDIAGInvalidArgs +from common.constant import const +from common.command import LocalClient, SshClient +from handler.base_shell_handler import BaseShellHandler +from utils.version_utils import compare_versions_greater +from utils.retry_utils import retry +from utils.file_utils import mkdir_if_not_exist, size_format, write_result_append_to_file, parse_size +from utils.shell_utils import SshHelper +from utils.time_utils import timestamp_to_filename_time +from utils.utils import get_localhost_inner_ip, display_trace + + +class GatherObstack2Handler(BaseShellHandler): + def __init__(self, nodes, gather_pack_dir, gather_timestamp=None, common_config=None, is_scene=False): + super(GatherObstack2Handler, self).__init__(nodes) + self.is_ssh = True + self.gather_timestamp = gather_timestamp + self.local_stored_path = gather_pack_dir + self.remote_stored_path = None + self.is_scene = is_scene + self.config_path = const.DEFAULT_CONFIG_PATH + if common_config is None: + self.file_size_limit = 2 * 1024 * 1024 * 1024 + else: + self.file_size_limit = int(parse_size(common_config["file_size_limit"])) + + def handle(self, args): + if not self.__check_valid_args(args): + return + if self.is_scene: + pack_dir_this_command = self.local_stored_path + else: + pack_dir_this_command = os.path.join(self.local_stored_path, "gather_pack_{0}".format(timestamp_to_filename_time(self.gather_timestamp))) + logger.info("Use {0} as pack dir.".format(pack_dir_this_command)) + gather_tuples = [] + def handle_from_node(node): + st = time.time() + resp = self.__handle_from_node(args, pack_dir_this_command, node) + file_size = "" + if len(resp["error"]) == 0: + file_size = os.path.getsize(resp["gather_pack_path"]) + gather_tuples.append((node.get("ip"), False, resp["error"], + file_size, + int(time.time() - st), + resp["gather_pack_path"])) + + if self.is_ssh: + for node in self.nodes: + handle_from_node(node) + else: + local_ip = get_localhost_inner_ip() + node = self.nodes[0] + node["ip"] = local_ip + for node in self.nodes: + handle_from_node(node) + + summary_tuples = self.__get_overall_summary(gather_tuples) + print(summary_tuples) + display_trace(uuid.uuid3(uuid.NAMESPACE_DNS, str(os.getpid()))) + # Persist the summary results to a file + write_result_append_to_file(os.path.join(pack_dir_this_command, "result_summary.txt"), summary_tuples) + last_info = "For result details, please run cmd \033[32m' cat {0} '\033[0m\n".format(os.path.join(pack_dir_this_command, "result_summary.txt")) + print(last_info) + + def __handle_from_node(self, args, local_stored_path, node): + resp = { + "skip": False, + "error": "", + "gather_pack_path": "" + } + remote_ip = node.get("ip") if self.is_ssh else get_localhost_inner_ip() + remote_user = node.get("user") + remote_password = node.get("password") + remote_port = node.get("port") + remote_private_key = node.get("private_key") + remote_home_path = node.get("home_path") + logger.info( + "Sending Collect Shell Command to node {0} ...".format(remote_ip)) + mkdir_if_not_exist(local_stored_path) + now_time = datetime.datetime.now().strftime('%Y%m%d%H%M%S') + 
remote_dir_name = "obstack2_{0}_{1}".format(remote_ip, now_time) + remote_dir_full_path = "/tmp/{0}".format(remote_dir_name) + ssh_failed = False + try: + ssh_helper = SshHelper(self.is_ssh, remote_ip, remote_user, remote_password, remote_port, remote_private_key,node) + except Exception as e: + logger.error("ssh {0}@{1}: failed, Please check the {2}".format( + remote_user, + remote_ip, + self.config_path)) + ssh_failed = True + resp["skip"] = True + resp["error"] = "Please check the {0}".format(self.config_path) + + if not ssh_failed: + if not is_support_arch(self.is_ssh, ssh_helper): + resp["error"] = "remote server {0} arch not support gather obstack".format(ssh_helper.get_name()) + return resp + mkdir(self.is_ssh, ssh_helper, remote_dir_full_path) + + # install and chmod obstack2 + ob_version = get_observer_version(self.is_ssh, ssh_helper, node.get("home_path")) + if not compare_versions_greater(ob_version, const.MIN_OB_VERSION_SUPPORT_GATHER_OBSTACK): + logger.info("This version {0} does not support gather obstack . The minimum supported version is {1}". + format(ob_version, const.MIN_OB_VERSION_SUPPORT_GATHER_OBSTACK)) + resp["error"] = "{0} not support gather obstack".format(ob_version) + resp["gather_pack_path"] = "{0}".format(local_stored_path) + return resp + is_need_install_obstack = self.__is_obstack_exists(self.is_ssh, ssh_helper) + if is_need_install_obstack: + logger.info("There is no obstack2 on the host {0}. It needs to be installed. " + "Please wait a moment ...".format(remote_ip)) + if getattr(sys, 'frozen', False): + absPath = os.path.dirname(sys.executable) + else: + absPath = os.path.dirname(os.path.dirname(os.path.dirname(__file__))) + obstack2_local_stored_full_path = os.path.join(absPath, const.OBSTACK2_LOCAL_STORED_PATH) + upload_file(self.is_ssh, ssh_helper, obstack2_local_stored_full_path, const.OBSTACK2_DEFAULT_INSTALL_PATH) + logger.info("Installation of obstack2 is completed and gather begins ...") + + self.__chmod_obstack2(self.is_ssh, ssh_helper) + # get observer_pid + observer_pid_list = get_observer_pid(self.is_ssh, ssh_helper, node.get("home_path")) + # gather obstack2 info + for observer_pid in observer_pid_list: + user = self.__get_observer_execute_user(ssh_helper, observer_pid) + self.__gather_obstack2_info(self.is_ssh, ssh_helper, user, observer_pid, remote_dir_name,node) + try: + self.is_ready(ssh_helper, observer_pid, remote_dir_name) + except: + logger.error("Gather obstack info on the host {0} observer pid {1}".format(remote_ip, observer_pid)) + delete_file_force(self.is_ssh, ssh_helper, "/tmp/{dir_name}/observer_{pid}_obstack.txt" + .format(dir_name=remote_dir_name, pid=observer_pid)) + pass + if is_empty_dir(self.is_ssh, ssh_helper, "/tmp/{0}".format(remote_dir_name)): + resp["error"] = "gather failed, folder is empty" + return resp + + zip_dir(self.is_ssh, ssh_helper, "/tmp", remote_dir_name) + remote_zip_file_path = "{0}.zip".format(remote_dir_full_path) + + file_size = get_file_size(self.is_ssh, ssh_helper, remote_zip_file_path) + remote_file_full_path = "{0}.zip".format(remote_dir_full_path) + if int(file_size) < self.file_size_limit: + local_file_path = "{0}/{1}.zip".format(local_stored_path, remote_dir_name) + download_file(self.is_ssh, ssh_helper, remote_file_full_path, local_file_path) + resp["error"] = "" + else: + resp["error"] = "File too large" + delete_file_force(self.is_ssh, ssh_helper, remote_file_full_path) + ssh_helper.ssh_close() + resp["gather_pack_path"] = "{0}/{1}.zip".format(local_stored_path, remote_dir_name) + return 
resp + + @retry(5, 2) + def is_ready(self, ssh_helper, pid, remote_dir_name): + try: + logger.info("Check whether the directory /tmp/{dir_name} or " + "file /tmp/{dir_name}/observer_{pid}_obstack.txt is empty" + .format(dir_name=remote_dir_name, pid=pid)) + is_empty_dir_res = is_empty_dir(self.is_ssh, ssh_helper, "/tmp/{0}".format(remote_dir_name)) + is_empty_file_res = is_empty_file(self.is_ssh, ssh_helper, "/tmp/{dir_name}/observer_{pid}_obstack.txt" + .format(dir_name=remote_dir_name, pid=pid)) + if is_empty_dir_res or is_empty_file_res: + logger.info( + "The server {host_ip} directory /tmp/{dir_name} or file /tmp/{dir_name}/observer_{pid}_obstack.txt" + " is empty, waiting for the collection to complete" + .format(host_ip=ssh_helper.get_name() if self.is_ssh else get_localhost_inner_ip(), dir_name=remote_dir_name, pid=pid)) + raise + except Exception as e: + raise e + + @staticmethod + def __chmod_obstack2(is_ssh, ssh_helper): + cmd = "chmod a+x {file}".format(file=const.OBSTACK2_DEFAULT_INSTALL_PATH) + SshClient().run(ssh_helper, cmd) if is_ssh else LocalClient().run(cmd) + + @staticmethod + def __is_obstack_exists(is_ssh, ssh_helper): + cmd = "test -e {file} && echo exists".format(file=const.OBSTACK2_DEFAULT_INSTALL_PATH) + stdout = SshClient().run(ssh_helper, cmd) if is_ssh else LocalClient().run(cmd) + if stdout == 'exists': + return False + else: + return True + + def __get_observer_execute_user(self, ssh_helper, pid): + cmd = "ps -o ruser=userForLongName -e -o pid,ppid,c,stime,tty,time,cmd | grep observer | grep {0} | awk {1}".format(pid, "'{print $1}'") + stdout = SshClient().run(ssh_helper, cmd) if self.is_ssh else LocalClient().run(cmd) + user = stdout.splitlines()[0] + logger.info("get observer execute user, run cmd = [{0}], result:{1} ".format(cmd, user)) + return user + + @staticmethod + def __gather_obstack2_info(is_ssh, ssh_helper, user, observer_pid, remote_gather_dir,node): + cmd = "{obstack} {pid} > /tmp/{gather_dir}/observer_{pid}_obstack.txt".format( + obstack=const.OBSTACK2_DEFAULT_INSTALL_PATH, + pid=observer_pid, + gather_dir=remote_gather_dir) + if is_ssh: + if user == ssh_helper.username: + logger.debug("gather obstack info on server {0}, run cmd = [{1}]".format(ssh_helper.get_name(), cmd)) + SshClient().run_ignore_err(ssh_helper, cmd) + else: + ssh_helper_new = SshHelper(ssh_helper.host_ip, ssh_helper.username, ssh_helper.password, ssh_helper.ssh_port, ssh_helper.key_file,node) + chown_cmd = "chown {user} /tmp/{gather_dir}/".format(user=user,gather_dir=remote_gather_dir) + SshClient().run(ssh_helper_new, chown_cmd) + logger.info("gather obstack info on server {0}, run cmd = [su {1}, {2}]".format(ssh_helper.get_name(), user, cmd)) + ssh_helper_new.ssh_invoke_shell_switch_user(user, cmd, 10) + else: + LocalClient().run(cmd) + + def __check_valid_args(self, args): + """ + chech whether command args are valid. If invalid, stop processing and print the error to the user + :param args: command args + :return: boolean. True if valid, False if invalid. + """ + # 1: store_dir must exist, else create directory. 
+ if getattr(args, "store_dir") is not None: + if not os.path.exists(os.path.abspath(getattr(args, "store_dir"))): + logger.warn("Error: args --store_dir [{0}] incorrect: No such directory, Now create it".format(os.path.abspath(getattr(args, "store_dir")))) + os.makedirs(os.path.abspath(getattr(args, "store_dir"))) + self.local_stored_path = os.path.abspath(getattr(args, "store_dir")) + return True + + @staticmethod + def __get_overall_summary(node_summary_tuple): + summary_tab = [] + field_names = ["Node", "Status", "Size", "Time", "PackPath"] + for tup in node_summary_tuple: + node = tup[0] + is_err = tup[2] + file_size = tup[3] + consume_time = tup[4] + pack_path = tup[5] + try: + format_file_size = size_format(file_size, output_str=True) + except: + format_file_size = size_format(0, output_str=True) + summary_tab.append((node, "Error:" + tup[2] if is_err else "Completed", + format_file_size, "{0} s".format(int(consume_time)), pack_path)) + return "\nGather Ob stack Summary:\n" + \ + tabulate.tabulate(summary_tab, headers=field_names, tablefmt="grid", showindex=False) diff --git a/handler/gather/gather_perf.py b/handler/gather/gather_perf.py index f8e11fa1..4117cb83 100644 --- a/handler/gather/gather_perf.py +++ b/handler/gather/gather_perf.py @@ -26,7 +26,6 @@ from common.command import get_observer_pid, mkdir, zip_dir, get_file_size, download_file, delete_file_force from common.logger import logger from common.command import LocalClient, SshClient -from common.obdiag_exception import OBDIAGInvalidArgs from common.constant import const from handler.base_shell_handler import BaseShellHandler from utils.file_utils import mkdir_if_not_exist, size_format, write_result_append_to_file, parse_size @@ -36,13 +35,14 @@ class GatherPerfHandler(BaseShellHandler): - def __init__(self, nodes, gather_pack_dir, gather_timestamp, common_config): + def __init__(self, nodes, gather_pack_dir, gather_timestamp=None, common_config=None, is_scene=False): super(GatherPerfHandler, self).__init__(nodes) self.is_ssh = True self.gather_timestamp = gather_timestamp self.local_stored_path = gather_pack_dir self.remote_stored_path = None self.ob_install_dir = None + self.is_scene = is_scene self.scope = "all" self.config_path = const.DEFAULT_CONFIG_PATH if common_config is None: @@ -53,9 +53,10 @@ def __init__(self, nodes, gather_pack_dir, gather_timestamp, common_config): def handle(self, args): if not self.__check_valid_args(args): return - if args.store_dir is not None: - self.local_stored_path = os.path.abspath(args.store_dir) - pack_dir_this_command = os.path.join(self.local_stored_path,"gather_pack_{0}".format(timestamp_to_filename_time(self.gather_timestamp))) + if self.is_scene: + pack_dir_this_command = self.local_stored_path + else: + pack_dir_this_command = os.path.join(self.local_stored_path,"gather_pack_{0}".format(timestamp_to_filename_time(self.gather_timestamp))) logger.info("Use {0} as pack dir.".format(pack_dir_this_command)) gather_tuples = [] @@ -136,6 +137,7 @@ def __handle_from_node(self, node, local_stored_path): self.__gather_perf_sample(ssh_helper, remote_dir_full_path, pid_observer) self.__gather_perf_flame(ssh_helper, remote_dir_full_path, pid_observer) self.__gather_pstack(ssh_helper, remote_dir_full_path, pid_observer) + self.__gather_top(ssh_helper, remote_dir_full_path, pid_observer) zip_dir(self.is_ssh, ssh_helper, "/tmp", remote_dir_name) remote_file_full_path = "{0}.zip".format(remote_dir_full_path) @@ -188,18 +190,28 @@ def __gather_pstack(self, ssh_helper, gather_path, 
pid_observer): except: logger.error("gather pstack on server failed [{0}]".format(ssh_helper.get_name())) + def __gather_top(self, ssh_helper, gather_path, pid_observer): + try: + cmd = "cd {gather_path} && top -Hp {pid} -b -n 1 > top.txt".format( + gather_path=gather_path, pid=pid_observer) + logger.info("gather top, run cmd = [{0}]".format(cmd)) + SshClient().run(ssh_helper, cmd) if self.is_ssh else LocalClient().run(cmd) + except: + logger.error("gather top on server failed [{0}]".format(ssh_helper.get_name())) + + def __check_valid_args(self, args): """ check whether command args are valid. If invalid, stop processing and print the error to the user :param args: command args :return: boolean. True if valid, False if invalid. """ - # 1: store_dir must exist, else return "No such file or directory". + # 1: store_dir must exist, else create directory. if getattr(args, "store_dir") is not None: if not os.path.exists(os.path.abspath(getattr(args, "store_dir"))): - logger.error("Error: args --store_dir [{0}] incorrect: No such directory." - .format(os.path.abspath(getattr(args, "store_dir")))) - return False + logger.warn("Error: args --store_dir [{0}] incorrect: No such directory, Now create it".format(os.path.abspath(getattr(args, "store_dir")))) + os.makedirs(os.path.abspath(getattr(args, "store_dir"))) + self.local_stored_path = os.path.abspath(getattr(args, "store_dir")) if getattr(args, "scope") is not None: self.scope = getattr(args, "scope")[0] return True
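The new __gather_top step captures a one-shot, per-thread snapshot with `top -Hp <pid> -b -n 1`. The standalone snippet below reproduces the same command locally; the path and pid are placeholders:

```python
# Reproduce the __gather_top command locally; gather_path and pid are placeholders.
import os
import subprocess

def gather_top(gather_path, pid_observer):
    os.makedirs(gather_path, exist_ok=True)
    cmd = "cd {gather_path} && top -Hp {pid} -b -n 1 > top.txt".format(
        gather_path=gather_path, pid=pid_observer)
    # shell=True mirrors how the handler passes the full command string to a shell.
    subprocess.run(cmd, shell=True, check=False)

gather_top("/tmp/obdiag_demo", 12345)
```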
diff --git a/handler/gather/gather_plan_monitor.py b/handler/gather/gather_plan_monitor.py index d1411f1e..1d631e2e 100644 --- a/handler/gather/gather_plan_monitor.py +++ b/handler/gather/gather_plan_monitor.py @@ -15,7 +15,6 @@ @file: gather_plan_monitor.py @desc: """ -from logging import log import os import re import sys @@ -27,20 +26,21 @@ import tabulate from prettytable import from_db_cursor -from common.command import get_observer_version_by_sql from common.logger import logger from common.ob_connector import OBConnector -from common.obdiag_exception import OBDIAGInvalidArgs, OBDIAGArgsNotFoundException +from common.obdiag_exception import OBDIAGArgsNotFoundException from handler.base_sql_handler import BaseSQLHandler from handler.meta.html_meta import GlobalHtmlMeta from handler.meta.sql_meta import GlobalSqlMeta from utils.file_utils import mkdir_if_not_exist, write_result_append_to_file from utils.time_utils import timestamp_to_filename_time +from utils.string_utils import parse_custom_env_string from utils.utils import display_trace +from utils.string_utils import parse_mysql_cli_connection_string, validate_db_info class GatherPlanMonitorHandler(BaseSQLHandler): - def __init__(self, ob_cluster, gather_pack_dir, gather_timestamp): + def __init__(self, ob_cluster, gather_pack_dir, gather_timestamp=None, db_conn={}, is_scene=False): super(GatherPlanMonitorHandler, self).__init__() self.ob_cluster = ob_cluster self.local_stored_path = gather_pack_dir @@ -56,12 +56,19 @@ def __init__(self, ob_cluster, gather_pack_dir, gather_timestamp): timeout=100) self.enable_dump_db = False self.trace_id = None + self.env = {} self.STAT_NAME = {} self.report_file_path = "" self.enable_fast_dump = False self.ob_major_version = None self.sql_audit_name = "gv$sql_audit" self.plan_explain_name = "gv$plan_cache_plan_explain" + self.db_conn = db_conn + self.is_scene = is_scene + self.gather_pack_dir = gather_pack_dir + + def __init_db_connector(self): + self.db_connector = OBConnector(ip=self.db_conn.get("host"), port=self.db_conn.get("port"), username=self.db_conn.get("user"), password=self.db_conn.get("password"), timeout=100) def handle(self, args): """ @@ -71,7 +78,10 @@ def handle(self, args): """ if not self.__check_valid_and_parse_args(args): return - pack_dir_this_command = os.path.join(self.local_stored_path, "gather_pack_{0}".format( + if self.is_scene: + pack_dir_this_command = self.gather_pack_dir + else: + pack_dir_this_command = os.path.join(self.local_stored_path, "gather_pack_{0}".format( timestamp_to_filename_time(self.gather_timestamp))) self.report_file_path = os.path.join(pack_dir_this_command, "sql_plan_monitor_report.html") logger.info("Use {0} as pack dir.".format(pack_dir_this_command)) @@ -215,16 +225,34 @@ def __check_valid_and_parse_args(self, args): return False else: return False + # 2: store_dir must exist, else create directory. if getattr(args, "store_dir") is not None: if not os.path.exists(os.path.abspath(getattr(args, "store_dir"))): - logger.error("Error: args --store_dir [{0}] incorrect: No such directory." - .format(os.path.abspath(getattr(args, "store_dir")))) - return False - else: - self.local_stored_path = os.path.abspath(getattr(args, "store_dir")) + logger.warn("Error: args --store_dir [{0}] incorrect: No such directory, Now create it".format(os.path.abspath(getattr(args, "store_dir")))) + os.makedirs(os.path.abspath(getattr(args, "store_dir"))) + self.local_stored_path = os.path.abspath(getattr(args, "store_dir")) + if getattr(args, "env") is not None: + self.__init_db_conn(args) + else: + self.db_connector = self.ob_connector self.tenant_mode_detected() return True + def __init_db_conn(self, args): + try: + env_dict = parse_custom_env_string(getattr(args, "env")) + self.env = env_dict + cli_connection_string = self.env.get("db_connect") + self.db_conn = parse_mysql_cli_connection_string(cli_connection_string) + if validate_db_info(self.db_conn): + self.__init_db_connector() + else: + logger.error("db connection information required [db_connect = '-hxx -Pxx -uxx -pxx -Dxx'] but provided {0}, please check the --env {0}".format(env_dict)) + self.db_connector = self.ob_connector + except Exception as e: + self.db_connector = self.ob_connector + logger.error("init db connector, error: {0}, please check --env args {1}".format(e, parse_custom_env_string(getattr(args, "env")))) + @staticmethod def __get_overall_summary(node_summary_tuple): """ @@ -257,7 +285,7 @@ def report_schema(self, sql): valid_words.append(t) for t in valid_words: try: - data = self.ob_connector.execute_sql("show create table %s" % t) + data = self.db_connector.execute_sql("show create table %s" % t) schemas = schemas + "
%s
" % (data[1]) logger.debug("table schema: {0}".format(schemas)) except Exception as e: @@ -531,20 +559,26 @@ def __report(self, s): def tenant_mode_detected(self): try: - data = self.ob_connector.execute_sql("select version();") - logger.info("Detected mySQL mode successful, Database version : %s " % ("%s" % data[0])) - ob_version = data[0] - version_info = re.findall(r'OceanBase(_)?(.CE)?-v(.+)', ob_version[0]) - version = version_info[0][2] - if int(version[0]) >= 4: - self.sql_audit_name = "gv$ob_sql_audit" - self.plan_explain_name = "gv$ob_plan_cache_plan_explain" + data = self.db_connector.execute_sql("show variables like 'version_comment'") + ob_version = "3.0.0.0" + for row in data: + ob_version = row[1] + logger.info("Detected mySQL mode successful, Database version :{0} ".format(ob_version)) + version_pattern = r'(?:OceanBase(_CE)?\s+)?(\d+\.\d+\.\d+\.\d+)' + matched_version = re.search(version_pattern, ob_version) + if matched_version: + version = matched_version.group(2) + if int(version[0]) >= 4: + self.sql_audit_name = "gv$ob_sql_audit" + self.plan_explain_name = "gv$ob_plan_cache_plan_explain" + else: + self.sql_audit_name = "gv$sql_audit" + self.plan_explain_name = "gv$plan_cache_plan_explain" + self.ob_major_version = int(version[0]) + self.tenant_mode = "mysql" + self.sys_database = "oceanbase" else: - self.sql_audit_name = "gv$sql_audit" - self.plan_explain_name = "gv$plan_cache_plan_explain" - self.ob_major_version = int(version[0]) - self.tenant_mode = "mysql" - self.sys_database = "oceanbase" + logger.warn("Failed to match ob version") except: data = self.ob_connector.execute_sql("select SUBSTR(BANNER, 11, 100) from V$VERSION;") logger.info("Detectedo oracle mode successful, Database version : %s " % ("%s" % data[0])) diff --git a/handler/gather/gather_scenes.py b/handler/gather/gather_scenes.py new file mode 100644 index 00000000..a350502f --- /dev/null +++ b/handler/gather/gather_scenes.py @@ -0,0 +1,200 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -* +# Copyright (c) 2022 OceanBase +# OceanBase Diagnostic Tool is licensed under Mulan PSL v2. +# You can use this software according to the terms and conditions of the Mulan PSL v2. +# You may obtain a copy of Mulan PSL v2 at: +# http://license.coscl.org.cn/MulanPSL2 +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, +# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, +# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +# See the Mulan PSL v2 for more details. 
+ +""" +@time: 2024/01/04 +@file: gather_scene_handler.py +@desc: +""" + +import os +import re +import uuid +import datetime +from common.logger import logger +from handler.gather.scenes.base import SceneBase +from utils.utils import display_trace +from common.obdiag_exception import OBDIAGFormatException +from utils.time_utils import parse_time_str +from utils.time_utils import parse_time_length_to_sec +from utils.time_utils import timestamp_to_filename_time +from utils.utils import display_trace +from handler.gather.scenes.list import GatherScenesListHandler +from utils.file_utils import mkdir_if_not_exist +from utils.string_utils import parse_custom_env_string +from common.scene import get_obproxy_and_ob_version +from colorama import Fore, Style + + +class GatherSceneHandler: + + def __init__(self, obproxy_cluster, obproxy_nodes, ob_cluster, ob_nodes, gather_pack_dir, gather_timestamp, tasks_base_path="./handler/gather/tasks/", task_type="observer"): + self.is_ssh = True + self.report = None + self.gather_timestamp = gather_timestamp + self.gather_pack_dir = gather_pack_dir + self.report_path = None + self.yaml_tasks = {} + self.code_tasks = [] + self.env = {} + self.scene = None + self.obproxy_cluster = obproxy_cluster + self.obproxy_nodes = obproxy_nodes + self.cluster = ob_cluster + self.ob_nodes = ob_nodes + self.tasks_base_path = tasks_base_path + self.task_type = task_type + self.variables = {} + + def handle(self, args): + if not self.__check_valid_and_parse_args(args): + return + self.__init_variables() + self.__init_report_path() + self.__init_task_names() + self.execute() + self.__print_result() + + def execute(self): + try: + logger.info("execute_tasks. the number of tasks is {0} ,tasks is {1}".format(len(self.yaml_tasks.keys()), self.yaml_tasks.keys())) + for key, value in zip(self.yaml_tasks.keys(), self.yaml_tasks.values()): + self.__execute_yaml_task_one(key, value) + for task in self.code_tasks: + self.__execute_code_task_one(task) + except Exception as e: + logger.error("Internal error :{0}".format(e)) + finally: + display_trace(uuid.uuid3(uuid.NAMESPACE_DNS, str(os.getpid()))) + + # execute yaml task + def __execute_yaml_task_one(self, task_name, task_data): + try: + logger.info("execute tasks is {0}".format(task_name)) + task_type = self.__get_task_type(task_name) + version = get_obproxy_and_ob_version(self.obproxy_nodes, self.ob_nodes, self.task_type) + if version: + self.cluster["version"] = re.findall(r'\d+\.\d+\.\d+\.\d+', version)[0] + logger.info("cluster.version is {0}".format(self.cluster["version"])) + task = SceneBase(scene=task_data["task"], obproxy_nodes=self.obproxy_nodes, ob_nodes=self.ob_nodes, cluster=self.cluster, report_dir=self.report_path, args=self.args, env=self.env, scene_variable_dict=self.variables, task_type=task_type) + logger.info("{0} execute!".format(task_name)) + task.execute() + logger.info("execute tasks end : {0}".format(task_name)) + else: + logger.error("can't get version") + except Exception as e: + logger.error("__execute_yaml_task_one Exception : {0}".format(e)) + + # execute code task + def __execute_code_task_one(self, task_name): + try: + logger.info("execute tasks is {0}".format(task_name)) + scene = {"name": task_name} + task = SceneBase(scene=scene, obproxy_nodes=self.obproxy_nodes, ob_nodes=self.ob_nodes, cluster=self.cluster, report_dir=self.report_path, args=self.args, env=self.env, mode='code', task_type=task_name) + logger.info("{0} execute!".format(task_name)) + task.execute() + logger.info("execute tasks end : 
{0}".format(task_name)) + except Exception as e: + logger.error("__execute_code_task_one Exception : {0}".format(e)) + + def __init_task_names(self): + if self.scene: + new = re.sub(r'\{|\}', '', self.scene) + items = re.split(r'[;,]', new) + scene = GatherScenesListHandler(self.tasks_base_path) + for item in items: + yaml_task_data = scene.get_one_yaml_task(item) + is_code_task = scene.is_code_task(item) + if is_code_task: + self.code_tasks.append(item) + else: + if yaml_task_data: + self.yaml_tasks[item] = yaml_task_data + else: + logger.error("Invalid Task :{0}".format(item)) + else: + logger.error("get task name failed") + + def __init_report_path(self): + try: + self.report_path = os.path.join(self.gather_pack_dir, "gather_pack_{0}".format(timestamp_to_filename_time(self.gather_timestamp))) + logger.info("Use {0} as pack dir.".format(self.report_path)) + mkdir_if_not_exist(self.report_path) + except Exception as e: + logger.error("init_report_path failed, error:{0}".format(e)) + + def __init_variables(self): + try: + + self.variables = { + "observer_data_dir": self.ob_nodes[0].get("home_path") if self.ob_nodes and self.ob_nodes[0].get("home_path") else "", + "obproxy_data_dir": self.obproxy_nodes[0].get("home_path") if self.obproxy_nodes and self.obproxy_nodes[0].get("home_path") else "", + "from_time": self.from_time_str, + "to_time": self.to_time_str + } + logger.info("gather scene variables: {0}".format(self.variables)) + except Exception as e: + logger.error("init gather scene variables failed, error: {0}".format(e)) + + def __get_task_type(self, s): + trimmed_str = s.strip() + if '.' in trimmed_str: + parts = trimmed_str.split('.', 1) + return parts[0] + else: + return None + + def __check_valid_and_parse_args(self, args): + """ + chech whether command args are valid. If invalid, stop processing and print the error to the user + :param args: command args + :return: boolean. True if valid, False if invalid. + """ + self.args = args + # 1: to timestamp must be larger than from timestamp, and be valid + if getattr(args, "from") is not None and getattr(args, "to") is not None: + try: + from_timestamp = parse_time_str(getattr(args, "from")) + to_timestamp = parse_time_str(getattr(args, "to")) + self.from_time_str = getattr(args, "from") + self.to_time_str = getattr(args, "to") + except OBDIAGFormatException: + logger.error("Error: Datetime is invalid. Must be in format yyyy-mm-dd hh:mm:ss. from_datetime={0}, to_datetime={1}".format(getattr(args, "from"), getattr(args, "to"))) + return False + if to_timestamp <= from_timestamp: + logger.error("Error: from datetime is larger than to datetime, please check.") + return False + else: + now_time = datetime.datetime.now() + self.to_time_str = (now_time + datetime.timedelta(minutes=1)).strftime('%Y-%m-%d %H:%M:%S') + if args.since is not None: + self.from_time_str = (now_time - datetime.timedelta( + seconds=parse_time_length_to_sec(args.since))).strftime('%Y-%m-%d %H:%M:%S') + else: + self.from_time_str = (now_time - datetime.timedelta(minutes=30)).strftime('%Y-%m-%d %H:%M:%S') + # 2: store_dir must exist, else create directory. 
+ if getattr(args, "store_dir") is not None: + if not os.path.exists(os.path.abspath(getattr(args, "store_dir"))): + logger.warn("Error: args --store_dir [{0}] incorrect: No such directory, Now create it".format(os.path.abspath(getattr(args, "store_dir")))) + os.makedirs(os.path.abspath(getattr(args, "store_dir"))) + self.gather_pack_dir = os.path.abspath(getattr(args, "store_dir")) + if getattr(args, "scene") is not None: + self.scene = ' '.join(getattr(args, "scene")) + else: + return False + if getattr(args, "env") is not None: + env_dict = parse_custom_env_string(getattr(args, "env")) + self.env = env_dict + return True + + def __print_result(self): + print(Fore.YELLOW + "\nGather scene results stored in this directory: {0}\n".format(self.report_path) + Style.RESET_ALL) diff --git a/handler/gather/gather_sysstat.py b/handler/gather/gather_sysstat.py index c8a2c264..39c6167d 100644 --- a/handler/gather/gather_sysstat.py +++ b/handler/gather/gather_sysstat.py @@ -16,7 +16,6 @@ @desc: """ import os -import threading import time import datetime @@ -24,10 +23,9 @@ import uuid from common.logger import logger -from common.obdiag_exception import OBDIAGInvalidArgs from common.constant import const from common.command import LocalClient, SshClient -from common.command import get_file_size, download_file, mkdir, zip_dir, delete_file_force +from common.command import get_file_size, download_file, mkdir, zip_dir from handler.base_shell_handler import BaseShellHandler from utils.file_utils import mkdir_if_not_exist, size_format, write_result_append_to_file, parse_size from utils.shell_utils import SshHelper @@ -36,7 +34,7 @@ class GatherOsInfoHandler(BaseShellHandler): - def __init__(self, nodes, gather_pack_dir, gather_timestamp, common_config): + def __init__(self, nodes, gather_pack_dir, gather_timestamp=None, common_config=None, is_scene=False): super(GatherOsInfoHandler, self).__init__(nodes) for node in nodes: if node.get("ssh_type") == "docker": @@ -46,9 +44,10 @@ def __init__(self, nodes, gather_pack_dir, gather_timestamp, common_config): self.gather_timestamp = gather_timestamp self.local_stored_path = gather_pack_dir self.remote_stored_path = None + self.is_scene = is_scene self.config_path = const.DEFAULT_CONFIG_PATH if common_config is None: - self.file_size_limit = 2 * 1024 * 1024 + self.file_size_limit = 2 * 1024 * 1024 * 1024 else: self.file_size_limit = int(parse_size(common_config["file_size_limit"])) @@ -57,13 +56,10 @@ def handle(self, args): if not self.__check_valid_args(args): return - # if user indicates the store_dir, use it, otherwise use the dir in the config(default) - if args.store_dir is not None: - self.local_stored_path = os.path.abspath(args.store_dir) - - pack_dir_this_command = os.path.join(self.local_stored_path, - "gather_pack_{0}".format(timestamp_to_filename_time( - self.gather_timestamp))) + if self.is_scene: + pack_dir_this_command = self.local_stored_path + else: + pack_dir_this_command = os.path.join(self.local_stored_path,"gather_pack_{0}".format(timestamp_to_filename_time(self.gather_timestamp))) logger.info("Use {0} as pack dir.".format(pack_dir_this_command)) gather_tuples = [] @@ -160,9 +156,14 @@ def __gather_dmesg_current_info(self, ssh_helper, gather_path): def __gather_dmesg_boot_info(self, ssh_helper, dir_path): try: - dmesg_cmd = 'cp --force /var/log/dmesg {dir_path}/dmesg.boot'.format(dir_path=dir_path) - logger.info("gather dmesg boot info on server {0}, run cmd = [{1}]".format(ssh_helper.get_name(), dmesg_cmd)) - SshClient().run(ssh_helper, 
dmesg_cmd) if self.is_ssh else LocalClient().run(dmesg_cmd) + file_exit_cmd = "ls -l {file_path} 2>/dev/null".format(file_path="/var/log/dmesg") + file_exit = SshClient().run(ssh_helper, file_exit_cmd) if self.is_ssh else LocalClient().run(file_exit_cmd) + if file_exit: + dmesg_cmd = 'cp --force /var/log/dmesg {dir_path}/dmesg.boot'.format(dir_path=dir_path) + logger.info("gather dmesg boot info on server {0}, run cmd = [{1}]".format(ssh_helper.get_name(), dmesg_cmd)) + SshClient().run(ssh_helper, dmesg_cmd) if self.is_ssh else LocalClient().run(dmesg_cmd) + else: + logger.warn("the file /var/log/dmesg on server {0} not found ".format(ssh_helper.get_name())) except: logger.error("Failed to gather the /var/log/dmesg on server {0}".format(ssh_helper.get_name())) @@ -184,19 +185,19 @@ def __gather_mem_info(self, ssh_helper, gather_path): except: logger.error("Failed to gather memory info use tsar on server {0}".format(ssh_helper.get_name())) - @staticmethod - def __check_valid_args(args): + + def __check_valid_args(self, args): """ chech whether command args are valid. If invalid, stop processing and print the error to the user :param args: command args :return: boolean. True if valid, False if invalid. """ - # 1: store_dir must exist, else return "No such file or directory". + # 1: store_dir must exist, else create directory. if getattr(args, "store_dir") is not None: if not os.path.exists(os.path.abspath(getattr(args, "store_dir"))): - logger.error("Error: args --store_dir [{0}] incorrect: No such directory." - .format(os.path.abspath(getattr(args, "store_dir")))) - return False + logger.warn("Error: args --store_dir [{0}] incorrect: No such directory, Now create it".format(os.path.abspath(getattr(args, "store_dir")))) + os.makedirs(os.path.abspath(getattr(args, "store_dir"))) + self.local_stored_path = os.path.abspath(getattr(args, "store_dir")) return True @staticmethod diff --git a/handler/gather/scenes/__init__.py b/handler/gather/scenes/__init__.py new file mode 100644 index 00000000..40982804 --- /dev/null +++ b/handler/gather/scenes/__init__.py @@ -0,0 +1,17 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -* +# Copyright (c) 2022 OceanBase +# OceanBase Diagnostic Tool is licensed under Mulan PSL v2. +# You can use this software according to the terms and conditions of the Mulan PSL v2. +# You may obtain a copy of Mulan PSL v2 at: +# http://license.coscl.org.cn/MulanPSL2 +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, +# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, +# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +# See the Mulan PSL v2 for more details. + +""" +@time: 2023/12/26 +@file: __init__.py +@desc: +""" \ No newline at end of file diff --git a/handler/gather/scenes/base.py b/handler/gather/scenes/base.py new file mode 100644 index 00000000..a8e15a54 --- /dev/null +++ b/handler/gather/scenes/base.py @@ -0,0 +1,104 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -* +# Copyright (c) 2022 OceanBase +# OceanBase Diagnostic Tool is licensed under Mulan PSL v2. +# You can use this software according to the terms and conditions of the Mulan PSL v2. +# You may obtain a copy of Mulan PSL v2 at: +# http://license.coscl.org.cn/MulanPSL2 +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, +# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, +# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +# See the Mulan PSL v2 for more details. 
+ +""" +@time: 2024/1/10 +@file: base.py +@desc: +""" + +from common.logger import logger +from common.scene import filter_by_version +from handler.gather.step.base import Base +from utils.utils import node_cut_passwd_for_log +from handler.gather.scenes.sql_problem import SQLProblemScene +from handler.gather.scenes.cpu_high import CPUHighScene + + +class SceneBase(object): + def __init__(self, scene, obproxy_nodes, ob_nodes, cluster, report_dir=None, scene_variable_dict={}, args=None, env={}, mode="yaml", task_type="observer"): + self.scene_variable_dict = scene_variable_dict + self.scene = scene + self.cluster = cluster + self.ob_nodes = ob_nodes + self.obproxy_nodes = obproxy_nodes + self.report_dir = report_dir + self.args = args + self.mode = mode + self.env = env + self.task_type = task_type + + def execute(self): + try: + if self.mode == "yaml": + if self.task_type == "observer": + self.__execute_yaml_mode(self.ob_nodes) + elif self.task_type == "obproxy": + self.__execute_yaml_mode(self.obproxy_nodes) + elif self.task_type == "other": + self.__execute_yaml_mode(self.ob_nodes) + self.__execute_yaml_mode(self.obproxy_nodes) + elif self.mode == "code": + self.__execute_code_mode() + else: + logger.error("Unsupported mode. SKIP") + raise Exception("Unsupported mode. SKIP") + except Exception as e: + raise Exception("execute failed, error: {0}".format(e)) + + def __execute_yaml_mode(self, nodes): + steps_nu = filter_by_version(self.scene, self.cluster) + if steps_nu < 0: + logger.warning("Unadapted by version. SKIP") + return "Unadapted by version.SKIP" + logger.info("filter_by_version is return {0}".format(steps_nu)) + if len(nodes)==0: + logger.error("node is not exist") + return + node_number = 0 + for node in nodes: + logger.info("run scene in node: {0}".format(node_cut_passwd_for_log(node))) + steps = self.scene[steps_nu] + nu = 1 + node_number = node_number + 1 + for step in steps["steps"]: + try: + logger.debug("step nu: {0}".format(nu)) + if len(self.cluster)==0: + logger.error("cluster is not exist") + return + step_run = Base(step, node, self.cluster, self.report_dir, self.scene_variable_dict, self.args, self.env, node_number) + logger.info("step nu: {0} initted, to execute".format(nu)) + step_run.execute() + self.scene_variable_dict = step_run.update_task_variable_dict() + except Exception as e: + logger.error("SceneBase execute Exception: {0}".format(e)) + return + logger.info("step nu: {0} execute end ".format(nu)) + nu = nu + 1 + logger.info("scene execute end") + + def __execute_code_mode(self): + if self.scene["name"] == "observer.perf_sql" or self.scene["name"] == "observer.sql_err": + scene = SQLProblemScene(self.scene["name"], self.ob_nodes, self.obproxy_nodes, self.cluster, self.report_dir, self.scene_variable_dict, self.args, self.env) + elif self.scene["name"] == "observer.cpu_high": + scene = CPUHighScene(self.ob_nodes, self.cluster, self.report_dir, self.scene_variable_dict, self.args, self.env) + else: + logger.error("unsupported hard code scene {0}".format(self.scene["name"])) + return + try: + logger.info("hard code scene {0} execute start".format(self.scene["name"])) + scene.execute() + logger.info("hard code scene {0} execute end".format(self.scene["name"])) + except Exception as e: + logger.error("hard code scene execute failed, error :{0}".format(e)) + diff --git a/handler/gather/scenes/cpu_high.py b/handler/gather/scenes/cpu_high.py new file mode 100644 index 00000000..ae7fce0e --- /dev/null +++ b/handler/gather/scenes/cpu_high.py @@ -0,0 +1,76 @@ 
+#!/usr/bin/env python +# -*- coding: UTF-8 -* +# Copyright (c) 2022 OceanBase +# OceanBase Diagnostic Tool is licensed under Mulan PSL v2. +# You can use this software according to the terms and conditions of the Mulan PSL v2. +# You may obtain a copy of Mulan PSL v2 at: +# http://license.coscl.org.cn/MulanPSL2 +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, +# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, +# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +# See the Mulan PSL v2 for more details. + +""" +@time: 2024/01/17 +@file: cpu_high.py +@desc: +""" +import os +from utils.shell_utils import SshHelper +from common.logger import logger +from handler.gather.gather_obstack2 import GatherObstack2Handler +from handler.gather.gather_perf import GatherPerfHandler +from utils.parser_utils import ParserAction + +class CPUHighScene(object): + def __init__(self, nodes, cluster, report_path, task_variable_dict=None, args=None, env={}): + if task_variable_dict is None: + self.task_variable_dict = {} + else: + self.task_variable_dict = task_variable_dict + self.nodes = nodes + self.cluster = cluster + self.report_path = report_path + self.args = args + self.env = env + self.is_ssh = True + + def execute(self): + self.__gather_obstack() + self.__gather_perf() + self.__gather_current_clocksource() + + def __gather_obstack(self): + logger.info("gather obstack start") + obstack = GatherObstack2Handler(nodes=self.nodes, gather_pack_dir=self.report_path, is_scene=True) + obstack.handle(self.args) + logger.info("gather obstack end") + + def __gather_perf(self): + logger.info("gather perf start") + perf = GatherPerfHandler(nodes=self.nodes, gather_pack_dir=self.report_path, is_scene=True) + self.args = ParserAction.add_attribute_to_namespace(self.args, 'scope', "all") + perf.handle(self.args) + logger.info("gather perf end") + + def __gather_current_clocksource(self): + try: + logger.info("gather current_clocksource start") + for node in self.nodes: + ssh_helper = SshHelper(self.is_ssh, node.get("ip"), node.get("user"), node.get("password"), node.get("port"), node.get("private_key"), node) + cmd = 'cat /sys/devices/system/clocksource/clocksource0/current_clocksource' + logger.info("gather current_clocksource, run cmd = [{0}]".format(cmd)) + result = ssh_helper.ssh_exec_cmd(cmd) + file_path = os.path.join(self.report_path, "current_clocksource_{ip}_result.txt".format(ip=str(node.get("ip")).replace('.', '_'))) + self.report(file_path, cmd, result) + logger.info("gather current_clocksource end") + except Exception as e: + logger.error("SshHandler init failed. Please check the node conf. Exception: {0}".format(e)) + + def report(self, file_path, command, data): + try: + with open(file_path, 'a', encoding='utf-8') as f: + f.write('\n\n' + 'shell > ' + command + '\n') + f.write(data + '\n') + except Exception as e: + logger.error("report shell result to file: {0} failed, error: {1}".format(file_path, e)) diff --git a/handler/gather/scenes/list.py b/handler/gather/scenes/list.py new file mode 100644 index 00000000..86dbf2d4 --- /dev/null +++ b/handler/gather/scenes/list.py @@ -0,0 +1,127 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -* +# Copyright (c) 2022 OceanBase +# OceanBase Diagnostic Tool is licensed under Mulan PSL v2. +# You can use this software according to the terms and conditions of the Mulan PSL v2. 
+# You may obtain a copy of Mulan PSL v2 at: +# http://license.coscl.org.cn/MulanPSL2 +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, +# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, +# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +# See the Mulan PSL v2 for more details. + +""" +@time: 2024/01/10 +@file: list.py +@desc: +""" + +import os +from common.logger import logger +from utils.yaml_utils import read_yaml_data +from handler.gather.scenes.register import hardcode_scene_list +from utils.print_utils import print_scene, print_title + +class GatherScenesListHandler: + def __init__(self, yaml_tasks_base_path="./handler/gather/tasks/"): + self.observer_tasks = {} + self.obproxy_tasks = {} + self.other_tasks = {} + self.yaml_tasks_base_path = yaml_tasks_base_path + base_path = os.path.expanduser(yaml_tasks_base_path) + if os.path.exists(base_path): + self.yaml_tasks_base_path = base_path + else: + logger.error("Failed to find yaml task path: {0}".format(base_path)) + + def handle(self, args): + logger.debug("list gather scene") + self.get_all_yaml_tasks() + self.get_all_code_tasks() + logger.debug("len of observer_tasks: {0}; len of obproxy_tasks: {1}; len of other_tasks: {2};".format(len(self.observer_tasks), len(self.obproxy_tasks), len(self.other_tasks))) + if (len(self.observer_tasks) + len(self.obproxy_tasks) + len(self.other_tasks)) == 0: + logger.error("Failed to find any tasks") + else: + self.print_scene_data() + + def get_all_yaml_tasks(self): + try: + current_path = self.yaml_tasks_base_path + for root, dirs, files in os.walk(current_path): + for file in files: + if file.endswith('.yaml'): + folder_name = os.path.basename(root) + task_name = "{}.{}".format(folder_name, file.split('.')[0]) + task_data = read_yaml_data(os.path.join(root, file)) + task_data["name"] = task_name + if folder_name == "observer": + self.observer_tasks[task_name] = task_data + elif folder_name == "obproxy": + self.obproxy_tasks[task_name] = task_data + else: + self.other_tasks[task_name] = task_data + except Exception as e: + logger.error("get all yaml task failed, error: {0}".format(e)) + + def get_all_code_tasks(self): + try: + for scene in hardcode_scene_list: + if "observer" in scene.name: + self.observer_tasks[scene.name] = self.__get_hardcode_task(scene) + elif "obproxy" in scene.name: + self.obproxy_tasks[scene.name] = self.__get_hardcode_task(scene) + else: + self.other_tasks[scene.name] = self.__get_hardcode_task(scene) + except Exception as e: + logger.error("get all hard code task failed, error: {0}".format(e)) + + def __get_hardcode_task(self, scene): + return {"name": scene.name, "command": scene.command, "info_en": scene.info_en, "info_cn": scene.info_cn,} + + def get_one_yaml_task(self, name): + try: + task_data = None + current_path = self.yaml_tasks_base_path + for root, dirs, files in os.walk(current_path): + for file in files: + if file.endswith('.yaml'): + folder_name = os.path.basename(root) + task_name = "{}.{}".format(folder_name, file.split('.')[0]) + if name == task_name: + task_data = read_yaml_data(os.path.join(root, file)) + task_data["name"] = task_name + return task_data + except Exception as e: + logger.error("get one yaml task failed, error: {0}".format(e)) + + def is_code_task(self, name): + try: + for scene in hardcode_scene_list: + if scene.name == name: + return True + return False + except Exception as e: + logger.error("check code task failed, error: {0}".format(e)) + return False + + def print_scene_data(self): 
sorted_observer_tasks_dict = {} + sorted_obproxy_tasks_dict = {} + sorted_other_tasks_dict = {} + if self.other_tasks: + sorted_other_tasks = sorted(self.other_tasks.items(), key=lambda x: x[0]) + sorted_other_tasks_dict = {k: v for k, v in sorted_other_tasks} + print_title("Other Problem Gather Scenes") + print_scene(sorted_other_tasks_dict) + if self.obproxy_tasks: + sorted_obproxy_tasks = sorted(self.obproxy_tasks.items(), key=lambda x: x[0]) + sorted_obproxy_tasks_dict = {k: v for k, v in sorted_obproxy_tasks} + print_title("Obproxy Problem Gather Scenes") + print_scene(sorted_obproxy_tasks_dict) + if self.observer_tasks: + sorted_observer_tasks = sorted(self.observer_tasks.items(), key=lambda x: x[0]) + sorted_observer_tasks_dict = {k: v for k, v in sorted_observer_tasks} + print_title("Observer Problem Gather Scenes") + print_scene(sorted_observer_tasks_dict) + + \ No newline at end of file diff --git a/handler/gather/scenes/register.py b/handler/gather/scenes/register.py new file mode 100644 index 00000000..9ae68fc7 --- /dev/null +++ b/handler/gather/scenes/register.py @@ -0,0 +1,46 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -* +# Copyright (c) 2022 OceanBase +# OceanBase Diagnostic Tool is licensed under Mulan PSL v2. +# You can use this software according to the terms and conditions of the Mulan PSL v2. +# You may obtain a copy of Mulan PSL v2 at: +# http://license.coscl.org.cn/MulanPSL2 +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, +# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, +# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +# See the Mulan PSL v2 for more details. + +""" +@time: 2024/01/10 +@file: register.py +@desc: +""" + +from dataclasses import dataclass + +@dataclass +class RegisteredHardCodeScene: + name: str + command: str + info_en: str + info_cn: str + +# Complex scenes that are not suited to yaml orchestration can be registered with this class; after registration, the gather logic is implemented in code. +db_connect = '-hxx -Pxx -uxx -pxx -Dxx' +trace_id = 'xx' + +hardcode_scene_list = [ + RegisteredHardCodeScene( + 'observer.perf_sql', + f'''obdiag gather scene run --scene=observer.perf_sql --env "{{db_connect='{db_connect}', trace_id='{trace_id}'}}"''', + '[SQL performance problem]', + '[SQL性能问题]' + ), + RegisteredHardCodeScene( + 'observer.sql_err', + f'''obdiag gather scene run --scene=observer.sql_err --env "{{db_connect='{db_connect}', trace_id='{trace_id}'}}"''', + '[SQL execution error]', + '[SQL 执行出错]' + ), + RegisteredHardCodeScene('observer.cpu_high', 'obdiag gather scene run --scene=observer.cpu_high', '[High CPU]', '[CPU高]'), +] diff --git a/handler/gather/scenes/sql_problem.py b/handler/gather/scenes/sql_problem.py new file mode 100644 index 00000000..a80bf33b --- /dev/null +++ b/handler/gather/scenes/sql_problem.py @@ -0,0 +1,103 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -* +# Copyright (c) 2022 OceanBase +# OceanBase Diagnostic Tool is licensed under Mulan PSL v2. +# You can use this software according to the terms and conditions of the Mulan PSL v2. +# You may obtain a copy of Mulan PSL v2 at: +# http://license.coscl.org.cn/MulanPSL2 +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, +# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, +# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +# See the Mulan PSL v2 for more details. 
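Adding another hard-coded scene to register.py is then a single append against the dataclass above; a hypothetical example (observer.io_high is invented here for illustration and does not exist in this patch):

    hardcode_scene_list.append(
        RegisteredHardCodeScene(
            'observer.io_high',  # hypothetical scene name
            'obdiag gather scene run --scene=observer.io_high',
            '[High IO]',
            '[IO高]'
        )
    )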
+ +""" +@time: 2024/01/17 +@file: sql_problem.py +@desc: +""" + +from common.logger import logger +from utils.parser_utils import ParserAction +from handler.gather.gather_log import GatherLogHandler +from handler.gather.gather_obproxy_log import GatherObProxyLogHandler +from handler.gather.gather_plan_monitor import GatherPlanMonitorHandler +from utils.string_utils import parse_mysql_cli_connection_string + + +class SQLProblemScene(object): + def __init__(self, scene_name, ob_nodes, obproxy_nodes, cluster, report_path, task_variable_dict=None, args=None, env={}): + if task_variable_dict is None: + self.task_variable_dict = {} + else: + self.task_variable_dict = task_variable_dict + self.ob_nodes = ob_nodes + self.obproxy_nodes = obproxy_nodes + self.cluster = cluster + self.report_path = report_path + self.args = args + self.env = env + self.is_ssh = True + self.scene_name = scene_name + self.db_conn = {} + self.trace_id = "FAKE_TRACE_ID" + + def execute(self): + self.__parse_env() + self.__gather_log() + self.__gather_obproxy_log() + self.__gather_sql_info() + + def __gather_log(self): + try: + logger.info("gather observer log start") + handler = GatherLogHandler(nodes=self.ob_nodes, gather_pack_dir=self.report_path, is_scene=True) + self.args = ParserAction.add_attribute_to_namespace(self.args, 'grep', "") + handler.handle(self.args) + logger.info("gather observer log end") + except Exception as e: + logger.error("gather observer log failed, error: {0}".format(e)) + raise Exception("gather observer log failed, error: {0}".format(e)) + + def __gather_obproxy_log(self): + try: + logger.info("gather obproxy log start") + handler = GatherObProxyLogHandler(nodes=self.obproxy_nodes, gather_pack_dir=self.report_path, is_scene=True) + if self.scene_name: + if self.scene_name == "observer.sql_err": + self.args = ParserAction.add_attribute_to_namespace(self.args, 'grep', None) + elif self.scene_name == "observer.perf_sql": + self.args = ParserAction.add_attribute_to_namespace(self.args, 'grep', self.trace_id) + else: + logger.warn("unsupported scene {0}".format(self.scene_name)) + return + self.args = ParserAction.add_attribute_to_namespace(self.args, 'scope', "all") + self.args = ParserAction.add_attribute_to_namespace(self.args, 'encrypt', "false") + handler.handle(self.args) + logger.info("gather obproxy log end") + else: + logger.warn("scene is None") + return + except Exception as e: + logger.error("gather obproxy log failed, error: {0}".format(e)) + raise Exception("gather obproxy log failed, error: {0}".format(e)) + + def __gather_sql_info(self): + try: + logger.info("gather sql info start") + handler = GatherPlanMonitorHandler(ob_cluster=self.cluster, gather_pack_dir=self.report_path, db_conn=self.db_conn, is_scene=True) + self.args = ParserAction.add_attribute_to_namespace(self.args, 'trace_id', self.trace_id) + handler.handle(self.args) + logger.info("gather sql info end") + except Exception as e: + logger.error("gather sql info failed, error: {0}".format(e)) + raise Exception("gather sql info failed, error: {0}".format(e)) + + def report(self): + pass + + def __parse_env(self): + cli_connection_string = self.env.get("db_connect") + self.db_conn = parse_mysql_cli_connection_string(cli_connection_string) + trace_id = self.env.get("trace_id") + if trace_id: + self.trace_id = self.env.get("trace_id") diff --git a/handler/gather/step/__init__.py b/handler/gather/step/__init__.py new file mode 100644 index 00000000..4dea303d --- /dev/null +++ b/handler/gather/step/__init__.py @@ -0,0 +1,17 @@ 
+#!/usr/bin/env python +# -*- coding: UTF-8 -* +# Copyright (c) 2022 OceanBase +# OceanBase Diagnostic Tool is licensed under Mulan PSL v2. +# You can use this software according to the terms and conditions of the Mulan PSL v2. +# You may obtain a copy of Mulan PSL v2 at: +# http://license.coscl.org.cn/MulanPSL2 +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, +# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, +# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +# See the Mulan PSL v2 for more details. + +""" +@time: 2024/01/05 +@file: __init__.py +@desc: +""" \ No newline at end of file diff --git a/handler/gather/step/base.py b/handler/gather/step/base.py new file mode 100644 index 00000000..0a2a9f10 --- /dev/null +++ b/handler/gather/step/base.py @@ -0,0 +1,99 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -* +# Copyright (c) 2022 OceanBase +# OceanBase Diagnostic Tool is licensed under Mulan PSL v2. +# You can use this software according to the terms and conditions of the Mulan PSL v2. +# You may obtain a copy of Mulan PSL v2 at: +# http://license.coscl.org.cn/MulanPSL2 +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, +# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, +# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +# See the Mulan PSL v2 for more details. + +""" +@time: 2024/01/05 +@file: base.py +@desc: +""" +import docker +from handler.gather.step.ssh import SshHandler +from handler.gather.step.sql import StepSQLHandler +from common.logger import logger +from handler.gather.gather_log import GatherLogHandler +from handler.gather.gather_obproxy_log import GatherObProxyLogHandler +from handler.gather.gather_sysstat import GatherOsInfoHandler +from utils.parser_utils import ParserAction + +class Base(object): + def __init__(self, step, node, cluster, report_path, task_variable_dict=None, args=None, env={}, node_number = 1): + if task_variable_dict is None: + self.task_variable_dict = {} + else: + self.task_variable_dict = task_variable_dict + self.step = step + self.node = node + self.cluster = cluster + self.report_path = report_path + self.args = args + self.env = env + self.node_number = node_number + + def execute(self): + logger.debug("step: {0}".format(self.step)) + no_cluster_name_msg = "(Please set ob_cluster_name or obproxy_cluster_name)" + try: + if "ip" in self.node: + self.task_variable_dict["remote_ip"] = self.node["ip"] + elif "ssh_type" in self.node and self.node["ssh_type"]=="docker": + logger.debug("execute ssh_type is docker") + self.task_variable_dict["remote_ip"] = docker.from_env().containers.get(self.node["container_name"]).attrs['NetworkSettings']['Networks']['bridge']["IPAddress"] + self.task_variable_dict["remote_home_path"] = self.node["home_path"] + + if "type" not in self.step: + logger.error("Missing field: type") + if (self.node_number > 1) and self.step.get("global") and (self.step.get("global") == "true"): + logger.info("step is marked global=true and already ran on the first node, skipping gather on node {0}".format(self.node_number)) + else: + if self.step["type"] == "ssh": + handler = SshHandler(self.step, self.node, self.report_path, self.task_variable_dict) + handler.execute() + elif self.step["type"] == "sql": + handler = StepSQLHandler(self.step, self.cluster, self.report_path, self.task_variable_dict) + handler.execute() + elif self.step["type"] == "log": + if self.node.get("host_type") and self.node.get("host_type") == 
"OBSERVER": + handler = GatherLogHandler(nodes=[self.node], gather_pack_dir=self.report_path, is_scene=True) + if self.step.get("grep") is None or len(self.step.get("grep")) == 0: + self.args = ParserAction.add_attribute_to_namespace(self.args, 'grep', None) + else: + self.args = ParserAction.add_attribute_to_namespace(self.args, 'grep', self.step.get("grep")) + handler.handle(self.args) + else: + logger.info("node host_type is {0} not OBSERVER, skipping gather log".format(self.node.get("host_type"))) + elif self.step["type"] == "obproxy_log": + if self.node.get("host_type") and self.node.get("host_type") == "OBPROXY": + handler = GatherObProxyLogHandler(nodes=[self.node], gather_pack_dir=self.report_path, is_scene=True) + if self.step.get("grep") is None or len(self.step.get("grep")) == 0: + self.args = ParserAction.add_attribute_to_namespace(self.args, 'grep', None) + else: + self.args = ParserAction.add_attribute_to_namespace(self.args, 'grep', self.step.get("grep")) + self.args = ParserAction.add_attribute_to_namespace(self.args, 'scope', 'all') + self.args = ParserAction.add_attribute_to_namespace(self.args, 'encrypt', 'false') + handler.handle(self.args) + else: + logger.info("node host_type is {0} not OBPROXY, skipping gather log".format(self.node.get("host_type"))) + elif self.step["type"] == "sysstat": + handler = GatherOsInfoHandler(nodes=[self.node], gather_pack_dir=self.report_path, is_scene=True) + handler.handle(self.args) + else: + logger.error("the type not support: {0}" .format(self.step["type"])) + except Exception as e: + logger.error("StepBase handler.execute fail, error: {0}".format(e)) + if self.step["type"] == "sql": + logger.error("[cluster:{0}] {1}]".format(self.cluster.get("ob_cluster_name") or self.cluster.get("obproxy_cluster_name") or no_cluster_name_msg, e)) + else: + logger.error("[{0}:{1}] {2}]".format(self.node.get("ssh_type") or "", self.node.get("container_name") or self.task_variable_dict.get("remote_ip") or "", e)) + logger.error("StepBase handler.execute fail, error: {0}".format(e)) + + def update_task_variable_dict(self): + return self.task_variable_dict diff --git a/handler/gather/step/sql.py b/handler/gather/step/sql.py new file mode 100644 index 00000000..f9d89f41 --- /dev/null +++ b/handler/gather/step/sql.py @@ -0,0 +1,74 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -* +# Copyright (c) 2022 OceanBase +# OceanBase Diagnostic Tool is licensed under Mulan PSL v2. +# You can use this software according to the terms and conditions of the Mulan PSL v2. +# You may obtain a copy of Mulan PSL v2 at: +# http://license.coscl.org.cn/MulanPSL2 +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, +# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, +# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +# See the Mulan PSL v2 for more details. 
+ +""" +@time: 2024/01/04 +@file: sql.py +@desc: +""" +import os +from common.logger import logger +from common.ob_connector import OBConnector +from tabulate import tabulate +from utils.utils import build_str_on_expr_by_dict_2, convert_to_number + + +class StepSQLHandler: + def __init__(self, step, ob_cluster, report_path, task_variable_dict): + try: + self.ob_cluster = ob_cluster + self.ob_cluster_name = ob_cluster.get("cluster_name") + self.tenant_mode = None + self.sys_database = None + self.database = None + self.ob_connector = OBConnector(ip=ob_cluster.get("db_host"), + port=ob_cluster.get("db_port"), + username=ob_cluster.get("tenant_sys").get("user"), + password=ob_cluster.get("tenant_sys").get("password"), + timeout=10000) + except Exception as e: + logger.error("StepSQLHandler init fail. Please check the OBCLUSTER conf. OBCLUSTER: {0} Exception : {1} .".format(ob_cluster,e)) + self.task_variable_dict = task_variable_dict + self.enable_dump_db = False + self.enable_fast_dump = False + self.ob_major_version = None + self.step = step + self.report_path = report_path + self.report_file_path = os.path.join(self.report_path, "sql_result.txt") + + def execute(self): + try: + if "sql" not in self.step: + logger.error("StepSQLHandler execute sql is not set") + return + sql = build_str_on_expr_by_dict_2(self.step["sql"], self.task_variable_dict) + logger.info("StepSQLHandler execute: {0}".format(sql)) + columns, data = self.ob_connector.execute_sql_return_columns_and_data(sql) + if data is None or len(data) == 0: + logger.warning("excute sql: {0}, result is None".format(sql)) + else: + self.report(sql, columns, data) + except Exception as e: + logger.error("StepSQLHandler execute Exception: {0}".format(e).strip()) + + def update_step_variable_dict(self): + return self.task_variable_dict + + def report(self, sql, column_names, data): + try: + table_data = [list(row) for row in data] + formatted_table = tabulate(table_data, headers=column_names, tablefmt="grid") + with open(self.report_file_path, 'a', encoding='utf-8') as f: + f.write('\n\n' + 'obclient > ' + sql + '\n') + f.write(formatted_table) + except Exception as e: + logger.error("report sql result to file: {0} failed, error: ".format(self.report_file_path)) \ No newline at end of file diff --git a/handler/gather/step/ssh.py b/handler/gather/step/ssh.py new file mode 100644 index 00000000..41ec26f2 --- /dev/null +++ b/handler/gather/step/ssh.py @@ -0,0 +1,68 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -* +# Copyright (c) 2022 OceanBase +# OceanBase Diagnostic Tool is licensed under Mulan PSL v2. +# You can use this software according to the terms and conditions of the Mulan PSL v2. +# You may obtain a copy of Mulan PSL v2 at: +# http://license.coscl.org.cn/MulanPSL2 +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, +# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, +# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +# See the Mulan PSL v2 for more details. 
+ +""" +@time: 2024/01/04 +@file: ssh.py +@desc: +""" +import os +from utils.shell_utils import SshHelper +from common.logger import logger +from utils.utils import build_str_on_expr_by_dict_2 + + +class SshHandler: + def __init__(self, step, node, report_path, task_variable_dict): + self.ssh_report_value = None + self.parameters = None + self.step = step + self.node = node + self.report_path = report_path + try: + is_ssh = True + self.ssh_helper = SshHelper(is_ssh, node.get("ip"), node.get("user"), node.get("password"), node.get("port"), node.get("private_key"), node) + except Exception as e: + logger.error("SshHandler init fail. Please check the NODES conf. node: {0}. Exception : {1} .".format(node, e)) + self.task_variable_dict = task_variable_dict + self.parameter = [] + self.report_file_path = os.path.join(self.report_path, "shell_result.txt") + + def execute(self): + try: + if "ssh" not in self.step: + logger.error("SshHandler execute ssh is not set") + return + ssh_cmd = build_str_on_expr_by_dict_2(self.step["ssh"], self.task_variable_dict) + logger.info("step SshHandler execute :{0} ".format(ssh_cmd)) + ssh_report_value = self.ssh_helper.ssh_exec_cmd(ssh_cmd) + if ssh_report_value is None: + ssh_report_value = "" + if len(ssh_report_value) > 0: + ssh_report_value = ssh_report_value.strip() + self.report(ssh_cmd, ssh_report_value) + except Exception as e: + logger.error("ssh execute Exception:{0}".format(e).strip()) + finally: + self.ssh_helper.ssh_close() + logger.debug("gather step SshHandler ssh_report_value:{0}".format(ssh_report_value)) + + def update_step_variable_dict(self): + return self.task_variable_dict + + def report(self, command, data): + try: + with open(self.report_file_path, 'a', encoding='utf-8') as f: + f.write('\n\n' + 'shell > ' + command + '\n') + f.write(data + '\n') + except Exception as e: + logger.error("report sql result to file: {0} failed, error: ".format(self.report_file_path)) diff --git a/handler/gather/tasks/obproxy/restart.yaml b/handler/gather/tasks/obproxy/restart.yaml new file mode 100644 index 00000000..65725159 --- /dev/null +++ b/handler/gather/tasks/obproxy/restart.yaml @@ -0,0 +1,18 @@ +info_en: "[obproxy restart]" +info_cn: "[obproxy无故重启]" +command: obdiag gather scene run --scene=obproxy.restart +task: + - version: "[2.0.0.0, *]" + steps: + - type: ssh + ssh: "ps -ef | grep obproxy" + global: false + - type: ssh + ssh: "cat /proc/sys/kernel/core_pattern" + global: false + - type: ssh + ssh: "ls -lhrt ${obproxy_data_dir}" + global: false + - type: obproxy_log + grep: "" + global: false diff --git a/handler/gather/tasks/observer/backup.yaml b/handler/gather/tasks/observer/backup.yaml new file mode 100644 index 00000000..f34162b0 --- /dev/null +++ b/handler/gather/tasks/observer/backup.yaml @@ -0,0 +1,107 @@ +info_en: "[backup problem]" +info_cn: "[数据备份问题]" +command: obdiag gather scene run --scene=observer.backup +task: + - version: "[2.0.0.0, 4.0.0.0]" + steps: + - type: sql + sql: "show variables like 'version_comment';" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.v$ob_cluster" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.__all_zone WHERE name='idc';" + global: true + - type: sql + sql: "SELECT id,svr_ip,svr_port,zone,inner_port,with_rootserver,status,start_service_time,last_offline_time FROM oceanbase.__all_server;" + global: true + - type: sql + sql: "SELECT zone, concat(svr_ip, ':', svr_port) observer, cpu_capacity, cpu_total, cpu_assigned, cpu_assigned_percent, mem_capacity, mem_total, mem_assigned, 
mem_assigned_percent, unit_Num, round(`load`, 2) `load`, round(cpu_weight, 2) cpu_weight, round(memory_weight, 2) mem_weight, leader_count FROM oceanbase.__all_virtual_server_stat ORDER BY zone,svr_ip;" + global: true + - type: sql + sql: "select * from oceanbase.__all_virtual_backup_task;" + global: true + - type: sql + sql: "select * from oceanbase.__all_virtual_backup_info;" + global: true + - type: sql + sql: "select * from oceanbase.__all_virtual_sys_task_status where comment like '%backup%';" + global: true + - type: sql + sql: "select count(*),status from oceanbase.__all_virtual_pg_backup_task group by status;" + global: true + - type: sql + sql: "select svr_ip, log_archive_status, count(*) from oceanbase.__all_virtual_pg_backup_log_archive_status group by svr_ip, log_archive_status;" + global: true + - type: sql + sql: "select * from oceanbase.__all_rootservice_event_history where gmt_create > ${from_time} and gmt_create < ${to_time} order by gmt_create desc;" + global: true + - type: sql + sql: "select b.* from oceanbase.__all_virtual_pg_backup_log_archive_status a,oceanbase.__all_virtual_pg_log_archive_stat b where a.table_id=b.table_id and a.partition_id=b.partition_id order by log_archive_cur_ts limit 5;" + global: true + - type: log + global: false + grep: "" + - type: sysstat + global: false + sysstat: "" + - version: "[4.0.0.0, *]" + steps: + - type: sql + sql: "show variables like 'version_comment';" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_ZONES;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_SERVERS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.GV$OB_SERVERS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_UNIT_CONFIGS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_RESOURCE_POOLS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_TENANTS;" + global: true + - type: sql + sql: "SELECT c.TENANT_ID, e.TENANT_NAME, concat(c.NAME, ': ', d.NAME) `pool:conf`,concat(c.UNIT_COUNT, ' unit: ', d.min_cpu, 'C/', ROUND(d.MEMORY_SIZE/1024/1024/1024,0), 'G') unit_info FROM oceanbase.DBA_OB_RESOURCE_POOLS c, oceanbase.DBA_OB_UNIT_CONFIGS d, oceanbase.DBA_OB_TENANTS e WHERE c.UNIT_CONFIG_ID=d.UNIT_CONFIG_ID AND c.TENANT_ID=e.TENANT_ID AND c.TENANT_ID>1000 ORDER BY c.TENANT_ID;" + global: true + - type: sql + sql: "SELECT a.TENANT_NAME,a.TENANT_ID,b.SVR_IP FROM oceanbase.DBA_OB_TENANTS a, oceanbase.GV$OB_UNITS b WHERE a.TENANT_ID=b.TENANT_ID;" + global: true + - type: sql + sql: "show parameters like '%backup%';" + global: true + - type: sql + sql: "show parameters like '%ha_low_thread_score%';" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.CDB_OB_BACKUP_PARAMETER" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.CDB_OB_BACKUP_JOBS limit 20;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_ROOTSERVICE_EVENT_HISTORY WHERE module='backup_data' AND event ='start_backup_data';" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.CDB_OB_BACKUP_TASKS limit 20;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.__all_virtual_backup_schedule_task limit 20" + global: true + - type: sql + sql: "SELECT * from oceanbase.CDB_OB_BACKUP_JOB_HISTORY where STATUS = 'FAILED' limit 20;" + global: true + - type: log + global: false + grep: "" + - type: sysstat + global: false + sysstat: "" diff --git a/handler/gather/tasks/observer/backup_clean.yaml b/handler/gather/tasks/observer/backup_clean.yaml new file mode 100644 
index 00000000..daa03ed3 --- /dev/null +++ b/handler/gather/tasks/observer/backup_clean.yaml @@ -0,0 +1,125 @@ +info_en: "[backup clean]" +info_cn: "[备份清理问题]" +command: obdiag gather scene run --scene=observer.backup_clean +task: + - version: "[2.0.0.0, 4.0.0.0]" + steps: + - type: sql + sql: "show variables like 'version_comment';" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.v$ob_cluster" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.__all_zone WHERE name='idc';" + global: true + - type: sql + sql: "SELECT id,svr_ip,svr_port,zone,inner_port,with_rootserver,status,start_service_time,last_offline_time FROM oceanbase.__all_server;" + global: true + - type: sql + sql: "SELECT zone, concat(svr_ip, ':', svr_port) observer, cpu_capacity, cpu_total, cpu_assigned, cpu_assigned_percent, mem_capacity, mem_total, mem_assigned, mem_assigned_percent, unit_Num, round(`load`, 2) `load`, round(cpu_weight, 2) cpu_weight, round(memory_weight, 2) mem_weight, leader_count FROM oceanbase.__all_virtual_server_stat ORDER BY zone,svr_ip;" + global: true + - type: sql + sql: "show parameters like '%backup_dest%';" + global: true + - type: sql + sql: "show parameters like '%auto_delete_expired_backup%';" + global: true + - type: sql + sql: "show parameters like '%backup_recovery_window%';" + global: true + - type: sql + sql: "select * from oceanbase.__all_tenant_backup_info;" + global: true + - type: sql + sql: "select * from oceanbase.__all_virtual_backup_clean_info;" + global: true + - type: sql + sql: "select * from oceanbase.__all_virtual_sys_task_status where comment like '%backup%';" + global: true + - type: sql + sql: "select * from oceanbase.CDB_OB_BACKUP_SET_DETAILS order by START_TIME asc limit 1;" + global: true + - type: sql + sql: "select * from oceanbase.__all_backup_task_clean_history where gmt_create > ${from_time} and gmt_create < ${to_time} order by gmt_create desc;" + global: true + - type: sql + sql: "select * from oceanbase.__all_backup_clean_info_history where gmt_create > ${from_time} and gmt_create < ${to_time} order by gmt_create desc;" + global: true + - type: sql + sql: "select * from oceanbase.__all_rootservice_event_history where module='backup_clean' and gmt_create > ${from_time} and gmt_create < ${to_time} order by gmt_create desc;" + global: true + - type: log + grep: "" + global: false + - type: sysstat + sysstat: "" + global: false + - version: "[4.0.0.0, *]" + steps: + - type: sql + sql: "show variables like 'version_comment';" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_ZONES;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_SERVERS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.GV$OB_SERVERS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_UNIT_CONFIGS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_RESOURCE_POOLS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_TENANTS;" + global: true + - type: sql + sql: "SELECT c.TENANT_ID, e.TENANT_NAME, concat(c.NAME, ': ', d.NAME) `pool:conf`,concat(c.UNIT_COUNT, ' unit: ', d.min_cpu, 'C/', ROUND(d.MEMORY_SIZE/1024/1024/1024,0), 'G') unit_info FROM oceanbase.DBA_OB_RESOURCE_POOLS c, oceanbase.DBA_OB_UNIT_CONFIGS d, oceanbase.DBA_OB_TENANTS e WHERE c.UNIT_CONFIG_ID=d.UNIT_CONFIG_ID AND c.TENANT_ID=e.TENANT_ID AND c.TENANT_ID>1000 ORDER BY c.TENANT_ID;" + global: true + - type: sql + sql: "SELECT a.TENANT_NAME,a.TENANT_ID,b.SVR_IP FROM oceanbase.DBA_OB_TENANTS a, oceanbase.GV$OB_UNITS b 
WHERE a.TENANT_ID=b.TENANT_ID;" + global: true + - type: sql + sql: "show parameters like '%backup%';" + global: true + - type: sql + sql: "select * from oceanbase.CDB_OB_BACKUP_JOB_HISTORY" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_ROOTSERVICE_EVENT_HISTORY WHERE module='backup_data' AND event ='start_backup_data';" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.CDB_OB_BACKUP_TASKS limit 20;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.__all_virtual_backup_schedule_task limit 20" + global: true + - type: sql + sql: "SELECT * from oceanbase.CDB_OB_BACKUP_JOB_HISTORY where STATUS = 'FAILED' limit 20;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.CDB_OB_BACKUP_DELETE_POLICY;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.CDB_OB_BACKUP_DELETE_JOBS limit 20" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.CDB_OB_BACKUP_DELETE_TASKS limit 20" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.CDB_OB_BACKUP_DELETE_TASK_HISTORY limit 20" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.CDB_OB_BACKUP_DELETE_JOB_HISTORY limit 20" + global: true + - type: log + global: false + grep: "" + - type: sysstat + global: false + sysstat: "" diff --git a/handler/gather/tasks/observer/clog_disk_full.yaml b/handler/gather/tasks/observer/clog_disk_full.yaml new file mode 100644 index 00000000..b2715dc9 --- /dev/null +++ b/handler/gather/tasks/observer/clog_disk_full.yaml @@ -0,0 +1,89 @@ +info_en: "[clog disk full]" +info_cn: "[clog盘满]" +command: obdiag gather scene run --scene=observer.clog_disk_full +task: + - version: "[3.0.0.0, 4.0.0.0]" + steps: + - type: sql + sql: "show variables like 'version_comment';" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.v$ob_cluster" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.__all_zone WHERE name='idc';" + global: true + - type: sql + sql: "SELECT id,svr_ip,svr_port,zone,inner_port,with_rootserver,status,start_service_time,last_offline_time FROM oceanbase.__all_server;" + global: true + - type: sql + sql: "SELECT zone, concat(svr_ip, ':', svr_port) observer, cpu_capacity, cpu_total, cpu_assigned, cpu_assigned_percent, mem_capacity, mem_total, mem_assigned, mem_assigned_percent, unit_Num, round(`load`, 2) `load`, round(cpu_weight, 2) cpu_weight, round(memory_weight, 2) mem_weight, leader_count FROM oceanbase.__all_virtual_server_stat ORDER BY zone,svr_ip;" + global: true + - type: sql + sql: "show parameters like '%clog_disk_usage_limit_percentage%';" + global: true + - type: sql + sql: "show parameters like '%clog_expire_days%';" + global: true + - type: sql + sql: "show parameters like '%backup_log_archive_option%';" + global: true + - type: sql + sql: "select * from oceanbase.__all_virtual_tenant_memstore_info where (active_memstore_used > major_freeze_trigger) or (total_memstore_used > memstore_limit);" + global: true + - type: sql + sql: "select svr_ip,total_size / 1024 / 1024 / 1024 total_G,free_size / 1024 / 1024 / 1024 free_G,(total_size - free_size) / 1024 / 1024 / 1024 used_G,(total_size - free_size) / total_size used_percentage FROM oceanbase.__all_virtual_disk_stat; " + global: true + - type: log + global: false + grep: "" + - version: "[4.0.0.0, *]" + steps: + - type: sql + sql: "show variables like 'version_comment';" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_ZONES;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_SERVERS;" + global: true + - type: sql + sql: 
"SELECT * FROM oceanbase.GV$OB_SERVERS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_UNIT_CONFIGS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_RESOURCE_POOLS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_TENANTS;" + global: true + - type: sql + sql: "SELECT c.TENANT_ID, e.TENANT_NAME, concat(c.NAME, ': ', d.NAME) `pool:conf`,concat(c.UNIT_COUNT, ' unit: ', d.min_cpu, 'C/', ROUND(d.MEMORY_SIZE/1024/1024/1024,0), 'G') unit_info FROM oceanbase.DBA_OB_RESOURCE_POOLS c, oceanbase.DBA_OB_UNIT_CONFIGS d, oceanbase.DBA_OB_TENANTS e WHERE c.UNIT_CONFIG_ID=d.UNIT_CONFIG_ID AND c.TENANT_ID=e.TENANT_ID AND c.TENANT_ID>1000 ORDER BY c.TENANT_ID;" + global: true + - type: sql + sql: "SELECT a.TENANT_NAME,a.TENANT_ID,b.SVR_IP FROM oceanbase.DBA_OB_TENANTS a, oceanbase.GV$OB_UNITS b WHERE a.TENANT_ID=b.TENANT_ID;" + global: true + - type: sql + sql: "show parameters like '%clog%';" + global: true + - type: sql + sql: "show parameters like '%backup%';" + global: true + - type: sql + sql: "select tenant_id, svr_ip, svr_port, LOG_DISK_IN_USE/1024/1024/1024 LOG_DISK_IN_USE_G, LOG_DISK_SIZE/1024/1024/1024 LOG_DISK_SIZE_G, LOG_DISK_IN_USE*100/LOG_DISK_SIZE LOG_DISK_USED_PERCENTAGE from oceanbase.gv$ob_units;" + global: true + - type: sql + sql: "select TENANT_ID, LS_ID, SVR_IP, ROLE , (end_lsn-base_lsn)/1024/1024 from oceanbase.gv$ob_log_stat;" + global: true + - type: sql + sql: "(select value1, value2 from oceanbase.DBA_OB_ROOTSERVICE_EVENT_HISTORY where event like '%add_ls%') except (select value1, value2 from oceanbase.DBA_OB_SERVER_EVENT_HISTORY where module like 'storage_ha' and event like '%finish_complete%');" + global: true + - type: sql + sql: "select * from oceanbase.DBA_OB_SERVER_EVENT_HISTORY where event like '%migrat%' and name6 like '%fail%' and value6=1;" + global: true + - type: log + global: false + grep: "" diff --git a/handler/gather/tasks/observer/compaction.yaml b/handler/gather/tasks/observer/compaction.yaml new file mode 100644 index 00000000..a4e123a0 --- /dev/null +++ b/handler/gather/tasks/observer/compaction.yaml @@ -0,0 +1,134 @@ +info_en: "[compaction]" +info_cn: "[合并问题]" +command: obdiag gather scene run --scene=observer.compaction +task: + - version: "[2.0.0.0, 4.0.0.0]" + steps: + - type: sql + sql: "show variables like 'version_comment';" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.v$ob_cluster" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.__all_zone WHERE name='idc';" + global: true + - type: sql + sql: "SELECT id,svr_ip,svr_port,zone,inner_port,with_rootserver,status,start_service_time,last_offline_time FROM oceanbase.__all_server;" + global: true + - type: sql + sql: "SELECT zone, concat(svr_ip, ':', svr_port) observer, cpu_capacity, cpu_total, cpu_assigned, cpu_assigned_percent, mem_capacity, mem_total, mem_assigned, mem_assigned_percent, unit_Num, round(`load`, 2) `load`, round(cpu_weight, 2) cpu_weight, round(memory_weight, 2) mem_weight, leader_count FROM oceanbase.__all_virtual_server_stat ORDER BY zone,svr_ip;" + global: true + - type: sql + sql: "show parameters like '%enable_manual_merge%';" + global: true + - type: sql + sql: "show parameters like '%zone_merge_concurrency%';" + global: true + - type: sql + sql: "show parameters like '%zone_merge_order%';" + global: true + - type: sql + sql: "show parameters like '%enable_merge_by_turn%';" + global: true + - type: sql + sql: "show parameters like '%major_freeze_duty_time%';" + global: true + - type: sql + sql: 
"show parameters like '%enable_auto_leader_switch%';" + global: true + - type: sql + sql: "select * from oceanbase.__all_zone;" + global: true + - type: sql + sql: "select * from oceanbase.__all_virtual_replica_task;" + global: true + - type: sql + sql: "select * from oceanbase.__all_virtual_rebalance_task_stat;" + global: true + - type: sql + sql: "select * from oceanbase.__all_virtual_sys_task_status;" + global: true + - type: sql + sql: "select * from oceanbase.__all_virtual_partition_compaction_progress;" + global: true + - type: sql + sql: "select * from oceanbase.__all_freeze_schema_version where schema_version = -1;" + global: true + - type: sql + sql: "select tenant_id, table_id, partition_id from oceanbase.__all_virtual_partition_table group by 1,2,3 having min(role) = 2;" + global: true + - type: sql + sql: "SELECT count(*),svr_ip FROM oceanbase.__all_virtual_clog_stat WHERE is_in_sync = 0 AND is_offline = 0 AND replica_type != 16 group by svr_ip;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.__all_virtual_clog_stat WHERE is_in_sync = 0 AND is_offline = 0 AND replica_type != 16 limit 10;" + global: true + - type: sql + sql: "select * from oceanbase.__all_virtual_tenant_memstore_info where (active_memstore_used > major_freeze_trigger) or (total_memstore_used > memstore_limit);" + global: true + - type: sql + sql: "select * from oceanbase.__all_rootservice_event_history where module = 'daily_merge' and event like '%merge_error%' order by gmt_create desc limit 5;" + global: true + - type: sql + sql: "select * from oceanbase.__all_virtual_meta_table where data_version != (select value from oceanbase.__all_zone where name='global_broadcast_version') limit 10;" + global: true + - type: sql + sql: "SELECT count(1) FROM oceanbase.__all_virtual_trans_stat WHERE part_trans_action<=2 AND ctx_create_time < date_sub(now(), INTERVAL 600 SECOND) AND is_exiting != 1;" + global: true + - type: sql + sql: "SELECT count(1) FROM oceanbase.__all_virtual_trans_stat WHERE part_trans_action > 2 AND ctx_create_time < date_sub(now(), INTERVAL 600 SECOND) AND is_exiting != 1;" + global: true + - type: sql + sql: "(select table_id, partition_idx from oceanbase.__all_virtual_election_info group by table_id, partition_idx) except (select table_id, partition_idx from oceanbase.__all_virtual_election_info where role = 1)" + global: true + - type: sql + sql: "(select table_id, partition_idx from oceanbase.__all_virtual_clog_stat group by table_id, partition_idx) except (select table_id, partition_idx from oceanbase.__all_virtual_clog_stat where role = 'LEADER') ;" + global: true + - type: sql + sql: "SELECT svr_ip,total_size / 1024 / 1024 / 1024 total_G,free_size / 1024 / 1024 / 1024 free_G,(total_size - free_size) / 1024 / 1024 / 1024 used_G,(total_size - free_size) / total_size used_percentage FROM oceanbase.__all_virtual_disk_stat;" + global: true + - type: log + grep: "" + global: false + - version: "[4.0.0.0, *]" + steps: + - type: sql + sql: "show variables like 'version_comment';" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_ZONES;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_SERVERS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.GV$OB_SERVERS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_UNIT_CONFIGS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_RESOURCE_POOLS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_TENANTS;" + global: true + - type: sql + sql: 
"SELECT c.TENANT_ID, e.TENANT_NAME, concat(c.NAME, ': ', d.NAME) `pool:conf`,concat(c.UNIT_COUNT, ' unit: ', d.min_cpu, 'C/', ROUND(d.MEMORY_SIZE/1024/1024/1024,0), 'G') unit_info FROM oceanbase.DBA_OB_RESOURCE_POOLS c, oceanbase.DBA_OB_UNIT_CONFIGS d, oceanbase.DBA_OB_TENANTS e WHERE c.UNIT_CONFIG_ID=d.UNIT_CONFIG_ID AND c.TENANT_ID=e.TENANT_ID AND c.TENANT_ID>1000 ORDER BY c.TENANT_ID;" + global: true + - type: sql + sql: "SELECT a.TENANT_NAME,a.TENANT_ID,b.SVR_IP FROM oceanbase.DBA_OB_TENANTS a, oceanbase.GV$OB_UNITS b WHERE a.TENANT_ID=b.TENANT_ID;" + global: true + - type: sql + sql: "show parameters like '%merge%';" + global: true + - type: sql + sql: "show parameters like 'memstore_limit_percentage';" + global: true + - type: sql + sql: "show parameters like 'freeze_trigger_percentage';" + global: true + - type: log + global: false + grep: "" diff --git a/handler/gather/tasks/observer/delay_of_primary_and_backup.yaml b/handler/gather/tasks/observer/delay_of_primary_and_backup.yaml new file mode 100644 index 00000000..ee2abcde --- /dev/null +++ b/handler/gather/tasks/observer/delay_of_primary_and_backup.yaml @@ -0,0 +1,143 @@ +info_en: "[delay of primary and backup]" +info_cn: "[主备库延迟]" +command: obdiag gather scene run --scene=observer.delay_of_primary_and_backup +task: + - version: "[2.0.0.0, 4.0.0.0]" + steps: + - type: sql + sql: "show variables like 'version_comment';" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.v$ob_cluster" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.__all_zone WHERE name='idc';" + global: true + - type: sql + sql: "SELECT id,svr_ip,svr_port,zone,inner_port,with_rootserver,status,start_service_time,last_offline_time FROM oceanbase.__all_server;" + global: true + - type: sql + sql: "SELECT zone, concat(svr_ip, ':', svr_port) observer, cpu_capacity, cpu_total, cpu_assigned, cpu_assigned_percent, mem_capacity, mem_total, mem_assigned, mem_assigned_percent, unit_Num, round(`load`, 2) `load`, round(cpu_weight, 2) cpu_weight, round(memory_weight, 2) mem_weight, leader_count FROM oceanbase.__all_virtual_server_stat ORDER BY zone,svr_ip;" + global: true + - type: sql + sql: "select * from oceanbase.__all_failover_info;" + global: true + - type: sql + sql: "select * from oceanbase.__all_freeze_schema_version ;" + global: true + - type: sql + sql: "select * from oceanbase.__all_virtual_sys_task_status;" + global: true + - type: sql + sql: "select * from oceanbase.__all_virtual_replica_task;" + global: true + - type: sql + sql: "select * from oceanbase.__all_virtual_rebalance_task_stat;" + global: true + - type: sql + sql: "select * from oceanbase.__all_unit where migrate_from_svr_ip !='';" + global: true + - type: sql + sql: "select * from oceanbase.__all_root_table where is_restore != 0;" + global: true + - type: sql + sql: "select * from oceanbase.__all_virtual_meta_table where is_restore != 0;" + global: true + - type: sql + sql: "select * from oceanbase.__all_core_table where table_name like '%schema_status%'';" + global: true + - type: sql + sql: "SELECT TENANT_ID, COUNT(*) FROM oceanbase.__ALL_VIRTUAL_META_TABLE WHERE ROLE = 2 GROUP BY TENANT_ID;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.__all_virtual_clog_stat WHERE is_in_sync = 0 AND is_offline = 0 AND replica_type != 16 limit 10;" + global: true + - type: sql + sql: "select tenant_id, table_id, partition_id from oceanbase.__all_virtual_partition_table group by 1,2,3 having min(role) = 2;" + global: true + - type: sql + sql: "SELECT USEC_TO_TIME(CURRENT_SCN) 
AS CUR_PROCESS, NOW(6) - USEC_TO_TIME(CURRENT_SCN) AS DELAY FROM oceanbase.V$OB_CLUSTER;" + global: true + - type: sql + sql: "SELECT count(*),svr_ip FROM oceanbase.__all_virtual_clog_stat WHERE is_in_sync = 0 AND is_offline = 0 AND replica_type != 16 group by svr_ip;" + global: true + - type: sql + sql: "select * from oceanbase.__all_virtual_tenant_memstore_info where (active_memstore_used > major_freeze_trigger) or (total_memstore_used > memstore_limit);" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.__ALL_ROOTSERVICE_EVENT_HISTORY WHERE MODULE = 'BALANCER' AND EVENT LIKE '%ADD_REPLICA%' ORDER BY GMT_CREATE DESC LIMIT 100;" + global: true + - type: sql + sql: "SELECT count(1) FROM oceanbase.__all_virtual_trans_stat WHERE part_trans_action<=2 AND ctx_create_time < date_sub(now(), INTERVAL 600 SECOND) AND is_exiting != 1;" + global: true + - type: sql + sql: "SELECT count(1) FROM oceanbase.__all_virtual_trans_stat WHERE part_trans_action > 2 AND ctx_create_time < date_sub(now(), INTERVAL 600 SECOND) AND is_exiting != 1;" + global: true + - type: sql + sql: "(select table_id, partition_idx from oceanbase.__all_virtual_election_info group by table_id, partition_idx) except (select table_id, partition_idx from oceanbase.__all_virtual_election_info where role = 1)" + global: true + - type: sql + sql: "(select table_id, partition_idx from oceanbase.__all_virtual_clog_stat group by table_id, partition_idx) except (select table_id, partition_idx from oceanbase.__all_virtual_clog_stat where role = 'LEADER') ;" + global: true + - type: sql + sql: "SELECT svr_ip ,total_size / 1024 / 1024 / 1024 total_G,free_size / 1024 / 1024 / 1024 free_G,(total_size - free_size) / 1024 / 1024 / 1024 used_G,(total_size - free_size) / total_size used_percentage FROM oceanbase.__all_virtual_disk_stat;" + global: true + - type: sql + sql: "select * from oceanbase.v$ob_cluster;" + global: true + - type: sql + sql: "SELECT TENANT_ID, COUNT(*) FROM oceanbase.__ALL_VIRTUAL_META_TABLE WHERE ROLE = 1 GROUP BY TENANT_ID;" + global: true + - type: sql + sql: "select tenant_id, table_id, partition_id from oceanbase.__all_virtual_partition_table group by 1,2,3 having min(role) = 2;" + global: true + - type: sql + sql: "(select table_id, partition_idx from oceanbase.__all_virtual_election_info group by table_id, partition_idx) except (select table_id, partition_idx from oceanbase.__all_virtual_election_info where role = 1) ;" + global: true + - type: sql + sql: "(select table_id, partition_idx from oceanbase.__all_virtual_clog_stat group by table_id, partition_idx) except (select table_id, partition_idx from oceanbase.__all_virtual_clog_stat where role = 'LEADER') ;" + global: true + - type: log + global: false + grep: "" + - version: "[4.0.0.0, *]" + steps: + - type: sql + sql: "show variables like 'version_comment';" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_ZONES;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_SERVERS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.GV$OB_SERVERS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_UNIT_CONFIGS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_RESOURCE_POOLS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_TENANTS;" + global: true + - type: sql + sql: "SELECT c.TENANT_ID, e.TENANT_NAME, concat(c.NAME, ': ', d.NAME) `pool:conf`,concat(c.UNIT_COUNT, ' unit: ', d.min_cpu, 'C/', ROUND(d.MEMORY_SIZE/1024/1024/1024,0), 'G') unit_info FROM 
oceanbase.DBA_OB_RESOURCE_POOLS c, oceanbase.DBA_OB_UNIT_CONFIGS d, oceanbase.DBA_OB_TENANTS e WHERE c.UNIT_CONFIG_ID=d.UNIT_CONFIG_ID AND c.TENANT_ID=e.TENANT_ID AND c.TENANT_ID>1000 ORDER BY c.TENANT_ID;" + global: true + - type: sql + sql: "SELECT a.TENANT_NAME,a.TENANT_ID,b.SVR_IP FROM oceanbase.DBA_OB_TENANTS a, oceanbase.GV$OB_UNITS b WHERE a.TENANT_ID=b.TENANT_ID;" + global: true + - type: sql + sql: "show parameters like '%backup%';" + global: true + - type: sql + sql: "SELECT TENANT_NAME, TENANT_ID, TENANT_ROLE, SCN_TO_TIMESTAMP(SYNC_SCN) FROM oceanbase.DBA_OB_TENANTS WHERE TENANT_NAME = 'standby_tenant';" + global: true + - type: sql + sql: "SELECT LS_ID, SCN_TO_TIMESTAMP(END_SCN) FROM oceanbase.GV$OB_LOG_STAT WHERE ROLE = 'LEADER';" + global: true + - type: log + global: false + grep: "" diff --git a/handler/gather/tasks/observer/log_archive.yaml b/handler/gather/tasks/observer/log_archive.yaml new file mode 100644 index 00000000..4cf66f0a --- /dev/null +++ b/handler/gather/tasks/observer/log_archive.yaml @@ -0,0 +1,110 @@ +info_en: "[log archive]" +info_cn: "[日志归档问题]" +command: obdiag gather scene run --scene=observer.log_archive +task: + - version: "[2.0.0.0, 4.0.0.0]" + steps: + - type: sql + sql: "show variables like 'version_comment';" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.v$ob_cluster" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.__all_zone WHERE name='idc';" + global: true + - type: sql + sql: "SELECT id,svr_ip,svr_port,zone,inner_port,with_rootserver,status,start_service_time,last_offline_time FROM oceanbase.__all_server;" + global: true + - type: sql + sql: "SELECT zone, concat(svr_ip, ':', svr_port) observer, cpu_capacity, cpu_total, cpu_assigned, cpu_assigned_percent, mem_capacity, mem_total, mem_assigned, mem_assigned_percent, unit_Num, round(`load`, 2) `load`, round(cpu_weight, 2) cpu_weight, round(memory_weight, 2) mem_weight, leader_count FROM oceanbase.__all_virtual_server_stat ORDER BY zone,svr_ip;" + global: true + - type: sql + sql: "select * from oceanbase.CDB_OB_BACKUP_ARCHIVELOG;" + global: true + - type: sql + sql: "select * from oceanbase.__all_virtual_sys_task_status where comment like '%backup%';" + global: true + - type: sql + sql: "select * from oceanbase.__all_virtual_pg_backup_log_archive_status order by log_archive_cur_ts limit 10;" + global: true + - type: sql + sql: "select svr_ip, log_archive_status, count(*) from oceanbase.__all_virtual_pg_backup_log_archive_status group by svr_ip, log_archive_status;" + global: true + - type: sql + sql: "select tenant_id, table_id, partition_id from oceanbase.__all_virtual_partition_table group by 1,2,3 having min(role) = 2;" + global: true + - type: sql + sql: "select count(*), DATE_FORMAT(gmt_create, '%Y-%c-%d') as date from oceanbase.__all_virtual_ddl_operation where ddl_stmt_str !='' group by date order by date limit 10;" + global: true + - type: sql + sql: "select * from oceanbase.__all_rootservice_event_history where gmt_create > ${from_time} and gmt_create < ${to_time} order by gmt_create desc;" + global: true + - type: sql + sql: "(select table_id, partition_idx from oceanbase.__all_virtual_election_info group by table_id, partition_idx) except (select table_id, partition_idx from oceanbase.__all_virtual_election_info where role = 1);" + global: true + - type: sql + sql: "(select table_id, partition_idx from oceanbase.__all_virtual_clog_stat group by table_id, partition_idx) except (select table_id, partition_idx from oceanbase.__all_virtual_clog_stat where role = 
'LEADER');" + global: true + - type: sql + sql: "select b.* from oceanbase.__all_virtual_pg_backup_log_archive_status a,oceanbase.__all_virtual_pg_log_archive_stat b where a.table_id=b.table_id and a.partition_id=b.partition_id order by log_archive_cur_ts limit 5;" + global: true + - type: log + global: false + grep: "" + - type: sysstat + sysstat: "" + global: false + - version: "[4.0.0.0, *]" + steps: + - type: sql + sql: "show variables like 'version_comment';" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_ZONES;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_SERVERS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.GV$OB_SERVERS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_UNIT_CONFIGS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_RESOURCE_POOLS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_TENANTS;" + global: true + - type: sql + sql: "SELECT c.TENANT_ID, e.TENANT_NAME, concat(c.NAME, ': ', d.NAME) `pool:conf`,concat(c.UNIT_COUNT, ' unit: ', d.min_cpu, 'C/', ROUND(d.MEMORY_SIZE/1024/1024/1024,0), 'G') unit_info FROM oceanbase.DBA_OB_RESOURCE_POOLS c, oceanbase.DBA_OB_UNIT_CONFIGS d, oceanbase.DBA_OB_TENANTS e WHERE c.UNIT_CONFIG_ID=d.UNIT_CONFIG_ID AND c.TENANT_ID=e.TENANT_ID AND c.TENANT_ID>1000 ORDER BY c.TENANT_ID;" + global: true + - type: sql + sql: "SELECT a.TENANT_NAME,a.TENANT_ID,b.SVR_IP FROM oceanbase.DBA_OB_TENANTS a, oceanbase.GV$OB_UNITS b WHERE a.TENANT_ID=b.TENANT_ID;" + global: true + - type: sql + sql: "show parameters like '%backup%';" + global: true + - type: sql + sql: "SHOW PARAMETERS LIKE 'log_archive_concurrency';" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.CDB_OB_ARCHIVE_DEST;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.CDB_OB_ARCHIVELOG_SUMMARY limit 20" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.CDB_OB_ARCHIVELOG limit 20" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.CDB_OB_ARCHIVELOG_PIECE_FILES limit 20" + global: true + - type: log + global: false + grep: "" + - type: sysstat + sysstat: "" + global: false diff --git a/handler/gather/tasks/observer/long_transaction.yaml b/handler/gather/tasks/observer/long_transaction.yaml new file mode 100644 index 00000000..70b83a9e --- /dev/null +++ b/handler/gather/tasks/observer/long_transaction.yaml @@ -0,0 +1,65 @@ +info_en: "[long transaction]" +info_cn: "[长事务]" +command: obdiag gather scene run --scene=observer.long_transaction +task: + - version: "[2.0.0.0, 4.0.0.0]" + steps: + - type: sql + sql: "show variables like 'version_comment';" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.v$ob_cluster" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.__all_zone WHERE name='idc';" + global: true + - type: sql + sql: "SELECT id,svr_ip,svr_port,zone,inner_port,with_rootserver,status,start_service_time,last_offline_time FROM oceanbase.__all_server;" + global: true + - type: sql + sql: "SELECT zone, concat(svr_ip, ':', svr_port) observer, cpu_capacity, cpu_total, cpu_assigned, cpu_assigned_percent, mem_capacity, mem_total, mem_assigned, mem_assigned_percent, unit_Num, round(`load`, 2) `load`, round(cpu_weight, 2) cpu_weight, round(memory_weight, 2) mem_weight, leader_count FROM oceanbase.__all_virtual_server_stat ORDER BY zone,svr_ip;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.__all_virtual_trans_stat WHERE part_trans_action<=2 AND ctx_create_time < date_sub(now(), INTERVAL 600 
SECOND) AND is_exiting != 1;" + global: true + - type: log + grep: "" + global: false + - version: "[4.0.0.0, *]" + steps: + - type: sql + sql: "show variables like 'version_comment';" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_ZONES;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_SERVERS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.GV$OB_SERVERS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_UNIT_CONFIGS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_RESOURCE_POOLS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_TENANTS;" + global: true + - type: sql + sql: "SELECT c.TENANT_ID, e.TENANT_NAME, concat(c.NAME, ': ', d.NAME) `pool:conf`,concat(c.UNIT_COUNT, ' unit: ', d.min_cpu, 'C/', ROUND(d.MEMORY_SIZE/1024/1024/1024,0), 'G') unit_info FROM oceanbase.DBA_OB_RESOURCE_POOLS c, oceanbase.DBA_OB_UNIT_CONFIGS d, oceanbase.DBA_OB_TENANTS e WHERE c.UNIT_CONFIG_ID=d.UNIT_CONFIG_ID AND c.TENANT_ID=e.TENANT_ID AND c.TENANT_ID>1000 ORDER BY c.TENANT_ID;" + global: true + - type: sql + sql: "SELECT a.TENANT_NAME,a.TENANT_ID,b.SVR_IP FROM oceanbase.DBA_OB_TENANTS a, oceanbase.GV$OB_UNITS b WHERE a.TENANT_ID=b.TENANT_ID;" + global: true + - type: sql + sql: "select * from oceanbase.gv$ob_transaction_participants limit 100" + global: true + - type: sql + sql: "SELECT count(1) FROM oceanbase.GV$OB_TRANSACTION_PARTICIPANTS WHERE CTX_CREATE_TIME < date_sub(now(), INTERVAL 600 SECOND) AND STATE = 'INIT';" + global: true + - type: log + grep: "" + global: false diff --git a/handler/gather/tasks/observer/memory.yaml b/handler/gather/tasks/observer/memory.yaml new file mode 100644 index 00000000..69cb752c --- /dev/null +++ b/handler/gather/tasks/observer/memory.yaml @@ -0,0 +1,77 @@ +info_en: "[memory problem]" +info_cn: "[内存问题]" +command: obdiag gather scene run --scene=observer.memory +task: + - version: "[2.0.0.0, 4.0.0.0]" + steps: + - type: sql + sql: "show variables like 'version_comment';" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.v$ob_cluster" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.__all_zone WHERE name='idc';" + global: true + - type: sql + sql: "SELECT id,svr_ip,svr_port,zone,inner_port,with_rootserver,status,start_service_time,last_offline_time FROM oceanbase.__all_server;" + global: true + - type: sql + sql: "SELECT zone, concat(svr_ip, ':', svr_port) observer, cpu_capacity, cpu_total, cpu_assigned, cpu_assigned_percent, mem_capacity, mem_total, mem_assigned, mem_assigned_percent, unit_Num, round(`load`, 2) `load`, round(cpu_weight, 2) cpu_weight, round(memory_weight, 2) mem_weight, leader_count FROM oceanbase.__all_virtual_server_stat ORDER BY zone,svr_ip;" + global: true + - type: sql + sql: "select * from oceanbase.__all_virtual_tenant_memstore_info where (active_memstore_used > major_freeze_trigger) or (total_memstore_used > memstore_limit);" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.__all_virtual_trans_stat WHERE part_trans_action<=2 AND ctx_create_time < date_sub(now(), INTERVAL 600 SECOND) AND is_exiting != 1;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.__all_virtual_trans_stat WHERE part_trans_action > 2 AND ctx_create_time < date_sub(now(), INTERVAL 600 SECOND) AND is_exiting != 1;" + global: true + - type: sql + sql: "SELECT table_id, partition_id, base_version, snapshot_version FROM oceanbase.__all_virtual_table_mgr WHERE table_type=0 except SELECT table_id, partition_idx, 
base_version, snapshot_version FROM oceanbase.__all_virtual_memstore_info limit 10;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.__all_virtual_tenant_memstore_allocator_info a,(select svr_ip,tenant_id from oceanbase.__all_virtual_tenant_memstore_info where (active_memstore_used > major_freeze_trigger)) b where a.svr_ip=b.svr_ip and a.tenant_id=b.tenant_id AND a.mt_is_frozen=1 ORDER BY mt_protection_clock limit 20;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.__all_virtual_tenant_memstore_allocator_info a,(select svr_ip,tenant_id from oceanbase.__all_virtual_tenant_memstore_info where (total_memstore_used > memstore_limit)) b where a.svr_ip=b.svr_ip and a.tenant_id=b.tenant_id AND a.mt_is_frozen=0 ORDER BY mt_protection_clock limit 20;" + global: true + - type: log + grep: "" + global: false + - version: "[4.0.0.0, *]" + steps: + - type: sql + sql: "show variables like 'version_comment';" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_ZONES;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_SERVERS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.GV$OB_SERVERS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_UNIT_CONFIGS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_RESOURCE_POOLS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_TENANTS;" + global: true + - type: sql + sql: "SELECT c.TENANT_ID, e.TENANT_NAME, concat(c.NAME, ': ', d.NAME) `pool:conf`,concat(c.UNIT_COUNT, ' unit: ', d.min_cpu, 'C/', ROUND(d.MEMORY_SIZE/1024/1024/1024,0), 'G') unit_info FROM oceanbase.DBA_OB_RESOURCE_POOLS c, oceanbase.DBA_OB_UNIT_CONFIGS d, oceanbase.DBA_OB_TENANTS e WHERE c.UNIT_CONFIG_ID=d.UNIT_CONFIG_ID AND c.TENANT_ID=e.TENANT_ID AND c.TENANT_ID>1000 ORDER BY c.TENANT_ID;" + global: true + - type: sql + sql: "SELECT a.TENANT_NAME,a.TENANT_ID,b.SVR_IP FROM oceanbase.DBA_OB_TENANTS a, oceanbase.GV$OB_UNITS b WHERE a.TENANT_ID=b.TENANT_ID;" + global: true + - type: sql + sql: "select * from oceanbase.GV$OB_MEMSTORE limit 5" + global: true + - type: log + grep: "" + global: false diff --git a/handler/gather/tasks/observer/recovery.yaml b/handler/gather/tasks/observer/recovery.yaml new file mode 100644 index 00000000..0f4802b3 --- /dev/null +++ b/handler/gather/tasks/observer/recovery.yaml @@ -0,0 +1,79 @@ +info_en: "[recovery]" +info_cn: "[数据恢复问题]" +command: obdiag gather scene run --scene=observer.recovery +task: + - version: "[2.0.0.0, 4.0.0.0]" + steps: + - type: sql + sql: "show variables like 'version_comment';" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.v$ob_cluster" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.__all_zone WHERE name='idc';" + global: true + - type: sql + sql: "SELECT id,svr_ip,svr_port,zone,inner_port,with_rootserver,status,start_service_time,last_offline_time FROM oceanbase.__all_server;" + global: true + - type: sql + sql: "SELECT zone, concat(svr_ip, ':', svr_port) observer, cpu_capacity, cpu_total, cpu_assigned, cpu_assigned_percent, mem_capacity, mem_total, mem_assigned, mem_assigned_percent, unit_Num, round(`load`, 2) `load`, round(cpu_weight, 2) cpu_weight, round(memory_weight, 2) mem_weight, leader_count FROM oceanbase.__all_virtual_server_stat ORDER BY zone,svr_ip;" + global: true + - type: sql + sql: "select * from oceanbase.__all_restore_info;" + global: true + - type: sql + sql: "select * from oceanbase.__all_rootservice_event_history where gmt_create > ${from_time} and gmt_create < 
${to_time} order by gmt_create desc;" + global: true + - type: sql + sql: "select svr_ip,role, is_restore, count(*) from oceanbase.__all_root_table as a, (select value from oceanbase.__all_restore_info where name='tenant_id') as b where a.tenant_id=b.value group by role, is_restore, svr_ip order by svr_ip, is_restore;" + global: true + - type: sql + sql: "select svr_ip,role, is_restore, count(*) from oceanbase.__all_virtual_meta_table as a, (select value from oceanbase.__all_restore_info where name='tenant_id') as b where a.tenant_id=b.value group by role, is_restore, svr_ip order by svr_ip, is_restore;" + global: true + - type: log + grep: "" + global: false + - type: sysstat + sysstat: "" + - version: "[4.0.0.0, *]" + steps: + - type: sql + sql: "show variables like 'version_comment';" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_ZONES;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_SERVERS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.GV$OB_SERVERS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_UNIT_CONFIGS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_RESOURCE_POOLS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_TENANTS;" + global: true + - type: sql + sql: "SELECT c.TENANT_ID, e.TENANT_NAME, concat(c.NAME, ': ', d.NAME) `pool:conf`,concat(c.UNIT_COUNT, ' unit: ', d.min_cpu, 'C/', ROUND(d.MEMORY_SIZE/1024/1024/1024,0), 'G') unit_info FROM oceanbase.DBA_OB_RESOURCE_POOLS c, oceanbase.DBA_OB_UNIT_CONFIGS d, oceanbase.DBA_OB_TENANTS e WHERE c.UNIT_CONFIG_ID=d.UNIT_CONFIG_ID AND c.TENANT_ID=e.TENANT_ID AND c.TENANT_ID>1000 ORDER BY c.TENANT_ID;" + global: true + - type: sql + sql: "SELECT a.TENANT_NAME,a.TENANT_ID,b.SVR_IP FROM oceanbase.DBA_OB_TENANTS a, oceanbase.GV$OB_UNITS b WHERE a.TENANT_ID=b.TENANT_ID;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.CDB_OB_RESTORE_PROGRESS limit 20;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.CDB_OB_RESTORE_HISTORY limit 20;" + global: true + - type: log + grep: "" + global: false + - type: sysstat + sysstat: "" + global: false diff --git a/handler/gather/tasks/observer/restart.yaml b/handler/gather/tasks/observer/restart.yaml new file mode 100644 index 00000000..56195780 --- /dev/null +++ b/handler/gather/tasks/observer/restart.yaml @@ -0,0 +1,74 @@ +info_en: "[restart]" +info_cn: "[observer无故重启]" +command: obdiag gather scene run --scene=observer.restart +task: + - version: "[2.0.0.0, 4.0.0.0]" + steps: + - type: sql + sql: "show variables like 'version_comment';" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.v$ob_cluster" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.__all_zone WHERE name='idc';" + global: true + - type: sql + sql: "SELECT id,svr_ip,svr_port,zone,inner_port,with_rootserver,status,start_service_time,last_offline_time FROM oceanbase.__all_server;" + global: true + - type: sql + sql: "SELECT zone, concat(svr_ip, ':', svr_port) observer, cpu_capacity, cpu_total, cpu_assigned, cpu_assigned_percent, mem_capacity, mem_total, mem_assigned, mem_assigned_percent, unit_Num, round(`load`, 2) `load`, round(cpu_weight, 2) cpu_weight, round(memory_weight, 2) mem_weight, leader_count FROM oceanbase.__all_virtual_server_stat ORDER BY zone,svr_ip;" + global: true + - type: ssh + ssh: "ps -ef | grep observer" + global: false + - type: ssh + ssh: "cat /proc/sys/kernel/core_pattern" + global: false + - type: ssh + ssh: "ls -lhrt ${observer_data_dir}" + global: 
false + - type: log + grep: "" + global: false + - version: "[4.0.0.0, *]" + steps: + - type: sql + sql: "show variables like 'version_comment';" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_ZONES;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_SERVERS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.GV$OB_SERVERS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_UNIT_CONFIGS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_RESOURCE_POOLS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_TENANTS;" + global: true + - type: sql + sql: "SELECT c.TENANT_ID, e.TENANT_NAME, concat(c.NAME, ': ', d.NAME) `pool:conf`,concat(c.UNIT_COUNT, ' unit: ', d.min_cpu, 'C/', ROUND(d.MEMORY_SIZE/1024/1024/1024,0), 'G') unit_info FROM oceanbase.DBA_OB_RESOURCE_POOLS c, oceanbase.DBA_OB_UNIT_CONFIGS d, oceanbase.DBA_OB_TENANTS e WHERE c.UNIT_CONFIG_ID=d.UNIT_CONFIG_ID AND c.TENANT_ID=e.TENANT_ID AND c.TENANT_ID>1000 ORDER BY c.TENANT_ID;" + global: true + - type: sql + sql: "SELECT a.TENANT_NAME,a.TENANT_ID,b.SVR_IP FROM oceanbase.DBA_OB_TENANTS a, oceanbase.GV$OB_UNITS b WHERE a.TENANT_ID=b.TENANT_ID;" + global: true + - type: ssh + ssh: "ps -ef | grep observer" + global: false + - type: ssh + ssh: "cat /proc/sys/kernel/core_pattern" + global: false + - type: ssh + ssh: "ls -lhrt ${observer_data_dir}" + global: false + - type: log + grep: "" + global: false diff --git a/handler/gather/tasks/observer/rootservice_switch.yaml b/handler/gather/tasks/observer/rootservice_switch.yaml new file mode 100644 index 00000000..6f81da3e --- /dev/null +++ b/handler/gather/tasks/observer/rootservice_switch.yaml @@ -0,0 +1,122 @@ +info_en: "[rootservice switch]" +info_cn: "[有主改选或者无主选举的切主]" +command: obdiag gather scene run --scene=observer.rootservice_switch +task: + - version: "[2.0.0.0, 4.0.0.0]" + steps: + - type: sql + sql: "show variables like 'version_comment';" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.v$ob_cluster" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.__all_zone WHERE name='idc';" + global: true + - type: sql + sql: "SELECT id,svr_ip,svr_port,zone,inner_port,with_rootserver,status,start_service_time,last_offline_time FROM oceanbase.__all_server;" + global: true + - type: sql + sql: "SELECT zone, concat(svr_ip, ':', svr_port) observer, cpu_capacity, cpu_total, cpu_assigned, cpu_assigned_percent, mem_capacity, mem_total, mem_assigned, mem_assigned_percent, unit_Num, round(`load`, 2) `load`, round(cpu_weight, 2) cpu_weight, round(memory_weight, 2) mem_weight, leader_count FROM oceanbase.__all_virtual_server_stat ORDER BY zone,svr_ip;" + global: true + - type: sql + sql: "show parameters like 'enable_auto_leader_switch';" + global: true + - type: sql + sql: "show parameters like 'enable_merge_by_turn';" + global: true + - type: sql + sql: "select count(*) from oceanbase.__all_virtual_database where primary_zone != '' group by tenant_id;" + global: true + - type: sql + sql: "select count(*) from oceanbase.__all_virtual_table where primary_zone != '' group by tenant_id;" + global: true + - type: sql + sql: "select count(*) from oceanbase.__all_virtual_tablegroup where primary_zone != '' group by tenant_id;" + global: true + - type: sql + sql: "select tenant_id, table_id, partition_id from oceanbase.__all_virtual_partition_table group by 1,2,3 having min(role) = 2;" + global: true + - type: sql + sql: "select count(*) from oceanbase.__all_virtual_rebalance_task_stat 
where task_type in ('ADD_REPLICA', 'MIGRATE_REPLICA', 'TYPE_TRANSFORM');" + global: true + - type: sql + sql: "select count(*) from oceanbase.__all_virtual_replica_task where cmd_type in ('ADD_REPLICA', 'MIGRATE_REPLICA', 'TYPE_TRANSFORM'); " + global: true + - type: sql + sql: "select * from oceanbase.__all_rootservice_event_history where gmt_create > ${from_time} and gmt_create < ${to_time} order by gmt_create desc;" + global: true + - type: sql + sql: "select * from oceanbase.__all_virtual_election_event_history where gmt_create > ${from_time} and gmt_create < ${to_time} order by gmt_create desc;" + global: true + - type: sql + sql: "(select table_id, partition_idx from oceanbase.__all_virtual_election_info group by table_id, partition_idx) except (select table_id, partition_idx from oceanbase.__all_virtual_election_info where role = 1) ;" + global: true + - type: sql + sql: "(select table_id, partition_idx from oceanbase.__all_virtual_clog_stat group by table_id, partition_idx) except (select table_id, partition_idx from oceanbase.__all_virtual_clog_stat where role = 'LEADER') ;" + global: true + - type: log + grep: "" + global: false + - type: sysstat + sysstat: "" + global: false + - version: "[4.0.0.0, *]" + steps: + - type: sql + sql: "show variables like 'version_comment';" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_ZONES;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_SERVERS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.GV$OB_SERVERS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_UNIT_CONFIGS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_RESOURCE_POOLS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_TENANTS;" + global: true + - type: sql + sql: "SELECT c.TENANT_ID, e.TENANT_NAME, concat(c.NAME, ': ', d.NAME) `pool:conf`,concat(c.UNIT_COUNT, ' unit: ', d.min_cpu, 'C/', ROUND(d.MEMORY_SIZE/1024/1024/1024,0), 'G') unit_info FROM oceanbase.DBA_OB_RESOURCE_POOLS c, oceanbase.DBA_OB_UNIT_CONFIGS d, oceanbase.DBA_OB_TENANTS e WHERE c.UNIT_CONFIG_ID=d.UNIT_CONFIG_ID AND c.TENANT_ID=e.TENANT_ID AND c.TENANT_ID>1000 ORDER BY c.TENANT_ID;" + global: true + - type: sql + sql: "SELECT a.TENANT_NAME,a.TENANT_ID,b.SVR_IP FROM oceanbase.DBA_OB_TENANTS a, oceanbase.GV$OB_UNITS b WHERE a.TENANT_ID=b.TENANT_ID;" + global: true + - type: sql + sql: "show parameters like '%switch%';" + global: true + - type: sql + sql: "select TIMESTAMP,MODULE,EVENT,VALUE1 tenant_id,VALUE2 ls_id,NAME3,VALUE3,NAME4,VALUE4,NAME5,VALUE5,NAME6,VALUE6 from oceanbase.DBA_OB_ROOTSERVICE_EVENT_HISTORY where module like '%disaster%' limit 20;" + global: true + - type: sql + sql: "(select value1, value2 from oceanbase.DBA_OB_ROOTSERVICE_EVENT_HISTORY where event like '%add_ls%') except (select value1, value2 from oceanbase.DBA_OB_SERVER_EVENT_HISTORY where module like 'storage_ha' and event like '%finish_complete%');" + global: true + - type: sql + sql: "select * from oceanbase.DBA_OB_SERVER_EVENT_HISTORY where event like '%migrat%' and name6 like '%fail%' and value6=1;" + global: true + - type: sql + sql: "select * from oceanbase.DBA_OB_SERVER_EVENT_HISTORY where module='FAILURE_DETECTOR' limit 10;" + global: true + - type: sql + sql: "select * from oceanbase.DBA_OB_SERVER_EVENT_HISTORY where module like '%ELECTION%' limit 10;" + global: true + - type: sql + sql: "select * from oceanbase.GV$OB_LOG_STAT where role='LEADER' limit 20;" + global: true + - type: sql + sql: "SELECT TENANT_NAME, 
TENANT_ID, TENANT_ROLE, STATUS, SWITCHOVER_STATUS FROM oceanbase.DBA_OB_TENANTS" + global: true + - type: log + grep: "" + global: false + - type: sysstat + sysstat: "" + global: false diff --git a/handler/gather/tasks/observer/suspend_transaction.yaml b/handler/gather/tasks/observer/suspend_transaction.yaml new file mode 100644 index 00000000..c8b09863 --- /dev/null +++ b/handler/gather/tasks/observer/suspend_transaction.yaml @@ -0,0 +1,62 @@ +info_en: "[suspend transaction]" +info_cn: "[悬挂事务]" +command: obdiag gather scene run --scene=observer.suspend_transaction +task: + - version: "[2.0.0.0, 4.0.0.0]" + steps: + - type: sql + sql: "show variables like 'version_comment';" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.v$ob_cluster" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.__all_zone WHERE name='idc';" + global: true + - type: sql + sql: "SELECT id,svr_ip,svr_port,zone,inner_port,with_rootserver,status,start_service_time,last_offline_time FROM oceanbase.__all_server;" + global: true + - type: sql + sql: "SELECT zone, concat(svr_ip, ':', svr_port) observer, cpu_capacity, cpu_total, cpu_assigned, cpu_assigned_percent, mem_capacity, mem_total, mem_assigned, mem_assigned_percent, unit_Num, round(`load`, 2) `load`, round(cpu_weight, 2) cpu_weight, round(memory_weight, 2) mem_weight, leader_count FROM oceanbase.__all_virtual_server_stat ORDER BY zone,svr_ip;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.__all_virtual_trans_stat WHERE part_trans_action > 2 AND ctx_create_time < date_sub(now(), INTERVAL 600 SECOND) AND is_exiting != 1;" + global: true + - type: log + grep: "" + global: false + - version: "[4.0.0.0, *]" + steps: + - type: sql + sql: "show variables like 'version_comment';" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_ZONES;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_SERVERS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.GV$OB_SERVERS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_UNIT_CONFIGS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_RESOURCE_POOLS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_TENANTS;" + global: true + - type: sql + sql: "SELECT c.TENANT_ID, e.TENANT_NAME, concat(c.NAME, ': ', d.NAME) `pool:conf`,concat(c.UNIT_COUNT, ' unit: ', d.min_cpu, 'C/', ROUND(d.MEMORY_SIZE/1024/1024/1024,0), 'G') unit_info FROM oceanbase.DBA_OB_RESOURCE_POOLS c, oceanbase.DBA_OB_UNIT_CONFIGS d, oceanbase.DBA_OB_TENANTS e WHERE c.UNIT_CONFIG_ID=d.UNIT_CONFIG_ID AND c.TENANT_ID=e.TENANT_ID AND c.TENANT_ID>1000 ORDER BY c.TENANT_ID;" + global: true + - type: sql + sql: "SELECT a.TENANT_NAME,a.TENANT_ID,b.SVR_IP FROM oceanbase.DBA_OB_TENANTS a, oceanbase.GV$OB_UNITS b WHERE a.TENANT_ID=b.TENANT_ID;" + global: true + - type: sql + sql: "SELECT count(1) FROM oceanbase.GV$OB_TRANSACTION_PARTICIPANTS WHERE CTX_CREATE_TIME < date_sub(now(), INTERVAL 600 SECOND) AND (STATE = 'PREPARE' OR STATE = 'REDO COMPLETE' OR STATE ='PRECOMMIT');" + global: true + - type: log + grep: "" + global: false diff --git a/handler/gather/tasks/observer/unit_data_imbalance.yaml b/handler/gather/tasks/observer/unit_data_imbalance.yaml new file mode 100644 index 00000000..40923da1 --- /dev/null +++ b/handler/gather/tasks/observer/unit_data_imbalance.yaml @@ -0,0 +1,137 @@ +info_en: "[unit data imbalance]" +info_cn: "[unit迁移/缩小 副本不均衡问题]" +command: obdiag gather scene run --scene=observer.unit_data_imbalance +task: + - version: 
"[2.0.0.0, 4.0.0.0]" + steps: + - type: sql + sql: "show variables like 'version_comment';" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.v$ob_cluster" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.__all_zone WHERE name='idc';" + global: true + - type: sql + sql: "SELECT id,svr_ip,svr_port,zone,inner_port,with_rootserver,status,start_service_time,last_offline_time FROM oceanbase.__all_server;" + global: true + - type: sql + sql: "SELECT zone, concat(svr_ip, ':', svr_port) observer, cpu_capacity, cpu_total, cpu_assigned, cpu_assigned_percent, mem_capacity, mem_total, mem_assigned, mem_assigned_percent, unit_Num, round(`load`, 2) `load`, round(cpu_weight, 2) cpu_weight, round(memory_weight, 2) mem_weight, leader_count FROM oceanbase.__all_virtual_server_stat ORDER BY zone,svr_ip;" + global: true + - type: sql + sql: "show parameters like '%data_disk_usage_limit_percentage%';" + global: true + - type: sql + sql: "show parameters like '%migration_disable_time%';" + global: true + - type: sql + sql: "show parameters like '%sys_bkgd_net_percentage%';" + global: true + - type: sql + sql: "show parameters like '%balancer_idle_time%';" + global: true + - type: sql + sql: "show parameters like '%server_data_copy_in_concurrency%';" + global: true + - type: sql + sql: "show parameters like '%server_data_copy_out_concurrency%';" + global: true + - type: sql + sql: "show parameters like '%data_copy_concurrency%';" + global: true + - type: sql + sql: "show parameters like '%server_permanent_offline_time%';" + global: true + - type: sql + sql: "show parameters like '%migrate_concurrency%';" + global: true + - type: sql + sql: "show parameters like '%enable_rebalance%';" + global: true + - type: sql + sql: "show parameters like '%enable_rereplication%';" + global: true + - type: sql + sql: "show parameters like '%enable_auto_leader_switch%';" + global: true + - type: sql + sql: "select * from oceanbase.__all_unit;" + global: true + - type: sql + sql: "select * from oceanbase.__all_virtual_replica_task;" + global: true + - type: sql + sql: "select * from oceanbase.__all_virtual_rebalance_task_stat;" + global: true + - type: sql + sql: "select * from oceanbase.__all_virtual_sys_task_status;" + global: true + - type: sql + sql: "select * from oceanbase.__all_virtual_partition_migration_status;" + global: true + - type: sql + sql: "select * from oceanbase.__all_rootservice_job where gmt_create > ${from_time} and gmt_create < ${to_time} order by gmt_create desc;" + global: true + - type: sql + sql: "select * from oceanbase.__all_rootservice_event_history where gmt_create > ${from_time} and gmt_create < ${to_time} order by gmt_create desc;" + global: true + - type: sql + sql: "select svr_ip,total_size / 1024 / 1024 / 1024 total_G,free_size / 1024 / 1024 / 1024 free_G,(total_size - free_size) / 1024 / 1024 / 1024 used_G,(total_size - free_size) / total_size used_percentage FROM oceanbase.__all_virtual_disk_stat;" + global: true + - type: log + grep: "" + global: false + - version: "[4.0.0.0, *]" + steps: + - type: sql + sql: "show variables like 'version_comment';" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_ZONES;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_SERVERS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.GV$OB_SERVERS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_UNIT_CONFIGS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_RESOURCE_POOLS;" + global: true + - type: 
sql + sql: "SELECT * FROM oceanbase.DBA_OB_TENANTS;" + global: true + - type: sql + sql: "SELECT c.TENANT_ID, e.TENANT_NAME, concat(c.NAME, ': ', d.NAME) `pool:conf`,concat(c.UNIT_COUNT, ' unit: ', d.min_cpu, 'C/', ROUND(d.MEMORY_SIZE/1024/1024/1024,0), 'G') unit_info FROM oceanbase.DBA_OB_RESOURCE_POOLS c, oceanbase.DBA_OB_UNIT_CONFIGS d, oceanbase.DBA_OB_TENANTS e WHERE c.UNIT_CONFIG_ID=d.UNIT_CONFIG_ID AND c.TENANT_ID=e.TENANT_ID AND c.TENANT_ID>1000 ORDER BY c.TENANT_ID;" + global: true + - type: sql + sql: "SELECT a.TENANT_NAME,a.TENANT_ID,b.SVR_IP FROM oceanbase.DBA_OB_TENANTS a, oceanbase.GV$OB_UNITS b WHERE a.TENANT_ID=b.TENANT_ID;" + global: true + - type: sql + sql: "show parameters like '%data_disk_usage_limit_percentage%';" + global: true + - type: sql + sql: "show parameters like '%migration_disable_time%';" + global: true + - type: sql + sql: "show parameters like '%sys_bkgd_net_percentage%';" + global: true + - type: sql + sql: "show parameters like '%balancer_idle_time%';" + global: true + - type: sql + sql: "show parameters like '%server_permanent_offline_time%';" + global: true + - type: sql + sql: "show parameters like '%enable_rebalance%';" + global: true + - type: sql + sql: "show parameters like '%enable_rereplication%';" + global: true + - type: log + grep: "" + global: false diff --git a/handler/gather/tasks/observer/unknown.yaml b/handler/gather/tasks/observer/unknown.yaml new file mode 100644 index 00000000..9727f41e --- /dev/null +++ b/handler/gather/tasks/observer/unknown.yaml @@ -0,0 +1,74 @@ +info_en: "[unknown problem]" +info_cn: "[未能明确问题的场景]" +command: obdiag gather scene run --scene=observer.unknown +task: + - version: "[2.0.0.0, 4.0.0.0]" + steps: + - type: sql + sql: "show variables like 'version_comment';" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.v$ob_cluster" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.__all_zone WHERE name='idc';" + global: true + - type: sql + sql: "SELECT id,svr_ip,svr_port,zone,inner_port,with_rootserver,status,start_service_time,last_offline_time FROM oceanbase.__all_server;" + global: true + - type: sql + sql: "SELECT zone, concat(svr_ip, ':', svr_port) observer, cpu_capacity, cpu_total, cpu_assigned, cpu_assigned_percent, mem_capacity, mem_total, mem_assigned, mem_assigned_percent, unit_Num, round(`load`, 2) `load`, round(cpu_weight, 2) cpu_weight, round(memory_weight, 2) mem_weight, leader_count FROM oceanbase.__all_virtual_server_stat ORDER BY zone,svr_ip;" + global: true + - type: sql + sql: "show parameters" + global: true + - type: log + global: false + grep: "" + - type: sysstat + global: false + sysstat: "" + - version: "[4.0.0.0, *]" + steps: + - type: sql + sql: "show variables like 'version_comment';" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_ZONES;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_SERVERS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.GV$OB_SERVERS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_UNIT_CONFIGS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_RESOURCE_POOLS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_TENANTS;" + global: true + - type: sql + sql: "SELECT c.TENANT_ID, e.TENANT_NAME, concat(c.NAME, ': ', d.NAME) `pool:conf`,concat(c.UNIT_COUNT, ' unit: ', d.min_cpu, 'C/', ROUND(d.MEMORY_SIZE/1024/1024/1024,0), 'G') unit_info FROM oceanbase.DBA_OB_RESOURCE_POOLS c, oceanbase.DBA_OB_UNIT_CONFIGS d, oceanbase.DBA_OB_TENANTS e 
WHERE c.UNIT_CONFIG_ID=d.UNIT_CONFIG_ID AND c.TENANT_ID=e.TENANT_ID AND c.TENANT_ID>1000 ORDER BY c.TENANT_ID;" + global: true + - type: sql + sql: "SELECT a.TENANT_NAME,a.TENANT_ID,b.SVR_IP FROM oceanbase.DBA_OB_TENANTS a, oceanbase.GV$OB_UNITS b WHERE a.TENANT_ID=b.TENANT_ID;" + global: true + - type: sql + sql: "show parameters" + global: true + - type: log + global: false + grep: "" + - type: sysstat + global: false + sysstat: "" + - type: ssh + ssh: "ps -ef | grep observer" + global: false + - type: ssh + ssh: "cat /proc/sys/kernel/core_pattern" + global: false diff --git a/handler/gather/tasks/other/application_error.yaml b/handler/gather/tasks/other/application_error.yaml new file mode 100644 index 00000000..1d3d95c1 --- /dev/null +++ b/handler/gather/tasks/other/application_error.yaml @@ -0,0 +1,68 @@ +info_en: "[application error]" +info_cn: "[应用报错问题]" +command: obdiag gather scene run --scene=other.application_error +task: + - version: "[2.0.0.0, 4.0.0.0]" + steps: + - type: sql + sql: "show variables like 'version_comment';" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.v$ob_cluster" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.__all_zone WHERE name='idc';" + global: true + - type: sql + sql: "SELECT id,svr_ip,svr_port,zone,inner_port,with_rootserver,status,start_service_time,last_offline_time FROM oceanbase.__all_server;" + global: true + - type: sql + sql: "SELECT zone, concat(svr_ip, ':', svr_port) observer, cpu_capacity, cpu_total, cpu_assigned, cpu_assigned_percent, mem_capacity, mem_total, mem_assigned, mem_assigned_percent, unit_Num, round(`load`, 2) `load`, round(cpu_weight, 2) cpu_weight, round(memory_weight, 2) mem_weight, leader_count FROM oceanbase.__all_virtual_server_stat ORDER BY zone,svr_ip;" + global: true + - type: sql + sql: "show parameters" + global: true + - type: log + grep: "" + global: false + - type: obproxy_log + grep: "" + global: false + - version: "[4.0.0.0, *]" + steps: + - type: sql + sql: "show variables like 'version_comment';" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_ZONES;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_SERVERS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.GV$OB_SERVERS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_UNIT_CONFIGS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_RESOURCE_POOLS;" + global: true + - type: sql + sql: "SELECT * FROM oceanbase.DBA_OB_TENANTS;" + global: true + - type: sql + sql: "SELECT c.TENANT_ID, e.TENANT_NAME, concat(c.NAME, ': ', d.NAME) `pool:conf`,concat(c.UNIT_COUNT, ' unit: ', d.min_cpu, 'C/', ROUND(d.MEMORY_SIZE/1024/1024/1024,0), 'G') unit_info FROM oceanbase.DBA_OB_RESOURCE_POOLS c, oceanbase.DBA_OB_UNIT_CONFIGS d, oceanbase.DBA_OB_TENANTS e WHERE c.UNIT_CONFIG_ID=d.UNIT_CONFIG_ID AND c.TENANT_ID=e.TENANT_ID AND c.TENANT_ID>1000 ORDER BY c.TENANT_ID;" + global: true + - type: sql + sql: "SELECT a.TENANT_NAME,a.TENANT_ID,b.SVR_IP FROM oceanbase.DBA_OB_TENANTS a, oceanbase.GV$OB_UNITS b WHERE a.TENANT_ID=b.TENANT_ID;" + global: true + - type: sql + sql: "show parameters" + global: true + - type: log + grep: "" + global: false + - type: obproxy_log + grep: "" + global: false diff --git a/handler/rca/__init__.py b/handler/rca/__init__.py new file mode 100644 index 00000000..d85f698e --- /dev/null +++ b/handler/rca/__init__.py @@ -0,0 +1,18 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -* +# Copyright (c) 2022 OceanBase +# OceanBase Diagnostic Tool is 
licensed under Mulan PSL v2. +# You can use this software according to the terms and conditions of the Mulan PSL v2. +# You may obtain a copy of Mulan PSL v2 at: +# http://license.coscl.org.cn/MulanPSL2 +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, +# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, +# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +# See the Mulan PSL v2 for more details. + +""" +@time: 2023/12/22 +@file: __init__.py +@desc: +""" + diff --git a/handler/rca/rca_exception.py b/handler/rca/rca_exception.py new file mode 100644 index 00000000..7cfcff95 --- /dev/null +++ b/handler/rca/rca_exception.py @@ -0,0 +1,54 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -* +# Copyright (c) 2022 OceanBase +# OceanBase Diagnostic Tool is licensed under Mulan PSL v2. +# You can use this software according to the terms and conditions of the Mulan PSL v2. +# You may obtain a copy of Mulan PSL v2 at: +# http://license.coscl.org.cn/MulanPSL2 +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, +# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, +# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +# See the Mulan PSL v2 for more details. + +""" +@time: 2023/12/22 +@file: rca_exception.py +@desc: +""" +import pprint + + +# rce +class RCAInitException(Exception): + def __init__(self, msg=None, obj=None): + self.msg, self.obj = msg, obj + + def __repr__(self): + return '%s %s' % (self.msg, self.obj is not None and pprint.pformat(self.obj) or '') + + def __str__(self): + return repr(self) + + + +class RCAExecuteException(Exception): + def __init__(self, msg=None, obj=None): + self.msg, self.obj = msg, obj + + def __repr__(self): + return '%s %s' % (self.msg, self.obj is not None and pprint.pformat(self.obj) or '') + + def __str__(self): + return repr(self) + + + +class RCANotNeedExecuteException(Exception): + def __init__(self, msg=None, obj=None): + self.msg, self.obj = msg, obj + + def __repr__(self): + return '%s %s' % (self.msg, self.obj is not None and pprint.pformat(self.obj) or '') + + def __str__(self): + return repr(self) diff --git a/handler/rca/rca_handler.py b/handler/rca/rca_handler.py new file mode 100644 index 00000000..58045ce6 --- /dev/null +++ b/handler/rca/rca_handler.py @@ -0,0 +1,86 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -* +# Copyright (c) 2022 OceanBase +# OceanBase Diagnostic Tool is licensed under Mulan PSL v2. +# You can use this software according to the terms and conditions of the Mulan PSL v2. +# You may obtain a copy of Mulan PSL v2 at: +# http://license.coscl.org.cn/MulanPSL2 +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, +# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, +# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +# See the Mulan PSL v2 for more details. 
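The three exception types above share one shape (a message plus an optionally pprint'ed payload) but carry different control-flow meaning: RCANotNeedExecuteException marks a scene with nothing to analyze, while RCAInitException and RCAExecuteException mark real failures. A minimal sketch, assuming only the classes defined above, of how a caller is expected to tell the cases apart:

    from handler.rca.rca_exception import (RCAExecuteException,
                                           RCANotNeedExecuteException)

    def run_scene(scene):
        # `scene` is any object exposing execute(), like the scene classes below.
        try:
            scene.execute()
        except RCANotNeedExecuteException as e:
            print("scene skipped, nothing to analyze: {0}".format(e))  # non-fatal
        except RCAExecuteException:
            raise  # a real failure: let the caller see it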
+ +""" +@time: 2023/12/22 +@file: rca_handler.py +@desc: +""" +import datetime + +from common.logger import logger +from handler.rca.rca_exception import RCANotNeedExecuteException +from handler.rca.rca_scene import rca_map +from utils.utils import node_cut_passwd_for_log + + +def scene_exist(scene_name): + if scene_name in rca_map: + return True + else: + return False + + +class RCAHandler: + + def __init__(self, cluster, nodes, obproxy_nodes, + result_path="./rca/"): + self.rca_scene_parameters = None + self.rca_scene = None + self.cluster = cluster + self.nodes = nodes + self.obproxy_nodes = obproxy_nodes + self.result_path = result_path + + # init input parameters + self.report = None + self.tasks = None + logger.debug("RCAHandler init.cluster:{0}, init.nodes:{1}, init.obproxy_nodes:{2}, init.result_path:{3}".format( + self.cluster.get( + "ob_cluster_name") or self.cluster.get( + "obproxy_cluster_name"), node_cut_passwd_for_log(self.nodes), node_cut_passwd_for_log(self.obproxy_nodes), self.result_path)) + + def get_result_path(self): + return self.result_path + + def handle(self, args): + if getattr(args, "parameters"): + self.rca_scene_parameters = getattr(args, "parameters", "")[0].strip() + if getattr(args, "result_path"): + self.result_path = getattr(args, "result_path", "./rca/")[0].strip() + + if getattr(args, "scene") and scene_exist(getattr(args, "scene")[0]): + self.rca_scene = rca_map[getattr(args, "scene")[0]] + self.result_path = "{0}/{1}_{2}".format(self.result_path, getattr(args, "scene")[0].strip(), + datetime.datetime.now().strftime('%Y%m%d%H%M%S')) + self.rca_scene.init(self.cluster, self.nodes, self.obproxy_nodes, + env=self.rca_scene_parameters, result_path=self.result_path) + + else: + raise Exception("rca_scene :{0} is not exist or not input".format(getattr(args, "scene", ""))) + + # get all tasks + def execute(self): + try: + self.rca_scene.execute() + except RCANotNeedExecuteException as e: + logger.warning("rca_scene.execute not need execute: {0}".format(e)) + pass + except Exception as e: + logger.error("rca_scene.execute err: {0}".format(e)) + raise Exception("rca_scene.execute err: {0}".format(e)) + try: + self.rca_scene.export_result() + except Exception as e: + logger.error("rca_scene.export_result err: {0}".format(e)) + raise Exception("rca_scene.export_result err: {0}".format(e)) + diff --git a/handler/rca/rca_list.py b/handler/rca/rca_list.py new file mode 100644 index 00000000..4d7ec6ad --- /dev/null +++ b/handler/rca/rca_list.py @@ -0,0 +1,61 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -* +# Copyright (c) 2022 OceanBase +# OceanBase Diagnostic Tool is licensed under Mulan PSL v2. +# You can use this software according to the terms and conditions of the Mulan PSL v2. +# You may obtain a copy of Mulan PSL v2 at: +# http://license.coscl.org.cn/MulanPSL2 +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, +# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, +# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +# See the Mulan PSL v2 for more details. 
+ +""" +@time: 2024/01/23 +@file: rca_list.py +@desc: +""" +from common.logger import logger +from dataclasses import dataclass +from utils.print_utils import print_scene, print_title + +@dataclass +class RegisteredScene: + name: str + command: str + info_en: str + info_cn: str + + +scene_list = [ + RegisteredScene( + 'major_hold', + 'obdiag rca run --scene=major_hold', + '[root cause analysis of major hold]', + '[针对卡合并场景的根因分析]' + ), + RegisteredScene( + 'disconnection', + 'obdiag rca run --scene=disconnection', + '[root cause analysis of disconnection]', + '[针对断链接场景的根因分析]' + ), + RegisteredScene('lock_conflict', 'obdiag rca run --scene=lock_conflict', '[root cause analysis of lock conflict]', '[针对锁冲突的根因分析]'), +] + + +class RcaScenesListHandler: + def handle(self, args): + logger.debug("list rca scenes") + scenes_map = self.__get_scenes() + self.__print_scenes_data(scenes_map) + + def __print_scenes_data(self,scenes): + print_title("Rca Scenes") + print_scene(scenes) + + def __get_scenes(self): + scenes_map = {} + for scene in scene_list: + scenes_map[scene.name]={"name": scene.name, "command": scene.command, "info_en": scene.info_en, "info_cn": scene.info_cn} + return scenes_map \ No newline at end of file diff --git a/handler/rca/rca_scene/__init__.py b/handler/rca/rca_scene/__init__.py new file mode 100644 index 00000000..90641263 --- /dev/null +++ b/handler/rca/rca_scene/__init__.py @@ -0,0 +1,26 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -* +# Copyright (c) 2022 OceanBase +# OceanBase Diagnostic Tool is licensed under Mulan PSL v2. +# You can use this software according to the terms and conditions of the Mulan PSL v2. +# You may obtain a copy of Mulan PSL v2 at: +# http://license.coscl.org.cn/MulanPSL2 +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, +# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, +# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +# See the Mulan PSL v2 for more details. + +""" +@time: 2023/12/22 +@file: __init__.py +@desc: +""" +from handler.rca.rca_scene.disconnection_scene import DisconnectionScene +from handler.rca.rca_scene.lock_conflict_scene import LockConflictScene +from handler.rca.rca_scene.major_hold_scene import MajorHoldScene + +rca_map = {} +rca_map["major_hold"] = MajorHoldScene() +rca_map["lock_conflict"] = LockConflictScene() +rca_map["disconnection"] = DisconnectionScene() + diff --git a/handler/rca/rca_scene/disconnection_scene.py b/handler/rca/rca_scene/disconnection_scene.py new file mode 100644 index 00000000..f7f72c6f --- /dev/null +++ b/handler/rca/rca_scene/disconnection_scene.py @@ -0,0 +1,298 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -* +# Copyright (c) 2022 OceanBase +# OceanBase Diagnostic Tool is licensed under Mulan PSL v2. +# You can use this software according to the terms and conditions of the Mulan PSL v2. +# You may obtain a copy of Mulan PSL v2 at: +# http://license.coscl.org.cn/MulanPSL2 +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, +# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, +# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +# See the Mulan PSL v2 for more details. 
+ +""" +@time: 2023/12/22 +@file: disconnection_scene.py +@desc: +""" +import re +import time +import datetime + +from common.command import get_obproxy_version +from common.logger import logger +from handler.rca.rca_scene.scene_base import scene_base, Result, RCA_ResultRecord +from utils.shell_utils import SshHelper +from utils.version_utils import compare_versions_greater + + +class DisconnectionScene(scene_base): + def __init__(self): + super().__init__() + + def init(self, cluster, nodes, obproxy_nodes, env, result_path): + super().init(cluster, nodes, obproxy_nodes, env, result_path) + + for node in obproxy_nodes: + if "home_path" not in node or len(node["home_path"].strip()) == 0: + raise Exception("obproxy_node home_path is empty") + try: + is_ssh = True + ssh_helper = SshHelper(is_ssh, node.get("ip"), + node.get("user"), + node.get("password"), + node.get("port"), + node.get("private_key"), + node) + except Exception as e: + logger.error( + "SshHandler init fail. Please check the NODES conf. node: {0}. Exception : {1} .".format(node, e)) + raise Exception( + "SshHandler init fail. Please check the NODES conf node: {0} Exception : {1} .".format(node, e)) + obproxy_version = get_obproxy_version(True, ssh_helper, node.get("home_path")) + if obproxy_version is None: + raise Exception("obproxy version is None. Please check the NODES conf.") + + if not (obproxy_version == "4.2.2.0" or compare_versions_greater(obproxy_version, "4.2.2.0")): + raise Exception("obproxy version must be greater than 4.2.2.0. Please check the NODES conf.") + + def execute(self): + for node in self.obproxy_nodes: + self.__execute_obproxy_one_node(node) + logger.info("end disconnectionScene execute all nodes") + + def export_result(self): + return self.Result.export() + + def __execute_obproxy_one_node(self, node): + ssh = SshHelper(True, node.get("ip"), + node.get("user"), + node.get("password"), + node.get("port"), + node.get("private_key"), + node) + all_log = ssh.ssh_exec_cmd( + 'grep "CONNECTION](trace_type" -m 100 $(ls {0}/log/obproxy_diagnosis.log* | head -10 ) '.format( + node['home_path']) + ) + + log_list = all_log.strip().split('\n') + for line in log_list: + try: + record = RCA_ResultRecord() + record.add_record( + "node:{1} obproxy_diagnosis_log:{0}".format(line, node.get("ip"))) + log_check = DisconnectionLog(line, record) + suggest = log_check.execute() + record.add_suggest(suggest) + logger.debug("suggest:{0}".format(suggest)) + + # self.Result.suggest += "obproxy_diagnosis_log:{0}\nsuggest:{1}\n\n".format(line, suggest) + self.Result.records.append(record) + except Exception as e: + logger.warning("line in log_list is error, log: {0} ,err:{1}".format(line, e)) + continue + + +class DisconnectionLog: + def __init__(self, log, record): + self.record = record + logger.debug("DisconnectionLog base:{0}".format(log)) + if log is None or len(log.strip()) == 0: + logger.debug("log is None or len(log.strip()) == 0") + raise Exception("log is None or len(log.strip()) == 0") + + self.timeout_event = "" + try: + self.log = log + + pattern = re.compile( + r'trace_type="(.*?)".*' + r'cs_id:(\d+).*' + r'server_session_id:(\d+).*' + r'error_code:([-0-9]+).*' + r'error_msg:"(.*?)"' + + ) + + # 搜索日志条目 + matches = pattern.search(log) + + # 如果找到匹配项,则提取所需信息 + if matches: + trace_type = matches.group(1) + cs_id = matches.group(2) + server_session_id = matches.group(3) + error_code = matches.group(4) + error_msg = matches.group(5) + # 打印所需信息 + self.trace_type = trace_type + self.error_code = error_code + self.error_msg 
= error_msg + timeout_event_pattern = re.compile(r'timeout_event:"(.*?)".*') + timeout_event_matches = timeout_event_pattern.search(log) + if timeout_event_matches and self.trace_type == "TIMEOUT_TRACE": + timeout_event = timeout_event_matches.group(1) + self.error_msg = timeout_event + if self.trace_type == "SERVER_INTERNAL_TRACE": + self.trace_type = "PROXY_INTERNAL_TRACE" + record.add_record("cs_id:{0}, server_session_id:{1}".format(cs_id, server_session_id)) + + except Exception as e: + logger.error("DisconnectionLog err: {0}".format(e)) + + def execute(self): + # self.get_suggest() + try: + suggest = get_disconnectionSuggest(self.trace_type, self.error_code, self.error_msg, self.record) + return suggest + except Exception as e: + raise Exception("DisconnectionLog execute err: {0}".format(e)) + + +DisconnectionAllSuggest = { + "LOGIN_TRACE": { + "-4669": { + "does not exist": "Ensure the existence of the corresponding cluster, which can be confirmed by directly connecting to ObServer", + "cluster info is empty": "Directly connect to the Observer to execute the sql statement in the internal_sql field to confirm whether the cluster information returned by the Observer is empty", + }, + "-4043": { + "dummy entry is empty, please check if the tenant exists": "Ensure the existence of the corresponding tenant, which can be confirmed by directly connecting to ObServer" + }, + "-8205": { + "can not pass white list": "Confirm whether the ObProxy whitelist is configured correctly through OCP" + }, + "-1227": { + "Access denied": "Confirm whether the ObServer whitelist is configured correctly" + }, + "-5059": { + "too many sessions": "You can adjust the global ObProxy configuration client_max_connections to temporarily avoid it.", + "hold too many connections": "Contact the public cloud platform to adjust the connection limit for cloud tenants", + + }, + "-8004": { + "obproxy is configured to use ssl connection": "Modify the SSL protocol configuration enable_client_ssl, or use SSL protocol access", + + }, + + "-10021": { + "user proxyro is rejected while proxyro_check on": "Do not use proxyro@sys to access databases directly", + "connection with cluster name and tenant name is rejected while cloud_full_user_name_check off": "Do not use proxyro@sys to access databases directly", + "cluster name and tenant name is required while full_username_check on": "When a non-cloud user turns off enable_full_user_name, ObProxy restricts non-three-segment access", + + }, + "-10018": { + "fail to check observer version, proxyro@sys access denied, error resp": "The default proxyro password set at deployment is fine. 
If you manually changed the password of the proxyro user, make sure the corresponding ObProxy startup parameter is configured correctly", + "fail to check observer version, empty result": "You can confirm whether the server ip configured when the ObProxy was started is available by directly connecting to the ObServer.", + "fail to check observer version": "Directly connect to the Observer to execute the sql statement in the internal_sql field to confirm whether the cluster information returned by the Observer is empty", + "fail to check cluster info": "Directly connect to the Observer to execute the sql statement in the internal_sql field to confirm whether the cluster information returned by the Observer is empty", + "fail to init server state": "Directly connect to the Observer to execute the sql statement in the internal_sql field to confirm whether the cluster information returned by the Observer is empty", + + }, + "-10301": { + "fail to fetch root server list from config server " + "fail to fetch root server list from local": "You can manually pull the url of the config_server configured at startup to confirm whether the information returned by the config server is normal", + }, + + }, + "TIMEOUT_TRACE": { + "-10022": { + "CLIENT_DELETE_CLUSTER_RESOURCE": "You can temporarily avoid it by adjusting the obproxy cluster_expire_time configuration. The default expiration time is one day, and a new request resets the expiration time.", + "CLIENT_INTERNAL_CMD_TIMEOUT": "Unexpected timeout, requiring customer environment cooperation for diagnosis", + "CLIENT_CONNECT_TIMEOUT": "Unexpected timeout, requiring customer environment cooperation for diagnosis", + "CLIENT_NET_READ_TIMEOUT": "Modify the observer net_read_timeout variable at the global level; note that the change will not take effect on existing connections.", + "CLIENT_NET_WRITE_TIMEOUT": "Modify the observer net_write_timeout variable at the global level; note that the change will not take effect on existing connections.", + "CLIENT_WAIT_TIMEOUT": "Modify the observer wait_timeout variable to temporarily avoid it", + "SERVER_QUERY_TIMEOUT": "Modify the observer ob_query_timeout variable, or the obproxy observer_query_timeout_delta configuration, to temporarily avoid it", + "SERVER_TRX_TIMEOUT": "Modify the variable ob_trx_timeout to temporarily avoid it", + "SERVER_WAIT_TIMEOUT": "Modify the observer wait_timeout variable to temporarily avoid it", + }, + }, + "SERVER_VC_TRACE": { + "-10013": { + "Fail to build connection to observer": "Need the cooperation of the observer for diagnosis" + }, + "-10014": { + " received while proxy transferring request": "Need the cooperation of the observer for diagnosis" + }, + "-10016": { + " received while proxy reading response": "Need the cooperation of the observer for diagnosis" + } + }, + "CLIENT_VC_TRACE": { + "-10010": { + " received from client while obproxy reading request": "Need client cooperation for diagnosis", + }, + "-10011": { + " received from client while obproxy handling response": "Need client cooperation for diagnosis", + }, + "-10012": { + " received from client while obproxy transferring response": "Need client cooperation for diagnosis", + }, + }, + "PROXY_INTERNAL_TRACE": { + "-4664": { + "dummy entry is empty, disconnect": "Unexpected error scenario", + }, + "-10018": { + "proxy execute internal request failed, received error resp, error_type:": "Unexpected error scenario", + }, + "-10019": { + "OBProxy 
reached the maximum number of retrying request": "Unexpected error scenario", + }, + "-10001": { + "target session is closed, disconnect": "Unexpected error scenario", + "": "Unexpected error scenario", + "ora fatal error": "Unexpected error scenario", + "primary cluster switchover to standby, disconnect": "Connections may be dropped during a primary/standby switchover; this is consistent with the expected scenario", + }, + "-5065": { + "connection was killed by user self, cs_id": "In line with the expected scenario, the diagnostic log is recorded", + "connection was killed by user session": "In line with the expected scenario, the diagnostic log is recorded" + }, + }, + +} + + +def get_disconnectionSuggest(trace_type, error_code, error_msg, record): + if trace_type == "" or error_code == "" or error_msg == "": + raise Exception( + "No matching suggestion found. Please contact the community and upload the exception information. trace_type:{0}, error_code:{1}, error_msg:{2}".format( + trace_type, error_code, error_msg)) + Suggest_trace_type = DisconnectionAllSuggest.get(trace_type) + record.add_record('trace_type:{0}'.format(trace_type)) + if Suggest_trace_type: + Suggest_error_code = Suggest_trace_type.get(error_code) + record.add_record('error_code:{0}'.format(error_code)) + if Suggest_error_code: + suggest = "" + error_msgs = Suggest_error_code.keys() + for suggest_error_msg in error_msgs: + # substring match against the log's error_msg + if suggest_error_msg in error_msg: + logger.info( + "find the suggest. trace_type:{0}, error_code:{1}, error_msg:{2}".format(trace_type, error_code, + error_msg)) + suggest += "\n" + suggest += Suggest_error_code.get(suggest_error_msg) + if suggest.strip() != "": + logger.info( + "find the suggest. trace_type:{0}, error_code:{1}, error_msg:{2}, suggest:{3}".format(trace_type, + error_code, + error_msg, + suggest.strip())) + return suggest.strip() + else: + + suggest = "No matching suggestion found. Please contact the community and upload the exception information. trace_type:{0}, error_code:{1}, error_msg:{2}. You can try the following suggestions or submit the logs to the OceanBase community.".format( + trace_type, error_code, error_msg) + suggest += "\n" + + for error_msg_by_Suggest_error_code in Suggest_error_code: + suggest += Suggest_error_code.get(error_msg_by_Suggest_error_code)+"\n" + return suggest + else: + raise Exception("the disconnection error_code:{0} is not supported.".format(error_code)) + else: + raise Exception("the disconnection trace_type:{0} is not supported.".format(trace_type)) diff --git a/handler/rca/rca_scene/lock_conflict_scene.py b/handler/rca/rca_scene/lock_conflict_scene.py new file mode 100644 index 00000000..dab88102 --- /dev/null +++ b/handler/rca/rca_scene/lock_conflict_scene.py @@ -0,0 +1,148 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -* +# Copyright (c) 2022 OceanBase +# OceanBase Diagnostic Tool is licensed under Mulan PSL v2. +# You can use this software according to the terms and conditions of the Mulan PSL v2. +# You may obtain a copy of Mulan PSL v2 at: +# http://license.coscl.org.cn/MulanPSL2 +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, +# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, +# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +# See the Mulan PSL v2 for more details. 
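The DisconnectionScene above reduces to: pull obproxy_diagnosis.log lines over SSH, parse trace_type, cs_id, server_session_id, error_code and error_msg with the regex in DisconnectionLog, then look the pair up in DisconnectionAllSuggest. The same lookup can be exercised standalone; the log line below is synthetic, shaped only to satisfy that regex (real diagnosis lines carry more fields):

    from handler.rca.rca_scene.disconnection_scene import DisconnectionLog
    from handler.rca.rca_scene.scene_base import RCA_ResultRecord

    line = ('[CONNECTION](trace_type="LOGIN_TRACE" cs_id:1 server_session_id:2 '
            'error_code:-4043 error_msg:"dummy entry is empty, please check if the tenant exists"')
    record = RCA_ResultRecord()
    print(DisconnectionLog(line, record).execute())
    # -> the tenant-existence suggestion registered under LOGIN_TRACE / -4043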
+ +""" +@time: 2023/12/29 +@file: lock_conflict_scene.py +@desc: +""" +from common.command import get_observer_version +from common.logger import logger +from common.ob_connector import OBConnector +from handler.rca.rca_exception import RCAInitException, RCANotNeedExecuteException +from handler.rca.rca_scene.scene_base import scene_base, Result, RCA_ResultRecord +from utils.shell_utils import SshHelper +from utils.version_utils import compare_versions_greater + + +class LockConflictScene(scene_base): + def __init__(self): + super().__init__() + self.ob_connector = None + self.observer_nodes = None + self.ob_cluster = None + self.observer_version = None + self.default_node = None + + def init(self, cluster, nodes, obproxy_nodes, env, result_path): + try: + super().init(cluster, nodes, obproxy_nodes, env, result_path) + self.default_node = self.observer_nodes[0] + + ssh = SshHelper(True, self.default_node.get("ip"), + self.default_node.get("user"), + self.default_node.get("password"), + self.default_node.get("port"), + self.default_node.get("private_key"), + self.default_node) + self.observer_version = get_observer_version(True, ssh, self.default_node["home_path"]) + + self.ob_connector = OBConnector(ip=self.ob_cluster.get("db_host"), + port=self.ob_cluster.get("db_port"), + username=self.ob_cluster.get("tenant_sys").get("user"), + password=self.ob_cluster.get("tenant_sys").get("password"), + timeout=10000) + + except Exception as e: + raise RCAInitException("LockConflictScene RCAInitException: ", e) + + def execute(self): + if self.observer_version is None or len(self.observer_version) == 0: + raise Exception("observer version is None. Please check the NODES conf.") + if self.observer_version == "4.2.0.0" or compare_versions_greater(self.observer_version, "4.2.0.0"): + self.__execute_4_2() + elif compare_versions_greater("4.2.2.0", self.observer_version): + self.__execute_old() + else: + raise Exception("observer version is {0}. Not support".format(self.observer_version)) + + def __execute_4_2(self): + first_record = RCA_ResultRecord() + # get trans_id + cursor = self.ob_connector.execute_sql_return_cursor_dictionary( + 'select * from oceanbase.GV$OB_LOCKS where BLOCK=1 and TYPE="TX" limit 50;') + data = cursor.fetchall() + if len(data) == 0: + first_record.add_record("on GV$OB_LOCKS result is null") + first_record.add_suggest("No block lock found. Not Need Execute") + self.Result.records.append(first_record) + raise RCANotNeedExecuteException("No block lock found.") + first_record.add_record("by select * from oceanbase.GV$OB_LOCKS where BLOCK=1; the len is {0}".format(len(data))) + for OB_LOCKS_data in data: + trans_record = RCA_ResultRecord() + first_record_records = first_record.records.copy() + trans_record.records.extend(first_record_records) + self.Result.records.append(trans_record) + try: + if OB_LOCKS_data.get('TRANS_ID') is None: + trans_record.add_record("trans_id is null") + trans_record.add_suggest("trans_id is null. can not do next") + continue + else: + trans_id = OB_LOCKS_data['TRANS_ID'] + trans_record.add_record("trans_id is {0}".format(trans_id)) + cursor_by_trans_id = self.ob_connector.execute_sql_return_cursor_dictionary( + 'select * from oceanbase.V$OB_TRANSACTION_PARTICIPANTS where TX_ID="{0}";'.format(trans_id)) + session_datas = cursor_by_trans_id.fetchall() + trans_record.add_record( + "get SESSION_ID by trans_id:{0}. get data:{0}".format(trans_id, session_datas)) + if len(session_datas) != 1: + trans_record.add_suggest( + "get SESSION_ID by trans_id:{0}. 
diff --git a/handler/rca/rca_scene/major_hold_scene.py b/handler/rca/rca_scene/major_hold_scene.py
new file mode 100644
index 00000000..cf35bcbb
--- /dev/null
+++ b/handler/rca/rca_scene/major_hold_scene.py
@@ -0,0 +1,487 @@
+#!/usr/bin/env python
+# -*- coding: UTF-8 -*-
+# Copyright (c) 2022 OceanBase
+# OceanBase Diagnostic Tool is licensed under Mulan PSL v2.
+# You can use this software according to the terms and conditions of the Mulan PSL v2.
+# You may obtain a copy of Mulan PSL v2 at:
+# http://license.coscl.org.cn/MulanPSL2
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
+# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
+# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
+# See the Mulan PSL v2 for more details.
+ +""" +@time: 2024/1/2 +@file: major_hold.py +@desc: +""" +import json +import re + +from common.command import get_observer_version +from common.logger import logger +from common.ob_connector import OBConnector +from handler.rca.rca_exception import RCAInitException, RCAExecuteException, RCANotNeedExecuteException +from handler.rca.rca_scene.scene_base import scene_base, Result, RCA_ResultRecord +from utils.shell_utils import SshHelper +from utils.time_utils import DateTimeEncoder +from utils.version_utils import compare_versions_greater + + +class MajorHoldScene(scene_base): + def __init__(self): + super().__init__() + self.local_path = None + self.ob_cluster = None + self.observer_nodes = [] + self.observer_version = "" + self.ob_connector = None + self.Result = Result() + + def init(self, cluster, nodes, obproxy_nodes, env, result_path): + try: + super().__init__() + self.Result.set_save_path(result_path) + self.ob_cluster = cluster + self.observer_nodes = nodes + self.local_path = result_path + node = self.observer_nodes[0] + ssh = SshHelper(True, node.get("ip"), + node.get("user"), + node.get("password"), + node.get("port"), + node.get("private_key"), + node) + self.observer_version = get_observer_version(True, ssh, node["home_path"]) + if self.observer_version is None: + raise Exception("obproxy version is None. Please check the NODES conf.") + + if not (self.observer_version == "4.0.0.0" or compare_versions_greater(self.observer_version, "4.0.0.0")): + raise Exception("observer version must be greater than 4.0.0.0. Please check the NODES conf.") + + self.ob_connector = OBConnector(ip=self.ob_cluster.get("db_host"), + port=self.ob_cluster.get("db_port"), + username=self.ob_cluster.get("tenant_sys").get("user"), + password=self.ob_cluster.get("tenant_sys").get("password"), + timeout=10000) + except Exception as e: + raise RCAInitException("MajorHoldScene RCAInitException: {0}".format(e)) + + def execute(self): + # 前置条件确认 + need_tag = False + first_record = RCA_ResultRecord() + err_tenant_ids = [] + # 合并任务是否有报错 + try: + COMPACTING_data = self.ob_connector.execute_sql( + 'select * from oceanbase.CDB_OB_MAJOR_COMPACTION where IS_ERROR="YES";') + if len(COMPACTING_data) == 0: + first_record.add_record("CDB_OB_MAJOR_COMPACTION is not exist IS_ERROR='YES'") + else: + need_tag = True + CDB_OB_MAJOR_COMPACTION_err_tenant_ids = [] + for data in COMPACTING_data: + CDB_OB_MAJOR_COMPACTION_err_tenant_ids.append(str(data[0])) + + first_record.add_record( + "CDB_OB_MAJOR_COMPACTION have IS_ERROR='YES',the tenant_ids are {0}".format(err_tenant_ids)) + err_tenant_ids.extend(CDB_OB_MAJOR_COMPACTION_err_tenant_ids) + + except Exception as e: + logger.warning("MajorHoldScene execute CDB_OB_MAJOR_COMPACTION panic: {0}".format(e)) + raise RCAExecuteException("MajorHoldScene execute CDB_OB_MAJOR_COMPACTION panic: {0}".format(e)) + # __all_virtual_compaction_diagnose_info里存在status=FAILED的记录 + try: + diagnose_data = self.ob_connector.execute_sql( + 'select * from oceanbase.__all_virtual_compaction_diagnose_info where status="FAILED";') + if len(diagnose_data) == 0: + first_record.add_record('__all_virtual_compaction_diagnose_info is not exist status="FAILED";') + else: + need_tag = True + __all_virtual_compaction_diagnose_info_err_tenant_ids = [] + for data in COMPACTING_data: + __all_virtual_compaction_diagnose_info_err_tenant_ids.append(str(data[0])) + + first_record.add_record( + "__all_virtual_compaction_diagnose_info have status='FAILED',the tenant is {0}".format( + 
__all_virtual_compaction_diagnose_info_err_tenant_ids)) + err_tenant_ids.extend(__all_virtual_compaction_diagnose_info_err_tenant_ids) + except Exception as e: + logger.error("MajorHoldScene execute CDB_OB_MAJOR_COMPACTION panic: {0}".format(e)) + raise RCAExecuteException("MajorHoldScene execute CDB_OB_MAJOR_COMPACTION panic: {0}".format(e)) + # GV$OB_COMPACTION_PROGRESS表中,根据上一次合并记录中的data_size/(estimated_finish_time-start_time)与当前合并版本记录中(data_size-unfinished_data_size)/(当前时间-start_time)相比,如果差距过大(当前合并比上一次合并慢很多,以5倍为指标) + try: + running_data = self.ob_connector.execute_sql( + "select * from oceanbase.GV$OB_COMPACTION_PROGRESS where STATUS <> 'FINISH' and START_TIME <= NOW() - INTERVAL 20 minute GROUP BY COMPACTION_SCN DESC;") + if len(running_data) == 0: + first_record.add_record('No merge tasks that have not ended beyond the expected time') + else: + + time_out_merge_err_tenant_ids = [] + need_tag = True + for data in running_data: + time_out_merge_err_tenant_ids.append(str(data[2])) + first_record.add_record( + "merge tasks that have not ended beyond the expected time,the tenant_id is {0}".format( + time_out_merge_err_tenant_ids)) + logger.info("merge tasks that have not ended beyond the expected time,the tenant_id is {0}".format( + time_out_merge_err_tenant_ids)) + err_tenant_ids.extend(time_out_merge_err_tenant_ids) + except Exception as e: + logger.error("MajorHoldScene execute GV$OB_COMPACTION_PROGRESS panic: {0}".format(e)) + raise RCAExecuteException("MajorHoldScene execute GV$OB_COMPACTION_PROGRESS panic: {0}".format(e)) + if not need_tag: + first_record.add_suggest("major merge abnormal situation not need execute") + self.Result.records.append(first_record) + raise RCANotNeedExecuteException("MajorHoldScene not need execute") + else: + err_tenant_ids = list(set(err_tenant_ids)) + first_record.add_suggest("some tenants need execute MajorHoldScene. 
:{0}".format(err_tenant_ids)) + logger.info("On CDB_OB_MAJOR_COMPACTION") + + # execute record need more + for err_tenant_id in err_tenant_ids: + tenant_record = RCA_ResultRecord() + first_record_records=first_record.records.copy() + tenant_record.records.extend(first_record_records) + logger.info("tenant_id is {0}".format(err_tenant_id)) + tenant_record.add_record("tenant_id is {0}".format(err_tenant_id)) + # 1 + try: + cursor = self.ob_connector.execute_sql_return_cursor_dictionary( + 'SELECT * FROM oceanbase.CDB_OB_MAJOR_COMPACTION WHERE TENANT_ID= "{0}" AND (IS_ERROR = "NO" OR IS_SUSPENDED = "NO");'.format( + err_tenant_id)) + OB_MAJOR_COMPACTION_data = cursor.fetchall() + if len(OB_MAJOR_COMPACTION_data) == 0: + tenant_record.add_record( + "on CDB_OB_MAJOR_COMPACTION where status='COMPACTING'; " + "result:{0} , need not next step".format(str(OB_MAJOR_COMPACTION_data))) + + else: + tenant_record.add_record( + "on CDB_OB_MAJOR_COMPACTION where status='COMPACTING';" + "result:{0}".format(str(OB_MAJOR_COMPACTION_data))) + + except Exception as e: + tenant_record.add_record("#1 on CDB_OB_MAJOR_COMPACTION get data failed") + logger.warning("MajorHoldScene execute exception: {0}".format(e)) + pass + # 2 + try: + compaction_diagnose_info = self.ob_connector.execute_sql( + 'SELECT * FROM oceanbase.__all_virtual_compaction_diagnose_info WHERE status="FAILED";') + + if len(compaction_diagnose_info) == 0: + tenant_record.add_record( + "on __all_virtual_compaction_diagnose_info no data status=FAILED") + else: + tenant_record.add_record( + "on __all_virtual_compaction_diagnose_info;" + "result:{0}".format(str(compaction_diagnose_info))) + + for COMPACTING_data in compaction_diagnose_info: + self.diagnose_info_switch(COMPACTING_data, tenant_record) + + except Exception as e: + tenant_record.add_record("#2&3 on __all_virtual_compaction_diagnose_info get data failed") + logger.warning("#2&3 MajorHoldScene execute exception: {0}".format(e)) + pass + + # 4 + try: + global_broadcast_scn = self.ob_connector.execute_sql( + "select * from oceanbase.CDB_OB_MAJOR_COMPACTION where TENANT_ID='{0}';".format(err_tenant_id))[ + 0][3] + tenant_record.add_record("global_broadcast_scn is {0}".format(global_broadcast_scn)) + last_scn = self.ob_connector.execute_sql( + "select LAST_SCN from oceanbase.CDB_OB_MAJOR_COMPACTION where TENANT_ID='{0}';".format( + err_tenant_id))[0] + tenant_record.add_record("last_scn is {0}".format(last_scn)) + + sql = "select * from oceanbase.GV$OB_COMPACTION_PROGRESS where TENANT_ID='{0}' and COMPACTION_SCN='{1}';".format( + err_tenant_id, global_broadcast_scn) + OB_COMPACTION_PROGRESS_data_global_broadcast_scn = self.ob_connector.execute_sql(sql) + file_name = "{0}/rca_major_hold_{1}_OB_COMPACTION_PROGRESS_data_global_broadcast_scn".format( + self.local_path, err_tenant_id) + with open(file_name, 'w') as f: + f.write(str(OB_COMPACTION_PROGRESS_data_global_broadcast_scn)) + tenant_record.add_record( + "tenant_id:{0} OB_COMPACTION_PROGRESS_data_global_broadcast_scn save on {1}".format(err_tenant_id, + file_name)) + + sql = "select * from oceanbase.GV$OB_COMPACTION_PROGRESS where TENANT_ID='{0}' and COMPACTION_SCN='{1}';".format( + err_tenant_id, last_scn) + OB_COMPACTION_PROGRESS_data_last_scn = self.ob_connector.execute_sql(sql) + file_name = "{0}/rca_major_hold_{1}_OB_COMPACTION_PROGRESS_data_last_scn".format( + self.local_path, err_tenant_id) + with open(file_name, 'w') as f: + f.write(str(OB_COMPACTION_PROGRESS_data_last_scn)) + tenant_record.add_record( + "tenant_id:{0} 
OB_COMPACTION_PROGRESS_data_last_scn save on {1}".format(err_tenant_id, + file_name)) + + sql = "select * from oceanbase.GV$OB_COMPACTION_PROGRESS where TENANT_ID='{0}' and STATUS<>'FINISH';".format( + err_tenant_id, global_broadcast_scn) + finish_data = self.ob_connector.execute_sql(sql) + if len(finish_data) == 0: + tenant_record.add_record("sql:{0},len of result is 0;result:{1}".format(sql, finish_data)) + sql = "select * from oceanbase. where TENANT_ID='{0}' and LS_ID=1".format(err_tenant_id) + svrs = self.ob_connector.execute_sql(sql) + svr_ip = svrs[0][4] + svr_port = svrs[0][5] + node = None + for observer_node in self.observer_nodes: + if observer_node["ip"] == svr_ip and observer_node["port"] == svr_port: + node = observer_node + if node == None: + logger.error( + "can not find ls_svr by TENANT_ID:{2} ip:{0},port:{1}".format(svr_ip, svr_port, + err_tenant_id)) + break + ssh_helper = SshHelper(True, node.get("ip"), + node.get("user"), + node.get("password"), + node.get("port"), + node.get("private_key"), + node) + log_name = "/tmp/major_hold_scene_4_major_merge_progress_checker_{0}.log".format(err_tenant_id) + ssh_helper.ssh_exec_cmd( + 'grep "major_merge_progress_checker" {0}/log/rootservice.log* | grep T{1} -m500 >{2}'.format( + node.get("home_path"), err_tenant_id, log_name)) + ssh_helper.download(log_name, self.local_path) + tenant_record.add_record("download {0} to {1}".format(log_name, self.local_path)) + ssh_helper.ssh_exec_cmd("rm -rf {0}".format(log_name)) + except Exception as e: + logger.error("MajorHoldScene execute 4 exception: {0}".format(e)) + raise RCAExecuteException("MajorHoldScene execute 4 exception: {0}".format(e)) + + # 5 + try: + cursor = self.ob_connector.execute_sql_return_cursor_dictionary( + 'select * from oceanbase.GV$OB_COMPACTION_SUGGESTIONS where tenant_id="{0}";'.format(err_tenant_id)) + columns = [column[0] for column in cursor.description] + OB_COMPACTION_SUGGESTIONS_data = cursor.fetchall() + OB_COMPACTION_SUGGESTIONS_info = json.dumps(OB_COMPACTION_SUGGESTIONS_data, cls=DateTimeEncoder) + file_name = "{0}/rca_major_hold_{1}_OB_COMPACTION_SUGGESTIONS_info".format( + self.local_path, err_tenant_id) + with open(file_name, 'w') as f: + f.write(str(OB_COMPACTION_SUGGESTIONS_info)) + tenant_record.add_record( + "tenant_id:{0} OB_COMPACTION_PROGRESS_data_last_scn save on {1}".format(err_tenant_id, + file_name)) + + except Exception as e: + logger.warning("MajorHoldScene execute 5 exception: {0}".format(e)) + tenant_record.add_suggest("send the {0} to the oceanbase community".format(self.local_path)) + self.Result.records.append(tenant_record) + + def get_info__all_virtual_compaction_diagnose_info(self, tenant_record): + try: + COMPACTING_datas = self.ob_connector.execute_sql( + "SELECT * FROM oceanbase.__all_virtual_compaction_diagnose_info WHERE IS_ERROR = 'NO' OR IS_SUSPENDED = 'NO';") + if len(COMPACTING_datas) == 0: + tenant_record.add_record( + "sql:select * from oceanbase.__all_virtual_compaction_diagnose_info; no data") + return + else: + tenant_record.add_record( + "sql:select * from oceanbase.CDB_OB_MAJOR_COMPACTION where status=COMPACTING; " + "result:{0}".format(str(COMPACTING_datas))) + for index, COMPACTING_data in COMPACTING_datas: + self.diagnose_info_switch(COMPACTING_data) + except Exception as e: + raise RCAExecuteException( + "MajorHoldScene execute get_info__all_virtual_compaction_diagnose_info exception: {0}".format(e)) + + def diagnose_info_switch(self, sql_data, tenant_record): + svr_ip = sql_data[0] + svr_port = sql_data[1] + 
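+        # The remaining fields are read positionally from the
+        # __all_virtual_compaction_diagnose_info row:
+        # [2] tenant_id, [4] ls_id, [5] table_id, [7] create_time,
+        # [8] diagnose_info.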
tenant_id = sql_data[2] + ls_id = sql_data[4] + table_id = sql_data[5] + create_time = sql_data[7] + diagnose_info = sql_data[8] + if "schedule medium failed" in diagnose_info: + node = None + for observer_node in self.observer_nodes: + if svr_ip == node.get("ip"): + node = observer_node + if node is None: + raise RCAExecuteException("can not find observer node by ip:{0}, port:{1}".format(svr_ip, svr_port)) + ssh_helper = SshHelper(True, node.get("ip"), + node.get("user"), + node.get("password"), + node.get("port"), + node.get("private_key"), + node) + log_name = "/tmp/rca_major_hold_schedule_medium_failed_{1}_{2}_{0}.txt".format(tenant_id, svr_ip, + svr_port) + tenant_record.add_record( + "diagnose_info type is 'schedule medium failed'. time is {0},observer is {1}:{2},the log is {3}".format( + create_time, svr_ip, svr_port, log_name)) + ssh_helper.ssh_exec_cmd( + 'grep "schedule_medium_failed" {1}/log/observer.log* |grep -P "\[\d+\]" -m 1 -o >{0}'.format(log_name, + node.get( + "home_path"))) + ssh_helper.download(log_name, local_path=self.local_path) + tenant_record.add_record("download {0} to {1}".format(log_name, self.local_path)) + ssh_helper.ssh_exec_cmd("rm -rf {0}".format(log_name)) + return + elif "error_no=" in diagnose_info and "error_trace=" in diagnose_info: + err_no = re.search("\berror_no=(\d+)\b", diagnose_info).group(1) + err_trace = re.search("\berror_trace=(.+)\b", diagnose_info).group(1) + + global_broadcast_scn = self.ob_connector.execute_sql( + "select * from oceanbase.CDB_OB_MAJOR_COMPACTION where TENANT_ID='{0}';".format(tenant_id))[0][3] + compaction_scn = self.ob_connector.execute_sql( + "select * from oceanbase.__all_virtual_tablet_meta_table where tablet_id='{0}' and tenant_id='{1}';".format( + table_id, tenant_id))[0][7] + if compaction_scn > global_broadcast_scn: + tenant_record.add_record( + "diagnose_info type is error_no. error_no: {0}, err_trace: {1} , table_id:{2}, tenant_id:{3}, compaction_scn: {4}, global_broadcast_scn: {5}. compaction_scn>global_broadcast_scn".format( + err_no, err_trace, table_id, tenant_id, compaction_scn, global_broadcast_scn)) + return + else: + tenant_record.add_record( + "diagnose_info type is error_no. error_no: {0}, err_trace:{1}, table_id:{2}, tenant_id:{3}, compaction_scn: {4}, global_broadcast_scn: {5}. compaction_scn{2}".format(err_trace, node.get("home_path"), log_name)) + ssh_helper.download(log_name, local_path=self.local_path) + tenant_record.add_record("download {0} to {1}".format(log_name, self.local_path)) + ssh_helper.ssh_exec_cmd("rm -rf {0}".format(log_name)) + node = None + for observer_node in self.observer_nodes: + if svr_ip == node.get("ip"): + node = observer_node + if node is None: + raise RCAExecuteException("can not find observer node by ip:{0}, port:{1}".format(svr_ip, svr_port)) + ssh_helper = SshHelper(True, node.get("ip"), + node.get("user"), + node.get("password"), + node.get("port"), + node.get("private_key"), + node) + tenant_record.add_record( + "diagnose_info type is 'error_no'. 
time is {0},observer is {1}:{2},the log is {3}".format( + create_time, svr_ip, svr_port, log_name)) + ssh_helper.ssh_exec_cmd('cat observer.log* |grep "{1}" > /tmp/{0}'.format(log_name, err_trace)) + ssh_helper.download(log_name, local_path=self.local_path) + tenant_record.add_record("download {0} to {1}".format(log_name, self.local_path)) + ssh_helper.ssh_exec_cmd("rm -rf {0}".format(log_name)) + return + elif "weak read ts is not ready" in diagnose_info: + cursor = self.ob_connector.execute_sql_return_cursor_dictionary( + "select * from oceanbase.__all_virtual_ls_info where tenant_id='{0}' and ls_id='{1}';".format(tenant_id, + ls_id)) + columns = [column[0] for column in cursor.description] + all_virtual_ls_info_data = cursor.fetchall() + self.all_virtual_ls_info = json.dumps(all_virtual_ls_info_data, cls=DateTimeEncoder) + tenant_record.add_record( + "sql:" + "select * from oceanbase.__all_virtual_ls_info where tenant_id='{0}' and ls_id='{1}';".format( + tenant_id, ls_id) + + "result:{0}".format(str(self.all_virtual_ls_info))) + return + elif "memtable can not create dag successfully" in diagnose_info: + tenant_record.add_record("diagnose_info type is memtable can not create dag successfully.") + + global_broadcast_scn = self.ob_connector.execute_sql( + "select * from oceanbase.CDB_OB_MAJOR_COMPACTION where TENANT_ID='{0}';".format(tenant_id))[0][3] + compaction_scn = self.ob_connector.execute_sql( + "select * from oceanbase.__all_virtual_tablet_meta_table where tablet_id='{0}' and tenant_id='{1}';".format( + table_id, tenant_id))[0][7] + if compaction_scn > global_broadcast_scn: + tenant_record.add_record( + "diagnose_info type is memtable can not create dag successfully. table_id:{0}, tenant_id:{1}, compaction_scn: {2}, global_broadcast_scn: {3}. 
compaction_scn>global_broadcast_scn".format( + table_id, tenant_id, compaction_scn, global_broadcast_scn)) + return + else: + cursor = self.ob_connector.execute_sql_return_cursor_dictionary( + "select * from oceanbase.__all_virtual_dag_scheduler where svr_ip='{0}' and svr_port='{1}' and tenant_id='{2}';".format( + svr_ip, svr_port, tenant_id)) + columns = [column[0] for column in cursor.description] + all_virtual_ls_info_data = cursor.fetchall() + self.all_virtual_ls_info = json.dumps(all_virtual_ls_info_data, cls=DateTimeEncoder) + tenant_record.add_record( + "sql:" + + "select * from oceanbase.__all_virtual_dag_scheduler where svr_ip='{0}' and svr_port='{1}' and tenant_id='{2}';".format( + svr_ip, svr_port, tenant_id) + + "result:{0}".format(str(self.all_virtual_ls_info))) + + return + elif "medium wait for freeze" in diagnose_info or "major wait for freeze" in diagnose_info: + tenant_record.add_record("diagnose_info type is medium wait for freeze or major wait for freeze.") + cursor = self.ob_connector.execute_sql_return_cursor_dictionary( + "select * from oceanbase.__all_virtual_dag_scheduler where svr_ip='{0}' and svr_port='{1}' and tenant_id='{2}';".format( + svr_ip, svr_port, tenant_id)) + columns = [column[0] for column in cursor.description] + all_virtual_ls_info_data = cursor.fetchall() + self.all_virtual_ls_info = json.dumps(all_virtual_ls_info_data, cls=DateTimeEncoder) + tenant_record.add_record( + "sql:" + + "select * from oceanbase.__all_virtual_dag_scheduler where svr_ip='{0}' and svr_port='{1}' and tenant_id='{2}';".format( + svr_ip, svr_port, tenant_id) + + "result:{0}".format(str(self.all_virtual_ls_info))) + return + elif "major not schedule for long time" in diagnose_info: + tenant_record.add_record("diagnose_info type is major not schedule for long time") + cursor = self.ob_connector.execute_sql_return_cursor_dictionary( + "select * from oceanbase.__all_virtual_tablet_compaction_info where svr_ip='{0}' and svr_port='{1}' and tenant_id='{2}' and ls_id='{3}' and tablet_id='{4}';".format( + svr_ip, svr_port, tenant_id, ls_id, table_id)) + columns = [column[0] for column in cursor.description] + all_virtual_ls_info_data = cursor.fetchall() + all_virtual_tablet_compaction_info = json.dumps(all_virtual_ls_info_data, cls=DateTimeEncoder) + tenant_record.add_record( + "sql:" + + "select * from oceanbase.__all_virtual_tablet_compaction_info where svr_ip='{0}' and svr_port='{1}' and tenant_id='{2}' and ls_id='{3}' and tablet_id='{4}';".format( + svr_ip, svr_port, tenant_id, ls_id, table_id) + + "result:{0}".format(str(all_virtual_tablet_compaction_info))) + node = None + for observer_node in self.observer_nodes: + if svr_ip == node.get("ip"): + node = observer_node + if node is None: + raise RCAExecuteException("can not find observer node by ip:{0}, port:{1}".format(svr_ip, svr_port)) + ssh_helper = SshHelper(True, node.get("ip"), + node.get("user"), + node.get("password"), + node.get("port"), + node.get("private_key"), + node) + + log_name = "/tmp/rca_major_hold_major_not_schedule_for_long_time_{1}_{2}_{0}.txt".format(create_time, + svr_ip, + svr_port) + tenant_record.add_record( + "diagnose_info type is 'major not schedule for long time'. 
time is {0},observer is {1}:{2},the log is {3}".format( + create_time, svr_ip, svr_port, log_name)) + thread_id = ssh_helper.ssh_exec_cmd( + 'cat {0}/log/observer.log* |grep "MediumLoo" -m 1 |grep -P "\[\d+\]" -m 1 -o | grep -oP "\d+"'.format( + node["home_path"], tenant_id)).strip() + ssh_helper.ssh_exec_cmd( + 'cat {0}/log/observer.log | grep "{1}" -m 100> {2}'.format(node["home_path"], thread_id, log_name)) + ssh_helper.download(log_name, local_path=self.local_path) + tenant_record.add_record("download {0} to {1}".format(log_name, self.local_path)) + ssh_helper.ssh_exec_cmd("rm -rf {0}".format(log_name)) + + else: + tenant_record.add_record("diagnose_info type is Unknown.") + + def export_result(self): + return self.Result.export() diff --git a/handler/rca/rca_scene/scene_base.py b/handler/rca/rca_scene/scene_base.py new file mode 100644 index 00000000..f4c3aa6e --- /dev/null +++ b/handler/rca/rca_scene/scene_base.py @@ -0,0 +1,89 @@ +import os + +from prettytable import PrettyTable +from textwrap import fill + +from common.logger import logger + + +class scene_base: + def __init__(self): + self.env = None + self.observer_nodes = None + self.ob_cluster = None + self.result_path = None + self.cluster = None + self.obproxy_nodes = None + self.Result = Result() + + def init(self, cluster, nodes, obproxy_nodes, env, result_path): + self.cluster = cluster + self.obproxy_nodes = obproxy_nodes + self.observer_nodes = nodes + self.env = env + self.ob_cluster = cluster + self.Result.set_save_path(result_path) + pass + + def info(self): + pass + + def execute(self): + pass + + +class Result: + + def __init__(self): + # self.suggest = "" + self.procedure = None + self.records = [] + self.save_path = "./" + + def set_save_path(self, save_path): + self.save_path = os.path.expanduser(save_path) + if os.path.exists(save_path): + self.save_path = save_path + else: + os.makedirs(save_path) + self.save_path = save_path + logger.info("rca result save_path is :{0}".format(self.save_path)) + + def export(self): + record_file_name = "{0}/{1}".format(self.save_path, "record") + logger.info("save record to {0}".format(record_file_name)) + with open(record_file_name, 'w') as f: + for record in self.records: + record_data = record.export_record() + f.write(record_data.get_string()) + f.write("\n") + f.write(record.export_suggest()) + f.write("\n") + + +class RCA_ResultRecord: + def __init__(self): + self.records = [] + self.suggest = "The suggest: " + + def add_record(self, record): + logger.info("add_record:{0}".format(record)) + self.records.append(record) + + def add_suggest(self, suggest): + logger.info("add_suggest:{0}".format(suggest)) + self.suggest += suggest + + def export_suggest(self): + return self.suggest + + def export_record(self): + record_tb = PrettyTable(["step", "info"]) + record_tb.align["info"] = "l" + record_tb.title = "record" + i = 0 + while i < len(self.records): + record_tb.add_row([i + 1, fill(self.records[i], width=100)]) + i += 1 + logger.debug(record_tb) + return record_tb diff --git a/init.sh b/init.sh index b0fe2030..c507a0de 100755 --- a/init.sh +++ b/init.sh @@ -22,6 +22,10 @@ elif [ -d "${WORK_DIR}/handler/checker/tasks" ]; then cp -rf ${WORK_DIR}/handler/checker/tasks ${OBDIAG_HOME}/ fi +if [ -d "${WORK_DIR}/gather" ]; then + cp -rf ${WORK_DIR}/gather ${OBDIAG_HOME}/ +fi + if [ -d "${WORK_DIR}/example" ]; then cp -rf ${WORK_DIR}/example ${OBDIAG_HOME}/ fi diff --git a/init_obdiag_cmd.sh b/init_obdiag_cmd.sh index dc58a407..0c945909 100644 --- a/init_obdiag_cmd.sh +++ 
b/init_obdiag_cmd.sh @@ -1,4 +1,3 @@ - _obdiag_completion() { local cur_word args type_list cur_word="${COMP_WORDS[COMP_CWORD]}" @@ -6,23 +5,36 @@ _obdiag_completion() { case "${COMP_CWORD}" in 1) - type_list="version display-trace config gather analyze check" + type_list="version display-trace config gather analyze check rca" COMPREPLY=($(compgen -W "${type_list}" -- "${cur_word}")) ;; 2) case "${COMP_WORDS[1]}" in gather) - type_list="log clog slog plan_monitor stack perf obproxy_log all" + if [ "$COMP_CWORD" -eq 2 ]; then + type_list="log clog slog plan_monitor stack perf obproxy_log all scene" + elif [ "${COMP_WORDS[2]}" = "scene" ] && [ "$COMP_CWORD" -eq 3 ]; then + type_list="list run" + fi ;; analyze) type_list="log flt_trace" ;; + rca) + type_list="list run" + ;; *) type_list="" ;; esac COMPREPLY=($(compgen -W "${type_list}" -- "${cur_word}")) ;; + 3) + if [ "${COMP_WORDS[1]}" = "gather" ] && [ "${COMP_WORDS[2]}" = "scene" ]; then + type_list="list run" + COMPREPLY=($(compgen -W "${type_list}" -- "${cur_word}")) + fi + ;; *) COMPREPLY=() ;; diff --git a/obdiag_client.py b/obdiag_client.py index f198f496..6c22b22e 100644 --- a/obdiag_client.py +++ b/obdiag_client.py @@ -14,6 +14,10 @@ @file: obdiag_client.py @desc: """ +import uuid + +from prettytable import PrettyTable + from common.command import get_obdiag_display from common.constant import const from handler.analyzer.analyze_flt_trace import AnalyzeFltTraceHandler @@ -23,16 +27,22 @@ from handler.gather.gather_awr import GatherAwrHandler from handler.gather.gather_obproxy_log import GatherObProxyLogHandler from handler.gather.gather_sysstat import GatherOsInfoHandler +from handler.gather.gather_obstack2 import GatherObstack2Handler from handler.gather.gather_obadmin import GatherObAdminHandler from handler.gather.gather_perf import GatherPerfHandler from handler.gather.gather_plan_monitor import GatherPlanMonitorHandler +from handler.gather.gather_scenes import GatherSceneHandler +from handler.gather.scenes.list import GatherScenesListHandler +from handler.rca.rca_list import RcaScenesListHandler from common.config_helper import ConfigHelper import base64 import os import sys from common.logger import logger +from handler.rca.rca_handler import RCAHandler from telemetry.telemetry import telemetry from utils.time_utils import get_current_us_timestamp +from utils.utils import display_trace from utils.yaml_utils import read_yaml_data from utils.version_utils import print_obdiag_version from colorama import Fore, Style @@ -85,6 +95,9 @@ def __init__(self): self.gather_slog_handler = None self.gather_plan_monitor_handler = None self.gather_obproxy_log_handler = None + self.handle_gather_scene_handler = None + self.handle_gather_scene_list_handler = None + self.handle_rca_scenes_list_handler = None # analyze handler self.analyze_log_handler = None self.analyze_flt_trace_handler = None @@ -99,6 +112,8 @@ def __init__(self): self.obdiag_log_file = os.path.join( os.path.expanduser(const.OBDIAG_BASE_DEFAULT_CONFIG["obdiag"]["logger"]["log_dir"]), const.OBDIAG_BASE_DEFAULT_CONFIG["obdiag"]["logger"]["log_filename"]) + # obdiag rca + self.rca_result_path = None def init(self, args): if "c" in args and (getattr(args, "c") is not None): @@ -134,6 +149,20 @@ def init(self, args): return sucess_3 and (sucess_1 or sucess_2) elif "gather_plan_monitor" in args: return self.init_obcluster_config() + elif ("gather_scene" in args) or ("gather_scene_list" in args): + self.init_obcluster_config() + sucess_1 = self.init_observer_node_config() + 
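+            # scene gathering may touch both observer and obproxy nodes, so
+            # overall success requires the scene config plus at least one of
+            # the two node sets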
self.init_obproxy_config() + sucess_2 = self.init_obproxy_node_config() + sucess_3 = self.init_gather_scene_config() + return sucess_3 and (sucess_1 or sucess_2) + elif "rca_run" in args: + self.init_obcluster_config() + sucess_1 = self.init_observer_node_config() + self.init_obproxy_config() + sucess_2 = self.init_obproxy_node_config() + sucess_3 = self.init_rca_config() + return sucess_3 and (sucess_1 or sucess_2) def init_observer_node_config(self): try: @@ -143,7 +172,7 @@ def init_observer_node_config(self): cluster_name = ob_cluster.get("ob_cluster_name") db_host = ob_cluster.get("db_host") - db_port = get_conf_data_str(ob_cluster.get("db_port"),2881) + db_port = get_conf_data_str(ob_cluster.get("db_port"), 2881) ob_servers = ob_cluster.get("servers") global_values = ob_servers.get("global") @@ -190,8 +219,8 @@ def init_observer_node_config(self): return False self.observer_nodes = observer_nodes return True - except: - logger.error("observer node config init Failed") + except Exception as e: + logger.error("observer node config init Failed, error:{0}".format(e)) return False def init_obproxy_node_config(self): @@ -214,7 +243,6 @@ def init_obproxy_node_config(self): global_redo_dir = get_conf_data_str(global_values.get("redo_dir"), global_data_dir) global_node_ip = global_values.get("ip") - nodes = obproxy_servers.get("nodes") for node in nodes: node_config = {} @@ -238,21 +266,23 @@ def init_obproxy_node_config(self): return True else: return False - except: - logger.error("obproxy node config init Failed") + except Exception as e: + logger.error("obproxy node config init Failed, error:{0}".format(e)) return False def init_basic_config(self): try: if self.inner_config.get("obdiag"): self.basic_config = self.inner_config.get("obdiag").get("basic") - self.obdiag_log_file = os.path.join(os.path.expanduser(self.inner_config.get("obdiag").get("logger").get("log_dir")), - self.inner_config.get("obdiag").get("logger").get("log_filename")) + self.obdiag_log_file = os.path.join( + os.path.expanduser(self.inner_config.get("obdiag").get("logger").get("log_dir")), + self.inner_config.get("obdiag").get("logger").get("log_filename")) self.config_file = os.path.expanduser(self.basic_config.get("config_path")) except: self.basic_config = const.OBDIAG_BASE_DEFAULT_CONFIG["obdiag"]["basic"] - self.obdiag_log_file = os.path.join(os.path.expanduser(const.OBDIAG_BASE_DEFAULT_CONFIG["obdiag"]["logger"]["log_dir"]), - const.OBDIAG_BASE_DEFAULT_CONFIG["obdiag"]["logger"]["log_filename"]) + self.obdiag_log_file = os.path.join( + os.path.expanduser(const.OBDIAG_BASE_DEFAULT_CONFIG["obdiag"]["logger"]["log_dir"]), + const.OBDIAG_BASE_DEFAULT_CONFIG["obdiag"]["logger"]["log_filename"]) def init_ocp_config(self): try: @@ -265,8 +295,8 @@ def init_ocp_config(self): return True else: return False - except: - logger.warning("ocp config init Failed") + except Exception as e: + logger.warning("ocp config init Failed, error:{0}".format(e)) return False def init_checker_config(self): @@ -280,8 +310,35 @@ def init_checker_config(self): self.check_tasks_base_path = check_config["tasks_base_path"] self.check_ignore_version = check_config["ignore_version"] return True - except: - logger.error("checker config init Failed") + except Exception as e: + logger.error("checker config init Failed, error:{0}".format(e)) + return False + + def init_gather_scene_config(self): + try: + gather_scene_config = self.inner_config.get("gather") + if gather_scene_config is None: + gather_scene_config = const.OBDIAG_GATHER_DEFAULT_CONFIG + 
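+            # scenes_base_path locates the directory that holds the gather
+            # scene task definitions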
self.gather_scene_base_path = gather_scene_config["scenes_base_path"] + return True + except Exception as e: + logger.error("gather scene config init Failed, error:{0}".format(e)) + return False + + + def init_rca_config(self): + try: + rca_config = self.inner_config.get("rca") + if rca_config is None: + rca_config = const.OBDIAG_RCA_DEFAULT_CONFIG + self.rca_result_path = rca_config["result_path"] + self.rca_result_path = os.path.dirname(self.rca_result_path) + if not os.path.isdir(self.rca_result_path): + logger.warning("rca_result_path is not exist ,mkdir it: {0}".format(self.rca_result_path)) + os.makedirs(self.rca_result_path) + return True + except Exception as e: + logger.error("rca config init Failed, error:{0}".format(e)) return False def init_obproxy_config(self): @@ -293,8 +350,8 @@ def init_obproxy_config(self): if config: self.obproxy_cluster = self.config.get("obproxy") return True - except: - logger.error("obproxy config init Failed") + except Exception as e: + logger.error("obproxy config init Failed, error:{0}".format(e)) return False def init_obcluster_config(self): @@ -306,8 +363,8 @@ def init_obcluster_config(self): if config: self.ob_cluster = self.config.get("obcluster") return True - except: - logger.error("obcluster config init Failed") + except Exception as e: + logger.error("obcluster config init Failed, error:{0}".format(e)) return False def read_config(self, config_file): @@ -330,13 +387,16 @@ def obdiag_display(self, args): def quick_build_configuration(self, args): try: user = getattr(args, "u")[0] - password = getattr(args, "p")[0] + if getattr(args, "p"): + password = getattr(args, "p")[0] + else: + password = "" host = getattr(args, "h")[0] port = getattr(args, "P")[0] config_helper = ConfigHelper(user, password, host, port) config_helper.build_configuration(args, self.config_file, INNER_CONFIG_FILE) - except: - logger.error("Configuration generation failed") + except Exception as e: + logger.error("Configuration generation failed, error:{0}".format(e)) def handle_gather_log_command(self, args): self.gather_log_handler = GatherLogHandler(self.observer_nodes, self.default_collect_pack_dir, @@ -349,6 +409,10 @@ def handle_gather_sysstat_command(self, args): self.gather_timestamp, self.basic_config) return self.gather_sysstat_handler.handle(args) + def handle_gather_obstack_command(self, args): + self.gather_obstack_handler = GatherObstack2Handler(self.observer_nodes, self.default_collect_pack_dir, + self.gather_timestamp, self.basic_config) + return self.gather_obstack_handler.handle(args) def handle_gather_perf_command(self, args): self.gather_perf_handler = GatherPerfHandler(self.observer_nodes, self.default_collect_pack_dir, @@ -375,6 +439,14 @@ def handle_gather_plan_monitor(self, args): self.gather_timestamp) return self.gather_plan_monitor_handler.handle(args) + def handle_gather_scene_command(self, args): + self.handle_gather_scene_handler = GatherSceneHandler(self.obproxy_cluster, self.obproxy_nodes, self.ob_cluster, self.observer_nodes, self.default_collect_pack_dir, self.gather_timestamp, self.gather_scene_base_path) + return self.handle_gather_scene_handler.handle(args) + + def handle_gather_scene_list_command(self, args): + self.handle_gather_scene_list_handler = GatherScenesListHandler(self.gather_scene_base_path) + return self.handle_gather_scene_list_handler.handle(args) + def handle_gather_obproxy_log_command(self, args): self.gather_obproxy_log_handler = GatherObProxyLogHandler(self.obproxy_nodes, self.default_collect_pack_dir, 
self.gather_timestamp, self.basic_config) @@ -402,24 +474,25 @@ def handle_analyze_flt_trace_command(self, args): return self.analyze_flt_trace_handler.handle(args) def handle_check_command(self, args): - obproxy_check_handler=None - observer_check_handler= None + obproxy_check_handler = None + observer_check_handler = None if self.obproxy_cluster is not None: - obproxy_check_handler = CheckHandler(ignore_version=self.check_ignore_version, cluster=self.obproxy_cluster, nodes=self.obproxy_nodes, - export_report_path=self.check_report_path, - export_report_type=self.check_report_type, - case_package_file=self.check_case_package_file, - tasks_base_path=self.check_tasks_base_path, + obproxy_check_handler = CheckHandler(ignore_version=self.check_ignore_version, cluster=self.obproxy_cluster, + nodes=self.obproxy_nodes, + export_report_path=self.check_report_path, + export_report_type=self.check_report_type, + case_package_file=self.check_case_package_file, + tasks_base_path=self.check_tasks_base_path, check_target_type="obproxy") obproxy_check_handler.handle(args) obproxy_check_handler.execute() if self.ob_cluster is not None: - observer_check_handler = CheckHandler(ignore_version=self.check_ignore_version, cluster=self.ob_cluster, + observer_check_handler = CheckHandler(ignore_version=self.check_ignore_version, cluster=self.ob_cluster, nodes=self.observer_nodes, - export_report_path=self.check_report_path, - export_report_type=self.check_report_type, - case_package_file=self.check_case_package_file, - tasks_base_path=self.check_tasks_base_path) + export_report_path=self.check_report_path, + export_report_type=self.check_report_type, + case_package_file=self.check_case_package_file, + tasks_base_path=self.check_tasks_base_path) observer_check_handler.handle(args) observer_check_handler.execute() if obproxy_check_handler is not None: @@ -431,6 +504,27 @@ def handle_check_command(self, args): return + def handle_rca_run_command(self, args): + try: + rca_handler = RCAHandler(cluster=self.ob_cluster, + nodes=self.observer_nodes, + obproxy_nodes=self.obproxy_nodes, + result_path=self.rca_result_path or "./rca") + rca_handler.handle(args) + rca_handler.execute() + logger.info( + "rca finished. For more details, the result on '" + Fore.YELLOW + rca_handler.get_result_path() + Style.RESET_ALL + "' \nYou can get the suggest by '" + Fore.YELLOW + "cat " + rca_handler.get_result_path() + "/record" + Style.RESET_ALL + "'") + except Exception as e: + logger.error("rca failed! 
error msg:" + str(e)) + finally: + display_trace(uuid.uuid3(uuid.NAMESPACE_DNS, str(os.getpid()))) + + return + + def handle_rca_list_command(self,args): + self.handle_rca_scenes_list_handler = RcaScenesListHandler() + return self.handle_rca_scenes_list_handler.handle(args) + def get_conf_data_str(value, dafult_value): if value is None: diff --git a/obdiag_main.py b/obdiag_main.py index 0cfeec32..d93a1926 100644 --- a/obdiag_main.py +++ b/obdiag_main.py @@ -65,6 +65,14 @@ def gather_perf(args): logger.debug("object has no attribute 'gather_perf' pass gather perf info\n") +def gather_obstack(args): + try: + if args.gather_obstack: + args.gather_obstack(args) + except AttributeError: + logger.debug("object has no attribute 'gather_obstack' pass gather ob stack\n") + + def gather_plan_monitor(args): try: if args.gather_plan_monitor: @@ -97,6 +105,21 @@ def gather_obproxy_log(args): logger.debug("object has no attribute 'gather_obproxy_log' pass gather obproxy log\n") +def gather_scene(args): + try: + if args.gather_scene: + args.gather_scene(args) + except AttributeError: + logger.debug("object has no attribute 'gather_scene' pass gather scene\n") + +def gather_scene_list(args): + try: + if args.gather_scene_list: + args.gather_scene_list(args) + except AttributeError: + logger.debug("object has no attribute 'gather_scene_list' pass gather scene list\n") + + def get_version(args): try: if args.version: @@ -128,6 +151,7 @@ def analyze_flt_trace(args): except AttributeError: logger.debug("object has no attribute 'analyze_flt_trace' pass analyze trace log\n") + def check(args): try: if args.check: @@ -136,6 +160,22 @@ def check(args): logger.debug("object has no attribute 'check' pass check\n") +def rca_run(args): + try: + if args.rca_run: + args.rca_run(args) + except AttributeError as e: + logger.debug("object has no attribute 'rca_run' pass rca run\n") + + +def rca_list(args): + try: + if args.rca_list: + args.rca_list(args) + except AttributeError as e: + logger.debug("object has no attribute 'rca_list' pass rca list\n") + + if __name__ == '__main__': obdiag = OBDIAGClient() arg_parser = ArgParser(obdiag) @@ -144,17 +184,22 @@ def check(args): get_obdiag_trace_log(obdiag_args) pharse_config(obdiag_args) telemetry.push_cmd_info(obdiag_args) + rca_list(obdiag_args) if obdiag.init(obdiag_args): telemetry.set_cluster_conn(obdiag.ob_cluster) gather_log(obdiag_args) gather_awr(obdiag_args) gather_sysstat(obdiag_args) gather_perf(obdiag_args) + gather_obstack(obdiag_args) gather_plan_monitor(obdiag_args) gather_clog(obdiag_args) gather_slog(obdiag_args) gather_obproxy_log(obdiag_args) + gather_scene(obdiag_args) + gather_scene_list(obdiag_args) analyze_log(obdiag_args) analyze_flt_trace(obdiag_args) check(obdiag_args) + rca_run(obdiag_args) telemetry.put_data() diff --git a/requirements3.txt b/requirements3.txt index e3ac4ba8..60a18449 100644 --- a/requirements3.txt +++ b/requirements3.txt @@ -28,4 +28,5 @@ docker==6.1.3 pwinput==1.0.3 pyinstaller>=4.3 colorama==0.4.6 -PyMySQL==1.0.2 \ No newline at end of file +PyMySQL==1.0.2 +sqlparse==0.4.4 \ No newline at end of file diff --git a/rpm/build.sh b/rpm/build.sh index e7699421..062b23e0 100755 --- a/rpm/build.sh +++ b/rpm/build.sh @@ -2,7 +2,7 @@ python_bin='python' W_DIR=`pwd` -VERSION=${VERSION:-'1.5.2'} +VERSION=${VERSION:-'1.6.0'} function python_version() diff --git a/rpm/oceanbase-diagnostic-tool.spec b/rpm/oceanbase-diagnostic-tool.spec index 51e18ec3..6e941cf3 100644 --- a/rpm/oceanbase-diagnostic-tool.spec +++ 
b/rpm/oceanbase-diagnostic-tool.spec @@ -1,5 +1,5 @@ Name: oceanbase-diagnostic-tool -Version:1.5.2 +Version:1.6.0 Release: %(echo $RELEASE)%{?dist} Summary: oceanbase diagnostic tool program Group: Development/Tools @@ -39,6 +39,7 @@ mkdir -p $BUILD_DIR/SOURCES ${RPM_BUILD_ROOT} mkdir -p $BUILD_DIR/SOURCES/site-packages mkdir -p $BUILD_DIR/SOURCES/resources mkdir -p $BUILD_DIR/SOURCES/handler/checker/tasks +mkdir -p $BUILD_DIR/SOURCES/gather/tasks mkdir -p $BUILD_DIR/SOURCES/dependencies/bin mkdir -p ${RPM_BUILD_ROOT}/usr/bin mkdir -p ${RPM_BUILD_ROOT}/usr/local/oceanbase-diagnostic-tool @@ -49,12 +50,14 @@ rm -f obdiag.py oceanbase-diagnostic-tool.spec \cp -rf $SRC_DIR/resources $BUILD_DIR/SOURCES/ \cp -rf $SRC_DIR/dependencies/bin $BUILD_DIR/SOURCES/dependencies \cp -rf $SRC_DIR/handler/checker/tasks $BUILD_DIR/SOURCES/tasks +\cp -rf $SRC_DIR/handler/gather/tasks $BUILD_DIR/SOURCES/gather \cp -rf $SRC_DIR/*check_package.yaml $BUILD_DIR/SOURCES/ \cp -rf $SRC_DIR/init.sh $BUILD_DIR/SOURCES/init.sh \cp -rf $SRC_DIR/init_obdiag_cmd.sh $BUILD_DIR/SOURCES/init_obdiag_cmd.sh \cp -rf $SRC_DIR/conf $BUILD_DIR/SOURCES/conf mkdir -p ${RPM_BUILD_ROOT}/usr/local/oceanbase-diagnostic-tool/lib/ mkdir -p ${RPM_BUILD_ROOT}/usr/local/oceanbase-diagnostic-tool/dependencies/bin +mkdir -p ${RPM_BUILD_ROOT}/usr/local/oceanbase-diagnostic-tool/gather \cp -rf $SRC_DIR/dist/obdiag ${RPM_BUILD_ROOT}/usr/local/oceanbase-diagnostic-tool/obdiag \cp -rf $BUILD_DIR/SOURCES/site-packages ${RPM_BUILD_ROOT}/usr/local/oceanbase-diagnostic-tool/lib/site-packages \cp -rf $BUILD_DIR/SOURCES/resources ${RPM_BUILD_ROOT}/usr/local/oceanbase-diagnostic-tool/resources @@ -65,6 +68,7 @@ mkdir -p ${RPM_BUILD_ROOT}/usr/local/oceanbase-diagnostic-tool/dependencies/bin \cp -rf $BUILD_DIR/SOURCES/init.sh ${RPM_BUILD_ROOT}/usr/local/oceanbase-diagnostic-tool/ \cp -rf $BUILD_DIR/SOURCES/init_obdiag_cmd.sh ${RPM_BUILD_ROOT}/usr/local/oceanbase-diagnostic-tool/ \cp -rf $BUILD_DIR/SOURCES/tasks ${RPM_BUILD_ROOT}/usr/local/oceanbase-diagnostic-tool/tasks +\cp -rf $BUILD_DIR/SOURCES/gather/tasks ${RPM_BUILD_ROOT}/usr/local/oceanbase-diagnostic-tool/gather %files diff --git a/telemetry/telemetry.py b/telemetry/telemetry.py index b0e8b585..4c8c9eb4 100644 --- a/telemetry/telemetry.py +++ b/telemetry/telemetry.py @@ -27,6 +27,7 @@ from io import open from common.constant import const from common.ob_connector import OBConnector +from utils.network_utils import network_connectivity from utils.time_utils import DateTimeEncoder from utils.version_utils import get_obdiag_version @@ -49,7 +50,7 @@ def set_cluster_conn(self, obcluster): if not self.work_tag: return if self.work_tag: - self.work_tag = check_observer() + self.work_tag = network_connectivity("https://" + const.TELEMETRY_URL + const.TELEMETRY_PATH) if not self.work_tag: return @@ -90,6 +91,7 @@ def get_cluster_info(self): data_one["svr_ip"] = ip_mix_by_sha256(data_one["svr_ip"]) self.cluster_info = json.dumps(data) + self.cluster_info["obversion"] = version except Exception as e: pass return @@ -177,18 +179,6 @@ def put_info_to_oceanbase(self): pass -def check_observer(): - try: - url = "https://" + const.TELEMETRY_URL + const.TELEMETRY_PATH - socket.setdefaulttimeout(3) - response = requests.get(url,timeout=(3)) - if response.status_code == 200: - return True - else: - return False - except Exception as e: - - return False key="********" diff --git a/utils/__init__.py b/utils/__init__.py index afb4c5ef..2b33595e 100644 --- a/utils/__init__.py +++ b/utils/__init__.py @@ -12,7 +12,6 @@ """ 
@time: 2022/6/20 -@author: jingshun @file: __init__.py @desc: """ \ No newline at end of file diff --git a/utils/file_utils.py b/utils/file_utils.py index e2563369..997c882a 100644 --- a/utils/file_utils.py +++ b/utils/file_utils.py @@ -200,4 +200,11 @@ def find_all_file(base): for f in fs: fullname = os.path.join(root, f) file_list.append(fullname) - return file_list \ No newline at end of file + return file_list + + +def write_data_append(output_file, results): + with open(output_file, 'a', encoding='utf-8') as f: + for row in results: + line_to_write = ','.join(str(item) for item in row) + f.write(line_to_write + '\n') \ No newline at end of file diff --git a/utils/network_utils.py b/utils/network_utils.py new file mode 100644 index 00000000..780fb8c7 --- /dev/null +++ b/utils/network_utils.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -* +# Copyright (c) 2022 OceanBase +# OceanBase Diagnostic Tool is licensed under Mulan PSL v2. +# You can use this software according to the terms and conditions of the Mulan PSL v2. +# You may obtain a copy of Mulan PSL v2 at: +# http://license.coscl.org.cn/MulanPSL2 +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, +# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, +# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +# See the Mulan PSL v2 for more details. + +""" +@time: 2023/12/26 +@file: version_utils.py +@desc: +""" +import socket + +import requests + +from common.constant import const + + +def network_connectivity(url=""): + try: + socket.setdefaulttimeout(3) + response = requests.get(url, timeout=(3)) + if response.status_code == 200: + return True + else: + return False + except Exception as e: + print(e) + return False + + diff --git a/utils/parser_utils.py b/utils/parser_utils.py index cdb89a9e..6822847f 100644 --- a/utils/parser_utils.py +++ b/utils/parser_utils.py @@ -26,6 +26,10 @@ def __init__(self, option_strings, dest, **kwargs): def __call__(self, parser, namespace, values, option_string=None): setattr(namespace, self.dest, " ".join(values)) +class ParserAction(object): + def add_attribute_to_namespace(args, attr_name, attr_value): + setattr(args, attr_name, attr_value) + return args class ArgParser(object): def __new__(cls, *args, **kwargs): @@ -197,6 +201,7 @@ def parse_argv(self, argv=None): gather_plan_monitor_arguments.set_defaults(gather_plan_monitor=self.client.handle_gather_plan_monitor) gather_plan_monitor_arguments.add_argument("--trace_id", metavar="trace_id", required=True, nargs=1, help=" sql trace id") + gather_plan_monitor_arguments.add_argument("--env", metavar="env", type=str, help='env, eg: "{env1=xxx, env2=xxx}"') # gather 子命令 clog gather_clog_arguments = subparsers_gather.add_parser( @@ -289,6 +294,33 @@ def parse_argv(self, argv=None): gather_all_arguments.add_argument("--grep", metavar="grep", nargs='+', help="specify keywords constrain for log") + + gather_scene = subparsers_gather.add_parser( + "scene", help="Gather scene info", + conflict_handler='resolve', + description="gather scene") + # gather scene list + subparsers_gather_scene = gather_scene.add_subparsers() + gather_scene_arguments = subparsers_gather_scene.add_parser( + "run", + help="Gather scene run", + parents=[parents_time_arguments, parents_common_arguments], + epilog="Example: obdiag gather scene run --scene=xxx", + conflict_handler='resolve', + description="gather scene run") + gather_scene_arguments.set_defaults(gather_scene=self.client.handle_gather_scene_command) + 
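+        # "run" requires an explicit --scene; --env optionally passes scene
+        # variables as a JSON-like string (see the help text below)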
gather_scene_arguments.add_argument("--scene", metavar="scene", nargs=1, required=True, help="specify scene") + gather_scene_arguments.add_argument("--env", metavar="env", type=str, help='env, eg: "{env1=xxx, env2=xxx}"') + + # gather scene list + gather_scene_list_arguments = subparsers_gather_scene.add_parser( + "list", + help="Gather scene list", + epilog="Example: obdiag gather scene list", + conflict_handler='resolve', + description="gather scene list") + gather_scene_list_arguments.set_defaults(gather_scene_list=self.client.handle_gather_scene_list_command) + # analyze parser_analyze = subparsers.add_parser("analyze", help="analyze logs and other information", ) subparsers_analyze = parser_analyze.add_subparsers() @@ -333,12 +365,11 @@ def parse_argv(self, argv=None): analyze_flt_trace_arguments.add_argument("--recursion", metavar="recursion", nargs=1, help="Maximum number of recursion") analyze_flt_trace_arguments.add_argument("--output", metavar="output", nargs=1, help="Print the result to the maximum output line on the screen") - # 定义巡检参数check arguments check_arguments = subparsers.add_parser("check", help="do check", - epilog="Example: ./obdiag check \n\n" - "Example: ./obdiag check --cases= system\n\n", + epilog="Example: obdiag check \n\n" + "Example: obdiag check --cases= system\n\n", conflict_handler='resolve', ) check_arguments.set_defaults(check=self.client.handle_check_command) check_arguments.add_argument("--cases", metavar="cases", nargs=1, @@ -348,6 +379,30 @@ def parse_argv(self, argv=None): check_arguments.add_argument("--report-path", metavar="report_path", nargs=1, help="report path", required=False) + check_arguments.add_argument("-c", metavar="config", help="obdiag custom config") + + + # 定义根因分析参数rca arguments + rca_arguments = subparsers.add_parser("rca", help="root cause analysis", + epilog="Example: obdiag rca run --scene=disconnection\n\n" + "Example: obdiag rca list", + conflict_handler='resolve', ) + subparsers_rca = rca_arguments.add_subparsers() + rca_list_arguments = subparsers_rca.add_parser( + "list", help="show list of rca list", + epilog="Example: obdiag rca list\n\n",) + rca_list_arguments.set_defaults(rca_list=self.client.handle_rca_list_command) + + rca_run_arguments = subparsers_rca.add_parser( + "run", help="Filter and analyze observer trace log", + epilog="Example: obdiag rca run --scene=disconnection\n\n", + conflict_handler='resolve', + description="According to the input parameters, rca run") + rca_run_arguments.set_defaults(rca_run=self.client.handle_rca_run_command) + rca_run_arguments.add_argument("--scene", metavar="scene", nargs=1,help="scene name. The argument is required.", required=True) + rca_run_arguments.add_argument("--parameters", metavar="parameters", nargs=1,help="Other parameters required for the scene, input in JSON format.",required=False) + rca_run_arguments.add_argument("--result-path", metavar="result_path", nargs=1,required=False) + rca_run_arguments.add_argument("-c", metavar="config", help="obdiag custom config") # parse args args = parser.parse_args(args=argv) diff --git a/utils/print_utils.py b/utils/print_utils.py new file mode 100644 index 00000000..19d40ac7 --- /dev/null +++ b/utils/print_utils.py @@ -0,0 +1,38 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -* +# Copyright (c) 2022 OceanBase +# OceanBase Diagnostic Tool is licensed under Mulan PSL v2. +# You can use this software according to the terms and conditions of the Mulan PSL v2. 
diff --git a/utils/print_utils.py b/utils/print_utils.py
new file mode 100644
index 00000000..19d40ac7
--- /dev/null
+++ b/utils/print_utils.py
@@ -0,0 +1,39 @@
+#!/usr/bin/env python
+# -*- coding: UTF-8 -*-
+# Copyright (c) 2022 OceanBase
+# OceanBase Diagnostic Tool is licensed under Mulan PSL v2.
+# You can use this software according to the terms and conditions of the Mulan PSL v2.
+# You may obtain a copy of Mulan PSL v2 at:
+# http://license.coscl.org.cn/MulanPSL2
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
+# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
+# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
+# See the Mulan PSL v2 for more details.
+
+"""
+@time: 2024/01/23
+@file: print_utils.py
+@desc:
+"""
+from colorama import Fore, Style
+from utils.utils import is_chinese
+
+def print_scene(scene_dict):
+    columns_to_print = ['command', 'info_en', 'info_cn']
+    keys = columns_to_print
+    table_data = [[value[key] for key in keys] for value in scene_dict.values()]
+    table_data.insert(0, keys)
+    # Count non-ASCII cells as double width so the columns stay aligned.
+    column_widths = [max(len(str(item)) * (2 if is_chinese(item) else 1) for item in column) for column in zip(*table_data)]
+    print_line(length=sum(column_widths) + 5)
+    for i in range(len(table_data)):
+        print(Fore.GREEN + " ".join(f"{item:<{width}}" for item, width in zip(table_data[i], column_widths)) + Style.RESET_ALL)
+        if i == 0:
+            print_line(length=sum(column_widths) + 5)
+    print_line(length=sum(column_widths) + 5)
+
+def print_line(char='-', length=50):
+    print(char * length)
+
+def print_title(name):
+    print("\n[{0}]:".format(name))
\ No newline at end of file
diff --git a/utils/shell_utils.py b/utils/shell_utils.py
index e5f37e55..0a66b56d 100644
--- a/utils/shell_utils.py
+++ b/utils/shell_utils.py
@@ -16,6 +16,7 @@
 @desc:
 """
 import sys
+import os
 
 import paramiko
 import time
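A minimal sketch of driving the new print helpers, assuming a scene dict shaped the way print_scene indexes it (command/info_en/info_cn); the sample scene is made up:

```python
# Editor's sketch -- the dict layout mirrors the columns print_scene expects.
from utils.print_utils import print_scene, print_title

demo_scenes = {
    "observer.restart": {
        "command": "obdiag gather scene run --scene=observer.restart",
        "info_en": "Gather restart-related info",
        "info_cn": "收集重启相关信息",
    },
}
print_title("scene list")
print_scene(demo_scenes)  # prints a green, width-aligned three-column table
```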
+ +""" +@time: 2024/01/17 +@file: sql_utils.py +@desc: +""" + +import sqlparse + +def extract_db_and_table(sql): + parsed_sql = sqlparse.parse(sql) + + db_tables_list = [] + + for statement in parsed_sql: + tokens = list(statement.tokens) + if statement.get_type() == 'SELECT': + from_index = next((i for i, token in enumerate(tokens) if + token.ttype == sqlparse.tokens.Keyword and token.value.lower() == 'from'), -1) + if from_index != -1: + after_from_tokens = tokens[from_index + 1:] + parse_db_table(after_from_tokens, db_tables_list) + + elif statement.get_type() == 'INSERT': + into_index = next((i for i, token in enumerate(tokens) if + token.ttype == sqlparse.tokens.Keyword and token.value.lower() == 'into'), -1) + if into_index != -1: + after_into_tokens = tokens[into_index + 1:] + parse_db_table(after_into_tokens, db_tables_list) + + return db_tables_list + + +def parse_db_table(tokens, db_tables_list): + for token in tokens: + if isinstance(token, sqlparse.sql.IdentifierList): + for sub_token in token.tokens: + parts = split_db_table(sub_token.value) + if len(parts) > 1: + db_tables_list.append(parts) + elif isinstance(token, sqlparse.sql.Identifier): + parts = split_db_table(token.value) + if len(parts) > 1: + db_tables_list.append(parts) + + +def split_db_table(table_name): + parts = table_name.replace('`', '').split('.') + return ('unknown' if len(parts) == 1 else parts[0], parts[-1]) \ No newline at end of file diff --git a/utils/string_utils.py b/utils/string_utils.py new file mode 100644 index 00000000..975a34db --- /dev/null +++ b/utils/string_utils.py @@ -0,0 +1,95 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -* +# Copyright (c) 2022 OceanBase +# OceanBase Diagnostic Tool is licensed under Mulan PSL v2. +# You can use this software according to the terms and conditions of the Mulan PSL v2. +# You may obtain a copy of Mulan PSL v2 at: +# http://license.coscl.org.cn/MulanPSL2 +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, +# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, +# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +# See the Mulan PSL v2 for more details. 
+ +""" +@time: 2024/01/17 +@file: string_utils.py +@desc: +""" + +import re + +def parse_mysql_cli_connection_string(cli_conn_str): + db_info = {} + # 处理密码选项,注意区分短选项和长选项的密码 + password_pattern = re.compile(r'(-p\s*|--password=)([^ ]*)') + password_match = password_pattern.search(cli_conn_str) + if password_match: + password = password_match.group(2) + db_info['password'] = password + # 去除密码部分,避免后续解析出错 + cli_conn_str = cli_conn_str[:password_match.start()] + cli_conn_str[password_match.end():] + + # 模式匹配短选项 + short_opt_pattern = re.compile(r'-(\w)\s*(\S*)') + matches = short_opt_pattern.finditer(cli_conn_str) + for match in matches: + opt = match.group(1) + value = match.group(2) + if opt == 'h': + db_info['host'] = value + elif opt == 'u': + db_info['user'] = value + elif opt == 'P': + db_info['port'] = int(value) + elif opt == 'D': + db_info['database'] = value + + # 模式匹配长选项 + long_opt_pattern = re.compile(r'--(\w+)=([^ ]+)') + long_matches = long_opt_pattern.finditer(cli_conn_str) + for match in long_matches: + opt = match.group(1) + value = match.group(2) + if opt == 'host': + db_info['host'] = value + elif opt == 'user': + db_info['user'] = value + elif opt == 'port': + db_info['port'] = int(value) + elif opt in ['dbname', 'database']: + db_info['database'] = value + + # 如果存在命令行最后的参数,且不是一个选项,则认为是数据库名 + last_param = cli_conn_str.split()[-1] + if last_param[0] != '-' and 'database' not in db_info: + db_info['database'] = last_param + return db_info + +def validate_db_info(db_info): + required_keys = {'database', 'host', 'user', 'port'} + if not required_keys.issubset(db_info.keys()) or any(not value for value in db_info.values()): + return False + if not isinstance(db_info['port'], int): + return False + for key, value in db_info.items(): + if key != 'port' and not isinstance(value, str): + return False + return True + +def parse_custom_env_string(env_string): + env_dict = {} + # 去除花括号 + inner_str = env_string[1:-1] + pairs = inner_str.split(',') + for pair in pairs: + key_value = pair.strip().split('=') + if len(key_value) == 2: + key, value = key_value + # 处理可能含有单引号或双引号的情况 + if value.startswith('"') and value.endswith('"'): + value = value[1:-1] + elif value.startswith("'") and value.endswith("'"): + value = value[1:-1] + env_dict[key.strip()] = value.strip() + + return env_dict \ No newline at end of file diff --git a/utils/utils.py b/utils/utils.py index 089a0be1..42ee2804 100644 --- a/utils/utils.py +++ b/utils/utils.py @@ -15,17 +15,12 @@ @file: utils.py @desc: """ -import datetime import decimal -import json -import os +import ast import re import sys import subprocess import socket - -from prettytable import PrettyTable - from common.logger import logger @@ -168,6 +163,18 @@ def replacer(match): return re.sub(r'#\{(\w+)\}', replacer, s) +def build_str_on_expr_by_dict_2(expr, variable_dict): + s = expr + d = variable_dict + + def replacer(match): + key = match.group(1) + value = str(d.get(key, match.group(0))) + return f"'{value}'" + + return re.sub(r'\$\{(\w+)\}', replacer, s) + + def display_trace(uuid): print("If you want to view detailed obdiag logs, please run:' obdiag display-trace --trace_id {0} '".format(uuid)) @@ -197,3 +204,17 @@ def split_ip(ip_str): pattern = r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' result = re.findall(pattern, ip_str) return result + + +def parse_env_string_to_dict(env_string): + env_string = env_string.replace(';', ',') + env_dict = ast.literal_eval(f"{{{env_string}}}") + return env_dict + +def is_chinese(s): + try: + s.encode('ascii') + except 
diff --git a/utils/yaml_utils.py b/utils/yaml_utils.py
index e520c3ff..5da34d1b 100644
--- a/utils/yaml_utils.py
+++ b/utils/yaml_utils.py
@@ -17,12 +17,25 @@
 """
 
 import oyaml as yaml
+import os
 
+def is_yaml_file(path):
+    # A YAML file must exist on disk and carry a .yaml/.yml suffix.
+    if not os.path.isfile(path):
+        return False
+    return path.endswith(('.yaml', '.yml'))
+
 def read_yaml_data(file_path):
-    with open(file_path, 'r') as f:
-        data = yaml.load(f, Loader=yaml.FullLoader)
-    return data
+    if not is_yaml_file(file_path):
+        raise Exception("Invalid YAML file path: {0}".format(file_path))
+    try:
+        with open(file_path, 'r') as f:
+            data = yaml.load(f, Loader=yaml.FullLoader)
+        return data
+    except yaml.YAMLError as exc:
+        raise Exception("Error loading YAML from file, error: {0}".format(exc))
 
 
 def write_yaml_data(data, file_path):

From 6089c1457bb76a0369c49c7902858ebeb977a1c9 Mon Sep 17 00:00:00 2001
From: Teingi
Date: Fri, 2 Feb 2024 16:00:23 +0800
Subject: [PATCH 2/2] doc

---
 docs/gather_obproxy_log.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/gather_obproxy_log.md b/docs/gather_obproxy_log.md
index 06548ea9..8a535837 100644
--- a/docs/gather_obproxy_log.md
+++ b/docs/gather_obproxy_log.md
@@ -53,9 +53,9 @@ Gather ObProxy Log Summary:
 +----------------+-----------+----------+------------------+--------+--------------------------------------------------------------------------+
 | Node           | Status    | Size     | Password         | Time   | PackPath                                                                 |
 +================+===========+==========+==================+========+==========================================================================+
-| 192.168.2.11   | Completed | 36.762M  | HYmVourcUyRNP8Om | 19 s   | gather_pack_20220701183246/obproxy_log_192.168.2.11_20220701183247.zip  |
+| 192.168.2.11   | Completed | 36.762M  | **************** | 19 s   | gather_pack_20220701183246/obproxy_log_192.168.2.11_20220701183247.zip  |
 +----------------+-----------+----------+------------------+--------+--------------------------------------------------------------------------+
-| 192.168.2.12   | Completed | 638.200M | 1RicMaiLUUNfemnj | 718 s  | gather_pack_20220701183246/obproxy_log_192.168.2.12_20220701183918.zip  |
+| 192.168.2.12   | Completed | 638.200M | **************** | 718 s  | gather_pack_20220701183246/obproxy_log_192.168.2.12_20220701183918.zip  |
 +----------------+-----------+----------+------------------+--------+--------------------------------------------------------------------------+
 ```
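Looping back to the yaml_utils hunk above, a minimal sketch of the guarded read path (the file name is hypothetical):

```python
from utils.yaml_utils import is_yaml_file, read_yaml_data

# read_yaml_data now raises on non-YAML paths instead of silently returning None.
if is_yaml_file("./demo_config.yml"):
    print(read_yaml_data("./demo_config.yml"))
```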