From be46c65e297d4ddf6ecee5c123821541fbc254d1 Mon Sep 17 00:00:00 2001 From: Rongfeng Fu Date: Fri, 19 Apr 2024 14:52:39 +0530 Subject: [PATCH] V2.8.0 (#194) --- _cmd.py | 26 + _errno.py | 8 +- _mirror.py | 103 +- const.py | 7 + core.py | 154 +- plugins/oceanbase/4.2.1.4/bootstrap.py | 278 +++ plugins/oceanbase/4.2.1.4/connect.py | 646 ++++++ plugins/oceanbase/4.2.1.4/destroy.py | 54 + plugins/oceanbase/4.2.1.4/init.py | 203 ++ plugins/oceanbase/4.2.1.4/parameter.yaml | 1941 +++++++++++++++++ plugins/oceanbase/4.2.1.4/restart.py | 314 +++ plugins/oceanbase/4.2.1.4/scale_out_check.py | 57 + plugins/oceanbase/4.2.1.4/start.py | 289 +++ plugins/oceanbase/4.2.1.4/start_check.py | 782 +++++++ plugins/oceanbase/4.2.1.4/stop.py | 162 ++ plugins/oceanbase/4.2.1.4/takeover.py | 275 +++ plugins/oceanbase/4.2.1.4/upgrade.py | 621 ++++++ plugins/oceanbase/4.2.2.0/bootstrap.py | 6 +- plugins/oceanbase/4.2.2.0/scale_out_check.py | 6 + plugins/oceanbase/4.2.2.0/upgrade.py | 2 +- plugins/ocp-express/1.0.1/generate_config.py | 3 + profile/obd.sh | 2 +- rpm/ob-deploy.spec | 46 + ssh.py | 12 +- .../MetaDBConfig/DataBaseNodeConfig.tsx | 2 +- .../component/OCPConfigNew/ServiceConfig.tsx | 3 +- web/src/pages/Obdeploy/NodeConfig.tsx | 13 +- web/src/utils/index.tsx | 137 +- 28 files changed, 5959 insertions(+), 193 deletions(-) create mode 100644 plugins/oceanbase/4.2.1.4/bootstrap.py create mode 100644 plugins/oceanbase/4.2.1.4/connect.py create mode 100644 plugins/oceanbase/4.2.1.4/destroy.py create mode 100644 plugins/oceanbase/4.2.1.4/init.py create mode 100644 plugins/oceanbase/4.2.1.4/parameter.yaml create mode 100644 plugins/oceanbase/4.2.1.4/restart.py create mode 100644 plugins/oceanbase/4.2.1.4/scale_out_check.py create mode 100644 plugins/oceanbase/4.2.1.4/start.py create mode 100644 plugins/oceanbase/4.2.1.4/start_check.py create mode 100644 plugins/oceanbase/4.2.1.4/stop.py create mode 100644 plugins/oceanbase/4.2.1.4/takeover.py create mode 100644 plugins/oceanbase/4.2.1.4/upgrade.py diff --git a/_cmd.py b/_cmd.py index bcff2bc..5949ec3 100644 --- a/_cmd.py +++ b/_cmd.py @@ -134,6 +134,8 @@ def _process_short_opts(self, rargs, values): else: raise e + def print_usage(self, file=None): + print(self.format_help(OptionHelpFormatter()), file=file) class BaseCommand(object): @@ -721,6 +723,29 @@ def _do_command(self, obd): return self._show_help() +class ClusterTakeoverCommand(ClusterMirrorCommand): + + def __init__(self): + super(ClusterTakeoverCommand, self).__init__('takeover', 'Takeover oceanbase cluster') + self.parser.remove_option('-h') + self.parser.add_option('--help', action='callback', callback=self._show_help, help='Show help and exit.') + self.parser.add_option('-h', '--host', type='string', help="db connection host, default: 127.0.0.1", default='127.0.0.1') + self.parser.add_option('-P', '--mysql-port', type='int', help="mysql port, default: 2881", default=2881) + self.parser.add_option('-p', '--root-password', type='string', help="password of root@sys user, default: ''", default='') + self.parser.add_option('--ssh-user', type='string', help="ssh user, default: current user") + self.parser.add_option('--ssh-password', type='string', help="ssh password, default: ''", default='') + self.parser.add_option('--ssh-port', type='int', help="ssh port, default: 22") + self.parser.add_option('-t', '--ssh-timeout', type='int', help="ssh connection timeout (second), default: 30") + self.parser.add_option('--ssh-key-file', type='string', help="ssh key file") + + + def _do_command(self, obd): + if 
self.cmds: + return obd.takeover(self.cmds[0]) + else: + return self._show_help() + + class DemoCommand(ClusterMirrorCommand): def __init__(self): @@ -1248,6 +1273,7 @@ def __init__(self): self.register_command(ClusterTenantCommand()) self.register_command(ClusterScaleoutCommand()) self.register_command(ClusterComponentMajorCommand()) + self.register_command(ClusterTakeoverCommand()) class TestMirrorCommand(ObdCommand): diff --git a/_errno.py b/_errno.py index 8ebe55d..856f529 100644 --- a/_errno.py +++ b/_errno.py @@ -213,7 +213,7 @@ class InitDirFailedErrorMessage(object): EC_OCP_SERVER_JAVA_VERSION_ERROR = OBDErrorCodeTemplate(4359, "{server}: ocp-server need java with version {version} and update release must greater than 161") EC_OCP_SERVER_JAVA_NOT_FOUND = OBDErrorCodeTemplate(4359, "{server}: failed to query java version, you may not have java installed") EC_OCP_SERVER_CLOCKDIFF_NOT_EXISTS = OBDErrorCodeTemplate(4360, "{server}: clockdiff not exists. Please install clockdiff manually") -EC_OCP_SERVER_TENANT_ALREADY_EXISTS = OBDErrorCodeTemplate(4361, "tenant({tenant_name}) alread exist") +EC_OCP_SERVER_TENANT_ALREADY_EXISTS = OBDErrorCodeTemplate(4361, "tenant({tenant_name}) already exist") EC_OCP_SERVER_DIR_ACCESS_FORBIDE = OBDErrorCodeTemplate(4362, "{server}:{path} access failed for current user, {server}:{cur_path} access succeed, please run `chmod -R 755 {cur_path}` ") EC_OCP_SERVER_DEPENDS_COMP_VERSION = OBDErrorCodeTemplate(4363, 'OCP server {ocp_server_version} needs to use {comp} with version {comp_version} or above') EC_OCP_SERVER_NOT_ENOUGH_MEMORY_AVAILABLE = OBDErrorCodeTemplate(4364, '({ip}) not enough memory. (Available: {available}, Need: {need})') @@ -247,7 +247,7 @@ class InitDirFailedErrorMessage(object): EC_OBDIAG_NOT_FOUND = OBDErrorCodeTemplate(6000, 'Failed to executable obdiag command, you may not have obdiag installed') EC_OBDIAG_NOT_CONTAIN_DEPEND_COMPONENT = OBDErrorCodeTemplate(6001, 'obdiag must contain depend components {components}') EC_OBDIAG_OPTIONS_FORMAT_ERROR = OBDErrorCodeTemplate(6002, 'obdiag options {option} format error, please check the value : {value}') -EC_OBDIAG_FUCYION_FAILED = OBDErrorCodeTemplate(6003, 'Failed to excute obdiag function {fuction}') +EC_OBDIAG_FUCYION_FAILED = OBDErrorCodeTemplate(6003, 'Failed to execute obdiag function {fuction}') # Unexpected exceptions code EC_UNEXPECTED_EXCEPTION = OBDErrorCodeTemplate(9999, 'Unexpected exception: need to be posted on "https://ask.oceanbase.com", and we will help you resolve them.') @@ -276,7 +276,7 @@ class InitDirFailedErrorMessage(object): SUG_CONNECT_EXCEPT = OBDErrorSuggestionTemplate('Connection exception or unsupported OS. 
Please retry or contact us.') SUG_UNSUPPORT_OS = OBDErrorSuggestionTemplate('It may be an unsupported OS, please contact us for assistance') SUG_OBSERVER_SYS_MEM_TOO_LARGE = OBDErrorSuggestionTemplate('`system_memory` should be less than {factor} * memory_limit/memory_limit_percentage.', fix_eval=[FixEval(FixEval.DEL, 'system_memory')]) -SUG_OBSERVER_NOT_ENOUGH_MEMORY_ALAILABLE = OBDErrorSuggestionTemplate('Please execute `echo 1 > /proc/sys/vm/drop_caches` as root in {ip} to rlease cached.') +SUG_OBSERVER_NOT_ENOUGH_MEMORY_ALAILABLE = OBDErrorSuggestionTemplate('Please execute `echo 1 > /proc/sys/vm/drop_caches` as root in {ip} to release cached.') SUG_OBSERVER_REDUCE_MEM = OBDErrorSuggestionTemplate('Please reduce the `memory_limit` or `memory_limit_percentage`', fix_eval=[FixEval(FixEval.DEL, 'memory_limit'), FixEval(FixEval.DEL, 'system_memory'), FixEval(FixEval.DEL, 'memory_limit_percentage')]) SUG_OBSERVER_SAME_DISK = OBDErrorSuggestionTemplate('Configure `redo_dir` and `data_dir` to different disks') SUG_OBSERVER_NOT_ENOUGH_DISK = OBDErrorSuggestionTemplate('Please reduce the `datafile_size` or `datafile_disk_percentage`', fix_eval=[FixEval(FixEval.DEL, 'datafile_size'), FixEval(FixEval.DEL, 'datafile_disk_percentage')]) @@ -284,7 +284,7 @@ class InitDirFailedErrorMessage(object): SUG_OBSERVER_NOT_ENOUGH_DISK_4_CLOG = OBDErrorSuggestionTemplate('Please increase the `clog_disk_utilization_threshold` and `clog_disk_usage_limit_percentage`', fix_eval=[FixEval(FixEval.DEL, 'clog_disk_utilization_threshold'), FixEval(FixEval.DEL, 'clog_disk_usage_limit_percentage')]) SUG_OBSERVER_TIME_OUT_OF_SYNC = OBDErrorSuggestionTemplate('Please enable clock synchronization service') SUG_OCP_EXPRESS_INSTALL_JAVA_WITH_VERSION = OBDErrorSuggestionTemplate('Please install java with version {version}. 
If java is already installed, please set `java_bin` to the expected java binary path') -SUG_OCP_EXPRESS_NOT_ENOUGH_MEMORY_AVALIABLE = OBDErrorSuggestionTemplate('Please execute `echo 1 > /proc/sys/vm/drop_caches` as root in {ip} to rlease cached.') +SUG_OCP_EXPRESS_NOT_ENOUGH_MEMORY_AVALIABLE = OBDErrorSuggestionTemplate('Please execute `echo 1 > /proc/sys/vm/drop_caches` as root in {ip} to release cached.') SUG_OCP_EXPRESS_REDUCE_MEM = OBDErrorSuggestionTemplate('Please reduce the `memory_size`', fix_eval=[FixEval(FixEval.DEL, 'memory_size')]) SUG_OCP_EXPRESS_REDUCE_DISK = OBDErrorSuggestionTemplate('Please reduce the `logging_file_total_size_cap`', fix_eval=[FixEval(FixEval.DEL, 'logging_file_total_size_cap')]) SUG_OCP_EXPRESS_COMP_VERSION = OBDErrorSuggestionTemplate('Please use {comp} with version {version} or above') diff --git a/_mirror.py b/_mirror.py index ce155ea..53260d8 100644 --- a/_mirror.py +++ b/_mirror.py @@ -48,38 +48,55 @@ _ARCH = getArchList() -_RELEASE = None -SUP_MAP = { - 'ubuntu': {'16': 7}, - 'debian': {'9': 7}, - 'opensuse-leap': {'15': 7}, - 'sles': {'15.2': 7}, - 'fedora': {'33': 7}, - 'uos': {'20': 8}, - 'anolis': {'23': 7}, - 'openEuler': {'22.03': 7}, - 'kylin': {'V10': 8}, - 'alinux': {'2': 7, '3': 8} -} -_SERVER_VARS = { - 'basearch': getBaseArch(), -} -with FileUtil.open('/etc/os-release') as f: - for line in f.readlines(): - line = line.strip() - if not line: - continue - try: - k, v = line.split('=', 1) - _SERVER_VARS[k] = v.strip('"').strip("'") - except: - pass - if 'VERSION_ID' in _SERVER_VARS: - m = re.match('\d+', _SERVER_VARS['VERSION_ID']) - if m: - _RELEASE = m.group(0) -_SERVER_VARS['releasever'] = _RELEASE - +_NO_LSE = 'amd64' in _ARCH and LocalClient.execute_command("grep atomics /proc/cpuinfo").stdout.strip() == '' + +def get_use_centos_release(stdio=None): + _RELEASE = None + SUP_MAP = { + 'ubuntu': {'16': 7}, + 'debian': {'9': 7}, + 'opensuse-leap': {'15': 7}, + 'sles': {'15.2': 7}, + 'fedora': {'33': 7}, + 'uos': {'20': 8}, + 'anolis': {'23': 7}, + 'openEuler': {'22.03': 7}, + 'kylin': {'V10': 8}, + 'alinux': {'2': 7, '3': 8} + } + _SERVER_VARS = { + 'basearch': getBaseArch(), + } + with FileUtil.open('/etc/os-release') as f: + for line in f.readlines(): + line = line.strip() + if not line: + continue + try: + k, v = line.split('=', 1) + _SERVER_VARS[k] = v.strip('"').strip("'") + except: + pass + if 'VERSION_ID' in _SERVER_VARS: + m = re.match('\d+', _SERVER_VARS['VERSION_ID']) + if m: + _RELEASE = m.group(0) + _SERVER_VARS['releasever'] = _RELEASE + + server_vars = deepcopy(_SERVER_VARS) + linux_id = server_vars.get('ID') + if linux_id in SUP_MAP: + version_id = server_vars.get('VERSION_ID', '') + sorted_versions = sorted([Version(key) for key in SUP_MAP[linux_id]], reverse=True) + for version in sorted_versions: + if Version(version_id) >= version: + server_vars['releasever'] = SUP_MAP[linux_id][str(version)] + break + else: + server_vars['releasever'] = SUP_MAP[linux_id][str(version)] + stdio and getattr(stdio, 'warn', print)('Use centos %s remote mirror repository for %s %s' % (server_vars['releasever'], linux_id, server_vars.get('VERSION_ID'))) + use_release = server_vars.get('releasever') + return use_release, server_vars class MirrorRepositoryType(Enum): @@ -567,10 +584,14 @@ def match_score(self, info, name, arch, version=None, min_version=None, max_vers if max_version and Version(info_version) > Version(max_version): return [0 ,] if release and info.release != release: - raise Exception ('break') return [0 ,] - c = [len(name) / 
len(info.name), info] + if _NO_LSE: + lse_score = 'nonlse' in info.release + else: + lse_score = True + + c = [len(name) / len(info.name), lse_score, info] return c @staticmethod @@ -979,22 +1000,10 @@ def _get_section(self, section_name): def get_remote_mirrors(self, is_enabled=True): self._lock() mirrors = [] - server_vars = deepcopy(_SERVER_VARS) - linux_id = server_vars.get('ID') - if linux_id in SUP_MAP: - version_id = server_vars.get('VERSION_ID', '') - sorted_versions = sorted([Version(key) for key in SUP_MAP[linux_id]], reverse=True) - for version in sorted_versions: - if Version(version_id) >= version: - server_vars['releasever'] = SUP_MAP[linux_id][str(version)] - break - else: - server_vars['releasever'] = SUP_MAP[linux_id][str(version)] - self.stdio and getattr(self.stdio, 'warn', print)('Use centos %s remote mirror repository for %s %s' % ( - server_vars['releasever'], linux_id, server_vars.get('VERSION_ID'))) for mirror_section in self._get_sections(): if is_enabled is not None and is_enabled != mirror_section.is_enabled: continue + _, server_vars = get_use_centos_release(self.stdio) mirrors.append(mirror_section.get_mirror(server_vars, self.stdio)) return mirrors diff --git a/const.py b/const.py index b89c722..a4c3944 100644 --- a/const.py +++ b/const.py @@ -51,11 +51,18 @@ COMP_OCEANBASE_DIAGNOSTIC_TOOL = "oceanbase-diagnostic-tool" COMP_OBDIAG = "obdiag" COMP_JRE = 'openjdk-jre' + +# ocp COMP_OCP_EXPRESS = 'ocp-express' COMP_OCP_SERVER = 'ocp-server' COMP_OCP_SERVER_CE = 'ocp-server-ce' COMPS_OCP = [COMP_OCP_SERVER, COMP_OCP_SERVER_CE] +# ob +COMP_OB = "oceanbase" +COMP_OB_CE = "oceanbase-ce" +COMPS_OB = [COMP_OB, COMP_OB_CE] + # service docs url DISABLE_SWAGGER = '' diff --git a/core.py b/core.py index df1510b..a75549c 100644 --- a/core.py +++ b/core.py @@ -1000,7 +1000,7 @@ def cluster_status_check(self, repositories, ret_status=None): self._call_stdio('stop_loading', 'succeed') return status - def search_components_from_mirrors_and_install(self, deploy_config, components=None): + def search_components_from_mirrors_and_install(self, deploy_config, components=None, raise_exception=True): # Check the best suitable mirror for the components errors = [] self._call_stdio('verbose', 'Search best suitable repository') @@ -1013,7 +1013,7 @@ def search_components_from_mirrors_and_install(self, deploy_config, components=N if not errors: pkgs, repositories, errors = self.search_components_from_mirrors(deploy_config, only_info=False, components=components) if errors: - self._call_stdio('error', '\n'.join(errors)) + raise_exception and self._call_stdio('error', '\n'.join(errors)) return repositories, None # Get the installation plugins. 
Install locally @@ -3511,10 +3511,10 @@ def upgrade_cluster(self, name): return True - def create_repository(self): + def create_repository(self, options=None): force = getattr(self.options, 'force', False) necessary = ['name', 'version', 'path'] - attrs = self.options.__dict__ + attrs = self.options.__dict__ if options is None else options success = True for key in necessary: if key not in attrs or not attrs[key]: @@ -5157,4 +5157,148 @@ def update_tool(self, tool_name, force=False, version=None, install_prefix=None) install_path = os.path.abspath(os.path.join(install_prefix, tool_name)) if not self._update_tool(tool, version, force, install_path): return False - return True \ No newline at end of file + return True + + def takeover(self, name): + host = getattr(self.options, 'host') + mysql_port = getattr(self.options, 'mysql_port') + root_password = getattr(self.options, 'root_password') + ssh_user = getattr(self.options, 'ssh_user') + ssh_password = getattr(self.options, 'ssh_password') + ssh_key_file = getattr(self.options, 'ssh_key_file') + ssh_port = getattr(self.options, 'ssh_port') + ssh_timeout = getattr(self.options, 'ssh_timeout') + + self._call_stdio('verbose', 'Get Deploy by name') + deploy = self.deploy_manager.get_deploy_config(name) + if deploy: + deploy_info = deploy.deploy_info + if deploy_info.status not in [DeployStatus.STATUS_CONFIGURED, DeployStatus.STATUS_DESTROYED]: + self._call_stdio('error', 'The deployment {} has exited. Please modify the deploy name and take over again.'.format(name)) + return False + + self._call_stdio('verbose', 'get plugins by mocking an oceanbase repository.') + # search and get all related plugins using a mock ocp repository + mock_oceanbase_ce_repository = Repository("oceanbase-ce", "/") + mock_oceanbase_ce_repository.version = "3.1.0" + configs = OrderedDict() + component_name = 'oceanbase-ce' + global_config = {} + configs[component_name] = { + 'servers': [host], + 'global': global_config + } + + user = dict() + if ssh_user: + user['username'] = ssh_user + if ssh_password: + user['password'] = ssh_password + if ssh_key_file: + user['key_file'] = ssh_key_file + if ssh_port: + user['port'] = ssh_port + if ssh_timeout: + user['timeout'] = ssh_timeout + if user: + configs['user'] = user + + global_config['mysql_port'] = mysql_port + global_config['root_password'] = root_password + with tempfile.NamedTemporaryFile(suffix=".yaml", mode='w') as tf: + yaml_loader = YamlLoader() + yaml_loader.dump(configs, tf) + deploy_config = DeployConfig( + tf.name, yaml_loader=YamlLoader(self.stdio), + config_parser_manager=self.deploy_manager.config_parser_manager, + inner_config=None, + stdio=self.stdio + ) + deploy_config.allow_include_error() + connect_plugin = self.plugin_manager.get_best_py_script_plugin('connect', mock_oceanbase_ce_repository.name, mock_oceanbase_ce_repository.version) + ssh_clients = self.get_clients(deploy_config, [mock_oceanbase_ce_repository]) + ret = self.call_plugin(connect_plugin, mock_oceanbase_ce_repository, cluster_config=deploy_config.components[component_name], clients=ssh_clients, stdio=self.stdio) + if not ret or not ret.get_return('connect'): + self._call_stdio('error', 'Failed to connect to OceanBase, Please check the database connection information.') + return False + cursor = ret.get_return('cursor') + ret = cursor.fetchone('select version() as version', raise_exception=True) + if ret is False: + return False + version = ret.get("version").split("-v")[-1] + mock_oceanbase_ce_repository.version = version + 
takeover_plugins = self.search_py_script_plugin([mock_oceanbase_ce_repository], "takeover") + if not takeover_plugins: + self._call_stdio('error', 'The current OceanBase version:%s does not support takeover, takeover plugin not found.' % version) + return False + # do take over cluster by call takeover precheck plugins + prepare_ret = self.call_plugin(takeover_plugins[mock_oceanbase_ce_repository], mock_oceanbase_ce_repository, + cursor=cursor, + user_config=configs.get('user', None), + name=name, + clients=ssh_clients, + obd_home=self.home_path, + stdio=self.stdio) + if not prepare_ret: + return False + try: + self.deploy = self.deploy_manager.get_deploy_config(name) + deploy_config = self.deploy.deploy_config + cluster_config = deploy_config.components[component_name] + version = cluster_config.version + release = cluster_config.release + repositories, _ = self.search_components_from_mirrors_and_install(deploy_config, raise_exception=False) + repository = repositories[0] if repositories else None + if not repository: + self._call_stdio('verbose', 'Cannot find the image of oceanbase-ce version: %s, release: %s" ' % (version, release)) + ssh_clients = self.get_clients(deploy_config, [mock_oceanbase_ce_repository]) + tmp_dir = '{}/tmp_takeover'.format(self.deploy.config_dir) + for server in cluster_config.servers: + ssh_client = ssh_clients[server] + server_config = cluster_config.get_server_conf(server) + home_path = server_config['home_path'] + plugin = self.plugin_manager.get_best_plugin(PluginType.INSTALL, component_name, version) + if not plugin: + self._call_stdio('error', 'Cannot find the plugin for {}'.format(component_name)) + return False + LocalClient.execute_command('rm -rf {}'.format(tmp_dir)) + for file_map in plugin.file_map_data: + if file_map['type'] == 'bin': + self._call_stdio('start_loading', 'Get %s from %s' % (home_path, file_map['target_path'])) + ret = ssh_client.get_file('{}/{}'.format(tmp_dir, file_map['target_path']), '{}/{}'.format(home_path, file_map['target_path']), stdio=self.stdio) + self._call_stdio('stop_loading', 'succeed') + elif file_map['type'] == 'dir': + ret = ssh_client.get_dir('{}/{}'.format(tmp_dir, file_map['target_path']), '{}/{}'.format(home_path, file_map['target_path']), stdio=self.stdio) + if not ret: + self._call_stdio('error', 'Cannot get the bin file from server: %s' % server) + break + + # create mirror by bin file + self._call_stdio('start_loading', 'Create mirror') + options = dict() + options['name'] = component_name + options['version'] = version + options['path'] = tmp_dir + options['force'] = True + setattr(self.options, 'release', release) + setattr(self.options, 'force', True) + if not self.create_repository(options): + self._call_stdio('error', 'Failed to create mirror') + return False + LocalClient.execute_command('rm -rf {}'.format(tmp_dir)) + self._call_stdio('stop_loading', 'succeed') + repository = self.repository_manager.get_repository(component_name, version, release=release) + + self.repositories = [repository] + self.deploy.deploy_info.components['oceanbase-ce']['md5'] = repository.md5 + self.deploy.deploy_info.status = DeployStatus.STATUS_RUNNING + self.deploy.dump_deploy_info() + display_plugins = self.search_py_script_plugin([repository], 'display') + if not self.call_plugin(display_plugins[repository], repository): + return False + return True + except: + self.deploy_manager.remove_deploy_config(name) + self._call_stdio('stop_loading', 'failed') + self._call_stdio('error', 'Failed to takeover OceanBase cluster' ) + 
return False \ No newline at end of file diff --git a/plugins/oceanbase/4.2.1.4/bootstrap.py b/plugins/oceanbase/4.2.1.4/bootstrap.py new file mode 100644 index 0000000..9ff8806 --- /dev/null +++ b/plugins/oceanbase/4.2.1.4/bootstrap.py @@ -0,0 +1,278 @@ +# coding: utf-8 +# OceanBase Deploy. +# Copyright (C) 2021 OceanBase +# +# This file is part of OceanBase Deploy. +# +# OceanBase Deploy is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# OceanBase Deploy is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with OceanBase Deploy. If not, see . + + +from __future__ import absolute_import, division, print_function + +import time +from copy import deepcopy +from optparse import Values + +from _deploy import InnerConfigItem +from tool import ConfigUtil + + + +def is_bootstrap(cursor): + sql = "select column_value from oceanbase.__all_core_table where table_name = '__all_global_stat' and column_name = 'baseline_schema_version'" + ret = cursor.fetchone(sql, raise_exception=False, exc_level='verbose') + if ret is False: + return False + return int(ret.get("column_value")) > 0 + + +def bootstrap(plugin_context, need_bootstrap=True, *args, **kwargs): + cluster_config = plugin_context.cluster_config + stdio = plugin_context.stdio + clients = plugin_context.clients + cursor = plugin_context.get_return('connect').get_return('cursor') + ocs_cursor = plugin_context.get_return('connect').get_return('ocs_cursor') + added_components = [] if cluster_config.added_servers else cluster_config.get_deploy_added_components() + be_depend = cluster_config.be_depends + global_conf = cluster_config.get_global_conf() + bootstrap = [] + floor_servers = {} + zones_config = {} + inner_config = { + InnerConfigItem('$_zone_idc'): 'idc' + } + + if added_components: + stdio.verbose('bootstrap for components: %s' % added_components) + + raise_cursor = cursor.raise_cursor + if cluster_config.name in added_components and need_bootstrap: + for server in cluster_config.servers: + server_config = cluster_config.get_server_conf(server) + zone = server_config['zone'] + if zone in floor_servers: + floor_servers[zone].append('%s:%s' % (server.ip, server_config['rpc_port'])) + else: + floor_servers[zone] = [] + zones_config[zone] = {} + bootstrap.append('REGION "sys_region" ZONE "%s" SERVER "%s:%s"' % (server_config['zone'], server.ip, server_config['rpc_port'])) + + zone_config = zones_config[zone] + for key in server_config: + if not isinstance(key, InnerConfigItem): + continue + if key not in inner_config: + continue + if key in zone_config: + continue + zone_config[key] = server_config[key] + try: + sql = 'set session ob_query_timeout=1000000000' + stdio.verbose('execute sql: %s' % sql) + raise_cursor.execute(sql) + sql = 'alter system bootstrap %s' % (','.join(bootstrap)) + stdio.start_loading('Cluster bootstrap') + raise_cursor.execute(sql, exc_level='verbose') + for zone in floor_servers: + for addr in floor_servers[zone]: + sql = 'alter system add server "%s" zone "%s"' % (addr, zone) + raise_cursor.execute(sql) + + if global_conf.get('root_password') is not None: + sql = 'alter user 
"root" IDENTIFIED BY %s' + raise_cursor.execute(sql, [global_conf.get('root_password')]) + for zone in zones_config: + zone_config = zones_config[zone] + for key in zone_config: + sql = 'alter system modify zone %s set %s = %%s' % (zone, inner_config[key]) + raise_cursor.execute(sql, [zone_config[key]]) + stdio.stop_loading('succeed') + except: + if not is_bootstrap(cursor): + stdio.stop_loading('fail') + return plugin_context.return_false() + stdio.stop_loading('succeed') + + # wait for server online + while True: + servers = cursor.fetchall('select * from oceanbase.__all_server', raise_exception=False, exc_level='verbose') + if servers and all([s.get('status') for s in servers]): + break + else: + time.sleep(1) + + need_takeover_servers = [] + for server in cluster_config.servers: + client = clients[server] + server_config = cluster_config.get_server_conf(server) + home_path = server_config['home_path'] + ret = client.execute_command('ls %s/.meta' % server_config['home_path']) + stdio.verbose(ret.stdout) + stdio.verbose(ret.stderr) + + obshell_pid_path = '%s/run/obshell.pid' % home_path + obshell_pid = client.execute_command('cat %s' % obshell_pid_path).stdout.strip() + if obshell_pid and client.execute_command('ls /proc/%s' % obshell_pid): + stdio.verbose('%s obshell[pid: %s] started', server, obshell_pid) + else: + need_takeover_servers.append(server) + ret = client.execute_command('strings %s/etc/observer.conf.bin' % home_path) + stdio.verbose(ret.stdout) + stdio.verbose(ret.stderr) + + if need_takeover_servers: + stdio.start_loading('obshell taking over') + for server in need_takeover_servers: + # obshell admin start + client = clients[server] + server_config = cluster_config.get_server_conf(server) + home_path = server_config['home_path'] + obshell_pid_path = '%s/run/obshell.pid' % home_path + obshell_pid = client.execute_command('cat %s' % obshell_pid_path).stdout.strip() + if obshell_pid and client.execute_command('ls /proc/%s' % obshell_pid): + stdio.verbose('%s obshell[pid: %s] started', server, obshell_pid) + else: + # start obshell + server_config = cluster_config.get_server_conf(server) + password = server_config.get('root_password', '') + client.add_env('OB_ROOT_PASSWORD', password if client._is_local else ConfigUtil.passwd_format(password)) + cmd = 'cd %s; %s/bin/obshell admin start --ip %s --port %s'%(server_config['home_path'],server_config['home_path'], server.ip, server_config['obshell_port']) + stdio.verbose('start obshell: %s' % cmd) + if not client.execute_command(cmd): + stdio.stop_loading('fail') + stdio.error('%s obshell failed', server) + return + + time.sleep(3) + retry_times = 600 + count = 0 + while count < retry_times: + success = True + for server in cluster_config.servers: + # get status + status = ocs_cursor[server].status_request() + if not status: + stdio.verbose('get status failed, count: %d' % count) + success = False + break + if status.state == status.State.STATE_CONNECTION_AVAILABLE.value: + stdio.verbose('get status success') + if not success: + success = False + count += 1 + time.sleep(1) + else: + break + if not success: + stdio.stop_loading('fail') + stdio.error('obshell is not ready') + return plugin_context.return_false() + + # find take over dag, maybe already finished + while count < retry_times: + success = False + for server in cluster_config.servers: + dag = ocs_cursor[server].get_agent_last_maintenance_dag_request() + if dag and dag.is_take_over_or_rebuild(): # find the dag + # wait util dag finished + result = 
ocs_cursor[server].query_dag_util_finish(dag) + # TODO: 如果请求失败了,是否还需要重试呢 + if result and result.is_succeed() and result.is_run(): + # take over succeed + stdio.verbose('obshell take over succeed') + success = True + break + else: + count = retry_times + success = False + if not success: + stdio.verbose('find take over dag failed, count: %d' % count) + success = False + count += 1 + time.sleep(1) + else: + break + + if not success: + stdio.stop_loading('fail') + stdio.error('obshell take over failed') + # TODO: 如果任务出错,提示用户是否需要重试(TAKE OVER 不可以回滚) + return plugin_context.return_false() + stdio.stop_loading('succeed') + + has_obproxy = False + for component_name in ['obproxy', 'obproxy-ce']: + if component_name in added_components and component_name in be_depend: + has_obproxy = True + break + if has_obproxy or 'proxyro_password' in global_conf: + value = global_conf['proxyro_password'] if global_conf.get('proxyro_password') is not None else '' + sql = 'create user if not exists "proxyro" IDENTIFIED BY %s' + raise_cursor.execute(sql, [value]) + sql = 'grant select on oceanbase.* to proxyro IDENTIFIED BY %s' + raise_cursor.execute(sql, [value]) + + has_oblogproxy = "oblogproxy" in added_components and "oblogproxy" in be_depend + if has_oblogproxy or 'cdcro_password' in global_conf: + value = global_conf['cdcro_password'] if global_conf.get('cdcro_password') is not None else '' + sql = 'create user "cdcro" IDENTIFIED BY %s' + raise_cursor.execute(sql, [value]) + sql = 'grant select on oceanbase.* to cdcro IDENTIFIED BY %s' + raise_cursor.execute(sql, [value]) + + has_obagent = "obagent" in added_components and "obagent" in be_depend + if has_obagent or 'ocp_agent_monitor_password' in global_conf: + value = global_conf['ocp_agent_monitor_password'] if global_conf.get('ocp_agent_monitor_password') is not None else '' + sql = 'create user if not exists "ocp_monitor" IDENTIFIED BY %s' + stdio.verbose(sql) + raise_cursor.execute(sql, [value]) + sql = 'grant select on oceanbase.* to ocp_monitor IDENTIFIED BY %s' + stdio.verbose(sql) + raise_cursor.execute(sql, [value]) + + # check the requirements of ocp meta and monitor tenant + global_conf_with_default = deepcopy(cluster_config.get_global_conf_with_default()) + original_global_conf = cluster_config.get_original_global_conf() + + ocp_tenants = [] + tenants_componets_map = { + "meta": ["ocp-express", "ocp-server", "ocp-server-ce"], + "monitor": ["ocp-server", "ocp-server-ce"], + } + ocp_tenant_keys = ['tenant', 'db', 'username', 'password'] + for tenant in tenants_componets_map: + components = tenants_componets_map[tenant] + prefix = "ocp_%s_" % tenant + if not any([component in added_components and component in be_depend for component in components]): + for key in ocp_tenant_keys: + config_key = prefix + key + if config_key in global_conf: + break + else: + continue + # set create tenant variable + for key in global_conf_with_default: + if key.startswith(prefix) and original_global_conf.get(key, None): + global_conf_with_default[prefix + 'tenant'][key.replace(prefix, '', 1)] = global_conf_with_default[key] + tenant_info = global_conf_with_default[prefix + "tenant"] + tenant_info["variables"] = "ob_tcp_invited_nodes='%'" + tenant_info["create_if_not_exists"] = True + tenant_info["database"] = global_conf_with_default[prefix + "db"] + tenant_info["db_username"] = global_conf_with_default[prefix + "username"] + tenant_info["db_password"] = global_conf_with_default.get(prefix + "password", "") + 
tenant_info["{0}_root_password".format(tenant_info['tenant_name'])] = global_conf_with_default.get(prefix + "password", "") + ocp_tenants.append(Values(tenant_info)) + plugin_context.set_variable("create_tenant_options", ocp_tenants) + + return plugin_context.return_true() diff --git a/plugins/oceanbase/4.2.1.4/connect.py b/plugins/oceanbase/4.2.1.4/connect.py new file mode 100644 index 0000000..fcee8bc --- /dev/null +++ b/plugins/oceanbase/4.2.1.4/connect.py @@ -0,0 +1,646 @@ +# coding: utf-8 +# OceanBase Deploy. +# Copyright (C) 2021 OceanBase +# +# This file is part of OceanBase Deploy. +# +# OceanBase Deploy is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# OceanBase Deploy is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with OceanBase Deploy. If not, see . + + +from __future__ import absolute_import, division, print_function + +import sys +import time +import re +import base64 +import copy +import json +import requests +import time +import traceback + +from Crypto.Cipher import PKCS1_v1_5 as PKCS1_cipher +from Crypto.PublicKey import RSA +from enum import Enum +from os import path +from datetime import datetime +from Crypto.Cipher import AES +from Crypto.Random import get_random_bytes +from Crypto.Util.Padding import pad +from const import RSA_KEY_SIZE +if sys.version_info.major == 2: + import MySQLdb as mysql +else: + import pymysql as mysql + +from _errno import EC_FAIL_TO_CONNECT, EC_SQL_EXECUTE_FAILED +from _stdio import SafeStdio + + + +class OcsResponse(object): + def __init__(self, code, data, type): + self.code = code + self._data = data + self._type = type + + def __bool__(self): + return self.code == 200 + + def __getattr__(self, name): + if self.code == 200: + if self._data and name in self._data: + return self._data[name] + else: + return None + return None + + @property + def type(self): + return self._type + +class OcsDag(object): + class DagState(Enum): + PENDING = 'PENDING' + READY = 'READY' + RUNNING = 'RUNNING' + FAILED = 'FAILED' + SUCCEED = 'SUCCEED' + + class Operator(Enum): + RUN = 'RUN' + RETRY = 'RETRY' + ROLLBACK = 'ROLLBACK' + CANCEL = 'CANCEL' + PASS = 'PASS' + + def __init__(self, data): + self._dag_id = data['dag_id'] + self._state = data['state'] + self._operator = data['operator'] + self._id = data['id'] # id 不会是0 + self._name = data['name'] + + @property + def state(self): + return self._state + + @property + def id(self): + return self._id + + @property + def operator(self): + return self._operator + + @property + def name(self): + return self._name + + def is_init_task(self): + return self.name == 'Init Cluster' + + def is_take_over_or_rebuild(self): + return self.name == 'Take over' or self.name == 'Rebuild CLUSTER AGENT' + + def is_finished(self): + return self.state == self.DagState.SUCCEED.value or self.state == self.DagState.FAILED.value + + def is_succeed(self): + return self.state == self.DagState.SUCCEED.value + + def is_failed(self): + return self.state == self.DagState.FAILED.value + + def is_run(self): + return self.operator == self.Operator.RUN.value + +class OcsDagResponse(OcsResponse): + def 
__init__(self, code, data): + super().__init__(code, data, 'DagDetailDTO') + self._dag = OcsDag(data) + + def __getattr__(self, name): + if name == 'dag': + return self._dag + return super().__getattr__(name) + +class OcsInfo(object): + class Identity(Enum): + MASTER = 'MASTER' + FOLLOWER = 'FOLLOWER' + SINGLE = 'SINGLE' + CLUSTER_AGENT = 'CLUSTER AGENT' + TAKE_OVER_MASTER = 'TAKE_OVER_MASTER' + TAKE_OVER_FOLLOWER = 'TAKE_OVER_FOLLOWER' + + class State(Enum): + Unknown = 0 + Starting = 1 + Running = 2 + Stopping = 3 + Stopped = 4 + + def __init__(self, data): + self._state = data['state'] + self._identity = data['identity'] + self._ip = data['ip'] + self._port = data['port'] + self._zone = data['version'] + self._isObExists = data['isObExists'] + + @property + def state(self): + return self._state + + @property + def identity(self): + return self._identity + + @property + def isObExists(self): + return self._isObExists + +class OcsInfoResponse(OcsResponse): + def __init__(self, code, data): + super().__init__(code, data, 'InfoDTO') + self._info = OcsInfo(data) + + def __getattr__(self, name): + if name == 'info': + return self._info + return super().__getattr__(name) + +class OcsStatus(object): + class State(Enum): + STATE_PROCESS_NOT_RUNNING = 0 + STATE_PROCESS_RUNNING = 1 + STATE_CONNECTION_RESTRICTED = 2 + STATE_CONNECTION_AVAILABLE = 3 + + def __init__(self, data): + self._state = data['state'] + self._version = data['version'] + self._pid = data['pid'] + self.startAt = data['startAt'] + self._port = data['port'] + + @property + def state(self): + return self._state + +class OcsStatusResponse(OcsResponse): + def __init__(self, code, data): + super().__init__(code, data, 'StatusDTO') + self._status = OcsStatus(data) + + def __getattr__(self, name): + if name == 'status': + return self._status + return super().__getattr__(name) + + +class OcsCursor(SafeStdio): + + class Header: + auth: str + ts: str + uri: str + keys: bytes + def __init__(self, auth, ts, uri, keys): + self.auth = auth + self.ts = ts + self.uri = uri + self.keys = keys + + def serialize_struct(self): + return json.dumps({ + 'auth': self.auth, + 'ts': self.ts, + 'uri': self.uri, + 'keys': base64.b64encode(self.keys).decode('utf-8') + }) + + + + HEADERS = {'content-type': 'application/json'} + + def __init__(self, ip, port, homepath = None, password = None, stdio=None): + self.ip = ip + self.port = port + self.stdio = stdio + self.password = password + self.homepath = homepath + self.socket_file = 'obshell.' 
+ str(port) + '.sock' + self._auth_header = None + self._version = "" + self.aes_key = get_random_bytes(16) + self.aes_iv = get_random_bytes(16) + + @staticmethod + def _encrypt(context, encrypt_key): + key = RSA.import_key(base64.b64decode(encrypt_key)) + cipher = PKCS1_cipher.new(key) + return base64.b64encode(cipher.encrypt(bytes(context.encode('utf8')))).decode('utf8') + + @staticmethod + def rsa_encrypt(context, encrypt_key): + key = RSA.import_key(base64.b64decode(encrypt_key)) + cipher = PKCS1_cipher.new(key) + data_to_encrypt = bytes(context.encode('utf8')) + max_chunk_size = int(RSA_KEY_SIZE / 8) - 11 + chunks = [data_to_encrypt[i:i + max_chunk_size] for i in range(0, len(data_to_encrypt), max_chunk_size)] + encrypted_chunks = [cipher.encrypt(chunk) for chunk in chunks] + encrypted = b''.join(encrypted_chunks) + encoded_encrypted_chunks = base64.b64encode(encrypted).decode('utf-8') + return encoded_encrypted_chunks + + @staticmethod + def aes_encrypt(self, data): + cipher = AES.new(self.aes_key, AES.MODE_CBC, self.aes_iv) + return base64.b64encode(cipher.encrypt(pad(bytes(data.encode('utf8')), AES.block_size))).decode('utf8') + + @property + def auth_header(self): + if self._auth_header is None: + encrypt_key = self._get_secrets() + auth_json = json.dumps({'password': self.password, 'ts': int(datetime.now().timestamp()) + 100000}) + self._auth_header = self._encrypt(auth_json, encrypt_key) + return self._auth_header + + @property + def version(self): + if self._version != "": + return self._version + status = requests.get(self._make_url('/api/v1/status'), headers=self._make_headers()) + if status.status_code == 200: + self._version = status.json()['data']['version'] + return self._version + else : + self.stdio.warn('get obshell version failed') + return None + + def _make_headers(self, headers=None, safe=None, uri=None): + request_headers = copy.deepcopy(self.HEADERS) + if safe is True : + # request_headers['X-OCS-Auth'] = self.auth_header + if self.version >= '4.2.3': + header = self.Header(auth=self.password, ts=str(int(datetime.now().timestamp()) + 100000), uri=uri, keys=self.aes_key+self.aes_iv) + request_headers['X-OCS-Header'] = self.rsa_encrypt(header.serialize_struct(), self._get_secrets()) + else: + request_headers['X-OCS-Auth'] = self.auth_header + if headers: + request_headers.update(headers) + return request_headers + + def _make_url(self, url): + return 'http://{ip}:{port}{url}'.format(ip=self.ip, port=self.port, url=url) + + def _request(self, method, url, data=None, headers=None, params=None, safe=None, *args, **kwargs): + try: + if data is not None: + data = json.dumps(data) + else: + data = json.dumps({}) + if safe and self.version >= '4.2.3': + data = self.aes_encrypt(self, data) + self.stdio.verbose('send request to obshell: method: {}, url: {}, data: {}, headers: {}, params: {}'.format(method, url, data, headers, params)) + resp = requests.request(method, self._make_url(url), data=data, headers=self._make_headers(headers, safe, url), params=params, *args, **kwargs) + except Exception as e: + self.stdio.error('request error: {}'.format(e)) + return None + parsed_resp = self._response_parser(resp) + if parsed_resp.code != 200: + self.stdio.verbose('request obshell failed: {}'.format(resp)) + return None + return parsed_resp + + def _curl_socket(self, ssh_client, method, url, data=None): + if data is not None: + data = json.dumps(data) + socket_path = path.join(self.homepath, 'run', self.socket_file) + cmd = 'curl --unix-socket %s -X %s -d \'%s\' %s' % 
(socket_path, method, data, self._make_url(url)) + self.stdio.verbose('cmd: {}'.format(cmd)) + ssh_return = ssh_client.execute_command(cmd) + return self._response_parser(ssh_return.stdout, is_socket=True) + + def _response_parser(self, resp, is_socket=False): + try: + if is_socket: + data = json.loads(resp) + status_code = data['status'] + else: + data = resp.json() + # self.stdio.print('data: {}'.format(data)) + status_code = resp.status_code + if status_code == 200: + if 'data' in data: + data = data['data'] + if 'dag_id' in data and 'state' in data and 'operator' in data and 'id' in data: # 是不是已经足够说明返回了一个DagDetailDTO? + return OcsDagResponse(status_code, data) + if 'state' in data and 'identity' in data and 'ip' in data and 'port' in data and 'version' in data: # 返回了一个info + return OcsInfoResponse(status_code, data) + if 'state' in data and 'version' in data and 'pid' in data and 'startAt' in data and 'port' in data: # 返回了一个state + return OcsStatusResponse(status_code, data) + else: + return OcsResponse(status_code, data, "Unknown") + return OcsResponse(status_code, None, None) + except Exception as e: + traceback.print_exc() + self.stdio.error('response parser error: {}'.format(e)) + return None + + # get the public key from ocs agent + def _get_secrets(self): + resp = self._request('GET', '/api/v1/secret') + return resp.public_key if resp else None + + def request(self, method, url, data=None, headers=None, params=None, *args, **kwargs): + return self._request(method, url, data, headers, params, *args, **kwargs) + + def safe_request(self, method, url, data=None, headers=None, params=None, *args, **kwargs): + return self._request(method, url, data, headers, params, safe=True, *args, **kwargs) + + def query_dag_util_succeed(self, _dag): + dag = _dag + while True: + if not dag: + return False + if dag.state == dag.DagState.SUCCEED.value: + return True + dag = self.get_dag_request(dag.id) + time.sleep(1) + + def query_dag_util_finish(self, _dag): + dag = _dag + while True: + dag = self.get_dag_request(dag.id) + if not dag: + return None + if dag.is_finished(): + return dag + time.sleep(1) + + # normal route + def info_request(self): + resp = self.request('GET', '/api/v1/info') + return resp.info if resp and resp.type == 'InfoDTO' else None + + def status_request(self): + resp = self.request('GET', '/api/v1/status') + return resp.status if resp and resp.type == 'StatusDTO' else None + + def secret_request(self): + return self.request('GET', '/api/v1/secret') + + # ob routes + def ob_init_request(self): + resp = self.safe_request('POST', '/api/v1/ob/init') + return self.query_dag_util_finish(resp.dag) if resp else False + + def ob_stop_request(self, type = 'GLOBAL', target = None): + resp = self.safe_request('POST', '/api/v1/ob/stop', data = {'scope': {'type': type, 'target': target}, 'force': True}) + return self.query_dag_util_finish(resp.dag) if resp else False + + def ob_start_request(self, type = 'GLOBAL', target = None): + resp = self.safe_request('POST', '/api/v1/ob/start', data = {'scope': {'type': type, 'target': target}}) + return self.query_dag_util_finish(resp.dag) if resp else False + + def ob_info_request(self, data): + resp = self.safe_request('POST', '/api/v1/ob/info', data=data) + return resp + + # agent admin routes + def agent_join_request(self, ip, port, zone): + resp = self.safe_request('POST', '/api/v1/agent', data={'agentInfo': {'ip': ip, 'port': port}, 'zoneName': zone}) + return self.query_dag_util_finish(resp.dag) if resp else False + + def 
agent_remove_request(self, ip, port): + resp = self.safe_request('DELETE', '/api/v1/agent', data={'ip': ip, 'port': port}) + return self.query_dag_util_finish(resp.dag) if resp else False + + def agent_remove_by_socket(self, ssh_client, ip, port): + resp = self._curl_socket(ssh_client, 'DELETE', '/api/v1/agent', data={'ip': ip, 'port': port}) + return self.query_dag_util_finish(resp.dag) if resp else False + + # obcluster routes + def obcluster_config_request(self, cluster_id, cluster_name, rs_list): + encrypt_key = self._get_secrets() + encrypt_password = self._encrypt(self.password, encrypt_key) + resp = self.safe_request('POST', '/api/v1/obcluster/config', data={'clusterId': cluster_id, 'clusterName': cluster_name, 'rootPwd': encrypt_password, 'rsList': rs_list}) + return self.query_dag_util_finish(resp.dag) if resp else False + + # observer routes + def observer_put_config_request(self, server_config, agent_list, restart = True): + # 把serverconfig中的int类型的value全部转换成string类型 + for key in server_config: + server_config[key] = str(server_config[key]) + resp = self.safe_request('PUT', '/api/v1/observer/config', data={'observerConfig': server_config, 'restart': restart, 'scope': {'type': 'SERVER', 'target': agent_list}}) + return self.query_dag_util_finish(resp.dag) if resp else False + + # def observer_patch_config_request(self, server_config, servers, restart = False): + # resp = self.safe_request('POST', '/api/v1/observer/config', data={'observerConfig': server_config, 'restart': restart, 'scope': {'type': 'SERVER', 'target': servers}}) + # return self.query_dag_util_succeed(resp.dag) if resp else False + + def observer_scale_out_request(self, ip, port, zone, server_config): + resp = self.safe_request('POST', '/api/v1/ob/scale_out', data={'agentInfo': {'ip': ip, 'port': port}, 'obConfigs': server_config,'zone': zone}) + return self.query_dag_util_finish(resp.dag) if resp else False + + # upgrade routes + def pkg_upload_request(self, data = None): + return self.safe_request('POST', '/api/v1/upgrade/pkg/upload', data=data) + + def params_backup_request(self, data = None): + return self.safe_request('POST', '/api/v1/upgrade/params/backup', data=data) + + # task routes + def get_dag_request(self, id): + resp = self.safe_request('GET', '/api/v1/task/dag/%s' % id) + return resp.dag if resp else None + + def dag_request(self, dag, operator): + resp = self.safe_request('POST', '/api/v1/task/dag/%s' % dag.id, data={'operator': operator}) + if not resp: + return False + return self.query_dag_util_finish(dag) + + def get_agent_last_maintenance_dag_request(self): + if self.version >='4.2.3': + resp = self.safe_request('GET', '/api/v1/task/dag/maintain/agent') + else: + resp = self.request('GET', '/api/v1/task/dag/maintain/agent') + return resp.dag if resp else None + + def get_ob_last_maintenance_dag_request(self): + if self.version >= '4.2.3': + resp = self.safe_request('GET', '/api/v1/task/dag/maintain/ob') + else : + resp = self.request('GET', '/api/v1/task/dag/maintain/ob') + return resp.dag if resp else None + +def get_ocs_cursor(plugin_context, *args, **kwargs): + cluster_config = plugin_context.cluster_config + stdio = plugin_context.stdio + cursors = {} + for server in cluster_config.servers: + server_config = cluster_config.get_server_conf(server) + password = server_config.get('root_password', '') + obshell_port = server_config.get('obshell_port') + stdio.verbose('connect obshell ({}:{})'.format(server.ip, obshell_port)) + ocs_cursor = OcsCursor(ip=server.ip, port=obshell_port, 
homepath=server_config['home_path'], password=password, stdio=stdio) + cursors[server] = ocs_cursor + return cursors + + +class Cursor(SafeStdio): + + def __init__(self, ip, port, user='root', tenant='sys', password='', stdio=None): + self.stdio = stdio + self.ip = ip + self.port = port + self._user = user + self.tenant = tenant + self.password = password + self.cursor = None + self.db = None + self._connect() + self._raise_exception = False + self._raise_cursor = None + + @property + def user(self): + if "@" in self._user: + return self._user + if self.tenant: + return "{}@{}".format(self._user, self.tenant) + else: + return self._user + + @property + def raise_cursor(self): + if self._raise_cursor: + return self._raise_cursor + raise_cursor = copy.copy(self) + raise_cursor._raise_exception = True + self._raise_cursor = raise_cursor + return raise_cursor + + if sys.version_info.major == 2: + def _connect(self): + self.stdio.verbose('connect %s -P%s -u%s -p%s' % (self.ip, self.port, self.user, self.password)) + self.db = mysql.connect(host=self.ip, user=self.user, port=int(self.port), passwd=str(self.password)) + self.cursor = self.db.cursor(cursorclass=mysql.cursors.DictCursor) + else: + def _connect(self): + self.stdio.verbose('connect %s -P%s -u%s -p%s' % (self.ip, self.port, self.user, self.password)) + self.db = mysql.connect(host=self.ip, user=self.user, port=int(self.port), password=str(self.password), + cursorclass=mysql.cursors.DictCursor) + self.cursor = self.db.cursor() + + def new_cursor(self, tenant='sys', user='root', password='', ip='', port='', print_exception=True): + try: + ip = ip if ip else self.ip + port = port if port else self.port + return Cursor(ip=ip, port=port, user=user, tenant=tenant, password=password, stdio=self.stdio) + except: + print_exception and self.stdio.exception('') + self.stdio.verbose('fail to connect %s -P%s -u%s@%s -p%s' % (self.ip, self.port, user, tenant, password)) + return None + + def execute(self, sql, args=None, execute_func=None, raise_exception=None, exc_level='error', stdio=None): + + try: + stdio.verbose('execute sql: %s. 
args: %s' % (sql, args)) + self.cursor.execute(sql, args) + if not execute_func: + return self.cursor + return getattr(self.cursor, execute_func)() + except Exception as e: + getattr(stdio, exc_level)(EC_SQL_EXECUTE_FAILED.format(sql=sql)) + pattern = r'\n\[(.*?)\]\s+\[(.*?)\]\s+\[(.*?)\]$' + error_matches = re.findall(pattern, str(e.args[-1])) + if len(error_matches) > 0 and len(error_matches[-1]) == 3: + getattr(stdio, exc_level)("observer error trace [%s] from [%s]" % (error_matches[-1][2], error_matches[-1][0])) + if raise_exception is None: + raise_exception = self._raise_exception + if raise_exception: + stdio.exception('') + raise e + return False + + def fetchone(self, sql, args=None, raise_exception=None, exc_level='error', stdio=None): + return self.execute(sql, args=args, execute_func='fetchone', raise_exception=raise_exception, exc_level=exc_level, stdio=stdio) + + def fetchall(self, sql, args=None, raise_exception=None, exc_level='error', stdio=None): + return self.execute(sql, args=args, execute_func='fetchall', raise_exception=raise_exception, exc_level=exc_level, stdio=stdio) + + def close(self): + if self.cursor: + self.cursor.close() + self.cursor = None + if self.db: + self.db.close() + self.db = None + + +def connect(plugin_context, target_server=None, retry_times=101, connect_all=False, *args, **kwargs): + def return_true(**kwargs): + for key, value in kwargs.items(): + plugin_context.set_variable(key, value) + return plugin_context.return_true(**kwargs) + + ocs_cursor = get_ocs_cursor(plugin_context, *args, **kwargs) + stdio = plugin_context.stdio + if not ocs_cursor: + stdio.stop_loading('fail') + return plugin_context.return_false() + + count = retry_times + cluster_config = plugin_context.cluster_config + if target_server: + servers = [target_server] + server_config = cluster_config.get_server_conf(target_server) + stdio.start_loading('Connect observer(%s:%s)' % (target_server, server_config['mysql_port'])) + else: + servers = cluster_config.servers + stdio.start_loading('Connect to observer') + while count: + count -= 1 + connect_nums = 0 + for server in servers: + try: + server_config = cluster_config.get_server_conf(server) + password = server_config.get('root_password', '') if count % 2 == 0 else '' + cursor = Cursor(ip=server.ip, port=server_config['mysql_port'], tenant='', password=password if password is not None else '', stdio=stdio) + if cursor.execute('select 1', raise_exception=False, exc_level='verbose'): + if not connect_all: + stdio.stop_loading('succeed', text='Connect to observer {}:{}'.format(server.ip, server_config['mysql_port'])) + return return_true(connect=cursor.db, cursor=cursor, server=server, ocs_cursor = ocs_cursor) + else: + connect_nums += 1 + if connect_nums == len(servers): + stdio.stop_loading('succeed') + return return_true(connect=cursor.db, cursor=cursor, server=server, ocs_cursor = ocs_cursor) + else: + raise Exception('Connect to observer {}:{} failed'.format(server.ip, server_config['mysql_port'])) + except: + if count == 0: + stdio.exception('') + if connect_all: + break + time.sleep(3) + + stdio.stop_loading('fail') + stdio.error(EC_FAIL_TO_CONNECT.format(component=cluster_config.name)) + plugin_context.return_false() diff --git a/plugins/oceanbase/4.2.1.4/destroy.py b/plugins/oceanbase/4.2.1.4/destroy.py new file mode 100644 index 0000000..2cef1d6 --- /dev/null +++ b/plugins/oceanbase/4.2.1.4/destroy.py @@ -0,0 +1,54 @@ +# coding: utf-8 +# OceanBase Deploy. 
+# Copyright (C) 2021 OceanBase +# +# This file is part of OceanBase Deploy. +# +# OceanBase Deploy is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# OceanBase Deploy is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with OceanBase Deploy. If not, see . + + +from __future__ import absolute_import, division, print_function + +from _errno import EC_CLEAN_PATH_FAILED + +global_ret = True + + +def destroy(plugin_context, *args, **kwargs): + def clean(server, path): + client = clients[server] + ret = client.execute_command('rm -fr %s/' % (path), timeout=-1) + if not ret: + # print stderror + global global_ret + global_ret = False + stdio.warn(EC_CLEAN_PATH_FAILED.format(server=server, path=path)) + else: + stdio.verbose('%s:%s cleaned' % (server, path)) + cluster_config = plugin_context.cluster_config + clients = plugin_context.clients + stdio = plugin_context.stdio + stdio.start_loading('observer work dir cleaning') + for server in cluster_config.servers: + server_config = cluster_config.get_server_conf(server) + stdio.verbose('%s work path cleaning', server) + clean(server, server_config['home_path']) + for key in ['data_dir', 'redo_dir', 'clog_dir', 'ilog_dir', 'slog_dir']: + if server_config.get(key): + clean(server, server_config[key]) + if global_ret: + stdio.stop_loading('succeed') + plugin_context.return_true() + else: + stdio.stop_loading('fail') diff --git a/plugins/oceanbase/4.2.1.4/init.py b/plugins/oceanbase/4.2.1.4/init.py new file mode 100644 index 0000000..168fd80 --- /dev/null +++ b/plugins/oceanbase/4.2.1.4/init.py @@ -0,0 +1,203 @@ +# coding: utf-8 +# OceanBase Deploy. +# Copyright (C) 2021 OceanBase +# +# This file is part of OceanBase Deploy. +# +# OceanBase Deploy is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# OceanBase Deploy is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with OceanBase Deploy. If not, see . 
+ + +from __future__ import absolute_import, division, print_function +import os + +from _errno import EC_CONFIG_CONFLICT_DIR, EC_FAIL_TO_INIT_PATH, InitDirFailedErrorMessage + + +stdio = None +force = False +global_ret = True + + +def critical(*arg, **kwargs): + global global_ret + global_ret = False + stdio.error(*arg, **kwargs) + + +def init_dir(server, client, key, path, link_path=None): + if force: + ret = client.execute_command('rm -fr %s' % path, timeout=-1) + if not ret: + critical(EC_FAIL_TO_INIT_PATH.format(server=server, key='%s path' % key, msg=ret.stderr)) + return False + else: + if client.execute_command('mkdir -p %s' % path): + ret = client.execute_command('ls %s' % (path)) + if not ret or ret.stdout.strip(): + critical(EC_FAIL_TO_INIT_PATH.format(server=server, key='%s path' % key, msg=InitDirFailedErrorMessage.NOT_EMPTY.format(path=path))) + return False + else: + critical(EC_FAIL_TO_INIT_PATH.format(server=server, key='%s path' % key, msg=InitDirFailedErrorMessage.CREATE_FAILED.format(path=path))) + return False + ret = client.execute_command('mkdir -p %s' % path) + if ret: + if link_path: + client.execute_command("if [ ! '%s' -ef '%s' ]; then ln -sf %s %s; fi" % (path, link_path, path, link_path)) + return True + else: + critical(EC_FAIL_TO_INIT_PATH.format(server=server, key='%s path' % key, msg=ret.stderr)) + return False + + +def init(plugin_context, *args, **kwargs): + global stdio, force + cluster_config = plugin_context.cluster_config + clients = plugin_context.clients + stdio = plugin_context.stdio + servers_dirs = {} + force = getattr(plugin_context.options, 'force', False) + clean = getattr(plugin_context.options, 'clean', False) + stdio.verbose('option `force` is %s' % force) + stdio.start_loading('Initializes observer work home') + tmp_sock_dir = '/tmp/obshell' + for server in cluster_config.servers: + ip = server.ip + if ip not in servers_dirs: + servers_dirs[ip] = {} + dirs = servers_dirs[ip] + server_config = cluster_config.get_server_conf(server) + client = clients[server] + home_path = server_config['home_path'] + + if not server_config.get('data_dir'): + server_config['data_dir'] = '%s/store' % home_path + if not server_config.get('redo_dir'): + server_config['redo_dir'] = server_config['data_dir'] + if not server_config.get('slog_dir'): + server_config['slog_dir'] = '%s/slog' % server_config['data_dir'] + if not server_config.get('clog_dir'): + server_config['clog_dir'] = '%s/clog' % server_config['redo_dir'] + + if server_config['redo_dir'] == server_config['data_dir']: + keys = ['home_path', 'data_dir', 'clog_dir', 'slog_dir'] + else: + keys = ['home_path', 'data_dir', 'redo_dir', 'clog_dir', 'slog_dir'] + for key in keys: + path = server_config[key] + if path in dirs: + critical(EC_CONFIG_CONFLICT_DIR.format(server1=server, path=path, server2=dirs[path]['server'], key=dirs[path]['key'])) + continue + dirs[path] = { + 'server': server, + 'key': key, + } + stdio.verbose('%s initializes observer work home' % server) + need_clean = force + if clean and not force: + if client.execute_command('bash -c \'if [[ "$(ls -d {0} 2>/dev/null)" != "" && ! 
-O {0} ]]; then exit 0; else exit 1; fi\''.format(home_path)): + owner = client.execute_command("ls -ld %s | awk '{print $3}'" % home_path).stdout.strip() + err_msg = ' {} is not empty, and the owner is {}'.format(home_path, owner) + critical(EC_FAIL_TO_INIT_PATH.format(server=server, key='home path', msg=err_msg)) + continue + need_clean = True + + if need_clean: + for bin_name in ['observer', 'obshell']: + client.execute_command( + "pkill -9 -u `whoami` -f '^%s/bin/%s'" % (home_path, bin_name)) + ret = client.execute_command('rm -fr %s/*' % home_path, timeout=-1) + if not ret: + critical(EC_FAIL_TO_INIT_PATH.format(server=server, key='home path', msg=ret.stderr)) + continue + else: + if client.execute_command('mkdir -p %s' % home_path): + ret = client.execute_command('ls %s' % (home_path)) + if not ret or ret.stdout.strip(): + critical(EC_FAIL_TO_INIT_PATH.format(server=server, key='home path', msg=InitDirFailedErrorMessage.NOT_EMPTY.format(path=home_path))) + continue + else: + critical(EC_FAIL_TO_INIT_PATH.format(server=server, key='home path', msg=InitDirFailedErrorMessage.CREATE_FAILED.format(path=home_path))) + + if not client.execute_command('rm -f %s/.meta' % home_path): + critical(EC_FAIL_TO_INIT_PATH.format(server=server, key='home path', msg=InitDirFailedErrorMessage.CREATE_FAILED.format(path=home_path))) + if not client.execute_command('mkdir -p {dir}; [ -w {dir} ] || chmod +666 {dir}'.format(dir=tmp_sock_dir)): + critical(EC_FAIL_TO_INIT_PATH.format(server=server, key='sock path', msg=InitDirFailedErrorMessage.CREATE_FAILED.format(path=tmp_sock_dir))) + + ret = client.execute_command('bash -c "mkdir -p %s/{etc,admin,.conf,log,log_obshell,bin,lib}"' % home_path) + if ret: + data_path = server_config['data_dir'] + if need_clean: + ret = client.execute_command('rm -fr %s/*' % data_path, timeout=-1) + if not ret: + critical(EC_FAIL_TO_INIT_PATH.format(server=server, key='data dir', msg=InitDirFailedErrorMessage.PERMISSION_DENIED.format(path=data_path))) + continue + else: + if client.execute_command('mkdir -p %s' % data_path): + ret = client.execute_command('ls %s' % (data_path)) + if not ret or ret.stdout.strip(): + critical(EC_FAIL_TO_INIT_PATH.format(server=server, key='data dir', msg=InitDirFailedErrorMessage.NOT_EMPTY.format(path=data_path))) + continue + else: + critical(EC_FAIL_TO_INIT_PATH.format(server=server, key='data dir', msg=InitDirFailedErrorMessage.CREATE_FAILED.format(path=data_path))) + ret = client.execute_command('bash -c "mkdir -p %s/sstable"' % data_path) + if ret: + link_path = '%s/store' % home_path + client.execute_command("if [ ! 
'%s' -ef '%s' ]; then ln -sf %s %s; fi" % (data_path, link_path, data_path, link_path)) + for key in ['clog', 'slog']: + # init_dir(server, client, key, server_config['%s_dir' % key], os.path.join(data_path, key)) + log_dir = server_config['%s_dir' % key] + if force: + ret = client.execute_command('rm -fr %s/*' % log_dir, timeout=-1) + if not ret: + critical(EC_FAIL_TO_INIT_PATH.format(server=server, key='%s dir' % key, msg=InitDirFailedErrorMessage.PERMISSION_DENIED.format(path=log_dir))) + continue + else: + if client.execute_command('mkdir -p %s' % log_dir): + ret = client.execute_command('ls %s' % (log_dir)) + if not ret or ret.stdout.strip(): + critical(EC_FAIL_TO_INIT_PATH.format(server=server, key='%s dir' % key, msg=InitDirFailedErrorMessage.NOT_EMPTY.format(path=log_dir))) + continue + else: + critical(EC_FAIL_TO_INIT_PATH.format(server=server, key='%s dir' % key, msg=InitDirFailedErrorMessage.CREATE_FAILED.format(path=log_dir))) + ret = client.execute_command('mkdir -p %s' % log_dir) + if ret: + link_path = '%s/%s' % (data_path, key) + client.execute_command("if [ ! '%s' -ef '%s' ]; then ln -sf %s %s; fi" % (log_dir, link_path, log_dir, link_path)) + else: + critical(EC_FAIL_TO_INIT_PATH.format(server=server, key='%s dir' % key, msg=ret.stderr)) + else: + critical(EC_FAIL_TO_INIT_PATH.format(server=server, key='data dir', msg=InitDirFailedErrorMessage.PATH_ONLY.format(path=data_path))) + else: + critical(EC_FAIL_TO_INIT_PATH.format(server=server, key='home path', msg=InitDirFailedErrorMessage.PERMISSION_DENIED.format(path=home_path))) + if global_ret: + stdio.verbose("check slog dir in the same disk with data dir") + slog_disk = data_disk = None + ret = client.execute_command("df --block-size=1024 %s | awk 'NR == 2 { print $1 }'" % server_config['slog_dir']) + if ret: + slog_disk = ret.stdout.strip() + stdio.verbose('slog disk is {}'.format(slog_disk)) + ret = client.execute_command("df --block-size=1024 %s | awk 'NR == 2 { print $1 }'" % server_config['data_dir']) + if ret: + data_disk = ret.stdout.strip() + stdio.verbose('data disk is {}'.format(data_disk)) + if slog_disk != data_disk: + critical(EC_FAIL_TO_INIT_PATH.format( + server=server, key='slog dir', + msg=': slog and data should be on the same disk. 
Now the slog disk is {slog_disk}, and the data disk is {data_disk}.'.format(slog_disk=slog_disk, data_disk=data_disk))) + + if global_ret: + stdio.stop_loading('succeed') + plugin_context.return_true() + else: + stdio.stop_loading('fail') \ No newline at end of file diff --git a/plugins/oceanbase/4.2.1.4/parameter.yaml b/plugins/oceanbase/4.2.1.4/parameter.yaml new file mode 100644 index 0000000..fc76c9b --- /dev/null +++ b/plugins/oceanbase/4.2.1.4/parameter.yaml @@ -0,0 +1,1941 @@ +- name: home_path + name_local: 工作目录 + require: true + essential: true + type: PATH + min_value: NULL + max_value: NULL + need_redeploy: true + description_en: the directory for the work data file + description_local: OceanBase工作目录 +- name: cluster_id + name_local: 集群ID + require: true + essential: true + type: INT + default: 1 + min_value: 1 + max_value: 4294901759 + modify_limit: modify + need_redeploy: true + description_en: ID of the cluster + description_local: 本OceanBase集群ID +- name: data_dir + name_local: 数据目录 + essential: true + type: PATH + min_value: NULL + max_value: NULL + need_redeploy: true + description_en: the directory for the data file + description_local: 存储sstable等数据的目录 +- name: redo_dir + name_local: 日志目录 + essential: true + type: PATH + min_value: NULL + max_value: NULL + need_redeploy: true + description_en: the directory for the redo file + description_local: 存储clog, iclog, slog数据的目录 +- name: clog_dir + type: PATH + min_value: NULL + max_value: NULL + need_redeploy: true + description_en: the directory for the clog file + description_local: 存储clog数据的目录, clog 应该与 ilog 同盘 +- name: slog_dir + type: PATH + min_value: NULL + max_value: NULL + need_redeploy: true + description_en: the directory for the slog file + description_local: 存储slog数据的目录. 4.0版本开始不支持配置该项 +- name: ilog_dir + type: PATH + min_value: NULL + max_value: NULL + need_redeploy: true + description_en: the directory for the ilog file + description_local: 存储ilog数据的目录 +- name: rpc_port + name_local: 内部通信端口 + require: true + essential: true + type: INT + default: 2882 + min_value: 1025 + max_value: 65535 + modify_limit: modify + need_restart: true + description_en: the port number for RPC protocol. 
+ description_local: 集群内部通信的端口号 +- name: mysql_port + name_local: 服务端口 + require: true + essential: true + type: INT + default: 2881 + min_value: 1025 + max_value: 65535 + modify_limit: modify + need_restart: true + description_en: port number for mysql connection + description_local: SQL服务协议端口号 +- name: obshell_port + name_local: obshell 服务端口 + require: true + essential: true + type: INT + default: 2886 + min_value: 1025 + max_value: 65535 + modify_limit: modify + need_redeploy: true + description_en: The port for obshell agent + description_local: obshell agent 的端口号 +- name: zone + require: true + type: SAFE_STRING + default: zone1 + min_value: NULL + max_value: NULL + section: OBSERVER + need_redeploy: true + description_en: specifies the zone name + description_local: 节点所在的zone的名字。 +- name: sys_cpu_limit_trigger + require: false + type: INT + default: 80 + min_value: 50 + max_value: NULL + section: OBSERVER + need_restart: false + description_en: when the cpu usage percentage exceed the trigger, will limit the sys cpu usage + description_local: 当CPU利用率超过该阈值的时候,将暂停系统后台任务的执行 +- name: memory_limit_percentage + require: false + type: INT + default: 80 + min_value: 10 + max_value: 90 + modify_limit: decrease + section: OBSERVER + need_restart: false + description_en: memory limit percentage of the total physical memory + description_local: 系统总可用内存大小占总内存大小的百分比 +- name: sys_bkgd_migration_retry_num + require: false + type: INT + default: 3 + min_value: 3 + max_value: 100 + section: OBSERVER + need_restart: false + description_en: retry num limit during migration. + description_local: 副本迁移失败时最多重试次数 +- name: tableapi_transport_compress_func + require: false + type: SAFE_STRING + default: none + min_value: NULL + max_value: NULL + section: OBSERVER + need_restart: false + description_en: compressor used for tableAPI query result. + description_local: tableAPI查询结果传输使用的压缩算法 +- name: disk_io_thread_count + require: false + type: INT + default: 8 + min_value: 2 + max_value: 32 + section: OBSERVER + need_restart: false + description_en: The number of io threads on each disk. + description_local: 磁盘IO线程数。必须为偶数。 +- name: location_cache_refresh_min_interval + require: false + type: TIME + default: 100ms + min_value: 0s + max_value: NULL + section: LOCATION_CACHE + need_restart: false + description_en: the time interval in which no request for location cache renewal will be executed. + description_local: 位置缓存刷新请求的最小间隔,防止产生过多刷新请求造成系统压力过大 +- name: trace_log_slow_query_watermark + type: TIME + default: 1s + min_value: 1ms + max_value: NULL + section: OBSERVER + need_restart: false + description_en: the threshold of execution time (in milliseconds) of a query beyond which it is considered to be a slow query. 
+ description_local: 执行时间超过该阈值的查询会被认为是慢查询,慢查询的追踪日志会被打印到系统日志中 +- name: max_string_print_length + require: false + type: INT + default: 500 + min_value: 0 + max_value: NULL + section: OBSERVER + need_restart: false + description_en: truncate very long string when printing to log file + description_local: 打印系统日志时,单行日志最大长度 +- name: row_compaction_update_limit + require: false + type: INT + default: 6 + min_value: 1 + max_value: 6400 + section: TRANS + need_restart: false + description_en: maximum update count before trigger row compaction + description_local: 触发内存中行内数据合并的修改次数 +- name: enable_rereplication + require: false + type: BOOL + default: true + min_value: NULL + max_value: NULL + section: LOAD_BALANCE + need_restart: false + description_en: specifies whether the partition auto-replication is turned on. + description_local: 自动补副本开关 +- name: rootservice_async_task_thread_count + require: false + type: INT + default: 4 + min_value: 1 + max_value: 10 + section: ROOT_SERVICE + need_restart: false + description_en: maximum of threads allowed for executing asynchronous task at rootserver. + description_local: RootService内部异步任务使用的线程池大小 +- name: major_compact_trigger + require: false + type: INT + default: 5 + min_value: 0 + max_value: 65535 + section: TENANT + need_restart: false + description_en: major_compact_trigger alias to minor_freeze_times + description_local: 多少次小合并触发一次全局合并。值为0时,表示关闭小合并 +- name: default_compress + require: false + type: SAFE_STRING + default: archive + min_value: NULL + max_value: NULL + section: OBSERVER + need_restart: false + description_en: default compress function name for create new table + description_local: Oracle模式下,建表时使用的默认压缩策略 +- name: ssl_client_authentication + require: false + type: BOOL + default: false + min_value: NULL + max_value: NULL + section: OBSERVER + need_restart: true + description_en: enable server supports SSL connection, takes effect only after server restart with all ca/cert/key file. + description_local: 是否开启SSL连接功能 +- name: datafile_size + name_local: 数据文件大小 + require: false + essential: true + type: CAPACITY_MB + default: 0 + min_value: 0M + max_value: NULL + modify_limit: decrease + section: SSTABLE + need_restart: false + description_en: size of the data file. Please enter an capacity, such as 20G + description_local: 数据文件大小。请输入带容量带单位的整数,如20G +- name: datafile_maxsize + name_local: 数据文件最大空间 + require: false + essential: true + type: CAPACITY_MB + default: 0 + min_value: 0M + max_value: NULL + modify_limit: decrease + section: SSTABLE + need_restart: false + description_en: the auto extend max size. Please enter an capacity, such as 20G + description_local: 数据文件最大空间。请输入带容量带单位的整数,如20G +- name: datafile_next + name_local: 数据文件自增步长 + require: false + essential: true + type: CAPACITY_MB + default: 0 + min_value: 0M + max_value: NULL + modify_limit: decrease + section: SSTABLE + need_restart: false + description_en: the auto extend step. Please enter an capacity, such as 2G + description_local: 数据文件自增步长。请输入带容量带单位的整数,如2G +- name: log_disk_percentage + require: false + type: INT + default: 0 + min_value: 0 + max_value: 99 + description_en: the percentage of disk space used by the clog files. + description_local: Redo 日志占用其所在磁盘总空间的百分比。 +- name: log_disk_size + name_local: Redo 日志大小 + require: false + essential: true + type: CAPACITY_MB + default: 0 + min_value: 0M + max_value: NULL + description_en: the size of disk space used by the clog files. 
Please enter an capacity, such as 20G + description_local: Redo 日志磁盘的大小。请输入带容量带单位的整数,如24G +- name: merge_stat_sampling_ratio + require: false + type: INT + default: 100 + min_value: 0 + max_value: 100 + section: OBSERVER + need_restart: false + description_en: column stats sampling ratio daily merge. + description_local: 合并时候数据列统计信息的采样率 +- name: cache_wash_threshold + require: false + type: CAPACITY_MB + default: 4GB + min_value: 0B + max_value: NULL + section: OBSERVER + need_restart: false + description_en: size of remaining memory at which cache eviction will be triggered. + description_local: 触发缓存清理的容量阈值 +- name: user_iort_up_percentage + require: false + type: INT + default: 100 + min_value: 0 + max_value: NULL + section: OBSERVER + need_restart: false + description_en: variable to control sys io, the percentage of use io rt can raise + description_local: 用户磁盘IO时延超过该阈值后,系统后台IO任务将被限流 +- name: high_priority_net_thread_count + require: false + type: INT + default: 0 + min_value: 0 + max_value: 100 + section: OBSERVER + need_restart: true + description_en: the number of rpc I/O threads for high priority messages, 0 means set off + description_local: 高优先级网络线程数,值0表示关闭 +- name: max_kept_major_version_number + require: false + type: INT + default: 2 + min_value: 1 + max_value: 16 + section: DAILY_MERGE + need_restart: false + description_en: the maximum number of kept major versions + description_local: 数据保留多少个冻结版本 +- name: enable_sys_unit_standalone + require: false + type: BOOL + default: false + min_value: NULL + max_value: NULL + section: LOAD_BALANCE + need_restart: false + description_en: specifies whether sys unit standalone deployment is turned on. + description_local: 系统租户UNIT是否独占节点 +- name: freeze_trigger_percentage + require: false + type: INT + default: 50 + min_value: 1 + max_value: 99 + section: TENANT + need_restart: false + description_en: the threshold of the size of the mem store when freeze will be triggered. + description_local: 触发全局冻结的租户使用内存阈值。另见enable_global_freeze_trigger。 +- name: enable_major_freeze + require: false + type: BOOL + default: true + min_value: NULL + max_value: NULL + section: ROOT_SERVICE + need_restart: false + description_en: specifies whether major_freeze function is turned on. + description_local: 自动全局冻结开关 +- name: balancer_tolerance_percentage + require: false + type: INT + default: 10 + min_value: 1 + max_value: 99 + section: LOAD_BALANCE + need_restart: false + description_en: specifies the tolerance (in percentage) of the unbalance of the disk space utilization among all units. + description_local: 租户内多个UNIT间磁盘不均衡程度的宽容度,在均值+-宽容度范围之内的不均衡不会触发执行均衡动作 +- name: server_cpu_quota_min + require: false + type: DOUBLE + default: 2.5 + min_value: 0 + max_value: 16 + section: TENANT + need_restart: true + description_en: the number of minimal vCPUs allocated to the server tenant(a special internal tenant that exists on every observer) + description_local: 系统可以使用的最小CPU配额,将会预留 +- name: memory_reserved + require: false + type: CAPACITY_MB + default: 500M + min_value: 10M + max_value: NULL + section: SSTABLE + need_restart: false + description_en: the size of the system memory reserved for emergency internal use. 
+ description_local: 系统预留内存大小 +- name: server_cpu_quota_max + require: false + type: DOUBLE + default: 5 + min_value: 0 + max_value: 16 + section: TENANT + need_restart: true + description_en: the number of maximal vCPUs allocated to the server tenant + description_local: 系统可以使用的最大CPU配额 +- name: rootservice_ready_check_interval + require: false + type: TIME + default: 3s + min_value: 100000us + max_value: 1m + section: ROOT_SERVICE + need_restart: false + description_en: the interval between the schedule of the task that checks on the status of the ZONE during restarting. + description_local: RootService启动后等待和检查集群状态的时间间隔 +- name: debug_sync_timeout + require: false + type: TIME + default: 0 + min_value: 0 + max_value: NULL + section: OBSERVER + need_restart: false + description_en: Enable the debug sync facility and optionally specify a default wait timeout in micro seconds. A zero value keeps the facility disabled + description_local: 打开debug sync调试开关,并设置其超时时间;值为0时,则关闭。 +- name: syslog_level + require: false + type: SAFE_STRING + default: INFO + min_value: NULL + max_value: NULL + section: OBSERVER + need_restart: false + description_en: specifies the current level of logging. + description_local: 系统日志级别 +- name: resource_hard_limit + require: false + type: INT + default: 100 + min_value: 1 + max_value: 10000 + section: LOAD_BALANCE + need_restart: false + description_en: Used along with resource_soft_limit in unit allocation. If server utilization is less than resource_soft_limit, a policy of best fit will be used for unit allocation; otherwise, a least load policy will be employed. Ultimately,system utilization should not be large than resource_hard_limit. + description_local: CPU和内存等资源进行分配的时候,资源总量是实际数量乘以该百分比的值 +- name: leak_mod_to_check + require: false + type: SAFE_STRING + default: NONE + min_value: NULL + max_value: NULL + section: OBSERVER + need_restart: false + description_en: the name of the module under memory leak checks + description_local: 内存泄露检查,用于内部调试目的 +- name: balancer_task_timeout + require: false + type: TIME + default: 20m + min_value: 1s + max_value: NULL + section: LOAD_BALANCE + need_restart: false + description_en: the time to execute the load-balancing task before it is terminated. + description_local: 负载均衡等后台任务的超时时间 +- name: enable_upgrade_mode + require: false + type: BOOL + default: false + min_value: NULL + max_value: NULL + section: OBSERVER + need_restart: false + description_en: specifies whether upgrade mode is turned on. If turned on, daily merger and balancer will be disabled. + description_local: 升级模式开关。在升级模式中,会暂停部分系统后台功能。 +- name: multiblock_read_size + require: false + type: CAPACITY_MB + default: 128K + min_value: 0K + max_value: 2M + section: SSTABLE + need_restart: false + description_en: multiple block batch read size in one read io request. + description_local: 读取数据时IO聚合大小 +- name: migration_disable_time + require: false + type: TIME + default: 3600s + min_value: 1s + max_value: NULL + section: ROOT_SERVICE + need_restart: false + description_en: the duration in which the observer stays in the block_migrate_in status, which means no partition is allowed to migrate into the server. 
+ description_local: 因磁盘满等原因导致某个节点数据迁入失败时,暂停迁入时长 +- name: tablet_size + require: false + type: CAPACITY_MB + default: 128M + min_value: NULL + max_value: NULL + section: OBSERVER + need_restart: false + description_en: default tablet size, has to be a multiple of 2M + description_local: 分区内部并行处理(合并、查询等)时每个分片的大小 +- name: dead_socket_detection_timeout + require: false + type: TIME + default: 10s + min_value: 0s + max_value: 2h + section: OBSERVER + need_restart: false + description_en: specify a tcp_user_timeout for RFC5482. A zero value makes the option disabled + description_local: 失效socket检测超时时间 +- name: server_check_interval + require: false + type: TIME + default: 30s + min_value: 1s + max_value: NULL + section: ROOT_SERVICE + need_restart: false + description_en: the time interval between schedules of a task that examines the __all_server table. + description_local: server表一致性检查的时间间隔 +- name: lease_time + require: false + type: TIME + default: 10s + min_value: 1s + max_value: 5m + section: ROOT_SERVICE + need_restart: false + description_en: Lease for current heartbeat. If the root server does not received any heartbeat from an observer in lease_time seconds, that observer is considered to be offline. + description_local: RootService与其他服务节点之间的租约时长。一般请勿修改。 +- name: rootservice_async_task_queue_size + require: false + type: INT + default: 16384 + min_value: 8 + max_value: 131072 + section: ROOT_SERVICE + need_restart: false + description_en: the size of the queue for all asynchronous tasks at rootserver. + description_local: RootService内部异步任务队列的容量 +- name: location_refresh_thread_count + require: false + type: INT + default: 4 + min_value: 2 + max_value: 64 + section: LOCATION_CACHE + need_restart: false + description_en: the number of threads that fetch the partition location information from the root service. + description_local: 用于位置缓存刷新的线程数 +- name: minor_compact_trigger + require: false + type: INT + default: 2 + min_value: 0 + max_value: 16 + section: TENANT + need_restart: false + description_en: minor_compact_trigger + description_local: 触发小合并的迷你合并次数 +- name: major_freeze_duty_time + type: MOMENT + default: Disable + min_value: 00:00 + max_value: 23:59 + section: DAILY_MERGE + need_restart: false + description_en: the start time of system daily merge procedure. + description_local: 每日定时冻结和合并的触发时刻 +- name: ignore_replay_checksum_error + require: false + type: BOOL + default: false + min_value: NULL + max_value: NULL + section: TRANS + need_restart: false + description_en: specifies whether error raised from the memtable replay checksum validation can be ignored. + description_local: 是否忽略回放事务日志时发生的校验和错误 +- name: user_block_cache_priority + require: false + type: INT + default: 1 + min_value: 1 + max_value: NULL + section: CACHE + need_restart: false + description_en: user block cache priority + description_local: 数据块缓存在缓存系统中的优先级 +- name: syslog_io_bandwidth_limit + require: false + type: CAPACITY_MB + default: 30MB + min_value: NULL + max_value: NULL + section: OBSERVER + need_restart: false + description_en: Syslog IO bandwidth limitation, exceeding syslog would be truncated. Use 0 to disable ERROR log. + description_local: 系统日志所能占用的磁盘IO带宽上限,超过带宽的系统日志将被丢弃 +- name: workers_per_cpu_quota + require: false + type: INT + default: 10 + min_value: 2 + max_value: 20 + section: TENANT + need_restart: false + description_en: the ratio(integer) between the number of system allocated workers vs the maximum number of threads that can be scheduled concurrently. 
+ description_local: 每个CPU配额分配多少个工作线程 +- name: enable_record_trace_id + require: false + type: BOOL + default: true + min_value: NULL + max_value: NULL + section: OBSERVER + need_restart: false + description_en: specifies whether record app trace id is turned on. + description_local: 是否记录应用端设置的追踪ID +- name: config_additional_dir + require: false + type: PATH_LIST + default: etc2;etc3 + min_value: NULL + max_value: NULL + section: OBSERVER + need_restart: false + description_en: additional directories of configure file + description_local: 本地存储配置文件的多个目录,为了冗余存储多份配置文件 +- name: enable_syslog_recycle + require: false + essential: true + type: BOOL + default: false + min_value: NULL + max_value: NULL + section: OBSERVER + need_restart: false + description_en: specifies whether log file recycling is turned on + description_local: 是否自动回收系统日志 +- name: max_syslog_file_count + require: false + essential: true + type: INT + default: 0 + min_value: 0 + max_value: NULL + section: OBSERVER + need_restart: false + description_en: specifies the maximum number of the log files that can co-exist before the log file recycling kicks in. Each log file can occupy at most 256MB disk space. When this value is set to 0, no log file will be removed. + description_local: 系统日志自动回收复用时,最多保留多少个。值0表示不自动清理。 +- name: px_task_size + require: false + type: CAPACITY_MB + default: 2M + min_value: 2M + max_value: NULL + section: OBSERVER + need_restart: false + description_en: min task access size of px task + description_local: SQL并行查询引擎每个任务处理的数据量大小 +- name: replica_safe_remove_time + require: false + type: TIME + default: 2h + min_value: 1m + max_value: NULL + section: ROOT_SERVICE + need_restart: false + description_en: the time interval that replica not existed has not been modified beyond which a replica is considered can be safely removed + description_local: 已删除副本可以被清理的安全保留时间 +- name: builtin_db_data_verify_cycle + require: false + type: INT + default: 20 + min_value: 0 + max_value: 360 + section: OBSERVER + need_restart: false + description_en: check cycle of db data. + description_local: 数据坏块自检周期,单位为天。值0表示不检查。 +- name: system_cpu_quota + require: false + type: DOUBLE + default: 10 + min_value: 0 + max_value: 16 + section: TENANT + need_restart: false + description_en: the number of vCPUs allocated to the server tenant + description_local: 系统后台任务可使用CPU配额 +- name: enable_sys_table_ddl + require: false + type: BOOL + default: false + min_value: NULL + max_value: NULL + section: ROOT_SERVICE + need_restart: false + description_en: specifies whether a system table is allowed be to created manually. + description_local: 是否允许新建和修改系统表。主要在系统升级过程中使用。 +- name: merge_thread_count + require: false + type: INT + default: 0 + min_value: 0 + max_value: 256 + section: OBSERVER + need_restart: false + description_en: worker thread num for compaction + description_local: 用于合并的线程数 +- name: net_thread_count + require: false + type: INT + default: 0 + min_value: 0 + max_value: 128 + section: OBSERVER + need_restart: true + description_en: the number of rpc/mysql I/O threads for Libeasy. + description_local: 网络IO线程数 +- name: max_stale_time_for_weak_consistency + require: false + type: TIME + default: 5s + min_value: 5s + max_value: NULL + section: OBSERVER + need_restart: false + description_en: the max data stale time that observer can provide service when its parent is invalid. 
+ description_local: 弱一致性读允许读到多旧的数据 +- name: backup_log_archive_option + require: false + type: SAFE_STRING + default: OPTIONAL + min_value: NULL + max_value: NULL + section: OBSERVER + need_restart: false + description_en: backup log archive option, support MANDATORY/OPTIONAL, COMPRESSION + description_local: 日志备份的参数 +- name: backup_concurrency + require: false + type: INT + default: 0 + min_value: 0 + max_value: 100 + section: OBSERVER + need_restart: false + description_en: backup concurrency limit. + description_local: observer备份基线的并发度 +- name: balancer_log_interval + require: false + type: TIME + default: 1m + min_value: 1s + max_value: NULL + section: LOAD_BALANCE + need_restart: false + description_en: the time interval between logging the load-balancing tasks statistics. + description_local: 负载均衡等后台任务线程打印统计日志的间隔时间 +- name: restore_concurrency + require: false + type: INT + default: 0 + min_value: 0 + max_value: 512 + section: OBSERVER + need_restart: false + description_en: the current work thread num of restore macro block. + description_local: 从备份恢复租户数据时最大并发度 +- name: micro_block_merge_verify_level + require: false + type: INT + default: 2 + min_value: 0 + max_value: 3 + section: OBSERVER + need_restart: false + description_en: specify what kind of verification should be done when merging micro block. 0, no verification will be done; 1, verify encoding algorithm, encoded micro block will be read to ensure data is correct; 2, verify encoding and compression algorithm, besides encoding verification, compressed block will be decompressed to ensure data is correct; 3, verify encoding, compression algorithm and lost write protect + + description_local: 控制合并时宏块的校验级别 +- name: bf_cache_miss_count_threshold + require: false + type: INT + default: 100 + min_value: 0 + max_value: NULL + section: CACHE + need_restart: false + description_en: bf cache miss count threshold, 0 means disable bf cache + description_local: 用于控制bloomfilter cache的触发次数,当宏块未命中次数达到这个值时,给创建bloomfilter缓存。0表示关闭。 +- name: weak_read_version_refresh_interval + require: false + type: TIME + default: 50ms + min_value: 0ms + max_value: NULL + section: OBSERVER + need_restart: false + description_en: the time interval to refresh cluster weak read version + description_local: 弱一致性读版本号的刷新周期,影响弱一致性读数据的延时;值为0时,表示不再刷新弱一致性读版本号,不提供单调读功能 +- name: large_query_worker_percentage + require: false + type: DOUBLE + default: 30 + min_value: 0 + max_value: 100 + section: TENANT + need_restart: false + description_en: the percentage of the workers reserved to serve large query request. + description_local: 预留给大查询处理的工作线程百分比 +- name: clog_transport_compress_all + require: false + type: BOOL + default: false + min_value: NULL + max_value: NULL + section: TRANS + need_restart: false + description_en: If this option is set to true, use compression for clog transport. 
The default is false(no compression) + description_local: 事务日志传输时是否压缩 +- name: flush_log_at_trx_commit + require: false + type: INT + default: 1 + min_value: 0 + max_value: 2 + section: TRANS + need_restart: false + description_en: 0 means commit transactions without waiting clog write to buffer cache, 1 means commit transactions after clog flush to disk, 2 means commit transactions after clog write to buffer cache + description_local: 事务提交时写事务日志策略。0表示不等待日志写入缓冲区,1表示等待日志写入磁盘,2表示等待日志写入缓冲区而不等落盘 +- name: global_major_freeze_residual_memory + require: false + type: INT + default: 40 + min_value: 1 + max_value: 99 + section: OBSERVER + need_restart: false + description_en: post global major freeze when observer memsotre free memory(plus memory hold by frozen memstore and blockcache) reach this limit. limit calc by memory_limit * (1 - system_memory_percentage/100) * global_major_freeze_residual_memory/100 + description_local: 当剩余内存小于这个百分比时,触发全局冻结 +- name: enable_sql_audit + require: false + type: BOOL + default: true + min_value: NULL + max_value: NULL + section: OBSERVER + need_restart: false + description_en: specifies whether SQL audit is turned on. + description_local: SQL审计功能开关 +- name: merger_switch_leader_duration_time + require: false + type: TIME + default: 3m + min_value: 0s + max_value: 30m + section: ROOT_SERVICE + need_restart: false + description_en: switch leader duration time for daily merge. + description_local: 合并时,批量切主的时间间隔 +- name: enable_record_trace_log + require: false + type: BOOL + default: false + min_value: NULL + max_value: NULL + section: OBSERVER + need_restart: false + description_en: specifies whether to always record the trace log. + description_local: 是否记录追踪日志 +- name: sys_bkgd_migration_change_member_list_timeout + require: false + type: TIME + default: 1h + min_value: 0s + max_value: 24h + section: OBSERVER + need_restart: false + description_en: the timeout for migration change member list retry. + description_local: 副本迁移时变更Paxos成员组操作的超时时间 +- name: rootservice_list + require: false + type: SAFE_STRING_LIST + default: + min_value: NULL + max_value: NULL + section: OBSERVER + need_restart: false + description_en: a list of servers which contains rootservice + description_local: RootService及其副本所在的机器列表 +- name: enable_syslog_wf + require: false + type: BOOL + default: true + min_value: NULL + max_value: NULL + section: OBSERVER + need_restart: false + description_en: specifies whether any log message with a log level higher than WARN would be printed into a separate file with a suffix of wf + description_local: 是否把WARN以上级别的系统日志打印到一个单独的日志文件中 +- name: global_index_build_single_replica_timeout + require: false + type: TIME + default: 48h + min_value: 1h + max_value: NULL + section: ROOT_SERVICE + need_restart: false + description_en: build single replica task timeout when rootservice schedule to build global index. 
+ description_local: 建全局索引时,每个副本构建的超时时间 +- name: memstore_limit_percentage + require: false + type: INT + default: 50 + min_value: 1 + max_value: 99 + section: TENANT + need_restart: false + description_en: used in calculating the value of MEMSTORE_LIMIT + description_local: 租户用于memstore的内存占其总可用内存的百分比 +- name: minor_deferred_gc_time + require: false + type: TIME + default: 0s + min_value: 0s + max_value: 24h + section: OBSERVER + need_restart: false + description_en: sstable deferred gc time after merge + description_local: 合并之后SSTable延迟回收间隔 +- name: data_disk_usage_limit_percentage + require: false + type: INT + default: 90 + min_value: 50 + max_value: 100 + section: OBSERVER + need_restart: false + description_en: the safe use percentage of data disk + description_local: 数据文件最大可以写入的百分比,超过这个阈值后,禁止数据迁入 +- name: enable_perf_event + require: false + type: BOOL + default: true + min_value: NULL + max_value: NULL + section: OBSERVER + need_restart: false + description_en: specifies whether to enable perf event feature. + description_local: perf event调试特性开关 +- name: obconfig_url + require: false + type: STRING + default: + min_value: NULL + max_value: NULL + section: OBSERVER + need_restart: false + description_en: URL for OBConfig service + description_local: OBConfig服务的URL地址 +- name: cpu_quota_concurrency + require: false + type: DOUBLE + default: 4 + min_value: 1 + max_value: 10 + section: TENANT + need_restart: false + description_en: max allowed concurrency for 1 CPU quota + description_local: 租户每个CPU配额允许的最大并发数 +- name: zone_merge_order + require: false + type: SAFE_STRING + default: + min_value: NULL + max_value: NULL + section: DAILY_MERGE + need_restart: false + description_en: the order of zone start merge in daily merge + description_local: 轮转合并的时候,多个Zone的顺序。不指定的时候,由系统自动决定。 +- name: backup_recovery_window + require: false + type: TIME + default: 0 + min_value: 0 + max_value: NULL + section: OBSERVER + need_restart: false + description_en: backup expired day limit, 0 means not expired + description_local: 恢复窗口大小 +- name: default_row_format + require: false + type: SAFE_STRING + default: compact + min_value: NULL + max_value: NULL + section: OBSERVER + need_restart: false + description_en: default row format in mysql mode + description_local: MySQL模式下,建表时使用的默认行格式 +- name: stack_size + require: false + type: CAPACITY_MB + default: 1M + min_value: 512K + max_value: 20M + section: OBSERVER + need_restart: true + description_en: the size of routine execution stack + description_local: 程序函数调用栈大小 +- name: balancer_idle_time + require: false + type: TIME + default: 5m + min_value: 10s + max_value: NULL + section: LOAD_BALANCE + need_restart: false + description_en: the time interval between the schedules of the partition load-balancing task. + description_local: 负载均衡等后台任务线程空闲时的唤醒间隔时间 +- name: memory_limit + name_local: 最大运行内存 + require: false + essential: true + type: CAPACITY_MB + default: 0 + min_value: NULL + max_value: NULL + modify_limit: decrease + section: OBSERVER + need_restart: false + description_en: the size of the memory reserved for internal use(for testing purpose). 
Please enter an capacity, such as 8G + description_local: 可用总内存大小。请输入带容量带单位的整数,如8G +- name: system_memory + name_local: 集群系统内存 + essential: true + type: CAPACITY_MB + default: 30G + min_value: 0M + max_value: NULL + section: OBSERVER + need_restart: false + description_en: the memory reserved for internal use which cannot be allocated to any outer-tenant, and should be determined to guarantee every server functions normally. Please enter an capacity, such as 2G + description_local: 系统预留内存大小,不能分配给普通租户使用。请输入带容量带单位的整数,如2G +- name: __min_full_resource_pool_memory + require: true + type: INT + default: 2147483648 + min_value: 1073741824 + max_value: NULL + need_restart: false + description_en: the minimum memory limit of the resource pool + description_local: 资源池最小内存限制 +- name: virtual_table_location_cache_expire_time + require: false + type: TIME + default: 8s + min_value: 1s + max_value: NULL + section: LOCATION_CACHE + need_restart: false + description_en: expiration time for virtual table location info in partiton location cache. + description_local: 虚拟表的位置信息缓存过期时间 +- name: ssl_external_kms_info + require: false + type: SAFE_STRING + default: + min_value: NULL + max_value: NULL + section: OBSERVER + need_restart: false + description_en: when using the external key management center for ssl, this parameter will store some key management information + description_local: 配置ssl使用的主密钥管理服务 +- name: enable_sql_operator_dump + require: false + type: BOOL + default: true + min_value: NULL + max_value: NULL + section: OBSERVER + need_restart: false + description_en: specifies whether sql operators (sort/hash join/material/window function/interm result/...) allowed to write to disk + description_local: 是否允许SQL处理过程的中间结果写入磁盘以释放内存 +- name: enable_rich_error_msg + require: false + type: BOOL + default: false + min_value: NULL + max_value: NULL + section: OBSERVER + need_restart: false + description_en: specifies whether add ip:port, time and trace id to user error message. + description_local: 是否在客户端消息中添加服务器地址、时间、追踪ID等调试信息 +- name: log_archive_concurrency + require: false + type: INT + default: 0 + min_value: 0 + max_value: NULL + section: OBSERVER + need_restart: false + description_en: concurrency for log_archive_sender and log_archive_spiter + description_local: 日志归档并发度 +- name: server_balance_disk_tolerance_percent + require: false + type: INT + default: 1 + min_value: 1 + max_value: 100 + section: LOAD_BALANCE + need_restart: false + description_en: specifies the tolerance (in percentage) of the unbalance of the disk space utilization among all servers. The average disk space utilization is calculated by dividing the total space by the number of servers. server balancer will start a rebalancing task when the deviation between the average usage and some server load is greater than this tolerance + description_local: 节点负载均衡策略中,磁盘资源不均衡的容忍度 +- name: user_tab_col_stat_cache_priority + require: false + type: INT + default: 1 + min_value: 1 + max_value: NULL + section: CACHE + need_restart: false + description_en: user tab col stat cache priority + description_local: 统计数据缓存在缓存系统中的优先级 +- name: recyclebin_object_expire_time + require: false + type: TIME + default: 0s + min_value: 0s + max_value: NULL + section: ROOT_SERVICE + need_restart: false + description_en: recyclebin object expire time, default 0 that means auto purge recyclebin off. 
+ description_local: 回收站对象的有效期,超过有效的对象将被回收;0表示关闭回收功能; +- name: minor_warm_up_duration_time + require: false + type: TIME + default: 30s + min_value: 0s + max_value: 60m + section: OBSERVER + need_restart: false + description_en: warm up duration time for minor freeze. + description_local: 小合并产生新转储文件的预热时间 +- name: migrate_concurrency + require: false + type: INT + default: 10 + min_value: 0 + max_value: 64 + section: OBSERVER + need_restart: false + description_en: set concurrency of migration, set upper limit to migrate_concurrency and set lower limit to migrate_concurrency/2 + description_local: 控制内部数据迁移的并发度 +- name: redundancy_level + require: false + type: SAFE_STRING + default: NORMAL + min_value: NULL + max_value: NULL + section: SSTABLE + need_restart: false + description_en: EXTERNAL, use extrernal redundancy; NORMAL, tolerate one disk failure, HIGH tolerate two disk failure if disk count is enough + description_local: OB内置本地磁盘RAID特性。暂勿使用 +- name: trx_2pc_retry_interval + require: false + type: TIME + default: 100ms + min_value: 1ms + max_value: 5000ms + section: TRANS + need_restart: false + description_en: the time interval between the retries in case of failure during a transactions two-phase commit phase + description_local: 两阶段提交失败时候自动重试的间隔 +- name: cpu_count + name_local: 系统CPU总数 + require: false + essential: true + type: INT + default: 0 + min_value: 0 + max_value: NULL + section: OBSERVER + need_restart: true + description_en: the number of CPUs in the system. If this parameter is set to zero, the number will be set according to sysconf; otherwise, this parameter is used. + description_local: 系统CPU总数,如果设置为0,将自动检测 +- name: devname + name_local: 网卡名 + essential: true + type: SAFE_STRING + min_value: NULL + max_value: NULL + need_restart: true + description_en: name of network adapter + description_local: 非必填, 服务进程绑定的网卡设备名, 默认通过配置的ip设置local_ip, 如果预检查失败可通过配置此项来指定网卡 +- name: local_ip + name_local: 本机ip + type: SAFE_STRING + min_value: NULL + max_value: NULL + need_restart: true + description_en: local ip address + description_local: 本机ip地址 +- name: appname + require: false + type: SAFE_STRING + default: obcluster + min_value: NULL + max_value: NULL + section: OBSERVER + need_redeploy: true + description_en: Name of the cluster + description_local: 本OceanBase集群名 +- name: use_large_pages + require: false + type: SAFE_STRING + default: false + min_value: NULL + max_value: NULL + section: OBSERVER + need_restart: true + description_en: used to manage the databases use of large pages, values are false, true, only + description_local: 控制内存大页的行为,"true"表示在操作系统开启内存大页并且有空闲大页时,数据库总是申请内存大页,否则申请普通内存页, "false"表示数据库不使用大页, "only"表示数据库总是分配大页 +- name: dtl_buffer_size + require: false + type: CAPACITY_MB + default: 64K + min_value: 4K + max_value: 2M + section: OBSERVER + need_restart: false + description_en: buffer size for DTL + description_local: SQL数据传输模块使用的缓存大小 +- name: server_balance_critical_disk_waterlevel + require: false + type: INT + default: 80 + min_value: 0 + max_value: 100 + section: LOAD_BALANCE + need_restart: false + description_en: disk water level to determine server balance strategy + description_local: 磁盘水位线超过该阈值时,负载均衡策略将倾向于优先考虑磁盘均衡 +- name: location_fetch_concurrency + require: false + type: INT + default: 20 + min_value: 1 + max_value: 1000 + section: LOCATION_CACHE + need_restart: false + description_en: the maximum number of the tasks which fetch the partition location information concurrently. 
+ description_local: 位置缓存信息刷新的最大并发度 +- name: enable_async_syslog + require: false + type: BOOL + default: true + min_value: NULL + max_value: NULL + section: OBSERVER + need_restart: false + description_en: specifies whether use async syslog + description_local: 是否启用系统日志异步写 +- name: clog_sync_time_warn_threshold + require: false + type: TIME + default: 1s + min_value: 1ms + max_value: 10000ms + section: TRANS + need_restart: false + description_en: the time given to the commit log synchronization between a leader and its followers before a warning message is printed in the log file. + description_local: 事务日志同步耗时告警阈值,同步耗时超过该值产生WARN日志 +- name: location_cache_cpu_quota + require: false + type: DOUBLE + default: 5 + min_value: 0 + max_value: 10 + section: TENANT + need_restart: false + description_en: the number of vCPUs allocated for the requests regarding location info of the core tables. + description_local: 位置缓存模块使用的CPU配额 +- name: bf_cache_priority + require: false + type: INT + default: 1 + min_value: 1 + max_value: NULL + section: CACHE + need_restart: false + description_en: bloomfilter cache priority + description_local: 布隆过滤器占用缓存的优先级 +- name: merger_check_interval + require: false + type: TIME + default: 10m + min_value: 10s + max_value: 60m + section: DAILY_MERGE + need_restart: false + description_en: the time interval between the schedules of the task that checks on the progress of MERGE for each zone. + description_local: 合并状态检查线程的调度间隔 +- name: enable_rootservice_standalone + require: false + type: BOOL + default: false + min_value: NULL + max_value: NULL + section: ROOT_SERVICE + need_restart: false + description_en: specifies whether the SYS tenant is allowed to occupy an observer exclusively, thus running in the standalone mode. + description_local: 是否让系统租户和RootService独占observer节点 +- name: px_workers_per_cpu_quota + require: false + type: INT + default: 10 + min_value: 0 + max_value: 20 + section: TENANT + need_restart: false + description_en: the ratio between the number of system allocated px workers vs the maximum number of threads that can be scheduled concurrently. + description_local: 并行执行工作线程数的比例 +- name: large_query_threshold + require: false + type: TIME + default: 100ms + min_value: 1ms + max_value: NULL + section: TENANT + need_restart: false + description_en: threshold for execution time beyond which a request may be paused and rescheduled as large request + description_local: 一个查询执行时间超过该阈值会被判断为大查询,执行大查询调度策略 +- name: sys_bkgd_net_percentage + require: false + type: INT + default: 60 + min_value: 0 + max_value: 100 + section: OBSERVER + need_restart: false + description_en: the net percentage of sys background net. + description_local: 后台系统任务可占用网络带宽百分比 +- name: fuse_row_cache_priority + require: false + type: INT + default: 1 + min_value: 1 + max_value: NULL + section: CACHE + need_restart: false + description_en: fuse row cache priority + description_local: 融合行缓存在缓存系统中的优先级 +- name: rpc_timeout + require: false + type: TIME + default: 2s + min_value: NULL + max_value: NULL + section: RPC + need_restart: false + description_en: the time during which a RPC request is permitted to execute before it is terminated + description_local: 集群内部请求的超时时间 +- name: tenant_task_queue_size + require: false + type: INT + default: 65536 + min_value: 1024 + max_value: NULL + section: OBSERVER + need_restart: false + description_en: the size of the task queue for each tenant. 
+ description_local: 每个租户的请求队列大小 +- name: resource_soft_limit + require: false + type: INT + default: 50 + min_value: 1 + max_value: 10000 + section: LOAD_BALANCE + need_restart: false + description_en: Used along with resource_hard_limit in unit allocation. If server utilization is less than resource_soft_limit, a policy of best fit will be used for unit allocation; otherwise, a least loadpolicy will be employed. Ultimately,system utilization should not be large than resource_hard_limit. + description_local: 当所有节点的资源水位低于该阈值时,不执行负载均衡 +- name: plan_cache_evict_interval + require: false + type: TIME + default: 1s + min_value: 0s + max_value: NULL + section: TENANT + need_restart: false + description_en: time interval for periodic plan cache eviction. + description_local: 执行计划缓存的淘汰间隔 +- name: server_balance_cpu_mem_tolerance_percent + require: false + type: INT + default: 5 + min_value: 1 + max_value: 100 + section: LOAD_BALANCE + need_restart: false + description_en: specifies the tolerance (in percentage) of the unbalance of the cpu/memory utilization among all servers. The average cpu/memory utilization is calculated by dividing the total cpu/memory by the number of servers. server balancer will start a rebalancing task when the deviation between the average usage and some server load is greater than this tolerance + description_local: 节点负载均衡策略中,CPU和内存资源不均衡的容忍度 +- name: autoinc_cache_refresh_interval + require: false + type: TIME + default: 3600s + min_value: 100ms + max_value: NULL + section: OBSERVER + need_restart: false + description_en: auto-increment service cache refresh sync_value in this interval + description_local: 自动刷新自增列值的时间间隔 +- name: all_server_list + require: false + type: SAFE_STRING + default: + min_value: NULL + max_value: NULL + section: LOCATION_CACHE + need_restart: false + description_en: all server addr in cluster + description_local: 集群中所有机器的列表,不建议人工修改 +- name: enable_rebalance + require: false + type: BOOL + default: true + min_value: NULL + max_value: NULL + section: LOAD_BALANCE + need_restart: false + description_en: specifies whether the partition load-balancing is turned on. + description_local: 自动负载均衡开关 +- name: internal_sql_execute_timeout + require: false + type: TIME + default: 30s + min_value: 1000us + max_value: 10m + section: OBSERVER + need_restart: false + description_en: the number of microseconds an internal DML request is permitted to execute before it is terminated. + description_local: 系统内部SQL请求的超时时间 +- name: user_row_cache_priority + require: false + type: INT + default: 1 + min_value: 1 + max_value: NULL + section: CACHE + need_restart: false + description_en: user row cache priority + description_local: 基线数据行缓存在缓存系统中的优先级 +- name: server_permanent_offline_time + require: false + type: TIME + default: 3600s + min_value: 20s + max_value: NULL + section: ROOT_SERVICE + need_restart: false + description_en: the time interval between any two heartbeats beyond which a server is considered to be permanently offline. + description_local: 节点心跳中断多久后认为其被“永久下线”,“永久下线”的节点上的数据副本需要被自动补足 +- name: schema_history_expire_time + require: false + type: TIME + default: 7d + min_value: 1m + max_value: 30d + section: OBSERVER + need_restart: false + description_en: the hour of expire time for schema history + description_local: 元数据历史数据过期时间 +- name: datafile_disk_percentage + require: false + type: INT + min_value: 0 + max_value: 99 + modify_limit: decrease + section: SSTABLE + need_restart: false + description_en: the percentage of disk space used by the data files. 
+ description_local: data_dir所在磁盘将被OceanBase系统初始化用于存储数据,本配置项表示占用该磁盘总空间的百分比 +- name: default_compress_func + require: false + type: SAFE_STRING + default: zstd_1.3.8 + min_value: NULL + max_value: NULL + section: OBSERVER + need_restart: false + description_en: default compress function name for create new table + description_local: MySQL模式下,建表时使用的默认压缩算法 +- name: memory_chunk_cache_size + require: false + type: CAPACITY_MB + default: 0M + min_value: 0M + max_value: NULL + section: OBSERVER + need_restart: false + description_en: the maximum size of memory cached by memory chunk cache. + description_local: 内存分配器缓存的内存块容量。值为0的时候表示系统自适应。 +- name: ob_event_history_recycle_interval + require: false + type: TIME + default: 7d + min_value: 1d + max_value: 180d + section: ROOT_SERVICE + need_restart: false + description_en: the time to recycle event history. + description_local: OB事件表中事件条目的保存期限 +- name: enable_ddl + require: false + type: BOOL + default: true + min_value: NULL + max_value: NULL + section: ROOT_SERVICE + need_restart: false + description_en: specifies whether DDL operation is turned on. + description_local: 是否允许执行DDL +- name: balance_blacklist_failure_threshold + require: false + type: INT + default: 5 + min_value: 0 + max_value: 1000 + section: LOAD_BALANCE + need_restart: false + description_en: a balance task failed count to be putted into blacklist + description_local: 副本迁移等后台任务连续失败超过该阈值后,将被放入黑名单 +- name: wait_leader_batch_count + require: false + type: INT + default: 1024 + min_value: 128 + max_value: 5000 + section: ROOT_SERVICE + need_restart: false + description_en: leader batch count everytime leader coordinator wait. + description_local: RootService发送切主命令的批次大小 +- name: proxyro_password + require: false + type: STRING + default: '' + min_value: NULL + max_value: NULL + need_restart: false + description_en: password of observer proxyro user + description_local: proxyro用户的密码 +- name: root_password + require: false + type: STRING + default: '' + min_value: NULL + max_value: NULL + need_restart: false + description_en: password of observer root user + description_local: sys租户root用户的密码 +# todo: 等文档更新 +- name: sql_login_thread_count + require: false + type: INT + default: 0 + min_value: 0 + max_value: 32 + section: OBSERVER + need_restart: false + need_redeploy: false + description_en: 'the number of threads for sql login request. Range: [0, 32] in integer, 0 stands for use default thread count defined in TG.the default thread count for login request in TG is normal:6 mini-mode:2' + description_local: '' +- name: tcp_keepcnt + require: false + type: INT + default: 10 + min_value: 1 + max_value: NULL + section: OBSERVER + need_restart: false + need_redeploy: false + description_en: 'The maximum number of keepalive probes TCP should send before dropping the connection. Take effect for new established connections. Range: [1,+∞)' + description_local: 关闭一个非活跃连接之前的最大重试次数。 +- name: tcp_keepintvl + require: false + type: TIME + default: 6s + min_value: 1s + max_value: NULL + section: OBSERVER + need_restart: false + need_redeploy: false + description_en: 'The time (in seconds) between individual keepalive probes. Take effect for new established connections. 
Range: [1s, +∞]'
+ description_local: 开启客户端连接的探活机制后,前后两次探测之间的时间间隔,单位为秒。
+- name: tcp_keepidle
+ require: false
+ type: TIME
+ default: 7200s
+ min_value: 1s
+ max_value: NULL
+ section: OBSERVER
+ need_restart: false
+ need_redeploy: false
+ description_en: 'The time (in seconds) the connection needs to remain idle before TCP starts sending keepalive probe. Take effect for new established connections. Range: [1s, +∞]'
+ description_local: 客户端连接上服务器后,如果没有数据发送,多久后会发送 Keepalive 探测分组,单位为秒。
+- name: enable_tcp_keepalive
+ require: false
+ type: BOOL
+ default: true
+ min_value: NULL
+ max_value: NULL
+ section: OBSERVER
+ need_restart: false
+ need_redeploy: false
+ description_en: enable TCP keepalive for the TCP connection of sql protocol. Take effect for new established connections.
+ description_local: 开启或关闭客户端连接的探活机制。
+- name: ob_ratelimit_stat_period
+ require: false
+ type: TIME
+ default: 1s
+ min_value: 100ms
+ max_value: NULL
+ section: OBSERVER
+ need_restart: false
+ need_redeploy: false
+ description_en: "the time interval to update observer's maximum bandwidth to a certain region. "
+ description_local: OBServer 计算和更新最大带宽的时间间隔。
+- name: enable_ob_ratelimit
+ require: false
+ type: BOOL
+ default: false
+ min_value: NULL
+ max_value: NULL
+ section: OBSERVER
+ need_restart: false
+ need_redeploy: false
+ description_en: enable ratelimit between regions for RPC connection.
+ description_local: 开启或关闭 Region 之间 RPC 连接的流量控制。
+- name: schema_history_recycle_interval
+ require: false
+ type: TIME
+ default: 10m
+ min_value: 0s
+ max_value: NULL
+ section: LOAD_BALANCE
+ need_restart: false
+ need_redeploy: false
+ description_en: 'the time interval between the schedules of schema history recycle task. Range: [0s, +∞)'
+ description_local: 系统内部执行 schema 多版本记录回收任务的时间间隔。
+- name: backup_data_file_size
+ require: false
+ type: CAPACITY_MB
+ default: 4G
+ min_value: 512M
+ max_value: 4G
+ section: OBSERVER
+ need_restart: false
+ need_redeploy: false
+ description_en: 'backup data file size. Range: [512M, 4G] in integer'
+ description_local: 备份数据文件的容量。
+- name: data_storage_error_tolerance_time
+ require: false
+ type: TIME
+ default: 300s
+ min_value: 10s
+ max_value: 7200s
+ section: OBSERVER
+ need_restart: false
+ need_redeploy: false
+ description_en: time to tolerate disk read failure, after that, the disk status will be set error. Range [10s,7200s]. The default value is 300s
+ description_local: 数据盘状态设为 ERROR 状态的容忍时间。
+- name: data_storage_warning_tolerance_time
+ require: false
+ type: TIME
+ default: 30s
+ min_value: 10s
+ max_value: 300s
+ section: OBSERVER
+ need_restart: false
+ need_redeploy: false
+ description_en: time to tolerate disk read failure, after that, the disk status will be set warning. Range [10s,300s]. The default value is 30s
+ description_local: 数据盘状态设为 WARNING 状态的容忍时间。
+- name: index_block_cache_priority
+ require: false
+ type: INT
+ default: 10
+ min_value: 1
+ max_value: NULL
+ section: CACHE
+ need_restart: false
+ need_redeploy: false
+ description_en: index cache priority. Range:[1, )
+ description_local: Tablet 映射缓存优先级。
+- name: opt_tab_stat_cache_priority
+ require: false
+ type: INT
+ default: 1
+ min_value: 1
+ max_value: NULL
+ section: CACHE
+ need_restart: false
+ need_redeploy: false
+ description_en: tab stat cache priority.
Range:[1, ) + description_local: 统计信息缓存优先级。 +- name: tablet_ls_cache_priority + require: false + type: INT + default: 1000 + min_value: 1 + max_value: NULL + section: CACHE + need_restart: false + need_redeploy: false + description_en: tablet ls cache priority. Range:[1, ) + description_local: 元数据索引微块缓存优先级。 +- name: location_cache_refresh_sql_timeout + require: false + type: TIME + default: 1s + min_value: 1ms + max_value: NULL + section: LOCATION_CACHE + need_restart: false + need_redeploy: false + description_en: 'The timeout used for refreshing location cache by SQL. Range: [1ms, +∞)' + description_local: 通过 SQL 刷新 location_cache 的超时时间。 +- name: location_cache_refresh_rpc_timeout + require: false + type: TIME + default: 500ms + min_value: 1ms + max_value: NULL + section: LOCATION_CACHE + need_restart: false + need_redeploy: false + description_en: 'The timeout used for refreshing location cache by RPC. Range: [1ms, +∞)' + description_local: 通过 RPC 刷新 location_cache 的超时时间。 +- name: tablet_meta_table_check_interval + require: false + type: TIME + default: 30m + min_value: 1m + max_value: NULL + section: ROOT_SERVICE + need_restart: false + need_redeploy: false + description_en: 'the time interval that observer compares tablet meta table with local ls replica info and make adjustments to ensure the correctness of tablet meta table. Range: [1m,+∞)' + description_local: DBA_OB_TABLET_REPLICAS/CDB_OB_TABLET_REPLICAS 视图的后台巡检线程的检查间隔。 +- name: ls_meta_table_check_interval + require: false + type: TIME + default: 1s + min_value: 1ms + max_value: NULL + section: ROOT_SERVICE + need_restart: false + need_redeploy: false + description_en: 'the time interval that observer compares ls meta table with local ls replica info and make adjustments to ensure the correctness of ls meta table. Range: [1ms,+∞)' + description_local: DBA_OB_LS_LOCATIONS/CDB_OB_LS_LOCATIONS 视图的后台巡检线程的检查间隔。 +- name: tablet_meta_table_scan_batch_count + require: false + type: INT + default: 999 + min_value: 1 + max_value: 65536 + section: ROOT_SERVICE + need_restart: false + need_redeploy: false + description_en: the number of tablet replica info that will be read by each request on the tablet-related system tables during procedures such as load-balancing, daily merge, election and etc. Range:(0,65536] + description_local: Tablet meta table 迭代器使用过程中在内存里缓存的 Tablet 数量。 +- name: rdma_io_thread_count + require: false + type: INT + default: 0 + min_value: 0 + max_value: 8 + section: OBSERVER + need_restart: true + need_redeploy: false + description_en: 'the number of RDMA I/O threads for Libreasy. Range: [0, 8] in integer, 0 stands for RDMA being disabled.' + description_local: Libreasy 的 RDMA I/O 线程数。 +- name: production_mode + require: true + type: BOOL + default: true + min_value: NULL + max_value: NULL + section: OBSERVER + need_restart: false + need_redeploy: false + description_en: Production mode switch, default True. 
Adjust the memory_limit and __min_full_resource_pool_memory The lower bound of memory is adjusted to 16G and 2147483648 + description_local: 生产模式开关, 默认开启。开启后调整memory limit 和 __min_full_resource_pool_memory 下界调整为 16G 和 2147483648 +- name: ocp_meta_tenant + require: false + type: DICT + default: + tenant_name: ocp + max_cpu: 1 + memory_size: 2147483648 + need_redeploy: true + description_en: The tenant specifications for ocp meta db + description_local: ocp express的元数据库使用的租户规格 +- name: ocp_meta_tenant_max_cpu + name_local: OCP express元数据库租户的CPU数 + essential: true + require: false + type: INT + default: 1 + need_redeploy: true + description_en: The tenant cpu count for ocp meta db + description_local: ocp express的元数据库使用的CPU数量 +- name: ocp_meta_tenant_memory_size + name_local: OCP express元数据库租户内存 + essential: true + require: false + type: CAPACITY_MB + default: 2G + need_redeploy: true + description_en: The tenant memory size for ocp meta db + description_local: ocp express的元数据库使用的租户内存大小 +- name: ocp_meta_tenant_log_disk_size + name_local: OCP express元数据库租户日志磁盘大小 + essential: true + require: false + type: CAPACITY_MB + default: 6656M + need_redeploy: true + description_en: The tenant log disk size for ocp meta db + description_local: ocp express的元数据库使用的租户日志磁盘大小 +- name: ocp_meta_db + require: false + type: SAFE_STRING + default: ocp_express + need_redeploy: true + description_en: The database name for ocp meta db + description_local: ocp express的元数据库使用的数据库名 +- name: ocp_meta_username + require: false + type: SAFE_STRING + default: meta + need_redeploy: true + description_en: The database name for ocp meta db + description_local: ocp express的元数据库使用的数据库名 +- name: ocp_meta_password + require: false + type: STRING + default: oceanbase + need_redeploy: true + description_en: The database name for ocp meta db + description_local: ocp express的元数据库使用的数据库名 +- name: ocp_agent_monitor_password + require: false + type: STRING + default: '' + need_redeploy: true + description_en: The password for obagent monitor user + description_local: obagent 监控用户的密码 +- name: ocp_monitor_tenant + require: false + type: DICT + default: + tenant_name: ocp_monitor + max_cpu: 1 + memory_size: 2147483648 + need_redeploy: true + description_en: The tenant specifications for ocp monitor db + description_local: ocp 的监控数据库使用的租户定义 +- name: ocp_monitor_tenant_max_cpu + name_local: OCP 监控数据库租户的CPU数 + essential: true + require: false + type: INT + default: 1 + need_redeploy: true + description_en: The tenant cpu count for ocp monitor db + description_local: ocp 监控数据库使用的CPU数量 +- name: ocp_monitor_tenant_memory_size + name_local: OCP 监控数据库租户内存 + essential: true + require: false + type: CAPACITY_MB + default: 2G + need_redeploy: true + description_en: The tenant memory size for ocp monitor db + description_local: ocp 监控数据库使用的租户内存大小 +- name: ocp_monitor_tenant_log_disk_size + name_local: OCP 监控数据库租户日志磁盘大小 + essential: true + require: false + type: CAPACITY_MB + default: 6656M + need_redeploy: true + description_en: The tenant log disk size for ocp monitor db + description_local: ocp 监控数据库使用的租户日志磁盘大小 +- name: ocp_monitor_db + require: false + type: SAFE_STRING + default: ocp_monitor + need_redeploy: true + description_en: The database name for ocp monitor db + description_local: ocp 的监控数据库使用的数据库名 +- name: ocp_monitor_username + require: false + type: SAFE_STRING + default: monitor + need_redeploy: true + description_en: The user name for ocp meta db + description_local: ocp 的监控数据库使用的用户名 +- name: ocp_monitor_password + require: false + 
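A note on the schema used by these entries: flags such as need_restart, need_redeploy and essential are metadata that the deploy tool can consult when a value changes. The sketch below is a minimal, hypothetical illustration of that idea, not obd's actual implementation; only the parameter names are taken from the entries above, everything else is made up for the example.

    # Illustrative only: classify changed parameters by the metadata flags declared above.
    def classify_changes(param_meta, changes):
        # param_meta: list of dicts mirroring the YAML entries (name, need_restart, need_redeploy)
        # changes: mapping of parameter name -> new value
        meta = {p['name']: p for p in param_meta}
        reloadable, need_restart, need_redeploy = [], [], []
        for name in changes:
            entry = meta.get(name, {})
            if entry.get('need_redeploy'):
                need_redeploy.append(name)
            elif entry.get('need_restart'):
                need_restart.append(name)
            else:
                reloadable.append(name)
        return reloadable, need_restart, need_redeploy

    # ocp_meta_tenant_memory_size is declared need_redeploy, rdma_io_thread_count need_restart,
    # and enable_ddl can be applied online.
    print(classify_changes(
        [{'name': 'ocp_meta_tenant_memory_size', 'need_redeploy': True},
         {'name': 'rdma_io_thread_count', 'need_restart': True},
         {'name': 'enable_ddl'}],
        {'ocp_meta_tenant_memory_size': '4G', 'rdma_io_thread_count': 2, 'enable_ddl': False}))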
type: STRING + default: oceanbase + need_redeploy: true + description_en: The password for ocp monitor db + description_local: ocp 的监控数据库使用的密码 diff --git a/plugins/oceanbase/4.2.1.4/restart.py b/plugins/oceanbase/4.2.1.4/restart.py new file mode 100644 index 0000000..138a7ba --- /dev/null +++ b/plugins/oceanbase/4.2.1.4/restart.py @@ -0,0 +1,314 @@ +# coding: utf-8 +# OceanBase Deploy. +# Copyright (C) 2021 OceanBase +# +# This file is part of OceanBase Deploy. +# +# OceanBase Deploy is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# OceanBase Deploy is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with OceanBase Deploy. If not, see . + + +from __future__ import absolute_import, division, print_function + +import os +import time + + +class Restart(object): + + def __init__(self, plugin_context, local_home_path, start_plugin, reload_plugin, stop_plugin, connect_plugin, display_plugin, repository, new_cluster_config=None, new_clients=None): + self.local_home_path = local_home_path + + self.namespace = plugin_context.namespace + self.namespaces = plugin_context.namespaces + self.deploy_name = plugin_context.deploy_name + self.deploy_status = plugin_context.deploy_status + self.repositories = plugin_context.repositories + self.plugin_name = plugin_context.plugin_name + + self.components = plugin_context.components + self.clients = plugin_context.clients + self.cluster_config = plugin_context.cluster_config + self.cmds = plugin_context.cmds + self.options = plugin_context.options + self.dev_mode = plugin_context.dev_mode + self.stdio = plugin_context.stdio + + self.plugin_context = plugin_context + self.repository = repository + self.start_plugin = start_plugin + self.reload_plugin = reload_plugin + self.connect_plugin = connect_plugin + self.stop_plugin = stop_plugin + self.display_plugin = display_plugin + self.new_clients = new_clients + self.new_cluster_config = new_cluster_config + self.now_clients = {} + self.sub_io = self.stdio.sub_io() + self.db = None + self.cursor = None + for server in self.cluster_config.servers: + self.now_clients[server] = self.clients[server] + + def call_plugin(self, plugin, **kwargs): + args = { + 'namespace': self.namespace, + 'namespaces': self.namespaces, + 'deploy_name': self.deploy_name, + 'deploy_status': self.deploy_status, + 'cluster_config': self.cluster_config, + 'repositories': self.repositories, + 'repository': self.repository, + 'components': self.components, + 'clients': self.clients, + 'cmd': self.cmds, + 'options': self.options, + 'stdio': self.sub_io + } + args.update(kwargs) + + self.stdio.verbose('Call %s for %s' % (plugin, self.repository)) + return plugin(**args) + + def close(self): + if self.db: + self.cursor.close() + self.cursor = None + self.db = None + + def connect(self): + if self.cursor is None or self.execute_sql('select version()', error=False) is False: + self.sub_io.start_loading('Connect to observer') + ret = self.call_plugin(self.connect_plugin) + if not ret: + self.sub_io.stop_loading('fail') + return False + self.sub_io.stop_loading('succeed') + if self.cursor: + self.close() + 
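# In the lines that follow, the cursor and connection handle are taken from the connect plugin's
# return value; 'use oceanbase' is then retried every 2 seconds until the oceanbase schema is
# reachable (i.e. the cluster is serving again), and the session ob_query_timeout is raised to
# 1000 seconds so the long-running maintenance SQL issued during a restart does not time out.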
self.cursor = ret.get_return('cursor') + self.db = ret.get_return('connect') + while self.execute_sql('use oceanbase', error=False) is False: + time.sleep(2) + self.execute_sql('set session ob_query_timeout=1000000000') + return True + + def execute_sql(self, query, args=None, one=True, error=True): + exc_level = 'error' if error else 'verbose' + if one: + result = self.cursor.fetchone(query, args, exc_level=exc_level) + else: + result = self.cursor.fetchall(query, args, exc_level=exc_level) + result and self.stdio.verbose(result) + return result + + def broken_sql(self, sql, sleep_time=3): + while True: + ret = self.execute_sql(sql, error=False) + if ret is None: + break + time.sleep(sleep_time) + + def wait(self): + if not self.connect(): + return False + self.stdio.verbose('server check') + self.broken_sql("select * from oceanbase.__all_server where status != 'active' or stop_time > 0 or start_service_time = 0") + # self.broken_sql("select * from oceanbase.__all_virtual_clog_stat where is_in_sync= 0 and is_offline = 0") + return True + + def start_zone(self, zone=None): + if not self.connect(): + return False + if zone: + self.stdio.verbose('start zone %s' % zone) + start_sql = "alter system start zone %s" % zone + check_sql = "select * from oceanbase.__all_zone where name = 'status' and zone = '%s' and info != 'ACTIVE'" % zone + while True: + if self.execute_sql(start_sql, error=False) is None: + break + if self.execute_sql(check_sql, error=False) is None: + break + time.sleep(3) + self.wait() + return True + + def stop_zone(self, zone): + if not self.wait(): + return False + + self.stdio.verbose('stop zone %s' % zone) + stop_sql = "alter system stop zone %s" % zone + check_sql = "select * from oceanbase.__all_zone where name = 'status' and zone = '%s' and info = 'ACTIVE'" % zone + while True: + if self.execute_sql(stop_sql, error=False) is None: + break + if self.execute_sql(check_sql, error=False): + break + time.sleep(3) + return True + + def rollback(self): + if self.new_clients: + self.stdio.start_loading('Rollback') + cluster_config = self.new_cluster_config if self.new_cluster_config else self.cluster_config + self.call_plugin(self.stop_plugin, clients=self.now_clients, cluster_config=cluster_config) + for server in self.cluster_config.servers: + client = self.clients[server] + new_client = self.now_clients[server] + server_config = self.cluster_config.get_server_conf(server) + chown_cmd = 'sudo chown -R %s:' % client.config.username + for key in ['home_path', 'data_dir', 'redo_dir', 'clog_dir', 'ilog_dir', 'slog_dir', '.meta', 'log_obshell']: + if key in server_config: + chown_cmd += ' %s' % server_config[key] + new_client.execute_command(chown_cmd) + self.stdio.stop_loading('succeed') + + def dir_read_check(self, client, path): + if not client.execute_command('cd %s' % path): + dirpath, name = os.path.split(path) + return self.dir_read_check(client, dirpath) and client.execute_command('sudo chmod +1 %s' % path) + return True + + def _restart(self): + clients = self.clients + if not self.call_plugin(self.stop_plugin, clients=clients): + self.stdio.stop_loading('stop_loading', 'fail') + return False + + if self.new_clients: + self.stdio.verbose('use new clients') + for server in self.cluster_config.servers: + new_client = self.new_clients[server] + server_config = self.cluster_config.get_server_conf(server) + new_client.execute_command('[ -w {dir} ] || chmod +666 {dir}'.format(dir='/tmp/obshell')) + chown_cmd = 'sudo chown -R %s:' % new_client.config.username + for key in 
['home_path', 'data_dir', 'redo_dir', 'clog_dir', 'ilog_dir', 'slog_dir', '.meta', 'log_obshell']: + if key in server_config: + chown_cmd += ' %s' % server_config[key] + if not new_client.execute_command(chown_cmd): + self.stdio.stop_loading('stop_loading', 'fail') + return False + self.dir_read_check(new_client, server_config['home_path']) + self.now_clients[server] = new_client + clients = self.new_clients + + cluster_config = self.new_cluster_config if self.new_cluster_config else self.cluster_config + if not self.call_plugin(self.start_plugin, clients=clients, cluster_config=cluster_config, local_home_path=self.local_home_path, repository=self.repository): + self.stdio.stop_loading('stop_loading', 'fail') + return False + self.close() + return True + + def rolling(self, zones_servers): + self.stdio.start_loading('Observer rotation restart') + all_servers = self.cluster_config.servers + pre_zone = None + for zone in zones_servers: + self.cluster_config.servers = zones_servers[zone] + if self.new_cluster_config: + self.new_cluster_config.servers = zones_servers[zone] + if not self.start_zone(pre_zone): + self.stdio.stop_loading('stop_loading', 'fail') + return False + while True: + for server in zones_servers[zone]: + config = self.cluster_config.get_server_conf(server) + sql = ''' + select count(*) as cnt from oceanbase.__all_tenant as a left join ( + select tenant_id, refreshed_schema_version + from oceanbase.__all_virtual_server_schema_info + where svr_ip = %s and svr_port = %s and refreshed_schema_version > 1 + ) as b on a.tenant_id = b.tenant_id + where b.tenant_id is null''' + if self.execute_sql(sql, args=(server.ip, config['rpc_port']), error=False).get('cnt'): + break + else: + break + time.sleep(3) + + while self.execute_sql("select * from oceanbase.__all_virtual_clog_stat where table_id = 1099511627777 and status != 'ACTIVE'", error=False): + time.sleep(3) + + self.stop_zone(zone) + if not self._restart(): + return False + pre_zone = zone + + if not self.start_zone(pre_zone): + self.stdio.stop_loading('stop_loading', 'fail') + return False + + self.cluster_config.servers = all_servers + if self.new_cluster_config: + self.new_cluster_config.servers = all_servers + self.stdio.stop_loading('succeed') + return True + + def un_rolling(self): + self.stdio.start_loading('Observer restart') + + if not self._restart(): + return False + + self.wait() + self.stdio.stop_loading('succeed') + return True + + def restart(self): + zones_servers = {} + all_servers = self.cluster_config.servers + if self.connect(): + self.stdio.start_loading('Server check') + servers = self.execute_sql("select * from oceanbase.__all_server", one=False, error=False) + if isinstance(servers, list) and len(self.cluster_config.servers) == len(servers): + for server in servers: + if server['status'] != 'active' or server['stop_time'] > 0 or server['start_service_time'] == 0: + break + else: + for server in self.cluster_config.servers: + config = self.cluster_config.get_server_conf_with_default(server) + zone = config['zone'] + if zone not in zones_servers: + zones_servers[zone] = [] + zones_servers[zone].append(server) + self.stdio.stop_loading('succeed') + ret = False + try: + if len(zones_servers) > 2: + ret = self.rolling(zones_servers) + else: + ret = self.un_rolling() + + if ret and self.connect(): + if self.display_plugin: + self.call_plugin(self.display_plugin, clients=self.now_clients, cluster_config=self.new_cluster_config if self.new_cluster_config else self.cluster_config, cursor=self.cursor) + if 
self.new_cluster_config: + self.call_plugin(self.reload_plugin, clients=self.now_clients, cursor=self.cursor, new_cluster_config=self.new_cluster_config, repository_dir=self.repository.repository_dir) + except Exception as e: + self.stdio.exception('Run Exception: %s' % e) + finally: + self.cluster_config.servers = all_servers + if self.new_cluster_config: + self.new_cluster_config.servers = all_servers + if not ret: + self.rollback() + return ret + + +def restart(plugin_context, local_home_path, start_plugin, reload_plugin, stop_plugin, connect_plugin, display_plugin, new_cluster_config=None, new_clients=None, rollback=False, *args, **kwargs): + repository = kwargs.get('repository') + task = Restart(plugin_context, local_home_path, start_plugin, reload_plugin, stop_plugin, connect_plugin, display_plugin, repository, new_cluster_config, new_clients) + call = task.rollback if rollback else task.restart + if call(): + plugin_context.return_true() diff --git a/plugins/oceanbase/4.2.1.4/scale_out_check.py b/plugins/oceanbase/4.2.1.4/scale_out_check.py new file mode 100644 index 0000000..d1dcd12 --- /dev/null +++ b/plugins/oceanbase/4.2.1.4/scale_out_check.py @@ -0,0 +1,57 @@ +# coding: utf-8 +# OceanBase Deploy. +# Copyright (C) 2021 OceanBase +# +# This file is part of OceanBase Deploy. +# +# OceanBase Deploy is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# OceanBase Deploy is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with OceanBase Deploy. If not, see . 
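When every observer is active, restart() above groups the servers by zone and only takes the zone-by-zone rolling path if more than two zones are present; otherwise (or if any observer is down) everything is restarted at once. A minimal sketch of that decision, using made-up host and zone data instead of obd's cluster_config objects:

    from collections import OrderedDict

    def group_by_zone(servers):
        # servers: iterable of (host, zone) pairs, in declaration order
        zones = OrderedDict()
        for host, zone in servers:
            zones.setdefault(zone, []).append(host)
        return zones

    zones = group_by_zone([('10.0.0.1', 'zone1'), ('10.0.0.2', 'zone2'), ('10.0.0.3', 'zone3')])
    # With more than two zones, one zone can be stopped and restarted at a time while the
    # remaining zones keep serving; with one or two zones the whole cluster restarts at once.
    print('rolling restart' if len(zones) > 2 else 'full restart')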
+ + +from __future__ import absolute_import, division, print_function +from const import COMP_OB, COMP_OB_CE + + +def add_plugin(component_name, plugins): + if component_name not in plugins: + plugins.append(component_name) + + +def scale_out_check(plugin_context, *args, **kwargs): + cluster_config = plugin_context.cluster_config + added_components = cluster_config.get_deploy_added_components() + be_depend = cluster_config.be_depends + plugins = [] + plugin_context.set_variable('need_bootstrap', False) + if 'obagent' in added_components and 'obagent' in be_depend: + add_plugin('generate_config', plugins) + add_plugin('connect', plugins) + add_plugin('bootstrap', plugins) + if ('obproxy-ce' in added_components and 'obproxy-ce' in be_depend or 'obproxy' in added_components and 'obproxy' in be_depend): + add_plugin('generate_config', plugins) + add_plugin('connect', plugins) + add_plugin('bootstrap', plugins) + if 'ocp-express' in added_components and 'ocp-express' in be_depend: + add_plugin('generate_config', plugins) + add_plugin('connect', plugins) + add_plugin('bootstrap', plugins) + add_plugin('create_tenant', plugins) + if cluster_config.added_servers: + add_plugin('connect', plugins) + add_plugin('bootstrap', plugins) + if (COMP_OB_CE in added_components or COMP_OB in added_components) and not cluster_config.added_servers: + plugin_context.set_variable('need_bootstrap', True) + + plugin_context.stdio.verbose('scale_out_check plugins: %s' % plugins) + plugin_context.stdio.verbose('added_components: %s' % added_components) + return plugin_context.return_true(plugins=plugins) diff --git a/plugins/oceanbase/4.2.1.4/start.py b/plugins/oceanbase/4.2.1.4/start.py new file mode 100644 index 0000000..a8b5152 --- /dev/null +++ b/plugins/oceanbase/4.2.1.4/start.py @@ -0,0 +1,289 @@ +# coding: utf-8 +# OceanBase Deploy. +# Copyright (C) 2021 OceanBase +# +# This file is part of OceanBase Deploy. +# +# OceanBase Deploy is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# OceanBase Deploy is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with OceanBase Deploy. If not, see . + + +from __future__ import absolute_import, division, print_function + +import json +import time +import requests +from copy import deepcopy + +from _errno import EC_OBSERVER_FAIL_TO_START, EC_OBSERVER_FAIL_TO_START_WITH_ERR, EC_OBSERVER_FAILED_TO_REGISTER, EC_OBSERVER_FAILED_TO_REGISTER_WITH_DETAILS, EC_OBSERVER_FAIL_TO_START_OCS + +from collections import OrderedDict + +from tool import NetUtil, ConfigUtil + + +def config_url(ocp_config_server, appname, cid): + if ocp_config_server[-1] == '?': + link_char = '' + elif ocp_config_server.find('?') == -1: + link_char = '?' 
+ else: + link_char = '&' + cfg_url = '%s%sAction=ObRootServiceInfo&ObCluster=%s' % (ocp_config_server, link_char, appname) + proxy_cfg_url = '%s%sAction=GetObProxyConfig&ObRegionGroup=%s' % (ocp_config_server, link_char, appname) + # Command that clears the URL content for the cluster + cleanup_config_url_content = '%s%sAction=DeleteObRootServiceInfoByClusterName&ClusterName=%s' % (ocp_config_server, link_char, appname) + # Command that register the cluster information to the Config URL + register_to_config_url = '%s%sAction=ObRootServiceRegister&ObCluster=%s&ObClusterId=%s' % (ocp_config_server, link_char, appname, cid) + return cfg_url, cleanup_config_url_content, register_to_config_url + + +def init_config_server(ocp_config_server, appname, cid, force_delete, stdio): + def post(url): + stdio.verbose('post %s' % url) + response = requests.post(url) + if response.status_code != 200: + raise Exception('%s status code %s' % (url, response.status_code)) + return json.loads(response.text)['Code'] + cfg_url, cleanup_config_url_content, register_to_config_url = config_url(ocp_config_server, appname, cid) + ret = post(register_to_config_url) + if ret != 200: + if not force_delete: + raise Exception('%s may have been registered in %s' % (appname, ocp_config_server)) + ret = post(cleanup_config_url_content) + if ret != 200 : + raise Exception('failed to clean up the config url content, return code %s' % ret) + if post(register_to_config_url) != 200: + return False + return cfg_url + + +class EnvVariables(object): + + def __init__(self, environments, client): + self.environments = environments + self.client = client + self.env_done = {} + + def __enter__(self): + for env_key, env_value in self.environments.items(): + self.env_done[env_key] = self.client.get_env(env_key) + self.client.add_env(env_key, env_value, True) + + def __exit__(self, *args, **kwargs): + for env_key, env_value in self.env_done.items(): + if env_value is not None: + self.client.add_env(env_key, env_value, True) + else: + self.client.del_env(env_key) + + +def start(plugin_context, start_obshell=True, *args, **kwargs): + cluster_config = plugin_context.cluster_config + options = plugin_context.options + clients = plugin_context.clients + stdio = plugin_context.stdio + clusters_cmd = {} + root_servers = {} + global_config = cluster_config.get_global_conf() + appname = global_config['appname'] if 'appname' in global_config else None + cluster_id = global_config['cluster_id'] if 'cluster_id' in global_config else None + obconfig_url = global_config['obconfig_url'] if 'obconfig_url' in global_config else None + cfg_url = '' + if obconfig_url: + if not appname or not cluster_id: + stdio.error('need appname and cluster_id') + return + try: + cfg_url = init_config_server(obconfig_url, appname, cluster_id, getattr(options, 'force_delete', False), stdio) + if not cfg_url: + stdio.warn(EC_OBSERVER_FAILED_TO_REGISTER_WITH_DETAILS.format(appname, obconfig_url)) + except: + stdio.warn(EC_OBSERVER_FAILED_TO_REGISTER.format()) + elif 'ob-configserver' in cluster_config.depends and appname: + obc_cluster_config = cluster_config.get_depend_config('ob-configserver') + vip_address = obc_cluster_config.get('vip_address') + if vip_address: + obc_ip = vip_address + obc_port = obc_cluster_config.get('vip_port') + else: + server = cluster_config.get_depend_servers('ob-configserver')[0] + client = clients[server] + obc_ip = NetUtil.get_host_ip() if client.is_localhost() else server.ip + obc_port = obc_cluster_config.get('listen_port') + cfg_url = 
"http://{0}:{1}/services?Action=ObRootServiceInfo&ObCluster={2}".format(obc_ip, obc_port, appname) + + if cluster_config.added_servers: + scale_out = True + need_bootstrap = False + else: + scale_out = False + need_bootstrap = True + stdio.start_loading('Start observer') + for server in cluster_config.original_servers: + config = cluster_config.get_server_conf(server) + zone = config['zone'] + if zone not in root_servers: + root_servers[zone] = '%s:%s:%s' % (server.ip, config['rpc_port'], config['mysql_port']) + rs_list_opt = '-r \'%s\'' % ';'.join([root_servers[zone] for zone in root_servers]) + for server in cluster_config.servers: + client = clients[server] + server_config = cluster_config.get_server_conf(server) + home_path = server_config['home_path'] + + if not server_config.get('data_dir'): + server_config['data_dir'] = '%s/store' % home_path + + if not server_config.get('local_ip') and not server_config.get('devname'): + server_config['local_ip'] = server.ip + + if client.execute_command('ls %s/clog/tenant_1/' % server_config['data_dir']).stdout.strip(): + need_bootstrap = False + + remote_pid_path = '%s/run/observer.pid' % home_path + remote_pid = client.execute_command('cat %s' % remote_pid_path).stdout.strip() + if remote_pid: + if client.execute_command('ls /proc/%s' % remote_pid): + continue + + stdio.verbose('%s start command construction' % server) + if getattr(options, 'without_parameter', False) and client.execute_command('ls %s/etc/observer.config.bin' % home_path): + use_parameter = False + else: + use_parameter = True + cmd = [] + if use_parameter: + not_opt_str = OrderedDict({ + 'mysql_port': '-p', + 'rpc_port': '-P', + 'zone': '-z', + 'nodaemon': '-N', + 'appname': '-n', + 'cluster_id': '-c', + 'data_dir': '-d', + 'devname': '-i', + 'syslog_level': '-l', + 'ipv6': '-6', + 'mode': '-m', + 'scn': '-f', + 'local_ip': '-I' + }) + not_cmd_opt = [ + 'home_path', 'obconfig_url', 'root_password', 'proxyro_password', + 'redo_dir', 'clog_dir', 'ilog_dir', 'slog_dir', '$_zone_idc', 'production_mode', + 'ocp_monitor_tenant', 'ocp_monitor_username', 'ocp_monitor_password', 'ocp_monitor_db', + 'ocp_meta_tenant', 'ocp_meta_username', 'ocp_meta_password', 'ocp_meta_db', 'ocp_agent_monitor_password','ocp_root_password','obshell_port' + ] + get_value = lambda key: "'%s'" % server_config[key] if isinstance(server_config[key], str) else server_config[key] + opt_str = [] + for key in server_config: + if key not in not_cmd_opt and key not in not_opt_str and not key.startswith('ocp_meta_tenant_'): + value = get_value(key) + opt_str.append('%s=%s' % (key, value)) + if cfg_url: + opt_str.append('obconfig_url=\'%s\'' % cfg_url) + else: + cmd.append(rs_list_opt) + for key in not_opt_str: + if key in server_config: + value = get_value(key) + cmd.append('%s %s' % (not_opt_str[key], value)) + cmd.append('-o %s' % ','.join(opt_str)) + else: + cmd.append('-p %s' % server_config['mysql_port']) + + clusters_cmd[server] = 'cd %s; %s/bin/observer %s' % (home_path, home_path, ' '.join(cmd)) + for server in clusters_cmd: + environments = deepcopy(cluster_config.get_environments()) + client = clients[server] + server_config = cluster_config.get_server_conf(server) + stdio.verbose('starting %s observer', server) + if 'LD_LIBRARY_PATH' not in environments: + environments['LD_LIBRARY_PATH'] = '%s/lib:' % server_config['home_path'] + with EnvVariables(environments, client): + ret = client.execute_command(clusters_cmd[server]) + if not ret: + stdio.stop_loading('fail') + 
stdio.error(EC_OBSERVER_FAIL_TO_START_WITH_ERR.format(server=server, stderr=ret.stderr)) + return + stdio.stop_loading('succeed') + + start_obshell = start_obshell and not need_bootstrap and not scale_out + stdio.verbose('start_obshell: %s' % start_obshell) + if start_obshell: + for server in cluster_config.servers: + client = clients[server] + server_config = cluster_config.get_server_conf(server) + home_path = server_config['home_path'] + obshell_pid_path = '%s/run/obshell.pid' % home_path + obshell_pid = client.execute_command('cat %s' % obshell_pid_path).stdout.strip() + if obshell_pid and client.execute_command('ls /proc/%s' % obshell_pid): + stdio.verbose('%s obshell[pid: %s] started', server, obshell_pid) + else: + # start obshell + server_config = cluster_config.get_server_conf(server) + password = server_config.get('root_password', '') + client.add_env('OB_ROOT_PASSWORD', password if client._is_local else ConfigUtil.passwd_format(password)) + cmd = 'cd %s; %s/bin/obshell admin start --ip %s --port %s'%(server_config['home_path'], server_config['home_path'], server.ip, server_config['obshell_port']) + stdio.verbose('start obshell: %s' % cmd) + if not client.execute_command(cmd): + stdio.error('%s obshell failed', server) + return + + if not scale_out: + stdio.start_loading('observer program health check') + time.sleep(3) + failed = [] + for server in cluster_config.servers: + client = clients[server] + server_config = cluster_config.get_server_conf(server) + home_path = server_config['home_path'] + remote_pid_path = '%s/run/observer.pid' % home_path + stdio.verbose('%s program health check' % server) + remote_pid = client.execute_command('cat %s' % remote_pid_path).stdout.strip() + if remote_pid and client.execute_command('ls /proc/%s' % remote_pid): + stdio.verbose('%s observer[pid: %s] started', server, remote_pid) + else: + failed.append(EC_OBSERVER_FAIL_TO_START.format(server=server)) + if failed: + stdio.stop_loading('fail') + for msg in failed: + stdio.warn(msg) + return plugin_context.return_false() + else: + stdio.stop_loading('succeed') + + if start_obshell: + # check obshell health + failed = [] + stdio.start_loading('obshell program health check') + for server in cluster_config.servers: + client = clients[server] + server_config = cluster_config.get_server_conf(server) + home_path = server_config['home_path'] + obshell_pid_path = '%s/run/obshell.pid' % home_path + obshell_pid = client.execute_command('cat %s' % obshell_pid_path).stdout.strip() + if obshell_pid and client.execute_command('ls /proc/%s' % obshell_pid): + stdio.verbose('%s obshell[pid: %s] started', server, obshell_pid) + else: + failed.append(EC_OBSERVER_FAIL_TO_START_OCS.format(server=server)) # TODO: 增加obshell相关的错误吗 + if failed: + stdio.stop_loading('fail') + for msg in failed: + stdio.warn(msg) + return plugin_context.return_false() + else: + stdio.stop_loading('succeed') + + stdio.verbose('need_bootstrap: %s' % need_bootstrap) + return plugin_context.return_true(need_bootstrap=need_bootstrap) diff --git a/plugins/oceanbase/4.2.1.4/start_check.py b/plugins/oceanbase/4.2.1.4/start_check.py new file mode 100644 index 0000000..0b1525e --- /dev/null +++ b/plugins/oceanbase/4.2.1.4/start_check.py @@ -0,0 +1,782 @@ +# coding: utf-8 +# OceanBase Deploy. +# Copyright (C) 2021 OceanBase +# +# This file is part of OceanBase Deploy. 
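The liveness checks in start.py above decide whether observer and obshell are running purely from the recorded pid file plus /proc. Below is a local analogue of that check for illustration; the pid-file path is hypothetical, and the plugin itself runs the equivalent cat/ls commands over SSH rather than reading local files.

    import os

    def process_alive(pid_file):
        # True if the pid recorded in pid_file refers to a live process.
        try:
            with open(pid_file) as f:
                pid = f.read().strip()
        except IOError:
            return False
        return bool(pid) and os.path.exists('/proc/%s' % pid)

    print(process_alive('/home/admin/oceanbase/run/observer.pid'))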
+# +# OceanBase Deploy is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# OceanBase Deploy is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with OceanBase Deploy. If not, see . + + +from __future__ import absolute_import, division, print_function + +import os +import re +import time +import socket +import sys +import copy +from math import sqrt + +import _errno as err +from _types import Capacity + + +stdio = None +success = True +production_mode = False + +def get_port_socket_inode(client, port): + port = hex(port)[2:].zfill(4).upper() + cmd = "bash -c 'cat /proc/net/{tcp*,udp*}' | awk -F' ' '{if($4==\"0A\") print $2,$4,$10}' | grep ':%s' | awk -F' ' '{print $3}' | uniq" % port + res = client.execute_command(cmd) + if not res or not res.stdout.strip(): + return False + stdio.verbose(res.stdout) + return res.stdout.strip().split('\n') + + +def time_delta(client): + time_st = time.time() * 1000 + time_srv = int(client.execute_command('date +%s%N').stdout) / 1000000 + time_ed = time.time() * 1000 + + time_it = time_ed - time_st + time_srv -= time_it + return time_srv - time_st + + +def get_mount_path(disk, _path): + _mount_path = '/' + for p in disk: + if p in _path and _path.startswith(p): + if len(p) > len(_mount_path): + _mount_path = p + return _mount_path + + +def get_system_memory(memory_limit, min_pool_memory): + if memory_limit <= 8 << 30: + system_memory = 2 << 30 + elif memory_limit <= 16 << 30: + system_memory = 3 << 30 + elif memory_limit <= 32 << 30: + system_memory = 5 << 30 + elif memory_limit <= 48 << 30: + system_memory = 7 << 30 + elif memory_limit <= 64 << 30: + system_memory = 10 << 30 + else: + memory_limit_gb = memory_limit >> 30 + system_memory = int(3 * (sqrt(memory_limit_gb) - 3)) << 30 + return max(system_memory, min_pool_memory) + + +def get_disk_info_by_path(path, client, stdio): + disk_info = {} + ret = client.execute_command('df --block-size=1024 {}'.format(path)) + if ret: + for total, used, avail, puse, path in re.findall(r'(\d+)\s+(\d+)\s+(\d+)\s+(\d+%)\s+(.+)', ret.stdout): + disk_info[path] = {'total': int(total) << 10, 'avail': int(avail) << 10, 'need': 0} + stdio.verbose('get disk info for path {}, total: {} avail: {}'.format(path, disk_info[path]['total'], disk_info[path]['avail'])) + return disk_info + + +def get_disk_info(all_paths, client, stdio): + overview_ret = True + disk_info = get_disk_info_by_path('', client, stdio) + if not disk_info: + overview_ret = False + disk_info = get_disk_info_by_path('/', client, stdio) + if not disk_info: + disk_info['/'] = {'total': 0, 'avail': 0, 'need': 0} + all_path_success = {} + for path in all_paths: + all_path_success[path] = False + cur_path = path + while cur_path not in disk_info: + disk_info_for_current_path = get_disk_info_by_path(cur_path, client, stdio) + if disk_info_for_current_path: + disk_info.update(disk_info_for_current_path) + all_path_success[path] = True + break + else: + cur_path = os.path.dirname(cur_path) + if overview_ret or all(all_path_success.values()): + return disk_info + +def has_obshell(repository): + repository_dir = 
repository.repository_dir + obshell_path = os.path.join(repository_dir, 'bin', 'obshell') + return os.path.exists(obshell_path) + +def start_check(plugin_context, init_check_status=False, strict_check=False, work_dir_check=False, work_dir_empty_check=True, generate_configs={}, precheck=False, source_option='start', *args, **kwargs): + def check_pass(item): + status = check_status[server] + if status[item].status == err.CheckStatus.WAIT: + status[item].status = err.CheckStatus.PASS + def check_fail(item, error, suggests=[]): + status = check_status[server][item] + if status.status == err.CheckStatus.WAIT: + status.error = error + status.suggests = suggests + status.status = err.CheckStatus.FAIL + def wait_2_pass(): + status = check_status[server] + for item in status: + check_pass(item) + def alert(item, error, suggests=[]): + global success + if strict_check: + success = False + check_fail(item, error, suggests) + print_with_suggests(error, suggests) + else: + stdio.warn(error) + + def alert_strict(item, error, suggests=[]): + global success + if strict_check or production_mode: + success = False + check_fail(item, error, suggests) + print_with_suggests(error, suggests) + else: + stdio.warn(error) + + def error(item, _error, suggests=[]): + global success + if plugin_context.dev_mode: + stdio.warn(_error) + else: + success = False + check_fail(item, _error, suggests) + print_with_suggests(_error, suggests) + + def critical(item, error, suggests=[]): + global success + success = False + check_fail(item, error, suggests) + print_with_suggests(error, suggests) + + def print_with_suggests(error, suggests=[]): + stdio.error('{}, {}'.format(error, suggests[0].msg if suggests else '')) + + def system_memory_check(): + server_memory_config = server_memory_stat['servers'] + for server in server_memory_config: + if server_memory_config[server]['system_memory']: + memory_limit = server_memory_config[server]['num'] + if not memory_limit: + server_memory_config[server]['num'] = memory_limit = server_memory_config[server]['percentage'] * server_memory_stats['total'] + factor = 0.75 + suggest = err.SUG_OBSERVER_SYS_MEM_TOO_LARGE.format(factor=factor) + suggest.auto_fix = 'system_memory' not in global_generate_config and 'system_memory' not in generate_configs.get(server, {}) + if memory_limit < server_memory_config[server]['system_memory']: + critical('mem', err.EC_OBSERVER_SYS_MEM_TOO_LARGE.format(server=server), [suggest]) + elif memory_limit * factor < server_memory_config[server]['system_memory']: + alert('mem', err.WC_OBSERVER_SYS_MEM_TOO_LARGE.format(server=server, factor=factor), [suggest]) + + global stdio, success + success = True + check_status = {} + cluster_config = plugin_context.cluster_config + INF = float('inf') + plugin_context.set_variable('start_check_status', check_status) + + kernel_check_items = [ + {'check_item': 'vm.max_map_count', 'need': [327600, 1310720], 'recommend': 655360}, + {'check_item': 'vm.min_free_kbytes', 'need': [32768, 2097152], 'recommend': 2097152}, + {'check_item': 'vm.overcommit_memory', 'need': 0, 'recommend': 0}, + {'check_item': 'fs.file-max', 'need': [6573688, INF], 'recommend': 6573688}, + ] + + kernel_check_status = {} + for kernel_param in kernel_check_items: + check_item = kernel_param['check_item'] + kernel_check_status[check_item] = err.CheckStatus() + + for server in cluster_config.servers: + check_status[server] = { + 'port': err.CheckStatus(), + 'mem': err.CheckStatus(), + 'disk': err.CheckStatus(), + 'dir': err.CheckStatus(), + 'ulimit': 
err.CheckStatus(), + 'aio': err.CheckStatus(), + 'net': err.CheckStatus(), + 'ntp': err.CheckStatus(), + 'ocp meta db': err.CheckStatus() + } + check_status[server].update(kernel_check_status) + + if init_check_status: + return plugin_context.return_true(start_check_status=check_status) + + clients = plugin_context.clients + stdio = plugin_context.stdio + servers_clients = {} + servers_port = {} + servers_memory = {} + servers_disk = {} + servers_clog_mount = {} + servers_net_interface = {} + servers_dirs = {} + servers_check_dirs = {} + servers_log_disk_size = {} + servers_min_pool_memory = {} + PRO_MEMORY_MIN = 16 << 30 + PRO_POOL_MEM_MIN = 2147483648 + START_NEED_MEMORY = 3 << 30 + global_generate_config = plugin_context.get_variable('global_generate_config', default=generate_configs.get('global', {})) + plugin_context.set_variable('global_generate_config', global_generate_config) + stdio.start_loading('Check before {} observer'.format(source_option)) + + need_bootstrap = True + parameter_check = True + port_check = True + kernel_check = True + is_running_opt = source_option in ['restart', 'upgrade'] + upgrade_opt = source_option == 'upgrade' + for server in cluster_config.servers: + ip = server.ip + client = clients[server] + server_generate_config = generate_configs.get(server, {}) + servers_clients[ip] = client + server_config = cluster_config.get_server_conf_with_default(server) + original_server_conf = cluster_config.get_original_server_conf_with_global(server) + home_path = server_config['home_path'] + production_mode = server_config.get('production_mode', False) + if not precheck: + if need_bootstrap: + data_dir = server_config['data_dir'] if server_config.get('data_dir') else '%s/store' % home_path + if client.execute_command('ls %s/clog/tenant_1/' % data_dir).stdout.strip(): + need_bootstrap = False + remote_pid_path = '%s/run/observer.pid' % home_path + remote_pid = client.execute_command('cat %s' % remote_pid_path).stdout.strip() + if remote_pid: + if client.execute_command('ls /proc/%s' % remote_pid): + stdio.verbose('%s is runnning, skip' % server) + work_dir_check = False + for repository in plugin_context.repositories: + if repository.name == cluster_config.name: + break + port_check = upgrade_opt and not has_obshell(repository) + parameter_check = False + kernel_check = is_running_opt + + if work_dir_check: + stdio.verbose('%s dir check' % server) + if ip not in servers_dirs: + servers_dirs[ip] = {} + servers_check_dirs[ip] = {} + dirs = servers_dirs[ip] + check_dirs = servers_check_dirs[ip] + + if not server_config.get('data_dir'): + server_config['data_dir'] = '%s/store' % home_path + if not server_config.get('redo_dir'): + server_config['redo_dir'] = server_config['data_dir'] + if not server_config.get('clog_dir'): + server_config['clog_dir'] = '%s/clog' % server_config['redo_dir'] + if not server_config.get('ilog_dir'): + server_config['ilog_dir'] = '%s/ilog' % server_config['redo_dir'] + if not server_config.get('slog_dir'): + server_config['slog_dir'] = '%s/slog' % server_config['data_dir'] + if server_config['redo_dir'] == server_config['data_dir']: + keys = ['home_path', 'data_dir', 'clog_dir', 'ilog_dir', 'slog_dir'] + else: + keys = ['home_path', 'data_dir', 'redo_dir', 'clog_dir', 'ilog_dir', 'slog_dir'] + + for key in keys: + path = server_config.get(key) + suggests = [err.SUG_CONFIG_CONFLICT_DIR.format(key=key, server=server)] + if path in dirs and dirs[path]: + critical('dir', err.EC_CONFIG_CONFLICT_DIR.format(server1=server, path=path, 
server2=dirs[path]['server'], key=dirs[path]['key']), suggests) + dirs[path] = { + 'server': server, + 'key': key, + } + if key not in original_server_conf: + continue + empty_check = work_dir_empty_check + while True: + if path in check_dirs: + if check_dirs[path] != True: + critical('dir', check_dirs[path], suggests) + break + + if client.execute_command('bash -c "[ -a %s ]"' % path): + is_dir = client.execute_command('[ -d {} ]'.format(path)) + has_write_permission = client.execute_command('[ -w {} ]'.format(path)) + if is_dir and has_write_permission: + if empty_check: + ret = client.execute_command('ls %s' % path) + if not ret or ret.stdout.strip(): + check_dirs[path] = err.EC_FAIL_TO_INIT_PATH.format(server=server, key=key, msg=err.InitDirFailedErrorMessage.NOT_EMPTY.format(path=path)) + else: + check_dirs[path] = True + else: + check_dirs[path] = True + else: + if not is_dir: + check_dirs[path] = err.EC_FAIL_TO_INIT_PATH.format(server=server, key=key, msg=err.InitDirFailedErrorMessage.NOT_DIR.format(path=path)) + else: + check_dirs[path] = err.EC_FAIL_TO_INIT_PATH.format(server=server, key=key, msg=err.InitDirFailedErrorMessage.PERMISSION_DENIED.format(path=path)) + else: + path = os.path.dirname(path) + empty_check = False + + if ip not in servers_port: + servers_disk[ip] = {} + servers_port[ip] = {} + servers_clog_mount[ip] = {} + servers_net_interface[ip] = {} + servers_memory[ip] = {'num': 0, 'percentage': 0, 'servers': {}} + memory = servers_memory[ip] + ports = servers_port[ip] + disk = servers_disk[ip] + clog_mount = servers_clog_mount[ip] + interfaces = servers_net_interface[ip] + + if port_check: + stdio.verbose('%s port check' % server) + if upgrade_opt: + keys = ['obshell_port'] + else: + keys = ['mysql_port', 'rpc_port', 'obshell_port'] + + for key in keys: + port = int(server_config.get(key)) + if port in ports: + critical( + 'port', + err.EC_CONFIG_CONFLICT_PORT.format(server1=server, port=port, server2=ports[port]['server'], key=ports[port]['key']), + [err.SUG_PORT_CONFLICTS.format()] + ) + continue + ports[port] = { + 'server': server, + 'key': key + } + if get_port_socket_inode(client, port): + critical('port', err.EC_CONFLICT_PORT.format(server=ip, port=port), [err.SUG_USE_OTHER_PORT.format()]) + + if parameter_check: + servers_min_pool_memory[server] = __min_full_resource_pool_memory = server_config.get('__min_full_resource_pool_memory') + if production_mode and __min_full_resource_pool_memory < PRO_POOL_MEM_MIN: + error('mem', err.EC_OBSERVER_PRODUCTION_MODE_LIMIT.format(server=server, key="__min_full_resource_pool_memory", limit=PRO_POOL_MEM_MIN), [err.SUB_SET_NO_PRODUCTION_MODE.format()]) + + memory_limit = 0 + percentage = 0 + if server_config.get('memory_limit'): + memory_limit = Capacity(server_config['memory_limit']).btyes + if production_mode and memory_limit < PRO_MEMORY_MIN: + error('mem', err.EC_OBSERVER_PRODUCTION_MODE_LIMIT.format(server=server, key='memory_limit', limit=Capacity(PRO_MEMORY_MIN)), [err.SUB_SET_NO_PRODUCTION_MODE.format()]) + memory['num'] += memory_limit + elif 'memory_limit_percentage' in server_config: + percentage = server_config['memory_limit_percentage'] + memory['percentage'] += percentage + else: + percentage = 80 + memory['percentage'] += percentage + memory['servers'][server] = { + 'num': memory_limit, + 'percentage': percentage, + 'system_memory': Capacity(server_config.get('system_memory', 0)).btyes + } + + data_path = server_config['data_dir'] if server_config.get('data_dir') else os.path.join(server_config['home_path'], 
'store') + redo_dir = server_config['redo_dir'] if server_config.get('redo_dir') else data_path + clog_dir = server_config['clog_dir'] if server_config.get('clog_dir') else os.path.join(redo_dir, 'clog') + if not client.execute_command('ls %s/sstable/block_file' % data_path): + disk[data_path] = {'server': server} + clog_mount[clog_dir] = {'server': server} + if 'datafile_size' in server_config and server_config['datafile_size'] and server_config['datafile_size']: + # if need is string, it means use datafile_size + disk[data_path]['need'] = server_config['datafile_size'] + elif 'datafile_disk_percentage' in server_config and server_config['datafile_disk_percentage']: + # if need is integer, it means use datafile_disk_percentage + disk[data_path]['need'] = int(server_config['datafile_disk_percentage']) + + if 'log_disk_size' in server_config and server_config['log_disk_size'] and server_config['log_disk_size']: + # if need is string, it means use log_disk_size + clog_mount[clog_dir]['need'] = server_config['log_disk_size'] + elif 'log_disk_percentage' in server_config and server_config['log_disk_percentage']: + # if need is integer, it means use log_disk_percentage + clog_mount[clog_dir]['need'] = int(server_config['log_disk_percentage']) + + devname = server_config.get('devname') + if devname: + if not client.execute_command("grep -e '^ *%s:' /proc/net/dev" % devname): + suggest = err.SUG_NO_SUCH_NET_DEVIC.format(ip=ip) + suggest.auto_fix = 'devname' not in global_generate_config and 'devname' not in server_generate_config + critical('net', err.EC_NO_SUCH_NET_DEVICE.format(server=server, devname=devname), suggests=[suggest]) + if devname not in interfaces: + interfaces[devname] = [] + interfaces[devname].append(ip) + + ip_server_memory_info = {} + for ip in servers_disk: + if not client.execute_command("[ -w /tmp/ ] || [ -w /tmp/obshell ]"): + critical("dir", err.EC_FAIL_TO_INIT_PATH.format(server=server, key='sock path', msg=err.InitDirFailedErrorMessage.PERMISSION_DENIED.format(path='/tmp/obshell'))) + + ip_servers = servers_memory[ip]['servers'].keys() + server_num = len(ip_servers) + client = servers_clients[ip] + ret = client.execute_command('cat /proc/sys/fs/aio-max-nr /proc/sys/fs/aio-nr') + if not ret: + for server in ip_servers: + alert('aio', err.EC_FAILED_TO_GET_AIO_NR.format(ip=ip), [err.SUG_CONNECT_EXCEPT.format()]) + else: + try: + max_nr, nr = ret.stdout.strip().split('\n') + max_nr, nr = int(max_nr), int(nr) + need = server_num * 20000 + RECD_AIO = 1048576 + if need > max_nr - nr: + for server in ip_servers: + critical('aio', err.EC_AIO_NOT_ENOUGH.format(ip=ip, avail=max_nr - nr, need=need), [err.SUG_SYSCTL.format(var='fs.aio-max-nr', value=max(RECD_AIO, need), ip=ip)]) + elif int(max_nr) < RECD_AIO: + for server in ip_servers: + alert('aio', err.WC_AIO_NOT_ENOUGH.format(ip=ip, current=max_nr), [err.SUG_SYSCTL.format(var='fs.aio-max-nr', value=RECD_AIO, ip=ip)]) + except: + for server in ip_servers: + alert('aio', err.EC_FAILED_TO_GET_AIO_NR.format(ip=ip), [err.SUG_UNSUPPORT_OS.format()]) + stdio.exception('') + + ret = client.execute_command('ulimit -a') + ulimits_min = { + 'open files': { + 'need': lambda x: 20000 * x, + 'recd': lambda x: 655350, + 'name': 'nofile' + }, + 'max user processes': { + 'need': lambda x: 120000, + 'recd': lambda x: 655350, + 'name': 'nproc' + }, + 'core file size': { + 'need': lambda x: 0, + 'recd': lambda x: INF, + 'below_need_error': False, + 'below_recd_error_strict': False, + 'name': 'core' + }, + 'stack size': { + 'need': lambda x: 1024, + 
'recd': lambda x: INF, + 'below_recd_error_strict': False, + 'name': 'stack' + }, + } + ulimits = {} + src_data = re.findall('\s?([a-zA-Z\s]+[a-zA-Z])\s+\([a-zA-Z\-,\s]+\)\s+([\d[a-zA-Z]+)', ret.stdout) if ret else [] + for key, value in src_data: + ulimits[key] = value + for key in ulimits_min: + value = ulimits.get(key) + if value == 'unlimited': + continue + if not value or not (value.strip().isdigit()): + for server in ip_servers: + alert('ulimit', '(%s) failed to get %s' % (ip, key), suggests=[err.SUG_UNSUPPORT_OS.format()]) + else: + value = int(value) + need = ulimits_min[key]['need'](server_num) + if need > value: + if (strict_check or production_mode) and ulimits_min[key].get('below_recd_error_strict', True) and value < ulimits_min[key]['recd'](server_num): + need = ulimits_min[key]['recd'](server_num) + need = need if need != INF else 'unlimited' + for server in ip_servers: + if ulimits_min[key].get('below_need_error', True): + critical('ulimit', err.EC_ULIMIT_CHECK.format(server=ip, key=key, need=need, now=value), [err.SUG_ULIMIT.format(name=ulimits_min[key]['name'], value=need, ip=ip)]) + else: + alert('ulimit', err.EC_ULIMIT_CHECK.format(server=ip, key=key, need=need, now=value), suggests=[err.SUG_ULIMIT.format(name=ulimits_min[key]['name'], value=need, ip=ip)]) + else: + need = ulimits_min[key]['recd'](server_num) + if need > value: + need = need if need != INF else 'unlimited' + for server in ip_servers: + if ulimits_min[key].get('below_recd_error_strict', True): + alert('ulimit', err.WC_ULIMIT_CHECK.format(server=ip, key=key, need=need, now=value), suggests=[err.SUG_ULIMIT.format(name=ulimits_min[key]['name'], value=need, ip=ip)]) + else: + stdio.warn(err.WC_ULIMIT_CHECK.format(server=ip, key=key, need=need, now=value)) + + if kernel_check: + # check kernel params + try: + cmd = 'sysctl -a' + ret = client.execute_command(cmd) + if not ret: + alert_strict('kernel', err.EC_FAILED_TO_GET_PARAM.format(key='kernel parameter ', cmd=cmd), [err.SUG_CONNECT_EXCEPT.format(ip=ip)]) + continue + kernel_params = {} + kernel_param_src = ret.stdout.split('\n') + for kernel in kernel_param_src: + if not kernel: + continue + kernel = kernel.split('=') + kernel_params[kernel[0].strip()] = re.findall(r"[-+]?\d+", kernel[1]) + + for kernel_param in kernel_check_items: + check_item = kernel_param['check_item'] + if check_item not in kernel_params: + continue + values = kernel_params[check_item] + needs = kernel_param['need'] + recommends = kernel_param['recommend'] + for i in range(len(values)): + # This case is not handling the value of 'default'. Additional handling is required for 'default' in the future. 
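# Each value extracted from `sysctl -a` is compared below against the expectation declared in
# kernel_check_items: a [min, max] entry means the value must fall inside that range (an INF
# upper bound means "at least min"), while a scalar entry must match exactly. Mismatches go
# through alert_strict, so they only fail the check in strict or production mode and are
# otherwise reported as warnings.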
+ item_value = int(values[i]) + need = needs[i] if isinstance(needs, tuple) else needs + recommend = recommends[i] if isinstance(recommends, tuple) else recommends + if isinstance(need, list): + if item_value < need[0] or item_value > need[1]: + suggest = [err.SUG_SYSCTL.format(var=check_item, value=' '.join(str(i) for i in recommend) if isinstance(recommend, list) else recommend, ip=ip)] + need = 'within {}'.format(needs) if needs[-1] != INF else 'greater than {}'.format(needs[0]) + now = '[{}]'.format(', '.join(values)) if len(values) > 1 else item_value + alert_strict(check_item, err.EC_PARAM_NOT_IN_NEED.format(ip=ip, check_item=check_item, need=need, now=now, recommend=recommends), suggest) + break + elif item_value != need: + alert_strict(check_item, err.EC_PARAM_NOT_IN_NEED.format(ip=ip, check_item=check_item, need=needs, recommend=recommend, now=item_value), [err.SUG_SYSCTL.format(var=check_item, value=recommend, ip=ip)]) + except: + stdio.exception('') + + # memory + ret = client.execute_command('cat /proc/meminfo') + if ret: + server_memory_stats = {} + memory_key_map = { + 'MemTotal': 'total', + 'MemFree': 'free', + 'MemAvailable': 'available', + 'Buffers': 'buffers', + 'Cached': 'cached' + } + for key in memory_key_map: + server_memory_stats[memory_key_map[key]] = 0 + for k, v in re.findall('(\w+)\s*:\s*(\d+\s*\w+)', ret.stdout): + if k in memory_key_map: + key = memory_key_map[k] + server_memory_stats[key] = Capacity(str(v)).btyes + + ip_server_memory_info[ip] = server_memory_stats + server_memory_stat = servers_memory[ip] + min_start_need = server_num * START_NEED_MEMORY + total_use = int(server_memory_stat['percentage'] * server_memory_stats['total'] / 100 + server_memory_stat['num']) + if min_start_need > server_memory_stats['available']: + for server in ip_servers: + error('mem', err.EC_OBSERVER_NOT_ENOUGH_MEMORY_ALAILABLE.format(ip=ip, available=Capacity(server_memory_stats['available']), need=Capacity(min_start_need)), [err.SUG_OBSERVER_NOT_ENOUGH_MEMORY_ALAILABLE.format(ip=ip)]) + elif total_use > server_memory_stats['free'] + server_memory_stats['buffers'] + server_memory_stats['cached']: + for server in ip_servers: + server_generate_config = generate_configs.get(server, {}) + suggest = err.SUG_OBSERVER_REDUCE_MEM.format() + suggest.auto_fix = True + for key in ['memory_limit', 'memory_limit_percentage']: + if key in global_generate_config or key in server_generate_config: + suggest.auto_fix = False + break + error('mem', err.EC_OBSERVER_NOT_ENOUGH_MEMORY_CACHED.format(ip=ip, free=Capacity(server_memory_stats['free']), cached=Capacity(server_memory_stats['buffers'] + server_memory_stats['cached']), need=Capacity(total_use)), [suggest]) + elif total_use > server_memory_stats['free']: + system_memory_check() + for server in ip_servers: + alert('mem', err.EC_OBSERVER_NOT_ENOUGH_MEMORY.format(ip=ip, free=Capacity(server_memory_stats['free']), need=Capacity(total_use)), [err.SUG_OBSERVER_REDUCE_MEM.format()]) + else: + system_memory_check() + + # disk + all_path = set(list(servers_disk[ip].keys()) + list(servers_clog_mount[ip].keys())) + disk = get_disk_info(all_paths=all_path, client=client, stdio=stdio) + stdio.verbose('disk: {}'.format(disk)) + for path in servers_disk[ip]: + mount_path = get_mount_path(disk, path) + need = servers_disk[ip][path].get('need') + if not need: + for clog_path in servers_clog_mount[ip]: + clog_mount_path = get_mount_path(disk, clog_path) + if clog_mount_path == mount_path: + need = 60 + stdio.verbose('clog and data use the same disk, datadisk 
percentage: {}'.format(need)) + break + else: + need = 90 + stdio.verbose('datadisk percentage: {}'.format(need)) + slog_size = float(4 << 30) + if isinstance(need, int): + # slog need 4G + disk[mount_path]['need'] += max(disk[mount_path]['total'] - slog_size, 0) * need / 100 + else: + disk[mount_path]['need'] += Capacity(need).btyes + + disk[mount_path]['need'] += slog_size + disk[mount_path]['is_data_disk'] = True + for path in servers_clog_mount[ip]: + mount_path = get_mount_path(disk, path) + if 'need' in servers_clog_mount[ip][path]: + need = servers_clog_mount[ip][path]['need'] + elif disk[mount_path].get('is_data_disk'): + # hard code + need = 30 + stdio.verbose('clog and data use the same disk, clog percentage: {}'.format(need)) + else: + need = 90 + stdio.verbose('clog percentage: {}'.format(need)) + if isinstance(need, int): + # log_disk_percentage + log_disk_size = disk[mount_path]['total'] * need / 100 + else: + # log_disk_size + log_disk_size = Capacity(need).btyes + servers_log_disk_size[servers_clog_mount[ip][path]['server']] = log_disk_size + disk[mount_path]['need'] += log_disk_size + disk[mount_path]['is_clog_disk'] = True + for p in disk: + avail = disk[p]['avail'] + need = disk[p]['need'] + suggests = [] + if disk[p].get('is_data_disk') and disk[p].get('is_clog_disk'): + suggests.append(err.SUG_OBSERVER_SAME_DISK.format()) + for server in ip_servers: + alert('disk', err.WC_OBSERVER_SAME_DISK.format(ip=ip, disk=p), suggests) + if need > avail: + suggest_temps = { + 'data': { + 'tmplate': err.SUG_OBSERVER_NOT_ENOUGH_DISK, + 'keys': ['datafile_size', 'datafile_disk_percentage'] + } + } + if suggests: + suggest_temps['mem'] = { + 'tmplate': err.SUG_OBSERVER_REDUCE_MEM, + 'keys': ['memory_limit', 'memory_limit_percentage'] + } + suggest_temps['redo'] = { + 'tmplate': err.SUG_OBSERVER_REDUCE_REDO, + 'keys': ['log_disk_size', 'log_disk_percentage'] + } + for server in ip_servers: + tmp_suggests = [] + server_generate_config = generate_configs.get(server, {}) + for item in suggest_temps: + suggest = suggest_temps[item]['tmplate'].format() + suggest.auto_fix = True + for key in suggest_temps[item]['keys']: + if key in global_generate_config or key in server_generate_config: + suggest.auto_fix = False + break + tmp_suggests.append(suggest) + tmp_suggests = sorted(tmp_suggests, key=lambda suggest: suggest.auto_fix, reverse=True) + critical('disk', err.EC_OBSERVER_NOT_ENOUGH_DISK.format(ip=ip, disk=p, avail=Capacity(avail), need=Capacity(need)), tmp_suggests + suggests) + + global_conf = cluster_config.get_global_conf() + has_ocp = 'ocp-express' in plugin_context.components + if not has_ocp and any([key.startswith('ocp_meta') for key in global_conf]): + has_ocp = True + if has_ocp and need_bootstrap and parameter_check: + global_conf_with_default = copy.deepcopy(cluster_config.get_global_conf_with_default()) + original_global_conf = cluster_config.get_original_global_conf() + ocp_meta_tenant_prefix = 'ocp_meta_tenant_' + for key in global_conf_with_default: + if key.startswith(ocp_meta_tenant_prefix) and original_global_conf.get(key, None): + global_conf_with_default['ocp_meta_tenant'][key.replace(ocp_meta_tenant_prefix, '', 1)] = global_conf_with_default[key] + meta_db_memory_size = Capacity(global_conf_with_default['ocp_meta_tenant'].get('memory_size')).btyes + servers_sys_memory = {} + if meta_db_memory_size: + sys_memory_size = None + if 'sys_tenant' in global_conf and 'memory_size' in global_conf['sys_tenant']: + sys_memory_size = global_conf['sys_tenant']['memory_size'] + for 
server in cluster_config.servers: + if server.ip not in servers_memory or server not in servers_memory[server.ip]['servers'] or server not in servers_min_pool_memory: + stdio.verbose('skip server {} for missing some memory info.'.format(server)) + continue + memory_limit = servers_memory[server.ip]['servers'][server]['num'] + system_memory = servers_memory[server.ip]['servers'][server]['system_memory'] + min_pool_memory = servers_min_pool_memory[server] + if system_memory == 0: + system_memory = get_system_memory(memory_limit, min_pool_memory) + if not sys_memory_size: + sys_memory_size = servers_sys_memory[server] = max(min_pool_memory, min((memory_limit - system_memory) * 0.25, Capacity('16G').btyes)) + if meta_db_memory_size + system_memory + sys_memory_size <= memory_limit: + break + else: + suggest = err.SUG_OCP_EXPRESS_REDUCE_META_DB_MEM.format() + suggest.auto_fix = True + if 'ocp_meta_tenant_memory_size' in global_generate_config: + suggest.auto_fix = False + error('ocp meta db', err.EC_OCP_EXPRESS_META_DB_NOT_ENOUGH_MEM.format(), [suggest]) + + meta_db_log_disk_size = global_conf_with_default['ocp_meta_tenant'].get('log_disk_size') + meta_db_log_disk_size = Capacity(meta_db_log_disk_size).btyes if meta_db_log_disk_size else meta_db_log_disk_size + if not meta_db_log_disk_size and meta_db_memory_size: + meta_db_log_disk_size = meta_db_memory_size * 3 + if meta_db_log_disk_size: + for server in cluster_config.servers: + log_disk_size = servers_log_disk_size[server] + sys_log_disk_size = servers_sys_memory.get(server, 0) + if meta_db_log_disk_size + sys_log_disk_size <= log_disk_size: + break + else: + suggest = err.SUG_OCP_EXPRESS_REDUCE_META_DB_LOG_DISK.format() + suggest.auto_fix = True + if 'ocp_meta_tenant_log_disk_size' in global_generate_config: + suggest.auto_fix = False + error('ocp meta db', err.EC_OCP_EXPRESS_META_DB_NOT_ENOUGH_LOG_DISK.format(), [suggest]) + + if success: + for ip in servers_net_interface: + client = servers_clients[ip] + for devname in servers_net_interface[ip]: + if client.is_localhost() and (devname != 'lo' and devname is not None) or (not client.is_localhost() and devname == 'lo'): + suggest = err.SUG_NO_SUCH_NET_DEVIC.format(ip=ip) + suggest.auto_fix = client.is_localhost() and 'devname' not in global_generate_config and 'devname' not in server_generate_config + for server in ip_servers: + critical('net', err.EC_OBSERVER_PING_FAILED.format(ip1=ip, devname=devname, ip2=ip), [suggest]) + continue + for _ip in servers_clients: + if ip == _ip: + continue + ping_cmd = 'ping -W 1 -c 1 -I %s %s' % (devname, _ip) if devname is not None else 'ping -W 1 -c 1 %s' % _ip + if not client.execute_command(ping_cmd): + suggest = err.SUG_NO_SUCH_NET_DEVIC.format(ip=ip) + suggest.auto_fix = 'devname' not in global_generate_config and 'devname' not in server_generate_config + for server in ip_servers: + if devname is not None: + critical('net', err.EC_OBSERVER_PING_FAILED.format(ip1=ip, devname=devname, ip2=_ip), [suggest]) + else: + critical('net', err.EC_OBSERVER_PING_FAILED_WITH_NO_DEVNAME.format(ip1=ip, ip2=_ip), [suggest]) + break + + + if success: + times = [] + for ip in servers_clients: + client = servers_clients[ip] + delta = time_delta(client) + stdio.verbose('%s time delta %s' % (ip, delta)) + times.append(delta) + if times and max(times) - min(times) > 500: + critical('ntp', err.EC_OBSERVER_TIME_OUT_OF_SYNC.format(), [err.SUG_OBSERVER_TIME_OUT_OF_SYNC.format()]) + for server in cluster_config.servers: + status = check_status[server] + for key in status: + if 
status[key].status == err.CheckStatus.WAIT: + status[key].status = err.CheckStatus.PASS + + if success: + stdio.stop_loading('succeed') + plugin_context.return_true() + else: + stdio.stop_loading('fail') + + diff --git a/plugins/oceanbase/4.2.1.4/stop.py b/plugins/oceanbase/4.2.1.4/stop.py new file mode 100644 index 0000000..dd563ce --- /dev/null +++ b/plugins/oceanbase/4.2.1.4/stop.py @@ -0,0 +1,162 @@ +# coding: utf-8 +# OceanBase Deploy. +# Copyright (C) 2021 OceanBase +# +# This file is part of OceanBase Deploy. +# +# OceanBase Deploy is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# OceanBase Deploy is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with OceanBase Deploy. If not, see . + + +from __future__ import absolute_import, division, print_function + +import json +import time +import requests + +from tool import NetUtil + + +def config_url(ocp_config_server, appname, cid): + cfg_url = '%s&Action=ObRootServiceInfo&ObCluster=%s' % (ocp_config_server, appname) + proxy_cfg_url = '%s&Action=GetObProxyConfig&ObRegionGroup=%s' % (ocp_config_server, appname) + # command to clean up the cluster config URL content + cleanup_config_url_content = '%s&Action=DeleteObRootServiceInfoByClusterName&ClusterName=%s' % ( + ocp_config_server, appname) + # command to register the cluster info to the config URL + register_to_config_url = '%s&Action=ObRootServiceRegister&ObCluster=%s&ObClusterId=%s' % ( + ocp_config_server, appname, cid) + return cfg_url, cleanup_config_url_content, register_to_config_url + + +def get_port_socket_inode(client, port): + port = hex(port)[2:].zfill(4).upper() + cmd = "bash -c 'cat /proc/net/{tcp*,udp*}' | awk -F' ' '{print $2,$10}' | grep '00000000:%s' | awk -F' ' '{print $2}' | uniq" % port + res = client.execute_command(cmd) + if not res or not res.stdout.strip(): + return [] + return res.stdout.strip().split('\n') + + +def port_release_check(client, pid, port, count): + socket_inodes = get_port_socket_inode(client, port) + if not socket_inodes: + return True + if count < 5: + ret = client.execute_command("ls -l /proc/%s/fd/ |grep -E 'socket:\[(%s)\]'" % (pid, '|'.join(socket_inodes))) + if ret: + return not ret.stdout.strip() + else: + return not client.execute_command("ls -l /proc/%s" % pid) + return False + + +def stop(plugin_context, *args, **kwargs): + cluster_config = plugin_context.cluster_config + clients = plugin_context.clients + stdio = plugin_context.stdio + global_config = cluster_config.get_global_conf() + appname = global_config['appname'] if 'appname' in global_config else None + cluster_id = global_config['cluster_id'] if 'cluster_id' in global_config else None + obconfig_url = global_config['obconfig_url'] if 'obconfig_url' in global_config else None + stdio.start_loading('Stop observer') + if obconfig_url and appname and cluster_id: + try: + cfg_url, cleanup_config_url_content, register_to_config_url = config_url(obconfig_url, appname, cluster_id) + stdio.verbose('post %s' % cleanup_config_url_content) + response = requests.post(cleanup_config_url_content) + if response.status_code != 200: + stdio.warn('%s status code %s' % (cleanup_config_url_content, response.status_code)) +
except: + stdio.warn('failed to clean up the configuration url content') + servers = {} + for server in cluster_config.servers: + server_config = cluster_config.get_server_conf(server) + client = clients[server] + if 'home_path' not in server_config: + stdio.verbose('%s home_path is empty', server) + continue + remote_pid_path = '%s/run/observer.pid' % server_config['home_path'] + remote_pid = client.execute_command('cat %s' % remote_pid_path).stdout.strip() + if remote_pid and client.execute_command('ps uax | egrep " %s " | grep -v grep' % remote_pid): + stdio.verbose('%s observer[pid:%s] stopping ...' % (server, remote_pid)) + client.execute_command('kill -9 %s' % (remote_pid)) + servers[server] = { + 'client': client, + 'mysql_port': server_config['mysql_port'], + 'rpc_port': server_config['rpc_port'], + 'pid': remote_pid, + 'path': remote_pid_path + } + else: + stdio.verbose('%s observer is not running ...' % server) + count = 30 + time.sleep(1) + while count and servers: + tmp_servers = {} + for server in servers: + data = servers[server] + client = clients[server] + stdio.verbose('%s check whether the port is released' % server) + for key in ['rpc_port', 'mysql_port']: + if data[key] and not port_release_check(data['client'], data['pid'], data[key], count): + tmp_servers[server] = data + break + data[key] = '' + else: + client.execute_command('rm -f %s' % (data['path'])) + stdio.verbose('%s observer is stopped', server) + servers = tmp_servers + count -= 1 + if count and servers: + if count == 5: + for server in servers: + data = servers[server] + server_config = cluster_config.get_server_conf(server) + client = clients[server] + client.execute_command( + "if [[ -d /proc/%s ]]; then pkill -9 -u `whoami` -f '%s/bin/observer -p %s';fi" % + (data['pid'], server_config['home_path'], server_config['mysql_port'])) + time.sleep(3) + + if servers: + stdio.stop_loading('fail') + for server in servers: + stdio.warn('%s port not released', server) + else: + stdio.stop_loading('succeed') + + stdio.start_loading('Stop obshell') + for server in cluster_config.servers: + client = clients[server] + server_config = cluster_config.get_server_conf(server) + stdio.verbose('%s obshell stopping ...' % (server)) + home_path = server_config['home_path'] + cmd = 'cd %s; %s/bin/obshell admin stop'%(home_path, home_path) + if not client.execute_command(cmd): + stdio.stop_loading('fail') + return + # check obshell is stopped + remote_pid_path = '%s/run/obshell.pid' % home_path + remote_pid = client.execute_command('cat %s' % remote_pid_path).stdout.strip() + if remote_pid and client.execute_command('ps uax | egrep " %s " | grep -v grep' % remote_pid): + stdio.stop_loading('fail') + return + remote_pid_path = '%s/run/daemon.pid' % home_path + remote_pid = client.execute_command('cat %s' % remote_pid_path).stdout.strip() + if remote_pid and client.execute_command('ps uax | egrep " %s " | grep -v grep' % remote_pid): + stdio.stop_loading('fail') + return + stdio.stop_loading('succeed') + + plugin_context.return_true() \ No newline at end of file diff --git a/plugins/oceanbase/4.2.1.4/takeover.py b/plugins/oceanbase/4.2.1.4/takeover.py new file mode 100644 index 0000000..5e36a1f --- /dev/null +++ b/plugins/oceanbase/4.2.1.4/takeover.py @@ -0,0 +1,275 @@ +# coding: utf-8 +# OceanBase Deploy. +# Copyright (C) 2021 OceanBase +# +# This file is part of OceanBase Deploy. 
+# +# OceanBase Deploy is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# OceanBase Deploy is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with OceanBase Deploy. If not, see . + + +from __future__ import absolute_import, division, print_function + +import getpass +import os +from collections import defaultdict + +import yaml +from _mirror import get_use_centos_release +from _deploy import DeployStatus, DeployConfigStatus +from ssh import LocalClient +from _types import CapacityMB + + +stdio = None + + +def dump_yaml(config, config_yaml_path): + os.makedirs(os.path.dirname(config_yaml_path), exist_ok=True) + try: + with open(config_yaml_path, 'w') as f: + f.write(yaml.dump(dict(config), sort_keys=False)) + f.flush() + return True + except Exception as e: + stdio.verbose(e) + stdio.error('dump deploy info to %s failed' % config_yaml_path) + return False + + +def get_global_key_value(ips_data): + key_values_map = {} + server_num = len(ips_data) + for data in ips_data.values(): + for k, v in data.items(): + if k not in key_values_map: + key_values_map[k] = [v, 1] + elif key_values_map[k][0] == v: + key_values_map[k][1] += 1 + common_key_values = {k: v[0] for k, v in key_values_map.items() if v[1] == server_num} + return common_key_values + + +def exec_sql(sql, cursor, args=None, exec_type='fetchone', raise_exception=False, exc_level='error'): + if exec_type == 'fetchall': + return cursor.fetchall(sql, args=args, raise_exception=raise_exception, exc_level=exc_level) if cursor else False + elif exec_type == 'fetchone': + return cursor.fetchone(sql, args=args, raise_exception=raise_exception, exc_level=exc_level) if cursor else False + else: + return False + + +def format_server(ip, port): + return '{}_{}'.format(ip, port) + + +def takeover(plugin_context, user_config={}, name='', obd_home='', *args, **kwargs): + def get_option(key, default=''): + value = getattr(options, key, default) + if not value: + value = default + return value + + def error(msg): + stdio.error(msg) + stdio.stop_loading('fail') + return plugin_context.return_false() + + options = plugin_context.options + stdio = plugin_context.stdio + stdio.start_loading('Takeover precheck') + clients = plugin_context.clients + cursor = kwargs.get('cursor') + available_sql = "show databases" + available = exec_sql(available_sql, cursor, exec_type='fetchone', raise_exception=False) + if not available: + stdio.error('The current OceanBase does not support takeover, the database is not available.') + stdio.stop_loading('fail') + return plugin_context.return_false() + + check_ocs_sql = "show databases like 'ocs'" + ocs = exec_sql(check_ocs_sql, cursor, exec_type='fetchone', raise_exception=True) + if not ocs: + stdio.error('The current OceanBase does not support takeover, OCS is not installed.') + stdio.stop_loading('fail') + return plugin_context.return_false() + + # check architecture + check_ocs_sql = "select count(DISTINCT(architecture)) as count from ocs.all_agent;" + count = exec_sql(check_ocs_sql, cursor, exec_type='fetchone', raise_exception=True) + if not count or count['count'] > 1: + 
stdio.error('The current OceanBase does not support takeover, the architecture of the server is inconsistent.') + stdio.stop_loading('fail') + return plugin_context.return_false() + + # query all server hosts + query_server_sql = "select ob.SVR_IP as ip, ob.SVR_PORT as rpc_port, ob.SQL_PORT as mysql_port, ob.ZONE as zone, ob.STATUS as status, ob.BUILD_VERSION as version, ocs.port as obshell_port, ocs.home_path as home_path from oceanbase.DBA_OB_SERVERS as ob left join ocs.all_agent as ocs on ob.SVR_IP=ocs.ip and ob.SQL_PORT=ocs.mysql_port" + servers = exec_sql(query_server_sql, cursor, exec_type='fetchall', raise_exception=True) + version = None + release = None + dict_servers = {} + bin_is_symbolic = False + + os_release_cmd = '''cat /etc/os-release | grep '^VERSION_ID=' | awk -F '=' '{print $2}' | sed 's/"//g' | awk -F '.' '{print $1}' ''' + ret = LocalClient.execute_command(os_release_cmd, stdio=stdio) + if not ret: + error('Failed to get os version') + os_release, _ = get_use_centos_release() + for server in servers: + if server['status'] != 'ACTIVE': + return error('Server %s:%s is not active' % (server['ip'], server['mysql_port'])) + _version = server['version'].split('-')[0].split('_')[0] + _release = server['version'].split('-')[0].split('_')[1] + if version is None: + version = _version + release = '{}.el{}'.format(_release, os_release) + else: + if version != _version or release != '{}.el{}'.format(_release, os_release): + return error('Server %s:%s version is not match' % (server['ip'], server['mysql_port'])) + + home_path = server['home_path'] + for client in clients.values(): + owner = client.execute_command("ls -ld %s/etc | awk '{print $3}'" % home_path).stdout.strip() + if owner != client.config.username: + return error('Server {}:{} owner is not match. 
The SSH user for takeover does not match the owner that OceanBase is running under, SSH user: {}, expected: {}.'.format(server['ip'], server['mysql_port'], ssh_user, owner)) + bin_is_symbolic = client.execute_command('''[ -L "%s/bin/observer" ]''' % home_path).code == 0 + break + + ip = server['ip'] + rpc_port = server['rpc_port'] + del server['status'] + del server['version'] + del server['ip'] + + dict_servers[format_server(ip, rpc_port)] = server + stdio.stop_loading('succeed') + + stdio.start_loading('Generate config file') + + bool_parameter = { + 'enable_syslog_recycle': False, + 'enable_syslog_wf': True + } + int_parameter = { + 'log_disk_percentage': 0, + 'datafile_disk_percentage': 0, + 'cpu_count': 0, + 'max_syslog_file_count': 0, + 'memory_limit_percentage': 80, + 'cluster_id': 0 + } + capacity_parameter = { + 'memory_limit': 0, + 'system_memory': 0, + 'log_disk_size': 0, + 'datafile_maxsize': 0, + 'datafile_size': 0, + 'datafile_next': 0 + } + str_parameter = ['data_dir', 'cluster', 'devname'] + + default_config = {} + parameter_keys = [] + str_parameter + for parameters in [bool_parameter, int_parameter, capacity_parameter]: + parameter_keys += list(parameters.keys()) + default_config.update(parameters) + + query_parameter_sql = "show parameters where name in %s" + parameters = exec_sql(query_parameter_sql, cursor, args=[parameter_keys], exec_type='fetchall', raise_exception=True) + for parameter in parameters: + key = parameter['name'] + default = '' + if key in int_parameter: + parameter['value'] = int(parameter['value']) + default = int_parameter[key] + elif key in bool_parameter: + parameter['value'] = bool(parameter['value']) + default = bool_parameter[key] + elif key in capacity_parameter: + parameter['value'] = CapacityMB(parameter['value']).value + default = CapacityMB(capacity_parameter[key]).value + + if parameter['value'] == default: + continue + + server = format_server(parameter['svr_ip'], parameter['svr_port']) + dict_servers[server][key] = parameter['value'] + + PRO_MEMORY_MIN = 16 << 30 + for server in dict_servers: + config = dict_servers[server] + if 'memory_limit' in config: + dict_servers[server]['production_mode'] = CapacityMB(config['memory_limit']).btyes >= PRO_MEMORY_MIN + if 'cluster' in config: + config['appname'] = config['cluster'] + del config['cluster'] + + config = defaultdict(dict) + servers = [] + global_config = get_global_key_value(dict_servers) + if user_config: + config['user'] = user_config + config['oceanbase-ce'] = { + 'version': version, + 'release': release, + 'servers': servers, + 'global': global_config + } + global_config['root_password'] = get_option('root_password', '') + + count = 1 + for server_ip_rpc_port, server_value in dict_servers.items(): + server_ip = server_ip_rpc_port.split('_')[0] + server = dict() + server['name'] = 'server{}'.format(count) + server['ip'] = server_ip + servers.append(server) + server_config = dict() + for key, value in server_value.items(): + if key not in global_config.keys(): + server_config[key] = value + if server_config: + config['oceanbase-ce']['server{}'.format(count)] = server_config + count += 1 + stdio.verbose('dump config to file') + config_yaml_path = '{}/cluster/{}/config.yaml'.format(obd_home, name) + if not dump_yaml(config, config_yaml_path): + return error('dump config to file failed') + + # dump .data file + oceanbase_ce = dict() + oceanbase_ce['version'] = version + oceanbase_ce['release'] = release + data = dict() + data['name'] = name + data['components'] = {'oceanbase-ce': 
oceanbase_ce} + data['status'] = DeployStatus.STATUS_CONFIGURED.name + data['config_status'] = DeployConfigStatus.UNCHNAGE.name + data_file_path = '{}/cluster/{}/.data'.format(obd_home, name) + if not dump_yaml(data, data_file_path): + LocalClient.execute_command('rm -rf {}'.format(config_yaml_path)) + return error('dump .data file failed') + + # dump inner_config.yaml + inner_config = dict() + inner_config['oceanbase-ce'] = dict() + for i in range(1, count): + inner_config['oceanbase-ce']['servers{}'.format(i)] = dict() + inner_config['$_deploy_install_mode'] = 'ln' if bin_is_symbolic else 'cp' + inner_config_path = '{}/cluster/{}/inner_config.yaml'.format(obd_home, name) + if not dump_yaml(inner_config, inner_config_path): + LocalClient.execute_command('rm -rf {} {}'.format(config_yaml_path, data_file_path)) + return error('dump inner_config.yaml failed') + + stdio.stop_loading('succeed') + plugin_context.return_true() diff --git a/plugins/oceanbase/4.2.1.4/upgrade.py b/plugins/oceanbase/4.2.1.4/upgrade.py new file mode 100644 index 0000000..1ecd8a9 --- /dev/null +++ b/plugins/oceanbase/4.2.1.4/upgrade.py @@ -0,0 +1,621 @@ +# coding: utf-8 +# OceanBase Deploy. +# Copyright (C) 2021 OceanBase +# +# This file is part of OceanBase Deploy. +# +# OceanBase Deploy is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# OceanBase Deploy is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with OceanBase Deploy. If not, see . 
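+# Overview of the upgrade flow implemented below (descriptive comment only): for every node on the
+# upgrade route, the Upgrader runs, in order, backup_config, disable_ddl_and_check,
+# exec_upgrade_checker, exec_upgrade_pre, exec_upgrade_health_checker, upgrade_zone,
+# exec_upgrade_post, restore_config and take_over. upgrade_zone performs a rolling,
+# zone-by-zone upgrade when the cluster has more than two zones, and upgrades all
+# servers at once otherwise.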
+ + +from __future__ import absolute_import, division, print_function + +import os +import time +from collections import defaultdict + +import tool +from _rpm import Version +from ssh import LocalClient + + +class Exector(object): + + def __init__(self, tmp_prefix, host, port, user, pwd, exector_path, stdio, script_query_timeout=''): + self.tmp_prefix = tmp_prefix + self._host = host + self._port = port + self._user = user + self._pwd = pwd + self._cmd = None + self.stdio = stdio + self._exector = os.path.join(exector_path, 'executer27/bin/executer') + self.script_query_timeout = script_query_timeout + + @property + def host(self): + return self._host + + @property + def port(self): + return self._port + + @property + def user(self): + return self._user + + @property + def pwd(self): + return self._pwd + + @property + def exector(self): + return self._exector + + @property + def cmd(self): + if self._cmd is None: + self._cmd = '%s %%s -h %s -P %s -u %s %s' % (self._exector, self.host, self.port, self.user, "-p '%s'" % self.pwd if self.pwd else '') + return self._cmd + + @host.setter + def host(self, value): + self._host = value + self._cmd = None + + @port.setter + def port(self, value): + self._port = value + self._cmd = None + + @user.setter + def user(self, value): + self._user = value + self._cmd = None + + @pwd.setter + def pwd(self, value): + self._pwd = value + self._cmd = None + + @pwd.setter + def exector(self, exector_path): + self._exector = os.path.join(exector_path, 'bin/executer27') + self._cmd = None + + def create_temp(self, repository, direct_upgrade=False): + tmp_path = os.path.join('/tmp', self.tmp_prefix, repository.md5) + if not os.path.exists(tmp_path): + relative_dir = 'etc/direct_upgrade' if direct_upgrade else 'etc' + script_dir = os.path.join(repository.repository_dir, relative_dir) + LocalClient.put_dir(script_dir, tmp_path) + return tmp_path + + def clear_temp(self): + tmp_path = os.path.join('/tmp', self.tmp_prefix) + tool.DirectoryUtil.rm(tmp_path) + + def exec_script(self, name, repository, direct_upgrade=False, can_skip=False, param=''): + script_dir = self.create_temp(repository, direct_upgrade) + path = os.path.join(script_dir, name) + self.stdio.verbose('exec %s %s' % (repository, name)) + if os.path.exists(path): + cmd = '{} {} {}'.format(self.cmd.replace('%s', path, 1), param, '-t {}'.format(self.script_query_timeout) if self.script_query_timeout else '') + self.stdio.start_loading('Exec %s %s' % (repository, name)) + if LocalClient.execute_command(cmd, stdio=self.stdio): + self.stdio.stop_loading('succeed') + return True + else: + self.stdio.stop_loading('fail') + return False + else: + if can_skip: + self.stdio.print('skip %s %s' % (repository, name)) + return True + else: + self.stdio.error('No such file: %s' % path) + return False + + +class Upgrader(object): + + def __init__(self, plugin_context, search_py_script_plugin, apply_param_plugin, upgrade_ctx, upgrade_repositories, + local_home_path, exector_path, install_repository_to_servers, unuse_lib_repository, script_query_timeout): + self._search_py_script_plugin = search_py_script_plugin + self.apply_param_plugin = apply_param_plugin + self.plugin_context = plugin_context + self.components = plugin_context.components + self.clients = plugin_context.clients + self.cluster_config = plugin_context.cluster_config + self.stdio = plugin_context.stdio + self._connect_plugin = None + self._start_plugin = None + self._stop_plugin = None + self._bootstrap_plugin = None + self._display_plugin = None + 
self._start_check_plugin = None + self.install_repository_to_servers = install_repository_to_servers + self.unuse_lib_repository = unuse_lib_repository + self.local_home_path = local_home_path + self.exector_path = exector_path + self.components = plugin_context.components + self.exector = None + self.db = None + self.cursor = None + self.ocs_cursor = None + self.repositories = upgrade_repositories + self.upgrade_ctx = upgrade_ctx + self.route = upgrade_ctx.get('route') + self.route_index = upgrade_ctx.get('index') + self.process_index = upgrade_ctx.get('process_index', 0) + self.process_route_index = upgrade_ctx.get('process_route_index', self.route_index) + self.backup_param = upgrade_ctx.get('backup_param', None) + self.script_query_timeout = script_query_timeout + self.has_obshell = upgrade_repositories[0].version >= Version('4.2.2.0') + + self.process = [ + self.backup_config, + self.disable_ddl_and_check, + self.exec_upgrade_checker, + self.exec_upgrade_pre, + self.exec_upgrade_health_checker, + self.upgrade_zone, + self.exec_upgrade_post, + self.restore_config, + self.take_over + ] + self.process_total = len(self.process) + key = [] + for server in self.cluster_config.servers: + config = self.cluster_config.get_server_conf_with_default(server) + port = config.get('rpc_port') + key.append('%s:%s' % (server.ip, port)) + self.tmp_prefix = '_'.join(key) + + def search_py_script_plugin(self, index, name): + repository = self.repositories[index] + return self._search_py_script_plugin([repository], name)[repository] + + @property + def connect_plugin(self): + if self._connect_plugin is None: + self._connect_plugin = self.search_py_script_plugin(self.route_index - 1, 'connect') + return self._connect_plugin + + @property + def start_plugin(self): + if self._start_plugin is None: + self._start_plugin = self.search_py_script_plugin(self.next_stage, 'start') + return self._start_plugin + + @property + def start_check_plugin(self): + if self._start_check_plugin is None: + self._start_check_plugin = self.search_py_script_plugin(self.next_stage, 'start_check') + return self._start_check_plugin + + @property + def stop_plugin(self): + if self._stop_plugin is None: + self._stop_plugin = self.search_py_script_plugin(self.route_index - 1, 'stop') + return self._stop_plugin + + @property + def bootstrap_plugin(self): + if self._bootstrap_plugin is None: + self._bootstrap_plugin = self.search_py_script_plugin(self.next_stage, 'bootstrap') + return self._bootstrap_plugin + + @property + def display_plugin(self): + if self._display_plugin is None: + self._display_plugin = self.search_py_script_plugin(self.route_index - 1, 'display') + return self._display_plugin + + def _clear_plugin(self): + self._connect_plugin = None + self._start_plugin = None + self._stop_plugin = None + self._display_plugin = None + + def call_plugin(self, plugin, *args, **kwargs): + return plugin(self.plugin_context.namespace, self.plugin_context.namespaces, self.plugin_context.deploy_name, self.plugin_context.deploy_status, + self.plugin_context.repositories, self.plugin_context.components, self.plugin_context.clients, + self.plugin_context.cluster_config, self.plugin_context.cmds, self.plugin_context.options, + self.plugin_context.stdio.sub_io(), *args, **kwargs) + + def run(self): + total = len(self.route) + self.apply_param_plugin(self.repositories[self.route_index - 1]) + while self.route_index < total: + start_plugin = self.search_py_script_plugin(self.route_index - 1, 'start') + self.call_plugin(start_plugin, 
start_obshell=self.has_obshell) + self.close() + if not self.connect(): + return False + self.stdio.verbose('upgrade %s to %s' % (self.repositories[self.route_index], self.repositories[self.next_stage])) + while self.process_index < self.process_total: + try: + if not self.process[self.process_index](): + self._dump() + return False + self.process_index += 1 + self.process_route_index = self.route_index + except Exception as e: + self._dump() + self.stdio.exception(str(e)) + return False + self.process_index = 0 + self.route_index = self.next_stage + 1 + self.exector.clear_temp() + self.stdio.verbose('set route index from %s to %s' % (self.route_index, self.next_stage + 1)) + break + self._dump() + return True + + def _dump(self): + self.upgrade_ctx['route'] = self.route + self.upgrade_ctx['index'] = self.route_index + self.upgrade_ctx['process_index'] = self.process_index + self.upgrade_ctx['process_route_index'] = self.process_route_index + self.upgrade_ctx['backup_param'] = self.backup_param + + def close(self): + if self.cursor: + self.cursor.close() + self.cursor = None + self.ocs_cursor = None + self.db = None + self.exector = None + + def connect(self): + if self.cursor is None or self.execute_sql('show tables', error=False) is False: + ret = self.call_plugin(self.connect_plugin) + if not ret: + return False + if self.cursor: + self.close() + self.cursor = ret.get_return('cursor') + self.db = ret.get_return('connect') + self.ocs_cursor = ret.get_return('ocs_cursor') + while self.execute_sql('use oceanbase', error=False) is False: + time.sleep(2) + self.execute_sql('set session ob_query_timeout=1000000000') + server = ret.get_return('server') + host = server.ip + port = self.db.port + user = 'root' + pwd = self.cluster_config.get_global_conf().get('root_password', '') + self.exector = Exector(self.tmp_prefix, host, port, user, pwd if pwd is not None else '', self.exector_path, self.stdio,self.script_query_timeout) + return True + + def execute_sql(self, query, args=None, one=True, error=True): + exc_level = 'error' if error else 'verbose' + if one: + result = self.cursor.fetchone(query, args, exc_level=exc_level) + else: + result = self.cursor.fetchall(query, args, exc_level=exc_level) + result and self.stdio.verbose(result) + return result + + @property + def next_stage(self): + next_stage = self.route_index + total = len(self.route) - 1 + while next_stage < total: + node = self.route[next_stage] + if node.get('require_from_binary'): + break + next_stage += 1 + return next_stage + + def _exec_script_dest_only(self, name, can_skip=True, param=''): + self.stdio.start_loading('Exec %s' % name) + next_stage = self.next_stage + repository = self.repositories[next_stage] + self.stdio.verbose('exec %s %s' % (repository, name)) + if not self.exector.exec_script(name, repository, direct_upgrade=self.route[next_stage].get('direct_upgrade'), can_skip=can_skip, param=param): + return False + self.stdio.stop_loading('succeed') + return True + + def _exec_script_all_repositories(self, name, can_skip=False): + self.stdio.start_loading('Exec %s' % name) + next_stage = self.next_stage + cur_repository = self.repositories[self.route_index - 1] + while self.process_route_index <= next_stage: + repository = self.repositories[self.process_route_index] + if cur_repository.version == repository.version: + self.stdio.verbose('skip %s %s' % (repository, name)) + else: + self.stdio.verbose('exec %s %s' % (repository, name)) + if not self.exector.exec_script(name, repository, 
direct_upgrade=self.route[self.process_route_index].get('direct_upgrade'), can_skip=can_skip): + self.stdio.stop_loading('fail') + return False + self.process_route_index += 1 + self.stdio.stop_loading('succeed') + return True + + def execute_upgrade_sql(self, query, args=None, one=True): + if self.execute_sql(query, args, one) is False: + return False + self.process_route_index = self.route_index + return True + + def exec_upgrade_checker(self): + return self._exec_script_dest_only('upgrade_checker.py') + + def exec_upgrade_health_checker(self): + return self._exec_script_dest_only('upgrade_health_checker.py') + + def exec_upgrade_health_checker_zone(self, zone): + return self._exec_script_dest_only('upgrade_health_checker.py', param="-z '{}'".format(zone)) + + def exec_upgrade_pre(self): + return self._exec_script_all_repositories('upgrade_pre.py') + + def disable_ddl_and_check(self): + if self.repositories[self.route_index - 1].version == Version('4.0.0.0'): + self.stdio.start_loading('Disable DDL') + while True: + # check ddl end + while self.execute_sql("select task_id from __all_virtual_ddl_task_status", error=True): + time.sleep(3) + # close ddl + if self.execute_sql('alter system set enable_ddl = false') is False: + self.stdio.stop_loading('fail') + return False + while self.execute_sql("select * from __all_virtual_sys_parameter_stat where name = 'enable_ddl' and value != 'false'"): + time.sleep(3) + + # check ddl end + if self.execute_sql("select task_id from __all_virtual_ddl_task_status", error=True): + if not self.execute_sql('alter system set enable_ddl = true'): + self.stdio.stop_loading('fail') + continue + break + + # check clog + self.stdio.verbose('wait clog sync') + rets = self.execute_sql("select tenant_id, ls_id, max(max_scn) as max_scn from gv$ob_log_stat group by tenant_id, ls_id", one=False, error=True) + if rets is not None: + for ret in rets: + while self.execute_sql("select unsubmitted_log_scn from __all_virtual_replay_stat where tenant_id = %s and ls_id = %s and role != 'leader' and unsubmitted_log_scn <= %s" % (ret['tenant_id'], ret['ls_id'], ret['max_scn']), error=True): + time.sleep(3) + + # major freeze + # 1. wait all tenant global_broadcast_scn = last_scn, record tenant_id, global_broadcast_scn + pre_tenant_scn_dict = {} + tenant_ids = [] + for tenant_info in self.execute_sql("select tenant_id from CDB_OB_MAJOR_COMPACTION", one=False): + tenant_ids.append(tenant_info['tenant_id']) + while tenant_ids: + pre_tenant_scn_list = self.execute_sql("select tenant_id, global_broadcast_scn, last_scn from CDB_OB_MAJOR_COMPACTION where tenant_id in ({})".format(",".join([str(x) for x in tenant_ids])), one=False) + tenant_ids = [] + for pre_tenant_scn in pre_tenant_scn_list: + if pre_tenant_scn['global_broadcast_scn'] > pre_tenant_scn['last_scn']: + tenant_ids.append(pre_tenant_scn['tenant_id']) + continue + pre_tenant_scn_dict[pre_tenant_scn['tenant_id']] = pre_tenant_scn['global_broadcast_scn'] + time.sleep(1) + + # 2. begin merge + self.execute_sql("alter system major freeze tenant = all", error=False) + + # 3. 
wait merge start + tenant_ids = pre_tenant_scn_dict.keys() + while tenant_ids: + tenant_scn_list = self.execute_sql("select tenant_id, global_broadcast_scn from CDB_OB_MAJOR_COMPACTION where tenant_id in ({})".format(",".join([str(x) for x in tenant_ids])), one=False) + tenant_ids = [] + for tenant_scn in tenant_scn_list: + if pre_tenant_scn_dict[tenant_scn['tenant_id']] >= tenant_scn['global_broadcast_scn']: + tenant_ids.append(tenant_scn['tenant_id']) + continue + time.sleep(3) + + # 4. wait merge finish + while self.execute_sql("select * from CDB_OB_MAJOR_COMPACTION where global_broadcast_scn > last_scn"): + time.sleep(3) + + self.stdio.stop_loading('succeed') + + return True + + def broken_sql(self, sql, sleep_time=3): + while True: + ret = self.execute_sql(sql, error=False) + if ret is None: + break + time.sleep(sleep_time) + + def wait(self): + if not self.connect(): + return False + self.stdio.verbose('server cneck') + self.broken_sql("select * from oceanbase.DBA_OB_SERVERS where STATUS != 'ACTIVE' or STOP_TIME is not NULL or START_SERVICE_TIME is NULL") + self.broken_sql("select * from GV$OB_LOG_STAT where in_sync = 'NO'") + return True + + def start_zone(self, zone=None): + if not self.connect(): + return False + if zone: + self.stdio.verbose('start zone %s' % zone) + start_sql = "alter system start zone %s" % zone + check_sql = "select * from oceanbase.__all_zone where name = 'status' and zone = '%s' and info != 'ACTIVE'" % zone + while True: + if self.execute_sql(start_sql, error=False) is None: + break + if self.execute_sql(check_sql, error=False) is None: + break + time.sleep(3) + self.wait() + return True + + def stop_zone(self, zone): + if not self.wait(): + return False + + self.stdio.verbose('stop zone %s' % zone) + stop_sql = "alter system stop zone %s" % zone + + if self.execute_sql(stop_sql, error=False) is None: + return True + + def upgrade_zone(self): + zones_servers = {} + for server in self.cluster_config.servers: + config = self.cluster_config.get_server_conf_with_default(server) + zone = config['zone'] + if zone not in zones_servers: + zones_servers[zone] = [] + zones_servers[zone].append(server) + servers = self.cluster_config.servers + try: + if len(zones_servers) > 2: + ret = self.rolling_upgrade(zones_servers) + else: + ret = self.un_rolling_upgrade() + if ret: + self._clear_plugin() + return True + return False + except Exception as e: + self.stdio.exception('Run Exception: %s' % e) + return False + finally: + self.cluster_config.servers = servers + + def un_rolling_upgrade(self): + self.stdio.start_loading('Upgrade') + repository = self.repositories[self.next_stage] + repository_dir = repository.repository_dir + self.install_repository_to_servers(self.components, self.cluster_config, repository, self.clients, + self.unuse_lib_repository) + + if not self.call_plugin(self.stop_plugin): + self.stdio.stop_loading('stop_loading', 'fail') + return False + + self.apply_param_plugin(repository) + if not self.call_plugin(self.start_plugin, start_obshell=self.has_obshell, local_home_path=self.local_home_path, repository_dir=repository_dir): + self.stdio.stop_loading('stop_loading', 'fail') + return False + self.close() + self.wait() + self.stdio.stop_loading('succeed') + return True + + def rolling_upgrade(self, zones_servers): + self.stdio.start_loading('Rotation upgrade') + all_servers = self.cluster_config.servers + repository = self.repositories[self.next_stage] + repository_dir = repository.repository_dir + pre_zone = None + for zone in zones_servers: + 
self.cluster_config.servers = zones_servers[zone] + if not self.start_zone(pre_zone): + self.stdio.stop_loading('stop_loading', 'fail') + return False + + self.stop_zone(zone) + + self.stdio.print('upgrade zone "%s"' % zone) + self.install_repository_to_servers(self.components, self.cluster_config, repository, self.clients, self.unuse_lib_repository) + + + if pre_zone: + self.apply_param_plugin(self.repositories[self.route_index - 1]) + if not self.call_plugin(self.stop_plugin): + self.stdio.stop_loading('stop_loading', 'fail') + return False + + self.apply_param_plugin(repository) + if not self.call_plugin(self.start_plugin, start_obshell=self.has_obshell, local_home_path=self.local_home_path, repository_dir=repository_dir): + self.stdio.stop_loading('stop_loading', 'fail') + return False + + self.wait() + if not self.exec_upgrade_health_checker_zone(zone): + return False + self.close() + pre_zone = zone + + if not self.start_zone(pre_zone): + self.stdio.stop_loading('stop_loading', 'fail') + return False + self.stdio.stop_loading('succeed') + return True + + def exec_upgrade_post(self): + return self._exec_script_all_repositories('upgrade_post.py') + + def backup_config(self): + keys = ("server_permanent_offline_time", "enable_rebalance", "enable_rereplication") + query_sql = 'select SVR_IP,SVR_PORT,NAME,VALUE from oceanbase.GV$OB_PARAMETERS where name in %s ' + ret = self.execute_sql(query_sql, [keys, ], one=False, error=False) + if ret is not None: + backup_param = defaultdict(dict) + for config in ret: + backup_param['{}:{}'.format(config['SVR_IP'], config['SVR_PORT'])].update({config['NAME']: config['VALUE']}) + self.backup_param = backup_param + else: + return False + + return True + + def restore_config(self): + dict_backup_param = dict(self.backup_param) + for server_port in dict_backup_param: + for key in dict_backup_param[server_port]: + alter_sql = 'alter system set {} = %s server %s'.format(key) + if self.execute_sql(alter_sql, [dict_backup_param[server_port][key], server_port], error=False) is False: + return False + return True + + def take_over(self): + if self.route_index < len(self.route) - 1: + return True + repository = self.repositories[self.route_index] + self.apply_param_plugin(repository) + if not self.call_plugin(self.start_check_plugin, source_option='upgrade'): + return False + + self.close() + self._connect_plugin = self.search_py_script_plugin(self.route_index, 'connect') + return self.connect() and self.call_plugin(self.bootstrap_plugin, need_bootstrap=False) + + +def upgrade(plugin_context, search_py_script_plugin, apply_param_plugin, install_repository_to_servers, unuse_lib_repository, *args, **kwargs): + + upgrade_ctx = kwargs.get('upgrade_ctx') + local_home_path = kwargs.get('local_home_path') + upgrade_repositories = kwargs.get('upgrade_repositories') + script_query_timeout = kwargs.get('script_query_timeout') + exector_path = getattr(plugin_context.options, 'executer_path', '/usr/obd/lib/executer') + + upgrader = Upgrader( + plugin_context=plugin_context, + search_py_script_plugin=search_py_script_plugin, + apply_param_plugin=apply_param_plugin, + upgrade_ctx=upgrade_ctx, + upgrade_repositories=upgrade_repositories, + local_home_path=local_home_path, + exector_path=exector_path, + install_repository_to_servers=install_repository_to_servers, + unuse_lib_repository=unuse_lib_repository, + script_query_timeout=script_query_timeout) + if upgrader.run(): + if upgrader.route_index >= len(upgrader.route): + upgrader.call_plugin(upgrader.display_plugin, 
upgrader.cursor, *args, **kwargs) + plugin_context.return_true() diff --git a/plugins/oceanbase/4.2.2.0/bootstrap.py b/plugins/oceanbase/4.2.2.0/bootstrap.py index 9f7bf63..9ff8806 100644 --- a/plugins/oceanbase/4.2.2.0/bootstrap.py +++ b/plugins/oceanbase/4.2.2.0/bootstrap.py @@ -37,7 +37,7 @@ def is_bootstrap(cursor): return int(ret.get("column_value")) > 0 -def bootstrap(plugin_context, *args, **kwargs): +def bootstrap(plugin_context, need_bootstrap=True, *args, **kwargs): cluster_config = plugin_context.cluster_config stdio = plugin_context.stdio clients = plugin_context.clients @@ -55,9 +55,9 @@ def bootstrap(plugin_context, *args, **kwargs): if added_components: stdio.verbose('bootstrap for components: %s' % added_components) - + raise_cursor = cursor.raise_cursor - if cluster_config.name in added_components: + if cluster_config.name in added_components and need_bootstrap: for server in cluster_config.servers: server_config = cluster_config.get_server_conf(server) zone = server_config['zone'] diff --git a/plugins/oceanbase/4.2.2.0/scale_out_check.py b/plugins/oceanbase/4.2.2.0/scale_out_check.py index b406186..6ccb25c 100644 --- a/plugins/oceanbase/4.2.2.0/scale_out_check.py +++ b/plugins/oceanbase/4.2.2.0/scale_out_check.py @@ -19,16 +19,20 @@ from __future__ import absolute_import, division, print_function +from const import COMP_OB, COMP_OB_CE + def add_plugin(component_name, plugins): if component_name not in plugins: plugins.append(component_name) + def scale_out_check(plugin_context, *args, **kwargs): cluster_config = plugin_context.cluster_config added_components = cluster_config.get_deploy_added_components() be_depend = cluster_config.be_depends plugins = [] + plugin_context.set_variable('need_bootstrap', False) if 'obagent' in added_components and 'obagent' in be_depend: add_plugin('generate_config', plugins) add_plugin('connect', plugins) @@ -45,6 +49,8 @@ def scale_out_check(plugin_context, *args, **kwargs): if cluster_config.added_servers: add_plugin('connect', plugins) add_plugin('bootstrap', plugins) + if (COMP_OB_CE in added_components or COMP_OB in added_components) and not cluster_config.added_servers: + plugin_context.set_variable('need_bootstrap', True) plugin_context.stdio.verbose('scale_out_check plugins: %s' % plugins) plugin_context.stdio.verbose('added_components: %s' % added_components) diff --git a/plugins/oceanbase/4.2.2.0/upgrade.py b/plugins/oceanbase/4.2.2.0/upgrade.py index b2c6479..1ecd8a9 100644 --- a/plugins/oceanbase/4.2.2.0/upgrade.py +++ b/plugins/oceanbase/4.2.2.0/upgrade.py @@ -593,7 +593,7 @@ def take_over(self): self.close() self._connect_plugin = self.search_py_script_plugin(self.route_index, 'connect') - return self.connect() and self.call_plugin(self.bootstrap_plugin) + return self.connect() and self.call_plugin(self.bootstrap_plugin, need_bootstrap=False) def upgrade(plugin_context, search_py_script_plugin, apply_param_plugin, install_repository_to_servers, unuse_lib_repository, *args, **kwargs): diff --git a/plugins/ocp-express/1.0.1/generate_config.py b/plugins/ocp-express/1.0.1/generate_config.py index 262d9e8..6696ba9 100644 --- a/plugins/ocp-express/1.0.1/generate_config.py +++ b/plugins/ocp-express/1.0.1/generate_config.py @@ -80,3 +80,6 @@ def generate_random_password(cluster_config): global_config = cluster_config.get_original_global_conf() if cluster_config.name in add_components and 'admin_passwd' not in global_config: cluster_config.update_global_conf('admin_passwd', ConfigUtil.get_random_pwd_by_rule(), False) + if 
cluster_config.name in add_components and 'ocp_root_password' not in global_config: + cluster_config.update_global_conf('ocp_root_password', ConfigUtil.get_random_pwd_by_rule(), False) + diff --git a/profile/obd.sh b/profile/obd.sh index 9edb4fe..ccba7f7 100644 --- a/profile/obd.sh +++ b/profile/obd.sh @@ -70,7 +70,7 @@ function _obd_complete_func prev=${!#} all_cmds["obd"]="mirror cluster test update repo demo web obdiag display-trace" - all_cmds["obd cluster"]="autodeploy tenant component start deploy redeploy restart reload destroy stop edit-config export-to-ocp list display upgrade chst check4ocp reinstall scale_out" + all_cmds["obd cluster"]="autodeploy tenant component start deploy redeploy restart reload destroy stop edit-config takeover export-to-ocp list display upgrade chst check4ocp reinstall scale_out" all_cmds["obd cluster *"]="_obd_reply_deploy_names" all_cmds["obd cluster tenant"]="create drop show create-standby switchover failover decouple" all_cmds["obd cluster tenant *"]="_obd_reply_deploy_names" diff --git a/rpm/ob-deploy.spec b/rpm/ob-deploy.spec index f6920c1..fe61eb8 100644 --- a/rpm/ob-deploy.spec +++ b/rpm/ob-deploy.spec @@ -133,6 +133,52 @@ echo -e 'Installation of obd finished successfully\nPlease source /etc/profile.d #/sbin/chkconfig obd on %changelog +* Fri Apr 19 2024 obd 2.8.0 + - new features: supports takeover of OceanBase-CE clusters + - new features: supports custom production_mode configuration in the web UI + - bug fixes: fixed an issue where clicking the previous step did not work as expected after a failed OCP deployment via the web GUI + - bug fixes: fixed an issue with failing to upgrade certain versions of OceanBase +* Thu Mar 28 2024 obd 2.7.0 + - new features: support for the installation of utility software (such as obclient/obdiag, etc.)
+ - new features: optimize the OCP/OCP-Express deployment experience with automatic JRE installation + - new features: adapt to OceanBase 4.3.0, supporting deployment/tenant creation by tuning through specified load types + - new features: adapt to OBLogproxy V2.0.1_BP1 + - new features: adapt to OBProxy V4.2.3, supporting the setting of global unique session_id + - new features: adapt to OCP V4.2.2 password specifications + - new features: adapt to obdiag V1.6 + - new features: enhance the capability to adapt to capacity units, such as GB, G, etc., which will be self-adapted based on the component + - new features: optimize the metadata cleanup logic when destroying OCP + - new features: optimize the resource calculation logic of components during autodeploy + - new features: enhance the pre-check capability of OCP clockdiff + - bug fixes: fixed an issue where the tenant password was not saved when deploying OCP Express + - bug fixes: fixed a deployment failure caused by the need for manual confirmation during the first SSH fingerprint authentication + - bug fixes: fixed an issue where the password was not cleared when redeploying OCP Express in specific scenarios + - bug fixes: fixed an issue where deploying OCP in certain scenarios resulted in the disk being filled up + - bug fixes: fixed an unexpected issue with IP checks when deploying OBAgent + - bug fixes: fixed an issue where the ocp_site_url was passed abnormally when installing OCP via OBD WEB +* Fri Mar 08 2024 obd 2.6.2 + - bug fixes: fixed an issue where deploying OCP Express version 1.0.1 would cause unexpected startup errors + - bug fixes: fixed an issue where unexpected parameter transmission could occur when opening additional options during graphical deployment + - bug fixes: fixed an issue where the web did not support blank password input during OCP upgrades. 
+ - bug fixes: fixed an issue where the sys tenant authentication information generated by OBDiag did not conform to expectations + - bug fixes: fixed an issue with rsync failing on the initial authentication synchronization + - enhancements: optimized the logic for pre-checking port occupancy +* Thu Feb 08 2024 obd 2.6.1 + - bug fixes: fixed an issue with abnormal upgrades in OCP Express + - bug fixes: fixed an issue with the port pre-check logic in OBShell + - bug fixes: fixed an issue with the ocp.site.url configuration item being overwritten in OCP + - bug fixes: fixed an issue with task status in OCP upgrade check + - enhancements: add the parameter validation for tenant creation + - enhancements: optimize the memory check logic in OCP + - enhancements: add confirmation logic for the destroy operation +* Thu Jan 18 2024 obd 2.6.0 + - new features: adapt to OceanBase-CE V4.2.2 with support for OBShell + - bug fixes: fixed an issue where the link to the English error code leads to incorrect redirection + - bug fixes: fixed an issue where executing restart in OCP Express stop state causes exceptions + - bug fixes: fixed an issue with OBDiag command completion + - bug fixes: fixed an issue with OBDiag offline log analysis + - bug fixes: fixed an issue with errors occurring during OBDiag slog/clog collection + - bug fixes: fixed an issue where it is not possible to log in after scaling out OBProxy nodes * Fri Dec 29 2023 obd 2.5.0 - new features: support for component changes within a deployment - new features: support for scaling up all components except the ocp-server diff --git a/ssh.py b/ssh.py index 9564d16..c3fccb7 100644 --- a/ssh.py +++ b/ssh.py @@ -566,7 +566,7 @@ def _rsync(self, source, target, stdio=None): identity_option += '-i {key_filename} '.format(key_filename=self.config.key_filename) if self.config.port: identity_option += '-p {}'.format(self.config.port) - cmd = 'yes | rsync -a -W -e "ssh {identity_option}" {source} {target}'.format( + cmd = 'yes | rsync -a -W -L -e "ssh {identity_option}" {source} {target}'.format( identity_option=identity_option, source=source, target=target @@ -688,12 +688,14 @@ def _rsync_get_file(self, local_path, remote_path, stdio=None): def _client_get_file(self, local_path, remote_path, stdio=None): try: - self.sftp.get(remote_path, local_path) - stat = self.sftp.stat(remote_path) + res = self.execute_command('realpath {}'.format(remote_path)) + remote_real_path = res.stdout.strip() + self.sftp.get(remote_real_path, local_path) + stat = self.sftp.stat(remote_real_path) os.chmod(local_path, stat.st_mode) return True except Exception as e: - stdio.exception('get %s from %s@%s:%s failed: %s' % (local_path, self.config.username, self.config.host, remote_path, e)) + stdio.exception('get %s from %s@%s:%s failed: %s' % (local_path, self.config.username, self.config.host, remote_real_path, e)) return False def get_dir(self, local_dir, remote_dir, stdio=None): @@ -734,7 +736,7 @@ def _client_get_dir(self, local_dir, remote_dir, stdio=None): has_failed = False if DirectoryUtil.mkdir(local_dir, stdio=stdio): try: - ret = self.execute_command('find %s -type f' % remote_dir) + ret = self.execute_command('find %s -type f -o -type l' % remote_dir) if not ret: stdio.verbose(ret.stderr) has_failed = True diff --git a/web/src/component/MetaDBConfig/DataBaseNodeConfig.tsx b/web/src/component/MetaDBConfig/DataBaseNodeConfig.tsx index 633a7e3..cbc3859 100644 --- a/web/src/component/MetaDBConfig/DataBaseNodeConfig.tsx +++ 
b/web/src/component/MetaDBConfig/DataBaseNodeConfig.tsx @@ -151,7 +151,7 @@ export default function DataBaseNodeConfig({ }, { validator: (_: any, value: string[]) =>{ - return serversValidator(_, value, allOBServer, 'OBServer',allZoneOBServer,finalValidate) + return serversValidator(_, value, 'OBServer') } }, ], diff --git a/web/src/component/OCPConfigNew/ServiceConfig.tsx b/web/src/component/OCPConfigNew/ServiceConfig.tsx index 88ea079..fedcb1c 100644 --- a/web/src/component/OCPConfigNew/ServiceConfig.tsx +++ b/web/src/component/OCPConfigNew/ServiceConfig.tsx @@ -8,6 +8,7 @@ import { CloseCircleFilled, } from '@ant-design/icons'; import { useEffect, useState } from 'react'; +import { useUpdateEffect } from 'ahooks'; import { useModel, getLocale } from 'umi'; import { siteReg } from '@/utils'; import styles from './index.less'; @@ -98,7 +99,7 @@ export default function ServiceConfig({ } }, [isSingleOcpNode]); - useEffect(() => { + useUpdateEffect(() => { form.setFieldsValue({ ocpserver: { home_path: diff --git a/web/src/pages/Obdeploy/NodeConfig.tsx b/web/src/pages/Obdeploy/NodeConfig.tsx index 70cbf01..6a82475 100644 --- a/web/src/pages/Obdeploy/NodeConfig.tsx +++ b/web/src/pages/Obdeploy/NodeConfig.tsx @@ -25,7 +25,7 @@ import type { } from '@ant-design/pro-components'; import { getObdInfo } from '@/services/ob-deploy-web/Info'; import useRequest from '@/utils/useRequest'; -import { handleQuit, getErrorInfo, serverReg, serversValidator,validateErrors } from '@/utils'; +import { handleQuit, getErrorInfo, serverReg, serversValidator } from '@/utils'; import { commonStyle, pathRule } from '../constants'; import { getAllServers } from '@/utils/helper'; import ServerTags from './ServerTags'; @@ -482,14 +482,7 @@ export default function NodeConfig() { }, { validator: (_: any, value: string[]) => - serversValidator( - _, - value, - allOBServer, - 'OBServer', - allZoneOBServer, - finalValidate - ), + serversValidator(_, value, 'OBServer'), }, ], }, @@ -833,7 +826,7 @@ export default function NodeConfig() { }, { validator: (_: any, value: string[]) => - serversValidator(_, value, allOBServer, 'OBProxy'), + serversValidator(_, value, 'OBProxy'), }, ]} options={formatOptions(allOBServer)} diff --git a/web/src/utils/index.tsx b/web/src/utils/index.tsx index a2d5840..8e856db 100644 --- a/web/src/utils/index.tsx +++ b/web/src/utils/index.tsx @@ -230,24 +230,6 @@ export const ocpServersValidator = (_: any, value: string[]) => { ); }; -const checkIsRepeatByAllServers = (allZoneServers: any, id: string) => { - let currentServers: string[] = [], - otherServers: string[] = []; - Object.keys(allZoneServers).forEach((key) => { - if (id === key) { - currentServers = [...allZoneServers[key]]; - } else { - otherServers = [...otherServers, ...allZoneServers[key]]; - } - }); - for (let server of currentServers) { - if (otherServers.includes(server)) { - return true; - } - } - return false; -}; - export const validateErrors = async ( errorFileds: any[], form: FormInstance, @@ -267,111 +249,36 @@ export const validateErrors = async ( } }; -const checkIp = (value: string[], type: 'OBServer' | 'OBProxy'): ResultType => { - let response: ResultType = { success: false, msg: '' }; - +export const serversValidator = (_: any, value: string[], type: string) => { + let validtor = true; if (value && value.length) { value.some((item) => { - response.success = serverReg.test(item.trim()); + validtor = serverReg.test(item.trim()); return !serverReg.test(item.trim()); }); } - if (!response.success) { - response.msg = - type === 
'OBServer' - ? intl.formatMessage({ - id: 'OBD.src.utils.EnterTheCorrectIpAddress', - defaultMessage: '请输入正确的 IP 地址', - }) - : intl.formatMessage({ - id: 'OBD.src.utils.SelectTheCorrectObproxyNode', - defaultMessage: '请选择正确的 OBProxy 节点', - }); - } - return response; -}; - -const checkIsRepeatByPreServers = ( - preAllServers: string[], - inputServer: string, -) => { - if (preAllServers.includes(inputServer)) { - return true; + if (validtor) { + return Promise.resolve(); } - return false; -}; - -const checkRepeat = ( - finalValidate, - allZoneServers, - id, - inputServer, - preAllServers, - type, -) => { - let response: ResultType = { msg: '', success: true }; - if (type === 'OBProxy') return response; - if (finalValidate.current) { - response.success = !checkIsRepeatByAllServers(allZoneServers, id); + if (type === 'OBServer') { + return Promise.reject( + new Error( + intl.formatMessage({ + id: 'OBD.pages.components.NodeConfig.EnterTheCorrectIpAddress', + defaultMessage: '请输入正确的 IP 地址', + }), + ), + ); } else { - response.success = !checkIsRepeatByPreServers(preAllServers, inputServer); - } - if (!response.success) { - response.msg = intl.formatMessage({ - id: 'OBD.src.utils.DoNotEnterDuplicateNodes', - defaultMessage: '禁止输入重复节点', - }); - } - return response; -}; - -type ResultType = { - success: boolean; - msg: string; -}; - -const resultHandlePipeline = (...results: ResultType[]): ResultType => { - for (let result of results) { - if (!result.success) { - return result; - } + return Promise.reject( + new Error( + intl.formatMessage({ + id: 'OBD.pages.components.NodeConfig.SelectTheCorrectObproxyNode', + defaultMessage: '请选择正确的 OBProxy 节点', + }), + ), + ); } - return { - success: true, - msg: '', - }; -}; - -export const serversValidator = ( - _: any, - value: string[], - preAllServers: string[], - type: 'OBServer' | 'OBProxy', - allZoneServers?: any, - finalValidate?: any, -) => { - let result: ResultType = { - success: false, - msg: '', - }, - inputServer = value[value.length - 1]; - let id = _.field?.split('.')[0]; - - result = resultHandlePipeline( - checkIp(value, type), - checkRepeat( - finalValidate, - allZoneServers, - id, - inputServer, - preAllServers, - type, - ), - ); - - if (!result.success) return Promise.reject(new Error(result.msg)); - - return Promise.resolve(); }; export function generateRandomPassword() {