From cc985cf1cdf1844b2cd26a00502529233a89e3ef Mon Sep 17 00:00:00 2001
From: Mehdi Bendriss
Date: Sat, 10 Dec 2022 01:05:39 +0100
Subject: [PATCH] [DPE-1033] Role management base implementation (#21) (#22)

Role management base implementation
---
 lib/charms/opensearch/v0/helper_cluster.py    | 58 ++++++---------
 lib/charms/opensearch/v0/opensearch_config.py |  5 +-
 src/charm.py                                  | 74 ++++++++++++-------
 tests/unit/test_charm.py                      |  9 +--
 tests/unit/test_helper_cluster.py             | 36 ++++-----
 tests/unit/test_opensearch_config.py          |  3 +-
 tox.ini                                       |  2 +-
 7 files changed, 98 insertions(+), 89 deletions(-)

diff --git a/lib/charms/opensearch/v0/helper_cluster.py b/lib/charms/opensearch/v0/helper_cluster.py
index 71c4b84af..4adf36a2c 100644
--- a/lib/charms/opensearch/v0/helper_cluster.py
+++ b/lib/charms/opensearch/v0/helper_cluster.py
@@ -29,53 +29,41 @@ def __init__(self, name: str, roles: List[str], ip: str):
 
 
 class ClusterTopology:
-    """Class for creating the best possible configuration for a Node.
-
-    The current logic is to try to get to the config:
-        - 2 dedicated cluster manager nodes
-        - 1 voting only data node
-    And create them with the following order:
-        - cm0
-        - data, voting only
-        - cm1
-        - data
-        - data
-    """
+    """Class for creating the best possible configuration for a Node."""
 
     @staticmethod
-    def suggest_roles(nodes: List[Node]) -> List[str]:
-        """Get roles for a Node, for now, we only focus on the 3 most important roles.
+    def suggest_roles(nodes: List[Node], planned_units: int) -> List[str]:
+        """Get roles for a Node.
 
-        We will do more interesting things with the nodes list, to find the best role.
-        The logic here is:
-            - the first node should be a CM-eligible node, but will add "data" to it
-            - the second node should be a data and voting-only node, so that:
-                - a 2 nodes cluster can function
-                - when a 2CM-eligible node joins, the CM voting can happen immediately
-            - the 3rd one should be a CM-eligible, but will add "data" to it
-            - the +4 nodes should be data etc.
+        For now, we don't allow the end-user to control roles. The logic here is:
+          - Half of the nodes should be CM-eligible.
+          - All other nodes should not participate in voting, to speed up voting.
         """
         nodes_by_roles = ClusterTopology.nodes_count_by_role(nodes)
-        if nodes_by_roles.get("cluster_manager", 0) == 0:
-            return ["cluster_manager", "data"]
 
-        if nodes_by_roles.get("voting_only", 0) == 0:
-            return ["voting_only", "data"]
+        max_managers = planned_units
+        max_voters = planned_units
+        if planned_units % 2 == 0:
+            max_managers -= 1
+            max_voters -= 1
 
-        if nodes_by_roles["cluster_manager"] == 1:
-            return ["cluster_manager", "data"]
+        if max_managers > 3:
+            # for a cluster of more than 3 nodes, make roughly half of the nodes CM-eligible
+            max_managers = max_managers // 2 + 1
 
-        return ["data"]
+        base_roles = ["data", "ingest", "ml", "coordinating_only"]
 
-    @staticmethod
-    def remaining_nodes_for_bootstrap(nodes: List[Node]) -> int:
-        """Check if cluster is bootstrapped. 2 cm + 1 voting only nodes created."""
-        nodes_count = ClusterTopology.nodes_count_by_role(nodes)
+        if (
+            nodes_by_roles.get("cluster_manager", 0) + nodes_by_roles.get("voting_only", 0)
+            >= max_voters
+        ):
+            return base_roles
 
-        cms = 2 - nodes_count.get("cluster_manager", 0)
-        voting_only = 1 - nodes_count.get("voting_only", 0)
+        if nodes_by_roles.get("cluster_manager", 0) >= max_managers:
+            return base_roles + ["voting_only"]
 
-        return cms + voting_only
+        return base_roles + ["cluster_manager"]
 
     @staticmethod
     def get_cluster_managers_ips(nodes: List[Node]) -> List[str]:
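
Illustration of the new role-suggestion behaviour: the driver below replays
suggest_roles() the way a scaling deployment would, one joining unit at a time.
It is illustrative only (not part of the changed files) and assumes nothing
beyond the helpers shown in the hunk above:

    from charms.opensearch.v0.helper_cluster import ClusterTopology, Node

    planned_units = 5
    nodes = []
    for i in range(planned_units):
        # each joining unit picks its roles based on the nodes configured so far
        roles = ClusterTopology.suggest_roles(nodes, planned_units)
        nodes.append(Node(f"node-{i}", roles, f"10.0.0.{i + 1}"))

    for node in nodes:
        print(node.name, sorted(node.roles))

With 5 planned units, max_managers works out to 5 // 2 + 1 = 3: the first three
units become cluster_manager and the remaining two voting_only, all of them also
carrying the base data/ingest/ml/coordinating_only roles. With 4 planned units,
both caps drop to 3 so that the voting configuration stays odd.
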
diff --git a/lib/charms/opensearch/v0/opensearch_config.py b/lib/charms/opensearch/v0/opensearch_config.py
index 2939f2f53..a2f01729e 100644
--- a/lib/charms/opensearch/v0/opensearch_config.py
+++ b/lib/charms/opensearch/v0/opensearch_config.py
@@ -121,6 +121,7 @@ def set_node(
         roles: List[str],
         cm_names: List[str],
         cm_ips: List[str],
+        contribute_to_bootstrap: bool,
     ) -> None:
         """Set base config for each node in the cluster."""
         self._opensearch.config.put(self.CONFIG_YML, "cluster.name", f"{app_name}-{model_name}")
@@ -134,7 +135,7 @@ def set_node(
         if len(cm_ips) > 0:
             self._opensearch.config.put(self.CONFIG_YML, "discovery.seed_hosts", cm_ips)
 
-        if "cluster_manager" in roles and len(cm_ips) < 2:  # cluster NOT bootstrapped yet
+        if "cluster_manager" in roles and contribute_to_bootstrap:  # cluster NOT bootstrapped yet
             self._opensearch.config.put(
                 self.CONFIG_YML, "cluster.initial_cluster_manager_nodes", cm_names
             )
@@ -152,6 +153,6 @@ def set_node(
             self.CONFIG_YML, "plugins.security.ssl.transport.enforce_hostname_verification", True
         )
 
-    def cleanup_conf_if_bootstrapped(self):
+    def cleanup_bootstrap_conf(self):
         """Remove some conf entries when the cluster is bootstrapped."""
         self._opensearch.config.delete(self.CONFIG_YML, "cluster.initial_cluster_manager_nodes")
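
To make the new contribute_to_bootstrap flag concrete, this is roughly what a
contributing CM node's opensearch.yml holds after set_node() (an abridged
sketch: the keys are the ones written above, the list values come from the unit
test further down, and the cluster name is a made-up placeholder):

    cluster.name: opensearch-dev
    discovery.seed_hosts: ["10.10.10.10"]
    # written only when the node is CM-eligible and contribute_to_bootstrap
    # is True; deleted again by cleanup_bootstrap_conf() once the cluster
    # has formed:
    cluster.initial_cluster_manager_nodes: ["cm1"]

The behavioural change is that the bootstrap-only key is now driven by the
charm's explicit bookkeeping of bootstrap contributors instead of the previous
len(cm_ips) < 2 heuristic.
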
diff --git a/src/charm.py b/src/charm.py
index b393a768e..c3645954c 100755
--- a/src/charm.py
+++ b/src/charm.py
@@ -111,10 +111,16 @@ def _on_leader_elected(self, _: LeaderElectedEvent):
 
     def _on_start(self, event: StartEvent):
         """Triggered when on start. Set the right node role."""
-        if self.opensearch.is_started() and self.app_peers_data.get("security_index_initialised"):
-            # in the case where it was on WaitingToStart status, event got deferred
-            # and the service started in between, put status back to active
-            self.clear_status(WaitingToStart)
+        if self.opensearch.is_started():
+            if self.app_peers_data.get("security_index_initialised"):
+                # in the case where it was on WaitingToStart status, event got deferred
+                # and the service started in between, put status back to active
+                self.clear_status(WaitingToStart)
+
+            # clean up the bootstrap conf on this node, if any
+            if self.unit_peers_data.get("bootstrap_contributor"):
+                self._cleanup_bootstrap_conf_if_applies()
+
             return
 
         if not self._is_tls_fully_configured():
@@ -125,19 +131,6 @@ def _on_start(self, event: StartEvent):
         # configure clients auth
         self.opensearch_config.set_client_auth()
 
-        try:
-            # Retrieve the nodes of the cluster, needed to configure this node
-            nodes = self._get_nodes()
-        except OpenSearchHttpError:
-            event.defer()
-            return
-
-        # Set the configuration of the node
-        self._set_node_conf(nodes)
-
-        # Remove some config entries when cluster bootstrapped
-        self._cleanup_conf_if_bootstrapped(nodes)
-
         # request the start of opensearch
         self.unit.status = WaitingStatus(RequestUnitServiceOps.format("start"))
         self.on[self.service_manager.name].acquire_lock.emit(callback_override="_start_opensearch")
@@ -170,6 +163,14 @@ def _on_peer_relation_changed(self, event: RelationChangedEvent):
             if exclusions_to_remove:
                 self.append_allocation_exclusion_to_remove(exclusions_to_remove)
 
+            if data.get("bootstrap_contributor"):
+                contributor_count = int(
+                    self.app_peers_data.get("bootstrap_contributors_count", 0)
+                )
+                self.app_peers_data["bootstrap_contributors_count"] = str(
+                    contributor_count + 1
+                )
+
         # Restart node when cert renewal for the transport layer
         if self.unit_peers_data.get("must_reboot_node") == "True":
             try:
@@ -314,6 +315,16 @@ def _start_opensearch(self, event: EventBase) -> None:
             event.defer()
             return
 
+        try:
+            # Retrieve the nodes of the cluster, needed to configure this node
+            nodes = self._get_nodes()
+        except OpenSearchHttpError:
+            event.defer()
+            return
+
+        # Set the configuration of the node
+        self._set_node_conf(nodes)
+
         try:
             self.unit.status = BlockedStatus(WaitingToStart)
             self.opensearch.start()
@@ -339,6 +350,10 @@ def _start_opensearch(self, event: EventBase) -> None:
                 self.app_peers_data["remove_from_allocation_exclusions"]
             )
 
+        # clean up the bootstrap conf on this node
+        if self.unit_peers_data.get("bootstrap_contributor"):
+            self._cleanup_bootstrap_conf_if_applies()
+
     def _stop_opensearch(self, event: EventBase) -> None:
         """Stop OpenSearch if possible."""
         try:
@@ -490,14 +505,26 @@ def fetch() -> List[Node]:
 
     def _set_node_conf(self, nodes: List[Node]) -> None:
         """Set the configuration of the current node / unit."""
-        roles = ClusterTopology.suggest_roles(nodes)
+        roles = ClusterTopology.suggest_roles(nodes, self.app.planned_units())
 
         cm_names = ClusterTopology.get_cluster_managers_names(nodes)
         cm_ips = ClusterTopology.get_cluster_managers_ips(nodes)
+
+        contribute_to_bootstrap = False
         if "cluster_manager" in roles:
             cm_names.append(self.unit_name)
             cm_ips.append(self.unit_ip)
 
+            cms_in_bootstrap = int(self.app_peers_data.get("bootstrap_contributors_count", 0))
+            if cms_in_bootstrap < self.app.planned_units():
+                contribute_to_bootstrap = True
+
+                if self.unit.is_leader():
+                    self.app_peers_data["bootstrap_contributors_count"] = f"{cms_in_bootstrap + 1}"
+
+                # indicates that this unit is part of the "initial cm nodes"
+                self.unit_peers_data["bootstrap_contributor"] = "True"
+
         self.opensearch_config.set_node(
             self.app.name,
             self.model.name,
@@ -505,15 +532,12 @@ def _set_node_conf(self, nodes: List[Node]) -> None:
             roles,
             cm_names,
             cm_ips,
+            contribute_to_bootstrap,
         )
 
-    def _cleanup_conf_if_bootstrapped(self, nodes: List[Node]) -> None:
-        """Remove some conf props when cluster is bootstrapped."""
-        remaining_nodes_for_bootstrap = ClusterTopology.remaining_nodes_for_bootstrap(nodes)
-        if remaining_nodes_for_bootstrap == 0:
-            # this condition means that we just added the last required CM node
-            # the cluster is bootstrapped now, we need to clean up the conf on the CM nodes
-            self.opensearch_config.cleanup_conf_if_bootstrapped()
+    def _cleanup_bootstrap_conf_if_applies(self) -> None:
+        """Remove bootstrap conf props on the CM nodes that contributed to the bootstrapping."""
+        self.opensearch_config.cleanup_bootstrap_conf()
 
 
 if __name__ == "__main__":
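
The bootstrap accounting in _set_node_conf() above is easier to review in
isolation. A minimal sketch of the decision it makes, with the two peer-databag
keys pulled out (a hypothetical standalone rewrite, not code from this patch):

    from typing import Dict, List

    def should_contribute_to_bootstrap(
        roles: List[str], app_peers_data: Dict[str, str], planned_units: int
    ) -> bool:
        """A CM-eligible unit contributes to the bootstrap configuration as
        long as fewer than planned_units contributors have been recorded."""
        if "cluster_manager" not in roles:
            return False
        contributors = int(app_peers_data.get("bootstrap_contributors_count", 0))
        return contributors < planned_units

Note the split responsibility around the counter: the leader increments
bootstrap_contributors_count directly in _set_node_conf(), while a non-leader
unit only marks bootstrap_contributor in its own unit databag and relies on the
leader to bump the app-level counter in _on_peer_relation_changed().
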
diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py
index e192ad62a..feb44a78b 100644
--- a/tests/unit/test_charm.py
+++ b/tests/unit/test_charm.py
@@ -72,7 +72,6 @@ def test_on_leader_elected_index_initialised(self, _initialize_admin_user):
     @patch("charms.opensearch.v0.opensearch_config.OpenSearchConfig.set_client_auth")
     @patch("charm.OpenSearchOperatorCharm._get_nodes")
     @patch("charm.OpenSearchOperatorCharm._set_node_conf")
-    @patch("charm.OpenSearchOperatorCharm._cleanup_conf_if_bootstrapped")
     @patch("charm.OpenSearchOperatorCharm._can_service_start")
     @patch("opensearch.OpenSearchTarball.start")
     @patch("charm.OpenSearchOperatorCharm._initialize_security_index")
@@ -83,7 +82,6 @@ def test_on_start(
         _initialize_security_index,
         start,
         _can_service_start,
-        _cleanup_conf_if_bootstrapped,
         _set_node_conf,
         _get_nodes,
         set_client_auth,
@@ -114,9 +112,8 @@ def test_on_start(
         _get_nodes.side_effect = None
         _can_service_start.return_value = False
         self.charm.on.start.emit()
-        _get_nodes.assert_called()
-        _set_node_conf.assert_called_once()
-        _cleanup_conf_if_bootstrapped.assert_called_once()
+        _get_nodes.assert_not_called()
+        _set_node_conf.assert_not_called()
         _initialize_security_index.assert_not_called()
 
         # initialisation of the security index
@@ -124,6 +121,8 @@ def test_on_start(
         _can_service_start.return_value = True
         self.harness.set_leader(True)
         self.charm.on.start.emit()
+        _get_nodes.assert_called()
+        _set_node_conf.assert_called()
         start.assert_called_once()
         self.assertEqual(self.charm.app_peers_data["security_index_initialised"], "True")
         _initialize_security_index.assert_called_once()
diff --git a/tests/unit/test_helper_cluster.py b/tests/unit/test_helper_cluster.py
index 7aaf65989..e31b63891 100644
--- a/tests/unit/test_helper_cluster.py
+++ b/tests/unit/test_helper_cluster.py
@@ -22,30 +22,23 @@ def setUp(self, _create_directories) -> None:
 
         self.opensearch = self.charm.opensearch
 
+        self.base_roles = ["data", "ingest", "ml", "coordinating_only"]
+        self.voting_only_roles = self.base_roles + ["voting_only"]
+        self.cm_roles = self.base_roles + ["cluster_manager"]
+
         self.nodes_0 = []
-        self.nodes_1 = [Node("cm1", ["cluster_manager", "data"], "2.2.2.2")]
-        self.nodes_2 = self.nodes_1 + [Node("data1", ["voting_only", "data"], "2.2.2.3")]
-        self.nodes_3 = self.nodes_2 + [Node("cm2", ["cluster_manager", "data"], "2.2.2.4")]
-        self.nodes_4 = self.nodes_3 + [Node("data2", ["data"], "2.2.2.5")]
+        self.nodes_1 = [Node("cm1", self.cm_roles, "2.2.2.2")]
+        self.nodes_2 = self.nodes_1 + [Node("voting1", self.voting_only_roles, "2.2.2.3")]
+        self.nodes_3 = self.nodes_2 + [Node("cm2", self.cm_roles, "2.2.2.4")]
+        self.nodes_4 = self.nodes_3 + [Node("data2", self.base_roles, "2.2.2.5")]
+        self.nodes_5 = self.nodes_4 + [Node("cm3", self.cm_roles, "2.2.2.6")]
 
     def test_topology_roles_suggestion(self):
         """Test the suggestion of roles for a new node."""
-        self.assertCountEqual(
-            ClusterTopology.suggest_roles(self.nodes_0), ["cluster_manager", "data"]
-        )
-        self.assertCountEqual(ClusterTopology.suggest_roles(self.nodes_1), ["voting_only", "data"])
-        self.assertCountEqual(
-            ClusterTopology.suggest_roles(self.nodes_2), ["cluster_manager", "data"]
-        )
-        self.assertCountEqual(ClusterTopology.suggest_roles(self.nodes_3), ["data"])
-
-    def test_topology_remaining_nodes_for_bootstrap(self):
-        """Test if cluster is bootstrapped."""
-        self.assertTrue(ClusterTopology.remaining_nodes_for_bootstrap(self.nodes_0) == 3)
-        self.assertTrue(ClusterTopology.remaining_nodes_for_bootstrap(self.nodes_1) == 2)
-        self.assertTrue(ClusterTopology.remaining_nodes_for_bootstrap(self.nodes_2) == 1)
-        self.assertTrue(ClusterTopology.remaining_nodes_for_bootstrap(self.nodes_3) == 0)
-        self.assertTrue(ClusterTopology.remaining_nodes_for_bootstrap(self.nodes_4) == 0)
+        self.assertCountEqual(ClusterTopology.suggest_roles(self.nodes_0, 2), self.cm_roles)
+        self.assertCountEqual(ClusterTopology.suggest_roles(self.nodes_1, 2), self.base_roles)
+        self.assertCountEqual(ClusterTopology.suggest_roles(self.nodes_2, 3), self.cm_roles)
+        self.assertCountEqual(ClusterTopology.suggest_roles(self.nodes_3, 4), self.base_roles)
 
     def test_topology_get_cluster_managers_ips(self):
         """Test correct retrieval of cm ips from a list of nodes."""
@@ -67,6 +60,9 @@ def test_topology_nodes_count_by_role(self):
                 "cluster_manager": 2,
                 "voting_only": 1,
                 "data": 4,
+                "ml": 4,
+                "coordinating_only": 4,
+                "ingest": 4,
             },
         )
diff --git a/tests/unit/test_opensearch_config.py b/tests/unit/test_opensearch_config.py
index 152c2de4a..1530be8bb 100644
--- a/tests/unit/test_opensearch_config.py
+++ b/tests/unit/test_opensearch_config.py
@@ -145,6 +145,7 @@ def test_set_node_and_cleanup_if_bootstrapped(self):
             ["cluster_manager", "data"],
             ["cm1"],
             ["10.10.10.10"],
+            True,
         )
         opensearch_conf = self.yaml_conf_setter.load(self.opensearch_yml)
         self.assertEqual(
@@ -164,7 +165,7 @@ def test_set_node_and_cleanup_if_bootstrapped(self):
         )
 
         # test cleanup_conf_if_bootstrapped
-        self.opensearch_config.cleanup_conf_if_bootstrapped()
+        self.opensearch_config.cleanup_bootstrap_conf()
         opensearch_conf = self.yaml_conf_setter.load(self.opensearch_yml)
         self.assertNotIn("cluster.initial_cluster_manager_nodes", opensearch_conf)
diff --git a/tox.ini b/tox.ini
index d26c203ae..65e3a16e7 100644
--- a/tox.ini
+++ b/tox.ini
@@ -46,7 +46,7 @@ deps =
 commands =
     # uncomment the following line if this charm owns a lib
     codespell {[vars]lib_path}
-    codespell {toxinidir}/. --skip {toxinidir}/.git --skip {toxinidir}/.tox \
+    codespell {toxinidir} --skip {toxinidir}/.git --skip {toxinidir}/.tox \
      --skip {toxinidir}/build --skip {toxinidir}/lib --skip {toxinidir}/venv \
      --skip {toxinidir}/.mypy_cache --skip {toxinidir}/icon.svg
     # pflake8 wrapper supports config from pyproject.toml