[DPE-1033] Role management base implementation (#21) (#22)
Role management base implementation
Mehdi-Bendriss authored Dec 10, 2022
1 parent 6f133ba commit cc985cf
Showing 7 changed files with 98 additions and 89 deletions.
58 changes: 23 additions & 35 deletions lib/charms/opensearch/v0/helper_cluster.py
@@ -29,53 +29,41 @@ def __init__(self, name: str, roles: List[str], ip: str):


class ClusterTopology:
"""Class for creating the best possible configuration for a Node.
The current logic is to try to get to the config:
- 2 dedicated cluster manager nodes
- 1 voting only data node
And create them with the following order:
- cm0
- data, voting only
- cm1
- data
- data
"""
"""Class for creating the best possible configuration for a Node."""

@staticmethod
def suggest_roles(nodes: List[Node]) -> List[str]:
"""Get roles for a Node, for now, we only focus on the 3 most important roles.
def suggest_roles(nodes: List[Node], planned_units: int) -> List[str]:
"""Get roles for a Node.
We will do more interesting things with the nodes list to find the best role.
For now, we don't allow the end-user to control roles.
The logic here is:
- the first node should be a CM-eligible node, but will add "data" to it
- the second node should be a data and voting-only node, so that:
- a 2 nodes cluster can function
- when a 2CM-eligible node joins, the CM voting can happen immediately
- the 3rd one should be a CM-eligible, but will add "data" to it
- the +4 nodes should be data etc.
- Half of the nodes should be CM-eligible.
- All other nodes should not participate in voting, to keep voting time short.
"""
nodes_by_roles = ClusterTopology.nodes_count_by_role(nodes)
if nodes_by_roles.get("cluster_manager", 0) == 0:
return ["cluster_manager", "data"]

if nodes_by_roles.get("voting_only", 0) == 0:
return ["voting_only", "data"]
max_managers = planned_units
max_voters = planned_units
if planned_units % 2 == 0:
max_managers -= 1
max_voters -= 1

if nodes_by_roles["cluster_manager"] == 1:
return ["cluster_manager", "data"]
if max_managers > 3:
# for a cluster of more than 3 nodes, make roughly half of the nodes CM-eligible
max_managers = max_managers // 2 + 1

return ["data"]
base_roles = ["data", "ingest", "ml", "coordinating_only"]

@staticmethod
def remaining_nodes_for_bootstrap(nodes: List[Node]) -> int:
"""Check if cluster is bootstrapped. 2 cm + 1 voting only nodes created."""
nodes_count = ClusterTopology.nodes_count_by_role(nodes)
if (
nodes_by_roles.get("cluster_manager", 0) + nodes_by_roles.get("voting_only", 0)
>= max_voters
):
return base_roles

cms = 2 - nodes_count.get("cluster_manager", 0)
voting_only = 1 - nodes_count.get("voting_only", 0)
if nodes_by_roles.get("cluster_manager", 0) >= max_managers:
return base_roles + ["voting_only"]

return cms + voting_only
return base_roles + ["cluster_manager"]

@staticmethod
def get_cluster_managers_ips(nodes: List[Node]) -> List[str]:
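To make the new topology logic concrete, here is a minimal, self-contained sketch (not part of this commit) that re-implements the same decision rules as `suggest_roles` and walks a hypothetical 6-unit deployment. `SimpleNode` and the driver loop are illustrative stand-ins for the charm's `Node` helper and Juju's unit ordering.

```python
from typing import Dict, List


class SimpleNode:
    """Illustrative stand-in for the charm's Node helper."""

    def __init__(self, name: str, roles: List[str]):
        self.name = name
        self.roles = roles


def suggest_roles(nodes: List[SimpleNode], planned_units: int) -> List[str]:
    """Re-implementation of the committed logic, for illustration only."""
    # count how many existing nodes hold each role
    counts: Dict[str, int] = {}
    for node in nodes:
        for role in node.roles:
            counts[role] = counts.get(role, 0) + 1

    max_managers = planned_units
    max_voters = planned_units
    if planned_units % 2 == 0:
        max_managers -= 1
        max_voters -= 1

    if max_managers > 3:
        # for larger clusters, only about half of the nodes are CM-eligible
        max_managers = max_managers // 2 + 1

    base_roles = ["data", "ingest", "ml", "coordinating_only"]

    if counts.get("cluster_manager", 0) + counts.get("voting_only", 0) >= max_voters:
        return base_roles
    if counts.get("cluster_manager", 0) >= max_managers:
        return base_roles + ["voting_only"]
    return base_roles + ["cluster_manager"]


# Walk a hypothetical 6-unit deployment: units join one at a time.
nodes: List[SimpleNode] = []
for i in range(6):
    roles = suggest_roles(nodes, planned_units=6)
    print(f"unit-{i}: {roles}")
    nodes.append(SimpleNode(f"unit-{i}", roles))
# Expected: units 0-2 become cluster_manager, units 3-4 become voting_only,
# and unit 5 gets only the base roles.
```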
5 changes: 3 additions & 2 deletions lib/charms/opensearch/v0/opensearch_config.py
@@ -121,6 +121,7 @@ def set_node(
roles: List[str],
cm_names: List[str],
cm_ips: List[str],
contribute_to_bootstrap: bool,
) -> None:
"""Set base config for each node in the cluster."""
self._opensearch.config.put(self.CONFIG_YML, "cluster.name", f"{app_name}-{model_name}")
@@ -134,7 +135,7 @@ def set_node(
if len(cm_ips) > 0:
self._opensearch.config.put(self.CONFIG_YML, "discovery.seed_hosts", cm_ips)

if "cluster_manager" in roles and len(cm_ips) < 2: # cluster NOT bootstrapped yet
if "cluster_manager" in roles and contribute_to_bootstrap: # cluster NOT bootstrapped yet
self._opensearch.config.put(
self.CONFIG_YML, "cluster.initial_cluster_manager_nodes", cm_names
)
@@ -152,6 +153,6 @@ def set_node(
self.CONFIG_YML, "plugins.security.ssl.transport.enforce_hostname_verification", True
)

def cleanup_conf_if_bootstrapped(self):
def cleanup_bootstrap_conf(self):
"""Remove some conf entries when the cluster is bootstrapped."""
self._opensearch.config.delete(self.CONFIG_YML, "cluster.initial_cluster_manager_nodes")
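A rough sketch of what the new `contribute_to_bootstrap` flag is meant to do to the rendered configuration, using a plain dict in place of the charm's YAML config setter (the real `set_node` writes to opensearch.yml through `self._opensearch.config`); only the bootstrap-related keys are modelled, and the helper names here are illustrative, not part of the commit.

```python
from typing import Dict, List


def render_node_conf(
    roles: List[str],
    cm_names: List[str],
    cm_ips: List[str],
    contribute_to_bootstrap: bool,
) -> Dict[str, object]:
    """Mimic the parts of set_node relevant to cluster bootstrapping."""
    conf: Dict[str, object] = {"node.roles": roles}
    if cm_ips:
        conf["discovery.seed_hosts"] = cm_ips
    # only CM-eligible nodes that are part of the initial quorum
    # pin the initial cluster manager list
    if "cluster_manager" in roles and contribute_to_bootstrap:
        conf["cluster.initial_cluster_manager_nodes"] = cm_names
    return conf


def cleanup_bootstrap_conf(conf: Dict[str, object]) -> None:
    """Mimic cleanup_bootstrap_conf: drop the bootstrap-only entry once the cluster formed."""
    conf.pop("cluster.initial_cluster_manager_nodes", None)


conf = render_node_conf(
    roles=["data", "ingest", "ml", "coordinating_only", "cluster_manager"],
    cm_names=["opensearch-0"],
    cm_ips=["10.10.10.10"],
    contribute_to_bootstrap=True,
)
assert "cluster.initial_cluster_manager_nodes" in conf
cleanup_bootstrap_conf(conf)
assert "cluster.initial_cluster_manager_nodes" not in conf
```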
74 changes: 49 additions & 25 deletions src/charm.py
@@ -111,10 +111,16 @@ def _on_leader_elected(self, _: LeaderElectedEvent):

def _on_start(self, event: StartEvent):
"""Triggered when on start. Set the right node role."""
if self.opensearch.is_started() and self.app_peers_data.get("security_index_initialised"):
# in the case where it was on WaitingToStart status, event got deferred
# and the service started in between, put status back to active
self.clear_status(WaitingToStart)
if self.opensearch.is_started():
if self.app_peers_data.get("security_index_initialised"):
# in the case where it was on WaitingToStart status, event got deferred
# and the service started in between, put status back to active
self.clear_status(WaitingToStart)

# clean up bootstrap conf on the node if present
if self.unit_peers_data.get("bootstrap_contributor"):
self._cleanup_bootstrap_conf_if_applies()

return

if not self._is_tls_fully_configured():
@@ -125,19 +131,6 @@ def _on_start(self, event: StartEvent):
# configure clients auth
self.opensearch_config.set_client_auth()

try:
# Retrieve the nodes of the cluster, needed to configure this node
nodes = self._get_nodes()
except OpenSearchHttpError:
event.defer()
return

# Set the configuration of the node
self._set_node_conf(nodes)

# Remove some config entries when cluster bootstrapped
self._cleanup_conf_if_bootstrapped(nodes)

# request the start of opensearch
self.unit.status = WaitingStatus(RequestUnitServiceOps.format("start"))
self.on[self.service_manager.name].acquire_lock.emit(callback_override="_start_opensearch")
@@ -170,6 +163,14 @@ def _on_peer_relation_changed(self, event: RelationChangedEvent):
if exclusions_to_remove:
self.append_allocation_exclusion_to_remove(exclusions_to_remove)

if data.get("bootstrap_contributor"):
contributor_count = int(
self.app_peers_data.get("bootstrap_contributors_count", 0)
)
self.app_peers_data["bootstrap_contributors_count"] = str(
contributor_count + 1
)

# Restart node when the transport layer cert is renewed
if self.unit_peers_data.get("must_reboot_node") == "True":
try:
@@ -314,6 +315,16 @@ def _start_opensearch(self, event: EventBase) -> None:
event.defer()
return

try:
# Retrieve the nodes of the cluster, needed to configure this node
nodes = self._get_nodes()
except OpenSearchHttpError:
event.defer()
return

# Set the configuration of the node
self._set_node_conf(nodes)

try:
self.unit.status = BlockedStatus(WaitingToStart)
self.opensearch.start()
@@ -339,6 +350,10 @@ def _start_opensearch(self, event: EventBase) -> None:
self.app_peers_data["remove_from_allocation_exclusions"]
)

# clean up bootstrap conf on the node
if self.unit_peers_data.get("bootstrap_contributor"):
self._cleanup_bootstrap_conf_if_applies()

def _stop_opensearch(self, event: EventBase) -> None:
"""Stop OpenSearch if possible."""
try:
@@ -490,30 +505,39 @@ def fetch() -> List[Node]:

def _set_node_conf(self, nodes: List[Node]) -> None:
"""Set the configuration of the current node / unit."""
roles = ClusterTopology.suggest_roles(nodes)
roles = ClusterTopology.suggest_roles(nodes, self.app.planned_units())

cm_names = ClusterTopology.get_cluster_managers_names(nodes)
cm_ips = ClusterTopology.get_cluster_managers_ips(nodes)

contribute_to_bootstrap = False
if "cluster_manager" in roles:
cm_names.append(self.unit_name)
cm_ips.append(self.unit_ip)

cms_in_bootstrap = int(self.app_peers_data.get("bootstrap_contributors_count", 0))
if cms_in_bootstrap < self.app.planned_units():
contribute_to_bootstrap = True

if self.unit.is_leader():
self.app_peers_data["bootstrap_contributors_count"] = f"{cms_in_bootstrap + 1}"

# indicates that this unit is part of the "initial cm nodes"
self.unit_peers_data["bootstrap_contributor"] = "True"

self.opensearch_config.set_node(
self.app.name,
self.model.name,
self.unit_name,
roles,
cm_names,
cm_ips,
contribute_to_bootstrap,
)

def _cleanup_conf_if_bootstrapped(self, nodes: List[Node]) -> None:
"""Remove some conf props when cluster is bootstrapped."""
remaining_nodes_for_bootstrap = ClusterTopology.remaining_nodes_for_bootstrap(nodes)
if remaining_nodes_for_bootstrap == 0:
# this condition means that we just added the last required CM node
# the cluster is bootstrapped now, we need to clean up the conf on the CM nodes
self.opensearch_config.cleanup_conf_if_bootstrapped()
def _cleanup_bootstrap_conf_if_applies(self) -> None:
"""Remove some conf props in the CM nodes that contributed to the cluster bootstrapping."""
self.opensearch_config.cleanup_bootstrap_conf()


if __name__ == "__main__":
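The bootstrap bookkeeping in this file spans two peer databags: a unit that joins the initial quorum records `bootstrap_contributor` in its unit data, and the leader keeps a `bootstrap_contributors_count` total in the app data (incremented directly when the leader itself contributes, and from `_on_peer_relation_changed` when another unit advertises its contribution). Below is a minimal stand-alone sketch of that flow, with plain dicts standing in for Juju relation data; the helper names are illustrative, not the charm's API.

```python
from typing import Dict

app_data: Dict[str, str] = {}               # stands in for self.app_peers_data
unit_data: Dict[str, Dict[str, str]] = {}   # stands in for each unit's peer databag


def unit_contributes(unit: str, is_leader: bool, planned_units: int) -> bool:
    """Return True if this CM-eligible unit should pin initial_cluster_manager_nodes."""
    contributors = int(app_data.get("bootstrap_contributors_count", 0))
    if contributors >= planned_units:
        return False
    # mark this unit as part of the "initial cm nodes"
    unit_data.setdefault(unit, {})["bootstrap_contributor"] = "True"
    if is_leader:
        # the leader can update the app databag directly
        app_data["bootstrap_contributors_count"] = str(contributors + 1)
    return True


def on_peer_relation_changed(changed_unit: str, observer_is_leader: bool) -> None:
    """Leader-side counterpart: count contributions advertised by other units."""
    if not observer_is_leader:
        return
    if unit_data.get(changed_unit, {}).get("bootstrap_contributor"):
        count = int(app_data.get("bootstrap_contributors_count", 0))
        app_data["bootstrap_contributors_count"] = str(count + 1)


# Leader (unit 0) contributes first, then unit 1 joins and the leader observes it.
unit_contributes("opensearch/0", is_leader=True, planned_units=3)
unit_contributes("opensearch/1", is_leader=False, planned_units=3)
on_peer_relation_changed("opensearch/1", observer_is_leader=True)
print(app_data["bootstrap_contributors_count"])  # "2"
```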
9 changes: 4 additions & 5 deletions tests/unit/test_charm.py
@@ -72,7 +72,6 @@ def test_on_leader_elected_index_initialised(self, _initialize_admin_user):
@patch("charms.opensearch.v0.opensearch_config.OpenSearchConfig.set_client_auth")
@patch("charm.OpenSearchOperatorCharm._get_nodes")
@patch("charm.OpenSearchOperatorCharm._set_node_conf")
@patch("charm.OpenSearchOperatorCharm._cleanup_conf_if_bootstrapped")
@patch("charm.OpenSearchOperatorCharm._can_service_start")
@patch("opensearch.OpenSearchTarball.start")
@patch("charm.OpenSearchOperatorCharm._initialize_security_index")
@@ -83,7 +82,6 @@ def test_on_start(
_initialize_security_index,
start,
_can_service_start,
_cleanup_conf_if_bootstrapped,
_set_node_conf,
_get_nodes,
set_client_auth,
@@ -114,16 +112,17 @@ def test_on_start(
_get_nodes.side_effect = None
_can_service_start.return_value = False
self.charm.on.start.emit()
_get_nodes.assert_called()
_set_node_conf.assert_called_once()
_cleanup_conf_if_bootstrapped.assert_called_once()
_get_nodes.assert_not_called()
_set_node_conf.assert_not_called()
_initialize_security_index.assert_not_called()

# initialisation of the security index
del self.charm.app_peers_data["security_index_initialised"]
_can_service_start.return_value = True
self.harness.set_leader(True)
self.charm.on.start.emit()
_get_nodes.assert_called()
_set_node_conf.assert_called()
start.assert_called_once()
self.assertEqual(self.charm.app_peers_data["security_index_initialised"], "True")
_initialize_security_index.assert_called_once()
36 changes: 16 additions & 20 deletions tests/unit/test_helper_cluster.py
@@ -22,30 +22,23 @@ def setUp(self, _create_directories) -> None:

self.opensearch = self.charm.opensearch

self.base_roles = ["data", "ingest", "ml", "coordinating_only"]
self.voting_only_roles = self.base_roles + ["voting_only"]
self.cm_roles = self.base_roles + ["cluster_manager"]

self.nodes_0 = []
self.nodes_1 = [Node("cm1", ["cluster_manager", "data"], "2.2.2.2")]
self.nodes_2 = self.nodes_1 + [Node("data1", ["voting_only", "data"], "2.2.2.3")]
self.nodes_3 = self.nodes_2 + [Node("cm2", ["cluster_manager", "data"], "2.2.2.4")]
self.nodes_4 = self.nodes_3 + [Node("data2", ["data"], "2.2.2.5")]
self.nodes_1 = [Node("cm1", self.cm_roles, "2.2.2.2")]
self.nodes_2 = self.nodes_1 + [Node("voting1", self.voting_only_roles, "2.2.2.3")]
self.nodes_3 = self.nodes_2 + [Node("cm2", self.cm_roles, "2.2.2.4")]
self.nodes_4 = self.nodes_3 + [Node("data2", self.base_roles, "2.2.2.5")]
self.nodes_5 = self.nodes_4 + [Node("cm3", self.cm_roles, "2.2.2.6")]

def test_topology_roles_suggestion(self):
"""Test the suggestion of roles for a new node."""
self.assertCountEqual(
ClusterTopology.suggest_roles(self.nodes_0), ["cluster_manager", "data"]
)
self.assertCountEqual(ClusterTopology.suggest_roles(self.nodes_1), ["voting_only", "data"])
self.assertCountEqual(
ClusterTopology.suggest_roles(self.nodes_2), ["cluster_manager", "data"]
)
self.assertCountEqual(ClusterTopology.suggest_roles(self.nodes_3), ["data"])

def test_topology_remaining_nodes_for_bootstrap(self):
"""Test if cluster is bootstrapped."""
self.assertTrue(ClusterTopology.remaining_nodes_for_bootstrap(self.nodes_0) == 3)
self.assertTrue(ClusterTopology.remaining_nodes_for_bootstrap(self.nodes_1) == 2)
self.assertTrue(ClusterTopology.remaining_nodes_for_bootstrap(self.nodes_2) == 1)
self.assertTrue(ClusterTopology.remaining_nodes_for_bootstrap(self.nodes_3) == 0)
self.assertTrue(ClusterTopology.remaining_nodes_for_bootstrap(self.nodes_4) == 0)
self.assertCountEqual(ClusterTopology.suggest_roles(self.nodes_0, 2), self.cm_roles)
self.assertCountEqual(ClusterTopology.suggest_roles(self.nodes_1, 2), self.base_roles)
self.assertCountEqual(ClusterTopology.suggest_roles(self.nodes_2, 3), self.cm_roles)
self.assertCountEqual(ClusterTopology.suggest_roles(self.nodes_3, 4), self.base_roles)

def test_topology_get_cluster_managers_ips(self):
"""Test correct retrieval of cm ips from a list of nodes."""
Expand All @@ -67,6 +60,9 @@ def test_topology_nodes_count_by_role(self):
"cluster_manager": 2,
"voting_only": 1,
"data": 4,
"ml": 4,
"coordinating_only": 4,
"ingest": 4,
},
)

3 changes: 2 additions & 1 deletion tests/unit/test_opensearch_config.py
@@ -145,6 +145,7 @@ def test_set_node_and_cleanup_if_bootstrapped(self):
["cluster_manager", "data"],
["cm1"],
["10.10.10.10"],
True,
)
opensearch_conf = self.yaml_conf_setter.load(self.opensearch_yml)
self.assertEqual(
@@ -164,7 +165,7 @@
)

# test cleanup_conf_if_bootstrapped
self.opensearch_config.cleanup_conf_if_bootstrapped()
self.opensearch_config.cleanup_bootstrap_conf()
opensearch_conf = self.yaml_conf_setter.load(self.opensearch_yml)
self.assertNotIn("cluster.initial_cluster_manager_nodes", opensearch_conf)

2 changes: 1 addition & 1 deletion tox.ini
@@ -46,7 +46,7 @@ deps =
commands =
# uncomment the following line if this charm owns a lib
codespell {[vars]lib_path}
codespell {toxinidir}/. --skip {toxinidir}/.git --skip {toxinidir}/.tox \
codespell {toxinidir} --skip {toxinidir}/.git --skip {toxinidir}/.tox \
--skip {toxinidir}/build --skip {toxinidir}/lib --skip {toxinidir}/venv \
--skip {toxinidir}/.mypy_cache --skip {toxinidir}/icon.svg
# pflake8 wrapper supports config from pyproject.toml
