Skip to content

Commit

Permalink
Extend opensearch locking to consider nodes that are gone and remove …
Browse files Browse the repository at this point in the history
…defer
  • Loading branch information
phvalguima committed Feb 12, 2024
1 parent 19df41e commit 060072f
Show file tree
Hide file tree
Showing 7 changed files with 363 additions and 57 deletions.
4 changes: 4 additions & 0 deletions lib/charms/opensearch/v0/constants_charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@
LIBPATCH = 1


SERVICE_MANAGER = "service"


# Blocked statuses
WaitingToStart = "Waiting for OpenSearch to start..."
InstallError = "Could not install OpenSearch."
Expand Down Expand Up @@ -57,6 +60,7 @@
PClusterWrongNodesCountForQuorum = (
"Even number of members in quorum if current unit started. Add or remove 1 unit."
)
LockIsBlockedOnUnit = "Lock in {} is blocked on unit: {}"

# Wait status
RequestUnitServiceOps = "Requesting lock on operation: {}"
Expand Down
29 changes: 20 additions & 9 deletions lib/charms/opensearch/v0/opensearch_base_charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

from charms.grafana_agent.v0.cos_agent import COSAgentProvider
from charms.opensearch.v0.constants_charm import (
SERVICE_MANAGER,
AdminUserInitProgress,
CertsExpirationError,
ClientRelationName,
Expand Down Expand Up @@ -61,7 +62,7 @@
from charms.opensearch.v0.opensearch_locking import (
OpenSearchOpsLock,
OpenSearchRetryLockLaterException,
RollingOpsManagerWithExclusions,
OpenSearchRollingOpsManager,
)
from charms.opensearch.v0.opensearch_nodes_exclusions import (
ALLOCS_TO_DELETE,
Expand Down Expand Up @@ -114,7 +115,6 @@
LIBPATCH = 2


SERVICE_MANAGER = "service"
STORAGE_NAME = "opensearch-data"


Expand Down Expand Up @@ -154,7 +154,7 @@ def __init__(self, *args, distro: Type[OpenSearchDistribution] = None):
self.plugin_manager = OpenSearchPluginManager(self)
self.backup = OpenSearchBackup(self)

self.service_manager = RollingOpsManagerWithExclusions(
self.service_manager = OpenSearchRollingOpsManager(
self, relation=SERVICE_MANAGER, callback=self._restart_opensearch
)
self.user_manager = OpenSearchUserManager(self)
Expand Down Expand Up @@ -646,14 +646,12 @@ def _start_opensearch(self, _) -> None: # noqa: C901

# Retrieve the nodes of the cluster, needed to configure this node
nodes = self._get_nodes(False)
# validate the roles prior to starting
self.opensearch_peer_cm.validate_roles(nodes, on_new_unit=True)

logger.debug("_start_opensearch: _set_node_conf is being called")
# Set the configuration of the node
self._set_node_conf(nodes)

logger.debug("_start_opensearch: roles validated")
logger.debug("_start_opensearch: start service")

self.opensearch.start(
wait_until_http_200=(
Expand Down Expand Up @@ -726,19 +724,32 @@ def _restart_opensearch(self, event: EventBase) -> None:
service_was_stopped = True
logger.debug("Rolling Ops Manager: stop_opensearch called")

# Retrieve the nodes of the cluster, needed to configure this node
nodes = self._get_nodes(False)
# validate the roles prior to starting
# We want to do it only once, as we may start the service, which changes
# the node count, but we retry the _start_opensearch a couple of times
# while the service itself comes up
self.opensearch_peer_cm.validate_roles(nodes, on_new_unit=True)

self._start_opensearch(event)
except OpenSearchProvidedRolesException as e:
logger.error("Restart failed: provided roles are wrong")
self.app.status = BlockedStatus(str(e))
# We do not restart the service.
# We want to review the provided roles first
retry_restart_later = False
except OpenSearchError as e:
# An OpenSearch-specific error occurred (not a Python built-in exception).
# In this case, we want to retry later.
logger.error(f"Rolling Ops Manager: Restarting OpenSearch failed: {e}")
logger.error(f"Restarting OpenSearch failed: {e}")
retry_restart_later = True
finally:
# On any error, bring the service back up if it was stopped during this
# restart attempt and is not active anymore. This avoids losing a node
# merely because the restart did not complete cleanly.
if service_was_stopped and not self.opensearch.is_active():
self.opensearch.start()

finally:
if retry_restart_later:
# Message the lock manager we want to retry this lock later.
raise OpenSearchRetryLockLaterException()
Expand Down
12 changes: 12 additions & 0 deletions lib/charms/opensearch/v0/opensearch_distro.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,15 @@

import requests
import urllib3.exceptions
from charms.opensearch.v0.constants_charm import SERVICE_MANAGER
from charms.opensearch.v0.constants_secrets import ADMIN_PW
from charms.opensearch.v0.helper_cluster import Node
from charms.opensearch.v0.helper_conf_setter import YamlConfigSetter
from charms.opensearch.v0.helper_networking import (
get_host_ip,
is_reachable,
reachable_hosts,
unit_ip,
)
from charms.opensearch.v0.opensearch_exceptions import (
OpenSearchCmdError,
Expand Down Expand Up @@ -161,6 +163,16 @@ def is_node_up(self) -> bool:
except (OpenSearchHttpError, Exception):
return False

def is_remote_node_up(self, unit, relation: str = SERVICE_MANAGER) -> bool:
    """Check whether the OpenSearch node of another unit answers HTTP requests.

    Sends ``GET /_nodes`` to the host returned by
    ``unit_ip(self._charm, unit, relation)`` and treats any HTTP status code
    below 400 as "up".

    Args:
        unit: the unit whose node should be probed.
        relation: name of the relation passed to ``unit_ip`` to look up the
            unit's address; defaults to ``SERVICE_MANAGER``.

    Returns:
        True if the request returned a status code below 400. False if the
        status code was 400 or above, or if anything raised while resolving
        the host or performing the request.
    """
    try:
        resp_code = self.request(
            "GET", "/_nodes", host=unit_ip(self._charm, unit, relation), resp_status_code=True
        )
        return resp_code < 400
    except Exception:
        # Best-effort probe: any failure means the remote node cannot be
        # considered up. ``Exception`` alone is sufficient here; listing
        # OpenSearchHttpError next to it was redundant (presumably it derives
        # from Exception -- confirm in opensearch_exceptions).
        return False

def run_bin(self, bin_script_name: str, args: str = None, stdin: str = None) -> str:
"""Run opensearch provided bin command, relative to OPENSEARCH_BIN.
Expand Down
Loading

0 comments on commit 060072f

Please sign in to comment.