From 15e30f6970f01ec473667f99b075649f3cdf9671 Mon Sep 17 00:00:00 2001 From: phvalguima Date: Thu, 8 Aug 2024 20:29:48 +0200 Subject: [PATCH 1/2] [DPE-4575] spinoff - Lock changes This PR separates the lock-related changes from the DPE-4575. --- lib/charms/opensearch/v0/opensearch_base_charm.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/lib/charms/opensearch/v0/opensearch_base_charm.py b/lib/charms/opensearch/v0/opensearch_base_charm.py index b51fd5b7f..97b05352c 100644 --- a/lib/charms/opensearch/v0/opensearch_base_charm.py +++ b/lib/charms/opensearch/v0/opensearch_base_charm.py @@ -471,10 +471,15 @@ def _on_opensearch_data_storage_detaching(self, _: StorageDetachingEvent): # no logger.warning( "Removing units during an upgrade is not supported. The charm may be in a broken, unrecoverable state" ) - # acquire lock to ensure only 1 unit removed at a time - if not self.node_lock.acquired: - # Raise uncaught exception to prevent Juju from removing unit - raise Exception("Unable to acquire lock: Another unit is starting or stopping.") + + for attempt in Retrying(stop=stop_after_attempt(30), wait=wait_fixed(2), reraise=True): + with attempt: + # acquire lock to ensure only 1 unit removed at a time + if not self.node_lock.acquired: + # Raise uncaught exception to prevent Juju from removing unit + raise Exception( + "Unable to acquire lock: Another unit is starting or stopping." + ) # if the leader is departing, and this hook fails "leader elected" won"t trigger, # so we want to re-balance the node roles from here From 0767550426fe6b4e72e999284b558676f59416d0 Mon Sep 17 00:00:00 2001 From: phvalguima Date: Thu, 8 Aug 2024 20:34:37 +0200 Subject: [PATCH 2/2] Update opensearch_locking.py --- .../opensearch/v0/opensearch_locking.py | 31 ++++++++++++++++--- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/lib/charms/opensearch/v0/opensearch_locking.py b/lib/charms/opensearch/v0/opensearch_locking.py index fd50b5a64..de4ee414d 100644 --- a/lib/charms/opensearch/v0/opensearch_locking.py +++ b/lib/charms/opensearch/v0/opensearch_locking.py @@ -254,9 +254,27 @@ def acquired(self) -> bool: # noqa: C901 self._opensearch, use_localhost=host is not None, hosts=alt_hosts ) ) + try: + logger.debug( + "Current shard allocation status: %s", + self._opensearch.request( + "GET", + "/_cluster/allocation/explain?include_yes_decisions=true&include_disk_info=true", + payload={ + "index": self.OPENSEARCH_INDEX, + "shard": 0, + "primary": "true", + }, + ), + ) + except Exception: + logger.debug("Current shard allocation status: error to connect with API") + pass except OpenSearchHttpError: logger.exception("Error getting OpenSearch nodes") - return False + # If we are trying to acquire the lock at application removal, this condition + # will be eventually hit + return len(self.units) <= 1 logger.debug(f"[Node lock] Opensearch {online_nodes=}") assert online_nodes > 0 try: @@ -266,10 +284,10 @@ def acquired(self) -> bool: # noqa: C901 # if the node lock cannot be acquired, fall back to peer databag lock # this avoids hitting deadlock situations in cases where # the .charm_node_lock index is not available - if online_nodes <= 1: + if online_nodes <= 1 and self._charm.app.planned_units() > 1: return self._peer.acquired else: - return False + return self._charm.app.planned_units() > 1 # If online_nodes == 1, we should acquire the lock via the peer databag. # If we acquired the lock via OpenSearch and this unit was stopping, we would be unable # to release the OpenSearch lock. For example, when scaling to 0. @@ -302,10 +320,13 @@ def acquired(self) -> bool: # noqa: C901 "to acquire lock" ) return False - else: + elif self._charm.app.planned_units() > 1: logger.exception("Error creating OpenSearch lock document") # in this case, try to acquire peer databag lock as fallback return self._peer.acquired + else: + # There is only one unit or less + return True else: # Ensure write was successful on all nodes # "It is important to note that this setting [`wait_for_active_shards`] greatly @@ -359,7 +380,7 @@ def acquired(self) -> bool: # noqa: C901 # If return value is True: # - Lock granted in previous Juju event # - OR, unit is leader & lock granted in this Juju event - return self._peer.acquired + return self._peer.acquired if self._charm.app.planned_units() > 1 else True def release(self): """Release lock.