Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[DPE-5125] Remove peer relation lock #392

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 9 additions & 4 deletions lib/charms/opensearch/v0/opensearch_base_charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -471,10 +471,15 @@ def _on_opensearch_data_storage_detaching(self, _: StorageDetachingEvent): # no
logger.warning(
"Removing units during an upgrade is not supported. The charm may be in a broken, unrecoverable state"
)
# acquire lock to ensure only 1 unit removed at a time
if not self.node_lock.acquired:
# Raise uncaught exception to prevent Juju from removing unit
raise Exception("Unable to acquire lock: Another unit is starting or stopping.")

for attempt in Retrying(stop=stop_after_attempt(30), wait=wait_fixed(2), reraise=True):
with attempt:
# acquire lock to ensure only 1 unit removed at a time
if not self.node_lock.acquired:
# Raise uncaught exception to prevent Juju from removing unit
raise Exception(
"Unable to acquire lock: Another unit is starting or stopping."
)

# if the leader is departing, and this hook fails "leader elected" won"t trigger,
# so we want to re-balance the node roles from here
Expand Down
31 changes: 26 additions & 5 deletions lib/charms/opensearch/v0/opensearch_locking.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,9 +254,27 @@ def acquired(self) -> bool: # noqa: C901
self._opensearch, use_localhost=host is not None, hosts=alt_hosts
)
)
try:
logger.debug(
"Current shard allocation status: %s",
self._opensearch.request(
"GET",
"/_cluster/allocation/explain?include_yes_decisions=true&include_disk_info=true",
payload={
"index": self.OPENSEARCH_INDEX,
"shard": 0,
"primary": "true",
},
),
)
except Exception:
logger.debug("Current shard allocation status: error to connect with API")
pass
except OpenSearchHttpError:
logger.exception("Error getting OpenSearch nodes")
return False
# If we are trying to acquire the lock at application removal, this condition
# will be eventually hit
return len(self.units) <= 1
logger.debug(f"[Node lock] Opensearch {online_nodes=}")
assert online_nodes > 0
try:
Expand All @@ -266,10 +284,10 @@ def acquired(self) -> bool: # noqa: C901
# if the node lock cannot be acquired, fall back to peer databag lock
# this avoids hitting deadlock situations in cases where
# the .charm_node_lock index is not available
if online_nodes <= 1:
if online_nodes <= 1 and self._charm.app.planned_units() > 1:
return self._peer.acquired
else:
return False
return self._charm.app.planned_units() > 1
reneradoi marked this conversation as resolved.
Show resolved Hide resolved
# If online_nodes == 1, we should acquire the lock via the peer databag.
# If we acquired the lock via OpenSearch and this unit was stopping, we would be unable
# to release the OpenSearch lock. For example, when scaling to 0.
Expand Down Expand Up @@ -302,10 +320,13 @@ def acquired(self) -> bool: # noqa: C901
"to acquire lock"
)
return False
else:
elif self._charm.app.planned_units() > 1:
logger.exception("Error creating OpenSearch lock document")
# in this case, try to acquire peer databag lock as fallback
return self._peer.acquired
else:
# There is only one unit or less
return True
else:
# Ensure write was successful on all nodes
# "It is important to note that this setting [`wait_for_active_shards`] greatly
Expand Down Expand Up @@ -359,7 +380,7 @@ def acquired(self) -> bool: # noqa: C901
# If return value is True:
# - Lock granted in previous Juju event
# - OR, unit is leader & lock granted in this Juju event
return self._peer.acquired
return self._peer.acquired if self._charm.app.planned_units() > 1 else True

def release(self):
"""Release lock.
Expand Down
Loading