Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Backup] Add error handling to _request #238

Merged
merged 3 commits into from
Apr 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 41 additions & 18 deletions lib/charms/opensearch/v0/opensearch_backups.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,7 @@ def _close_indices_if_needed(self, backup_id: int) -> Set[str]:
OpenSearchHttpError
OpenSearchRestoreIndexClosingError
"""
backup_indices = self._list_backups()[backup_id]["indices"]
backup_indices = self._list_backups().get(backup_id, {}).get("indices", {})
indices_to_close = set()
for index, state in ClusterState.indices(self.charm.opensearch).items():
if (
Expand All @@ -271,7 +271,7 @@ def _close_indices_if_needed(self, backup_id: int) -> Set[str]:

def _restore(self, backup_id: int) -> Dict[str, Any]:
"""Runs the restore and processes the response."""
backup_indices = self._list_backups()[backup_id]["indices"]
backup_indices = self._list_backups().get(backup_id, {}).get("indices", {})
output = self._request(
"POST",
f"_snapshot/{S3_REPOSITORY}/{backup_id}/_restore?wait_for_completion=true",
Expand Down Expand Up @@ -301,6 +301,8 @@ def _is_restore_complete(self) -> bool:
Essentially, check for each index shard: for all type=SNAPSHOT and stage=DONE, return True.
"""
indices_status = self._request("GET", "/_recovery?human")
if not indices_status:
raise OpenSearchRestoreCheckError("_is_restore_complete: failed to get indices status")
for info in indices_status.values():
# Now, check the status of each shard
for shard in info["shards"]:
Expand All @@ -320,13 +322,17 @@ def _is_backup_available_for_restore(self, backup_id: int) -> bool:
except OpenSearchListBackupError:
return False

def _on_restore_backup_action(self, event: ActionEvent) -> None:
def _on_restore_backup_action(self, event: ActionEvent) -> None: # noqa: C901
"""Restores a backup to the current cluster."""
if not self._can_unit_perform_backup(event):
event.fail("Failed: backup service is not configured yet")
return
if not self._is_restore_complete():
event.fail("Failed: previous restore is still in progress")
try:
if not self._is_restore_complete():
event.fail("Failed: previous restore is still in progress")
return
except OpenSearchRestoreCheckError:
event.fail("Failed: error connecting to the cluster")
return
# Now, validate the backup is working
backup_id = str(event.params.get("backup-id"))
Expand Down Expand Up @@ -365,7 +371,13 @@ def _on_restore_backup_action(self, event: ActionEvent) -> None:
event.fail("Failed to restore all the shards")
return

msg = "Restore is complete" if self._is_restore_complete() else "Restore in progress..."
try:
msg = (
"Restore is complete" if self._is_restore_complete() else "Restore in progress..."
)
except OpenSearchRestoreCheckError:
event.fail("Failed: error connecting to the cluster")
return
self.charm.status.clear(RestoreInProgress)
event.set_results(
{"backup-id": backup_id, "status": msg, "closed-indices": str(closed_idx)}
Expand Down Expand Up @@ -430,7 +442,9 @@ def _can_unit_perform_backup(self, event: ActionEvent) -> bool:

def _list_backups(self) -> Dict[int, str]:
"""Returns a mapping of snapshot ids / state."""
response = self._request("GET", f"_snapshot/{S3_REPOSITORY}/_all")
# Call the underlying request method directly instead of self._request, because we
# want an HTTP exception to be raised if the snapshot list cannot be retrieved.
response = self.charm.opensearch.request("GET", f"_snapshot/{S3_REPOSITORY}/_all")
return {
snapshot["snapshot"]: {
"state": snapshot["state"],
Expand Down Expand Up @@ -541,6 +555,7 @@ def apply_api_config_if_needed(self) -> None:
if state != BackupServiceState.SUCCESS:
logger.error(f"Failed to setup backup service with state {state}")
self.charm.status.set(BlockedStatus(BackupSetupFailed), app=True)
self.charm.status.clear(BackupConfigureStart)
return
self.charm.status.clear(BackupSetupFailed, app=True)
self.charm.status.clear(BackupConfigureStart)
Expand Down Expand Up @@ -685,30 +700,33 @@ def can_use_s3_repository(self) -> bool:
return False
return True

def _request(self, *args, **kwargs) -> str:
def _request(self, *args, **kwargs) -> dict[str, Any] | None:
phvalguima marked this conversation as resolved.
Show resolved Hide resolved
"""Returns the output of OpenSearchDistribution.request() or throws an error.

Request method can return one of many: Union[Dict[str, any], List[any], int]
and raise multiple types of errors.

If int is returned, then throws an exception informing the HTTP request failed.
If the request fails, returns the error response body, or None if only a status code is available.

Raises:
- ValueError
- OpenSearchHttpError
"""
if "retries" not in kwargs.keys():
kwargs["retries"] = 6
if "timeout" not in kwargs.keys():
kwargs["timeout"] = 10
result = self.charm.opensearch.request(*args, **kwargs)

# If the return is an int type, then there was a request error:
if isinstance(result, int):
raise OpenSearchHttpError(f"Request failed with code {result}")
return result

def get_service_status(self, response: Dict[str, Any]) -> BackupServiceState: # noqa: C901
# We are interested in seeing the entire response
kwargs["resp_status_code"] = False
try:
result = self.charm.opensearch.request(*args, **kwargs)
except OpenSearchHttpError as e:
return e.response_body if e.response_body else None
return result if isinstance(result, dict) else None

def get_service_status( # noqa: C901
self, response: dict[str, Any] | None
) -> BackupServiceState:
"""Returns the response status in a Enum.

Based on:
Expand All @@ -719,6 +737,9 @@ def get_service_status(self, response: Dict[str, Any]) -> BackupServiceState: #
ba78d93acf1da6dae16952d8978de87cb4df2c61/
plugins/repository-s3/src/yamlRestTest/resources/rest-api-spec/test/repository_s3/40_repository_ec2_credentials.yml
"""
if not response:
return BackupServiceState.SNAPSHOT_FAILED_UNKNOWN

try:
if "error" not in response:
return BackupServiceState.SUCCESS
Expand Down Expand Up @@ -759,8 +780,10 @@ def get_service_status(self, response: Dict[str, Any]) -> BackupServiceState: #
# Ensure this is not containing any information about snapshots, return SUCCESS
return self.get_snapshot_status(response)

def get_snapshot_status(self, response: Dict[str, Any]) -> BackupServiceState:
def get_snapshot_status(self, response: Dict[str, Any] | None) -> BackupServiceState:
"""Returns the snapshot status."""
if not response:
return BackupServiceState.SNAPSHOT_FAILED_UNKNOWN
# Now, check snapshot status:
r_str = str(response)
if "IN_PROGRESS" in r_str:
Expand Down
1 change: 1 addition & 0 deletions tests/integration/ha/test_backups.py
Original file line number Diff line number Diff line change
Expand Up @@ -545,6 +545,7 @@ async def test_wrong_s3_credentials(ops_test: OpsTest) -> None:
apps=[app],
apps_statuses=["blocked"],
units_statuses=["active"],
wait_for_exact_units=3,
idle_period=30,
)

Expand Down
Loading