Skip to content

Commit

Permalink
[DPE-4553][DPE-4208][DPE-3970] Extend large deployments to test_plugi…
Browse files Browse the repository at this point in the history
…ns.py && Fix dashboards (#328)

Renames our dashboard to "Charmed OpenSearch". This PR addresses the
following needs in our COS integration for large deployments:
1) **Fixes data node integration with COS**: COSAgentProvider was not
tracking peer-cluster-* events, which means it was not receiving changes
made to the COSUser by the main orchestrator
2) **Adds integration tests for COS**: large-deployment integration tests
for COS were missing; this PR makes `test_plugins` more flexible, so it
can run against both small and large deployments
3) **Manages user passwords**: large deployments will now respond to the
`set-password` action, and the `MAIN_ORCHESTRATOR` propagates the new
password value down to the other apps
4) **Adds Role to Dashboard**: adds "Roles" as a dropdown option to the
Grafana dashboard, allowing each cluster to also be filtered by OpenSearch
"construct" node role.

## Empty Dashboard Fix

Some dashboards are set with `$interval`, which defaults to `1m`
and does not allow the actual calculation to run. This PR fixes it by
using
[__rate_interval](https://grafana.com/blog/2020/09/28/new-in-grafana-7.2-__rate_interval-for-prometheus-rate-queries-that-just-work/)
instead, which is a fixed period of X sample times for any `rate()`
calculation.
  • Loading branch information
phvalguima authored Jun 20, 2024
1 parent 5174eed commit 7b7fc59
Show file tree
Hide file tree
Showing 6 changed files with 392 additions and 143 deletions.
11 changes: 11 additions & 0 deletions lib/charms/opensearch/v0/constants_charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,3 +112,14 @@

# User-face Backup ID format
OPENSEARCH_BACKUP_ID_FORMAT = "%Y-%m-%dT%H:%M:%SZ"

# Node roles that may be attached as label values (tags) to the Prometheus data.
# The charm's Prometheus label builder reports any role that is not listed here
# as "unrecognized", so membership in this list is what matters, not the order.
# NOTE(review): "cluster_manager_elegible" looks like a misspelling of
# "eligible" -- confirm against whatever produces the role names (and against
# the dashboards that filter on them) before changing the string.
COS_TAGGABLE_ROLES = [
"data",
"cluster_manager",
"voting",
"coordinating_only",
"ingest",
"cluster_manager_elegible",
"ml",
]
60 changes: 50 additions & 10 deletions lib/charms/opensearch/v0/opensearch_base_charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

from charms.grafana_agent.v0.cos_agent import COSAgentProvider
from charms.opensearch.v0.constants_charm import (
COS_TAGGABLE_ROLES,
AdminUser,
AdminUserInitProgress,
AdminUserNotConfigured,
Expand All @@ -24,6 +25,7 @@
COSUser,
OpenSearchSystemUsers,
OpenSearchUsers,
PeerClusterRelationName,
PeerRelationName,
PluginConfigChangeError,
PluginConfigCheck,
Expand Down Expand Up @@ -200,15 +202,6 @@ def __init__(self, *args, distro: Type[OpenSearchDistribution] = None):
self.status = Status(self)
self.health = OpenSearchHealth(self)
self.node_lock = OpenSearchNodeLock(self)
self.cos_integration = COSAgentProvider(
self,
relation_name=COSRelationName,
metrics_endpoints=[],
scrape_configs=self._scrape_config,
refresh_events=[self.on.set_password_action, self.on.secret_changed],
metrics_rules_dir="./src/alert_rules/prometheus",
log_slots=["opensearch:logs"],
)

self.plugin_manager = OpenSearchPluginManager(self)
self.backup = backup(self)
Expand Down Expand Up @@ -246,6 +239,20 @@ def __init__(self, *args, distro: Type[OpenSearchDistribution] = None):
self.framework.observe(self.on.set_password_action, self._on_set_password_action)
self.framework.observe(self.on.get_password_action, self._on_get_password_action)

self.cos_integration = COSAgentProvider(
self,
relation_name=COSRelationName,
metrics_endpoints=[],
scrape_configs=self._scrape_config,
refresh_events=[
self.on.set_password_action,
self.on.secret_changed,
self.on[PeerRelationName].relation_changed,
self.on[PeerClusterRelationName].relation_changed,
],
metrics_rules_dir="./src/alert_rules/prometheus",
log_slots=["opensearch:logs"],
)
# Ensure that only one instance of the `_on_peer_relation_changed` handler exists
# in the deferred event queue
self._is_peer_rel_changed_deferred = False
Expand Down Expand Up @@ -689,8 +696,18 @@ def _on_set_password_action(self, event: ActionEvent):
self._put_or_update_internal_user_leader(user_name, password)
label = self.secrets.password_key(user_name)
event.set_results({label: password})
# We know we are already running for MAIN_ORCH. and its leader unit
self.peer_cluster_provider.refresh_relation_data(event)
except OpenSearchError as e:
event.fail(f"Failed changing the password: {e}")
except RuntimeError as e:
# From:
# https://github.com/canonical/operator/blob/ \
# eb52cef1fba4df2f999f88902fb39555fb6de52f/ops/charm.py
if str(e) == "cannot defer action events":
event.fail("Cluster is not ready to update this password. Try again later.")
else:
event.fail(f"Failed with unknown error: {e}")

def _on_get_password_action(self, event: ActionEvent):
"""Return the password and cert chain for the admin user of the cluster."""
Expand Down Expand Up @@ -1531,19 +1548,42 @@ def _check_certs_expiration(self, event: UpdateStatusEvent) -> None:
Scope.UNIT, "certs_exp_checked_at", datetime.now().strftime(date_format)
)

def _get_prometheus_labels(self) -> Optional[Dict[str, str]]:
"""Return the labels for the prometheus scrape."""
try:
if not self.opensearch.roles:
return None
roles = set(
role if role in COS_TAGGABLE_ROLES else "unrecognized"
for role in self.opensearch.roles
)
roles = sorted(roles)
return {"roles": ",".join(roles)}
except KeyError:
# At very early stages of the deployment, "node.roles" may not be yet present
# in the opensearch.yml, nor APIs is responding. Therefore, we need to catch
# the KeyError here and report the appropriate response.
return None

def _scrape_config(self) -> List[Dict]:
"""Generates the scrape config as needed."""
if (
not (app_secrets := self.secrets.get_object(Scope.APP, CertType.APP_ADMIN.val))
or not (ca := app_secrets.get("ca-cert"))
or not (pwd := self.secrets.get(Scope.APP, self.secrets.password_key(COSUser)))
or not self._get_prometheus_labels()
):
# Not yet ready, waiting for certain values to be set
return []
return [
{
"metrics_path": "/_prometheus/metrics",
"static_configs": [{"targets": [f"{self.unit_ip}:{COSPort}"]}],
"static_configs": [
{
"targets": [f"{self.unit_ip}:{COSPort}"],
"labels": self._get_prometheus_labels(),
}
],
"tls_config": {"ca": ca},
"scheme": "https" if self.is_tls_fully_configured() else "http",
"basic_auth": {"username": f"{COSUser}", "password": f"{pwd}"},
Expand Down
Loading

0 comments on commit 7b7fc59

Please sign in to comment.