Skip to content

Commit

Permalink
fix: create a Service for the workload and fix the metrics collector (#…
Browse files Browse the repository at this point in the history
…101)

* fix: create a Service for the workload and fix the metrics collector

This charm was not deploying any Service for the workload container,
which is fine for its regular functions, but causes an issue when the
Prometheus scraper tries reaching out the metrics endpoint.
This commit adds a Service that is attached to the WORKLOAD (the
container inside the Pod that gets created by the StatefulSet we are
applying manually) so that the metrics from it can be reached correctly.
Because of that, the MetricsEndpointProvider's target has to be refactored
to point to the correct service. In a previous version of this charm,
the target was pointing to the charm's container, which does not have
any metrics endpoit, causing the issues reported in
canonical/bundle-kubeflow#564.

Part of canonical/bundle-kubeflow#564

* tests: add an assertion for checking unit is available

The test_prometheus_grafana_integration test case was doing queries to prometheus
and checking the request returned successfully and that the application name and model
was listed correctly. To make this test case more accurately, we can add an assertion that
also checks that the unit is available, this way we avoid issues like the one described in
canonical/bundle-kubeflow#564.

Part of canonical/bundle-kubeflow#564
  • Loading branch information
DnPlas authored Feb 13, 2024
1 parent cfecf50 commit 5792151
Show file tree
Hide file tree
Showing 3 changed files with 48 additions and 18 deletions.
40 changes: 22 additions & 18 deletions src/charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,23 +32,6 @@ def __init__(self, *args):
self.model.unit.status = WaitingStatus("Waiting for leadership")
return

self.prometheus_provider = MetricsEndpointProvider(
charm=self,
relation_name="metrics-endpoint",
jobs=[
{
"metrics_path": METRICS_PATH,
"static_configs": [{"targets": ["*:{}".format(METRICS_PORT)]}],
}
],
)

self.dashboard_provider = GrafanaDashboardProvider(self)

self.framework.observe(self.on.install, self._install)
self.framework.observe(self.on.config_changed, self._install)
self.framework.observe(self.on.update_status, self._update_status)

self.logger: logging.Logger = logging.getLogger(__name__)

self._name: str = self.model.app.name
Expand All @@ -58,6 +41,7 @@ def __init__(self, *args):
"crds": "metacontroller-crds-v1.yaml",
"rbac": "metacontroller-rbac.yaml",
"controller": "metacontroller.yaml",
"service": "metacontroller-svc.yaml",
}

# TODO: Fix file imports and move ./src/files back to ./files
Expand All @@ -66,6 +50,25 @@ def __init__(self, *args):
self._lightkube_client: Optional[lightkube.Client] = None
self._max_time_checking_resources = 150

# Observability integration
self.dashboard_provider = GrafanaDashboardProvider(self)
self.prometheus_provider = MetricsEndpointProvider(
charm=self,
relation_name="metrics-endpoint",
jobs=[
{
"metrics_path": METRICS_PATH,
"static_configs": [
{"targets": [f"{self._name}-svc.{self._namespace}.svc:{METRICS_PORT}"]}
],
}
],
)

self.framework.observe(self.on.install, self._install)
self.framework.observe(self.on.config_changed, self._install)
self.framework.observe(self.on.update_status, self._update_status)

def _install(self, event):
"""Creates k8s resources required for the charm, patching over any existing ones it finds"""
self.logger.info("Installing by instantiating Kubernetes objects")
Expand All @@ -91,8 +94,8 @@ def _install(self, event):
raise e

self._create_resource("crds")

self._create_resource("controller")
self._create_resource("service")

self.logger.info("Waiting for installed Kubernetes objects to be operational")

Expand Down Expand Up @@ -151,6 +154,7 @@ def _render_resource(self, yaml_name: [str, Path]):
"app_name": self._name,
"namespace": self._namespace,
"metacontroller_image": self._metacontroller_image,
"metrics_port": METRICS_PORT,
}
with open(self._manifest_file_root / self._resource_files[yaml_name]) as f:
return codecs.load_all_yaml(f, context=context)
Expand Down
20 changes: 20 additions & 0 deletions src/files/manifests/metacontroller-svc.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
---
apiVersion: v1
kind: Service
metadata:
name: {{ app_name }}-svc
namespace: {{ namespace }}
spec:
ports:
- name: metrics-endpoint
port: {{ metrics_port }}
protocol: TCP
targetPort: {{ metrics_port }}
selector:
# This selector ensures this Service identifies
# the metacontroller workload Pod correctly as ti will have
# the same tag. Please NOTE this is pointing at the workload
# and not the charm. The label is assigned to the Pod via the
# StatefulSet located in the same directory as this file.
app.kubernetes.io/name: {{ namespace }}-{{ app_name }}-charm

6 changes: 6 additions & 0 deletions tests/integration/test_charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,12 @@ async def test_prometheus_grafana_integration(ops_test: OpsTest):
assert response_metric["juju_application"] == APP_NAME
assert response_metric["juju_model"] == ops_test.model_name

# Assert the unit is available by checking the query result
# The data is presented as a list [1707357912.349, '1'], where the
# first value is a timestamp and the second value is the state of the unit
# 1 means available, 0 means unavailable
assert response["data"]["result"][0]["value"][1] == "1"


# Helper to retry calling a function over 30 seconds or 5 attempts
retry_for_5_attempts = tenacity.Retrying(
Expand Down

0 comments on commit 5792151

Please sign in to comment.