From d0f60ce85d98cf15894c668847cefe14cbb65eed Mon Sep 17 00:00:00 2001 From: lindenb1 Date: Wed, 4 Dec 2024 15:43:02 +0100 Subject: [PATCH] =?UTF-8?q?=E2=9C=A8(backend)=20Adding=20/prometheus=20log?= =?UTF-8?q?ging=20endpoint=20#455?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit implements various metrics for users, documents and the actual django application Signed-off-by: lindenb1 --- CHANGELOG.md | 3 + docker-compose.yml | 3 + .../core/api/custom_metrics_exporter.py | 100 ++++++++++++++++++ src/backend/core/api/custom_probe_views.py | 92 ++++++++++++++++ src/backend/core/api/decorators.py | 47 ++++++++ src/backend/impress/settings.py | 15 ++- src/backend/impress/urls.py | 22 ++++ src/backend/impress/wsgi.py | 19 ++++ src/backend/pyproject.toml | 1 + 9 files changed, 301 insertions(+), 1 deletion(-) create mode 100644 src/backend/core/api/custom_metrics_exporter.py create mode 100644 src/backend/core/api/custom_probe_views.py create mode 100644 src/backend/core/api/decorators.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 46e174b20..e4b4fc7f5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,9 @@ and this project adheres to ## [Unreleased] +## Added + +✨(backend) Adding /prometheus metrics endpoint #455 ## [1.8.2] - 2024-11-28 diff --git a/docker-compose.yml b/docker-compose.yml index 5c7e4d1ad..211901d24 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -50,6 +50,9 @@ services: environment: - PYLINTHOME=/app/.pylint.d - DJANGO_CONFIGURATION=Development + - PROMETHEUS_EXPORTER=true + - K8S_PROBING=true + - MONITORING_ALLOWED_CIDR_RANGES="172.23.0.0/16" # separate by comma env_file: - env.d/development/common - env.d/development/postgresql diff --git a/src/backend/core/api/custom_metrics_exporter.py b/src/backend/core/api/custom_metrics_exporter.py new file mode 100644 index 000000000..c85ade2eb --- /dev/null +++ b/src/backend/core/api/custom_metrics_exporter.py @@ -0,0 +1,100 @@ +from 
prometheus_client.core import GaugeMetricFamily +from django.utils.timezone import now +from django.db.models import Count, Min, Max, Q, F +from datetime import timedelta +from core import models +from django.conf import settings + + +class CustomMetricsExporter: + """ + Custom Prometheus metrics collector for user and document statistics. + """ + + def collect(self): + namespace = getattr(settings, "PROMETHEUS_METRIC_NAMESPACE", "") + + def prefixed_metric_name(name): + return f"{namespace}_{name}" if namespace else name + + now_time = now() + today_start_utc = now_time.replace(hour=0, minute=0, second=0, microsecond=0) + one_week_ago = today_start_utc - timedelta(days=7) + one_month_ago = today_start_utc - timedelta(days=30) + + user_count = models.User.objects.count() + active_users_today = models.User.objects.filter( + Q(documentaccess__updated_at__gte=today_start_utc) | + Q(link_traces__created_at__gte=today_start_utc) | + Q(last_login__gte=today_start_utc) + ).distinct().count() + active_users_7_days = models.User.objects.filter( + Q(documentaccess__updated_at__gte=one_week_ago) | + Q(link_traces__created_at__gte=one_week_ago) | + Q(last_login__gte=one_week_ago) + ).distinct().count() + active_users_30_days = models.User.objects.filter( + Q(documentaccess__updated_at__gte=one_month_ago) | + Q(link_traces__created_at__gte=one_month_ago) | + Q(last_login__gte=one_month_ago) + ).distinct().count() + + total_documents = models.Document.objects.count() + shared_docs_count = models.Document.objects.annotate( + access_count=Count("accesses") + ).filter(access_count__gt=1).count() + active_docs_today = models.Document.objects.filter( + updated_at__gte=today_start_utc, + updated_at__lt=today_start_utc + timedelta(days=1), + ).count() + active_docs_last_7_days = models.Document.objects.filter( + updated_at__gte=one_week_ago + ).count() + active_docs_last_30_days = models.Document.objects.filter( + updated_at__gte=one_month_ago + ).count() + + oldest_doc_date = 
models.Document.objects.aggregate( + oldest=Min("created_at") + )["oldest"] + newest_doc_date = models.Document.objects.aggregate( + newest=Max("created_at") + )["newest"] + + user_doc_counts = models.DocumentAccess.objects.values("user_id").annotate( + doc_count=Count("document_id"), + admin_email=F("user__admin_email") + ) + + metrics = [] + metrics.append(GaugeMetricFamily(prefixed_metric_name("total_users"), "Total number of users", value=user_count)) + metrics.append(GaugeMetricFamily(prefixed_metric_name("active_users_today"), "Number of active users today", value=active_users_today)) + metrics.append(GaugeMetricFamily(prefixed_metric_name("active_users_7_days"), "Number of active users in the last 7 days", value=active_users_7_days)) + metrics.append(GaugeMetricFamily(prefixed_metric_name("active_users_30_days"), "Number of active users in the last 30 days", value=active_users_30_days)) + metrics.append(GaugeMetricFamily(prefixed_metric_name("total_documents"), "Total number of documents", value=total_documents)) + metrics.append(GaugeMetricFamily(prefixed_metric_name("shared_documents"), "Number of shared documents", value=shared_docs_count)) + metrics.append(GaugeMetricFamily(prefixed_metric_name("active_documents_today"), "Number of active documents today", value=active_docs_today)) + metrics.append(GaugeMetricFamily(prefixed_metric_name("active_documents_7_days"), "Number of active documents in the last 7 days", value=active_docs_last_7_days)) + metrics.append(GaugeMetricFamily(prefixed_metric_name("active_documents_30_days"), "Number of active documents in the last 30 days", value=active_docs_last_30_days)) + + if oldest_doc_date: + metrics.append(GaugeMetricFamily( + prefixed_metric_name("oldest_document_date"), "Timestamp of the oldest document creation date", + value=oldest_doc_date.timestamp() + )) + if newest_doc_date: + metrics.append(GaugeMetricFamily( + prefixed_metric_name("newest_document_date"), "Timestamp of the newest document creation 
date", + value=newest_doc_date.timestamp() + )) + + user_distribution_metric = GaugeMetricFamily( + prefixed_metric_name("user_document_distribution"), "Document counts per user", labels=["user_email"] + ) + for user in user_doc_counts: + if user["admin_email"]: # Validate email existence + user_distribution_metric.add_metric([user["admin_email"]], user["doc_count"]) + metrics.append(user_distribution_metric) + + for metric in metrics: + yield metric diff --git a/src/backend/core/api/custom_probe_views.py b/src/backend/core/api/custom_probe_views.py new file mode 100644 index 000000000..220776365 --- /dev/null +++ b/src/backend/core/api/custom_probe_views.py @@ -0,0 +1,92 @@ +import uuid +import requests +from django.http import JsonResponse, HttpResponseServerError, HttpResponse +from django.db import connections +from django.db.utils import OperationalError +from django.core.cache import cache +from django.core.files.storage import default_storage +from django.core.files.base import ContentFile + +from impress import settings + + +def liveness_check(request): + """ + Liveness probe endpoint. + Returns HTTP 200 if the application is alive and running. + """ + + try: + return JsonResponse({"status": "OK"}, status=200) + except Exception as e: + return JsonResponse({"status": "Error", "message": str(e)}, status=500) + + +def readiness_check(request): + """ + Readiness probe endpoint. + Checks database, cache, media storage, and OIDC configuration. + Returns HTTP 200 with JSON status "OK" if all checks pass, + or HTTP 500 with JSON status "Error" and an error message. 
+ """ + + def check_database(): + """Check database connectivity.""" + try: + db_conn = connections['default'] + db_conn.cursor() + except OperationalError as e: + raise Exception(f"Database check failed: {e}") + + def check_cache(): + """Check cache connectivity.""" + test_key = "readiness-probe" + test_value = "ready" + cache.set(test_key, test_value, timeout=5) + if cache.get(test_key) != test_value: + raise Exception("Cache check failed: Value mismatch or cache unavailable") + + def check_media_storage(): + """Check S3 storage connectivity.""" + test_file_name = f"readiness-check-{uuid.uuid4()}.txt" + test_content = ContentFile(b"readiness check") + try: + # Attempt to save the test file + default_storage.save(test_file_name, test_content) + # Attempt to delete the test file + default_storage.delete(test_file_name) + except Exception as e: + # Raise an exception if any error occurs during save or delete + raise Exception(f"Media storage check failed: {e}") + + def check_oidc(): + """Check OIDC configuration and connectivity.""" + required_endpoints = [ + ("OIDC_OP_JWKS_ENDPOINT", settings.OIDC_OP_JWKS_ENDPOINT), + ("OIDC_OP_TOKEN_ENDPOINT", settings.OIDC_OP_TOKEN_ENDPOINT), + ("OIDC_OP_USER_ENDPOINT", settings.OIDC_OP_USER_ENDPOINT), + ] + + missing_endpoints = [name for name, url in required_endpoints if not url] + if missing_endpoints: + raise Exception(f"Missing OIDC configuration for: {', '.join(missing_endpoints)}") + + for name, url in required_endpoints: + try: + requests.get(url, timeout=5) # Just ensure the endpoint responds no matter the http status code + except requests.RequestException as e: + raise Exception(f"Failed to reach {name} ({url}): {e}") + + try: + # Run all checks + check_database() + check_cache() + check_media_storage() + check_oidc() + + # If all checks pass + return JsonResponse({"status": "OK"}, status=200) + + except Exception as e: + # Return error response + return JsonResponse({"status": "Error", "message": str(e)}, status=500) 
\ No newline at end of file diff --git a/src/backend/core/api/decorators.py b/src/backend/core/api/decorators.py new file mode 100644 index 000000000..442f16e9e --- /dev/null +++ b/src/backend/core/api/decorators.py @@ -0,0 +1,47 @@ +import os +from ipaddress import ip_network, ip_address +from django.http import HttpResponseForbidden + + +def monitoring_cidr_protected_view(view): + """ + Decorator to protect a view with a CIDR filter. + CIDR ranges are fetched from the environment variable `MONITORING_ALLOWED_CIDR_RANGES`. + If set to '*', all clients are allowed. If not set or empty, access is denied. + """ + # Fetch allowed CIDR ranges from the environment variable + cidr_env = os.environ.get("MONITORING_ALLOWED_CIDR_RANGES", "").strip() + + # Handle the special case for allowing all clients + allow_all = cidr_env == "*" + + # Validate and parse CIDR ranges if not allowing all + try: + allowed_cidr_ranges = [ + ip_network(cidr.strip().strip('"').strip("'")) + for cidr in cidr_env.split(",") + if cidr.strip() and cidr.strip() != "*" + ] + except ValueError as e: + raise ValueError(f"Invalid CIDR range in MONITORING_ALLOWED_CIDR_RANGES: {e}") + + def wrapped_view(request, *args, **kwargs): + # Get the client's IP address from the request + client_ip = request.META.get("REMOTE_ADDR") + + # Allow all clients if explicitly configured + if allow_all: + return view(request, *args, **kwargs) + + # If no CIDR ranges are configured, deny access + if not allowed_cidr_ranges: + return HttpResponseForbidden("Access denied: No allowed CIDR ranges configured.") + + # Check if the client's IP is in the allowed CIDR ranges + if not any(ip_address(client_ip) in cidr for cidr in allowed_cidr_ranges): + return HttpResponseForbidden("Access denied: Your IP is not allowed.") + + # Proceed to the original view + return view(request, *args, **kwargs) + + return wrapped_view diff --git a/src/backend/impress/settings.py b/src/backend/impress/settings.py index 3bb1d830a..906ff89d1 100755 --- 
a/src/backend/impress/settings.py +++ b/src/backend/impress/settings.py @@ -23,6 +23,7 @@ # Build paths inside the project like this: BASE_DIR / 'subdir'. BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) DATA_DIR = os.path.join("/", "data") +PROMETHEUS_EXPORTER = os.getenv("PROMETHEUS_EXPORTER", "False").lower() == "true" def get_release(): @@ -282,6 +283,14 @@ class Base(Configuration): "dockerflow.django.middleware.DockerflowMiddleware", ] + if PROMETHEUS_EXPORTER: + MIDDLEWARE.insert(0, "django_prometheus.middleware.PrometheusBeforeMiddleware") + MIDDLEWARE.append("django_prometheus.middleware.PrometheusAfterMiddleware") + PROMETHEUS_METRIC_NAMESPACE = "impress" + PROMETHEUS_LATENCY_BUCKETS = ( + .05, .1, .25, .5, .75, 1.0, 1.5, 2.5, 5.0, 10.0, 15.0, 30.0, float("inf") + ) + AUTHENTICATION_BACKENDS = [ "django.contrib.auth.backends.ModelBackend", "core.authentication.backends.OIDCAuthenticationBackend", @@ -295,6 +304,7 @@ class Base(Configuration): "drf_spectacular", # Third party apps "corsheaders", + "django_prometheus", "dockerflow.django", "rest_framework", "parler", @@ -314,7 +324,10 @@ class Base(Configuration): # Cache CACHES = { - "default": {"BACKEND": "django.core.cache.backends.locmem.LocMemCache"}, + "default": { + "BACKEND": "django.core.cache.backends.locmem.LocMemCache" if not PROMETHEUS_EXPORTER + else "django_prometheus.cache.backends.locmem.LocMemCache", + }, } REST_FRAMEWORK = { diff --git a/src/backend/impress/urls.py b/src/backend/impress/urls.py index 5dc490ac1..9cd550264 100644 --- a/src/backend/impress/urls.py +++ b/src/backend/impress/urls.py @@ -1,5 +1,6 @@ """URL configuration for the impress project""" +import os from django.conf import settings from django.conf.urls.static import static from django.contrib import admin @@ -12,11 +13,32 @@ SpectacularSwaggerView, ) +from django_prometheus import exports +from core.api.custom_probe_views import liveness_check, readiness_check +from core.api.decorators import 
monitoring_cidr_protected_view + urlpatterns = [ path("admin/", admin.site.urls), path("", include("core.urls")), ] +# Conditionally add Prometheus Exporter endpoint +if os.environ.get("PROMETHEUS_EXPORTER", "False").lower() == "true": + # Protect the Prometheus view with the CIDR decorator + urlpatterns.append( + path("prometheus/", monitoring_cidr_protected_view(exports.ExportToDjangoView), name="prometheus-django-metrics"), + ) + +# Conditionally add liveness and readiness probe endpoints +if os.environ.get("K8S_PROBING", "False").lower() == "true": + + urlpatterns.append( + path("probes/liveness/", monitoring_cidr_protected_view(liveness_check), name="liveness-probe"), + ) + urlpatterns.append( + path("probes/readiness/", monitoring_cidr_protected_view(readiness_check), name="readiness-probe"), + ) + if settings.DEBUG: urlpatterns = ( urlpatterns diff --git a/src/backend/impress/wsgi.py b/src/backend/impress/wsgi.py index 6076021c6..9f3f2ea0f 100644 --- a/src/backend/impress/wsgi.py +++ b/src/backend/impress/wsgi.py @@ -11,7 +11,26 @@ from configurations.wsgi import get_wsgi_application +# Prometheus Metrics Registration +from prometheus_client import REGISTRY +from core.api.custom_metrics_exporter import CustomMetricsExporter + + +def register_prometheus_exporter(): + """ + Register custom Prometheus metrics collector. 
+ """ + if not any(isinstance(cme, CustomMetricsExporter) for cme in REGISTRY._collector_to_names): + REGISTRY.register(CustomMetricsExporter()) + print("Custom Prometheus metrics registered successfully.") + else: + print("Custom Prometheus metrics already registered.") + os.environ.setdefault("DJANGO_SETTINGS_MODULE", "impress.settings") os.environ.setdefault("DJANGO_CONFIGURATION", "Development") +# Call register_prometheus_exporter to register Prometheus metrics if enabled +if os.environ.get("PROMETHEUS_EXPORTER", "False").lower() == "true": + register_prometheus_exporter() + application = get_wsgi_application() diff --git a/src/backend/pyproject.toml b/src/backend/pyproject.toml index 61863eb95..51daeb7ce 100644 --- a/src/backend/pyproject.toml +++ b/src/backend/pyproject.toml @@ -33,6 +33,7 @@ dependencies = [ "django-countries==7.6.1", "django-filter==24.3", "django-parler==2.3", + "django-prometheus==2.3.1", "redis==5.1.1", "django-redis==5.4.0", "django-storages[s3]==1.14.4",