Skip to content

Commit

Permalink
[metrics] Fix lost connection when metrics query database
Browse files Browse the repository at this point in the history
  • Loading branch information
wing2fly committed Sep 24, 2024
1 parent 41fd5de commit 4b76309
Show file tree
Hide file tree
Showing 2 changed files with 58 additions and 15 deletions.
30 changes: 26 additions & 4 deletions apps/useradmin/src/useradmin/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,15 +15,18 @@
# limitations under the License.

import logging

from datetime import datetime, timedelta

from django.db import connection
from django.db.utils import OperationalError
from prometheus_client import Gauge

from desktop.lib.metrics import global_registry
from desktop.lib.security_util import get_localhost_name

LOG = logging.getLogger()


def active_users():
from useradmin.models import UserProfile
try:
Expand All @@ -32,11 +35,21 @@ def active_users():
first_login=False,
hostname__isnull=False
).count()
except:
except OperationalError as oe:
LOG.debug('active_users recovering from %s' % str(oe))
connection.close()
connection.connect()
count = UserProfile.objects.filter(
last_activity__gt=datetime.now() - timedelta(hours=1),
first_login=False,
hostname__isnull=False
).count()
except Exception as e:
LOG.exception('Could not get active_users')
count = 0
return count


global_registry().gauge_callback(
name='users.active.total',
callback=active_users,
Expand All @@ -48,15 +61,24 @@ def active_users():
prometheus_active_users = Gauge('hue_active_users', 'Hue Active Users in All Instances')
prometheus_active_users.set_function(active_users)


def active_users_per_instance():
  """Count the users active on this instance during the last hour.

  A profile is counted when its ``last_activity`` is more recent than one hour
  ago and its ``hostname`` equals ``get_localhost_name()``.

  If the first query fails with ``OperationalError``, the database connection
  is closed and re-opened and the query is attempted exactly once more.

  Returns:
    The number of matching profiles, or 0 when the count cannot be obtained.
    This callback never raises.
  """
  from useradmin.models import UserProfile

  def _count():
    # Built on every call so the one-hour window is relative to "now".
    return UserProfile.objects.filter(
      last_activity__gt=datetime.now() - timedelta(hours=1),
      hostname=get_localhost_name()
    ).count()

  try:
    return _count()
  except OperationalError as oe:
    LOG.debug('active_users_per_instance recovering from %s' % str(oe))
  except Exception:
    LOG.exception('Could not get active_users per instance')
    return 0

  # The retry lives in its own try block: an exception raised inside an
  # ``except`` clause is not handled by the sibling ``except Exception`` of the
  # same statement, so a failed reconnect would otherwise escape to the caller.
  try:
    connection.close()
    connection.connect()
    return _count()
  except Exception:
    LOG.exception('Could not get active_users per instance')
    return 0


global_registry().gauge_callback(
name='users.active',
callback=active_users_per_instance,
Expand Down
43 changes: 32 additions & 11 deletions desktop/core/src/desktop/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,22 +16,23 @@

from __future__ import absolute_import

from future import standard_library
standard_library.install_aliases()
from builtins import range
import gc
import logging
import multiprocessing
import threading

import multiprocessing
from builtins import range
from datetime import datetime, timedelta
from prometheus_client import Gauge, REGISTRY

from useradmin.models import User
from django.db import connection
from django.db.utils import OperationalError
from future import standard_library
from prometheus_client import REGISTRY, Gauge

from desktop.conf import ENABLE_PROMETHEUS
from desktop.lib.metrics import global_registry
from useradmin.models import User

standard_library.install_aliases()

LOG = logging.getLogger()

Expand All @@ -49,7 +50,9 @@
django_collectors = set()
django_metrics_names = [
name
for name in REGISTRY._names_to_collectors.keys() if name.startswith('django_') and not name.startswith(ALLOWED_DJANGO_PROMETHEUS_METRICS)
for name in REGISTRY._names_to_collectors.keys()
if name.startswith('django_')
and not name.startswith(ALLOWED_DJANGO_PROMETHEUS_METRICS)
]

for metric_name in django_metrics_names:
Expand Down Expand Up @@ -141,14 +144,21 @@

# ------------------------------------------------------------------------------


def user_count():
  """Return the number of user accounts.

  If the first query fails with ``OperationalError``, the database connection
  is closed and re-opened and the query is attempted exactly once more.

  Returns:
    ``User.objects.count()``, or 0 when the count cannot be obtained.
    This callback never raises.
  """
  try:
    return User.objects.count()
  except OperationalError as oe:
    LOG.debug('user_count recovering from %s' % str(oe))
  except Exception:
    LOG.exception('Metrics: Failed to get number of user accounts')
    return 0

  # The retry lives in its own try block: an exception raised inside an
  # ``except`` clause is not handled by the sibling ``except Exception`` of the
  # same statement, so a failed reconnect would otherwise escape to the caller.
  try:
    connection.close()
    connection.connect()
    return User.objects.count()
  except Exception:
    LOG.exception('Metrics: Failed to get number of user accounts')
    return 0


user_count = global_registry().gauge_callback(
name='users',
callback=user_count,
Expand Down Expand Up @@ -188,19 +198,30 @@ def user_count():

# ------------------------------------------------------------------------------


def num_of_queries():
  """Count the history query documents modified in the last 10 minutes.

  A ``Document2`` row is counted when its ``type`` starts with ``query-``
  (case-insensitive), ``is_history`` is true and ``last_modified`` is more
  recent than ten minutes ago.

  If the first query fails with ``OperationalError``, the database connection
  is closed and re-opened and the query is attempted exactly once more.

  Returns:
    The number of matching documents, or 0 when the count cannot be obtained.
    This callback never raises.
  """
  from desktop.models import Document2  # Avoid circular dependency

  def _count():
    # Built on every call so the ten-minute window is relative to "now".
    return Document2.objects.filter(
      type__istartswith='query-',
      is_history=True,
      last_modified__gt=datetime.now() - timedelta(minutes=10)
    ).count()

  try:
    return _count()
  except OperationalError as oe:
    LOG.debug('num_of_queries recovering from %s' % str(oe))
  except Exception:
    LOG.exception('Could not get num_of_queries')
    return 0

  # The retry lives in its own try block: an exception raised inside an
  # ``except`` clause is not handled by the sibling ``except Exception`` of the
  # same statement, so a failed reconnect would otherwise escape to the caller.
  try:
    connection.close()
    connection.connect()
    return _count()
  except Exception:
    LOG.exception('Could not get num_of_queries')
    return 0


global_registry().gauge_callback(
name='queries.number',
callback=num_of_queries,
Expand Down

0 comments on commit 4b76309

Please sign in to comment.