From 7b18212d384c0d863652adb51875fcc7e3e1f0b1 Mon Sep 17 00:00:00 2001 From: Zhengda Lu Date: Tue, 19 Nov 2024 07:58:13 -0500 Subject: [PATCH] [mongo] gracefully fail on operation samples collection when node is in recovering mode (#19080) * gracefully fail on operation samples collection when node is in recovering mode * add changelog * skip samples when replset_state is refreshed and node is recovering --- mongo/changelog.d/19080.fixed | 2 ++ .../mongo/dbm/operation_samples.py | 18 +++++++--- mongo/tests/test_dbm_operation_samples.py | 34 +++++++++++++++++++ 3 files changed, 50 insertions(+), 4 deletions(-) create mode 100644 mongo/changelog.d/19080.fixed diff --git a/mongo/changelog.d/19080.fixed b/mongo/changelog.d/19080.fixed new file mode 100644 index 0000000000000..347af2c7cc8f3 --- /dev/null +++ b/mongo/changelog.d/19080.fixed @@ -0,0 +1,2 @@ +Fix crash in DBM operation samples collection when a node is in recovering mode. + diff --git a/mongo/datadog_checks/mongo/dbm/operation_samples.py b/mongo/datadog_checks/mongo/dbm/operation_samples.py index 7acccf0821547..9ecc14d09309f 100644 --- a/mongo/datadog_checks/mongo/dbm/operation_samples.py +++ b/mongo/datadog_checks/mongo/dbm/operation_samples.py @@ -7,6 +7,7 @@ from typing import List, Optional from bson import json_util +from pymongo.errors import NotPrimaryError from datadog_checks.mongo.dbm.utils import ( format_key_name, @@ -107,6 +108,9 @@ def _should_collect_operation_samples(self) -> bool: if isinstance(deployment, ReplicaSetDeployment) and deployment.is_arbiter: self._check.log.debug("Skipping operation samples collection on arbiter node") return False + elif isinstance(deployment, ReplicaSetDeployment) and deployment.replset_state == 3: + self._check.log.debug("Skipping operation samples collection on node in recovering state") + return False return True def _get_operation_samples(self, now, databases_monitored: List[str]): @@ -149,10 +153,16 @@ def _get_operation_samples(self, now, 
databases_monitored: List[str]): continue def _get_current_op(self): - operations = self._check.api_client.current_op() - for operation in operations: - self._check.log.debug("Found operation: %s", operation) - yield operation + try: + operations = self._check.api_client.current_op() + for operation in operations: + self._check.log.debug("Found operation: %s", operation) + yield operation + except NotPrimaryError as e: + # If the node is not primary or secondary, for example node is in recovering state + # we could not run the $currentOp command to collect operation samples. + self._check.log.warning("Could not collect operation samples, node is not primary or secondary") + self._check.log.debug("Error details: %s", e) def _should_include_operation(self, operation: dict, databases_monitored: List[str]) -> bool: # Skip operations from db that are not configured to be monitored diff --git a/mongo/tests/test_dbm_operation_samples.py b/mongo/tests/test_dbm_operation_samples.py index 1ad182b3a7631..0621cd7edc32f 100644 --- a/mongo/tests/test_dbm_operation_samples.py +++ b/mongo/tests/test_dbm_operation_samples.py @@ -5,7 +5,9 @@ import json import os +import mock import pytest +from pymongo.errors import NotPrimaryError from . 
import common from .common import HERE @@ -105,3 +107,35 @@ def test_mongo_operation_samples_arbiter(aggregator, instance_arbiter, check, dd assert len(dbm_samples) == 0 assert len(dbm_activities) == 0 + + +@mock_now(1715911398.1112723) +@common.shard +def test_mongo_operation_samples_not_primary( + aggregator, instance_integration_cluster_autodiscovery, check, dd_run_check +): + instance_integration_cluster_autodiscovery['dbm'] = True + instance_integration_cluster_autodiscovery['operation_samples'] = {'enabled': True, 'run_sync': True} + instance_integration_cluster_autodiscovery['slow_operations'] = {'enabled': False} + instance_integration_cluster_autodiscovery['schemas'] = {'enabled': False} + + mongo_check = check(instance_integration_cluster_autodiscovery) + with mock_pymongo("standalone"): + with mock.patch( + 'datadog_checks.mongo.api.MongoApi.current_op', new_callable=mock.PropertyMock + ) as mock_current_op: + mock_current_op.side_effect = NotPrimaryError("node is recovering") + aggregator.reset() + run_check_once(mongo_check, dd_run_check) + + dbm_activities = aggregator.get_event_platform_events("dbm-activity") + activity_samples = [event for event in dbm_activities if event['dbm_type'] == 'activity'] + assert activity_samples is not None + assert len(activity_samples[0]['mongodb_activity']) == 0 + + aggregator.reset() + mongo_check.deployment_type.replset_state = 3 + run_check_once(mongo_check, dd_run_check) + dbm_activities = aggregator.get_event_platform_events("dbm-activity") + activity_samples = [event for event in dbm_activities if event['dbm_type'] == 'activity'] + assert len(activity_samples) == 0