Skip to content

Commit

Permalink
[mongo] gracefully fail on operation samples collection when node is …
Browse files Browse the repository at this point in the history
…in recovering mode (#19080)

* gracefully fail on operation samples collection when node is in recovering mode

* add changelog

* skip samples when replset_state is refreshed and node is recovering
  • Loading branch information
lu-zhengda authored Nov 19, 2024
1 parent c875ec6 commit 7b18212
Show file tree
Hide file tree
Showing 3 changed files with 50 additions and 4 deletions.
2 changes: 2 additions & 0 deletions mongo/changelog.d/19080.fixed
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Fix crash in DBM operation samples collection when a node is in recovering mode.

18 changes: 14 additions & 4 deletions mongo/datadog_checks/mongo/dbm/operation_samples.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from typing import List, Optional

from bson import json_util
from pymongo.errors import NotPrimaryError

from datadog_checks.mongo.dbm.utils import (
format_key_name,
Expand Down Expand Up @@ -107,6 +108,9 @@ def _should_collect_operation_samples(self) -> bool:
if isinstance(deployment, ReplicaSetDeployment) and deployment.is_arbiter:
self._check.log.debug("Skipping operation samples collection on arbiter node")
return False
elif isinstance(deployment, ReplicaSetDeployment) and deployment.replset_state == 3:
self._check.log.debug("Skipping operation samples collection on node in recovering state")
return False
return True

def _get_operation_samples(self, now, databases_monitored: List[str]):
Expand Down Expand Up @@ -149,10 +153,16 @@ def _get_operation_samples(self, now, databases_monitored: List[str]):
continue

def _get_current_op(self):
operations = self._check.api_client.current_op()
for operation in operations:
self._check.log.debug("Found operation: %s", operation)
yield operation
try:
operations = self._check.api_client.current_op()
for operation in operations:
self._check.log.debug("Found operation: %s", operation)
yield operation
except NotPrimaryError as e:
# If the node is not primary or secondary, for example node is in recovering state
# we could not run the $currentOp command to collect operation samples.
self._check.log.warning("Could not collect operation samples, node is not primary or secondary")
self._check.log.debug("Error details: %s", e)

def _should_include_operation(self, operation: dict, databases_monitored: List[str]) -> bool:
# Skip operations from db that are not configured to be monitored
Expand Down
34 changes: 34 additions & 0 deletions mongo/tests/test_dbm_operation_samples.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@
import json
import os

import mock
import pytest
from pymongo.errors import NotPrimaryError

from . import common
from .common import HERE
Expand Down Expand Up @@ -105,3 +107,35 @@ def test_mongo_operation_samples_arbiter(aggregator, instance_arbiter, check, dd

assert len(dbm_samples) == 0
assert len(dbm_activities) == 0


@mock_now(1715911398.1112723)
@common.shard
def test_mongo_operation_samples_not_primary(
    aggregator, instance_integration_cluster_autodiscovery, check, dd_run_check
):
    """Operation samples collection must not crash when the node cannot serve $currentOp.

    Two scenarios are exercised against the same check instance:

    1. ``MongoApi.current_op`` raises ``NotPrimaryError``: the check run still
       completes and emits an activity event whose ``mongodb_activity`` list
       is empty.
    2. ``replset_state`` is set to 3 on the deployment: collection is skipped,
       so no activity event is emitted at all.
    """
    instance_integration_cluster_autodiscovery['dbm'] = True
    instance_integration_cluster_autodiscovery['operation_samples'] = {'enabled': True, 'run_sync': True}
    instance_integration_cluster_autodiscovery['slow_operations'] = {'enabled': False}
    instance_integration_cluster_autodiscovery['schemas'] = {'enabled': False}

    mongo_check = check(instance_integration_cluster_autodiscovery)
    with mock_pymongo("standalone"):
        # PropertyMock makes the attribute lookup itself raise, so the error
        # fires as soon as the check reaches for `api_client.current_op`.
        with mock.patch(
            'datadog_checks.mongo.api.MongoApi.current_op', new_callable=mock.PropertyMock
        ) as mock_current_op:
            mock_current_op.side_effect = NotPrimaryError("node is recovering")

            # Scenario 1: $currentOp fails with NotPrimaryError.
            aggregator.reset()
            run_check_once(mongo_check, dd_run_check)

            dbm_activities = aggregator.get_event_platform_events("dbm-activity")
            activity_samples = [event for event in dbm_activities if event['dbm_type'] == 'activity']
            # A list comprehension is never None; assert that an event was
            # actually emitted so a missing payload fails with a clear message
            # instead of an IndexError on the lookup below.
            assert len(activity_samples) > 0
            assert len(activity_samples[0]['mongodb_activity']) == 0

            # Scenario 2: node reports replset_state 3, collection is skipped.
            aggregator.reset()
            mongo_check.deployment_type.replset_state = 3
            run_check_once(mongo_check, dd_run_check)

            dbm_activities = aggregator.get_event_platform_events("dbm-activity")
            activity_samples = [event for event in dbm_activities if event['dbm_type'] == 'activity']
            assert len(activity_samples) == 0

0 comments on commit 7b18212

Please sign in to comment.