Skip to content

Commit

Permalink
Collect sqlserver deadlock data (#18108)
Browse files Browse the repository at this point in the history
* Creating a test for the deadlock

* added test

* Deadlocks first impl

* first full version

* Added checks

* Added the deadlock file

* Added exception for truncated xml

* Added exception handling when obfuscating

* adapted to the activity pipeline

* Use activity event pipeline for the deadlock event

* Removed pdb

* Added logs

* Improved queries

* Added a new query

* fix import

* use temp table

* Made test stable

* Added obfuscation unit test

* Applied formatting

* Fixed linter errors

* propagated errors

* removed old imports

* Added setup to all variants

* Remove unused changes

* Added changelog

* deadlocks spec

* formatted spec

* Improved changelog

* Improved deadlock test

* Improved unit tests

* Fixed data model

* print the number of deadlocks found in test case

* test for empty payloads

* send payload only if deadlocks found

* refactor deadlock event extraction

* refactor deadlock event extraction after deadlock create

* deadlock events to deadlock payloads

* test for obfuscation errors

* test for obfuscation errors

* test refactoring completed

* obfuscation bug fix

* test with dbm: false

* removed reliance on temp table

* replace last date with offset

* default interval

* improved error handling

* fixes

* Update sqlserver/assets/configuration/spec.yaml

Co-authored-by: Justin <[email protected]>

* renamed config to deadlocks_collection

* refactor all deadlock funcs in the deadlock.py

* refactoring

* deadlocks in separate job

* separate deadlock tests

* linter

* deadlocks as async job

* payload as a list of dicts

* renamed query variable

* query signatures

* fixed test cases

* aux functions from utils to deadlock_test

* change query signature struct

* spid as int

* db rows in dict

* deadlock timestamp

* read date in test case

* fixed test_deadlock_xml_bad_format

* refactored test__create_deadlock_rows

* fixed test_deadlock_calls_obfuscator

* linter

* disabled by default

* fixing rebasing errors

* fixing rebasing errors

* restore test_unit from master

* restore tests/utils from master

* linter

* linter

* linter

* linter

* licence

* instance fix

* import order

* linter: import order

* asset validation

* get_deadlock_obj

* lookback

* key error

* linter

* exception handling

* enrich exception only on code error

* remove sqlncli from ci

* test diag change

* supported only with odbc

---------

Co-authored-by: Nenad Noveljic <[email protected]>
Co-authored-by: Justin <[email protected]>
  • Loading branch information
3 people authored Sep 27, 2024
1 parent 018da87 commit 0eac087
Show file tree
Hide file tree
Showing 17 changed files with 800 additions and 6 deletions.

This file was deleted.

23 changes: 23 additions & 0 deletions sqlserver/assets/configuration/spec.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -760,6 +760,29 @@ files:
value:
type: number
example: 10
- name: deadlocks_collection
hidden: True
description: |
Configure the collection of deadlock data. The feature is supported for odbc connector only.
options:
- name: enabled
description: |
Enable the collection of deadlock data. Requires `dbm: true`. Disabled by default.
value:
type: boolean
example: false
- name: collection_interval
description: |
Set the interval for collecting deadlock data, in seconds. Defaults to 600 seconds.
value:
type: number
example: 600
- name: max_deadlocks
description: |
Set the maximum number of deadlocks to retrieve per collection.
value:
type: number
example: 100
- template: instances/default
- template: logs
example:
Expand Down
1 change: 1 addition & 0 deletions sqlserver/changelog.d/18108.added
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Added deadlock collection feature to the SQL Server integration.
1 change: 1 addition & 0 deletions sqlserver/datadog_checks/sqlserver/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ def __init__(self, init_config, instance, log):
self.settings_config: dict = instance.get('collect_settings', {}) or {}
self.activity_config: dict = instance.get('query_activity', {}) or {}
self.schema_config: dict = instance.get('schemas_collection', {}) or {}
self.deadlocks_config: dict = instance.get('deadlocks_collection', {}) or {}
self.cloud_metadata: dict = {}
aws: dict = instance.get('aws', {}) or {}
gcp: dict = instance.get('gcp', {}) or {}
Expand Down
11 changes: 11 additions & 0 deletions sqlserver/datadog_checks/sqlserver/config_models/instance.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,16 @@ class CustomQuery(BaseModel):
tags: Optional[tuple[str, ...]] = None


class DeadlocksCollection(BaseModel):
    # Typed view of the `deadlocks_collection` instance option.
    # Every field is optional and defaults to None when the key is omitted.
    model_config = ConfigDict(arbitrary_types_allowed=True, frozen=True)

    # Seconds between deadlock collections.
    collection_interval: Optional[float] = None
    # Whether deadlock collection is turned on.
    enabled: Optional[bool] = None
    # Maximum number of deadlocks to retrieve per collection.
    max_deadlocks: Optional[float] = None


class Gcp(BaseModel):
model_config = ConfigDict(
arbitrary_types_allowed=True,
Expand Down Expand Up @@ -185,6 +195,7 @@ class InstanceConfig(BaseModel):
database_instance_collection_interval: Optional[float] = None
db_fragmentation_object_names: Optional[tuple[str, ...]] = None
dbm: Optional[bool] = None
deadlocks_collection: Optional[DeadlocksCollection] = None
disable_generic_tags: Optional[bool] = None
driver: Optional[str] = None
dsn: Optional[str] = None
Expand Down
193 changes: 193 additions & 0 deletions sqlserver/datadog_checks/sqlserver/deadlocks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,193 @@
# (C) Datadog, Inc. 2024-present
# All rights reserved
# Licensed under a 3-clause BSD style license (see LICENSE)

import xml.etree.ElementTree as ET
from time import time

from datadog_checks.base.utils.db.sql import compute_sql_signature
from datadog_checks.base.utils.db.utils import DBMAsyncJob, default_json_event_encoding, obfuscate_sql_with_metadata
from datadog_checks.base.utils.serialization import json
from datadog_checks.base.utils.tracking import tracked_method
from datadog_checks.sqlserver.config import SQLServerConfig
from datadog_checks.sqlserver.const import STATIC_INFO_ENGINE_EDITION, STATIC_INFO_VERSION
from datadog_checks.sqlserver.queries import DEADLOCK_QUERY, DEADLOCK_TIMESTAMP_ALIAS, DEADLOCK_XML_ALIAS

try:
import datadog_agent
except ImportError:
from ..stubs import datadog_agent

# Fallback collection interval, in seconds, when `collection_interval` is not configured.
DEFAULT_COLLECTION_INTERVAL = 600
# Fallback cap on the number of deadlocks requested per query when `max_deadlocks` is not configured.
MAX_DEADLOCKS = 100
# Upper bound applied to the accumulated size of the deadlocks collected in one run.
MAX_PAYLOAD_BYTES = 19e6

# Keys of each deadlock entry placed in the submitted event.
PAYLOAD_TIMESTAMP = "deadlock_timestamp"
PAYLOAD_QUERY_SIGNATURE = "query_signatures"
PAYLOAD_XML = "xml"


def agent_check_getter(self):
    """Return the agent check that owns the given job instance.

    Passed to ``tracked_method`` so the decorator can reach the check
    from the decorated method's ``self``.
    """
    owning_check = self._check
    return owning_check


class Deadlocks(DBMAsyncJob):
    """DBM job that collects SQL Server deadlock reports.

    Each run executes ``DEADLOCK_QUERY``, obfuscates the SQL text embedded in
    every returned deadlock XML document, computes a query signature per
    process, and submits the result through the check's
    ``database_monitoring_query_activity`` pipeline.
    """

    def __init__(self, check, config: SQLServerConfig):
        """Set up the job from the ``deadlocks_collection`` configuration.

        :param check: the owning SQL Server agent check.
        :param config: parsed instance configuration.
        """
        self.tags = [t for t in check.tags if not t.startswith('dd.internal')]
        self._check = check
        self._log = self._check.log
        self._config = config
        self._last_deadlock_timestamp = time()
        self._max_deadlocks = config.deadlocks_config.get("max_deadlocks", MAX_DEADLOCKS)
        self._deadlock_payload_max_bytes = MAX_PAYLOAD_BYTES
        collection_interval = config.deadlocks_config.get("collection_interval", DEFAULT_COLLECTION_INTERVAL)
        # A missing, zero or negative interval would make the rate limit below
        # raise (division by zero / float(None)) or be meaningless.
        if not collection_interval or float(collection_interval) <= 0:
            collection_interval = DEFAULT_COLLECTION_INTERVAL
        self.collection_interval = collection_interval
        super(Deadlocks, self).__init__(
            check,
            run_sync=True,
            enabled=self._config.deadlocks_config.get('enabled', False),
            expected_db_exceptions=(),
            min_collection_interval=self._config.min_collection_interval,
            dbms="sqlserver",
            rate_limit=1 / float(self.collection_interval),
            job_name="deadlocks",
            shutdown_callback=self._close_db_conn,
        )
        self._conn_key_prefix = "dbm-deadlocks-"

    def _close_db_conn(self):
        """Shutdown callback; nothing to release here.

        ``_query_deadlocks`` opens its connection and cursor through context
        managers, so no connection is held between runs.
        """
        pass

    def obfuscate_no_except_wrapper(self, sql_text):
        """Obfuscate ``sql_text`` without ever raising.

        :param sql_text: raw SQL text taken from a deadlock XML element.
        :return: the obfuscated query, or the placeholder
            ``"ERROR: failed to obfuscate"`` when obfuscation fails.
        """
        try:
            return obfuscate_sql_with_metadata(
                sql_text, self._config.obfuscator_options, replace_null_character=True
            )['query']
        except Exception as e:
            # The raw text is passed as a logging argument (never spliced into
            # the format string) so a '%' in the SQL cannot break formatting,
            # and it is only logged when the user explicitly opted in.
            if self._config.log_unobfuscated_queries:
                self._log.error("Failed to obfuscate sql text within a deadlock=[%s] | err=[%s]", sql_text, e)
            else:
                self._log.error("Failed to obfuscate sql text within a deadlock | err=[%s]", e)
            return "ERROR: failed to obfuscate"

    def _obfuscate_xml(self, root):
        """Obfuscate SQL text inside a deadlock XML tree, in place.

        Rewrites the text of every ``inputbuf`` and ``frame`` element found
        under ``process-list/process``.

        :param root: parsed deadlock XML (``xml.etree.ElementTree.Element``).
        :return: list of ``{"spid": int, "signature": ...}`` dicts, at most
            one per spid.
        :raises Exception: if the document has no ``process-list`` element.
        """
        process_list = root.find(".//process-list")
        if process_list is None:
            raise Exception("process-list element not found. The deadlock XML is in an unexpected format.")
        query_signatures = []
        # Track recorded spids separately: query_signatures holds dicts, so a
        # membership test of an int against it can never match.
        seen_spids = set()
        for process in process_list.findall('process'):
            for inputbuf in process.findall('.//inputbuf'):
                if inputbuf.text is not None:
                    inputbuf.text = self.obfuscate_no_except_wrapper(inputbuf.text)
                    spid = process.get('spid')
                    if spid is not None:
                        try:
                            spid = int(spid)
                        except ValueError:
                            self._log.error("spid not an integer. Skipping query signature computation.")
                            continue
                        if spid in seen_spids:
                            continue
                        seen_spids.add(spid)
                        query_signatures.append({"spid": spid, "signature": compute_sql_signature(inputbuf.text)})
                    else:
                        self._log.error("spid not found in process element. Skipping query signature computation.")
            for frame in process.findall('.//frame'):
                if frame.text is not None:
                    frame.text = self.obfuscate_no_except_wrapper(frame.text)
        return query_signatures

    def _get_lookback_seconds(self):
        """Return the (negative) lookback window, in seconds, for the query.

        The value is the elapsed time since the last successful collection,
        expressed as a negative offset, and never less than 60 seconds back.
        """
        return min(-60, self._last_deadlock_timestamp - time())

    def _query_deadlocks(self):
        """Run ``DEADLOCK_QUERY`` and return its rows as dicts keyed by column name.

        :raises Exception: re-raises any error from ``cursor.execute``; an
            "Unknown ADO type" error is enriched with ``cursor.description``.
        """
        with self._check.connection.open_managed_default_connection(key_prefix=self._conn_key_prefix):
            with self._check.connection.get_managed_cursor(key_prefix=self._conn_key_prefix) as cursor:
                self._log.debug("collecting sql server deadlocks")
                self._log.debug(
                    "Running query [%s] with max deadlocks %s and timestamp %s",
                    DEADLOCK_QUERY,
                    self._max_deadlocks,
                    self._last_deadlock_timestamp,
                )
                try:
                    cursor.execute(DEADLOCK_QUERY, (self._max_deadlocks, self._get_lookback_seconds()))
                except Exception as e:
                    if "Data column of Unknown ADO type" in str(e):
                        # Keep the driver error as the cause while adding the column metadata.
                        raise Exception(f"{str(e)} | cursor.description: {cursor.description}") from e
                    raise

                columns = [column[0] for column in cursor.description]
                return [dict(zip(columns, row)) for row in cursor.fetchall()]

    def _create_deadlock_rows(self):
        """Query, obfuscate and package deadlocks for submission.

        Rows whose XML cannot be parsed or obfuscated are logged and skipped.
        Collection stops once the accumulated size of the obfuscated XML and
        signatures exceeds the configured payload limit.

        :return: list of dicts with the ``PAYLOAD_TIMESTAMP``, ``PAYLOAD_XML``
            and ``PAYLOAD_QUERY_SIGNATURE`` keys.
        """
        db_rows = self._query_deadlocks()
        deadlock_events = []
        total_number_of_characters = 0
        for i, row in enumerate(db_rows):
            try:
                root = ET.fromstring(row[DEADLOCK_XML_ALIAS])
            except Exception as e:
                # The raw row contains unobfuscated SQL, so only include it
                # when the user opted in to logging unobfuscated queries.
                if self._config.log_unobfuscated_queries:
                    self._log.error(
                        "An error occurred while collecting SQLServer deadlocks. "
                        "One of the deadlock XMLs couldn't be parsed. The error: %s. XML: %s",
                        e,
                        row,
                    )
                else:
                    self._log.error(
                        "An error occurred while collecting SQLServer deadlocks. "
                        "One of the deadlock XMLs couldn't be parsed. The error: %s.",
                        e,
                    )
                continue
            try:
                query_signatures = self._obfuscate_xml(root)
            except Exception as e:
                self._log.error("An error occurred while obfuscating SQLServer deadlocks. The error: %s", e)
                continue

            obfuscated_xml = ET.tostring(root, encoding='unicode')
            # Measure what is actually sent. Character counts are used as an
            # approximation of the encoded size.
            total_number_of_characters += len(obfuscated_xml) + len(str(query_signatures))
            if total_number_of_characters > self._deadlock_payload_max_bytes:
                self._log.warning(
                    "We've dropped %s deadlocks from a total of %s deadlocks as the "
                    "max deadlock payload of %s bytes was exceeded.",
                    len(db_rows) - i,
                    len(db_rows),
                    self._deadlock_payload_max_bytes,
                )
                break

            deadlock_events.append(
                {
                    PAYLOAD_TIMESTAMP: row[DEADLOCK_TIMESTAMP_ALIAS],
                    PAYLOAD_XML: obfuscated_xml,
                    PAYLOAD_QUERY_SIGNATURE: query_signatures,
                }
            )
        self._last_deadlock_timestamp = time()
        return deadlock_events

    @tracked_method(agent_check_getter=agent_check_getter)
    def collect_deadlocks(self):
        """Collect deadlocks and submit one event, only when any were found."""
        rows = self._create_deadlock_rows()
        # Send payload only if deadlocks found
        if rows:
            deadlocks_event = self._create_deadlock_event(rows)
            payload = json.dumps(deadlocks_event, default=default_json_event_encoding)
            self._log.debug("Deadlocks payload: %s", payload)
            self._check.database_monitoring_query_activity(payload)

    def _create_deadlock_event(self, deadlock_rows):
        """Wrap ``deadlock_rows`` in the event envelope that gets submitted.

        :param deadlock_rows: list produced by ``_create_deadlock_rows``.
        :return: the event dict, with ``timestamp`` in milliseconds.
        """
        return {
            "host": self._check.resolved_hostname,
            "ddagentversion": datadog_agent.get_version(),
            "ddsource": "sqlserver",
            "dbm_type": "deadlocks",
            "collection_interval": self.collection_interval,
            "ddtags": self.tags,
            "timestamp": time() * 1000,
            'sqlserver_version': self._check.static_info_cache.get(STATIC_INFO_VERSION, ""),
            'sqlserver_engine_edition': self._check.static_info_cache.get(STATIC_INFO_ENGINE_EDITION, ""),
            "cloud_metadata": self._config.cloud_metadata,
            "sqlserver_deadlocks": deadlock_rows,
        }

    def run_job(self):
        """Job entry point: run one deadlock collection."""
        self.collect_deadlocks()
17 changes: 17 additions & 0 deletions sqlserver/datadog_checks/sqlserver/queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,23 @@
FK.name, FK.parent_object_id, FK.referenced_object_id;
"""

# Column aliases of the deadlock query result; consumers index rows by these names.
DEADLOCK_TIMESTAMP_ALIAS = "timestamp"
DEADLOCK_XML_ALIAS = "event_xml"
# Reads xml_deadlock_report events from the system_health session's ring buffer.
# Bind parameters, in order: the maximum number of rows (TOP) and a negative
# lookback offset in seconds.
# Extended Events timestamps are recorded in UTC, so the lookback window is
# anchored on GETUTCDATE(); GETDATE() returns server-local time and would
# shift the window by the server's timezone offset.
DEADLOCK_QUERY = """
SELECT TOP(?) xdr.value('@timestamp', 'datetime') AS [{timestamp}],
    xdr.query('.') AS [{xml}]
FROM (SELECT CAST([target_data] AS XML) AS Target_Data
        FROM sys.dm_xe_session_targets AS xt
        INNER JOIN sys.dm_xe_sessions AS xs ON xs.address = xt.event_session_address
        WHERE xs.name = N'system_health'
        AND xt.target_name = N'ring_buffer'
    ) AS XML_Data
CROSS APPLY Target_Data.nodes('RingBufferTarget/event[@name="xml_deadlock_report"]') AS XEventData(xdr)
WHERE xdr.value('@timestamp', 'datetime') >= DATEADD(SECOND, ?, GETUTCDATE())
;""".format(
    **{"timestamp": DEADLOCK_TIMESTAMP_ALIAS, "xml": DEADLOCK_XML_ALIAS}
)


def get_query_ao_availability_groups(sqlserver_major_version):
"""
Expand Down
1 change: 0 additions & 1 deletion sqlserver/datadog_checks/sqlserver/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@


class SubmitData:

def __init__(self, submit_data_function, base_event, logger):
self._submit_to_agent_queue = submit_data_function
self._base_event = base_event
Expand Down
11 changes: 10 additions & 1 deletion sqlserver/datadog_checks/sqlserver/sqlserver.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# (C) Datadog, Inc. 2018-present
# All rights reserved
# Licensed under a 3-clause BSD style license (see LICENSE)

from __future__ import division

import copy
Expand All @@ -12,7 +13,11 @@
from datadog_checks.base import AgentCheck
from datadog_checks.base.config import is_affirmative
from datadog_checks.base.utils.db import QueryExecutor, QueryManager
from datadog_checks.base.utils.db.utils import default_json_event_encoding, resolve_db_host, tracked_query
from datadog_checks.base.utils.db.utils import (
default_json_event_encoding,
resolve_db_host,
tracked_query,
)
from datadog_checks.base.utils.serialization import json
from datadog_checks.sqlserver.activity import SqlserverActivity
from datadog_checks.sqlserver.agent_history import SqlserverAgentHistory
Expand All @@ -23,6 +28,7 @@
SqlserverDBFragmentationMetrics,
SqlserverIndexUsageMetrics,
)
from datadog_checks.sqlserver.deadlocks import Deadlocks
from datadog_checks.sqlserver.metadata import SqlserverMetadata
from datadog_checks.sqlserver.schemas import Schemas
from datadog_checks.sqlserver.statements import SqlserverStatementMetrics
Expand Down Expand Up @@ -135,6 +141,7 @@ def __init__(self, name, init_config, instances):
self.sql_metadata = SqlserverMetadata(self, self._config)
self.activity = SqlserverActivity(self, self._config)
self.agent_history = SqlserverAgentHistory(self, self._config)
self.deadlocks = Deadlocks(self, self._config)

self.static_info_cache = TTLCache(
maxsize=100,
Expand Down Expand Up @@ -171,6 +178,7 @@ def cancel(self):
self.activity.cancel()
self.sql_metadata.cancel()
self._schemas.cancel()
self.deadlocks.cancel()

def config_checks(self):
if self._config.autodiscovery and self.instance.get("database"):
Expand Down Expand Up @@ -785,6 +793,7 @@ def check(self, _):
self.activity.run_job_loop(self.tags)
self.sql_metadata.run_job_loop(self.tags)
self._schemas.run_job_loop(self.tags)
self.deadlocks.run_job_loop(self.tags)
else:
self.log.debug("Skipping check")

Expand Down
2 changes: 1 addition & 1 deletion sqlserver/hatch.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ setup = ["single", "ha"]
[[envs.default.matrix]]
python = ["3.12"]
os = ["windows"]
driver = ["SQLOLEDB", "SQLNCLI11", "MSOLEDBSQL", "odbc"]
driver = ["SQLOLEDB", "MSOLEDBSQL", "odbc"]
version = ["2019", "2022"]
setup = ["single"]

Expand Down
15 changes: 15 additions & 0 deletions sqlserver/tests/compose-ha/sql/aoag_primary.sql
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,21 @@ CREATE USER bob FOR LOGIN bob;
CREATE USER fred FOR LOGIN fred;
GO

-- Create a simple table for deadlocks
CREATE TABLE [datadog_test-1].dbo.deadlocks (a int PRIMARY KEY not null ,b int null);

INSERT INTO [datadog_test-1].dbo.deadlocks VALUES (1,10),(2,20),(3,30)

-- Grant permissions to bob and fred to update the deadlocks table
GRANT INSERT ON [datadog_test-1].dbo.deadlocks TO bob;
GRANT UPDATE ON [datadog_test-1].dbo.deadlocks TO bob;
GRANT DELETE ON [datadog_test-1].dbo.deadlocks TO bob;

GRANT INSERT ON [datadog_test-1].dbo.deadlocks TO fred;
GRANT UPDATE ON [datadog_test-1].dbo.deadlocks TO fred;
GRANT DELETE ON [datadog_test-1].dbo.deadlocks TO fred;
GO

EXEC sp_addrolemember 'db_datareader', 'bob'
EXEC sp_addrolemember 'db_datareader', 'fred'
EXEC sp_addrolemember 'db_datawriter', 'bob'
Expand Down
Loading

0 comments on commit 0eac087

Please sign in to comment.