From 83fb160b1e461120cf1dc0f8c1610126b3301477 Mon Sep 17 00:00:00 2001 From: Kyle Neale Date: Wed, 27 Nov 2024 12:13:38 -0500 Subject: [PATCH] Nvidia NIM Integration (#18964) * Create Nvidia NIM scaffolding * Add Initial Release changelog * sync models and config * Add metadata and tests * Add Readme * nvidia dash (#19074) * nvidia dash * nits * more nits * nit * validate-assets fixes * remove astericks in README hyperlink ref * Address nits * Update metadata desciption for process.start_time_seconds Co-authored-by: Steven Yuen * Add documentation nits * Final nits --------- Co-authored-by: Steven Yuen --- .codecov.yml | 9 + .github/workflows/config/labeler.yml | 2 + .github/workflows/test-all.yml | 20 + nvidia_nim/CHANGELOG.md | 4 + nvidia_nim/README.md | 62 + nvidia_nim/assets/configuration/spec.yaml | 16 + .../dashboards/nvidia_nim_overview.json | 1163 +++++++++++++++++ nvidia_nim/assets/monitors/latency.json | 33 + nvidia_nim/assets/service_checks.json | 17 + nvidia_nim/changelog.d/18964.added | 1 + nvidia_nim/datadog_checks/__init__.py | 4 + .../datadog_checks/nvidia_nim/__about__.py | 4 + .../datadog_checks/nvidia_nim/__init__.py | 7 + nvidia_nim/datadog_checks/nvidia_nim/check.py | 49 + .../nvidia_nim/config_models/__init__.py | 24 + .../nvidia_nim/config_models/defaults.py | 132 ++ .../nvidia_nim/config_models/instance.py | 171 +++ .../nvidia_nim/config_models/shared.py | 60 + .../nvidia_nim/config_models/validators.py | 13 + .../nvidia_nim/data/conf.yaml.example | 626 +++++++++ .../datadog_checks/nvidia_nim/metrics.py | 34 + nvidia_nim/hatch.toml | 4 + nvidia_nim/manifest.json | 60 + nvidia_nim/metadata.csv | 35 + nvidia_nim/pyproject.toml | 60 + nvidia_nim/tests/__init__.py | 3 + nvidia_nim/tests/common.py | 63 + nvidia_nim/tests/conftest.py | 30 + nvidia_nim/tests/docker/Caddyfile | 15 + nvidia_nim/tests/docker/docker-compose.yaml | 11 + nvidia_nim/tests/fixtures/nim_metrics.txt | 159 +++ nvidia_nim/tests/fixtures/nim_version.json | 1 + nvidia_nim/tests/test_e2e.py | 11 + nvidia_nim/tests/test_unit.py | 61 + 34 files changed, 2964 insertions(+) create mode 100644 nvidia_nim/CHANGELOG.md create mode 100644 nvidia_nim/README.md create mode 100644 nvidia_nim/assets/configuration/spec.yaml create mode 100644 nvidia_nim/assets/dashboards/nvidia_nim_overview.json create mode 100644 nvidia_nim/assets/monitors/latency.json create mode 100644 nvidia_nim/assets/service_checks.json create mode 100644 nvidia_nim/changelog.d/18964.added create mode 100644 nvidia_nim/datadog_checks/__init__.py create mode 100644 nvidia_nim/datadog_checks/nvidia_nim/__about__.py create mode 100644 nvidia_nim/datadog_checks/nvidia_nim/__init__.py create mode 100644 nvidia_nim/datadog_checks/nvidia_nim/check.py create mode 100644 nvidia_nim/datadog_checks/nvidia_nim/config_models/__init__.py create mode 100644 nvidia_nim/datadog_checks/nvidia_nim/config_models/defaults.py create mode 100644 nvidia_nim/datadog_checks/nvidia_nim/config_models/instance.py create mode 100644 nvidia_nim/datadog_checks/nvidia_nim/config_models/shared.py create mode 100644 nvidia_nim/datadog_checks/nvidia_nim/config_models/validators.py create mode 100644 nvidia_nim/datadog_checks/nvidia_nim/data/conf.yaml.example create mode 100644 nvidia_nim/datadog_checks/nvidia_nim/metrics.py create mode 100644 nvidia_nim/hatch.toml create mode 100644 nvidia_nim/manifest.json create mode 100644 nvidia_nim/metadata.csv create mode 100644 nvidia_nim/pyproject.toml create mode 100644 nvidia_nim/tests/__init__.py create mode 100644 
nvidia_nim/tests/common.py create mode 100644 nvidia_nim/tests/conftest.py create mode 100644 nvidia_nim/tests/docker/Caddyfile create mode 100644 nvidia_nim/tests/docker/docker-compose.yaml create mode 100644 nvidia_nim/tests/fixtures/nim_metrics.txt create mode 100644 nvidia_nim/tests/fixtures/nim_version.json create mode 100644 nvidia_nim/tests/test_e2e.py create mode 100644 nvidia_nim/tests/test_unit.py diff --git a/.codecov.yml b/.codecov.yml index 12d18bf18d059..105f39b3902da 100644 --- a/.codecov.yml +++ b/.codecov.yml @@ -710,6 +710,10 @@ coverage: target: 75 flags: - kyverno + nvidia_nim: + target: 75 + flags: + - nvidia_nim tibco_ems: target: 75 flags: @@ -1289,6 +1293,11 @@ flags: paths: - nginx_ingress_controller/datadog_checks/nginx_ingress_controller - nginx_ingress_controller/tests + nvidia_nim: + carryforward: true + paths: + - nvidia_nim/datadog_checks/nvidia_nim + - nvidia_nim/tests nvidia_triton: carryforward: true paths: diff --git a/.github/workflows/config/labeler.yml b/.github/workflows/config/labeler.yml index ad9be72ddcb7f..2ebb3d4f96817 100644 --- a/.github/workflows/config/labeler.yml +++ b/.github/workflows/config/labeler.yml @@ -367,6 +367,8 @@ integration/ntp: - ntp/**/* integration/nvidia_jetson: - nvidia_jetson/**/* +integration/nvidia_nim: +- nvidia_nim/**/* integration/nvidia_triton: - nvidia_triton/**/* integration/oke: diff --git a/.github/workflows/test-all.yml b/.github/workflows/test-all.yml index 53fc9b5f077c9..606f6125af01e 100644 --- a/.github/workflows/test-all.yml +++ b/.github/workflows/test-all.yml @@ -2654,6 +2654,26 @@ jobs: minimum-base-package: ${{ inputs.minimum-base-package }} pytest-args: ${{ inputs.pytest-args }} secrets: inherit + jb705691: + uses: ./.github/workflows/test-target.yml + with: + job-name: nvidia_nim + target: nvidia_nim + platform: linux + runner: '["ubuntu-22.04"]' + repo: "${{ inputs.repo }}" + python-version: "${{ inputs.python-version }}" + standard: ${{ inputs.standard }} + latest: ${{ inputs.latest }} + agent-image: "${{ inputs.agent-image }}" + agent-image-py2: "${{ inputs.agent-image-py2 }}" + agent-image-windows: "${{ inputs.agent-image-windows }}" + agent-image-windows-py2: "${{ inputs.agent-image-windows-py2 }}" + test-py2: ${{ inputs.test-py2 }} + test-py3: ${{ inputs.test-py3 }} + minimum-base-package: ${{ inputs.minimum-base-package }} + pytest-args: ${{ inputs.pytest-args }} + secrets: inherit j74dc677: uses: ./.github/workflows/test-target.yml with: diff --git a/nvidia_nim/CHANGELOG.md b/nvidia_nim/CHANGELOG.md new file mode 100644 index 0000000000000..efe8873dc872e --- /dev/null +++ b/nvidia_nim/CHANGELOG.md @@ -0,0 +1,4 @@ +# CHANGELOG - nvidia_nim + + + diff --git a/nvidia_nim/README.md b/nvidia_nim/README.md new file mode 100644 index 0000000000000..53d980b7bc0e9 --- /dev/null +++ b/nvidia_nim/README.md @@ -0,0 +1,62 @@ +# Agent Check: nvidia_nim + +## Overview + +This check monitors [NVIDIA NIM][1] through the Datadog Agent. + +## Setup + +Follow the instructions below to install and configure this check for an Agent running on a host. For containerized environments, see the [Autodiscovery Integration Templates][3] for guidance on applying these instructions. + +**Requirements**: +- This check requires Agent v7.61.0+ +- This check uses [OpenMetrics][10] for metric collection, which requires Python 3. + +### Installation + +The NVIDIA NIM check is included in the [Datadog Agent][2] package. No additional installation is needed on your server. 
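For the containerized environments referenced above, the instance can also be supplied through an Autodiscovery annotation instead of `nvidia_nim.d/conf.yaml`. The sketch below assumes a Kubernetes pod whose NIM container is named `nim` and exposes metrics on port 8000; the container name and port are illustrative and not taken from this patch.

```yaml
# Hypothetical pod manifest excerpt: Autodiscovery annotation keyed on the
# container name ("nim" is an assumption). The %%host%% template variable is
# resolved by the Agent to the container's IP at runtime.
metadata:
  annotations:
    ad.datadoghq.com/nim.checks: |
      {
        "nvidia_nim": {
          "init_config": {},
          "instances": [
            {"openmetrics_endpoint": "http://%%host%%:8000/metrics"}
          ]
        }
      }
```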
+ +### Configuration + +NVIDIA NIM provides Prometheus [metrics][1] indicating request statistics. By default, these metrics are available at http://localhost:8000/metrics. The Datadog Agent can collect the exposed metrics using this integration. Follow the instructions below to configure data collection from any or all of the components. + +To start collecting your NVIDIA NIM performance data: +1. Edit the `nvidia_nim.d/conf.yaml` file, in the `conf.d/` folder at the root of your Agent's configuration directory to start collecting your NVIDIA NIM performance data. See the [sample nvidia_nim.d/conf.yaml][4] for all available configuration options. + +2. [Restart the Agent][5]. + +### Validation + +[Run the Agent's status subcommand][6] and look for `nvidia_nim` under the Checks section. + +## Data Collected + +### Metrics + +See [metadata.csv][7] for a list of metrics provided by this integration. + +### Events + +The NVIDIA NIM integration does not include any events. + +### Service Checks + +The NVIDIA NIM integration does not include any service checks. + +See [service_checks.json][8] for a list of service checks provided by this integration. + +## Troubleshooting + +Need help? Contact [Datadog support][9]. + + +[1]: https://docs.nvidia.com/nim/large-language-models/latest/observability.html +[2]: https://app.datadoghq.com/account/settings/agent/latest +[3]: https://docs.datadoghq.com/agent/kubernetes/integrations/ +[4]: https://github.com/DataDog/integrations-core/blob/master/nvidia_nim/datadog_checks/nvidia_nim/data/conf.yaml.example +[5]: https://docs.datadoghq.com/agent/guide/agent-commands/#start-stop-and-restart-the-agent +[6]: https://docs.datadoghq.com/agent/guide/agent-commands/#agent-status-and-information +[7]: https://github.com/DataDog/integrations-core/blob/master/nvidia_nim/metadata.csv +[8]: https://github.com/DataDog/integrations-core/blob/master/nvidia_nim/assets/service_checks.json +[9]: https://docs.datadoghq.com/help/ +[10]: https://docs.datadoghq.com/integrations/openmetrics/ \ No newline at end of file diff --git a/nvidia_nim/assets/configuration/spec.yaml b/nvidia_nim/assets/configuration/spec.yaml new file mode 100644 index 0000000000000..6f739175a5acc --- /dev/null +++ b/nvidia_nim/assets/configuration/spec.yaml @@ -0,0 +1,16 @@ +name: nvidia_nim +files: +- name: nvidia_nim.yaml + options: + - template: init_config + options: + - template: init_config/openmetrics + - template: instances + options: + - template: instances/openmetrics + overrides: + openmetrics_endpoint.required: true + openmetrics_endpoint.value.example: http://localhost:8000/metrics + openmetrics_endpoint.description: | + Endpoint exposing the NVIDIA NIM's Prometheus metrics. 
For more information refer to: + https://docs.nvidia.com/nim/large-language-models/latest/observability.html \ No newline at end of file diff --git a/nvidia_nim/assets/dashboards/nvidia_nim_overview.json b/nvidia_nim/assets/dashboards/nvidia_nim_overview.json new file mode 100644 index 0000000000000..ee9a6dc7af428 --- /dev/null +++ b/nvidia_nim/assets/dashboards/nvidia_nim_overview.json @@ -0,0 +1,1163 @@ +{ + "title": "NVIDIA NIM Overview", + "description": "## NVIDIA NIM\n\nThis dashboard provides observability for your NIM deployments with the NVIDIA NIM Integration.\n\nIt shows information about how many tokens your model is generating per second as well as exposing low-level details such as GPU usage and Python memory management.\n\n# Useful Links\n- [NVIDIA NIM Integration ↗](https://docs.datadoghq.com/integrations/nvidia_nim)\n- [NVIDIA NIM Metrics ↗](https://docs.nvidia.com/nim/large-language-models/latest/observability.html)\n- [vLLM Documentation ↗](https://docs.nvidia.com/nim/large-language-models/latest/introduction.html)", + "widgets": [ + { + "id": 4717263751542750, + "definition": { + "title": "", + "banner_img": "/static/images/logos/nvidia-nim_large.svg", + "show_title": true, + "type": "group", + "layout_type": "ordered", + "widgets": [ + { + "id": 5685022835071772, + "definition": { + "type": "note", + "content": "## NVIDIA NIM\n\nThis dashboard provides observability for your NIM deployments with the NVIDIA NIM Integration.\n\nIt shows information about how many tokens your model is generating per second as well as exposing low-level details such as GPU usage and Python memory management.", + "background_color": "white", + "font_size": "14", + "text_align": "left", + "vertical_align": "center", + "show_tick": false, + "tick_pos": "50%", + "tick_edge": "left", + "has_padding": true + }, + "layout": { + "x": 0, + "y": 0, + "width": 3, + "height": 3 + } + }, + { + "id": 8921963557059570, + "definition": { + "type": "note", + "content": "# Useful Links\n- [NVIDIA NIM Integration ↗](https://docs.datadoghq.com/integrations/nvidia_nim)\n- [NVIDIA NIM Metrics ↗](https://docs.nvidia.com/nim/large-language-models/latest/observability.html)\n- [NVIDIA NIM Documentation ↗](https://docs.nvidia.com/nim/large-language-models/latest/introduction.html)", + "background_color": "white", + "font_size": "14", + "text_align": "center", + "vertical_align": "center", + "show_tick": false, + "tick_pos": "50%", + "tick_edge": "left", + "has_padding": true + }, + "layout": { + "x": 3, + "y": 0, + "width": 3, + "height": 3 + } + } + ] + }, + "layout": { + "x": 0, + "y": 0, + "width": 6, + "height": 6 + } + }, + { + "id": 2737008660122334, + "definition": { + "title": "Overview", + "background_color": "vivid_green", + "show_title": true, + "type": "group", + "layout_type": "ordered", + "widgets": [ + { + "id": 4528647613111842, + "definition": { + "type": "note", + "content": "Here you can see an overview of your LLM of your system activity and any NIM alerts. 
The service checks on the left speak to the health of your NVIDIA NIM environment, while the ones on the right report on the readiness of your dependencies.\n", + "background_color": "green", + "font_size": "14", + "text_align": "center", + "vertical_align": "top", + "show_tick": false, + "tick_pos": "50%", + "tick_edge": "left", + "has_padding": true + }, + "layout": { + "x": 0, + "y": 0, + "width": 6, + "height": 1 + } + }, + { + "id": 2166067869769356, + "definition": { + "title": "NVIDIA NIM Health Check", + "title_size": "16", + "title_align": "left", + "type": "check_status", + "check": "nvidia_nim.openmetrics.health", + "grouping": "cluster", + "group_by": [ + "endpoint" + ], + "tags": [] + }, + "layout": { + "x": 0, + "y": 1, + "width": 2, + "height": 2 + } + }, + { + "id": 3037068311385910, + "definition": { + "title": "Successful Requests", + "title_size": "16", + "title_align": "left", + "type": "query_value", + "requests": [ + { + "formulas": [ + { + "formula": "query1" + } + ], + "queries": [ + { + "aggregator": "sum", + "data_source": "metrics", + "name": "query1", + "query": "sum:nvidia_nim.request.success.count{$model_name}.as_count()" + } + ], + "response_format": "scalar" + } + ], + "autoscale": true, + "precision": 2, + "timeseries_background": { + "type": "bars" + } + }, + "layout": { + "x": 2, + "y": 1, + "width": 2, + "height": 2 + } + }, + { + "id": 5175941643906344, + "definition": { + "title": "Average Request Latency", + "title_size": "16", + "title_align": "left", + "type": "query_value", + "requests": [ + { + "formulas": [ + { + "formula": "query2 / query1" + } + ], + "queries": [ + { + "aggregator": "avg", + "data_source": "metrics", + "name": "query2", + "query": "sum:nvidia_nim.e2e_request_latency.seconds.sum{$model_name}.as_count()" + }, + { + "aggregator": "avg", + "data_source": "metrics", + "name": "query1", + "query": "sum:nvidia_nim.e2e_request_latency.seconds.count{$model_name}.as_count()" + } + ], + "response_format": "scalar" + } + ], + "autoscale": true, + "precision": 2, + "timeseries_background": { + "type": "bars" + } + }, + "layout": { + "x": 4, + "y": 1, + "width": 2, + "height": 2 + } + }, + { + "id": 7873059155305294, + "definition": { + "title": "Monitor Summary", + "type": "manage_status", + "display_format": "countsAndList", + "color_preference": "text", + "hide_zero_counts": true, + "show_status": true, + "last_triggered_format": "relative", + "query": "tag:(integration:vllm)", + "sort": "status,asc", + "count": 50, + "start": 0, + "summary_type": "monitors", + "show_priority": false, + "show_last_triggered": false + }, + "layout": { + "x": 0, + "y": 3, + "width": 6, + "height": 2 + } + } + ] + }, + "layout": { + "x": 6, + "y": 0, + "width": 6, + "height": 6 + } + }, + { + "id": 2300381400792284, + "definition": { + "title": "K/V Cache Utilization and Request Metrics", + "background_color": "vivid_green", + "show_title": true, + "type": "group", + "layout_type": "ordered", + "widgets": [ + { + "id": 995705405594846, + "definition": { + "type": "note", + "content": "The GPU is the workhorse of any LLM. It is also expensive to run. 
See here how many requests your GPU is running and how much you are taking advantage of its caching mechanisms.", + "background_color": "green", + "font_size": "18", + "text_align": "center", + "vertical_align": "center", + "show_tick": false, + "tick_pos": "50%", + "tick_edge": "left", + "has_padding": true + }, + "layout": { + "x": 0, + "y": 0, + "width": 12, + "height": 1 + } + }, + { + "id": 1818057086692970, + "definition": { + "title": "Requests Waiting", + "title_size": "16", + "title_align": "left", + "time": {}, + "type": "timeseries", + "requests": [ + { + "formulas": [ + { + "formula": "query1" + } + ], + "queries": [ + { + "name": "query1", + "data_source": "metrics", + "query": "avg:nvidia_nim.num_requests.waiting{$model_name} by {model_name}" + } + ], + "response_format": "timeseries", + "style": { + "palette": "dog_classic", + "order_by": "values", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "line" + } + ] + }, + "layout": { + "x": 0, + "y": 1, + "width": 8, + "height": 3 + } + }, + { + "id": 6975549889095854, + "definition": { + "title": "Requests Waiting", + "title_size": "16", + "title_align": "left", + "time": {}, + "type": "query_value", + "requests": [ + { + "response_format": "scalar", + "queries": [ + { + "name": "query1", + "data_source": "metrics", + "query": "avg:nvidia_nim.num_requests.waiting{$model_name}", + "aggregator": "last" + } + ], + "conditional_formats": [ + { + "comparator": "=", + "value": 0, + "palette": "white_on_green" + }, + { + "comparator": ">", + "value": 5, + "palette": "white_on_yellow" + }, + { + "comparator": ">", + "value": 15, + "palette": "white_on_red" + } + ], + "formulas": [ + { + "formula": "query1", + "number_format": { + "unit": { + "type": "canonical_unit", + "unit_name": "request" + } + } + } + ] + } + ], + "autoscale": true, + "precision": 0, + "timeseries_background": { + "yaxis": { + "include_zero": false + }, + "type": "area" + } + }, + "layout": { + "x": 8, + "y": 1, + "width": 4, + "height": 3 + } + }, + { + "id": 1084963586222678, + "definition": { + "title": "Requests Failed", + "title_size": "16", + "title_align": "left", + "time": {}, + "type": "timeseries", + "requests": [ + { + "formulas": [ + { + "formula": "query1" + } + ], + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "sum:nvidia_nim.request.failure.count{$model_name} by {model_name}.as_count()" + } + ], + "response_format": "timeseries", + "style": { + "palette": "dog_classic", + "order_by": "values", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "line" + } + ] + }, + "layout": { + "x": 0, + "y": 4, + "width": 8, + "height": 3 + } + }, + { + "id": 2525646835263004, + "definition": { + "title": "Requests Failed", + "title_size": "16", + "title_align": "left", + "type": "query_value", + "requests": [ + { + "formulas": [ + { + "formula": "query1", + "number_format": { + "unit": { + "type": "canonical_unit", + "unit_name": "request" + } + } + } + ], + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "sum:nvidia_nim.request.failure.count{$model_name}.as_count()", + "aggregator": "last" + } + ], + "response_format": "scalar", + "conditional_formats": [ + { + "comparator": "<=", + "value": 0, + "palette": "white_on_green" + }, + { + "comparator": ">", + "value": 0, + "palette": "white_on_red" + } + ] + } + ], + "autoscale": true, + "precision": 0, + "timeseries_background": { + "type": "bars" + } + }, + "layout": { + "x": 8, + "y": 4, + "width": 4, + "height": 
3 + } + }, + { + "id": 6776207665378710, + "definition": { + "title": "Requests per second", + "title_size": "16", + "title_align": "left", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "type": "timeseries", + "requests": [ + { + "formulas": [ + { + "alias": "requests", + "formula": "query1" + } + ], + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "sum:nvidia_nim.request.success.count{$model_name} by {model_name}.as_rate()" + } + ], + "response_format": "timeseries", + "style": { + "palette": "dog_classic", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "line" + } + ] + }, + "layout": { + "x": 0, + "y": 7, + "width": 8, + "height": 3 + } + }, + { + "id": 3747999506353878, + "definition": { + "title": "Requests Running", + "title_size": "16", + "title_align": "left", + "time": {}, + "type": "query_value", + "requests": [ + { + "formulas": [ + { + "formula": "query1" + } + ], + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "avg:nvidia_nim.num_requests.running{$model_name}", + "aggregator": "sum" + } + ], + "response_format": "scalar" + } + ], + "autoscale": true, + "precision": 0, + "timeseries_background": { + "type": "area" + } + }, + "layout": { + "x": 8, + "y": 7, + "width": 4, + "height": 3 + } + }, + { + "id": 2448557456884510, + "definition": { + "title": "K/V Cache Utilization", + "title_size": "16", + "title_align": "left", + "time": {}, + "type": "timeseries", + "requests": [ + { + "formulas": [ + { + "formula": "query1" + } + ], + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "avg:nvidia_nim.gpu_cache_usage_percent{$model_name}" + } + ], + "response_format": "timeseries", + "style": { + "palette": "dog_classic", + "order_by": "values", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "line" + } + ] + }, + "layout": { + "x": 0, + "y": 10, + "width": 8, + "height": 3 + } + }, + { + "id": 5942456558543848, + "definition": { + "title": "K/V Cache Utilization", + "title_size": "16", + "title_align": "left", + "type": "query_value", + "requests": [ + { + "formulas": [ + { + "number_format": { + "unit": { + "type": "canonical_unit", + "unit_name": "percent" + } + }, + "formula": "query1" + } + ], + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "avg:nvidia_nim.gpu_cache_usage_percent{$model_name}", + "aggregator": "last" + } + ], + "response_format": "scalar", + "conditional_formats": [ + { + "comparator": "<", + "value": 60, + "palette": "white_on_green" + }, + { + "comparator": "<=", + "value": 80, + "palette": "white_on_yellow" + }, + { + "comparator": ">", + "value": 80, + "palette": "white_on_red" + } + ] + } + ], + "autoscale": false, + "precision": 2, + "timeseries_background": { + "yaxis": { + "include_zero": false + }, + "type": "area" + } + }, + "layout": { + "x": 8, + "y": 10, + "width": 4, + "height": 3 + } + } + ] + }, + "layout": { + "x": 0, + "y": 6, + "width": 12, + "height": 14 + } + }, + { + "id": 880646291321010, + "definition": { + "title": "Text Generation", + "background_color": "vivid_green", + "show_title": true, + "type": "group", + "layout_type": "ordered", + "widgets": [ + { + "id": 5193429521650892, + "definition": { + "type": "note", + "content": "These metrics measure response latency, input-output token balance, and token generation efficiency to ensure performance and scalability.", + "background_color": "green", + 
"font_size": "18", + "text_align": "center", + "vertical_align": "center", + "show_tick": false, + "tick_pos": "50%", + "tick_edge": "left", + "has_padding": true + }, + "layout": { + "x": 0, + "y": 0, + "width": 12, + "height": 1 + } + }, + { + "id": 7057133142091754, + "definition": { + "title": "Average Time to First Token (TFTT)", + "title_size": "16", + "title_align": "left", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "time": {}, + "type": "timeseries", + "requests": [ + { + "formulas": [ + { + "formula": "query1 / query2" + } + ], + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "sum:nvidia_nim.time_to_first_token.seconds.sum{$model_name} by {model_name}.as_count()" + }, + { + "data_source": "metrics", + "name": "query2", + "query": "sum:nvidia_nim.time_to_first_token.seconds.count{$model_name} by {model_name}.as_count()" + } + ], + "response_format": "timeseries", + "style": { + "palette": "dog_classic", + "order_by": "values", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "line" + } + ] + }, + "layout": { + "x": 0, + "y": 1, + "width": 6, + "height": 4 + } + }, + { + "id": 1276907480965038, + "definition": { + "title": "Context vs Generated Tokens", + "title_size": "16", + "title_align": "left", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "time": {}, + "type": "timeseries", + "requests": [ + { + "formulas": [ + { + "alias": "Context", + "formula": "(query2 / (query2 + query1)) * 100" + }, + { + "alias": "Generated", + "formula": "(query1 / (query2 + query1)) * 100" + } + ], + "queries": [ + { + "query": "avg:nvidia_nim.request.prompt_tokens.sum{$model_name} by {model_name}.as_count()", + "data_source": "metrics", + "name": "query2" + }, + { + "query": "avg:nvidia_nim.request.generation_tokens.sum{$model_name} by {model_name}.as_count()", + "data_source": "metrics", + "name": "query1" + } + ], + "response_format": "timeseries", + "style": { + "palette": "cool", + "order_by": "values", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "bars" + } + ] + }, + "layout": { + "x": 6, + "y": 1, + "width": 6, + "height": 4 + } + }, + { + "id": 1973749730991538, + "definition": { + "title": "Average Inter Token Latency (ITL)", + "title_size": "16", + "title_align": "left", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "time": {}, + "type": "timeseries", + "requests": [ + { + "formulas": [ + { + "formula": "query1 / query2" + } + ], + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "sum:nvidia_nim.time_per_output_token.seconds.sum{$model_name} by {model_name}.as_count()" + }, + { + "data_source": "metrics", + "name": "query2", + "query": "sum:nvidia_nim.time_per_output_token.seconds.count{$model_name} by {model_name}.as_count()" + } + ], + "response_format": "timeseries", + "style": { + "palette": "cool", + "order_by": "values", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "line" + } + ] + }, + "layout": { + "x": 0, + "y": 5, + "width": 12, + "height": 4 + } + } + ] + }, + "layout": { + "x": 0, + "y": 20, + "width": 12, + "height": 10, + "is_column_break": true + } + }, + { + "id": 3331850504686986, + "definition": { + "title": "Python Garbage Collector", + "background_color": "vivid_green", + "show_title": true, + "type": 
"group", + "layout_type": "ordered", + "widgets": [ + { + "id": 5960991703038874, + "definition": { + "type": "note", + "content": "This section helps explore how NVIDIA NIM uses memory. The garbage collector collects objects in generations. You can see how each generation of objects gets processed.", + "background_color": "green", + "font_size": "18", + "text_align": "center", + "vertical_align": "center", + "show_tick": false, + "tick_pos": "50%", + "tick_edge": "left", + "has_padding": true + }, + "layout": { + "x": 0, + "y": 0, + "width": 12, + "height": 1 + } + }, + { + "id": 2577004928803106, + "definition": { + "title": "Resident Memory", + "title_size": "16", + "title_align": "left", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "type": "timeseries", + "requests": [ + { + "formulas": [ + { + "formula": "query1" + } + ], + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "avg:nvidia_nim.process.resident_memory_bytes{$model_name}" + } + ], + "response_format": "timeseries", + "style": { + "palette": "dog_classic", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "bars" + } + ] + }, + "layout": { + "x": 0, + "y": 1, + "width": 4, + "height": 3 + } + }, + { + "id": 4400803113146958, + "definition": { + "title": "Total Runs", + "title_size": "16", + "title_align": "left", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "type": "timeseries", + "requests": [ + { + "formulas": [ + { + "formula": "query1" + } + ], + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "sum:nvidia_nim.python.gc.collections.count{$model_name} by {generation}.as_count()" + } + ], + "response_format": "timeseries", + "style": { + "palette": "dog_classic", + "order_by": "values", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "bars" + } + ] + }, + "layout": { + "x": 4, + "y": 1, + "width": 4, + "height": 6 + } + }, + { + "id": 5270613800707436, + "definition": { + "title": "Uncollectable Objects", + "title_size": "16", + "title_align": "left", + "time": {}, + "type": "timeseries", + "requests": [ + { + "formulas": [ + { + "formula": "query1" + } + ], + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "sum:nvidia_nim.python.gc.objects.uncollectable.count{$host}.as_count()" + } + ], + "response_format": "timeseries", + "style": { + "palette": "dog_classic", + "order_by": "values", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "line" + } + ] + }, + "layout": { + "x": 8, + "y": 1, + "width": 4, + "height": 3 + } + }, + { + "id": 289938027327656, + "definition": { + "title": "Virtual Memory", + "title_size": "16", + "title_align": "left", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "time": {}, + "type": "timeseries", + "requests": [ + { + "formulas": [ + { + "formula": "query1" + } + ], + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "avg:nvidia_nim.process.virtual_memory_bytes{$host} by {host}" + } + ], + "response_format": "timeseries", + "style": { + "palette": "dog_classic", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "line" + } + ] + }, + "layout": { + "x": 0, + "y": 4, + "width": 4, + "height": 3 + } + }, + { + "id": 5699420889371520, + "definition": { + "title": "Collected 
Objects", + "title_size": "16", + "title_align": "left", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "type": "timeseries", + "requests": [ + { + "formulas": [ + { + "formula": "query1" + } + ], + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "sum:nvidia_nim.python.gc.objects.collected.count{$model_name} by {generation}.as_count()" + } + ], + "response_format": "timeseries", + "style": { + "palette": "dog_classic", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "bars" + } + ] + }, + "layout": { + "x": 8, + "y": 4, + "width": 4, + "height": 3 + } + } + ] + }, + "layout": { + "x": 0, + "y": 30, + "width": 12, + "height": 8 + } + } + ], + "template_variables": [ + { + "name": "model_name", + "prefix": "model_name", + "available_values": [], + "default": "*" + }, + { + "name": "process", + "prefix": "process", + "available_values": [], + "default": "*" + }, + { + "name": "host", + "prefix": "host", + "available_values": [], + "default": "*" + } + ], + "layout_type": "ordered", + "notify_list": [], + "reflow_type": "fixed" +} \ No newline at end of file diff --git a/nvidia_nim/assets/monitors/latency.json b/nvidia_nim/assets/monitors/latency.json new file mode 100644 index 0000000000000..5ad93751d27bd --- /dev/null +++ b/nvidia_nim/assets/monitors/latency.json @@ -0,0 +1,33 @@ +{ + "version": 2, + "created_at": "2024-07-02", + "last_updated_at": "2024-07-02", + "title": "Average Request Latency is High", + "description": "This monitor alerts you if NVIDIA request latency is too high. High latency means requests are waiting long to be processed. This results in clients having to wait longer for their requests to complete. It also indicates your NVIDIA server is receiving more requests than it can comfortably handle.", + "tags": [ + "integration:nvidia-nim" + ], + "definition": { + "name": "Average request latency is high", + "type": "query alert", + "query": "sum(last_15m):sum:nvidia_nim.e2e_request_latency.seconds.sum{*}.as_count() / sum:nvidia_nim.e2e_request_latency.seconds.count{*}.as_count() > 0.3", + "message": "The average latency for requests coming into your NVIDIA instance is higher than the threshold. This means requests are waiting too long to be processed.", + "tags": [ + "integration:nvidia_nim" + ], + "options": { + "thresholds": { + "critical": 0.3 + }, + "notify_audit": false, + "include_tags": false, + "avalanche_window": 10, + "new_host_delay": 300, + "silenced": {} + }, + "priority": null, + "restriction_policy": { + "bindings": [] + } + } + } \ No newline at end of file diff --git a/nvidia_nim/assets/service_checks.json b/nvidia_nim/assets/service_checks.json new file mode 100644 index 0000000000000..d0f0c79071ec4 --- /dev/null +++ b/nvidia_nim/assets/service_checks.json @@ -0,0 +1,17 @@ +[ + { + "agent_version": "7.61.0", + "integration": "nvidia_nim", + "check": "nvidia_nim.openmetrics.health", + "statuses": [ + "ok", + "critical" + ], + "groups": [ + "host", + "endpoint" + ], + "name": "NVIDIA NIM OpenMetrics endpoint health", + "description": "Returns `CRITICAL` if the Agent is unable to connect to the NVIDIA NIM OpenMetrics endpoint, otherwise returns `OK`." 
+ } +] \ No newline at end of file diff --git a/nvidia_nim/changelog.d/18964.added b/nvidia_nim/changelog.d/18964.added new file mode 100644 index 0000000000000..aa949b47b7b41 --- /dev/null +++ b/nvidia_nim/changelog.d/18964.added @@ -0,0 +1 @@ +Initial Release \ No newline at end of file diff --git a/nvidia_nim/datadog_checks/__init__.py b/nvidia_nim/datadog_checks/__init__.py new file mode 100644 index 0000000000000..1517d901c0aae --- /dev/null +++ b/nvidia_nim/datadog_checks/__init__.py @@ -0,0 +1,4 @@ +# (C) Datadog, Inc. 2024-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) +__path__ = __import__('pkgutil').extend_path(__path__, __name__) # type: ignore diff --git a/nvidia_nim/datadog_checks/nvidia_nim/__about__.py b/nvidia_nim/datadog_checks/nvidia_nim/__about__.py new file mode 100644 index 0000000000000..e9541ce83e9e5 --- /dev/null +++ b/nvidia_nim/datadog_checks/nvidia_nim/__about__.py @@ -0,0 +1,4 @@ +# (C) Datadog, Inc. 2024-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) +__version__ = '0.0.1' diff --git a/nvidia_nim/datadog_checks/nvidia_nim/__init__.py b/nvidia_nim/datadog_checks/nvidia_nim/__init__.py new file mode 100644 index 0000000000000..98c1e93e6e79c --- /dev/null +++ b/nvidia_nim/datadog_checks/nvidia_nim/__init__.py @@ -0,0 +1,7 @@ +# (C) Datadog, Inc. 2024-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) +from .__about__ import __version__ +from .check import NvidiaNIMCheck + +__all__ = ['__version__', 'NvidiaNIMCheck'] diff --git a/nvidia_nim/datadog_checks/nvidia_nim/check.py b/nvidia_nim/datadog_checks/nvidia_nim/check.py new file mode 100644 index 0000000000000..fe48310886506 --- /dev/null +++ b/nvidia_nim/datadog_checks/nvidia_nim/check.py @@ -0,0 +1,49 @@ +# (C) Datadog, Inc. 2024-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) +from datadog_checks.base import AgentCheck, OpenMetricsBaseCheckV2 # noqa: F401 + +from .metrics import METRIC_MAP, RENAME_LABELS_MAP + + +class NvidiaNIMCheck(OpenMetricsBaseCheckV2): + + DEFAULT_METRIC_LIMIT = 0 + # This will be the prefix of every metric and service check the integration sends + __NAMESPACE__ = 'nvidia_nim' + + def get_default_config(self): + return { + 'metrics': [METRIC_MAP], + "rename_labels": RENAME_LABELS_MAP, + } + + @AgentCheck.metadata_entrypoint + def _submit_version_metadata(self): + + endpoint = self.instance["openmetrics_endpoint"].replace("/metrics", "/v1/version") + response = self.http.get(endpoint) + response.raise_for_status() + + data = response.json() + version = data.get("release", "") + version_split = version.split(".") + if len(version_split) >= 3: + major = version_split[0] + minor = version_split[1] + patch = version_split[2] + + version_raw = f'{major}.{minor}.{patch}' + + version_parts = { + 'major': major, + 'minor': minor, + 'patch': patch, + } + self.set_metadata('version', version_raw, scheme='semver', part_map=version_parts) + else: + self.log.debug("Invalid NVIDIA NIM release format: %s", version) + + def check(self, instance): + super().check(instance) + self._submit_version_metadata() diff --git a/nvidia_nim/datadog_checks/nvidia_nim/config_models/__init__.py b/nvidia_nim/datadog_checks/nvidia_nim/config_models/__init__.py new file mode 100644 index 0000000000000..106fff2032f68 --- /dev/null +++ b/nvidia_nim/datadog_checks/nvidia_nim/config_models/__init__.py @@ -0,0 +1,24 @@ +# (C) Datadog, Inc. 
2024-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) + +# This file is autogenerated. +# To change this file you should edit assets/configuration/spec.yaml and then run the following commands: +# ddev -x validate config -s +# ddev -x validate models -s + +from .instance import InstanceConfig +from .shared import SharedConfig + + +class ConfigMixin: + _config_model_instance: InstanceConfig + _config_model_shared: SharedConfig + + @property + def config(self) -> InstanceConfig: + return self._config_model_instance + + @property + def shared_config(self) -> SharedConfig: + return self._config_model_shared diff --git a/nvidia_nim/datadog_checks/nvidia_nim/config_models/defaults.py b/nvidia_nim/datadog_checks/nvidia_nim/config_models/defaults.py new file mode 100644 index 0000000000000..bf7519af75f42 --- /dev/null +++ b/nvidia_nim/datadog_checks/nvidia_nim/config_models/defaults.py @@ -0,0 +1,132 @@ +# (C) Datadog, Inc. 2024-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) + +# This file is autogenerated. +# To change this file you should edit assets/configuration/spec.yaml and then run the following commands: +# ddev -x validate config -s +# ddev -x validate models -s + + +def shared_skip_proxy(): + return False + + +def shared_timeout(): + return 10 + + +def instance_allow_redirects(): + return True + + +def instance_auth_type(): + return 'basic' + + +def instance_cache_metric_wildcards(): + return True + + +def instance_cache_shared_labels(): + return True + + +def instance_collect_counters_with_distributions(): + return False + + +def instance_collect_histogram_buckets(): + return True + + +def instance_disable_generic_tags(): + return False + + +def instance_empty_default_hostname(): + return False + + +def instance_enable_health_service_check(): + return True + + +def instance_histogram_buckets_as_distributions(): + return False + + +def instance_ignore_connection_errors(): + return False + + +def instance_kerberos_auth(): + return 'disabled' + + +def instance_kerberos_delegate(): + return False + + +def instance_kerberos_force_initiate(): + return False + + +def instance_log_requests(): + return False + + +def instance_min_collection_interval(): + return 15 + + +def instance_non_cumulative_histogram_buckets(): + return False + + +def instance_persist_connections(): + return False + + +def instance_request_size(): + return 16 + + +def instance_skip_proxy(): + return False + + +def instance_tag_by_endpoint(): + return True + + +def instance_telemetry(): + return False + + +def instance_timeout(): + return 10 + + +def instance_tls_ignore_warning(): + return False + + +def instance_tls_use_host_header(): + return False + + +def instance_tls_verify(): + return True + + +def instance_use_latest_spec(): + return False + + +def instance_use_legacy_auth_encoding(): + return True + + +def instance_use_process_start_time(): + return False diff --git a/nvidia_nim/datadog_checks/nvidia_nim/config_models/instance.py b/nvidia_nim/datadog_checks/nvidia_nim/config_models/instance.py new file mode 100644 index 0000000000000..8e39a0e921719 --- /dev/null +++ b/nvidia_nim/datadog_checks/nvidia_nim/config_models/instance.py @@ -0,0 +1,171 @@ +# (C) Datadog, Inc. 2024-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) + +# This file is autogenerated. 
+# To change this file you should edit assets/configuration/spec.yaml and then run the following commands: +# ddev -x validate config -s +# ddev -x validate models -s + +from __future__ import annotations + +from types import MappingProxyType +from typing import Any, Optional, Union + +from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator + +from datadog_checks.base.utils.functions import identity +from datadog_checks.base.utils.models import validation + +from . import defaults, validators + + +class AuthToken(BaseModel): + model_config = ConfigDict( + arbitrary_types_allowed=True, + frozen=True, + ) + reader: Optional[MappingProxyType[str, Any]] = None + writer: Optional[MappingProxyType[str, Any]] = None + + +class ExtraMetrics(BaseModel): + model_config = ConfigDict( + arbitrary_types_allowed=True, + extra='allow', + frozen=True, + ) + name: Optional[str] = None + type: Optional[str] = None + + +class MetricPatterns(BaseModel): + model_config = ConfigDict( + arbitrary_types_allowed=True, + frozen=True, + ) + exclude: Optional[tuple[str, ...]] = None + include: Optional[tuple[str, ...]] = None + + +class Metrics(BaseModel): + model_config = ConfigDict( + arbitrary_types_allowed=True, + extra='allow', + frozen=True, + ) + name: Optional[str] = None + type: Optional[str] = None + + +class Proxy(BaseModel): + model_config = ConfigDict( + arbitrary_types_allowed=True, + frozen=True, + ) + http: Optional[str] = None + https: Optional[str] = None + no_proxy: Optional[tuple[str, ...]] = None + + +class ShareLabels(BaseModel): + model_config = ConfigDict( + arbitrary_types_allowed=True, + frozen=True, + ) + labels: Optional[tuple[str, ...]] = None + match: Optional[tuple[str, ...]] = None + + +class InstanceConfig(BaseModel): + model_config = ConfigDict( + validate_default=True, + arbitrary_types_allowed=True, + frozen=True, + ) + allow_redirects: Optional[bool] = None + auth_token: Optional[AuthToken] = None + auth_type: Optional[str] = None + aws_host: Optional[str] = None + aws_region: Optional[str] = None + aws_service: Optional[str] = None + cache_metric_wildcards: Optional[bool] = None + cache_shared_labels: Optional[bool] = None + collect_counters_with_distributions: Optional[bool] = None + collect_histogram_buckets: Optional[bool] = None + connect_timeout: Optional[float] = None + disable_generic_tags: Optional[bool] = None + empty_default_hostname: Optional[bool] = None + enable_health_service_check: Optional[bool] = None + exclude_labels: Optional[tuple[str, ...]] = None + exclude_metrics: Optional[tuple[str, ...]] = None + exclude_metrics_by_labels: Optional[MappingProxyType[str, Union[bool, tuple[str, ...]]]] = None + extra_headers: Optional[MappingProxyType[str, Any]] = None + extra_metrics: Optional[tuple[Union[str, MappingProxyType[str, Union[str, ExtraMetrics]]], ...]] = None + headers: Optional[MappingProxyType[str, Any]] = None + histogram_buckets_as_distributions: Optional[bool] = None + hostname_format: Optional[str] = None + hostname_label: Optional[str] = None + ignore_connection_errors: Optional[bool] = None + ignore_tags: Optional[tuple[str, ...]] = None + include_labels: Optional[tuple[str, ...]] = None + kerberos_auth: Optional[str] = None + kerberos_cache: Optional[str] = None + kerberos_delegate: Optional[bool] = None + kerberos_force_initiate: Optional[bool] = None + kerberos_hostname: Optional[str] = None + kerberos_keytab: Optional[str] = None + kerberos_principal: Optional[str] = None + log_requests: Optional[bool] = None + 
metric_patterns: Optional[MetricPatterns] = None + metrics: Optional[tuple[Union[str, MappingProxyType[str, Union[str, Metrics]]], ...]] = None + min_collection_interval: Optional[float] = None + namespace: Optional[str] = Field(None, pattern='\\w*') + non_cumulative_histogram_buckets: Optional[bool] = None + ntlm_domain: Optional[str] = None + openmetrics_endpoint: str + password: Optional[str] = None + persist_connections: Optional[bool] = None + proxy: Optional[Proxy] = None + raw_line_filters: Optional[tuple[str, ...]] = None + raw_metric_prefix: Optional[str] = None + read_timeout: Optional[float] = None + rename_labels: Optional[MappingProxyType[str, Any]] = None + request_size: Optional[float] = None + service: Optional[str] = None + share_labels: Optional[MappingProxyType[str, Union[bool, ShareLabels]]] = None + skip_proxy: Optional[bool] = None + tag_by_endpoint: Optional[bool] = None + tags: Optional[tuple[str, ...]] = None + telemetry: Optional[bool] = None + timeout: Optional[float] = None + tls_ca_cert: Optional[str] = None + tls_cert: Optional[str] = None + tls_ignore_warning: Optional[bool] = None + tls_private_key: Optional[str] = None + tls_protocols_allowed: Optional[tuple[str, ...]] = None + tls_use_host_header: Optional[bool] = None + tls_verify: Optional[bool] = None + use_latest_spec: Optional[bool] = None + use_legacy_auth_encoding: Optional[bool] = None + use_process_start_time: Optional[bool] = None + username: Optional[str] = None + + @model_validator(mode='before') + def _initial_validation(cls, values): + return validation.core.initialize_config(getattr(validators, 'initialize_instance', identity)(values)) + + @field_validator('*', mode='before') + def _validate(cls, value, info): + field = cls.model_fields[info.field_name] + field_name = field.alias or info.field_name + if field_name in info.context['configured_fields']: + value = getattr(validators, f'instance_{info.field_name}', identity)(value, field=field) + else: + value = getattr(defaults, f'instance_{info.field_name}', lambda: value)() + + return validation.utils.make_immutable(value) + + @model_validator(mode='after') + def _final_validation(cls, model): + return validation.core.check_model(getattr(validators, 'check_instance', identity)(model)) diff --git a/nvidia_nim/datadog_checks/nvidia_nim/config_models/shared.py b/nvidia_nim/datadog_checks/nvidia_nim/config_models/shared.py new file mode 100644 index 0000000000000..0e8a9ecab10a2 --- /dev/null +++ b/nvidia_nim/datadog_checks/nvidia_nim/config_models/shared.py @@ -0,0 +1,60 @@ +# (C) Datadog, Inc. 2024-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) + +# This file is autogenerated. +# To change this file you should edit assets/configuration/spec.yaml and then run the following commands: +# ddev -x validate config -s +# ddev -x validate models -s + +from __future__ import annotations + +from typing import Optional + +from pydantic import BaseModel, ConfigDict, field_validator, model_validator + +from datadog_checks.base.utils.functions import identity +from datadog_checks.base.utils.models import validation + +from . 
import defaults, validators + + +class Proxy(BaseModel): + model_config = ConfigDict( + arbitrary_types_allowed=True, + frozen=True, + ) + http: Optional[str] = None + https: Optional[str] = None + no_proxy: Optional[tuple[str, ...]] = None + + +class SharedConfig(BaseModel): + model_config = ConfigDict( + validate_default=True, + arbitrary_types_allowed=True, + frozen=True, + ) + proxy: Optional[Proxy] = None + service: Optional[str] = None + skip_proxy: Optional[bool] = None + timeout: Optional[float] = None + + @model_validator(mode='before') + def _initial_validation(cls, values): + return validation.core.initialize_config(getattr(validators, 'initialize_shared', identity)(values)) + + @field_validator('*', mode='before') + def _validate(cls, value, info): + field = cls.model_fields[info.field_name] + field_name = field.alias or info.field_name + if field_name in info.context['configured_fields']: + value = getattr(validators, f'shared_{info.field_name}', identity)(value, field=field) + else: + value = getattr(defaults, f'shared_{info.field_name}', lambda: value)() + + return validation.utils.make_immutable(value) + + @model_validator(mode='after') + def _final_validation(cls, model): + return validation.core.check_model(getattr(validators, 'check_shared', identity)(model)) diff --git a/nvidia_nim/datadog_checks/nvidia_nim/config_models/validators.py b/nvidia_nim/datadog_checks/nvidia_nim/config_models/validators.py new file mode 100644 index 0000000000000..70150e85e6124 --- /dev/null +++ b/nvidia_nim/datadog_checks/nvidia_nim/config_models/validators.py @@ -0,0 +1,13 @@ +# (C) Datadog, Inc. 2024-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) + +# Here you can include additional config validators or transformers +# +# def initialize_instance(values, **kwargs): +# if 'my_option' not in values and 'my_legacy_option' in values: +# values['my_option'] = values['my_legacy_option'] +# if values.get('my_number') > 10: +# raise ValueError('my_number max value is 10, got %s' % str(values.get('my_number'))) +# +# return values diff --git a/nvidia_nim/datadog_checks/nvidia_nim/data/conf.yaml.example b/nvidia_nim/datadog_checks/nvidia_nim/data/conf.yaml.example new file mode 100644 index 0000000000000..c5e8d23aa4e1b --- /dev/null +++ b/nvidia_nim/datadog_checks/nvidia_nim/data/conf.yaml.example @@ -0,0 +1,626 @@ +## All options defined here are available to all instances. +# +init_config: + + ## @param proxy - mapping - optional + ## Set HTTP or HTTPS proxies for all instances. Use the `no_proxy` list + ## to specify hosts that must bypass proxies. + ## + ## The SOCKS protocol is also supported like so: + ## + ## socks5://user:pass@host:port + ## + ## Using the scheme `socks5` causes the DNS resolution to happen on the + ## client, rather than on the proxy server. This is in line with `curl`, + ## which uses the scheme to decide whether to do the DNS resolution on + ## the client or proxy. If you want to resolve the domains on the proxy + ## server, use `socks5h` as the scheme. + # + # proxy: + # http: http://: + # https: https://: + # no_proxy: + # - + # - + + ## @param skip_proxy - boolean - optional - default: false + ## If set to `true`, this makes the check bypass any proxy + ## settings enabled and attempt to reach services directly. + # + # skip_proxy: false + + ## @param timeout - number - optional - default: 10 + ## The timeout for connecting to services. 
+ # + # timeout: 10 + + ## @param service - string - optional + ## Attach the tag `service:` to every metric, event, and service check emitted by this integration. + ## + ## Additionally, this sets the default `service` for every log source. + # + # service: + +## Every instance is scheduled independently of the others. +# +instances: + + ## @param openmetrics_endpoint - string - required + ## Endpoint exposing the NVIDIA NIM's Prometheus metrics. For more information refer to: + ## https://docs.nvidia.com/nim/large-language-models/latest/observability.html + # + - openmetrics_endpoint: http://localhost:8000/metrics + + ## @param raw_metric_prefix - string - optional + ## A prefix that is removed from all exposed metric names, if present. + ## All configuration options will use the prefix-less name. + # + # raw_metric_prefix: _ + + ## @param extra_metrics - (list of string or mapping) - optional + ## This list defines metrics to collect from the `openmetrics_endpoint`, in addition to + ## what the check collects by default. If the check already collects a metric, then + ## metric definitions here take precedence. Metrics may be defined in 3 ways: + ## + ## 1. If the item is a string, then it represents the exposed metric name, and + ## the sent metric name will be identical. For example: + ## + ## extra_metrics: + ## - + ## - + ## 2. If the item is a mapping, then the keys represent the exposed metric names. + ## + ## a. If a value is a string, then it represents the sent metric name. For example: + ## + ## extra_metrics: + ## - : + ## - : + ## b. If a value is a mapping, then it must have a `name` and/or `type` key. + ## The `name` represents the sent metric name, and the `type` represents how + ## the metric should be handled, overriding any type information the endpoint + ## may provide. For example: + ## + ## extra_metrics: + ## - : + ## name: + ## type: + ## - : + ## name: + ## type: + ## + ## The supported native types are `gauge`, `counter`, `histogram`, and `summary`. + ## + ## Note: To collect counter metrics with names ending in `_total`, specify the metric name without the `_total` + ## suffix. For example, to collect the counter metric `promhttp_metric_handler_requests_total`, specify + ## `promhttp_metric_handler_requests`. This submits to Datadog the metric name appended with `.count`. + ## For more information, see: + ## https://github.com/OpenObservability/OpenMetrics/blob/main/specification/OpenMetrics.md#suffixes + ## + ## Regular expressions may be used to match the exposed metric names, for example: + ## + ## extra_metrics: + ## - ^network_(ingress|egress)_.+ + ## - .+: + ## type: gauge + # + # extra_metrics: [] + + ## @param exclude_metrics - list of strings - optional + ## A list of metrics to exclude, with each entry being either + ## the exact metric name or a regular expression. + ## In order to exclude all metrics but the ones matching a specific filter, + ## you can use a negative lookahead regex like: + ## - ^(?!foo).*$ + # + # exclude_metrics: [] + + ## @param exclude_metrics_by_labels - mapping - optional + ## A mapping of labels to exclude metrics with matching label name and their corresponding metric values. To match + ## all values of a label, set it to `true`. + ## + ## Note: Label filtering happens before `rename_labels`. + ## + ## For example, the following configuration instructs the check to exclude all metrics with + ## a label `worker` or a label `pid` with the value of either `23` or `42`. 
+ ## + ## exclude_metrics_by_labels: + ## worker: true + ## pid: + ## - '23' + ## - '42' + # + # exclude_metrics_by_labels: {} + + ## @param exclude_labels - list of strings - optional + ## A list of labels to exclude, useful for high cardinality values like timestamps or UUIDs. + ## May be used in conjunction with `include_labels`. + ## Labels defined in `exclude_labels` will take precedence in case of overlap. + ## + ## Note: Label filtering happens before `rename_labels`. + # + # exclude_labels: [] + + ## @param include_labels - list of strings - optional + ## A list of labels to include. May be used in conjunction with `exclude_labels`. + ## Labels defined in `exclude_labels` will take precedence in case of overlap. + ## + ## Note: Label filtering happens before `rename_labels`. + # + # include_labels: [] + + ## @param rename_labels - mapping - optional + ## A mapping of label names to their new names. + # + # rename_labels: + # : + # : + + ## @param enable_health_service_check - boolean - optional - default: true + ## Whether or not to send a service check named `.openmetrics.health` which reports + ## the health of the `openmetrics_endpoint`. + # + # enable_health_service_check: true + + ## @param ignore_connection_errors - boolean - optional - default: false + ## Whether or not to ignore connection errors when scraping `openmetrics_endpoint`. + # + # ignore_connection_errors: false + + ## @param hostname_label - string - optional + ## Override the hostname for every metric submission with the value of one of its labels. + # + # hostname_label: + + ## @param hostname_format - string - optional + ## When `hostname_label` is set, this instructs the check how to format the values. The string + ## `` is replaced by the value of the label defined by `hostname_label`. + # + # hostname_format: + + ## @param collect_histogram_buckets - boolean - optional - default: true + ## Whether or not to send histogram buckets. + # + # collect_histogram_buckets: true + + ## @param non_cumulative_histogram_buckets - boolean - optional - default: false + ## Whether or not histogram buckets are non-cumulative and to come with a `lower_bound` tag. + # + # non_cumulative_histogram_buckets: false + + ## @param histogram_buckets_as_distributions - boolean - optional - default: false + ## Whether or not to send histogram buckets as Datadog distribution metrics. This implicitly + ## enables the `collect_histogram_buckets` and `non_cumulative_histogram_buckets` options. + ## + ## Learn more about distribution metrics: + ## https://docs.datadoghq.com/developers/metrics/types/?tab=distribution#metric-types + # + # histogram_buckets_as_distributions: false + + ## @param collect_counters_with_distributions - boolean - optional - default: false + ## Whether or not to also collect the observation counter metrics ending in `.sum` and `.count` + ## when sending histogram buckets as Datadog distribution metrics. This implicitly enables the + ## `histogram_buckets_as_distributions` option. + # + # collect_counters_with_distributions: false + + ## @param use_process_start_time - boolean - optional - default: false + ## Whether to enable a heuristic for reporting counter values on the first scrape. When true, + ## the first time an endpoint is scraped, check `process_start_time_seconds` to decide whether zero + ## initial value can be assumed for counters. This requires keeping metrics in memory until the entire + ## response is received. 
+ # + # use_process_start_time: false + + ## @param share_labels - mapping - optional + ## This mapping allows for the sharing of labels across multiple metrics. The keys represent the + ## exposed metrics from which to share labels, and the values are mappings that configure the + ## sharing behavior. Each mapping must have at least one of the following keys: + ## + ## labels - This is a list of labels to share. All labels are shared if this is not set. + ## match - This is a list of labels to match on other metrics as a condition for sharing. + ## values - This is a list of allowed values as a condition for sharing. + ## + ## To unconditionally share all labels of a metric, set it to `true`. + ## + ## For example, the following configuration instructs the check to apply all labels from `metric_a` + ## to all other metrics, the `node` label from `metric_b` to only those metrics that have a `pod` + ## label value that matches the `pod` label value of `metric_b`, and all labels from `metric_c` + ## to all other metrics if their value is equal to `23` or `42`. + ## + ## share_labels: + ## metric_a: true + ## metric_b: + ## labels: + ## - node + ## match: + ## - pod + ## metric_c: + ## values: + ## - 23 + ## - 42 + # + # share_labels: {} + + ## @param cache_shared_labels - boolean - optional - default: true + ## When `share_labels` is set, it instructs the check to cache labels collected from the first payload + ## for improved performance. + ## + ## Set this to `false` to compute label sharing for every payload at the risk of potentially increased memory usage. + # + # cache_shared_labels: true + + ## @param raw_line_filters - list of strings - optional + ## A list of regular expressions used to exclude lines read from the `openmetrics_endpoint` + ## from being parsed. + # + # raw_line_filters: [] + + ## @param cache_metric_wildcards - boolean - optional - default: true + ## Whether or not to cache data from metrics that are defined by regular expressions rather + ## than the full metric name. + # + # cache_metric_wildcards: true + + ## @param telemetry - boolean - optional - default: false + ## Whether or not to submit metrics prefixed by `.telemetry.` for debugging purposes. + # + # telemetry: false + + ## @param ignore_tags - list of strings - optional + ## A list of regular expressions used to ignore tags added by Autodiscovery and entries in the `tags` option. + # + # ignore_tags: + # - + # - + # - + + ## @param proxy - mapping - optional + ## This overrides the `proxy` setting in `init_config`. + ## + ## Set HTTP or HTTPS proxies for this instance. Use the `no_proxy` list + ## to specify hosts that must bypass proxies. + ## + ## The SOCKS protocol is also supported, for example: + ## + ## socks5://user:pass@host:port + ## + ## Using the scheme `socks5` causes the DNS resolution to happen on the + ## client, rather than on the proxy server. This is in line with `curl`, + ## which uses the scheme to decide whether to do the DNS resolution on + ## the client or proxy. If you want to resolve the domains on the proxy + ## server, use `socks5h` as the scheme. + # + # proxy: + # http: http://: + # https: https://: + # no_proxy: + # - + # - + + ## @param skip_proxy - boolean - optional - default: false + ## This overrides the `skip_proxy` setting in `init_config`. + ## + ## If set to `true`, this makes the check bypass any proxy + ## settings enabled and attempt to reach services directly. 
+ # + # skip_proxy: false + + ## @param auth_type - string - optional - default: basic + ## The type of authentication to use. The available types (and related options) are: + ## + ## - basic + ## |__ username + ## |__ password + ## |__ use_legacy_auth_encoding + ## - digest + ## |__ username + ## |__ password + ## - ntlm + ## |__ ntlm_domain + ## |__ password + ## - kerberos + ## |__ kerberos_auth + ## |__ kerberos_cache + ## |__ kerberos_delegate + ## |__ kerberos_force_initiate + ## |__ kerberos_hostname + ## |__ kerberos_keytab + ## |__ kerberos_principal + ## - aws + ## |__ aws_region + ## |__ aws_host + ## |__ aws_service + ## + ## The `aws` auth type relies on boto3 to automatically gather AWS credentials, for example: from `.aws/credentials`. + ## Details: https://boto3.amazonaws.com/v1/documentation/api/latest/guide/configuration.html#configuring-credentials + # + # auth_type: basic + + ## @param use_legacy_auth_encoding - boolean - optional - default: true + ## When `auth_type` is set to `basic`, this determines whether to encode as `latin1` rather than `utf-8`. + # + # use_legacy_auth_encoding: true + + ## @param username - string - optional + ## The username to use if services are behind basic or digest auth. + # + # username: + + ## @param password - string - optional + ## The password to use if services are behind basic or NTLM auth. + # + # password: + + ## @param ntlm_domain - string - optional + ## If your services use NTLM authentication, specify + ## the domain used in the check. For NTLM Auth, append + ## the username to domain, not as the `username` parameter. + # + # ntlm_domain: \ + + ## @param kerberos_auth - string - optional - default: disabled + ## If your services use Kerberos authentication, you can specify the Kerberos + ## strategy to use between: + ## + ## - required + ## - optional + ## - disabled + ## + ## See https://github.com/requests/requests-kerberos#mutual-authentication + # + # kerberos_auth: disabled + + ## @param kerberos_cache - string - optional + ## Sets the KRB5CCNAME environment variable. + ## It should point to a credential cache with a valid TGT. + # + # kerberos_cache: + + ## @param kerberos_delegate - boolean - optional - default: false + ## Set to `true` to enable Kerberos delegation of credentials to a server that requests delegation. + ## + ## See https://github.com/requests/requests-kerberos#delegation + # + # kerberos_delegate: false + + ## @param kerberos_force_initiate - boolean - optional - default: false + ## Set to `true` to preemptively initiate the Kerberos GSS exchange and + ## present a Kerberos ticket on the initial request (and all subsequent). + ## + ## See https://github.com/requests/requests-kerberos#preemptive-authentication + # + # kerberos_force_initiate: false + + ## @param kerberos_hostname - string - optional + ## Override the hostname used for the Kerberos GSS exchange if its DNS name doesn't + ## match its Kerberos hostname, for example: behind a content switch or load balancer. + ## + ## See https://github.com/requests/requests-kerberos#hostname-override + # + # kerberos_hostname: + + ## @param kerberos_principal - string - optional + ## Set an explicit principal, to force Kerberos to look for a + ## matching credential cache for the named user. + ## + ## See https://github.com/requests/requests-kerberos#explicit-principal + # + # kerberos_principal: + + ## @param kerberos_keytab - string - optional + ## Set the path to your Kerberos key tab file. 
+ # + # kerberos_keytab: + + ## @param auth_token - mapping - optional + ## This allows for the use of authentication information from dynamic sources. + ## Both a reader and writer must be configured. + ## + ## The available readers are: + ## + ## - type: file + ## path (required): The absolute path for the file to read from. + ## pattern: A regular expression pattern with a single capture group used to find the + ## token rather than using the entire file, for example: Your secret is (.+) + ## - type: oauth + ## url (required): The token endpoint. + ## client_id (required): The client identifier. + ## client_secret (required): The client secret. + ## basic_auth: Whether the provider expects credentials to be transmitted in + ## an HTTP Basic Auth header. The default is: false + ## options: Mapping of additional options to pass to the provider, such as the audience + ## or the scope. For example: + ## options: + ## audience: https://example.com + ## scope: read:example + ## + ## The available writers are: + ## + ## - type: header + ## name (required): The name of the field, for example: Authorization + ## value: The template value, for example `Bearer `. The default is: + ## placeholder: The substring in `value` to replace with the token, defaults to: + # + # auth_token: + # reader: + # type: + # : + # : + # writer: + # type: + # : + # : + + ## @param aws_region - string - optional + ## If your services require AWS Signature Version 4 signing, set the region. + ## + ## See https://docs.aws.amazon.com/general/latest/gr/signature-version-4.html + # + # aws_region: + + ## @param aws_host - string - optional + ## If your services require AWS Signature Version 4 signing, set the host. + ## This only needs the hostname and does not require the protocol (HTTP, HTTPS, and more). + ## For example, if connecting to https://us-east-1.amazonaws.com/, set `aws_host` to `us-east-1.amazonaws.com`. + ## + ## Note: This setting is not necessary for official integrations. + ## + ## See https://docs.aws.amazon.com/general/latest/gr/signature-version-4.html + # + # aws_host: + + ## @param aws_service - string - optional + ## If your services require AWS Signature Version 4 signing, set the service code. For a list + ## of available service codes, see https://docs.aws.amazon.com/general/latest/gr/rande.html + ## + ## Note: This setting is not necessary for official integrations. + ## + ## See https://docs.aws.amazon.com/general/latest/gr/signature-version-4.html + # + # aws_service: + + ## @param tls_verify - boolean - optional - default: true + ## Instructs the check to validate the TLS certificate of services. + # + # tls_verify: true + + ## @param tls_use_host_header - boolean - optional - default: false + ## If a `Host` header is set, this enables its use for SNI (matching against the TLS certificate CN or SAN). + # + # tls_use_host_header: false + + ## @param tls_ignore_warning - boolean - optional - default: false + ## If `tls_verify` is disabled, security warnings are logged by the check. + ## Disable those by setting `tls_ignore_warning` to true. + # + # tls_ignore_warning: false + + ## @param tls_cert - string - optional + ## The path to a single file in PEM format containing a certificate as well as any + ## number of CA certificates needed to establish the certificate's authenticity for + ## use when connecting to services. It may also contain an unencrypted private key to use. 
+ # + # tls_cert: + + ## @param tls_private_key - string - optional + ## The unencrypted private key to use for `tls_cert` when connecting to services. This is + ## required if `tls_cert` is set and it does not already contain a private key. + # + # tls_private_key: + + ## @param tls_ca_cert - string - optional + ## The path to a file of concatenated CA certificates in PEM format or a directory + ## containing several CA certificates in PEM format. If a directory, the directory + ## must have been processed using the `openssl rehash` command. See: + ## https://www.openssl.org/docs/man3.2/man1/c_rehash.html + # + # tls_ca_cert: + + ## @param tls_protocols_allowed - list of strings - optional + ## The expected versions of TLS/SSL when fetching intermediate certificates. + ## Only `SSLv3`, `TLSv1.2`, `TLSv1.3` are allowed by default. The possible values are: + ## SSLv3 + ## TLSv1 + ## TLSv1.1 + ## TLSv1.2 + ## TLSv1.3 + # + # tls_protocols_allowed: + # - SSLv3 + # - TLSv1.2 + # - TLSv1.3 + + ## @param headers - mapping - optional + ## The headers parameter allows you to send specific headers with every request. + ## You can use it for explicitly specifying the host header or adding headers for + ## authorization purposes. + ## + ## This overrides any default headers. + # + # headers: + # Host: + # X-Auth-Token: + + ## @param extra_headers - mapping - optional + ## Additional headers to send with every request. + # + # extra_headers: + # Host: + # X-Auth-Token: + + ## @param timeout - number - optional - default: 10 + ## The timeout for accessing services. + ## + ## This overrides the `timeout` setting in `init_config`. + # + # timeout: 10 + + ## @param connect_timeout - number - optional + ## The connect timeout for accessing services. Defaults to `timeout`. + # + # connect_timeout: + + ## @param read_timeout - number - optional + ## The read timeout for accessing services. Defaults to `timeout`. + # + # read_timeout: + + ## @param request_size - number - optional - default: 16 + ## The number of kibibytes (KiB) to read from streaming HTTP responses at a time. + # + # request_size: 16 + + ## @param log_requests - boolean - optional - default: false + ## Whether or not to debug log the HTTP(S) requests made, including the method and URL. + # + # log_requests: false + + ## @param persist_connections - boolean - optional - default: false + ## Whether or not to persist cookies and use connection pooling for improved performance. + # + # persist_connections: false + + ## @param allow_redirects - boolean - optional - default: true + ## Whether or not to allow URL redirection. + # + # allow_redirects: true + + ## @param tags - list of strings - optional + ## A list of tags to attach to every metric and service check emitted by this instance. + ## + ## Learn more about tagging at https://docs.datadoghq.com/tagging + # + # tags: + # - : + # - : + + ## @param service - string - optional + ## Attach the tag `service:` to every metric, event, and service check emitted by this integration. + ## + ## Overrides any `service` defined in the `init_config` section. + # + # service: + + ## @param min_collection_interval - number - optional - default: 15 + ## This changes the collection interval of the check. For more information, see: + ## https://docs.datadoghq.com/developers/write_agent_check/#collection-interval + # + # min_collection_interval: 15 + + ## @param empty_default_hostname - boolean - optional - default: false + ## This forces the check to send metrics with no hostname. 
+ ## + ## This is useful for cluster-level checks. + # + # empty_default_hostname: false + + ## @param metric_patterns - mapping - optional + ## A mapping of metrics to include or exclude, with each entry being a regular expression. + ## + ## Metrics defined in `exclude` will take precedence in case of overlap. + # + # metric_patterns: + # include: + # - + # exclude: + # - diff --git a/nvidia_nim/datadog_checks/nvidia_nim/metrics.py b/nvidia_nim/datadog_checks/nvidia_nim/metrics.py new file mode 100644 index 0000000000000..f140776d35b3d --- /dev/null +++ b/nvidia_nim/datadog_checks/nvidia_nim/metrics.py @@ -0,0 +1,34 @@ +# (C) Datadog, Inc. 2024-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) + +METRIC_MAP = { + 'process_virtual_memory_bytes': 'process.virtual_memory_bytes', + 'process_resident_memory_bytes': 'process.resident_memory_bytes', + 'process_start_time_seconds': {'name': 'process.start_time_seconds', 'type': 'time_elapsed'}, + 'process_cpu_seconds': 'process.cpu_seconds', + 'process_open_fds': 'process.open_fds', + 'process_max_fds': 'process.max_fds', + 'prompt_tokens': 'prompt_tokens', + 'python_gc_objects_collected': 'python.gc.objects.collected', + 'python_gc_objects_uncollectable': 'python.gc.objects.uncollectable', + 'python_gc_collections': 'python.gc.collections', + 'python_info': 'python.info', + 'num_request_max': 'num_request.max', + 'num_requests_running': 'num_requests.running', + 'num_requests_waiting': 'num_requests.waiting', + 'gpu_cache_usage_perc': 'gpu_cache_usage_percent', + 'generation_tokens': 'generation_tokens', + 'time_to_first_token_seconds': 'time_to_first_token.seconds', + 'time_per_output_token_seconds': 'time_per_output_token.seconds', + 'e2e_request_latency_seconds': 'e2e_request_latency.seconds', + 'request_finish': 'request.finish', + 'request_generation_tokens': 'request.generation_tokens', + 'request_prompt_tokens': 'request.prompt_tokens', + 'request_success': 'request.success', + 'request_failure': 'request.failure', +} + +RENAME_LABELS_MAP = { + 'version': 'python_version', +} diff --git a/nvidia_nim/hatch.toml b/nvidia_nim/hatch.toml new file mode 100644 index 0000000000000..c85c5f07a7df2 --- /dev/null +++ b/nvidia_nim/hatch.toml @@ -0,0 +1,4 @@ +[env.collectors.datadog-checks] + +[[envs.default.matrix]] +python = ["3.12"] diff --git a/nvidia_nim/manifest.json b/nvidia_nim/manifest.json new file mode 100644 index 0000000000000..aa6aded28ea53 --- /dev/null +++ b/nvidia_nim/manifest.json @@ -0,0 +1,60 @@ +{ + "manifest_version": "2.0.0", + "app_uuid": "c7307eb9-7bbf-4dae-b74f-6396bf5bf514", + "app_id": "nvidia-nim", + "display_on_public_website": false, + "tile": { + "overview": "README.md#Overview", + "configuration": "README.md#Setup", + "support": "README.md#Support", + "changelog": "CHANGELOG.md", + "description": "NVIDIA NIM integration with Datadog enables real-time GPU observability by collecting Prometheus metrics for monitoring.", + "title": "nvidia_nim", + "media": [], + "classifier_tags": [ + "Supported OS::Linux", + "Supported OS::Windows", + "Supported OS::macOS", + "Category::Log Collection", + "Category::AI/ML", + "Submitted Data Type::Metrics", + "Offering::Integration" + ] + }, + "assets": { + "integration": { + "auto_install": true, + "source_type_id": 30338252, + "source_type_name": "nvidia_nim", + "configuration": { + "spec": "assets/configuration/spec.yaml" + }, + "events": { + "creates_events": false + }, + "metrics": { + "prefix": "nvidia_nim.", + "check": 
"nvidia_nim.num_requests.running", + "metadata_path": "metadata.csv" + }, + "service_checks": { + "metadata_path": "assets/service_checks.json" + }, + "process_signatures": [ + "vllm_nvext.entrypoints.openai.api_server" + ] + }, + "dashboards": { + "NVIDIA NIM Overview": "assets/dashboards/nvidia_nim_overview.json" + }, + "monitors": { + "Average Request Latency is High": "assets/monitors/latency.json" + } + }, + "author": { + "support_email": "help@datadoghq.com", + "name": "Datadog", + "homepage": "https://www.datadoghq.com", + "sales_email": "info@datadoghq.com" + } +} diff --git a/nvidia_nim/metadata.csv b/nvidia_nim/metadata.csv new file mode 100644 index 0000000000000..b9d4a088c841d --- /dev/null +++ b/nvidia_nim/metadata.csv @@ -0,0 +1,35 @@ +metric_name,metric_type,interval,unit_name,per_unit_name,description,orientation,integration,short_name,curated_metric,sample_tags +nvidia_nim.e2e_request_latency.seconds.bucket,count,,,,The observations of end to end request latency bucketed by seconds.,0,nvidia_nim,,, +nvidia_nim.e2e_request_latency.seconds.count,count,,,,The total number of observations of end to end request latency.,0,nvidia_nim,,, +nvidia_nim.e2e_request_latency.seconds.sum,count,,second,,The sum of end to end request latency in seconds.,0,nvidia_nim,,, +nvidia_nim.generation_tokens.count,count,,token,,Number of generation tokens processed.,0,nvidia_nim,,, +nvidia_nim.gpu_cache_usage_percent,gauge,,fraction,,GPU KV-cache usage. 1 means 100 percent usage.,0,nvidia_nim,,, +nvidia_nim.num_request.max,gauge,,request,,The max number of concurrently running requests.,0,nvidia_nim,,, +nvidia_nim.num_requests.running,gauge,,request,,Number of requests currently running on GPU.,0,nvidia_nim,,, +nvidia_nim.num_requests.waiting,gauge,,request,,Number of requests waiting.,0,nvidia_nim,,, +nvidia_nim.process.cpu_seconds.count,count,,second,,Total user and system CPU time spent in seconds.,0,nvidia_nim,,, +nvidia_nim.process.max_fds,gauge,,file,,Maximum number of open file descriptors.,0,nvidia_nim,,, +nvidia_nim.process.open_fds,gauge,,file,,Number of open file descriptors.,0,nvidia_nim,,, +nvidia_nim.process.resident_memory_bytes,gauge,,byte,,Resident memory size in bytes.,0,nvidia_nim,,, +nvidia_nim.process.start_time_seconds,gauge,,second,,Time in seconds since process started.,0,nvidia_nim,,, +nvidia_nim.process.virtual_memory_bytes,gauge,,byte,,Virtual memory size in bytes.,0,nvidia_nim,,, +nvidia_nim.prompt_tokens.count,count,,token,,Number of prefill tokens processed.,0,nvidia_nim,,, +nvidia_nim.python.gc.collections.count,count,,,,Number of times this generation was collected.,0,nvidia_nim,,, +nvidia_nim.python.gc.objects.collected.count,count,,,,Objects collected during GC.,0,nvidia_nim,,, +nvidia_nim.python.gc.objects.uncollectable.count,count,,,,Uncollectable objects found during GC.,0,nvidia_nim,,, +nvidia_nim.python.info,gauge,,,,Python platform information.,0,nvidia_nim,,, +nvidia_nim.request.failure.count,count,,request,,The count of failed requests.,0,nvidia_nim,,, +nvidia_nim.request.finish.count,count,,request,,The count of finished requests.,0,nvidia_nim,,, +nvidia_nim.request.generation_tokens.bucket,count,,,,Number of generation tokens processed.,0,nvidia_nim,,, +nvidia_nim.request.generation_tokens.count,count,,,,Number of generation tokens processed.,0,nvidia_nim,,, +nvidia_nim.request.generation_tokens.sum,count,,token,,Number of generation tokens processed.,0,nvidia_nim,,, +nvidia_nim.request.prompt_tokens.bucket,count,,,,Number of prefill tokens 
processed.,0,nvidia_nim,,, +nvidia_nim.request.prompt_tokens.count,count,,,,Number of prefill tokens processed.,0,nvidia_nim,,, +nvidia_nim.request.prompt_tokens.sum,count,,token,,Number of prefill tokens processed.,0,nvidia_nim,,, +nvidia_nim.request.success.count,count,,,,Count of successfully processed requests.,0,nvidia_nim,,, +nvidia_nim.time_per_output_token.seconds.bucket,count,,,,The observations of time per output token bucketed by seconds.,0,nvidia_nim,,, +nvidia_nim.time_per_output_token.seconds.count,count,,,,The total number of observations of time per output token.,0,nvidia_nim,,, +nvidia_nim.time_per_output_token.seconds.sum,count,,second,,The sum of time per output token in seconds.,0,nvidia_nim,,, +nvidia_nim.time_to_first_token.seconds.bucket,count,,,,The observations of time to first token bucketed by seconds.,0,nvidia_nim,,, +nvidia_nim.time_to_first_token.seconds.count,count,,,,The total number of observations of time to first token.,0,nvidia_nim,,, +nvidia_nim.time_to_first_token.seconds.sum,count,,second,,The sum of time to first token in seconds.,0,nvidia_nim,,, \ No newline at end of file diff --git a/nvidia_nim/pyproject.toml b/nvidia_nim/pyproject.toml new file mode 100644 index 0000000000000..9681369b06977 --- /dev/null +++ b/nvidia_nim/pyproject.toml @@ -0,0 +1,60 @@ +[build-system] +requires = [ + "hatchling>=0.13.0", +] +build-backend = "hatchling.build" + +[project] +name = "datadog-nvidia-nim" +description = "The nvidia_nim check" +readme = "README.md" +license = "BSD-3-Clause" +requires-python = ">=3.12" +keywords = [ + "datadog", + "datadog agent", + "datadog check", + "nvidia_nim", +] +authors = [ + { name = "Datadog", email = "packages@datadoghq.com" }, +] +classifiers = [ + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Developers", + "Intended Audience :: System Administrators", + "License :: OSI Approved :: BSD License", + "Private :: Do Not Upload", + "Programming Language :: Python :: 3.11", + "Topic :: System :: Monitoring", +] +dependencies = [ + "datadog-checks-base>=32.6.0", +] +dynamic = [ + "version", +] + +[project.optional-dependencies] +deps = [] + +[project.urls] +Source = "https://github.com/DataDog/integrations-core" + +[tool.hatch.version] +path = "datadog_checks/nvidia_nim/__about__.py" + +[tool.hatch.build.targets.sdist] +include = [ + "/datadog_checks", + "/tests", + "/manifest.json", +] + +[tool.hatch.build.targets.wheel] +include = [ + "/datadog_checks/nvidia_nim", +] +dev-mode-dirs = [ + ".", +] diff --git a/nvidia_nim/tests/__init__.py b/nvidia_nim/tests/__init__.py new file mode 100644 index 0000000000000..9103122bf028d --- /dev/null +++ b/nvidia_nim/tests/__init__.py @@ -0,0 +1,3 @@ +# (C) Datadog, Inc. 2024-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) diff --git a/nvidia_nim/tests/common.py b/nvidia_nim/tests/common.py new file mode 100644 index 0000000000000..1ee147c550a71 --- /dev/null +++ b/nvidia_nim/tests/common.py @@ -0,0 +1,63 @@ +# (C) Datadog, Inc. 
2024-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) +import os + +from datadog_checks.dev import get_docker_hostname, get_here + +HERE = get_here() +HOST = get_docker_hostname() +PORT = 8000 + + +def get_fixture_path(filename): + return os.path.join(HERE, 'fixtures', filename) + + +MOCKED_INSTANCE = { + "openmetrics_endpoint": f"http://{HOST}:{PORT}/metrics", + "tags": ['test:test'], +} + +MOCKED_VERSION_ENDPOINT = f"http://{HOST}:{PORT}/version" + +COMPOSE_FILE = os.path.join(HERE, 'docker', 'docker-compose.yaml') + +METRICS_MOCK = [ + 'e2e_request_latency.seconds.bucket', + 'e2e_request_latency.seconds.count', + 'e2e_request_latency.seconds.sum', + 'generation_tokens.count', + 'gpu_cache_usage_percent', + 'num_request.max', + 'num_requests.running', + 'num_requests.waiting', + 'process.cpu_seconds.count', + 'process.max_fds', + 'process.open_fds', + 'process.resident_memory_bytes', + 'process.start_time_seconds', + 'process.virtual_memory_bytes', + 'prompt_tokens.count', + 'python.gc.collections.count', + 'python.gc.objects.collected.count', + 'python.gc.objects.uncollectable.count', + 'python.info', + 'request.failure.count', + 'request.finish.count', + 'request.generation_tokens.bucket', + 'request.generation_tokens.count', + 'request.generation_tokens.sum', + 'request.prompt_tokens.bucket', + 'request.prompt_tokens.count', + 'request.prompt_tokens.sum', + 'request.success.count', + 'time_per_output_token.seconds.bucket', + 'time_per_output_token.seconds.count', + 'time_per_output_token.seconds.sum', + 'time_to_first_token.seconds.bucket', + 'time_to_first_token.seconds.count', + 'time_to_first_token.seconds.sum', +] + +METRICS_MOCK = [f'nvidia_nim.{m}' for m in METRICS_MOCK] diff --git a/nvidia_nim/tests/conftest.py b/nvidia_nim/tests/conftest.py new file mode 100644 index 0000000000000..07ddd62b31d33 --- /dev/null +++ b/nvidia_nim/tests/conftest.py @@ -0,0 +1,30 @@ +# (C) Datadog, Inc. 
2024-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) +import copy + +import pytest + +from datadog_checks.dev import docker_run +from datadog_checks.dev.conditions import CheckDockerLogs, CheckEndpoints + +from .common import COMPOSE_FILE, MOCKED_INSTANCE, MOCKED_VERSION_ENDPOINT + + +@pytest.fixture(scope='session') +def dd_environment(): + compose_file = COMPOSE_FILE + conditions = [ + CheckDockerLogs(identifier='caddy', patterns=['server running']), + CheckEndpoints(MOCKED_INSTANCE["openmetrics_endpoint"]), + CheckEndpoints(MOCKED_VERSION_ENDPOINT), + ] + with docker_run(compose_file, conditions=conditions): + yield { + 'instances': [MOCKED_INSTANCE], + } + + +@pytest.fixture +def instance(): + return copy.deepcopy(MOCKED_INSTANCE) diff --git a/nvidia_nim/tests/docker/Caddyfile b/nvidia_nim/tests/docker/Caddyfile new file mode 100644 index 0000000000000..3715320034cb3 --- /dev/null +++ b/nvidia_nim/tests/docker/Caddyfile @@ -0,0 +1,15 @@ +:8000 { + route /metrics { + rewrite * /metrics + file_server { + root /usr/share/caddy + } + } + + route /v1/version { + rewrite * /version + file_server { + root /usr/share/caddy + } + } +} \ No newline at end of file diff --git a/nvidia_nim/tests/docker/docker-compose.yaml b/nvidia_nim/tests/docker/docker-compose.yaml new file mode 100644 index 0000000000000..89fae66a27a87 --- /dev/null +++ b/nvidia_nim/tests/docker/docker-compose.yaml @@ -0,0 +1,11 @@ +version: "3.9" +services: + caddy: + image: caddy:2.7 + container_name: caddy + ports: + - "8000:8000" + volumes: + - ./Caddyfile:/etc/caddy/Caddyfile + - ../fixtures/nim_metrics.txt:/usr/share/caddy/metrics + - ../fixtures/nim_version.json:/usr/share/caddy/version \ No newline at end of file diff --git a/nvidia_nim/tests/fixtures/nim_metrics.txt b/nvidia_nim/tests/fixtures/nim_metrics.txt new file mode 100644 index 0000000000000..d503a454265d7 --- /dev/null +++ b/nvidia_nim/tests/fixtures/nim_metrics.txt @@ -0,0 +1,159 @@ +# HELP python_gc_objects_collected_total Objects collected during gc +# TYPE python_gc_objects_collected_total counter +python_gc_objects_collected_total{generation="0"} 12502.0 +python_gc_objects_collected_total{generation="1"} 5884.0 +python_gc_objects_collected_total{generation="2"} 1228.0 +# HELP python_gc_objects_uncollectable_total Uncollectable objects found during GC +# TYPE python_gc_objects_uncollectable_total counter +python_gc_objects_uncollectable_total{generation="0"} 0.0 +python_gc_objects_uncollectable_total{generation="1"} 0.0 +python_gc_objects_uncollectable_total{generation="2"} 0.0 +# HELP python_gc_collections_total Number of times this generation was collected +# TYPE python_gc_collections_total counter +python_gc_collections_total{generation="0"} 2991.0 +python_gc_collections_total{generation="1"} 271.0 +python_gc_collections_total{generation="2"} 13.0 +# HELP python_info Python platform information +# TYPE python_info gauge +python_info{implementation="CPython",major="3",minor="10",patchlevel="12",version="3.10.12"} 1.0 +# HELP process_virtual_memory_bytes Virtual memory size in bytes. +# TYPE process_virtual_memory_bytes gauge +process_virtual_memory_bytes 1.15891634176e+011 +# HELP process_resident_memory_bytes Resident memory size in bytes. +# TYPE process_resident_memory_bytes gauge +process_resident_memory_bytes 1.0463768576e+010 +# HELP process_start_time_seconds Start time of the process since unix epoch in seconds. 
+# TYPE process_start_time_seconds gauge +process_start_time_seconds 1.7303128549e+09 +# HELP process_cpu_seconds_total Total user and system CPU time spent in seconds. +# TYPE process_cpu_seconds_total counter +process_cpu_seconds_total 44.87 +# HELP process_open_fds Number of open file descriptors. +# TYPE process_open_fds gauge +process_open_fds 159.0 +# HELP process_max_fds Maximum number of open file descriptors. +# TYPE process_max_fds gauge +process_max_fds 1.048576e+06 +# HELP num_requests_running Number of requests currently running on GPU. +# TYPE num_requests_running gauge +num_requests_running{model_name="meta/llama-3.1-8b-instruct"} 1.0 +# HELP num_requests_waiting Number of requests waiting to be processed. +# TYPE num_requests_waiting gauge +num_requests_waiting{model_name="meta/llama-3.1-8b-instruct"} 0.0 +# HELP num_request_max Max number of concurrently running requests. +# TYPE num_request_max gauge +num_request_max{model_name="meta/llama-3.1-8b-instruct"} 64.0 +# HELP gpu_cache_usage_perc GPU KV-cache usage. 1 means 100 percent usage. +# TYPE gpu_cache_usage_perc gauge +gpu_cache_usage_perc{model_name="meta/llama-3.1-8b-instruct"} 0.0002848191398461977 +# HELP prompt_tokens_total Number of prefill tokens processed. +# TYPE prompt_tokens_total counter +prompt_tokens_total{model_name="meta/llama-3.1-8b-instruct"} 109.0 +# HELP generation_tokens_total Number of generation tokens processed. +# TYPE generation_tokens_total counter +generation_tokens_total{model_name="meta/llama-3.1-8b-instruct"} 174.0 +# HELP time_to_first_token_seconds Histogram of time to first token in seconds. +# TYPE time_to_first_token_seconds histogram +time_to_first_token_seconds_bucket{le="0.001",model_name="meta/llama-3.1-8b-instruct"} 0.0 +time_to_first_token_seconds_bucket{le="0.005",model_name="meta/llama-3.1-8b-instruct"} 0.0 +time_to_first_token_seconds_bucket{le="0.01",model_name="meta/llama-3.1-8b-instruct"} 0.0 +time_to_first_token_seconds_bucket{le="0.02",model_name="meta/llama-3.1-8b-instruct"} 2.0 +time_to_first_token_seconds_bucket{le="0.04",model_name="meta/llama-3.1-8b-instruct"} 4.0 +time_to_first_token_seconds_bucket{le="0.06",model_name="meta/llama-3.1-8b-instruct"} 4.0 +time_to_first_token_seconds_bucket{le="0.08",model_name="meta/llama-3.1-8b-instruct"} 4.0 +time_to_first_token_seconds_bucket{le="0.1",model_name="meta/llama-3.1-8b-instruct"} 4.0 +time_to_first_token_seconds_bucket{le="0.25",model_name="meta/llama-3.1-8b-instruct"} 4.0 +time_to_first_token_seconds_bucket{le="0.5",model_name="meta/llama-3.1-8b-instruct"} 4.0 +time_to_first_token_seconds_bucket{le="0.75",model_name="meta/llama-3.1-8b-instruct"} 5.0 +time_to_first_token_seconds_bucket{le="1.0",model_name="meta/llama-3.1-8b-instruct"} 5.0 +time_to_first_token_seconds_bucket{le="2.5",model_name="meta/llama-3.1-8b-instruct"} 5.0 +time_to_first_token_seconds_bucket{le="5.0",model_name="meta/llama-3.1-8b-instruct"} 5.0 +time_to_first_token_seconds_bucket{le="7.5",model_name="meta/llama-3.1-8b-instruct"} 5.0 +time_to_first_token_seconds_bucket{le="10.0",model_name="meta/llama-3.1-8b-instruct"} 5.0 +time_to_first_token_seconds_bucket{le="+Inf",model_name="meta/llama-3.1-8b-instruct"} 5.0 +time_to_first_token_seconds_count{model_name="meta/llama-3.1-8b-instruct"} 5.0 +time_to_first_token_seconds_sum{model_name="meta/llama-3.1-8b-instruct"} 0.6119842529296875 +# HELP time_per_output_token_seconds Histogram of time per output token in seconds. 
+# TYPE time_per_output_token_seconds histogram +time_per_output_token_seconds_bucket{le="0.01",model_name="meta/llama-3.1-8b-instruct"} 0.0 +time_per_output_token_seconds_bucket{le="0.025",model_name="meta/llama-3.1-8b-instruct"} 168.0 +time_per_output_token_seconds_bucket{le="0.05",model_name="meta/llama-3.1-8b-instruct"} 168.0 +time_per_output_token_seconds_bucket{le="0.075",model_name="meta/llama-3.1-8b-instruct"} 169.0 +time_per_output_token_seconds_bucket{le="0.1",model_name="meta/llama-3.1-8b-instruct"} 169.0 +time_per_output_token_seconds_bucket{le="0.15",model_name="meta/llama-3.1-8b-instruct"} 169.0 +time_per_output_token_seconds_bucket{le="0.2",model_name="meta/llama-3.1-8b-instruct"} 169.0 +time_per_output_token_seconds_bucket{le="0.3",model_name="meta/llama-3.1-8b-instruct"} 169.0 +time_per_output_token_seconds_bucket{le="0.4",model_name="meta/llama-3.1-8b-instruct"} 169.0 +time_per_output_token_seconds_bucket{le="0.5",model_name="meta/llama-3.1-8b-instruct"} 169.0 +time_per_output_token_seconds_bucket{le="0.75",model_name="meta/llama-3.1-8b-instruct"} 169.0 +time_per_output_token_seconds_bucket{le="1.0",model_name="meta/llama-3.1-8b-instruct"} 169.0 +time_per_output_token_seconds_bucket{le="2.5",model_name="meta/llama-3.1-8b-instruct"} 169.0 +time_per_output_token_seconds_bucket{le="+Inf",model_name="meta/llama-3.1-8b-instruct"} 169.0 +time_per_output_token_seconds_count{model_name="meta/llama-3.1-8b-instruct"} 169.0 +time_per_output_token_seconds_sum{model_name="meta/llama-3.1-8b-instruct"} 1.856855869293213 +# HELP e2e_request_latency_seconds Histogram of end to end request latency in seconds. +# TYPE e2e_request_latency_seconds histogram +e2e_request_latency_seconds_bucket{le="1.0",model_name="meta/llama-3.1-8b-instruct"} 5.0 +e2e_request_latency_seconds_bucket{le="2.5",model_name="meta/llama-3.1-8b-instruct"} 5.0 +e2e_request_latency_seconds_bucket{le="5.0",model_name="meta/llama-3.1-8b-instruct"} 5.0 +e2e_request_latency_seconds_bucket{le="10.0",model_name="meta/llama-3.1-8b-instruct"} 5.0 +e2e_request_latency_seconds_bucket{le="15.0",model_name="meta/llama-3.1-8b-instruct"} 5.0 +e2e_request_latency_seconds_bucket{le="20.0",model_name="meta/llama-3.1-8b-instruct"} 5.0 +e2e_request_latency_seconds_bucket{le="30.0",model_name="meta/llama-3.1-8b-instruct"} 5.0 +e2e_request_latency_seconds_bucket{le="40.0",model_name="meta/llama-3.1-8b-instruct"} 5.0 +e2e_request_latency_seconds_bucket{le="50.0",model_name="meta/llama-3.1-8b-instruct"} 5.0 +e2e_request_latency_seconds_bucket{le="60.0",model_name="meta/llama-3.1-8b-instruct"} 5.0 +e2e_request_latency_seconds_bucket{le="+Inf",model_name="meta/llama-3.1-8b-instruct"} 5.0 +e2e_request_latency_seconds_count{model_name="meta/llama-3.1-8b-instruct"} 5.0 +e2e_request_latency_seconds_sum{model_name="meta/llama-3.1-8b-instruct"} 2.4688401222229004 +# HELP request_prompt_tokens Number of prefill tokens processed. 
+# TYPE request_prompt_tokens histogram +request_prompt_tokens_bucket{le="1.0",model_name="meta/llama-3.1-8b-instruct"} 0.0 +request_prompt_tokens_bucket{le="2.0",model_name="meta/llama-3.1-8b-instruct"} 0.0 +request_prompt_tokens_bucket{le="5.0",model_name="meta/llama-3.1-8b-instruct"} 3.0 +request_prompt_tokens_bucket{le="10.0",model_name="meta/llama-3.1-8b-instruct"} 3.0 +request_prompt_tokens_bucket{le="20.0",model_name="meta/llama-3.1-8b-instruct"} 3.0 +request_prompt_tokens_bucket{le="50.0",model_name="meta/llama-3.1-8b-instruct"} 5.0 +request_prompt_tokens_bucket{le="100.0",model_name="meta/llama-3.1-8b-instruct"} 5.0 +request_prompt_tokens_bucket{le="200.0",model_name="meta/llama-3.1-8b-instruct"} 5.0 +request_prompt_tokens_bucket{le="500.0",model_name="meta/llama-3.1-8b-instruct"} 5.0 +request_prompt_tokens_bucket{le="1000.0",model_name="meta/llama-3.1-8b-instruct"} 5.0 +request_prompt_tokens_bucket{le="2000.0",model_name="meta/llama-3.1-8b-instruct"} 5.0 +request_prompt_tokens_bucket{le="5000.0",model_name="meta/llama-3.1-8b-instruct"} 5.0 +request_prompt_tokens_bucket{le="10000.0",model_name="meta/llama-3.1-8b-instruct"} 5.0 +request_prompt_tokens_bucket{le="20000.0",model_name="meta/llama-3.1-8b-instruct"} 5.0 +request_prompt_tokens_bucket{le="50000.0",model_name="meta/llama-3.1-8b-instruct"} 5.0 +request_prompt_tokens_bucket{le="100000.0",model_name="meta/llama-3.1-8b-instruct"} 5.0 +request_prompt_tokens_bucket{le="+Inf",model_name="meta/llama-3.1-8b-instruct"} 5.0 +request_prompt_tokens_count{model_name="meta/llama-3.1-8b-instruct"} 5.0 +request_prompt_tokens_sum{model_name="meta/llama-3.1-8b-instruct"} 109.0 +# HELP request_generation_tokens Number of generation tokens processed. +# TYPE request_generation_tokens histogram +request_generation_tokens_bucket{le="1.0",model_name="meta/llama-3.1-8b-instruct"} 0.0 +request_generation_tokens_bucket{le="2.0",model_name="meta/llama-3.1-8b-instruct"} 0.0 +request_generation_tokens_bucket{le="5.0",model_name="meta/llama-3.1-8b-instruct"} 0.0 +request_generation_tokens_bucket{le="10.0",model_name="meta/llama-3.1-8b-instruct"} 0.0 +request_generation_tokens_bucket{le="20.0",model_name="meta/llama-3.1-8b-instruct"} 3.0 +request_generation_tokens_bucket{le="50.0",model_name="meta/llama-3.1-8b-instruct"} 3.0 +request_generation_tokens_bucket{le="100.0",model_name="meta/llama-3.1-8b-instruct"} 5.0 +request_generation_tokens_bucket{le="200.0",model_name="meta/llama-3.1-8b-instruct"} 5.0 +request_generation_tokens_bucket{le="500.0",model_name="meta/llama-3.1-8b-instruct"} 5.0 +request_generation_tokens_bucket{le="1000.0",model_name="meta/llama-3.1-8b-instruct"} 5.0 +request_generation_tokens_bucket{le="2000.0",model_name="meta/llama-3.1-8b-instruct"} 5.0 +request_generation_tokens_bucket{le="5000.0",model_name="meta/llama-3.1-8b-instruct"} 5.0 +request_generation_tokens_bucket{le="10000.0",model_name="meta/llama-3.1-8b-instruct"} 5.0 +request_generation_tokens_bucket{le="20000.0",model_name="meta/llama-3.1-8b-instruct"} 5.0 +request_generation_tokens_bucket{le="50000.0",model_name="meta/llama-3.1-8b-instruct"} 5.0 +request_generation_tokens_bucket{le="100000.0",model_name="meta/llama-3.1-8b-instruct"} 5.0 +request_generation_tokens_bucket{le="+Inf",model_name="meta/llama-3.1-8b-instruct"} 5.0 +request_generation_tokens_count{model_name="meta/llama-3.1-8b-instruct"} 5.0 +request_generation_tokens_sum{model_name="meta/llama-3.1-8b-instruct"} 174.0 +# HELP request_finish_total Count of finished requests, differentiated by finish reason as 
label.
+# TYPE request_finish_total counter
+request_finish_total{finished_reason="length",model_name="meta/llama-3.1-8b-instruct"} 5.0
+# HELP request_success_total Count of successful requests.
+# TYPE request_success_total counter
+request_success_total{model_name="meta/llama-3.1-8b-instruct"} 5.0
+# HELP request_failure_total Count of failed requests.
+# TYPE request_failure_total counter
+request_failure_total{model_name="meta/llama-3.1-8b-instruct"} 0.0
\ No newline at end of file
diff --git a/nvidia_nim/tests/fixtures/nim_version.json b/nvidia_nim/tests/fixtures/nim_version.json
new file mode 100644
index 0000000000000..697168b8dabdd
--- /dev/null
+++ b/nvidia_nim/tests/fixtures/nim_version.json
@@ -0,0 +1 @@
+{"release":"1.0.0","api":"1.0.0"}
\ No newline at end of file
diff --git a/nvidia_nim/tests/test_e2e.py b/nvidia_nim/tests/test_e2e.py
new file mode 100644
index 0000000000000..9151982259dca
--- /dev/null
+++ b/nvidia_nim/tests/test_e2e.py
@@ -0,0 +1,11 @@
+# (C) Datadog, Inc. 2024-present
+# All rights reserved
+# Licensed under a 3-clause BSD style license (see LICENSE)
+from datadog_checks.base.constants import ServiceCheck
+from datadog_checks.dev.utils import assert_service_checks
+
+
+def test_check_nvidia_nim_e2e(dd_agent_check, instance):
+    aggregator = dd_agent_check(instance, rate=True)
+    aggregator.assert_service_check('nvidia_nim.openmetrics.health', ServiceCheck.OK, count=2)
+    assert_service_checks(aggregator)
diff --git a/nvidia_nim/tests/test_unit.py b/nvidia_nim/tests/test_unit.py
new file mode 100644
index 0000000000000..980580e11fab3
--- /dev/null
+++ b/nvidia_nim/tests/test_unit.py
@@ -0,0 +1,61 @@
+# (C) Datadog, Inc. 2024-present
+# All rights reserved
+# Licensed under a 3-clause BSD style license (see LICENSE)
+
+from unittest import mock
+
+import pytest
+
+from datadog_checks.base.constants import ServiceCheck
+from datadog_checks.dev.http import MockResponse
+from datadog_checks.dev.utils import get_metadata_metrics
+from datadog_checks.nvidia_nim import NvidiaNIMCheck
+
+from .common import METRICS_MOCK, get_fixture_path
+
+
+def test_check_nvidia_nim(dd_run_check, aggregator, datadog_agent, instance):
+    check = NvidiaNIMCheck("nvidia_nim", {}, [instance])
+    check.check_id = "test:123"
+    with mock.patch(
+        'requests.get',
+        side_effect=[
+            MockResponse(file_path=get_fixture_path("nim_metrics.txt")),
+            MockResponse(file_path=get_fixture_path("nim_version.json")),
+        ],
+    ):
+        dd_run_check(check)
+
+    for metric in METRICS_MOCK:
+        aggregator.assert_metric(metric)
+        aggregator.assert_metric_has_tag(metric, "test:test")
+
+    aggregator.assert_all_metrics_covered()
+    aggregator.assert_metrics_using_metadata(get_metadata_metrics())
+    aggregator.assert_service_check("nvidia_nim.openmetrics.health", ServiceCheck.OK)
+
+    raw_version = "1.0.0"
+    major, minor, patch = raw_version.split(".")
+    version_metadata = {
+        "version.scheme": "semver",
+        "version.major": major,
+        "version.minor": minor,
+        "version.patch": patch,
+        "version.raw": raw_version,
+    }
+    datadog_agent.assert_metadata("test:123", version_metadata)
+
+
+def test_emits_critical_openmetrics_service_check_when_service_is_down(
+    dd_run_check, aggregator, instance, mock_http_response
+):
+    """
+    If we fail to reach the openmetrics endpoint, the openmetrics service check should report as critical.
+    """
+    mock_http_response(status_code=404)
+    check = NvidiaNIMCheck("nvidia_nim", {}, [instance])
+    with pytest.raises(Exception, match="requests.exceptions.HTTPError"):
+        dd_run_check(check)
+
+    aggregator.assert_all_metrics_covered()
+    aggregator.assert_service_check("nvidia_nim.openmetrics.health", ServiceCheck.CRITICAL)
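
A quick way to sanity-check the mapping in metrics.py against a live or mocked NIM endpoint is sketched below. This is a minimal illustration, not part of the patch: the endpoint URL reuses the `http://localhost:8000/metrics` default from the example config and the docker/Caddy fixture above, only a small subset of METRIC_MAP is copied in, and the `_total` handling follows the note in conf.yaml.example about counter suffixes.

# Minimal sketch (not part of the integration): report which metric families from the
# check's METRIC_MAP a NIM /metrics endpoint actually exposes. The endpoint URL and the
# `_total` counter suffix convention come from the patch; the rest is illustrative only.
import urllib.request

# Illustrative subset of the mapping; the full map lives in
# nvidia_nim/datadog_checks/nvidia_nim/metrics.py.
METRIC_MAP = {
    'num_requests_running': 'num_requests.running',
    'gpu_cache_usage_perc': 'gpu_cache_usage_percent',
    'prompt_tokens': 'prompt_tokens',
    'time_to_first_token_seconds': 'time_to_first_token.seconds',
}


def exposed_families(endpoint='http://localhost:8000/metrics'):
    """Return the set of metric family names advertised by the endpoint's `# TYPE` lines."""
    with urllib.request.urlopen(endpoint) as resp:
        text = resp.read().decode('utf-8')
    families = set()
    for line in text.splitlines():
        if line.startswith('# TYPE '):
            families.add(line.split()[2])
    return families


if __name__ == '__main__':
    families = exposed_families()
    for exposed, remapped in METRIC_MAP.items():
        # Counters are exposed with a `_total` suffix but configured without it.
        present = exposed in families or f'{exposed}_total' in families
        print(f'{exposed:40} -> nvidia_nim.{remapped:35} {"found" if present else "missing"}')

Running this against the Caddy fixture (bring up nvidia_nim/tests/docker with docker compose) should report each listed family as found, since the fixture file nim_metrics.txt contains all of them.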