From 35c6db37776cf17033df66afa8140b45bdf213d2 Mon Sep 17 00:00:00 2001 From: dkirov-dd <166512750+dkirov-dd@users.noreply.github.com> Date: Thu, 19 Dec 2024 14:42:16 +0100 Subject: [PATCH] [LOI-341] Add Neuron error logs parser (#19082) * Add Neuron error logs parser * Switch number to integer in parser * Add test results * Fix source --- aws_neuron/assets/logs/aws_neuron.yaml | 37 +++++++++++++++----- aws_neuron/assets/logs/aws_neuron_tests.yaml | 24 +++++++++++++ 2 files changed, 53 insertions(+), 8 deletions(-) diff --git a/aws_neuron/assets/logs/aws_neuron.yaml b/aws_neuron/assets/logs/aws_neuron.yaml index a6699c424ce35..165a55e1e2a3c 100644 --- a/aws_neuron/assets/logs/aws_neuron.yaml +++ b/aws_neuron/assets/logs/aws_neuron.yaml @@ -7,29 +7,45 @@ pipeline: name: AWS Neuron enabled: true filter: - query: 'source:aws_neuron' + query: "source:aws_neuron" processors: - type: grok-parser name: Operator compilation enabled: true source: message samples: - - 'INFO:Neuron: => aten::Int: 96' - - 'INFO:Neuron: => aten::Int: 1 [supported]' - - 'INFO:Neuron: => aten::embedding: 3 [not supported]' + - "INFO:Neuron: => aten::Int: 96" + - "INFO:Neuron: => aten::Int: 1 [supported]" + - "INFO:Neuron: => aten::embedding: 3 [not supported]" grok: supportRules: operator %{word:operator.library}::%{word:operator.type} - matchRules: 'operator_rule INFO:Neuron: => %{operator}: %{integer:count}( \[%{data:not_compiled_msg}\])?' + matchRules: "operator_rule INFO:Neuron: => %{operator}: %{integer:count}( + \\[%{data:not_compiled_msg}\\])?" - type: grok-parser name: Message separation enabled: true source: message samples: - - INFO:Neuron:Number of arithmetic operators (pre-compilation) before = 565, fused = 548, percent fused = 96.99% - - 'INFO:Neuron: => aten::layer_norm: 25' + - INFO:Neuron:Number of arithmetic operators (pre-compilation) before = + 565, fused = 548, percent fused = 96.99% + - "INFO:Neuron: => aten::layer_norm: 25" + - > + 2024-11-15 10:38:24.000103: 4938 ERROR ||NEURON_CC_WRAPPER||: + Compilation failed for + /tmp/ubuntu/neuroncc_compile_workdir/cf6cf570-d889-4a0c-a821-719e225d9bc8/model.MODULE_16150394314145281873+d7517139.hlo_module.pb + after 0 retries. + - > + 2024-Nov-15 + 13:35:03.0879 6475:6475 ERROR NRT:nrt_allocate_neuron_cores NeuronCore(s) + not available - Requested:16 Available:4 grok: supportRules: "" - matchRules: message_rule %{word:level}:Neuron:( => )?%{data:msg} + matchRules: >- + info_rule %{word:level}:Neuron:( => )?%{data:msg} + + error_rule_1 %{date("yyyy-MM-dd' 'HH:mm:ss.SSSSSS"):date}: %{integer:pid} %{word:level} %{data:msg} + + error_rule_2 %{date("yyyy-MMM-dd' 'HH:mm:ss.SSSS"):date} %{integer:pid}:%{integer:tid} %{word:level} %{data:msg} - type: message-remapper name: Define `msg` as the official message of the log enabled: true @@ -40,3 +56,8 @@ pipeline: enabled: true sources: - level + - type: status-remapper + name: Define `level` as the official status of the log + enabled: true + sources: + - level diff --git a/aws_neuron/assets/logs/aws_neuron_tests.yaml b/aws_neuron/assets/logs/aws_neuron_tests.yaml index 9c88b85826efc..14f9b411f27db 100644 --- a/aws_neuron/assets/logs/aws_neuron_tests.yaml +++ b/aws_neuron/assets/logs/aws_neuron_tests.yaml @@ -36,3 +36,27 @@ tests: status: "info" tags: - "source:LOGS_SOURCE" + - + sample: "2024-11-15 10:38:24.000103: 4938 ERROR ||NEURON_CC_WRAPPER||: Compilation failed for /tmp/ubuntu/neuroncc_compile_workdir/cf6cf570-d889-4a0c-a821-719e225d9bc8/model.MODULE_16150394314145281873+d7517139.hlo_module.pb after 0 retries." + result: + custom: + date: 1731667104000 + level: "ERROR" + pid: 4938 + message: "||NEURON_CC_WRAPPER||: Compilation failed for /tmp/ubuntu/neuroncc_compile_workdir/cf6cf570-d889-4a0c-a821-719e225d9bc8/model.MODULE_16150394314145281873+d7517139.hlo_module.pb after 0 retries." + status: "error" + tags: + - "source:LOGS_SOURCE" + - + sample: "2024-Nov-15 13:35:03.0879 6475:6475 ERROR NRT:nrt_allocate_neuron_cores NeuronCore(s) not available - Requested:16 Available:4" + result: + custom: + date: 1731677703087 + level: "ERROR" + pid: 6475 + tid: 6475 + message: " NRT:nrt_allocate_neuron_cores NeuronCore(s) not available - Requested:16 Available:4" + status: "error" + tags: + - "source:LOGS_SOURCE" +