diff --git a/CHANGELOG.md b/CHANGELOG.md index c25db7fb65..6d26aedd41 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,10 @@ Previously, when a customer installed the Ruby agent via [Kubernetes APM auto-attach](https://docs.newrelic.com/docs/kubernetes-pixie/kubernetes-integration/installation/k8s-agent-operator/) and also had the Ruby agent listed in their `Gemfile`, the agent version in `Gemfile` would take precedence. Now, the agent version installed by auto-attach takes priority. [PR#3018](https://github.com/newrelic/newrelic-ruby-agent/pull/3018) +- **Feature: Add health checks when the agent runs within Agent Control** + + When the agent is started within an [Agent Control](https://docs-preview.newrelic.com/docs/new-relic-agent-control) environment, a health check file will be created at the configured file location for every agent process. By default, this location is: '/newrelic/apm/health'. The health check files will be updated at the configured frequency, which defaults to every five seconds. [PR#2995](https://github.com/newrelic/newrelic-ruby-agent/pull/2995) + - **Bugfix: Stop emitting inaccurate debug-level log about deprecated configuration options** In the previous major release, we dropped support for `disable_` configuration options in favor of `instrumentation.`. Previously, a DEBUG level log warning appeared whenever `disable_*` options were set to `true`, even for libraries (e.g. Action Dispatch) without equivalent `instrumentation.*` options: diff --git a/lib/new_relic/agent/agent.rb b/lib/new_relic/agent/agent.rb index 6c22862f80..7e80a130dc 100644 --- a/lib/new_relic/agent/agent.rb +++ b/lib/new_relic/agent/agent.rb @@ -13,6 +13,7 @@ require 'new_relic/coerce' require 'new_relic/agent/autostart' require 'new_relic/agent/harvester' +require 'new_relic/agent/health_check' require 'new_relic/agent/hostname' require 'new_relic/agent/new_relic_service' require 'new_relic/agent/pipe_service' @@ -88,6 +89,7 @@ def init_basics end def init_components + @health_check = HealthCheck.new @service = NewRelicService.new @events = EventListener.new @stats_engine = StatsEngine.new @@ -139,6 +141,8 @@ def instance # Holds all the methods defined on NewRelic::Agent::Agent # instances module InstanceMethods + # the agent control health check file generator + attr_reader :health_check # the statistics engine that holds all the timeslice data attr_reader :stats_engine # the transaction sampler that handles recording transactions diff --git a/lib/new_relic/agent/agent_helpers/connect.rb b/lib/new_relic/agent/agent_helpers/connect.rb index 7cae31c2bb..09277d8fbc 100644 --- a/lib/new_relic/agent/agent_helpers/connect.rb +++ b/lib/new_relic/agent/agent_helpers/connect.rb @@ -68,6 +68,7 @@ def log_error(error) def handle_license_error(error) ::NewRelic::Agent.logger.error(error.message, 'Visit newrelic.com to obtain a valid license key, or to upgrade your account.') + NewRelic::Agent.agent.health_check.update_status(NewRelic::Agent::HealthCheck::INVALID_LICENSE_KEY) disconnect end @@ -191,6 +192,7 @@ def connect(options = {}) @connected_pid = $$ @connect_state = :connected signal_connected + NewRelic::Agent.agent.health_check.update_status(NewRelic::Agent::HealthCheck::HEALTHY) rescue NewRelic::Agent::ForceDisconnectException => e handle_force_disconnect(e) rescue NewRelic::Agent::LicenseException => e @@ -198,6 +200,7 @@ def connect(options = {}) rescue NewRelic::Agent::UnrecoverableAgentException => e handle_unrecoverable_agent_error(e) rescue StandardError, Timeout::Error, NewRelic::Agent::ServerConnectionException => e + NewRelic::Agent.agent.health_check.update_status(NewRelic::Agent::HealthCheck::FAILED_TO_CONNECT) retry if retry_from_error?(e, opts) rescue Exception => e ::NewRelic::Agent.logger.error('Exception of unexpected type during Agent#connect():', e) diff --git a/lib/new_relic/agent/agent_helpers/harvest.rb b/lib/new_relic/agent/agent_helpers/harvest.rb index af5596ad21..893d8e1778 100644 --- a/lib/new_relic/agent/agent_helpers/harvest.rb +++ b/lib/new_relic/agent/agent_helpers/harvest.rb @@ -119,6 +119,7 @@ def send_data_to_endpoint(endpoint, payload, container) rescue UnrecoverableServerException => e NewRelic::Agent.logger.warn("#{endpoint} data was rejected by remote service, discarding. Error: ", e) rescue ServerConnectionException => e + NewRelic::Agent.agent.health_check.update_status(NewRelic::Agent::HealthCheck::FAILED_TO_CONNECT) log_remote_unavailable(endpoint, e) container.merge!(payload) rescue => e @@ -133,9 +134,11 @@ def check_for_and_handle_agent_commands rescue ForceRestartException, ForceDisconnectException raise rescue UnrecoverableServerException => e + NewRelic::Agent.agent.health_check.update_status(NewRelic::Agent::HealthCheck::FAILED_TO_CONNECT) NewRelic::Agent.logger.warn('get_agent_commands message was rejected by remote service, discarding. ' \ 'Error: ', e) rescue ServerConnectionException => e + NewRelic::Agent.health_check.update_status(NewRelic::Agent::HealthCheck::FAILED_TO_CONNECT) log_remote_unavailable(:get_agent_commands, e) rescue => e NewRelic::Agent.logger.info('Error during check_for_and_handle_agent_commands, will retry later: ', e) diff --git a/lib/new_relic/agent/agent_helpers/shutdown.rb b/lib/new_relic/agent/agent_helpers/shutdown.rb index 9bc36393bc..d9469065e8 100644 --- a/lib/new_relic/agent/agent_helpers/shutdown.rb +++ b/lib/new_relic/agent/agent_helpers/shutdown.rb @@ -19,6 +19,9 @@ def shutdown revert_to_default_configuration @started = nil + + NewRelic::Agent.agent.health_check.update_status(NewRelic::Agent::HealthCheck::SHUTDOWN) if NewRelic::Agent.agent.health_check.healthy? + Control.reset end diff --git a/lib/new_relic/agent/agent_helpers/start_worker_thread.rb b/lib/new_relic/agent/agent_helpers/start_worker_thread.rb index 006d492967..e10a020b2b 100644 --- a/lib/new_relic/agent/agent_helpers/start_worker_thread.rb +++ b/lib/new_relic/agent/agent_helpers/start_worker_thread.rb @@ -86,6 +86,7 @@ def handle_force_restart(error) # is the worker thread that gathers data and talks to the # server. def handle_force_disconnect(error) + NewRelic::Agent.agent.health_check.update_status(NewRelic::Agent::HealthCheck::FORCED_DISCONNECT) ::NewRelic::Agent.logger.warn('Agent received a ForceDisconnectException from the server, disconnecting. ' \ "(#{error.message})") disconnect diff --git a/lib/new_relic/agent/agent_helpers/startup.rb b/lib/new_relic/agent/agent_helpers/startup.rb index f683a1e917..4fe4828daf 100644 --- a/lib/new_relic/agent/agent_helpers/startup.rb +++ b/lib/new_relic/agent/agent_helpers/startup.rb @@ -36,6 +36,9 @@ def start # setting up the worker thread and the exit handler to shut # down the agent def check_config_and_start_agent + # some health statuses, such as invalid license key, are ran before + # the agent officially starts + @health_check.create_and_run_health_check_loop return unless monitoring? && has_correct_license_key? return if using_forking_dispatcher? @@ -129,6 +132,7 @@ def monitoring? if Agent.config[:monitor_mode] true else + NewRelic::Agent.agent.health_check.update_status(NewRelic::Agent::HealthCheck::AGENT_DISABLED) ::NewRelic::Agent.logger.warn('Agent configured not to send data in this environment.') false end @@ -140,6 +144,7 @@ def has_license_key? if Agent.config[:license_key] && Agent.config[:license_key].length > 0 true else + NewRelic::Agent.agent.health_check.update_status(NewRelic::Agent::HealthCheck::MISSING_LICENSE_KEY) ::NewRelic::Agent.logger.warn('No license key found. ' + 'This often means your newrelic.yml file was not found, or it lacks a section for the running ' \ "environment, '#{NewRelic::Control.instance.env}'. You may also want to try linting your newrelic.yml " \ @@ -160,6 +165,7 @@ def correct_license_length if key.length == 40 true else + NewRelic::Agent.agent.health_check.update_status(NewRelic::Agent::HealthCheck::INVALID_LICENSE_KEY) ::NewRelic::Agent.logger.error("Invalid license key: #{key}") false end @@ -180,6 +186,7 @@ def agent_should_start? end unless app_name_configured? + NewRelic::Agent.agent.health_check.update_status(NewRelic::Agent::HealthCheck::MISSING_APP_NAME) NewRelic::Agent.logger.error('No application name configured.', 'The agent cannot start without at least one. Please check your ', 'newrelic.yml and ensure that it is valid and has at least one ', diff --git a/lib/new_relic/agent/configuration/default_source.rb b/lib/new_relic/agent/configuration/default_source.rb index 86e84594af..071fe4f5fa 100644 --- a/lib/new_relic/agent/configuration/default_source.rb +++ b/lib/new_relic/agent/configuration/default_source.rb @@ -2188,6 +2188,28 @@ def self.notify :transform => DefaultSource.method(:convert_to_constant_list), :description => 'Specify a list of exceptions you do not want the agent to strip when [strip_exception_messages](#strip_exception_messages-enabled) is `true`. Separate exceptions with a comma. For example, `"ImportantException,PreserveMessageException"`.' }, + # Agent Control + :'agent_control.enabled' => { + :default => false, + :public => false, + :type => Boolean, + :allowed_from_server => false, + :description => 'Boolean value that denotes whether Agent Control functionality should be enabled. At the moment, this functionality is limited to whether agent health should be reported. This configuration will be set using an environment variable by Agent Control, or one of its components, prior to agent startup.' + }, + :'agent_control.health.delivery_location' => { + :default => '/newrelic/apm/health', + :public => false, + :type => String, + :allowed_from_server => false, + :description => 'A `file:` URI that specifies the fully qualified directory path for health file(s) to be written to. This defaults to: `file:///newrelic/apm/health`. This configuration will be set using an environment variable by Agent Control, or one of its components, prior to agent startup.' + }, + :'agent_control.health.frequency' => { + :default => 5, + :public => false, + :type => Integer, + :allowed_from_server => false, + :description => 'The interval, in seconds, of how often the health file(s) will be written to. This configuration will be set using an environment variable by Agent Control, or one of its components, prior to agent startup.' + }, # Thread profiler :'thread_profiler.enabled' => { :default => DefaultSource.thread_profiler_enabled, diff --git a/lib/new_relic/agent/configuration/yaml_source.rb b/lib/new_relic/agent/configuration/yaml_source.rb index 574d29ef83..ca9b8b212d 100644 --- a/lib/new_relic/agent/configuration/yaml_source.rb +++ b/lib/new_relic/agent/configuration/yaml_source.rb @@ -36,6 +36,7 @@ def initialize(path, env) erb_file = process_erb(raw_file) config = process_yaml(erb_file, env, config, @file_path) rescue ScriptError, StandardError => e + NewRelic::Agent.agent.health_check.update_status(NewRelic::Agent::HealthCheck::FAILED_TO_PARSE_CONFIG) log_failure("Failed to read or parse configuration file at #{path}", e) end @@ -99,6 +100,7 @@ def process_erb(file) file.gsub!(/^\s*#.*$/, '#') ERB.new(file).result(binding) rescue ScriptError, StandardError => e + NewRelic::Agent.agent.health_check.update_status(NewRelic::Agent::HealthCheck::FAILED_TO_PARSE_CONFIG) message = 'Failed ERB processing configuration file. This is typically caused by a Ruby error in <% %> templating blocks in your newrelic.yml file.' failure_array = [message, e] failure_array << e.backtrace[0] if Gem::Version.new(RUBY_VERSION) >= Gem::Version.new('3.4.0') diff --git a/lib/new_relic/agent/health_check.rb b/lib/new_relic/agent/health_check.rb new file mode 100644 index 0000000000..73bb9bd7c7 --- /dev/null +++ b/lib/new_relic/agent/health_check.rb @@ -0,0 +1,136 @@ +# This file is distributed under New Relic's license terms. +# See https://github.com/newrelic/newrelic-ruby-agent/blob/main/LICENSE for complete details. +# frozen_string_literal: true + +module NewRelic + module Agent + class HealthCheck + def initialize + @start_time = nano_time + @continue = true + @status = HEALTHY + # the following assignments may set @continue = false if they are invalid + set_enabled + set_delivery_location + set_frequency + end + + HEALTHY = {healthy: true, last_error: 'NR-APM-000', message: 'Healthy'}.freeze + INVALID_LICENSE_KEY = {healthy: false, last_error: 'NR-APM-001', message: 'Invalid license key (HTTP status code 401)'}.freeze + MISSING_LICENSE_KEY = {healthy: false, last_error: 'NR-APM-002', message: 'License key missing in configuration'}.freeze + FORCED_DISCONNECT = {healthy: false, last_error: 'NR-APM-003', message: 'Forced disconnect received from New Relic (HTTP status code 410)'}.freeze + HTTP_ERROR = {healthy: false, last_error: 'NR-APM-004', message: 'HTTP error response code [%s] recevied from New Relic while sending data type [%s]'}.freeze + MISSING_APP_NAME = {healthy: false, last_error: 'NR-APM-005', message: 'Missing application name in agent configuration'}.freeze + APP_NAME_EXCEEDED = {healthy: false, last_error: 'NR-APM-006', message: 'The maximum number of configured app names (3) exceeded'}.freeze + PROXY_CONFIG_ERROR = {healthy: false, last_error: 'NR-APM-007', message: 'HTTP Proxy configuration error; response code [%s]'}.freeze + AGENT_DISABLED = {healthy: false, last_error: 'NR-APM-008', message: 'Agent is disabled via configuration'}.freeze + FAILED_TO_CONNECT = {healthy: false, last_error: 'NR-APM-009', message: 'Failed to connect to New Relic data collector'}.freeze + FAILED_TO_PARSE_CONFIG = {healthy: false, last_error: 'NR-APM-010', message: 'Agent config file is not able to be parsed'}.freeze + SHUTDOWN = {healthy: true, last_error: 'NR-APM-099', message: 'Agent has shutdown'}.freeze + + def create_and_run_health_check_loop + return unless health_checks_enabled? && @continue + + NewRelic::Agent.logger.debug('Agent Control health check conditions met. Starting health checks.') + NewRelic::Agent.record_metric('Supportability/AgentControl/Health/enabled', 1) + + Thread.new do + while @continue + begin + sleep @frequency + write_file + @continue = false if @status == SHUTDOWN + rescue StandardError => e + NewRelic::Agent.logger.error("Aborting Agent Control health check. Error raised: #{e}") + @continue = false + end + end + end + end + + def update_status(status, options = []) + return unless @continue + + @status = status.dup + update_message(options) unless options.empty? + end + + def healthy? + @status == HEALTHY + end + + private + + def set_enabled + @enabled = if ENV['NEW_RELIC_AGENT_CONTROL_ENABLED'] == 'true' + true + else + NewRelic::Agent.logger.debug('NEW_RELIC_AGENT_CONTROL_ENABLED not true, disabling health checks') + @continue = false + false + end + end + + def set_delivery_location + @delivery_location = if ENV['NEW_RELIC_AGENT_CONTROL_HEALTH_DELIVERY_LOCATION'] + # The spec states file paths for the delivery location will begin with file:// + # This does not create a valid path in Ruby, so remove the prefix when present + ENV['NEW_RELIC_AGENT_CONTROL_HEALTH_DELIVERY_LOCATION']&.gsub('file://', '') + else + # The spec default is 'file:///newrelic/apm/health', but since we're just going to remove it anyway... + '/newrelic/apm/health' + end + end + + def set_frequency + @frequency = ENV['NEW_RELIC_AGENT_CONTROL_HEALTH_FREQUENCY'] ? ENV['NEW_RELIC_AGENT_CONTROL_HEALTH_FREQUENCY'].to_i : 5 + + if @frequency <= 0 + NewRelic::Agent.logger.debug('NEW_RELIC_AGENT_CONTROL_HEALTH_FREQUENCY zero or less, disabling health checks') + @continue = false + end + end + + def contents + <<~CONTENTS + healthy: #{@status[:healthy]} + status: #{@status[:message]}#{last_error} + start_time_unix_nano: #{@start_time} + status_time_unix_nano: #{nano_time} + CONTENTS + end + + def last_error + @status[:healthy] ? '' : "\nlast_error: #{@status[:last_error]}" + end + + def nano_time + Process.clock_gettime(Process::CLOCK_REALTIME, :nanosecond) + end + + def file_name + "health-#{NewRelic::Agent::GuidGenerator.generate_guid(32)}.yml" + end + + def write_file + @file ||= "#{@delivery_location}/#{file_name}" + + File.write(@file, contents) + rescue StandardError => e + NewRelic::Agent.logger.error("Agent Control health check raised an error while writing a file: #{e}") + @continue = false + end + + def health_checks_enabled? + @enabled && @delivery_location && @frequency > 0 + end + + def update_message(options) + @status[:message] = sprintf(@status[:message], *options) + rescue StandardError => e + NewRelic::Agent.logger.debug("Error raised while updating Agent Control health check message: #{e}." \ + "options = #{options}, @status[:message] = #{@status[:message]}") + end + end + end +end diff --git a/lib/new_relic/agent/new_relic_service.rb b/lib/new_relic/agent/new_relic_service.rb index 467188d455..ec99e98902 100644 --- a/lib/new_relic/agent/new_relic_service.rb +++ b/lib/new_relic/agent/new_relic_service.rb @@ -455,6 +455,8 @@ def attempt_request(request, opts) end def handle_error_response(response, endpoint) + NewRelic::Agent.agent.health_check.update_status(NewRelic::Agent::HealthCheck::HTTP_ERROR, [response.code, endpoint]) + case response when Net::HTTPRequestTimeOut, Net::HTTPTooManyRequests, @@ -637,9 +639,13 @@ def check_post_size(post_string, endpoint) def send_request(opts) request = prep_request(opts) response = relay_request(request, opts) - return response if response.is_a?(Net::HTTPSuccess) || response.is_a?(Net::HTTPAccepted) - handle_error_response(response, opts[:endpoint]) + if response.is_a?(Net::HTTPSuccess) || response.is_a?(Net::HTTPAccepted) + NewRelic::Agent.agent.health_check.update_status(NewRelic::Agent::HealthCheck::HEALTHY) + response + else + handle_error_response(response, opts[:endpoint]) + end end def log_response(response) diff --git a/test/agent_helper.rb b/test/agent_helper.rb index ae7dfea555..4a32cae7b0 100644 --- a/test/agent_helper.rb +++ b/test/agent_helper.rb @@ -115,6 +115,13 @@ def assert_log_contains(log, message) "Could not find message: '#{message.inspect}'. Log contained: #{lines.join("\n")}" end +def refute_log_contains(log, message) + lines = log.array + + refute (lines.any? { |line| line.match(message) }), + "Found message: '#{message.inspect}'. Log contained: #{lines.join("\n")}" +end + def assert_audit_log_contains(audit_log_contents, needle) # Original request bodies dumped to the log have symbol keys, but once # they go through a dump/load, they're strings again, so we strip diff --git a/test/new_relic/agent/agent/start_test.rb b/test/new_relic/agent/agent/start_test.rb index 4c245066a3..c40cf94e6e 100644 --- a/test/new_relic/agent/agent/start_test.rb +++ b/test/new_relic/agent/agent/start_test.rb @@ -11,6 +11,7 @@ class NewRelic::Agent::Agent::StartTest < Minitest::Test def setup @harvester = stub('dummy harvester') + @health_check = stub('dummy health check') @harvest_samplers = stub('dummy sampler collection') end @@ -43,12 +44,38 @@ def test_disabled_negative def test_check_config_and_start_agent_disabled self.expects(:monitoring?).returns(false) + @health_check.expects(:create_and_run_health_check_loop) check_config_and_start_agent end + def test_monitoring_false_updates_health_status + with_config(:monitor_mode => false) do + # make sure the health checks are set up to run + NewRelic::Agent.agent.health_check.instance_variable_set(:@continue, true) + NewRelic::Agent.agent.health_check.expects(:update_status).with(NewRelic::Agent::HealthCheck::AGENT_DISABLED) + + monitoring? + end + end + + def test_missing_app_name_updates_health_status + with_config(:app_name => nil) do + NewRelic::Agent.agent.health_check.expects(:update_status).with(NewRelic::Agent::HealthCheck::MISSING_APP_NAME) + agent_should_start? + end + end + + def test_missing_license_key_updates_health_status + with_config(:license_key => nil) do + NewRelic::Agent.agent.health_check.expects(:update_status).with(NewRelic::Agent::HealthCheck::MISSING_LICENSE_KEY) + has_license_key? + end + end + def test_check_config_and_start_agent_incorrect_key self.expects(:monitoring?).returns(true) self.expects(:has_correct_license_key?).returns(false) + @health_check.expects(:create_and_run_health_check_loop) check_config_and_start_agent end @@ -56,12 +83,14 @@ def test_check_config_and_start_agent_forking self.expects(:monitoring?).returns(true) self.expects(:has_correct_license_key?).returns(true) self.expects(:using_forking_dispatcher?).returns(true) + @health_check.expects(:create_and_run_health_check_loop) check_config_and_start_agent end def test_check_config_and_start_agent_normal @harvester.expects(:mark_started) @harvest_samplers.expects(:load_samplers) + @health_check.expects(:create_and_run_health_check_loop) self.expects(:start_worker_thread) self.expects(:install_exit_handler) self.expects(:environment_for_connect) @@ -74,6 +103,7 @@ def test_check_config_and_start_agent_normal def test_check_config_and_start_agent_sync @harvester.expects(:mark_started) @harvest_samplers.expects(:load_samplers) + @health_check.expects(:create_and_run_health_check_loop) self.expects(:connect_in_foreground) self.expects(:start_worker_thread) self.expects(:install_exit_handler) @@ -187,6 +217,13 @@ def test_correct_license_length_negative end end + def test_correct_license_length_negative_updates_health_status + with_config(:license_key => 'a' * 30) do + NewRelic::Agent.agent.health_check.expects(:update_status).with(NewRelic::Agent::HealthCheck::INVALID_LICENSE_KEY) + correct_license_length + end + end + def test_using_forking_dispatcher_positive with_config(:dispatcher => :passenger) do assert_predicate self, :using_forking_dispatcher? diff --git a/test/new_relic/agent/agent_test.rb b/test/new_relic/agent/agent_test.rb index 05910c6511..b5fde3845b 100644 --- a/test/new_relic/agent/agent_test.rb +++ b/test/new_relic/agent/agent_test.rb @@ -335,6 +335,9 @@ def test_connect_retries_on_server_connection_exception @agent.send(:connect) assert_predicate(@agent, :connected?) + + # status should be healthy because the retry successfully connects + assert_equal NewRelic::Agent::HealthCheck::HEALTHY, @agent.health_check.instance_variable_get(:@status) end def test_connect_does_not_retry_if_keep_retrying_false @@ -342,6 +345,15 @@ def test_connect_does_not_retry_if_keep_retrying_false @agent.send(:connect, :keep_retrying => false) end + def test_agent_health_status_set_to_failed_to_connect + # stub a valid health check, by setting @continue = true + @agent.health_check.instance_variable_set(:@continue, true) + @agent.service.expects(:connect).once.raises(Timeout::Error) + @agent.send(:connect, :keep_retrying => false) + + assert_equal NewRelic::Agent::HealthCheck::FAILED_TO_CONNECT, @agent.health_check.instance_variable_get(:@status) + end + def test_connect_does_not_retry_on_license_error @agent.service.expects(:connect).raises(NewRelic::Agent::LicenseException) @agent.send(:connect) @@ -562,6 +574,16 @@ def test_defer_start_if_no_application_name_configured assert_match(/No application name configured/i, logmsg) end + def test_health_status_updated_if_no_app_name_configured + with_config(:app_name => false) do + # stub a valid health check, by setting @continue = true + @agent.health_check.instance_variable_set(:@continue, true) + @agent.start + end + + assert_equal NewRelic::Agent::HealthCheck::MISSING_APP_NAME, @agent.health_check.instance_variable_get(:@status) + end + def test_harvest_from_container container = mock harvested_items = %w[foo bar baz] @@ -750,6 +772,17 @@ def test_force_disconnect_logs_message refute_empty matching_lines, 'logs should say the agent is disconnecting' end + def test_force_disconnect_sets_health_status + @agent.instance_variable_set(:@service, nil) + @agent.stubs(:sleep) + error = NewRelic::Agent::ForceDisconnectException.new + # stub a valid health check, by setting @continue = true + @agent.health_check.instance_variable_set(:@continue, true) + @agent.handle_force_disconnect(error) + + assert_equal NewRelic::Agent::HealthCheck::FORCED_DISCONNECT, @agent.health_check.instance_variable_get(:@status) + end + def test_discarding_logs_message @agent.service.stubs(:send).raises(UnrecoverableServerException) @@ -801,6 +834,28 @@ def test_exit_connecting_worker_thread assert worker.join(1), 'Worker thread hang on shutdown' end + + def test_agent_health_status_set_to_shutdown_when_healthy + # stub a valid health check, by setting @continue = true + @agent.health_check.instance_variable_set(:@continue, true) + @agent.setup_and_start_agent + + assert_equal NewRelic::Agent::HealthCheck::HEALTHY, @agent.health_check.instance_variable_get(:@status) + + @agent.shutdown + + assert_equal NewRelic::Agent::HealthCheck::SHUTDOWN, @agent.health_check.instance_variable_get(:@status) + end + + def test_agent_health_status_when_not_healthy_is_same_after_shutdown + # stub a valid health check, by setting @continue = true + @agent.health_check.instance_variable_set(:@continue, true) + @agent.setup_and_start_agent + @agent.health_check.instance_variable_set(:@status, NewRelic::Agent::HealthCheck::INVALID_LICENSE_KEY) + @agent.shutdown + + assert_equal NewRelic::Agent::HealthCheck::INVALID_LICENSE_KEY, @agent.health_check.instance_variable_get(:@status) + end end class AgentStartingTest < Minitest::Test diff --git a/test/new_relic/agent/configuration/orphan_configuration_test.rb b/test/new_relic/agent/configuration/orphan_configuration_test.rb index d85827235d..1bbb756a32 100644 --- a/test/new_relic/agent/configuration/orphan_configuration_test.rb +++ b/test/new_relic/agent/configuration/orphan_configuration_test.rb @@ -9,7 +9,15 @@ class OrphanedConfigTest < Minitest::Test include NewRelic::TestHelpers::ConfigScanning # :automatic_custom_instrumentation_method_list - the tranform proc handles all processing, no other reference exists - IGNORED_KEYS = %i[automatic_custom_instrumentation_method_list] + # :'agent_control.enabled' - the config is set by environment variable in agent control, the symbol config is not used + # :'agent_control.health.delivery_location - the config is set by environment variable in agent control, the symbol config is not used + # :'agent_control.health.frequency' - the config is set by environment variable in agent control, the symbol config is not used + IGNORED_KEYS = %i[ + automatic_custom_instrumentation_method_list + agent_control.enabled + agent_control.health.delivery_location + agent_control.health.frequency + ] def setup @default_keys = ::NewRelic::Agent::Configuration::DEFAULTS.keys diff --git a/test/new_relic/agent/health_check_test.rb b/test/new_relic/agent/health_check_test.rb new file mode 100644 index 0000000000..cdf9bb093c --- /dev/null +++ b/test/new_relic/agent/health_check_test.rb @@ -0,0 +1,353 @@ +# This file is distributed under New Relic's license terms. +# See https://github.com/newrelic/newrelic-ruby-agent/blob/main/LICENSE for complete details. +# frozen_string_literal: true + +require 'fileutils' +require_relative '../../test_helper' + +class NewRelicHealthCheckTest < Minitest::Test + # example + # file name: health-bc21b5891f5e44fc9272caef924611a8.yml + # healthy: true + # status: Agent has shutdown + # last_error: NR-APM-099 + # status_time_unix_nano: 1724953624761000000 + # start_time_unix_nano: 1724953587605000000 + + def teardown + mocha_teardown + end + + def test_yaml_health_file_written_to_delivery_location + Dir.mkdir('health') + + with_environment('NEW_RELIC_AGENT_CONTROL_HEALTH_DELIVERY_LOCATION' => 'health/') do + NewRelic::Agent::GuidGenerator.stub(:generate_guid, 'abc123') do + health_check = NewRelic::Agent::HealthCheck.new + health_check.send(:write_file) + + assert File.directory?('health'), 'Directory not found' + assert File.exist?('health/health-abc123.yml'), 'File not found' # rubocop:disable Minitest/AssertPathExists + end + end + ensure + FileUtils.rm_rf('health') + end + + def test_yaml_health_file_written_to_delivery_location_with_file_path_prefix + Dir.mkdir('health') + + with_environment('NEW_RELIC_AGENT_CONTROL_HEALTH_DELIVERY_LOCATION' => 'file://health/') do + NewRelic::Agent::GuidGenerator.stub(:generate_guid, 'abc123') do + health_check = NewRelic::Agent::HealthCheck.new + health_check.send(:write_file) + + assert File.directory?('./health'), 'Directory not found' + assert File.exist?('./health/health-abc123.yml'), 'File not found' # rubocop:disable Minitest/AssertPathExists + end + end + ensure + FileUtils.rm_rf('health') + end + + def test_yaml_file_name_has_health_plus_uuid_without_hyphens + health_check = NewRelic::Agent::HealthCheck.new + + # ex: health-bc21b5891f5e44fc9272caef924611a8.yml + assert_match(/^health-[0-9a-f]{32}\.ya?ml$/, health_check.send(:file_name)) + end + + def test_write_file_called_on_interval + with_environment('NEW_RELIC_AGENT_CONTROL_HEALTH_FREQUENCY' => '3', + 'NEW_RELIC_AGENT_CONTROL_ENABLED' => 'true', + 'NEW_RELIC_AGENT_CONTROL_HEALTH_DELIVERY_LOCATION' => 'health/') do + health_check = NewRelic::Agent::HealthCheck.new + health_check.stub(:write_file, nil) do + health_check.expects(:sleep).with(3).times(3) + health_check.expects(:write_file).times(3).then.returns(nil).then.returns(nil).then.raises('whoa!') + health_check.create_and_run_health_check_loop.join + end + end + end + + def test_create_and_run_health_check_loop_exits_after_shutdown + with_environment('NEW_RELIC_AGENT_CONTROL_HEALTH_FREQUENCY' => '3', + 'NEW_RELIC_AGENT_CONTROL_ENABLED' => 'true', + 'NEW_RELIC_AGENT_CONTROL_HEALTH_DELIVERY_LOCATION' => 'health/') do + health_check = NewRelic::Agent::HealthCheck.new + health_check.stub(:write_file, nil) do + health_check.expects(:sleep).with(3).times(1) + health_check.expects(:write_file).times(1).then.returns(nil) + health_check.update_status(NewRelic::Agent::HealthCheck::SHUTDOWN) + health_check.create_and_run_health_check_loop.join + end + end + end + + def test_write_file_sets_continue_false_when_error + Dir.mkdir('health') + + with_environment('NEW_RELIC_AGENT_CONTROL_HEALTH_DELIVERY_LOCATION' => 'health/') do + NewRelic::Agent::GuidGenerator.stub(:generate_guid, 'abc123') do + File.stub(:write, ->(arg1, arg2) { raise 'boom!' }) do + health_check = NewRelic::Agent::HealthCheck.new + # stub a valid health check, by setting @continue = true + health_check.instance_variable_set(:@continue, true) + + health_check.send(:write_file) + + refute(health_check.instance_variable_get(:@continue)) + end + end + end + ensure + FileUtils.rm_rf('health') + end + + def test_frequency_defaults_to_five + # deliberately not setting the `NEW_RELIC_AGENT_CONTROL_HEALTH_FREQUENCY` env var + health_check = NewRelic::Agent::HealthCheck.new + + assert_equal 5, health_check.instance_variable_get(:@frequency) + end + + def test_yaml_file_has_healthy_field + Dir.mkdir('health') + + with_environment('NEW_RELIC_AGENT_CONTROL_HEALTH_DELIVERY_LOCATION' => 'health/') do + NewRelic::Agent::GuidGenerator.stub(:generate_guid, 'abc123') do + health_check = NewRelic::Agent::HealthCheck.new + health_check.send(:write_file) + + assert_predicate File.readlines('health/health-abc123.yml').grep(/healthy:/), :any? + end + end + ensure + FileUtils.rm_rf('health') + end + + def test_yaml_file_has_status_field + Dir.mkdir('health') + + with_environment('NEW_RELIC_AGENT_CONTROL_HEALTH_DELIVERY_LOCATION' => 'health/') do + NewRelic::Agent::GuidGenerator.stub(:generate_guid, 'abc123') do + health_check = NewRelic::Agent::HealthCheck.new + health_check.send(:write_file) + + assert_predicate File.readlines('health/health-abc123.yml').grep(/status:/), :any? + end + end + ensure + FileUtils.rm_rf('health') + end + + def test_yaml_file_has_last_error_field_when_status_not_healthy + Dir.mkdir('health') + + with_environment('NEW_RELIC_AGENT_CONTROL_HEALTH_DELIVERY_LOCATION' => 'health/') do + NewRelic::Agent::GuidGenerator.stub(:generate_guid, 'abc123') do + health_check = NewRelic::Agent::HealthCheck.new + # stub a valid health check, by setting @continue = true + health_check.instance_variable_set(:@continue, true) + health_check.update_status(NewRelic::Agent::HealthCheck::INVALID_LICENSE_KEY) + health_check.send(:write_file) + + assert_predicate File.readlines('health/health-abc123.yml').grep(/last_error:/), :any? + end + end + ensure + FileUtils.rm_rf('health') + end + + def test_yaml_file_does_not_have_last_error_field_when_status_healthy + Dir.mkdir('health') + + with_environment('NEW_RELIC_AGENT_CONTROL_HEALTH_DELIVERY_LOCATION' => 'health/') do + NewRelic::Agent::GuidGenerator.stub(:generate_guid, 'abc123') do + health_check = NewRelic::Agent::HealthCheck.new + health_check.update_status(NewRelic::Agent::HealthCheck::HEALTHY) + health_check.send(:write_file) + + refute_predicate File.readlines('health/health-abc123.yml').grep(/last_error:/), :any? + end + end + ensure + FileUtils.rm_rf('health') + end + + def test_nano_time_in_correct_format + health_check = NewRelic::Agent::HealthCheck.new + time = health_check.send(:nano_time) + + assert_instance_of(Integer, time) + assert(time.to_s.length >= 19) + end + + def test_yaml_file_has_same_start_time_unix_every_write + Dir.mkdir('health') + + with_environment('NEW_RELIC_AGENT_CONTROL_HEALTH_DELIVERY_LOCATION' => 'health/') do + NewRelic::Agent::GuidGenerator.stub(:generate_guid, '1') do + health_check = NewRelic::Agent::HealthCheck.new + start_time = health_check.instance_variable_get(:@start_time_unix_nano) + health_check.send(:write_file) + + assert_predicate File.readlines('health/health-1.yml').grep(/start_time_unix_nano: #{start_time}/), :any? + + health_check.send(:write_file) + + assert_predicate File.readlines('health/health-1.yml').grep(/start_time_unix_nano: #{start_time}/), :any? + end + end + ensure + FileUtils.rm_rf('health') + end + + def test_yaml_file_has_status_time_unix_nano + Dir.mkdir('health') + + with_environment('NEW_RELIC_AGENT_CONTROL_HEALTH_DELIVERY_LOCATION' => 'health/') do + NewRelic::Agent::GuidGenerator.stub(:generate_guid, 'abc123') do + health_check = NewRelic::Agent::HealthCheck.new + health_check.send(:write_file) + + assert_predicate File.readlines('health/health-abc123.yml').grep(/status_time_unix_nano:/), :any? + end + end + ensure + FileUtils.rm_rf('health') + end + + def test_yaml_file_has_new_status_time_each_write + Dir.mkdir('health') + + with_environment('NEW_RELIC_AGENT_CONTROL_HEALTH_DELIVERY_LOCATION' => 'health/') do + NewRelic::Agent::GuidGenerator.stub(:generate_guid, '1') do + health_check = NewRelic::Agent::HealthCheck.new + health_check.send(:write_file) + # on a healthy file, the third index/fourth line should hold the status_time_unix_nano data + first_status_time = File.readlines('health/health-1.yml')[3] + health_check.send(:write_file) + second_status_time = File.readlines('health/health-1.yml')[3] + + refute_equal(first_status_time, second_status_time) + end + end + ensure + FileUtils.rm_rf('health') + end + + def test_agent_health_started_if_required_info_present + with_environment('NEW_RELIC_AGENT_CONTROL_ENABLED' => 'true', + 'NEW_RELIC_AGENT_CONTROL_HEALTH_DELIVERY_LOCATION' => '/health', + 'NEW_RELIC_AGENT_CONTROL_HEALTH_FREQUENCY' => '5') do + log = with_array_logger(:debug) do + health_check = NewRelic::Agent::HealthCheck.new + health_check.create_and_run_health_check_loop + end + + assert_log_contains(log, 'Agent Control health check conditions met. Starting health checks.') + refute_log_contains(log, 'NEW_RELIC_AGENT_CONTROL_ENABLED not true') + refute_log_contains(log, 'NEW_RELIC_AGENT_CONTROL_HEALTH_FREQUENCY zero or less') + end + end + + def test_agent_health_not_generated_if_agent_control_enabled_missing + with_environment('NEW_RELIC_AGENT_CONTROL_HEALTH_DELIVERY_LOCATION' => '/health', + 'NEW_RELIC_AGENT_CONTROL_HEALTH_FREQUENCY' => '5') do + log = with_array_logger(:debug) do + health_check = NewRelic::Agent::HealthCheck.new + # loop should exit before write_file is called + # raise an error if it's invoked + health_check.stub(:write_file, -> { raise 'kaboom!' }) do + health_check.create_and_run_health_check_loop + end + end + + assert_log_contains(log, 'NEW_RELIC_AGENT_CONTROL_ENABLED not true') + refute_log_contains(log, 'Agent Control health check conditions met. Starting health checks.') + end + end + + def test_agent_health_falls_back_to_default_value_when_env_var_missing + with_environment('NEW_RELIC_AGENT_CONTROL_ENABLED' => 'true', + 'NEW_RELIC_AGENT_CONTROL_HEALTH_FREQUENCY' => '5') do + health_check = NewRelic::Agent::HealthCheck.new + + assert_equal '/newrelic/apm/health', health_check.instance_variable_get(:@delivery_location) + end + end + + def test_agent_health_not_generated_if_frequency_is_zero + with_environment('NEW_RELIC_AGENT_CONTROL_ENABLED' => 'true', + 'NEW_RELIC_AGENT_CONTROL_HEALTH_DELIVERY_LOCATION' => '/health', + 'NEW_RELIC_AGENT_CONTROL_HEALTH_FREQUENCY' => '0') do + log = with_array_logger(:debug) do + health_check = NewRelic::Agent::HealthCheck.new + # loop should exit before write_file is called + # raise an error if it's invoked + health_check.stub(:write_file, -> { raise 'kaboom!' }) do + health_check.create_and_run_health_check_loop + end + end + + assert_log_contains(log, 'NEW_RELIC_AGENT_CONTROL_HEALTH_FREQUENCY zero or less') + refute_log_contains(log, 'Agent Control health check conditions met. Starting health checks.') + end + end + + def test_agent_health_supportability_metric_generated_recorded_when_health_check_loop_starts + NewRelic::Agent.instance.stats_engine.clear_stats + + with_environment('NEW_RELIC_AGENT_CONTROL_ENABLED' => 'true', + 'NEW_RELIC_AGENT_CONTROL_HEALTH_DELIVERY_LOCATION' => '/health', + 'NEW_RELIC_AGENT_CONTROL_HEALTH_FREQUENCY' => '5') do + health_check = NewRelic::Agent::HealthCheck.new + health_check.create_and_run_health_check_loop + + assert_metrics_recorded({'Supportability/AgentControl/Health/enabled' => {call_count: 1}}) + end + end + + def test_update_status_is_a_no_op_when_health_checks_disabled + with_environment('NEW_RELIC_AGENT_CONTROL_ENABLED' => 'false', + 'NEW_RELIC_AGENT_CONTROL_HEALTH_DELIVERY_LOCATION' => 'whatev', + 'NEW_RELIC_AGENT_CONTROL_HEALTH_FREQUENCY' => '0') do + health_check = NewRelic::Agent::HealthCheck.new + + assert_equal NewRelic::Agent::HealthCheck::HEALTHY, health_check.instance_variable_get(:@status) + + health_check.create_and_run_health_check_loop + health_check.update_status(NewRelic::Agent::HealthCheck::SHUTDOWN) + + assert_equal NewRelic::Agent::HealthCheck::HEALTHY, health_check.instance_variable_get(:@status) + end + end + + def test_update_message_works_with_http_arrays + health_check = NewRelic::Agent::HealthCheck.new + # stub a valid health check, by setting @continue = true + health_check.instance_variable_set(:@continue, true) + result = health_check.update_status(NewRelic::Agent::HealthCheck::HTTP_ERROR, ['401', :preconnect]) + + assert_equal 'HTTP error response code [401] recevied from New Relic while sending data type [preconnect]', result + end + + def test_healthy_true_when_healthy + health_check = NewRelic::Agent::HealthCheck.new + # stub a valid health check, by setting @continue = true + health_check.instance_variable_set(:@continue, true) + health_check.update_status(NewRelic::Agent::HealthCheck::HEALTHY) + + assert_predicate health_check, :healthy? + end + + def test_healthy_false_when_invalid_license_key + health_check = NewRelic::Agent::HealthCheck.new + # stub a valid health check, by setting @continue = true + health_check.instance_variable_set(:@continue, true) + health_check.update_status(NewRelic::Agent::HealthCheck::INVALID_LICENSE_KEY) + + refute_predicate health_check, :healthy? + end +end diff --git a/test/new_relic/agent/new_relic_service_test.rb b/test/new_relic/agent/new_relic_service_test.rb index 36772c7ac2..c05ed09322 100644 --- a/test/new_relic/agent/new_relic_service_test.rb +++ b/test/new_relic/agent/new_relic_service_test.rb @@ -585,6 +585,7 @@ def self.check_status_code_handling(expected_exceptions) method_name = "test_#{status_code}_raises_#{exception_type.name.split('::').last}" define_method(method_name) do @http_handle.respond_to(:metric_data, 'payload', :code => status_code) + assert_raises exception_type do stats_hash = NewRelic::Agent::StatsHash.new @service.metric_data(stats_hash) @@ -608,6 +609,40 @@ def self.check_status_code_handling(expected_exceptions) 429 => NewRelic::Agent::ServerConnectionException, 431 => NewRelic::Agent::UnrecoverableServerException) + def self.check_agent_health_status_updates(expected_exceptions) + expected_exceptions.each do |status_code| + method_name = "test_#{status_code}_updates_agent_health_check" + define_method(method_name) do + NewRelic::Agent.agent.health_check.instance_variable_set(:@continue, true) + NewRelic::Agent.agent.health_check.instance_variable_set(:@status, NewRelic::Agent::HealthCheck::HEALTHY) + + begin + @http_handle.respond_to(:metric_data, 'payload', :code => status_code) + stats_hash = NewRelic::Agent::StatsHash.new + @service.metric_data(stats_hash) + rescue + # no-op, raise the error + end + + expected = { + healthy: false, + last_error: 'NR-APM-004', + message: "HTTP error response code [#{status_code}] recevied from New Relic while sending data type [metric_data]" + } + + assert_equal expected, NewRelic::Agent.agent.health_check.instance_variable_get(:@status) + end + end + end + + # Some status codes may also eventually report other health checks + # Status code 401 is also invalid license key, but that will return a different health check value if that's the reason for the 401 + # Status code 410 is also for forced disconnect, but that forced disconnect is handled where forced disconnect is rescued + # In this method, however, they should report HTTP_ERROR + check_agent_health_status_updates([ + 400, 401, 403, 405, 407, 408, 409, 410, 411, 413, 415, 417, 429, 431 + ]) + # protocol 17 def test_supportability_metrics_for_http_error_responses NewRelic::Agent.drop_buffered_data