From 671edf243dcb88d5e69d16a4c01a23ba64bef090 Mon Sep 17 00:00:00 2001 From: Kayla Reopelle Date: Thu, 12 Dec 2024 15:51:09 -0800 Subject: [PATCH] Add agent control health checks When the agent recognizes it is running in an agent control environment, it will start automatic health checks that will create a new file at a configured destination at a given frequency that provides details about the last reported status of the agent. When the agent is not seen within an agent control environment, files will not be created. --- CHANGELOG.md | 4 + lib/new_relic/agent/agent.rb | 4 + lib/new_relic/agent/agent_helpers/connect.rb | 1 + lib/new_relic/agent/agent_helpers/harvest.rb | 3 + lib/new_relic/agent/agent_helpers/shutdown.rb | 1 + .../agent_helpers/start_worker_thread.rb | 1 + lib/new_relic/agent/agent_helpers/startup.rb | 5 + .../agent/configuration/default_source.rb | 24 ++ .../agent/configuration/yaml_source.rb | 2 + lib/new_relic/agent/health_check.rb | 125 +++++++ lib/new_relic/agent/new_relic_service.rb | 10 +- test/agent_helper.rb | 7 + test/new_relic/agent/agent/start_test.rb | 3 + .../orphan_configuration_test.rb | 10 +- test/new_relic/agent/health_check_test.rb | 320 ++++++++++++++++++ 15 files changed, 517 insertions(+), 3 deletions(-) create mode 100644 lib/new_relic/agent/health_check.rb create mode 100644 test/new_relic/agent/health_check_test.rb diff --git a/CHANGELOG.md b/CHANGELOG.md index 2851296782..cf5bbc74dd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,10 @@ The agent now supports Ruby 3.4.0. We've made incremental changes throughout the preview stage to reach compatibility. This release includes an update to the Thread Profiler for compatibility with Ruby 3.4.0's new backtrace format. 
[Issue#2992](https://github.com/newrelic/newrelic-ruby-agent/issues/2992) [PR#2997](https://github.com/newrelic/newrelic-ruby-agent/pull/2997) +- **Feature: Add health checks when the agent runs within Agent Control** + + When the agent is started within an agent control environment, automatic health check files will be created within the configured file destination at the configured frequency. [PR#2995](https://github.com/newrelic/newrelic-ruby-agent/pull/2995) + - **Bugfix: Stop emitting inaccurate debug-level log about deprecated configuration options** In the previous major release, we dropped support for `disable_` configuration options in favor of `instrumentation.`. Previously, a DEBUG level log warning appeared whenever `disable_*` options were set to `true`, even for libraries (e.g. Action Dispatch) without equivalent `instrumentation.*` options: diff --git a/lib/new_relic/agent/agent.rb b/lib/new_relic/agent/agent.rb index 6c22862f80..7e80a130dc 100644 --- a/lib/new_relic/agent/agent.rb +++ b/lib/new_relic/agent/agent.rb @@ -13,6 +13,7 @@ require 'new_relic/coerce' require 'new_relic/agent/autostart' require 'new_relic/agent/harvester' +require 'new_relic/agent/health_check' require 'new_relic/agent/hostname' require 'new_relic/agent/new_relic_service' require 'new_relic/agent/pipe_service' @@ -88,6 +89,7 @@ def init_basics end def init_components + @health_check = HealthCheck.new @service = NewRelicService.new @events = EventListener.new @stats_engine = StatsEngine.new @@ -139,6 +141,8 @@ def instance # Holds all the methods defined on NewRelic::Agent::Agent # instances module InstanceMethods + # the agent control health check file generator + attr_reader :health_check # the statistics engine that holds all the timeslice data attr_reader :stats_engine # the transaction sampler that handles recording transactions diff --git a/lib/new_relic/agent/agent_helpers/connect.rb b/lib/new_relic/agent/agent_helpers/connect.rb index 7cae31c2bb..75fc02d49e 
100644 --- a/lib/new_relic/agent/agent_helpers/connect.rb +++ b/lib/new_relic/agent/agent_helpers/connect.rb @@ -198,6 +198,7 @@ def connect(options = {}) rescue NewRelic::Agent::UnrecoverableAgentException => e handle_unrecoverable_agent_error(e) rescue StandardError, Timeout::Error, NewRelic::Agent::ServerConnectionException => e + NewRelic::Agent.agent.health_check.update_status(NewRelic::Agent::HealthCheck::FAILED_TO_CONNECT) retry if retry_from_error?(e, opts) rescue Exception => e ::NewRelic::Agent.logger.error('Exception of unexpected type during Agent#connect():', e) diff --git a/lib/new_relic/agent/agent_helpers/harvest.rb b/lib/new_relic/agent/agent_helpers/harvest.rb index af5596ad21..893d8e1778 100644 --- a/lib/new_relic/agent/agent_helpers/harvest.rb +++ b/lib/new_relic/agent/agent_helpers/harvest.rb @@ -119,6 +119,7 @@ def send_data_to_endpoint(endpoint, payload, container) rescue UnrecoverableServerException => e NewRelic::Agent.logger.warn("#{endpoint} data was rejected by remote service, discarding. Error: ", e) rescue ServerConnectionException => e + NewRelic::Agent.agent.health_check.update_status(NewRelic::Agent::HealthCheck::FAILED_TO_CONNECT) log_remote_unavailable(endpoint, e) container.merge!(payload) rescue => e @@ -133,9 +134,11 @@ def check_for_and_handle_agent_commands rescue ForceRestartException, ForceDisconnectException raise rescue UnrecoverableServerException => e + NewRelic::Agent.agent.health_check.update_status(NewRelic::Agent::HealthCheck::FAILED_TO_CONNECT) NewRelic::Agent.logger.warn('get_agent_commands message was rejected by remote service, discarding. 
' \ 'Error: ', e) rescue ServerConnectionException => e + NewRelic::Agent.agent.health_check.update_status(NewRelic::Agent::HealthCheck::FAILED_TO_CONNECT) log_remote_unavailable(:get_agent_commands, e) rescue => e NewRelic::Agent.logger.info('Error during check_for_and_handle_agent_commands, will retry later: ', e) diff --git a/lib/new_relic/agent/agent_helpers/shutdown.rb b/lib/new_relic/agent/agent_helpers/shutdown.rb index 9bc36393bc..7746769e0b 100644 --- a/lib/new_relic/agent/agent_helpers/shutdown.rb +++ b/lib/new_relic/agent/agent_helpers/shutdown.rb @@ -19,6 +19,7 @@ def shutdown revert_to_default_configuration @started = nil + NewRelic::Agent.agent.health_check.update_status(NewRelic::Agent::HealthCheck::SHUTDOWN) Control.reset end diff --git a/lib/new_relic/agent/agent_helpers/start_worker_thread.rb b/lib/new_relic/agent/agent_helpers/start_worker_thread.rb index 006d492967..e10a020b2b 100644 --- a/lib/new_relic/agent/agent_helpers/start_worker_thread.rb +++ b/lib/new_relic/agent/agent_helpers/start_worker_thread.rb @@ -86,6 +86,7 @@ def handle_force_restart(error) # is the worker thread that gathers data and talks to the # server. def handle_force_disconnect(error) + NewRelic::Agent.agent.health_check.update_status(NewRelic::Agent::HealthCheck::FORCED_DISCONNECT) ::NewRelic::Agent.logger.warn('Agent received a ForceDisconnectException from the server, disconnecting. ' \ "(#{error.message})") disconnect diff --git a/lib/new_relic/agent/agent_helpers/startup.rb b/lib/new_relic/agent/agent_helpers/startup.rb index f683a1e917..bc1a98a57b 100644 --- a/lib/new_relic/agent/agent_helpers/startup.rb +++ b/lib/new_relic/agent/agent_helpers/startup.rb @@ -48,6 +48,7 @@ def check_config_and_start_agent # Treatment of @started and env report is important to get right. def setup_and_start_agent(options = {}) @started = true + @health_check.create_and_run_health_check_loop @harvester.mark_started unless in_resque_child_process? @@ -129,6 +130,7 @@ def monitoring? 
if Agent.config[:monitor_mode] true else + NewRelic::Agent.agent.health_check.update_status(NewRelic::Agent::HealthCheck::AGENT_DISABLED) ::NewRelic::Agent.logger.warn('Agent configured not to send data in this environment.') false end @@ -140,6 +142,7 @@ def has_license_key? if Agent.config[:license_key] && Agent.config[:license_key].length > 0 true else + NewRelic::Agent.agent.health_check.update_status(NewRelic::Agent::HealthCheck::MISSING_LICENSE_KEY) ::NewRelic::Agent.logger.warn('No license key found. ' + 'This often means your newrelic.yml file was not found, or it lacks a section for the running ' \ "environment, '#{NewRelic::Control.instance.env}'. You may also want to try linting your newrelic.yml " \ @@ -160,6 +163,7 @@ def correct_license_length if key.length == 40 true else + NewRelic::Agent.agent.health_check.update_status(NewRelic::Agent::HealthCheck::INVALID_LICENSE_KEY) ::NewRelic::Agent.logger.error("Invalid license key: #{key}") false end @@ -180,6 +184,7 @@ def agent_should_start? end unless app_name_configured? + NewRelic::Agent.agent.health_check.update_status(NewRelic::Agent::HealthCheck::MISSING_APP_NAME) NewRelic::Agent.logger.error('No application name configured.', 'The agent cannot start without at least one. Please check your ', 'newrelic.yml and ensure that it is valid and has at least one ', diff --git a/lib/new_relic/agent/configuration/default_source.rb b/lib/new_relic/agent/configuration/default_source.rb index b379719c62..707c77eaf6 100644 --- a/lib/new_relic/agent/configuration/default_source.rb +++ b/lib/new_relic/agent/configuration/default_source.rb @@ -2188,6 +2188,30 @@ def self.notify :transform => DefaultSource.method(:convert_to_constant_list), :description => 'Specify a list of exceptions you do not want the agent to strip when [strip_exception_messages](#strip_exception_messages-enabled) is `true`. Separate exceptions with a comma. For example, `"ImportantException,PreserveMessageException"`.' 
}, + # Agent Control + :'agent_control.fleet_id' => { + :default => nil, + :allow_nil => true, + :public => true, + :type => String, + :allowed_from_server => false, + :description => 'This assigns a fleet id to the language agent. This id is generated by agent control. If this setting is present, it indicates the agent is running in a super agent/fleet environment and health file(s) will be generated.' + }, + :'agent_control.health.delivery_location' => { + :default => nil, + :allow_nil => true, + :public => true, + :type => String, + :allowed_from_server => false, + :description => 'A `file:` URI that specifies the fully qualified directory path for health file(s) to be written to. For example: `file:///var/lib/newrelic-super-agent/fleet/agents.d/`. This configuration will be set by agent control, or one of its components, prior to agent startup.' + }, + :'agent_control.health.frequency' => { + :default => 5, + :public => true, + :type => Integer, + :allowed_from_server => false, + :description => 'The interval, in seconds, of how often the health file(s) will be written to. This configuration will be set by agent control, or one of its components, prior to agent startup.' 
+ }, # Thread profiler :'thread_profiler.enabled' => { :default => DefaultSource.thread_profiler_enabled, diff --git a/lib/new_relic/agent/configuration/yaml_source.rb b/lib/new_relic/agent/configuration/yaml_source.rb index 574d29ef83..ca9b8b212d 100644 --- a/lib/new_relic/agent/configuration/yaml_source.rb +++ b/lib/new_relic/agent/configuration/yaml_source.rb @@ -36,6 +36,7 @@ def initialize(path, env) erb_file = process_erb(raw_file) config = process_yaml(erb_file, env, config, @file_path) rescue ScriptError, StandardError => e + NewRelic::Agent.agent.health_check.update_status(NewRelic::Agent::HealthCheck::FAILED_TO_PARSE_CONFIG) log_failure("Failed to read or parse configuration file at #{path}", e) end @@ -99,6 +100,7 @@ def process_erb(file) file.gsub!(/^\s*#.*$/, '#') ERB.new(file).result(binding) rescue ScriptError, StandardError => e + NewRelic::Agent.agent.health_check.update_status(NewRelic::Agent::HealthCheck::FAILED_TO_PARSE_CONFIG) message = 'Failed ERB processing configuration file. This is typically caused by a Ruby error in <% %> templating blocks in your newrelic.yml file.' failure_array = [message, e] failure_array << e.backtrace[0] if Gem::Version.new(RUBY_VERSION) >= Gem::Version.new('3.4.0') diff --git a/lib/new_relic/agent/health_check.rb b/lib/new_relic/agent/health_check.rb new file mode 100644 index 0000000000..f7788ed28c --- /dev/null +++ b/lib/new_relic/agent/health_check.rb @@ -0,0 +1,125 @@ +# This file is distributed under New Relic's license terms. +# See https://github.com/newrelic/newrelic-ruby-agent/blob/main/LICENSE for complete details. 
+# frozen_string_literal: true + +module NewRelic + module Agent + class HealthCheck + def initialize + @start_time = nano_time + @fleet_id = ENV['NEW_RELIC_AGENT_CONTROL_FLEET_ID'] + # The spec states file paths for the delivery location will begin with file:// + # This does not create a valid path in Ruby, so remove the prefix when present + @delivery_location = ENV['NEW_RELIC_AGENT_CONTROL_HEALTH_DELIVERY_LOCATION']&.gsub('file://', '') + @frequency = ENV['NEW_RELIC_AGENT_CONTROL_HEALTH_FREQUENCY'].to_i + @continue = true + @status = HEALTHY + end + + HEALTHY = {healthy: true, last_error: 'NR-APM-000', message: 'Healthy'} + INVALID_LICENSE_KEY = {healthy: false, last_error: 'NR-APM-001', message: 'Invalid license key (HTTP status code 401)'} + MISSING_LICENSE_KEY = {healthy: false, last_error: 'NR-APM-002', message: 'License key missing in configuration'} + FORCED_DISCONNECT = {healthy: false, last_error: 'NR-APM-003', message: 'Forced disconnect received from New Relic (HTTP status code 410)'} + HTTP_ERROR = {healthy: false, last_error: 'NR-APM-004', message: 'HTTP error response code [%s] received from New Relic while sending data type [%s]'} + MISSING_APP_NAME = {healthy: false, last_error: 'NR-APM-005', message: 'Missing application name in agent configuration'} + APP_NAME_EXCEEDED = {healthy: false, last_error: 'NR-APM-006', message: 'The maximum number of configured app names (3) exceeded'} + PROXY_CONFIG_ERROR = {healthy: false, last_error: 'NR-APM-007', message: 'HTTP Proxy configuration error; response code [%s]'} + AGENT_DISABLED = {healthy: false, last_error: 'NR-APM-008', message: 'Agent is disabled via configuration'} + FAILED_TO_CONNECT = {healthy: false, last_error: 'NR-APM-009', message: 'Failed to connect to New Relic data collector'} + FAILED_TO_PARSE_CONFIG = {healthy: false, last_error: 'NR-APM-010', message: 'Agent config file is not able to be parsed'} + SHUTDOWN = {healthy: true, last_error: 'NR-APM-099', message: 'Agent has shutdown'} + + 
def create_and_run_health_check_loop + unless health_check_enabled? + @continue = false + end + + return NewRelic::Agent.logger.debug('NEW_RELIC_AGENT_CONTROL_FLEET_ID not found, skipping health checks') unless @fleet_id + return NewRelic::Agent.logger.debug('NEW_RELIC_AGENT_CONTROL_HEALTH_DELIVERY_LOCATION not found, skipping health checks') unless @delivery_location + return NewRelic::Agent.logger.debug('NEW_RELIC_AGENT_CONTROL_HEALTH_FREQUENCY zero or less, skipping health checks') unless @frequency > 0 + + NewRelic::Agent.logger.debug('Agent control health check conditions met. Starting health checks.') + NewRelic::Agent.record_metric('Supportability/AgentControl/Health/enabled', 1) + + Thread.new do + while @continue + begin + sleep @frequency + write_file + @continue = false if @status == SHUTDOWN + rescue StandardError => e + NewRelic::Agent.logger.error("Aborting agent control health check. Error raised: #{e}") + @continue = false + end + end + end + end + + def update_status(status, options = []) + return unless @continue + + @status = status + update_message(options) unless options.empty? + end + + private + + def contents + <<~CONTENTS + healthy: #{@status[:healthy]} + status: #{@status[:message]}#{last_error} + start_time_unix_nano: #{@start_time} + status_time_unix_nano: #{nano_time} + CONTENTS + end + + def last_error + @status[:healthy] ? 
'' : "\nlast_error: #{@status[:last_error]}" + end + + def nano_time + Process.clock_gettime(Process::CLOCK_REALTIME, :nanosecond) + end + + def file_name + "health-#{NewRelic::Agent::GuidGenerator.generate_guid(32)}.yml" + end + + def write_file + @path ||= create_file_path + + File.write("#{@path}/#{file_name}", contents) + rescue StandardError => e + NewRelic::Agent.logger.error("Agent control health check raised an error while writing a file: #{e}") + @continue = false + end + + def create_file_path + for abs_path in [File.expand_path(@delivery_location), + File.expand_path(File.join('', @delivery_location))] do + if File.directory?(abs_path) || (Dir.mkdir(abs_path) rescue nil) + return abs_path[%r{^(.*?)/?$}] + end + end + nil + rescue StandardError => e + NewRelic::Agent.logger.error( + 'Agent control health check raised an error while finding or creating the file path defined in ' \ + "NEW_RELIC_AGENT_CONTROL_HEALTH_DELIVERY_LOCATION: #{e}" + ) + @continue = false + end + + def health_check_enabled? + @fleet_id && @delivery_location && (@frequency > 0) + end + + def update_message(options) + @status = @status.merge(message: sprintf(@status[:message], *options)) + rescue StandardError => e + NewRelic::Agent.logger.debug("Error raised while updating agent control health check message: #{e}." 
\ + "Reverting to original message.\noptions = #{options}, @status[:message] = #{@status[:message]}") + end + end + end +end diff --git a/lib/new_relic/agent/new_relic_service.rb b/lib/new_relic/agent/new_relic_service.rb index 467188d455..ec99e98902 100644 --- a/lib/new_relic/agent/new_relic_service.rb +++ b/lib/new_relic/agent/new_relic_service.rb @@ -455,6 +455,8 @@ def attempt_request(request, opts) end def handle_error_response(response, endpoint) + NewRelic::Agent.agent.health_check.update_status(NewRelic::Agent::HealthCheck::HTTP_ERROR, [response.code, endpoint]) + case response when Net::HTTPRequestTimeOut, Net::HTTPTooManyRequests, @@ -637,9 +639,13 @@ def check_post_size(post_string, endpoint) def send_request(opts) request = prep_request(opts) response = relay_request(request, opts) - return response if response.is_a?(Net::HTTPSuccess) || response.is_a?(Net::HTTPAccepted) - handle_error_response(response, opts[:endpoint]) + if response.is_a?(Net::HTTPSuccess) || response.is_a?(Net::HTTPAccepted) + NewRelic::Agent.agent.health_check.update_status(NewRelic::Agent::HealthCheck::HEALTHY) + response + else + handle_error_response(response, opts[:endpoint]) + end end def log_response(response) diff --git a/test/agent_helper.rb b/test/agent_helper.rb index ae7dfea555..4a32cae7b0 100644 --- a/test/agent_helper.rb +++ b/test/agent_helper.rb @@ -115,6 +115,13 @@ def assert_log_contains(log, message) "Could not find message: '#{message.inspect}'. Log contained: #{lines.join("\n")}" end +def refute_log_contains(log, message) + lines = log.array + + refute (lines.any? { |line| line.match(message) }), + "Found message: '#{message.inspect}'. 
Log contained: #{lines.join("\n")}" +end + def assert_audit_log_contains(audit_log_contents, needle) # Original request bodies dumped to the log have symbol keys, but once # they go through a dump/load, they're strings again, so we strip diff --git a/test/new_relic/agent/agent/start_test.rb b/test/new_relic/agent/agent/start_test.rb index 4c245066a3..54b80914d8 100644 --- a/test/new_relic/agent/agent/start_test.rb +++ b/test/new_relic/agent/agent/start_test.rb @@ -11,6 +11,7 @@ class NewRelic::Agent::Agent::StartTest < Minitest::Test def setup @harvester = stub('dummy harvester') + @health_check = stub('dummy health check') @harvest_samplers = stub('dummy sampler collection') end @@ -62,6 +63,7 @@ def test_check_config_and_start_agent_forking def test_check_config_and_start_agent_normal @harvester.expects(:mark_started) @harvest_samplers.expects(:load_samplers) + @health_check.expects(:create_and_run_health_check_loop) self.expects(:start_worker_thread) self.expects(:install_exit_handler) self.expects(:environment_for_connect) @@ -74,6 +76,7 @@ def test_check_config_and_start_agent_normal def test_check_config_and_start_agent_sync @harvester.expects(:mark_started) @harvest_samplers.expects(:load_samplers) + @health_check.expects(:create_and_run_health_check_loop) self.expects(:connect_in_foreground) self.expects(:start_worker_thread) self.expects(:install_exit_handler) diff --git a/test/new_relic/agent/configuration/orphan_configuration_test.rb b/test/new_relic/agent/configuration/orphan_configuration_test.rb index d85827235d..1788f63303 100644 --- a/test/new_relic/agent/configuration/orphan_configuration_test.rb +++ b/test/new_relic/agent/configuration/orphan_configuration_test.rb @@ -9,7 +9,15 @@ class OrphanedConfigTest < Minitest::Test include NewRelic::TestHelpers::ConfigScanning # :automatic_custom_instrumentation_method_list - the tranform proc handles all processing, no other reference exists - IGNORED_KEYS = %i[automatic_custom_instrumentation_method_list] 
+ # :'agent_control.fleet_id' - the config is set by environment variable in agent control, the symbol config is not used + # :'agent_control.health.delivery_location' - the config is set by environment variable in agent control, the symbol config is not used + # :'agent_control.health.frequency' - the config is set by environment variable in agent control, the symbol config is not used + IGNORED_KEYS = %i[ + automatic_custom_instrumentation_method_list + agent_control.fleet_id + agent_control.health.delivery_location + agent_control.health.frequency + ] def setup @default_keys = ::NewRelic::Agent::Configuration::DEFAULTS.keys diff --git a/test/new_relic/agent/health_check_test.rb b/test/new_relic/agent/health_check_test.rb new file mode 100644 index 0000000000..10b58c168d --- /dev/null +++ b/test/new_relic/agent/health_check_test.rb @@ -0,0 +1,320 @@ +# This file is distributed under New Relic's license terms. +# See https://github.com/newrelic/newrelic-ruby-agent/blob/main/LICENSE for complete details. 
+# frozen_string_literal: true + +require 'fileutils' +require_relative '../../test_helper' + +class NewRelicHealthCheckTest < Minitest::Test + # example + # file name: health-bc21b5891f5e44fc9272caef924611a8.yml + # healthy: true + # status: Agent has shutdown + # last_error: NR-APM-099 + # status_time_unix_nano: 1724953624761000000 + # start_time_unix_nano: 1724953587605000000 + + def teardown + mocha_teardown + end + + def test_yaml_health_file_written_to_delivery_location + with_environment('NEW_RELIC_AGENT_CONTROL_HEALTH_DELIVERY_LOCATION' => 'health/') do + NewRelic::Agent::GuidGenerator.stub(:generate_guid, 'abc123') do + health_check = NewRelic::Agent::HealthCheck.new + health_check.send(:write_file) + + assert File.directory?('health'), 'Directory not found' + assert File.exist?('health/health-abc123.yml'), 'File not found' # rubocop:disable Minitest/AssertPathExists + end + end + ensure + FileUtils.rm_rf('health') + end + + def test_yaml_health_file_written_to_delivery_location_with_file_path_prefix + with_environment('NEW_RELIC_AGENT_CONTROL_HEALTH_DELIVERY_LOCATION' => 'file://health/') do + NewRelic::Agent::GuidGenerator.stub(:generate_guid, 'abc123') do + health_check = NewRelic::Agent::HealthCheck.new + health_check.send(:write_file) + + assert File.directory?('./health'), 'Directory not found' + assert File.exist?('./health/health-abc123.yml'), 'File not found' # rubocop:disable Minitest/AssertPathExists + end + end + ensure + FileUtils.rm_rf('health') + end + + def test_yaml_file_name_has_health_plus_uuid_without_hyphens + health_check = NewRelic::Agent::HealthCheck.new + + # ex: health-bc21b5891f5e44fc9272caef924611a8.yml + assert_match(/health-(.*){32}\.ya?ml/, health_check.send(:file_name)) + end + + def test_write_file_called_on_interval + with_environment('NEW_RELIC_AGENT_CONTROL_HEALTH_FREQUENCY' => '3', + 'NEW_RELIC_AGENT_CONTROL_FLEET_ID' => 'abc', + 'NEW_RELIC_AGENT_CONTROL_HEALTH_DELIVERY_LOCATION' => 'health/') do + health_check = 
NewRelic::Agent::HealthCheck.new + health_check.stub(:write_file, nil) do + health_check.expects(:sleep).with(3).times(3) + health_check.expects(:write_file).times(3).then.returns(nil).then.returns(nil).then.raises('whoa!') + health_check.create_and_run_health_check_loop.join + end + end + end + + def test_create_and_run_health_check_loop_exits_after_shutdown + with_environment('NEW_RELIC_AGENT_CONTROL_HEALTH_FREQUENCY' => '3', + 'NEW_RELIC_AGENT_CONTROL_FLEET_ID' => 'abc', + 'NEW_RELIC_AGENT_CONTROL_HEALTH_DELIVERY_LOCATION' => 'health/') do + health_check = NewRelic::Agent::HealthCheck.new + health_check.stub(:write_file, nil) do + health_check.expects(:sleep).with(3).times(1) + health_check.expects(:write_file).times(1).then.returns(nil) + health_check.update_status(NewRelic::Agent::HealthCheck::SHUTDOWN) + health_check.create_and_run_health_check_loop.join + end + end + end + + def test_write_file_sets_continue_false_when_error + with_environment('NEW_RELIC_AGENT_CONTROL_HEALTH_DELIVERY_LOCATION' => 'health/') do + NewRelic::Agent::GuidGenerator.stub(:generate_guid, 'abc123') do + File.stub(:write, ->(arg1, arg2) { raise 'boom!' }) do + health_check = NewRelic::Agent::HealthCheck.new + + assert(health_check.instance_variable_get(:@continue)) + health_check.send(:write_file) + + refute(health_check.instance_variable_get(:@continue)) + end + end + end + ensure + FileUtils.rm_rf('health') + end + + def test_create_file_path_sets_continue_false_when_error_raised + with_environment('NEW_RELIC_AGENT_CONTROL_HEALTH_DELIVERY_LOCATION' => 'health/') do + NewRelic::Agent::GuidGenerator.stub(:generate_guid, 'abc123') do + File.stub(:directory?, ->(arg1) { raise 'boom!' 
}) do + health_check = NewRelic::Agent::HealthCheck.new + + assert(health_check.instance_variable_get(:@continue)) + health_check.send(:create_file_path) + + refute(health_check.instance_variable_get(:@continue)) + end + end + end + ensure + FileUtils.rm_rf('health') + end + + def test_yaml_file_has_healthy_field + with_environment('NEW_RELIC_AGENT_CONTROL_HEALTH_DELIVERY_LOCATION' => 'health/') do + NewRelic::Agent::GuidGenerator.stub(:generate_guid, 'abc123') do + health_check = NewRelic::Agent::HealthCheck.new + health_check.send(:write_file) + + assert_predicate File.readlines('health/health-abc123.yml').grep(/healthy:/), :any? + end + end + ensure + FileUtils.rm_rf('health') + end + + def test_yaml_file_has_status_field + with_environment('NEW_RELIC_AGENT_CONTROL_HEALTH_DELIVERY_LOCATION' => 'health/') do + NewRelic::Agent::GuidGenerator.stub(:generate_guid, 'abc123') do + health_check = NewRelic::Agent::HealthCheck.new + health_check.send(:write_file) + + assert_predicate File.readlines('health/health-abc123.yml').grep(/status:/), :any? + end + end + ensure + FileUtils.rm_rf('health') + end + + def test_yaml_file_has_last_error_field_when_status_not_healthy + with_environment('NEW_RELIC_AGENT_CONTROL_HEALTH_DELIVERY_LOCATION' => 'health/') do + NewRelic::Agent::GuidGenerator.stub(:generate_guid, 'abc123') do + health_check = NewRelic::Agent::HealthCheck.new + health_check.update_status(NewRelic::Agent::HealthCheck::INVALID_LICENSE_KEY) + health_check.send(:write_file) + + assert_predicate File.readlines('health/health-abc123.yml').grep(/last_error:/), :any? 
+ end + end + ensure + FileUtils.rm_rf('health') + end + + def test_yaml_file_does_not_have_last_error_field_when_status_healthy + with_environment('NEW_RELIC_AGENT_CONTROL_HEALTH_DELIVERY_LOCATION' => 'health/') do + NewRelic::Agent::GuidGenerator.stub(:generate_guid, 'abc123') do + health_check = NewRelic::Agent::HealthCheck.new + health_check.update_status(NewRelic::Agent::HealthCheck::HEALTHY) + health_check.send(:write_file) + + refute_predicate File.readlines('health/health-abc123.yml').grep(/last_error:/), :any? + end + end + ensure + FileUtils.rm_rf('health') + end + + def test_nano_time_in_correct_format + health_check = NewRelic::Agent::HealthCheck.new + time = health_check.send(:nano_time) + + assert_instance_of(Integer, time) + assert(time.to_s.length >= 19) + end + + def test_yaml_file_has_same_start_time_unix_nano_for_all_files + with_environment('NEW_RELIC_AGENT_CONTROL_HEALTH_DELIVERY_LOCATION' => 'health/') do + health_check = NewRelic::Agent::HealthCheck.new + start_time = health_check.instance_variable_get(:@start_time) + health_check.expects(:file_name).times(2).then.returns('health-1.yml').then.returns('health-2.yml') + health_check.send(:write_file) + + assert_predicate File.readlines('health/health-1.yml').grep(/start_time_unix_nano: #{start_time}/), :any? + + health_check.send(:write_file) + + assert_predicate File.readlines('health/health-2.yml').grep(/start_time_unix_nano: #{start_time}/), :any? + end + ensure + FileUtils.rm_rf('health') + end + + def test_yaml_file_has_status_time_unix_nano + with_environment('NEW_RELIC_AGENT_CONTROL_HEALTH_DELIVERY_LOCATION' => 'health/') do + NewRelic::Agent::GuidGenerator.stub(:generate_guid, 'abc123') do + health_check = NewRelic::Agent::HealthCheck.new + health_check.send(:write_file) + + assert_predicate File.readlines('health/health-abc123.yml').grep(/status_time_unix_nano:/), :any? 
+ end + end + ensure + FileUtils.rm_rf('health') + end + + def test_yaml_file_has_new_status_time_each_file + with_environment('NEW_RELIC_AGENT_CONTROL_HEALTH_DELIVERY_LOCATION' => 'health/') do + health_check = NewRelic::Agent::HealthCheck.new + health_check.expects(:file_name).times(2).then.returns('health-1.yml').then.returns('health-2.yml') + health_check.send(:write_file) + # on a healthy file, the third index/fourth line should hold the status_time_unix_nano data + first_status_time = File.readlines('health/health-1.yml')[3] + health_check.send(:write_file) + second_status_time = File.readlines('health/health-2.yml')[3] + + refute_equal(first_status_time, second_status_time) + end + ensure + FileUtils.rm_rf('health') + end + + def test_agent_health_started_if_required_info_present + with_environment('NEW_RELIC_AGENT_CONTROL_FLEET_ID' => 'landslide', + 'NEW_RELIC_AGENT_CONTROL_HEALTH_DELIVERY_LOCATION' => '/health', + 'NEW_RELIC_AGENT_CONTROL_HEALTH_FREQUENCY' => '5') do + log = with_array_logger(:debug) do + health_check = NewRelic::Agent::HealthCheck.new + health_check.create_and_run_health_check_loop + end + + assert_log_contains(log, 'Agent control health check conditions met. Starting health checks.') + refute_log_contains(log, 'NEW_RELIC_AGENT_CONTROL_FLEET_ID not found') + refute_log_contains(log, 'NEW_RELIC_AGENT_CONTROL_HEALTH_DELIVERY_LOCATION not found') + refute_log_contains(log, 'NEW_RELIC_AGENT_CONTROL_HEALTH_FREQUENCY zero or less') + end + end + + def test_agent_health_not_generated_if_agent_control_fleet_id_absent + with_environment('NEW_RELIC_AGENT_CONTROL_HEALTH_DELIVERY_LOCATION' => '/health', + 'NEW_RELIC_AGENT_CONTROL_HEALTH_FREQUENCY' => '5') do + log = with_array_logger(:debug) do + health_check = NewRelic::Agent::HealthCheck.new + # loop should exit before write_file is called + # raise an error if it's invoked + health_check.stub(:write_file, -> { raise 'kaboom!' 
}) do + health_check.create_and_run_health_check_loop + end + end + + assert_log_contains(log, 'NEW_RELIC_AGENT_CONTROL_FLEET_ID not found') + refute_log_contains(log, 'Agent control health check conditions met. Starting health checks.') + end + end + + def test_agent_health_not_generated_if_delivery_location_absent + with_environment('NEW_RELIC_AGENT_CONTROL_FLEET_ID' => 'mykonos', + 'NEW_RELIC_AGENT_CONTROL_HEALTH_FREQUENCY' => '5') do + log = with_array_logger(:debug) do + health_check = NewRelic::Agent::HealthCheck.new + # loop should exit before write_file is called + # raise an error if it's invoked + health_check.stub(:write_file, -> { raise 'kaboom!' }) do + health_check.create_and_run_health_check_loop + end + end + + assert_log_contains(log, 'NEW_RELIC_AGENT_CONTROL_HEALTH_DELIVERY_LOCATION not found') + refute_log_contains(log, 'Agent control health check conditions met. Starting health checks.') + end + end + + def test_agent_health_not_generated_if_frequency_is_zero + with_environment('NEW_RELIC_AGENT_CONTROL_FLEET_ID' => 'anchors-away', + 'NEW_RELIC_AGENT_CONTROL_HEALTH_DELIVERY_LOCATION' => '/health', + 'NEW_RELIC_AGENT_CONTROL_HEALTH_FREQUENCY' => '0') do + log = with_array_logger(:debug) do + health_check = NewRelic::Agent::HealthCheck.new + # loop should exit before write_file is called + # raise an error if it's invoked + health_check.stub(:write_file, -> { raise 'kaboom!' }) do + health_check.create_and_run_health_check_loop + end + end + + assert_log_contains(log, 'NEW_RELIC_AGENT_CONTROL_HEALTH_FREQUENCY zero or less') + refute_log_contains(log, 'Agent control health check conditions met. 
Starting health checks.') + end + end + + def test_agent_health_supportability_metric_generated_recorded_when_health_check_loop_starts + NewRelic::Agent.instance.stats_engine.clear_stats + + with_environment('NEW_RELIC_AGENT_CONTROL_FLEET_ID' => 'landslide', + 'NEW_RELIC_AGENT_CONTROL_HEALTH_DELIVERY_LOCATION' => '/health', + 'NEW_RELIC_AGENT_CONTROL_HEALTH_FREQUENCY' => '5') do + health_check = NewRelic::Agent::HealthCheck.new + health_check.create_and_run_health_check_loop + + assert_metrics_recorded({'Supportability/AgentControl/Health/enabled' => {call_count: 1}}) + end + end + + def test_update_status_is_a_no_op_when_health_checks_disabled + with_environment('NEW_RELIC_AGENT_CONTROL_FLEET_ID' => nil, + 'NEW_RELIC_AGENT_CONTROL_HEALTH_DELIVERY_LOCATION' => nil, + 'NEW_RELIC_AGENT_CONTROL_HEALTH_FREQUENCY' => '0') do + health_check = NewRelic::Agent::HealthCheck.new + + assert_equal NewRelic::Agent::HealthCheck::HEALTHY, health_check.instance_variable_get(:@status) + + health_check.create_and_run_health_check_loop + health_check.update_status(NewRelic::Agent::HealthCheck::SHUTDOWN) + + assert_equal NewRelic::Agent::HealthCheck::HEALTHY, health_check.instance_variable_get(:@status) + end + end +end