Skip to content

Commit

Permalink
Use vendor customizable fan speed threshold checks (#378)
Browse files Browse the repository at this point in the history
  • Loading branch information
spilkey-cisco authored Jul 14, 2023
1 parent db6e340 commit 94242c2
Show file tree
Hide file tree
Showing 2 changed files with 50 additions and 97 deletions.
79 changes: 30 additions & 49 deletions sonic-thermalctld/scripts/thermalctld
Original file line number Diff line number Diff line change
Expand Up @@ -119,57 +119,35 @@ class FanStatus(logger.Logger):
self.status = status
return True

def _check_speed_value_available(self, speed, target_speed, tolerance, current_status):
if speed == NOT_AVAILABLE or target_speed == NOT_AVAILABLE or tolerance == NOT_AVAILABLE:
if isinstance(tolerance, int) and (tolerance > 100 or tolerance < 0):
self.log_warning('Invalid tolerance value: {}'.format(tolerance))
return False

if current_status is True:
self.log_warning('Fan speed or target_speed or tolerance became unavailable, '
'speed={}, target_speed={}, tolerance={}'.format(speed, target_speed, tolerance))
return False
return True

def set_under_speed(self, speed, target_speed, tolerance):
def set_under_speed(self, is_under_speed):
"""
Set and cache Fan under speed status
:param speed: Fan speed
:param target_speed: Fan target speed
:param tolerance: Threshold between Fan speed and target speed
:param is_under_speed: Fan under speed threshold status
:return: True if status changed else False
"""
if not self._check_speed_value_available(speed, target_speed, tolerance, self.under_speed):
old_status = self.under_speed
self.under_speed = False
return old_status != self.under_speed
if is_under_speed == NOT_AVAILABLE:
if self.under_speed:
self.log_warning('Fan under speed threshold check became unavailable')
is_under_speed = False

status = speed < target_speed * (1 - float(tolerance) / 100)
if status == self.under_speed:
return False
old_status = self.under_speed
self.under_speed = is_under_speed
return old_status != self.under_speed

self.under_speed = status
return True

def set_over_speed(self, speed, target_speed, tolerance):
def set_over_speed(self, is_over_speed):
"""
Set and cache Fan over speed status
:param speed: Fan speed
:param target_speed: Fan target speed
:param tolerance: Threshold between Fan speed and target speed
:param is_over_speed: Fan over speed threshold status
:return: True if status changed else False
"""
if not self._check_speed_value_available(speed, target_speed, tolerance, self.over_speed):
old_status = self.over_speed
self.over_speed = False
return old_status != self.over_speed
if is_over_speed == NOT_AVAILABLE:
if self.over_speed:
self.log_warning('Fan over speed threshold check became unavailable')
is_over_speed = False

status = speed > target_speed * (1 + float(tolerance) / 100)
if status == self.over_speed:
return False

self.over_speed = status
return True
old_status = self.over_speed
self.over_speed = is_over_speed
return old_status != self.over_speed

def is_ok(self):
"""
Expand Down Expand Up @@ -315,16 +293,18 @@ class FanUpdater(logger.Logger):
fan_status = self.fan_status_dict[fan_name]

speed = NOT_AVAILABLE
speed_tolerance = NOT_AVAILABLE
speed_target = NOT_AVAILABLE
is_under_speed = NOT_AVAILABLE
is_over_speed = NOT_AVAILABLE
fan_fault_status = NOT_AVAILABLE
fan_direction = NOT_AVAILABLE
is_replaceable = try_get(fan.is_replaceable, False)
presence = try_get(fan.get_presence, False)
if presence:
speed = try_get(fan.get_speed)
speed_tolerance = try_get(fan.get_speed_tolerance)

This comment has been minimized.

Copy link
@Yagami-Jiang

Yagami-Jiang Aug 29, 2023

Hi, i want to know why we should delete 'speed_tolerance' field?
Cause as i know, the command 'show system-health summary' will call the file 'hardware_checker.py' which is at master branch, and it will get the 'speed_tolerance' from Redis database.
If we delete the 'speed_tolerance', an error will be generated when using the command 'show system-health summary'
Snipaste_2023-08-29_09-49-04
show system-health summar.log

speed_target = try_get(fan.get_target_speed)
is_under_speed = try_get(fan.is_under_speed)
is_over_speed = try_get(fan.is_over_speed)
fan_fault_status = try_get(fan.get_status, False)
fan_direction = try_get(fan.get_direction)

Expand All @@ -344,20 +324,20 @@ class FanUpdater(logger.Logger):
'Fan fault warning: {} is broken'.format(fan_name)
)

if presence and fan_status.set_under_speed(speed, speed_target, speed_tolerance):
if presence and fan_status.set_under_speed(is_under_speed):
set_led = True
self._log_on_status_changed(not fan_status.under_speed,
'Fan low speed warning cleared: {} speed is back to normal'.format(fan_name),
'Fan low speed warning: {} current speed={}, target speed={}, tolerance={}'.
format(fan_name, speed, speed_target, speed_tolerance)
'Fan low speed warning: {} current speed={}, target speed={}'.
format(fan_name, speed, speed_target)
)

if presence and fan_status.set_over_speed(speed, speed_target, speed_tolerance):
if presence and fan_status.set_over_speed(is_over_speed):
set_led = True
self._log_on_status_changed(not fan_status.over_speed,
'Fan high speed warning cleared: {} speed is back to normal'.format(fan_name),
'Fan high speed warning: {} target speed={}, current speed={}, tolerance={}'.
format(fan_name, speed_target, speed, speed_tolerance)
'Fan high speed warning: {} current speed={}, target speed={}'.
format(fan_name, speed, speed_target)
)

# We don't set PSU led here, PSU led will be handled in psud
Expand All @@ -376,8 +356,9 @@ class FanUpdater(logger.Logger):
('status', str(fan_fault_status)),
('direction', str(fan_direction)),
('speed', str(speed)),
('speed_tolerance', str(speed_tolerance)),
('speed_target', str(speed_target)),
('is_under_speed', str(is_under_speed)),
('is_over_speed', str(is_over_speed)),
('is_replaceable', str(is_replaceable)),
('timestamp', datetime.now().strftime('%Y%m%d %H:%M:%S'))
])
Expand Down
68 changes: 20 additions & 48 deletions sonic-thermalctld/tests/test_thermalctld.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,30 +68,6 @@ class TestFanStatus(object):
"""
Test cases to cover functionality in FanStatus class
"""
def test_check_speed_value_available(self):
fan_status = thermalctld.FanStatus()

ret = fan_status._check_speed_value_available(30, 32, 5, True)
assert ret == True
assert fan_status.log_warning.call_count == 0

ret = fan_status._check_speed_value_available(thermalctld.NOT_AVAILABLE, 32, 105, True)
assert ret == False
assert fan_status.log_warning.call_count == 1
fan_status.log_warning.assert_called_with('Invalid tolerance value: 105')

# Reset
fan_status.log_warning.reset_mock()

ret = fan_status._check_speed_value_available(thermalctld.NOT_AVAILABLE, 32, 5, False)
assert ret == False
assert fan_status.log_warning.call_count == 0

ret = fan_status._check_speed_value_available(thermalctld.NOT_AVAILABLE, 32, 5, True)
assert ret == False
assert fan_status.log_warning.call_count == 1
fan_status.log_warning.assert_called_with('Fan speed or target_speed or tolerance became unavailable, speed=N/A, target_speed=32, tolerance=5')

def test_set_presence(self):
fan_status = thermalctld.FanStatus()
ret = fan_status.set_presence(True)
Expand All @@ -104,52 +80,48 @@ def test_set_presence(self):

def test_set_under_speed(self):
fan_status = thermalctld.FanStatus()
ret = fan_status.set_under_speed(thermalctld.NOT_AVAILABLE, thermalctld.NOT_AVAILABLE, thermalctld.NOT_AVAILABLE)
assert not ret

ret = fan_status.set_under_speed(thermalctld.NOT_AVAILABLE, thermalctld.NOT_AVAILABLE, 0)
assert not ret

ret = fan_status.set_under_speed(thermalctld.NOT_AVAILABLE, 0, 0)
ret = fan_status.set_under_speed(False)
assert not ret

ret = fan_status.set_under_speed(0, 0, 0)
assert not ret

ret = fan_status.set_under_speed(80, 100, 19)
ret = fan_status.set_under_speed(True)
assert ret
assert fan_status.under_speed
assert not fan_status.is_ok()

ret = fan_status.set_under_speed(81, 100, 19)
ret = fan_status.set_under_speed(True)
assert not ret

ret = fan_status.set_under_speed(False)
assert ret
assert not fan_status.under_speed
assert fan_status.is_ok()

def test_set_over_speed(self):
fan_status = thermalctld.FanStatus()
ret = fan_status.set_over_speed(thermalctld.NOT_AVAILABLE, thermalctld.NOT_AVAILABLE, thermalctld.NOT_AVAILABLE)
assert not ret

ret = fan_status.set_over_speed(thermalctld.NOT_AVAILABLE, thermalctld.NOT_AVAILABLE, 0)
ret = fan_status.set_under_speed(False)
assert not ret

ret = fan_status.set_over_speed(thermalctld.NOT_AVAILABLE, 0, 0)
assert not ret
def test_set_over_speed(self):
fan_status = thermalctld.FanStatus()

ret = fan_status.set_over_speed(0, 0, 0)
ret = fan_status.set_over_speed(False)
assert not ret

ret = fan_status.set_over_speed(120, 100, 19)
ret = fan_status.set_over_speed(True)
assert ret
assert fan_status.over_speed
assert not fan_status.is_ok()

ret = fan_status.set_over_speed(120, 100, 21)
ret = fan_status.set_over_speed(True)
assert not ret

ret = fan_status.set_over_speed(False)
assert ret
assert not fan_status.over_speed
assert fan_status.is_ok()

ret = fan_status.set_over_speed(False)
assert not ret


class TestFanUpdater(object):
"""
Expand Down Expand Up @@ -251,7 +223,7 @@ def test_fan_under_speed(self):
fan_list = chassis.get_all_fans()
assert fan_list[0].get_status_led() == MockFan.STATUS_LED_COLOR_RED
assert fan_updater.log_warning.call_count == 1
fan_updater.log_warning.assert_called_with('Fan low speed warning: FanDrawer 0 fan 1 current speed=1, target speed=2, tolerance=0')
fan_updater.log_warning.assert_called_with('Fan low speed warning: FanDrawer 0 fan 1 current speed=1, target speed=2')

fan_list[0].make_normal_speed()
fan_updater.update()
Expand All @@ -267,7 +239,7 @@ def test_fan_over_speed(self):
fan_list = chassis.get_all_fans()
assert fan_list[0].get_status_led() == MockFan.STATUS_LED_COLOR_RED
assert fan_updater.log_warning.call_count == 1
fan_updater.log_warning.assert_called_with('Fan high speed warning: FanDrawer 0 fan 1 target speed=1, current speed=2, tolerance=0')
fan_updater.log_warning.assert_called_with('Fan high speed warning: FanDrawer 0 fan 1 current speed=2, target speed=1')

fan_list[0].make_normal_speed()
fan_updater.update()
Expand Down

0 comments on commit 94242c2

Please sign in to comment.