Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use vendor customizable fan speed threshold checks #1

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 30 additions & 49 deletions sonic-thermalctld/scripts/thermalctld
Original file line number Diff line number Diff line change
Expand Up @@ -119,57 +119,35 @@ class FanStatus(logger.Logger):
self.status = status
return True

def _check_speed_value_available(self, speed, target_speed, tolerance, current_status):
if speed == NOT_AVAILABLE or target_speed == NOT_AVAILABLE or tolerance == NOT_AVAILABLE:
if isinstance(tolerance, int) and (tolerance > 100 or tolerance < 0):
self.log_warning('Invalid tolerance value: {}'.format(tolerance))
return False

if current_status is True:
self.log_warning('Fan speed or target_speed or tolerance became unavailable, '
'speed={}, target_speed={}, tolerance={}'.format(speed, target_speed, tolerance))
return False
return True

def set_under_speed(self, speed, target_speed, tolerance):
def set_under_speed(self, is_under_speed):
"""
Set and cache Fan under speed status
:param speed: Fan speed
:param target_speed: Fan target speed
:param tolerance: Threshold between Fan speed and target speed
:param is_under_speed: Fan under speed threshold status
:return: True if status changed else False
"""
if not self._check_speed_value_available(speed, target_speed, tolerance, self.under_speed):
old_status = self.under_speed
self.under_speed = False
return old_status != self.under_speed
if is_under_speed == NOT_AVAILABLE:
if self.under_speed:
self.log_warning('Fan under speed threshold check became unavailable')
is_under_speed = False
spilkey-cisco marked this conversation as resolved.
Show resolved Hide resolved

status = speed < target_speed * (1 - float(tolerance) / 100)
if status == self.under_speed:
return False
old_status = self.under_speed
self.under_speed = is_under_speed
return old_status != self.under_speed

self.under_speed = status
return True

def set_over_speed(self, speed, target_speed, tolerance):
def set_over_speed(self, is_over_speed):
"""
Set and cache Fan over speed status
:param speed: Fan speed
:param target_speed: Fan target speed
:param tolerance: Threshold between Fan speed and target speed
:param is_over_speed: Fan over speed threshold status
:return: True if status changed else False
"""
if not self._check_speed_value_available(speed, target_speed, tolerance, self.over_speed):
old_status = self.over_speed
self.over_speed = False
return old_status != self.over_speed
if is_over_speed == NOT_AVAILABLE:
if self.over_speed:
self.log_warning('Fan over speed threshold check became unavailable')
is_over_speed = False

status = speed > target_speed * (1 + float(tolerance) / 100)
if status == self.over_speed:
return False

self.over_speed = status
return True
old_status = self.over_speed
self.over_speed = is_over_speed
return old_status != self.over_speed

def is_ok(self):
"""
Expand Down Expand Up @@ -315,16 +293,18 @@ class FanUpdater(logger.Logger):
fan_status = self.fan_status_dict[fan_name]

speed = NOT_AVAILABLE
speed_tolerance = NOT_AVAILABLE
speed_target = NOT_AVAILABLE
is_under_speed = NOT_AVAILABLE
is_over_speed = NOT_AVAILABLE
fan_fault_status = NOT_AVAILABLE
fan_direction = NOT_AVAILABLE
is_replaceable = try_get(fan.is_replaceable, False)
presence = try_get(fan.get_presence, False)
if presence:
speed = try_get(fan.get_speed)
speed_tolerance = try_get(fan.get_speed_tolerance)
speed_target = try_get(fan.get_target_speed)
is_under_speed = try_get(fan.is_under_speed)
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it better to check if this function is not implemented and fall back to old code than putting an implementation in the base class (which one would not understand unless he knew about this change) esp because the changes span across repos.

Copy link
Owner Author

@spilkey-cisco spilkey-cisco Jun 7, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Having a default implementation in the base class allows for the same calculation to be used across other code, such as sonic-mgmt tests. If we fall back to a hard-coded calculation inside thermalctld, then the calculation must also be hard-coded anywhere else fan under/over speed conditions should be checked. Existing sonic-mgmt speed tolerance checks use different (and incorrect) calculation logic, so having a default API implementation solves this issue (https://wwwin-github.cisco.com/spilkey/sonic-test/pull/1)

is_over_speed = try_get(fan.is_over_speed)
fan_fault_status = try_get(fan.get_status, False)
fan_direction = try_get(fan.get_direction)

Expand All @@ -344,20 +324,20 @@ class FanUpdater(logger.Logger):
'Fan fault warning: {} is broken'.format(fan_name)
)

if presence and fan_status.set_under_speed(speed, speed_target, speed_tolerance):
if presence and fan_status.set_under_speed(is_under_speed):
set_led = True
self._log_on_status_changed(not fan_status.under_speed,
'Fan low speed warning cleared: {} speed is back to normal'.format(fan_name),
'Fan low speed warning: {} current speed={}, target speed={}, tolerance={}'.
format(fan_name, speed, speed_target, speed_tolerance)
'Fan low speed warning: {} current speed={}, target speed={}'.
format(fan_name, speed, speed_target)
)

if presence and fan_status.set_over_speed(speed, speed_target, speed_tolerance):
if presence and fan_status.set_over_speed(is_over_speed):
set_led = True
self._log_on_status_changed(not fan_status.over_speed,
'Fan high speed warning cleared: {} speed is back to normal'.format(fan_name),
'Fan high speed warning: {} target speed={}, current speed={}, tolerance={}'.
format(fan_name, speed_target, speed, speed_tolerance)
'Fan high speed warning: {} current speed={}, target speed={}'.
format(fan_name, speed, speed_target)
)

# We don't set PSU led here, PSU led will be handled in psud
Expand All @@ -376,8 +356,9 @@ class FanUpdater(logger.Logger):
('status', str(fan_fault_status)),
('direction', str(fan_direction)),
('speed', str(speed)),
('speed_tolerance', str(speed_tolerance)),
('speed_target', str(speed_target)),
('is_under_speed', str(is_under_speed)),
('is_over_speed', str(is_over_speed)),
('is_replaceable', str(is_replaceable)),
('timestamp', datetime.now().strftime('%Y%m%d %H:%M:%S'))
])
Expand Down
68 changes: 20 additions & 48 deletions sonic-thermalctld/tests/test_thermalctld.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,30 +68,6 @@ class TestFanStatus(object):
"""
Test cases to cover functionality in FanStatus class
"""
def test_check_speed_value_available(self):
fan_status = thermalctld.FanStatus()

ret = fan_status._check_speed_value_available(30, 32, 5, True)
assert ret == True
assert fan_status.log_warning.call_count == 0

ret = fan_status._check_speed_value_available(thermalctld.NOT_AVAILABLE, 32, 105, True)
assert ret == False
assert fan_status.log_warning.call_count == 1
fan_status.log_warning.assert_called_with('Invalid tolerance value: 105')

# Reset
fan_status.log_warning.reset_mock()

ret = fan_status._check_speed_value_available(thermalctld.NOT_AVAILABLE, 32, 5, False)
assert ret == False
assert fan_status.log_warning.call_count == 0

ret = fan_status._check_speed_value_available(thermalctld.NOT_AVAILABLE, 32, 5, True)
assert ret == False
assert fan_status.log_warning.call_count == 1
fan_status.log_warning.assert_called_with('Fan speed or target_speed or tolerance became unavailable, speed=N/A, target_speed=32, tolerance=5')

def test_set_presence(self):
fan_status = thermalctld.FanStatus()
ret = fan_status.set_presence(True)
Expand All @@ -104,52 +80,48 @@ def test_set_presence(self):

def test_set_under_speed(self):
fan_status = thermalctld.FanStatus()
ret = fan_status.set_under_speed(thermalctld.NOT_AVAILABLE, thermalctld.NOT_AVAILABLE, thermalctld.NOT_AVAILABLE)
assert not ret

ret = fan_status.set_under_speed(thermalctld.NOT_AVAILABLE, thermalctld.NOT_AVAILABLE, 0)
assert not ret

ret = fan_status.set_under_speed(thermalctld.NOT_AVAILABLE, 0, 0)
ret = fan_status.set_under_speed(False)
assert not ret

ret = fan_status.set_under_speed(0, 0, 0)
assert not ret

ret = fan_status.set_under_speed(80, 100, 19)
ret = fan_status.set_under_speed(True)
assert ret
assert fan_status.under_speed
assert not fan_status.is_ok()

ret = fan_status.set_under_speed(81, 100, 19)
ret = fan_status.set_under_speed(True)
assert not ret

ret = fan_status.set_under_speed(False)
assert ret
assert not fan_status.under_speed
assert fan_status.is_ok()

def test_set_over_speed(self):
fan_status = thermalctld.FanStatus()
ret = fan_status.set_over_speed(thermalctld.NOT_AVAILABLE, thermalctld.NOT_AVAILABLE, thermalctld.NOT_AVAILABLE)
assert not ret

ret = fan_status.set_over_speed(thermalctld.NOT_AVAILABLE, thermalctld.NOT_AVAILABLE, 0)
ret = fan_status.set_under_speed(False)
assert not ret

ret = fan_status.set_over_speed(thermalctld.NOT_AVAILABLE, 0, 0)
assert not ret
def test_set_over_speed(self):
fan_status = thermalctld.FanStatus()

ret = fan_status.set_over_speed(0, 0, 0)
ret = fan_status.set_over_speed(False)
assert not ret

ret = fan_status.set_over_speed(120, 100, 19)
ret = fan_status.set_over_speed(True)
assert ret
assert fan_status.over_speed
assert not fan_status.is_ok()

ret = fan_status.set_over_speed(120, 100, 21)
ret = fan_status.set_over_speed(True)
assert not ret

ret = fan_status.set_over_speed(False)
assert ret
assert not fan_status.over_speed
assert fan_status.is_ok()

ret = fan_status.set_over_speed(False)
assert not ret


class TestFanUpdater(object):
"""
Expand Down Expand Up @@ -251,7 +223,7 @@ def test_fan_under_speed(self):
fan_list = chassis.get_all_fans()
assert fan_list[0].get_status_led() == MockFan.STATUS_LED_COLOR_RED
assert fan_updater.log_warning.call_count == 1
fan_updater.log_warning.assert_called_with('Fan low speed warning: FanDrawer 0 fan 1 current speed=1, target speed=2, tolerance=0')
fan_updater.log_warning.assert_called_with('Fan low speed warning: FanDrawer 0 fan 1 current speed=1, target speed=2')

fan_list[0].make_normal_speed()
fan_updater.update()
Expand All @@ -267,7 +239,7 @@ def test_fan_over_speed(self):
fan_list = chassis.get_all_fans()
assert fan_list[0].get_status_led() == MockFan.STATUS_LED_COLOR_RED
assert fan_updater.log_warning.call_count == 1
fan_updater.log_warning.assert_called_with('Fan high speed warning: FanDrawer 0 fan 1 target speed=1, current speed=2, tolerance=0')
fan_updater.log_warning.assert_called_with('Fan high speed warning: FanDrawer 0 fan 1 current speed=2, target speed=1')

fan_list[0].make_normal_speed()
fan_updater.update()
Expand Down